Commit 0747201 · 1 Parent(s): bd329bc
JatsTheAIGen committed

Replace Novita AI with ZeroGPU Chat API (RunPod)

- Replace Novita AI API integration with ZeroGPU Chat API
- Update llm_router.py to use aiohttp for HTTP requests with JWT authentication
- Add automatic token refresh and authentication handling (request flow sketched below)
- Update config.py with ZeroGPU settings (base_url, email, password)
- Update ENV_EXAMPLE_CONTENT.txt with ZeroGPU configuration
- Update flask_api_standalone.py initialization logging and error messages
- Remove OpenAI dependency from requirements.txt
- Implement task type mapping (general_reasoning -> general, etc.)
- Add context conversion for API format compatibility
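
For orientation, a minimal sketch of the request flow the new router implements: login for a JWT, then a bearer-authenticated chat call. It assumes the /login and /chat endpoints and the JSON field names shown in the src/llm_router.py diff below; it is not a drop-in client.

import asyncio
import aiohttp

async def demo(base_url: str, email: str, password: str) -> str:
    async with aiohttp.ClientSession() as session:
        # Step 1: exchange email/password for a JWT access token
        async with session.post(f"{base_url}/login",
                                json={"email": email, "password": password}) as resp:
            resp.raise_for_status()
            access_token = (await resp.json()).get("access_token")

        # Step 2: call /chat with the bearer token; "task" selects the model route
        payload = {"message": "Hello!", "task": "general", "max_tokens": 128}
        headers = {"Authorization": f"Bearer {access_token}"}
        async with session.post(f"{base_url}/chat", json=payload, headers=headers) as resp:
            resp.raise_for_status()
            return (await resp.json()).get("response")

# asyncio.run(demo("http://your-pod-ip:8000", "you@example.com", "secret"))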

ENV_EXAMPLE_CONTENT.txt CHANGED
@@ -5,27 +5,18 @@
 # Never commit .env to version control!
 
 # =============================================================================
-# Novita AI Configuration (REQUIRED)
+# ZeroGPU Chat API Configuration (REQUIRED)
 # =============================================================================
-# Get your API key from: https://novita.ai
-NOVITA_API_KEY=your_novita_api_key_here
+# Base URL for your ZeroGPU Chat API endpoint (RunPod)
+# Format: http://your-pod-ip:8000 or https://your-domain.com
+# Example: http://bm9njt1ypzvuqw-8000.proxy.runpod.net
+ZEROGPU_BASE_URL=http://your-pod-ip:8000
 
-# Dedicated endpoint base URL (default for dedicated endpoints)
-NOVITA_BASE_URL=https://api.novita.ai/dedicated/v1/openai
+# Email for authentication (register first via /register endpoint)
+ZEROGPU_EMAIL=your-email@example.com
 
-# Your dedicated endpoint model ID
-# Format: model-name:endpoint-id
-NOVITA_MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-7B:de-1a706eeafbf3ebc2
-
-# =============================================================================
-# DeepSeek-R1 Optimized Settings
-# =============================================================================
-# Temperature: 0.5-0.7 range (0.6 recommended for DeepSeek-R1)
-DEEPSEEK_R1_TEMPERATURE=0.6
-
-# Force reasoning trigger: Enable to ensure DeepSeek-R1 uses reasoning pattern
-# Set to True to add <think> prefix for reasoning tasks
-DEEPSEEK_R1_FORCE_REASONING=True
+# Password for authentication
+ZEROGPU_PASSWORD=your_secure_password_here
 
 # =============================================================================
 # Token Allocation Configuration
@@ -45,10 +36,10 @@ CONTEXT_PRUNING_THRESHOLD=115000
 PRIORITIZE_USER_INPUT=True
 
 # Model context window (actual limit for your deployed model)
-# Default: 128000 tokens for DeepSeek R1 (128K context window)
+# Default: 8192 tokens (adjust based on your model)
 # This is the maximum total tokens (input + output) the model can handle
-# Take full advantage of DeepSeek R1's 128K capability
-NOVITA_MODEL_CONTEXT_WINDOW=128000
+# Common values: 4096, 8192, 16384, 32768, etc.
+ZEROGPU_MODEL_CONTEXT_WINDOW=8192
 
 # =============================================================================
 # Database Configuration
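
The comments above say to register before logging in. A hypothetical one-time registration call follows; the /register endpoint is only referenced by name in this commit, so the email/password request schema is an assumption.

import asyncio
import aiohttp

async def register(base_url: str, email: str, password: str) -> None:
    # POST /register with the same credential fields used by /login
    # (assumed schema; the diff only mentions the endpoint by name)
    async with aiohttp.ClientSession() as session:
        async with session.post(f"{base_url}/register",
                                json={"email": email, "password": password}) as resp:
            resp.raise_for_status()
            print(await resp.json())

# asyncio.run(register("http://your-pod-ip:8000", "your-email@example.com", "your_secure_password_here"))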
flask_api_standalone.py CHANGED
@@ -145,7 +145,7 @@ initialization_attempted = False
 initialization_error = None
 
 def initialize_orchestrator():
-    """Initialize the AI orchestrator with Novita AI API only"""
+    """Initialize the AI orchestrator with ZeroGPU Chat API (RunPod)"""
     global orchestrator, orchestrator_available, initialization_attempted, initialization_error
 
     initialization_attempted = True
@@ -153,7 +153,7 @@ def initialize_orchestrator():
 
     try:
         logger.info("=" * 60)
-        logger.info("INITIALIZING AI ORCHESTRATOR (Novita AI API Only)")
+        logger.info("INITIALIZING AI ORCHESTRATOR (ZeroGPU Chat API - RunPod)")
         logger.info("=" * 60)
 
         from src.agents.intent_agent import create_intent_agent
@@ -166,16 +166,16 @@ def initialize_orchestrator():
 
         logger.info("✓ Imports successful")
 
-        # Initialize LLM Router - Novita AI API only
-        logger.info("Initializing LLM Router (Novita AI API only)...")
+        # Initialize LLM Router - ZeroGPU Chat API
+        logger.info("Initializing LLM Router (ZeroGPU Chat API)...")
         try:
-            # Always use Novita AI API (local models disabled)
+            # Always use ZeroGPU Chat API (local models disabled)
             llm_router = LLMRouter(hf_token=None, use_local_models=False)
-            logger.info("✓ LLM Router initialized (Novita AI API)")
+            logger.info("✓ LLM Router initialized (ZeroGPU Chat API)")
         except Exception as e:
             logger.error(f"❌ Failed to initialize LLM Router: {e}", exc_info=True)
-            logger.error("This is a critical error - Novita AI API is required")
-            logger.error("Please ensure NOVITA_API_KEY is set in environment variables")
+            logger.error("This is a critical error - ZeroGPU Chat API is required")
+            logger.error("Please ensure ZEROGPU_BASE_URL, ZEROGPU_EMAIL, and ZEROGPU_PASSWORD are set in environment variables")
             raise
 
         logger.info("Initializing Agents...")
@@ -210,24 +210,25 @@ def initialize_orchestrator():
        orchestrator_available = True
        logger.info("=" * 60)
        logger.info("✓ AI ORCHESTRATOR READY")
-       logger.info(" - Novita AI API enabled")
+       logger.info(" - ZeroGPU Chat API enabled")
        logger.info(" - MAX_WORKERS: 4")
        logger.info("=" * 60)
 
        return True
 
    except ValueError as e:
-        # Handle configuration errors (e.g., missing NOVITA_API_KEY)
-        if "NOVITA_API_KEY" in str(e) or "required" in str(e).lower():
+        # Handle configuration errors (e.g., missing ZeroGPU credentials)
+        if "ZEROGPU" in str(e) or "required" in str(e).lower():
            logger.error("=" * 60)
            logger.error("❌ CONFIGURATION ERROR")
            logger.error("=" * 60)
            logger.error(f"Error: {e}")
            logger.error("")
            logger.error("SOLUTION:")
-            logger.error("1. Set NOVITA_API_KEY in environment variables")
-            logger.error("2. Ensure NOVITA_BASE_URL is correct")
-            logger.error("3. Verify NOVITA_MODEL matches your endpoint")
+            logger.error("1. Set ZEROGPU_BASE_URL in environment variables (e.g., http://your-pod-ip:8000)")
+            logger.error("2. Set ZEROGPU_EMAIL in environment variables")
+            logger.error("3. Set ZEROGPU_PASSWORD in environment variables")
+            logger.error("4. Register your account first via the /register endpoint if needed")
            logger.error("=" * 60)
            orchestrator_available = False
            initialization_error = f"Configuration Error: {str(e)}"
requirements.txt CHANGED
@@ -107,6 +107,6 @@ debugpy>=1.7.0
 bandit>=1.7.5 # Security linter for Python code
 safety>=2.3.5 # Dependency vulnerability scanner
 
-# LLM API Client (required for Novita AI API)
-openai>=1.0.0
+# HTTP Client for ZeroGPU Chat API (aiohttp already included above)
+# Note: No OpenAI client needed - using direct HTTP requests
 
src/config.py CHANGED
@@ -174,37 +174,24 @@ class Settings(BaseSettings):
 
         return self._cached_cache_dir
 
-    # ==================== Novita AI Configuration ====================
+    # ==================== ZeroGPU Chat API Configuration ====================
 
-    novita_api_key: str = Field(
-        default="",
-        description="Novita AI API key (required)",
-        env="NOVITA_API_KEY"
-    )
-
-    novita_base_url: str = Field(
-        default="https://api.novita.ai/dedicated/v1/openai",
-        description="Novita AI dedicated endpoint base URL",
-        env="NOVITA_BASE_URL"
-    )
-
-    novita_model: str = Field(
-        default="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B:de-1a706eeafbf3ebc2",
-        description="Novita AI dedicated endpoint model ID",
-        env="NOVITA_MODEL"
+    zerogpu_base_url: str = Field(
+        default="http://your-pod-ip:8000",
+        description="ZeroGPU Chat API base URL (RunPod endpoint)",
+        env="ZEROGPU_BASE_URL"
     )
 
-    # DeepSeek-R1 optimized settings
-    deepseek_r1_temperature: float = Field(
-        default=0.6,
-        description="Temperature for DeepSeek-R1 models (0.5-0.7 range, 0.6 recommended)",
-        env="DEEPSEEK_R1_TEMPERATURE"
+    zerogpu_email: str = Field(
+        default="",
+        description="ZeroGPU Chat API email for authentication (required)",
+        env="ZEROGPU_EMAIL"
     )
 
-    deepseek_r1_force_reasoning: bool = Field(
-        default=True,
-        description="Force DeepSeek-R1 to start with reasoning trigger",
-        env="DEEPSEEK_R1_FORCE_REASONING"
+    zerogpu_password: str = Field(
+        default="",
+        description="ZeroGPU Chat API password for authentication (required)",
+        env="ZEROGPU_PASSWORD"
     )
 
     # Token Allocation Configuration
@@ -233,34 +220,40 @@ class Settings(BaseSettings):
     )
 
     # Model Context Window Configuration
-    novita_model_context_window: int = Field(
-        default=128000,
-        description="Maximum context window for Novita AI model (input + output tokens). DeepSeek R1 supports 128K tokens.",
-        env="NOVITA_MODEL_CONTEXT_WINDOW"
+    zerogpu_model_context_window: int = Field(
+        default=8192,
+        description="Maximum context window for ZeroGPU Chat API model (input + output tokens). Adjust based on your deployed model.",
+        env="ZEROGPU_MODEL_CONTEXT_WINDOW"
     )
 
-    @validator("novita_api_key", pre=True)
-    def validate_novita_api_key(cls, v):
-        """Validate and clean Novita API key"""
+    @validator("zerogpu_base_url", pre=True)
+    def validate_zerogpu_base_url(cls, v):
+        """Validate ZeroGPU base URL"""
+        if v is None:
+            return "http://your-pod-ip:8000"
+        url = str(v).strip()
+        # Remove trailing slash
+        if url.endswith('/'):
+            url = url[:-1]
+        return url
+
+    @validator("zerogpu_email", pre=True)
+    def validate_zerogpu_email(cls, v):
+        """Validate ZeroGPU email"""
+        if v is None:
+            return ""
+        email = str(v).strip()
+        if email and '@' not in email:
+            logger.warning("ZEROGPU_EMAIL may not be a valid email address")
+        return email
+
+    @validator("zerogpu_password", pre=True)
+    def validate_zerogpu_password(cls, v):
+        """Validate ZeroGPU password"""
         if v is None:
             return ""
         return str(v).strip()
 
-    @validator("deepseek_r1_temperature", pre=True)
-    def validate_deepseek_temperature(cls, v):
-        """Validate DeepSeek-R1 temperature is in recommended range"""
-        if isinstance(v, str):
-            v = float(v)
-        temp = float(v) if v else 0.6
-        return max(0.5, min(0.7, temp))
-
-    @validator("deepseek_r1_force_reasoning", pre=True)
-    def validate_force_reasoning(cls, v):
-        """Convert string to boolean for force_reasoning"""
-        if isinstance(v, str):
-            return v.lower() in ("true", "1", "yes", "on")
-        return bool(v)
-
     @validator("user_input_max_tokens", pre=True)
     def validate_user_input_tokens(cls, v):
         """Validate user input token limit"""
@@ -279,10 +272,10 @@ class Settings(BaseSettings):
         val = int(v) if v else 115000
         return max(4000, min(125000, val))  # Match context_preparation_budget limits
 
-    @validator("novita_model_context_window", pre=True)
+    @validator("zerogpu_model_context_window", pre=True)
     def validate_context_window(cls, v):
         """Validate context window size"""
-        val = int(v) if v else 128000
+        val = int(v) if v else 8192
         return max(1000, min(200000, val))  # Support up to 200K for future models
 
     # ==================== Model Configuration ====================
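
To illustrate what the new validators do at load time, a small sketch (assumes the ZEROGPU_* variables are exported before src.config is imported and that get_settings reads a fresh environment):

import os

# Set before importing, since pydantic reads the environment at load time
os.environ["ZEROGPU_BASE_URL"] = "http://pod:8000/"        # note trailing slash
os.environ["ZEROGPU_EMAIL"] = "user@example.com"
os.environ["ZEROGPU_PASSWORD"] = "secret"
os.environ["ZEROGPU_MODEL_CONTEXT_WINDOW"] = "500"         # below the 1000 floor

from src.config import get_settings

settings = get_settings()
print(settings.zerogpu_base_url)              # "http://pod:8000" (trailing slash stripped)
print(settings.zerogpu_model_context_window)  # 1000 (clamped to [1000, 200000])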
src/llm_router.py CHANGED
@@ -1,67 +1,61 @@
-# llm_router.py - NOVITA AI API ONLY
+# llm_router.py - ZeroGPU Chat API (RunPod)
 import logging
 import asyncio
+import aiohttp
+import time
 from typing import Dict, Optional
 from .models_config import LLM_CONFIG
 from .config import get_settings
 
-# Import OpenAI client for Novita AI API
-try:
-    from openai import OpenAI
-    OPENAI_AVAILABLE = True
-except ImportError:
-    OPENAI_AVAILABLE = False
-    logger = logging.getLogger(__name__)
-    logger.error("openai package not available - Novita AI API requires openai package")
-
 logger = logging.getLogger(__name__)
 
 class LLMRouter:
     def __init__(self, hf_token=None, use_local_models: bool = False):
         """
-        Initialize LLM Router with Novita AI API only.
+        Initialize LLM Router with ZeroGPU Chat API (RunPod).
 
         Args:
             hf_token: Not used (kept for backward compatibility)
             use_local_models: Must be False (local models disabled)
         """
         if use_local_models:
-            raise ValueError("Local models are disabled. Only Novita AI API is supported.")
+            raise ValueError("Local models are disabled. Only ZeroGPU Chat API is supported.")
 
         self.settings = get_settings()
-        self.novita_client = None
-
-        # Validate OpenAI package
-        if not OPENAI_AVAILABLE:
-            raise ImportError(
-                "openai package is required for Novita AI API. "
-                "Install it with: pip install openai>=1.0.0"
+        self.base_url = self.settings.zerogpu_base_url.rstrip('/')
+        self.access_token = None
+        self.refresh_token = None
+        self.token_expires_at = 0
+        self.session = None
+
+        # Validate base URL
+        if not self.settings.zerogpu_base_url:
+            raise ValueError(
+                "ZEROGPU_BASE_URL is required. "
+                "Set it in environment variables or .env file"
             )
 
-        # Validate API key
-        if not self.settings.novita_api_key:
+        # Validate credentials
+        if not self.settings.zerogpu_email or not self.settings.zerogpu_password:
             raise ValueError(
-                "NOVITA_API_KEY is required. "
-                "Set it in environment variables or .env file"
+                "ZEROGPU_EMAIL and ZEROGPU_PASSWORD are required. "
+                "Set them in environment variables or .env file"
             )
 
-        # Initialize Novita AI client
+        logger.info("ZeroGPU Chat API client initializing")
+        logger.info(f"Base URL: {self.base_url}")
+
+        # Initialize session and authenticate
         try:
-            self.novita_client = OpenAI(
-                base_url=self.settings.novita_base_url,
-                api_key=self.settings.novita_api_key,
-            )
-            logger.info("Novita AI API client initialized")
-            logger.info(f"Base URL: {self.settings.novita_base_url}")
-            logger.info(f"Model: {self.settings.novita_model}")
-            logger.info(f"Context Window: {self.settings.novita_model_context_window} tokens")
+            # Authentication will happen on first request if needed
+            logger.info("ZeroGPU Chat API client initialized (authentication on first request)")
         except Exception as e:
-            logger.error(f"Failed to initialize Novita AI client: {e}")
-            raise RuntimeError(f"Could not initialize Novita AI API client: {e}") from e
+            logger.error(f"Failed to initialize ZeroGPU Chat API client: {e}")
+            raise RuntimeError(f"Could not initialize ZeroGPU Chat API client: {e}") from e
 
     async def route_inference(self, task_type: str, prompt: str, **kwargs):
         """
-        Route inference to Novita AI API.
+        Route inference to ZeroGPU Chat API.
 
         Args:
             task_type: Type of task (general_reasoning, intent_classification, etc.)
@@ -71,101 +65,200 @@ class LLMRouter:
         Returns:
             Generated text response
         """
-        logger.info(f"Routing inference to Novita AI API for task: {task_type}")
-
-        if not self.novita_client:
-            raise RuntimeError("Novita AI client not initialized")
+        logger.info(f"Routing inference to ZeroGPU Chat API for task: {task_type}")
 
         try:
+            # Ensure authenticated
+            await self._ensure_authenticated()
+
+            # Map internal task types to API task types
+            api_task = self._map_task_type(task_type)
+
+            # Pass original task type for model config lookup
+            kwargs['original_task_type'] = task_type
+
             # Handle embedding generation (may need special handling)
             if task_type == "embedding_generation":
-                logger.warning("Embedding generation via Novita API may require special implementation")
-                # For now, use chat completion (may need adjustment based on Novita API capabilities)
-                result = await self._call_novita_api(task_type, prompt, **kwargs)
+                logger.warning("Embedding generation via ZeroGPU API may require special implementation")
+                result = await self._call_zerogpu_api(api_task, prompt, **kwargs)
             else:
-                result = await self._call_novita_api(task_type, prompt, **kwargs)
+                result = await self._call_zerogpu_api(api_task, prompt, **kwargs)
 
             if result is None:
-                logger.error(f"Novita AI API returned None for task: {task_type}")
+                logger.error(f"ZeroGPU Chat API returned None for task: {task_type}")
                 raise RuntimeError(f"Inference failed for task: {task_type}")
 
-            logger.info(f"Inference complete for {task_type} (Novita AI API)")
+            logger.info(f"Inference complete for {task_type} (ZeroGPU Chat API)")
            return result
 
        except Exception as e:
-            logger.error(f"Novita AI API inference failed: {e}", exc_info=True)
+            logger.error(f"ZeroGPU Chat API inference failed: {e}", exc_info=True)
            raise RuntimeError(
                f"Inference failed for task: {task_type}. "
-                f"Novita AI API error: {e}"
+                f"ZeroGPU Chat API error: {e}"
            ) from e
 
-    async def _call_novita_api(self, task_type: str, prompt: str, **kwargs) -> Optional[str]:
-        """Call Novita AI API for inference."""
-        if not self.novita_client:
-            return None
-
-        # Get model config
-        model_config = self._select_model(task_type)
-        model_name = kwargs.get('model', self.settings.novita_model)
-
-        # Get optimized parameters
-        requested_max_tokens = kwargs.get('max_tokens', model_config.get('max_tokens', 4096))
-        temperature = kwargs.get('temperature',
-            model_config.get('temperature', self.settings.deepseek_r1_temperature))
-        top_p = kwargs.get('top_p', model_config.get('top_p', 0.95))
-        stream = kwargs.get('stream', False)
-
-        # Format prompt according to DeepSeek-R1 best practices
-        formatted_prompt = self._format_deepseek_r1_prompt(prompt, task_type, model_config)
-
-        # IMPORTANT: Calculate safe max_tokens based on input size
-        max_tokens = self._calculate_safe_max_tokens(formatted_prompt, requested_max_tokens)
-
-        # IMPORTANT: No system prompt - all instructions in user prompt
-        messages = [{"role": "user", "content": formatted_prompt}]
-
-        # Build request parameters
-        request_params = {
-            "model": model_name,
-            "messages": messages,
-            "stream": stream,
-            "max_tokens": max_tokens,
-            "temperature": temperature,
-            "top_p": top_p,
+    async def _ensure_authenticated(self):
+        """Ensure we have a valid access token, login if needed."""
+        # Check if token is expired (with 60 second buffer)
+        if self.access_token and time.time() < (self.token_expires_at - 60):
+            return
+
+        # Create session if needed
+        if self.session is None:
+            self.session = aiohttp.ClientSession()
+
+        # Login to get tokens
+        await self._login()
+
+    async def _login(self):
+        """Login to ZeroGPU Chat API and get access/refresh tokens."""
+        try:
+            login_url = f"{self.base_url}/login"
+            login_data = {
+                "email": self.settings.zerogpu_email,
+                "password": self.settings.zerogpu_password
+            }
+
+            async with self.session.post(login_url, json=login_data) as response:
+                if response.status == 401:
+                    raise ValueError("Invalid email or password for ZeroGPU Chat API")
+                response.raise_for_status()
+                data = await response.json()
+
+                self.access_token = data.get("access_token")
+                self.refresh_token = data.get("refresh_token")
+
+                # Access tokens typically expire in 15 minutes (900 seconds)
+                self.token_expires_at = time.time() + 900
+
+                logger.info("Successfully authenticated with ZeroGPU Chat API")
+
+        except aiohttp.ClientError as e:
+            logger.error(f"Failed to login to ZeroGPU Chat API: {e}")
+            raise RuntimeError(f"Authentication failed: {e}") from e
+
+    async def _refresh_token(self):
+        """Refresh access token using refresh token."""
+        try:
+            refresh_url = f"{self.base_url}/refresh"
+            headers = {"X-Refresh-Token": self.refresh_token}
+
+            async with self.session.post(refresh_url, headers=headers) as response:
+                if response.status == 401:
+                    # Refresh token expired, need to login again
+                    await self._login()
+                    return
+
+                response.raise_for_status()
+                data = await response.json()
+
+                self.access_token = data.get("access_token")
+                self.refresh_token = data.get("refresh_token")
+                self.token_expires_at = time.time() + 900
+
+                logger.info("Successfully refreshed ZeroGPU Chat API token")
+
+        except aiohttp.ClientError as e:
+            logger.error(f"Failed to refresh token: {e}")
+            # Try login as fallback
+            await self._login()
+
+    def _map_task_type(self, internal_task: str) -> str:
+        """Map internal task types to ZeroGPU Chat API task types."""
+        task_mapping = {
+            "general_reasoning": "general",
+            "response_synthesis": "general",
+            "intent_classification": "classification",
+            "safety_check": "classification",
+            "embedding_generation": "embedding"
+        }
+        return task_mapping.get(internal_task, "general")
+
+    async def _call_zerogpu_api(self, task: str, prompt: str, **kwargs) -> Optional[str]:
+        """Call ZeroGPU Chat API for inference."""
+        if not self.session:
+            self.session = aiohttp.ClientSession()
+
+        # Store original task type for model config lookup
+        original_task = kwargs.pop('original_task_type', None)
+
+        # Get model config for defaults
+        model_config = self._select_model(original_task or 'general_reasoning')
+
+        # Build request payload according to API documentation
+        payload = {
+            "message": prompt,
+            "task": task,
+            "max_tokens": kwargs.get('max_tokens', model_config.get('max_tokens', 512)),
+            "temperature": kwargs.get('temperature', model_config.get('temperature', 0.7)),
+            "top_p": kwargs.get('top_p', model_config.get('top_p', 0.9)),
+        }
+
+        # Add optional parameters
+        if 'context' in kwargs and kwargs['context']:
+            # Convert context to API format if needed
+            context = kwargs['context']
+            if isinstance(context, list) and len(context) > 0:
+                # Convert to API format: list of dicts with role, content, timestamp
+                api_context = []
+                for item in context[:50]:  # Max 50 messages
+                    if isinstance(item, (list, tuple)) and len(item) >= 2:
+                        # Format: [user_msg, assistant_msg]
+                        api_context.append({
+                            "role": "user",
+                            "content": str(item[0]),
+                            "timestamp": kwargs.get('timestamp', time.time())
+                        })
+                        api_context.append({
+                            "role": "assistant",
+                            "content": str(item[1]),
+                            "timestamp": kwargs.get('timestamp', time.time())
+                        })
+                    elif isinstance(item, dict):
+                        api_context.append(item)
+                payload["context"] = api_context
+
+        if 'system_prompt' in kwargs and kwargs['system_prompt']:
+            payload["system_prompt"] = kwargs['system_prompt']
+        if 'repetition_penalty' in kwargs:
+            payload["repetition_penalty"] = kwargs['repetition_penalty']
+
+        # Prepare headers
+        headers = {
+            "Authorization": f"Bearer {self.access_token}",
+            "Content-Type": "application/json"
         }
 
        try:
-            if stream:
-                # Handle streaming response
-                response_text = ""
-                stream_response = self.novita_client.chat.completions.create(**request_params)
+            chat_url = f"{self.base_url}/chat"
+
+            async with self.session.post(chat_url, json=payload, headers=headers) as response:
+                # Handle token expiration
+                if response.status == 401:
+                    logger.info("Token expired, refreshing...")
+                    await self._refresh_token()
+                    headers["Authorization"] = f"Bearer {self.access_token}"
+                    # Retry request
+                    async with self.session.post(chat_url, json=payload, headers=headers) as retry_response:
+                        retry_response.raise_for_status()
+                        data = await retry_response.json()
+                        return data.get("response")
 
-                for chunk in stream_response:
-                    if chunk.choices and len(chunk.choices) > 0:
-                        delta = chunk.choices[0].delta
-                        if delta and delta.content:
-                            response_text += delta.content
+                response.raise_for_status()
+                data = await response.json()
 
-                # Clean up reasoning tags if present
-                response_text = self._clean_reasoning_tags(response_text)
-                logger.info(f"Novita AI API generated response (length: {len(response_text)})")
-                return response_text
-            else:
-                # Handle non-streaming response
-                response = self.novita_client.chat.completions.create(**request_params)
-
-                if response.choices and len(response.choices) > 0:
-                    result = response.choices[0].message.content
-                    # Clean up reasoning tags if present
-                    result = self._clean_reasoning_tags(result)
-                    logger.info(f"Novita AI API generated response (length: {len(result)})")
+                # Extract response from API
+                result = data.get("response")
+                if result:
+                    logger.info(f"ZeroGPU Chat API generated response (length: {len(result)})")
                    return result
                else:
-                    logger.error("Novita AI API returned empty response")
+                    logger.error("ZeroGPU Chat API returned empty response")
                    return None
-
-        except Exception as e:
-            logger.error(f"Error calling Novita AI API: {e}", exc_info=True)
+
+        except aiohttp.ClientError as e:
+            logger.error(f"Error calling ZeroGPU Chat API: {e}", exc_info=True)
            raise
 
    def _calculate_safe_max_tokens(self, prompt: str, requested_max_tokens: int) -> int:
@@ -184,7 +277,7 @@ class LLMRouter:
        input_tokens = len(prompt) // 4
 
        # Get model context window from settings
-        context_window = self.settings.novita_model_context_window
+        context_window = self.settings.zerogpu_model_context_window
 
        logger.debug(
            f"Calculating safe max_tokens: input ~{input_tokens} tokens, "
@@ -209,26 +302,14 @@ class LLMRouter:
 
        return safe_max_tokens
 
-    def _format_deepseek_r1_prompt(self, prompt: str, task_type: str, model_config: dict) -> str:
+    def _format_prompt(self, prompt: str, task_type: str, model_config: dict) -> str:
        """
-        Format prompt according to DeepSeek-R1 best practices:
-        - No system prompt (all instructions in user prompt)
-        - Force reasoning trigger for reasoning tasks
-        - Add math directive for mathematical problems
+        Format prompt for ZeroGPU Chat API.
+        Can be customized based on model requirements.
        """
        formatted_prompt = prompt
 
-        # Check if we should force reasoning prefix
-        force_reasoning = (
-            self.settings.deepseek_r1_force_reasoning and
-            model_config.get("force_reasoning_prefix", False)
-        )
-
-        if force_reasoning:
-            # Force model to start with reasoning trigger
-            formatted_prompt = f"<think>\n\n{formatted_prompt}"
-
-        # Add math directive for mathematical problems
+        # Add math directive for mathematical problems if needed
        if self._is_math_query(prompt):
            math_directive = "Please reason step by step, and put your final answer within \\boxed{}."
            formatted_prompt = f"{formatted_prompt}\n\n{math_directive}"
@@ -246,7 +327,11 @@ class LLMRouter:
        return any(keyword in prompt_lower for keyword in math_keywords)
 
    def _clean_reasoning_tags(self, text: str) -> str:
-        """Clean up reasoning tags from response"""
+        """Clean up reasoning tags from response if present"""
+        if not text:
+            return text
+        # Remove common reasoning tags if present
+        text = text.replace("<think>", "").replace("</think>", "")
        text = text.replace("<think>", "").replace("</think>", "")
        text = text.strip()
        return text
@@ -263,33 +348,72 @@ class LLMRouter:
        return model_map.get(task_type, LLM_CONFIG["models"]["reasoning_primary"])
 
    async def get_available_models(self):
-        """Get list of available models (Novita AI only)"""
-        return ["Novita AI API - DeepSeek-R1-Distill-Qwen-7B"]
+        """Get list of available models from ZeroGPU Chat API"""
+        try:
+            await self._ensure_authenticated()
+            if not self.session:
+                self.session = aiohttp.ClientSession()
+
+            tasks_url = f"{self.base_url}/tasks"
+            headers = {"Authorization": f"Bearer {self.access_token}"}
+
+            async with self.session.get(tasks_url, headers=headers) as response:
+                if response.status == 401:
+                    await self._refresh_token()
+                    headers["Authorization"] = f"Bearer {self.access_token}"
+                    async with self.session.get(tasks_url, headers=headers) as retry_response:
+                        retry_response.raise_for_status()
+                        data = await retry_response.json()
+                else:
+                    response.raise_for_status()
+                    data = await response.json()
+
+            tasks = data.get("tasks", {})
+            models = [f"ZeroGPU Chat API - {task}: {info.get('model', 'N/A')}"
+                      for task, info in tasks.items()]
+            return models if models else ["ZeroGPU Chat API"]
+        except Exception as e:
+            logger.error(f"Failed to get available models: {e}")
+            return ["ZeroGPU Chat API"]
 
    async def health_check(self):
-        """Perform health check on Novita AI API"""
+        """Perform health check on ZeroGPU Chat API"""
        try:
-            # Test API with a simple request
-            test_response = self.novita_client.chat.completions.create(
-                model=self.settings.novita_model,
-                messages=[{"role": "user", "content": "test"}],
-                max_tokens=5
-            )
+            if not self.session:
+                self.session = aiohttp.ClientSession()
 
-            return {
-                "provider": "novita_api",
-                "status": "healthy",
-                "model": self.settings.novita_model,
-                "base_url": self.settings.novita_base_url
-            }
+            # Check health endpoint (no auth required)
+            health_url = f"{self.base_url}/health"
+            async with self.session.get(health_url) as response:
+                response.raise_for_status()
+                data = await response.json()
+
+            return {
+                "provider": "zerogpu_chat_api",
+                "status": "healthy" if data.get("status") == "healthy" else "unhealthy",
+                "models_ready": data.get("models_ready", False),
+                "base_url": self.base_url
+            }
        except Exception as e:
            logger.error(f"Health check failed: {e}")
            return {
-                "provider": "novita_api",
+                "provider": "zerogpu_chat_api",
                "status": "unhealthy",
                "error": str(e)
            }
 
+    async def __aenter__(self):
+        """Async context manager entry"""
+        if not self.session:
+            self.session = aiohttp.ClientSession()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit"""
+        if self.session:
+            await self.session.close()
+            self.session = None
+
    def prepare_context_for_llm(self, raw_context: Dict, max_tokens: Optional[int] = None,
                                user_input: Optional[str] = None) -> str:
        """