Commit b445296
Parent(s): fad63bf
feat: Make ZeroGPU API first priority for inference
- Auto-enable ZeroGPU if credentials are provided (no need for USE_ZERO_GPU=true)
- ZeroGPU is always tried FIRST before local models or HF API
- Keep trying ZeroGPU even if initialization had warnings
- Update routing logic to check for client existence, not just use_zero_gpu flag
- Improve logging to clearly indicate ZeroGPU as PRIMARY inference method
- Update all initialization logs to show inference priority order
- Only fall back to local models or HF API if ZeroGPU actually fails
- Explicitly disable with USE_ZERO_GPU=false if needed
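
The routing behavior described above reduces to a three-tier fallback chain. A minimal, self-contained sketch of that order (illustrative only; call_zero_gpu, call_local, and call_hf_api are hypothetical stand-ins for the real backend clients):

import asyncio
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("router-sketch")

# Hypothetical stand-ins for the real backend clients.
async def call_zero_gpu(prompt: str) -> str:
    raise ConnectionError("ZeroGPU endpoint unreachable")  # simulate an outage

async def call_local(prompt: str) -> str:
    return f"[local] {prompt}"

async def call_hf_api(prompt: str) -> str:
    return f"[hf-api] {prompt}"

async def route(prompt: str, zero_gpu_configured: bool = True) -> str:
    # Priority 1: ZeroGPU is tried first whenever a client exists,
    # even if initialization only produced warnings.
    if zero_gpu_configured:
        try:
            result = await call_zero_gpu(prompt)
            if result and result.strip():  # require a non-empty result
                return result
            logger.warning("ZeroGPU returned empty result, falling back")
        except Exception as exc:
            logger.warning("ZeroGPU failed (%s), trying local models", exc)
    # Priority 2: local models, loaded lazily only on fallback.
    try:
        return await call_local(prompt)
    except Exception as exc:
        logger.warning("Local model failed (%s), trying HF API", exc)
    # Priority 3: HF Inference API as the final fallback.
    return await call_hf_api(prompt)

print(asyncio.run(route("hello")))  # prints "[local] hello" after ZeroGPU fails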
- app.py +5 -3
- config.py +8 -2
- flask_api_standalone.py +5 -3
- main.py +23 -7
- src/llm_router.py +33 -21
app.py CHANGED
@@ -2071,14 +2071,16 @@ def initialize_orchestrator():
     logger.info("[ORCHESTRATION STEP 1/6] Initializing LLM Router...")
     logger.info(" → Checking inference backend configuration...")
     if zero_gpu_config and zero_gpu_config.get("enabled"):
-        logger.info(f" → ZeroGPU API: {zero_gpu_config.get('base_url', 'N/A')}")
+        logger.info(f" → ZeroGPU API: {zero_gpu_config.get('base_url', 'N/A')} [PRIMARY - FIRST PRIORITY]")
         if zero_gpu_config.get("per_user_mode"):
             logger.info(" → Mode: Per-user (multi-tenant)")
         else:
             logger.info(" → Mode: Service account (single-tenant)")
+        logger.info(" → Inference priority: ZeroGPU API → Local Models → HF API")
     else:
-        logger.
-
+        logger.warning(" → ZeroGPU API: Not configured (using local/HF fallback)")
+        logger.info(" → To enable: Set ZERO_GPU_EMAIL and ZERO_GPU_PASSWORD")
+        logger.info(" → Local models: Enabled (lazy loading - fallback only)")
     llm_router = LLMRouter(hf_token, use_local_models=True, zero_gpu_config=zero_gpu_config)
     logger.info(" → LLM Router initialized")
     logger.info(" → Inference routing configured")
config.py CHANGED
@@ -38,8 +38,8 @@ class Settings(BaseSettings):
     log_format: str = os.getenv("LOG_FORMAT", "json")

     # ZeroGPU API settings
-
-    #
+    # Auto-enable if credentials are provided (ZeroGPU is first priority)
+    # Can be explicitly disabled with USE_ZERO_GPU=false
     zero_gpu_base_url: str = os.getenv("ZERO_GPU_API_URL", "https://bm9njt1ypzvuqw-8000.proxy.runpod.net")
     zero_gpu_email: str = os.getenv("ZERO_GPU_EMAIL", "")
     zero_gpu_password: str = os.getenv("ZERO_GPU_PASSWORD", "")
@@ -48,6 +48,12 @@ class Settings(BaseSettings):
     zero_gpu_admin_email: str = os.getenv("ZERO_GPU_ADMIN_EMAIL", "")
     zero_gpu_admin_password: str = os.getenv("ZERO_GPU_ADMIN_PASSWORD", "")

+    # Auto-enable ZeroGPU if credentials are provided (unless explicitly disabled)
+    _explicit_disable = os.getenv("USE_ZERO_GPU", "").lower() == "false"
+    _has_service_creds = zero_gpu_email and zero_gpu_password
+    _has_per_user_creds = zero_gpu_per_user_mode and zero_gpu_admin_email and zero_gpu_admin_password
+    zero_gpu_enabled: bool = not _explicit_disable and (_has_service_creds or _has_per_user_creds)
+
     class Config:
         env_file = ".env"
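
The new zero_gpu_enabled flag is pure boolean logic over the process environment. A standalone re-expression of the same rule (illustrative; it mirrors the Settings fields rather than importing them, and reuses the ZERO_GPU_PER_USER_MODE variable that main.py reads):

import os
from typing import Mapping

def zero_gpu_enabled(env: Mapping[str, str] = os.environ) -> bool:
    # Mirrors the auto-enable rule added to config.py above.
    explicit_disable = env.get("USE_ZERO_GPU", "").lower() == "false"
    per_user = env.get("ZERO_GPU_PER_USER_MODE", "false").lower() == "true"
    has_service_creds = bool(env.get("ZERO_GPU_EMAIL") and env.get("ZERO_GPU_PASSWORD"))
    has_per_user_creds = per_user and bool(
        env.get("ZERO_GPU_ADMIN_EMAIL") and env.get("ZERO_GPU_ADMIN_PASSWORD")
    )
    return not explicit_disable and (has_service_creds or has_per_user_creds)

# Credentials alone enable ZeroGPU; USE_ZERO_GPU=true is no longer needed.
assert zero_gpu_enabled({"ZERO_GPU_EMAIL": "a@b.c", "ZERO_GPU_PASSWORD": "pw"})
# The explicit opt-out always wins.
assert not zero_gpu_enabled({"ZERO_GPU_EMAIL": "a@b.c", "ZERO_GPU_PASSWORD": "pw",
                             "USE_ZERO_GPU": "false"})
# No credentials, no ZeroGPU.
assert not zero_gpu_enabled({})

Note that, as written in the diff, the class-body assignments read os.getenv at import time, so the enable decision is fixed when config.py is first imported.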
flask_api_standalone.py CHANGED
@@ -96,10 +96,12 @@ def initialize_orchestrator():
     logger.info("[FLASK API STEP 1/5] Initializing LLM Router...")
     logger.info(" → Configuring inference backend...")
     if zero_gpu_config and zero_gpu_config.get("enabled"):
-        logger.info(f" → ZeroGPU API: {zero_gpu_config.get('base_url', 'N/A')}")
+        logger.info(f" → ZeroGPU API: {zero_gpu_config.get('base_url', 'N/A')} [PRIMARY - FIRST PRIORITY]")
+        logger.info(" → Inference priority: ZeroGPU API → Local Models → HF API")
     else:
-        logger.
-
+        logger.warning(" → ZeroGPU API: Not configured (using local/HF fallback)")
+        logger.info(" → To enable: Set ZERO_GPU_EMAIL and ZERO_GPU_PASSWORD")
+        logger.info(" → Local models: Enabled (lazy loading - fallback only)")
     llm_router = LLMRouter(hf_token, use_local_models=True, zero_gpu_config=zero_gpu_config)
     logger.info(" → LLM Router initialized")
main.py CHANGED
@@ -181,16 +181,32 @@ def main():
     else:
         logger.warning(" → HF_TOKEN not found - some features may be limited")

-    # Check ZeroGPU configuration
-
-
-
+    # Check ZeroGPU configuration (auto-enabled if credentials provided)
+    zero_gpu_url = os.getenv('ZERO_GPU_API_URL', '')
+    zero_gpu_email = os.getenv('ZERO_GPU_EMAIL', '')
+    zero_gpu_password = os.getenv('ZERO_GPU_PASSWORD', '')
+    zero_gpu_admin_email = os.getenv('ZERO_GPU_ADMIN_EMAIL', '')
+    zero_gpu_admin_password = os.getenv('ZERO_GPU_ADMIN_PASSWORD', '')
+    zero_gpu_per_user = os.getenv('ZERO_GPU_PER_USER_MODE', 'false').lower() == 'true'
+
+    has_zero_gpu_creds = (zero_gpu_email and zero_gpu_password) or (zero_gpu_per_user and zero_gpu_admin_email and zero_gpu_admin_password)
+    explicit_disable = os.getenv('USE_ZERO_GPU', '').lower() == 'false'
+
+    if explicit_disable:
+        logger.info(" → ZeroGPU API explicitly disabled (USE_ZERO_GPU=false)")
+    elif has_zero_gpu_creds:
         if zero_gpu_url:
-            logger.info(f" → ZeroGPU API enabled: {zero_gpu_url}")
+            logger.info(f" → ZeroGPU API enabled: {zero_gpu_url} [PRIMARY - FIRST PRIORITY]")
+            if zero_gpu_per_user:
+                logger.info(" → Mode: Per-user (multi-tenant)")
+            else:
+                logger.info(" → Mode: Service account (single-tenant)")
+            logger.info(" → Inference priority: ZeroGPU API → Local Models → HF API")
         else:
-            logger.warning(" → ZeroGPU
+            logger.warning(" → ZeroGPU credentials provided but URL not configured")
     else:
-        logger.info("
+        logger.info(" → ZeroGPU API not configured (using local/HF fallback)")
+        logger.info(" → To enable: Set ZERO_GPU_EMAIL and ZERO_GPU_PASSWORD")

     logger.info(" → Environment check complete")
src/llm_router.py CHANGED
@@ -45,6 +45,7 @@ class LLMRouter:
                 self.use_zero_gpu = True
                 self.zero_gpu_mode = "per_user"
                 logger.info("✓ ZeroGPU per-user mode enabled (multi-tenant)")
+                logger.info(" → ZeroGPU API is PRIMARY inference method (first priority)")
             else:
                 logger.warning("ZeroGPU per-user mode enabled but admin credentials not provided")
         except ImportError:
@@ -67,15 +68,19 @@ class LLMRouter:
                 self.use_zero_gpu = True
                 self.zero_gpu_mode = "service_account"
                 logger.info("✓ ZeroGPU API client initialized (service account mode)")
+                logger.info(" → ZeroGPU API is PRIMARY inference method (first priority)")

-                #
+                # Check API readiness (non-blocking, will still try during inference)
                 try:
-                    if
-                        logger.
-
+                    if self.zero_gpu_client.wait_for_ready(timeout=10):
+                        logger.info(" → ZeroGPU API is ready")
+                    else:
+                        logger.warning(" → ZeroGPU API not ready yet (will retry during inference)")
+                        # Keep use_zero_gpu=True - we'll try it first anyway
                 except Exception as e:
-                    logger.warning(f"Could not verify ZeroGPU API readiness: {e}
-
+                    logger.warning(f" → Could not verify ZeroGPU API readiness: {e}")
+                    logger.info(" → Will still attempt ZeroGPU first during inference")
+                    # Keep use_zero_gpu=True - we'll try it first anyway
             else:
                 logger.warning("ZeroGPU enabled but credentials not provided")
         except ImportError:
@@ -101,7 +106,10 @@ class LLMRouter:
     async def route_inference(self, task_type: str, prompt: str, context: Optional[List[Dict]] = None, user_id: Optional[str] = None, **kwargs):
         """
         Smart routing based on task specialization
-
+        PRIORITY ORDER (ZeroGPU is FIRST):
+          1. ZeroGPU API (PRIMARY - always tried first if configured)
+          2. Local models (fallback - lazy loading, only if ZeroGPU fails)
+          3. HF Inference API (final fallback - only if both above fail)

         Args:
             task_type: Task type (e.g., "intent_classification", "general_reasoning")
@@ -114,40 +122,44 @@ class LLMRouter:
         model_config = self._select_model(task_type)
         logger.info(f"Selected model: {model_config['model_id']}")

-        # Try ZeroGPU API first (
-        if
+        # PRIORITY 1: Try ZeroGPU API first (PRIMARY inference method)
+        # Always try if client is configured, even if initialization had warnings
+        if self.zero_gpu_client or self.zero_gpu_user_manager:
+            logger.info("→ Attempting ZeroGPU API (PRIMARY inference method)...")
            try:
                result = await self._call_zero_gpu_endpoint(task_type, prompt, context, user_id, **kwargs)
-                if result is not None:
-                    logger.info(f"Inference complete for {task_type} (ZeroGPU API)")
+                if result is not None and result.strip():  # Check for non-empty result
+                    logger.info(f"✓ Inference complete for {task_type} (ZeroGPU API - PRIMARY)")
                    return result
                else:
-                    logger.warning("ZeroGPU API returned
+                    logger.warning("ZeroGPU API returned empty result, falling back to local models")
            except Exception as e:
-                logger.warning(f"ZeroGPU API inference failed: {e}
+                logger.warning(f"ZeroGPU API inference failed: {e}")
+                logger.info("→ Falling back to local models (lazy loading)...")
                logger.debug("Exception details:", exc_info=True)

-        # Fallback to local models (lazy loading - only if ZeroGPU fails)
+        # PRIORITY 2: Fallback to local models (lazy loading - only if ZeroGPU fails)
         if self.use_local_models and self.local_loader:
            try:
-                logger.info("
+                logger.info("→ Loading local model as fallback (ZeroGPU unavailable)...")
                # Handle embedding generation separately
                if task_type == "embedding_generation":
                    result = await self._call_local_embedding(model_config, prompt, **kwargs)
                else:
                    result = await self._call_local_model(model_config, prompt, task_type, **kwargs)

-                if result is not None:
-                    logger.info(f"Inference complete for {task_type} (local model fallback)")
+                if result is not None and result.strip():  # Check for non-empty result
+                    logger.info(f"✓ Inference complete for {task_type} (local model fallback)")
                    return result
                else:
-                    logger.warning("Local model returned
+                    logger.warning("Local model returned empty result, falling back to HF API")
            except Exception as e:
-                logger.warning(f"Local model inference failed: {e}
+                logger.warning(f"Local model inference failed: {e}")
+                logger.info("→ Falling back to HF Inference API (final fallback)...")
                logger.debug("Exception details:", exc_info=True)

-        # Final fallback to HF Inference API
-        logger.info("Using HF Inference API as final fallback")
+        # PRIORITY 3: Final fallback to HF Inference API (only if ZeroGPU and local models fail)
+        logger.info("→ Using HF Inference API as final fallback...")
         # Health check and fallback logic
         if not await self._is_model_healthy(model_config["model_id"]):
             logger.warning(f"Model unhealthy, using fallback")