Commit b445296
Parent(s): fad63bf
feat: Make ZeroGPU API first priority for inference
- Auto-enable ZeroGPU if credentials are provided (no need for USE_ZERO_GPU=true)
- ZeroGPU is always tried FIRST before local models or HF API
- Keep trying ZeroGPU even if initialization had warnings
- Update routing logic to check for client existence, not just use_zero_gpu flag
- Improve logging to clearly indicate ZeroGPU as PRIMARY inference method
- Update all initialization logs to show inference priority order
- Only fall back to local models or HF API if ZeroGPU actually fails
- Explicitly disable with USE_ZERO_GPU=false if needed
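
The routing behavior described above reduces to a three-tier fallback chain. A minimal, self-contained sketch of that order (illustrative only; call_zero_gpu, call_local, and call_hf_api are hypothetical stand-ins for the real backend clients):

import asyncio
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("router-sketch")

# Hypothetical stand-ins for the real backend clients.
async def call_zero_gpu(prompt: str) -> str:
    raise ConnectionError("ZeroGPU endpoint unreachable")  # simulate an outage

async def call_local(prompt: str) -> str:
    return f"[local] {prompt}"

async def call_hf_api(prompt: str) -> str:
    return f"[hf-api] {prompt}"

async def route(prompt: str, zero_gpu_configured: bool = True) -> str:
    # Priority 1: ZeroGPU is tried first whenever a client exists,
    # even if initialization only produced warnings.
    if zero_gpu_configured:
        try:
            result = await call_zero_gpu(prompt)
            if result and result.strip():  # require a non-empty result
                return result
            logger.warning("ZeroGPU returned empty result, falling back")
        except Exception as exc:
            logger.warning("ZeroGPU failed (%s), trying local models", exc)
    # Priority 2: local models, loaded lazily only on fallback.
    try:
        return await call_local(prompt)
    except Exception as exc:
        logger.warning("Local model failed (%s), trying HF API", exc)
    # Priority 3: HF Inference API as the final fallback.
    return await call_hf_api(prompt)

print(asyncio.run(route("hello")))  # prints "[local] hello" after ZeroGPU fails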
- app.py +5 -3
- config.py +8 -2
- flask_api_standalone.py +5 -3
- main.py +23 -7
- src/llm_router.py +33 -21
app.py CHANGED
@@ -2071,14 +2071,16 @@ def initialize_orchestrator():
     logger.info("[ORCHESTRATION STEP 1/6] Initializing LLM Router...")
     logger.info(" → Checking inference backend configuration...")
     if zero_gpu_config and zero_gpu_config.get("enabled"):
-        logger.info(f" → ZeroGPU API: {zero_gpu_config.get('base_url', 'N/A')}")
+        logger.info(f" → ZeroGPU API: {zero_gpu_config.get('base_url', 'N/A')} [PRIMARY - FIRST PRIORITY]")
         if zero_gpu_config.get("per_user_mode"):
             logger.info(" → Mode: Per-user (multi-tenant)")
         else:
             logger.info(" → Mode: Service account (single-tenant)")
+        logger.info(" → Inference priority: ZeroGPU API → Local Models → HF API")
     else:
-        logger.
-
+        logger.warning(" → ZeroGPU API: Not configured (using local/HF fallback)")
+        logger.info(" → To enable: Set ZERO_GPU_EMAIL and ZERO_GPU_PASSWORD")
+        logger.info(" → Local models: Enabled (lazy loading - fallback only)")
     llm_router = LLMRouter(hf_token, use_local_models=True, zero_gpu_config=zero_gpu_config)
     logger.info(" → LLM Router initialized")
     logger.info(" → Inference routing configured")
config.py CHANGED
@@ -38,8 +38,8 @@ class Settings(BaseSettings):
     log_format: str = os.getenv("LOG_FORMAT", "json")

     # ZeroGPU API settings
-
-    #
+    # Auto-enable if credentials are provided (ZeroGPU is first priority)
+    # Can be explicitly disabled with USE_ZERO_GPU=false
     zero_gpu_base_url: str = os.getenv("ZERO_GPU_API_URL", "https://bm9njt1ypzvuqw-8000.proxy.runpod.net")
     zero_gpu_email: str = os.getenv("ZERO_GPU_EMAIL", "")
     zero_gpu_password: str = os.getenv("ZERO_GPU_PASSWORD", "")
@@ -48,6 +48,12 @@ class Settings(BaseSettings):
     zero_gpu_admin_email: str = os.getenv("ZERO_GPU_ADMIN_EMAIL", "")
     zero_gpu_admin_password: str = os.getenv("ZERO_GPU_ADMIN_PASSWORD", "")

+    # Auto-enable ZeroGPU if credentials are provided (unless explicitly disabled)
+    _explicit_disable = os.getenv("USE_ZERO_GPU", "").lower() == "false"
+    _has_service_creds = zero_gpu_email and zero_gpu_password
+    _has_per_user_creds = zero_gpu_per_user_mode and zero_gpu_admin_email and zero_gpu_admin_password
+    zero_gpu_enabled: bool = not _explicit_disable and (_has_service_creds or _has_per_user_creds)
+
     class Config:
         env_file = ".env"
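
The new zero_gpu_enabled flag is pure boolean logic over the process environment. A standalone re-expression of the same rule (illustrative; it mirrors the Settings fields rather than importing them, and reuses the ZERO_GPU_PER_USER_MODE variable that main.py reads):

import os
from typing import Mapping

def zero_gpu_enabled(env: Mapping[str, str] = os.environ) -> bool:
    # Mirrors the auto-enable rule added to config.py above.
    explicit_disable = env.get("USE_ZERO_GPU", "").lower() == "false"
    per_user = env.get("ZERO_GPU_PER_USER_MODE", "false").lower() == "true"
    has_service_creds = bool(env.get("ZERO_GPU_EMAIL") and env.get("ZERO_GPU_PASSWORD"))
    has_per_user_creds = per_user and bool(
        env.get("ZERO_GPU_ADMIN_EMAIL") and env.get("ZERO_GPU_ADMIN_PASSWORD")
    )
    return not explicit_disable and (has_service_creds or has_per_user_creds)

# Credentials alone enable ZeroGPU; USE_ZERO_GPU=true is no longer needed.
assert zero_gpu_enabled({"ZERO_GPU_EMAIL": "a@b.c", "ZERO_GPU_PASSWORD": "pw"})
# The explicit opt-out always wins.
assert not zero_gpu_enabled({"ZERO_GPU_EMAIL": "a@b.c", "ZERO_GPU_PASSWORD": "pw",
                             "USE_ZERO_GPU": "false"})
# No credentials, no ZeroGPU.
assert not zero_gpu_enabled({})

Note that, as written in the diff, the class-body assignments read os.getenv at import time, so the enable decision is fixed when config.py is first imported.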
flask_api_standalone.py CHANGED
@@ -96,10 +96,12 @@ def initialize_orchestrator():
     logger.info("[FLASK API STEP 1/5] Initializing LLM Router...")
     logger.info(" → Configuring inference backend...")
     if zero_gpu_config and zero_gpu_config.get("enabled"):
-        logger.info(f" → ZeroGPU API: {zero_gpu_config.get('base_url', 'N/A')}")
+        logger.info(f" → ZeroGPU API: {zero_gpu_config.get('base_url', 'N/A')} [PRIMARY - FIRST PRIORITY]")
+        logger.info(" → Inference priority: ZeroGPU API → Local Models → HF API")
     else:
-        logger.
-
+        logger.warning(" → ZeroGPU API: Not configured (using local/HF fallback)")
+        logger.info(" → To enable: Set ZERO_GPU_EMAIL and ZERO_GPU_PASSWORD")
+        logger.info(" → Local models: Enabled (lazy loading - fallback only)")
     llm_router = LLMRouter(hf_token, use_local_models=True, zero_gpu_config=zero_gpu_config)
     logger.info(" → LLM Router initialized")
main.py CHANGED
@@ -181,16 +181,32 @@ def main():
     else:
         logger.warning(" → HF_TOKEN not found - some features may be limited")

-    # Check ZeroGPU configuration
-
-
-
+    # Check ZeroGPU configuration (auto-enabled if credentials provided)
+    zero_gpu_url = os.getenv('ZERO_GPU_API_URL', '')
+    zero_gpu_email = os.getenv('ZERO_GPU_EMAIL', '')
+    zero_gpu_password = os.getenv('ZERO_GPU_PASSWORD', '')
+    zero_gpu_admin_email = os.getenv('ZERO_GPU_ADMIN_EMAIL', '')
+    zero_gpu_admin_password = os.getenv('ZERO_GPU_ADMIN_PASSWORD', '')
+    zero_gpu_per_user = os.getenv('ZERO_GPU_PER_USER_MODE', 'false').lower() == 'true'
+
+    has_zero_gpu_creds = (zero_gpu_email and zero_gpu_password) or (zero_gpu_per_user and zero_gpu_admin_email and zero_gpu_admin_password)
+    explicit_disable = os.getenv('USE_ZERO_GPU', '').lower() == 'false'
+
+    if explicit_disable:
+        logger.info(" → ZeroGPU API explicitly disabled (USE_ZERO_GPU=false)")
+    elif has_zero_gpu_creds:
         if zero_gpu_url:
-            logger.info(f" → ZeroGPU API enabled: {zero_gpu_url}")
+            logger.info(f" → ZeroGPU API enabled: {zero_gpu_url} [PRIMARY - FIRST PRIORITY]")
+            if zero_gpu_per_user:
+                logger.info(" → Mode: Per-user (multi-tenant)")
+            else:
+                logger.info(" → Mode: Service account (single-tenant)")
+            logger.info(" → Inference priority: ZeroGPU API → Local Models → HF API")
         else:
-            logger.warning(" → ZeroGPU
+            logger.warning(" → ZeroGPU credentials provided but URL not configured")
     else:
-        logger.info("
+        logger.info(" → ZeroGPU API not configured (using local/HF fallback)")
+        logger.info(" → To enable: Set ZERO_GPU_EMAIL and ZERO_GPU_PASSWORD")

     logger.info(" → Environment check complete")
src/llm_router.py CHANGED
@@ -45,6 +45,7 @@ class LLMRouter:
                 self.use_zero_gpu = True
                 self.zero_gpu_mode = "per_user"
                 logger.info("✓ ZeroGPU per-user mode enabled (multi-tenant)")
+                logger.info(" → ZeroGPU API is PRIMARY inference method (first priority)")
             else:
                 logger.warning("ZeroGPU per-user mode enabled but admin credentials not provided")
         except ImportError:
@@ -67,15 +68,19 @@ class LLMRouter:
                 self.use_zero_gpu = True
                 self.zero_gpu_mode = "service_account"
                 logger.info("✓ ZeroGPU API client initialized (service account mode)")
+                logger.info(" → ZeroGPU API is PRIMARY inference method (first priority)")

-                #
+                # Check API readiness (non-blocking, will still try during inference)
                 try:
-                    if
-                        logger.
-
+                    if self.zero_gpu_client.wait_for_ready(timeout=10):
+                        logger.info(" → ZeroGPU API is ready")
+                    else:
+                        logger.warning(" → ZeroGPU API not ready yet (will retry during inference)")
+                        # Keep use_zero_gpu=True - we'll try it first anyway
                 except Exception as e:
-                    logger.warning(f"Could not verify ZeroGPU API readiness: {e}
-
+                    logger.warning(f" → Could not verify ZeroGPU API readiness: {e}")
+                    logger.info(" → Will still attempt ZeroGPU first during inference")
+                    # Keep use_zero_gpu=True - we'll try it first anyway
             else:
                 logger.warning("ZeroGPU enabled but credentials not provided")
         except ImportError:
@@ -101,7 +106,10 @@ class LLMRouter:
     async def route_inference(self, task_type: str, prompt: str, context: Optional[List[Dict]] = None, user_id: Optional[str] = None, **kwargs):
         """
         Smart routing based on task specialization
-
+        PRIORITY ORDER (ZeroGPU is FIRST):
+          1. ZeroGPU API (PRIMARY - always tried first if configured)
+          2. Local models (fallback - lazy loading, only if ZeroGPU fails)
+          3. HF Inference API (final fallback - only if both above fail)

         Args:
             task_type: Task type (e.g., "intent_classification", "general_reasoning")
@@ -114,40 +122,44 @@ class LLMRouter:
         model_config = self._select_model(task_type)
         logger.info(f"Selected model: {model_config['model_id']}")

-        # Try ZeroGPU API first (
-        if
+        # PRIORITY 1: Try ZeroGPU API first (PRIMARY inference method)
+        # Always try if client is configured, even if initialization had warnings
+        if self.zero_gpu_client or self.zero_gpu_user_manager:
+            logger.info("→ Attempting ZeroGPU API (PRIMARY inference method)...")
            try:
                result = await self._call_zero_gpu_endpoint(task_type, prompt, context, user_id, **kwargs)
-                if result is not None:
-                    logger.info(f"Inference complete for {task_type} (ZeroGPU API)")
+                if result is not None and result.strip():  # Check for non-empty result
+                    logger.info(f"✓ Inference complete for {task_type} (ZeroGPU API - PRIMARY)")
                    return result
                else:
-                    logger.warning("ZeroGPU API returned
+                    logger.warning("ZeroGPU API returned empty result, falling back to local models")
            except Exception as e:
-                logger.warning(f"ZeroGPU API inference failed: {e}
+                logger.warning(f"ZeroGPU API inference failed: {e}")
+                logger.info("→ Falling back to local models (lazy loading)...")
                logger.debug("Exception details:", exc_info=True)

-        # Fallback to local models (lazy loading - only if ZeroGPU fails)
+        # PRIORITY 2: Fallback to local models (lazy loading - only if ZeroGPU fails)
         if self.use_local_models and self.local_loader:
            try:
-                logger.info("
+                logger.info("→ Loading local model as fallback (ZeroGPU unavailable)...")
                # Handle embedding generation separately
                if task_type == "embedding_generation":
                    result = await self._call_local_embedding(model_config, prompt, **kwargs)
                else:
                    result = await self._call_local_model(model_config, prompt, task_type, **kwargs)

-                if result is not None:
-                    logger.info(f"Inference complete for {task_type} (local model fallback)")
+                if result is not None and result.strip():  # Check for non-empty result
+                    logger.info(f"✓ Inference complete for {task_type} (local model fallback)")
                    return result
                else:
-                    logger.warning("Local model returned
+                    logger.warning("Local model returned empty result, falling back to HF API")
            except Exception as e:
-                logger.warning(f"Local model inference failed: {e}
+                logger.warning(f"Local model inference failed: {e}")
+                logger.info("→ Falling back to HF Inference API (final fallback)...")
                logger.debug("Exception details:", exc_info=True)

-        # Final fallback to HF Inference API
-        logger.info("Using HF Inference API as final fallback")
+        # PRIORITY 3: Final fallback to HF Inference API (only if ZeroGPU and local models fail)
+        logger.info("→ Using HF Inference API as final fallback...")
         # Health check and fallback logic
         if not await self._is_model_healthy(model_config["model_id"]):
             logger.warning(f"Model unhealthy, using fallback")