JatsTheAIGen commited on
Commit
b445296
·
1 Parent(s): fad63bf

feat: Make ZeroGPU API first priority for inference

Browse files

- Auto-enable ZeroGPU if credentials are provided (no need for USE_ZERO_GPU=true)
- ZeroGPU is always tried FIRST before local models or HF API
- Keep trying ZeroGPU even if initialization had warnings
- Update routing logic to check for client existence, not just use_zero_gpu flag
- Improve logging to clearly indicate ZeroGPU as PRIMARY inference method
- Update all initialization logs to show inference priority order
- Only fallback to local models or HF API if ZeroGPU actually fails
- Explicitly disable with USE_ZERO_GPU=false if needed

Files changed (5) hide show
  1. app.py +5 -3
  2. config.py +8 -2
  3. flask_api_standalone.py +5 -3
  4. main.py +23 -7
  5. src/llm_router.py +33 -21
app.py CHANGED
@@ -2071,14 +2071,16 @@ def initialize_orchestrator():
2071
  logger.info("[ORCHESTRATION STEP 1/6] Initializing LLM Router...")
2072
  logger.info(" → Checking inference backend configuration...")
2073
  if zero_gpu_config and zero_gpu_config.get("enabled"):
2074
- logger.info(f" → ZeroGPU API: {zero_gpu_config.get('base_url', 'N/A')}")
2075
  if zero_gpu_config.get("per_user_mode"):
2076
  logger.info(" → Mode: Per-user (multi-tenant)")
2077
  else:
2078
  logger.info(" → Mode: Service account (single-tenant)")
 
2079
  else:
2080
- logger.info(" → ZeroGPU API: Disabled (using local/HF fallback)")
2081
- logger.info(" → Local models: Enabled (lazy loading)")
 
2082
  llm_router = LLMRouter(hf_token, use_local_models=True, zero_gpu_config=zero_gpu_config)
2083
  logger.info(" ✓ LLM Router initialized")
2084
  logger.info(" ✓ Inference routing configured")
 
2071
  logger.info("[ORCHESTRATION STEP 1/6] Initializing LLM Router...")
2072
  logger.info(" → Checking inference backend configuration...")
2073
  if zero_gpu_config and zero_gpu_config.get("enabled"):
2074
+ logger.info(f" → ZeroGPU API: {zero_gpu_config.get('base_url', 'N/A')} [PRIMARY - FIRST PRIORITY]")
2075
  if zero_gpu_config.get("per_user_mode"):
2076
  logger.info(" → Mode: Per-user (multi-tenant)")
2077
  else:
2078
  logger.info(" → Mode: Service account (single-tenant)")
2079
+ logger.info(" → Inference priority: ZeroGPU API → Local Models → HF API")
2080
  else:
2081
+ logger.warning(" ⚠ ZeroGPU API: Not configured (using local/HF fallback)")
2082
+ logger.info(" → To enable: Set ZERO_GPU_EMAIL and ZERO_GPU_PASSWORD")
2083
+ logger.info(" → Local models: Enabled (lazy loading - fallback only)")
2084
  llm_router = LLMRouter(hf_token, use_local_models=True, zero_gpu_config=zero_gpu_config)
2085
  logger.info(" ✓ LLM Router initialized")
2086
  logger.info(" ✓ Inference routing configured")
config.py CHANGED
@@ -38,8 +38,8 @@ class Settings(BaseSettings):
38
  log_format: str = os.getenv("LOG_FORMAT", "json")
39
 
40
  # ZeroGPU API settings
41
- zero_gpu_enabled: bool = os.getenv("USE_ZERO_GPU", "false").lower() == "true"
42
- # Default to Runpod proxy URL format: https://<pod-id>-8000.proxy.runpod.net
43
  zero_gpu_base_url: str = os.getenv("ZERO_GPU_API_URL", "https://bm9njt1ypzvuqw-8000.proxy.runpod.net")
44
  zero_gpu_email: str = os.getenv("ZERO_GPU_EMAIL", "")
45
  zero_gpu_password: str = os.getenv("ZERO_GPU_PASSWORD", "")
@@ -48,6 +48,12 @@ class Settings(BaseSettings):
48
  zero_gpu_admin_email: str = os.getenv("ZERO_GPU_ADMIN_EMAIL", "")
49
  zero_gpu_admin_password: str = os.getenv("ZERO_GPU_ADMIN_PASSWORD", "")
50
 
 
 
 
 
 
 
51
  class Config:
52
  env_file = ".env"
53
 
 
38
  log_format: str = os.getenv("LOG_FORMAT", "json")
39
 
40
  # ZeroGPU API settings
41
+ # Auto-enable if credentials are provided (ZeroGPU is first priority)
42
+ # Can be explicitly disabled with USE_ZERO_GPU=false
43
  zero_gpu_base_url: str = os.getenv("ZERO_GPU_API_URL", "https://bm9njt1ypzvuqw-8000.proxy.runpod.net")
44
  zero_gpu_email: str = os.getenv("ZERO_GPU_EMAIL", "")
45
  zero_gpu_password: str = os.getenv("ZERO_GPU_PASSWORD", "")
 
48
  zero_gpu_admin_email: str = os.getenv("ZERO_GPU_ADMIN_EMAIL", "")
49
  zero_gpu_admin_password: str = os.getenv("ZERO_GPU_ADMIN_PASSWORD", "")
50
 
51
+ # Auto-enable ZeroGPU if credentials are provided (unless explicitly disabled)
52
+ _explicit_disable = os.getenv("USE_ZERO_GPU", "").lower() == "false"
53
+ _has_service_creds = zero_gpu_email and zero_gpu_password
54
+ _has_per_user_creds = zero_gpu_per_user_mode and zero_gpu_admin_email and zero_gpu_admin_password
55
+ zero_gpu_enabled: bool = not _explicit_disable and (_has_service_creds or _has_per_user_creds)
56
+
57
  class Config:
58
  env_file = ".env"
59
 
flask_api_standalone.py CHANGED
@@ -96,10 +96,12 @@ def initialize_orchestrator():
96
  logger.info("[FLASK API STEP 1/5] Initializing LLM Router...")
97
  logger.info(" → Configuring inference backend...")
98
  if zero_gpu_config and zero_gpu_config.get("enabled"):
99
- logger.info(f" → ZeroGPU API: {zero_gpu_config.get('base_url', 'N/A')}")
 
100
  else:
101
- logger.info(" → ZeroGPU API: Disabled (using local/HF fallback)")
102
- logger.info(" → Local models: Enabled (lazy loading)")
 
103
  llm_router = LLMRouter(hf_token, use_local_models=True, zero_gpu_config=zero_gpu_config)
104
  logger.info(" ✓ LLM Router initialized")
105
 
 
96
  logger.info("[FLASK API STEP 1/5] Initializing LLM Router...")
97
  logger.info(" → Configuring inference backend...")
98
  if zero_gpu_config and zero_gpu_config.get("enabled"):
99
+ logger.info(f" → ZeroGPU API: {zero_gpu_config.get('base_url', 'N/A')} [PRIMARY - FIRST PRIORITY]")
100
+ logger.info(" → Inference priority: ZeroGPU API → Local Models → HF API")
101
  else:
102
+ logger.warning(" ⚠ ZeroGPU API: Not configured (using local/HF fallback)")
103
+ logger.info(" → To enable: Set ZERO_GPU_EMAIL and ZERO_GPU_PASSWORD")
104
+ logger.info(" → Local models: Enabled (lazy loading - fallback only)")
105
  llm_router = LLMRouter(hf_token, use_local_models=True, zero_gpu_config=zero_gpu_config)
106
  logger.info(" ✓ LLM Router initialized")
107
 
main.py CHANGED
@@ -181,16 +181,32 @@ def main():
181
  else:
182
  logger.warning(" ⚠ HF_TOKEN not found - some features may be limited")
183
 
184
- # Check ZeroGPU configuration
185
- use_zero_gpu = os.getenv('USE_ZERO_GPU', 'false').lower() == 'true'
186
- if use_zero_gpu:
187
- zero_gpu_url = os.getenv('ZERO_GPU_API_URL', '')
 
 
 
 
 
 
 
 
 
 
188
  if zero_gpu_url:
189
- logger.info(f" ✓ ZeroGPU API enabled: {zero_gpu_url}")
 
 
 
 
 
190
  else:
191
- logger.warning(" ⚠ ZeroGPU enabled but URL not configured")
192
  else:
193
- logger.info(" ✓ ZeroGPU API disabled (using local/HF fallback)")
 
194
 
195
  logger.info(" ✓ Environment check complete")
196
 
 
181
  else:
182
  logger.warning(" ⚠ HF_TOKEN not found - some features may be limited")
183
 
184
+ # Check ZeroGPU configuration (auto-enabled if credentials provided)
185
+ zero_gpu_url = os.getenv('ZERO_GPU_API_URL', '')
186
+ zero_gpu_email = os.getenv('ZERO_GPU_EMAIL', '')
187
+ zero_gpu_password = os.getenv('ZERO_GPU_PASSWORD', '')
188
+ zero_gpu_admin_email = os.getenv('ZERO_GPU_ADMIN_EMAIL', '')
189
+ zero_gpu_admin_password = os.getenv('ZERO_GPU_ADMIN_PASSWORD', '')
190
+ zero_gpu_per_user = os.getenv('ZERO_GPU_PER_USER_MODE', 'false').lower() == 'true'
191
+
192
+ has_zero_gpu_creds = (zero_gpu_email and zero_gpu_password) or (zero_gpu_per_user and zero_gpu_admin_email and zero_gpu_admin_password)
193
+ explicit_disable = os.getenv('USE_ZERO_GPU', '').lower() == 'false'
194
+
195
+ if explicit_disable:
196
+ logger.info(" ⚠ ZeroGPU API explicitly disabled (USE_ZERO_GPU=false)")
197
+ elif has_zero_gpu_creds:
198
  if zero_gpu_url:
199
+ logger.info(f" ✓ ZeroGPU API enabled: {zero_gpu_url} [PRIMARY - FIRST PRIORITY]")
200
+ if zero_gpu_per_user:
201
+ logger.info(" → Mode: Per-user (multi-tenant)")
202
+ else:
203
+ logger.info(" → Mode: Service account (single-tenant)")
204
+ logger.info(" → Inference priority: ZeroGPU API → Local Models → HF API")
205
  else:
206
+ logger.warning(" ⚠ ZeroGPU credentials provided but URL not configured")
207
  else:
208
+ logger.info(" ⚠ ZeroGPU API not configured (using local/HF fallback)")
209
+ logger.info(" → To enable: Set ZERO_GPU_EMAIL and ZERO_GPU_PASSWORD")
210
 
211
  logger.info(" ✓ Environment check complete")
212
 
src/llm_router.py CHANGED
@@ -45,6 +45,7 @@ class LLMRouter:
45
  self.use_zero_gpu = True
46
  self.zero_gpu_mode = "per_user"
47
  logger.info("✓ ZeroGPU per-user mode enabled (multi-tenant)")
 
48
  else:
49
  logger.warning("ZeroGPU per-user mode enabled but admin credentials not provided")
50
  except ImportError:
@@ -67,15 +68,19 @@ class LLMRouter:
67
  self.use_zero_gpu = True
68
  self.zero_gpu_mode = "service_account"
69
  logger.info("✓ ZeroGPU API client initialized (service account mode)")
 
70
 
71
- # Wait for API to be ready (non-blocking, will fallback if not ready)
72
  try:
73
- if not self.zero_gpu_client.wait_for_ready(timeout=10):
74
- logger.warning("ZeroGPU API not ready, will use HF fallback")
75
- self.use_zero_gpu = False
 
 
76
  except Exception as e:
77
- logger.warning(f"Could not verify ZeroGPU API readiness: {e}. Will use HF fallback.")
78
- self.use_zero_gpu = False
 
79
  else:
80
  logger.warning("ZeroGPU enabled but credentials not provided")
81
  except ImportError:
@@ -101,7 +106,10 @@ class LLMRouter:
101
  async def route_inference(self, task_type: str, prompt: str, context: Optional[List[Dict]] = None, user_id: Optional[str] = None, **kwargs):
102
  """
103
  Smart routing based on task specialization
104
- Tries ZeroGPU API first, then local models as fallback (lazy loading), then HF Inference API
 
 
 
105
 
106
  Args:
107
  task_type: Task type (e.g., "intent_classification", "general_reasoning")
@@ -114,40 +122,44 @@ class LLMRouter:
114
  model_config = self._select_model(task_type)
115
  logger.info(f"Selected model: {model_config['model_id']}")
116
 
117
- # Try ZeroGPU API first (primary path)
118
- if self.use_zero_gpu:
 
 
119
  try:
120
  result = await self._call_zero_gpu_endpoint(task_type, prompt, context, user_id, **kwargs)
121
- if result is not None:
122
- logger.info(f"Inference complete for {task_type} (ZeroGPU API)")
123
  return result
124
  else:
125
- logger.warning("ZeroGPU API returned None, falling back to local models")
126
  except Exception as e:
127
- logger.warning(f"ZeroGPU API inference failed: {e}. Falling back to local models.")
 
128
  logger.debug("Exception details:", exc_info=True)
129
 
130
- # Fallback to local models (lazy loading - only if ZeroGPU fails)
131
  if self.use_local_models and self.local_loader:
132
  try:
133
- logger.info("ZeroGPU API unavailable, loading local model as fallback...")
134
  # Handle embedding generation separately
135
  if task_type == "embedding_generation":
136
  result = await self._call_local_embedding(model_config, prompt, **kwargs)
137
  else:
138
  result = await self._call_local_model(model_config, prompt, task_type, **kwargs)
139
 
140
- if result is not None:
141
- logger.info(f"Inference complete for {task_type} (local model fallback)")
142
  return result
143
  else:
144
- logger.warning("Local model returned None, falling back to HF API")
145
  except Exception as e:
146
- logger.warning(f"Local model inference failed: {e}. Falling back to HF API.")
 
147
  logger.debug("Exception details:", exc_info=True)
148
 
149
- # Final fallback to HF Inference API
150
- logger.info("Using HF Inference API as final fallback")
151
  # Health check and fallback logic
152
  if not await self._is_model_healthy(model_config["model_id"]):
153
  logger.warning(f"Model unhealthy, using fallback")
 
45
  self.use_zero_gpu = True
46
  self.zero_gpu_mode = "per_user"
47
  logger.info("✓ ZeroGPU per-user mode enabled (multi-tenant)")
48
+ logger.info(" → ZeroGPU API is PRIMARY inference method (first priority)")
49
  else:
50
  logger.warning("ZeroGPU per-user mode enabled but admin credentials not provided")
51
  except ImportError:
 
68
  self.use_zero_gpu = True
69
  self.zero_gpu_mode = "service_account"
70
  logger.info("✓ ZeroGPU API client initialized (service account mode)")
71
+ logger.info(" → ZeroGPU API is PRIMARY inference method (first priority)")
72
 
73
+ # Check API readiness (non-blocking, will still try during inference)
74
  try:
75
+ if self.zero_gpu_client.wait_for_ready(timeout=10):
76
+ logger.info(" ✓ ZeroGPU API is ready")
77
+ else:
78
+ logger.warning(" ⚠ ZeroGPU API not ready yet (will retry during inference)")
79
+ # Keep use_zero_gpu=True - we'll try it first anyway
80
  except Exception as e:
81
+ logger.warning(f" ⚠ Could not verify ZeroGPU API readiness: {e}")
82
+ logger.info(" → Will still attempt ZeroGPU first during inference")
83
+ # Keep use_zero_gpu=True - we'll try it first anyway
84
  else:
85
  logger.warning("ZeroGPU enabled but credentials not provided")
86
  except ImportError:
 
106
  async def route_inference(self, task_type: str, prompt: str, context: Optional[List[Dict]] = None, user_id: Optional[str] = None, **kwargs):
107
  """
108
  Smart routing based on task specialization
109
+ PRIORITY ORDER (ZeroGPU is FIRST):
110
+ 1. ZeroGPU API (PRIMARY - always tried first if configured)
111
+ 2. Local models (fallback - lazy loading, only if ZeroGPU fails)
112
+ 3. HF Inference API (final fallback - only if both above fail)
113
 
114
  Args:
115
  task_type: Task type (e.g., "intent_classification", "general_reasoning")
 
122
  model_config = self._select_model(task_type)
123
  logger.info(f"Selected model: {model_config['model_id']}")
124
 
125
+ # PRIORITY 1: Try ZeroGPU API first (PRIMARY inference method)
126
+ # Always try if client is configured, even if initialization had warnings
127
+ if self.zero_gpu_client or self.zero_gpu_user_manager:
128
+ logger.info("→ Attempting ZeroGPU API (PRIMARY inference method)...")
129
  try:
130
  result = await self._call_zero_gpu_endpoint(task_type, prompt, context, user_id, **kwargs)
131
+ if result is not None and result.strip(): # Check for non-empty result
132
+ logger.info(f"✓ Inference complete for {task_type} (ZeroGPU API - PRIMARY)")
133
  return result
134
  else:
135
+ logger.warning("ZeroGPU API returned empty result, falling back to local models")
136
  except Exception as e:
137
+ logger.warning(f"ZeroGPU API inference failed: {e}")
138
+ logger.info("→ Falling back to local models (lazy loading)...")
139
  logger.debug("Exception details:", exc_info=True)
140
 
141
+ # PRIORITY 2: Fallback to local models (lazy loading - only if ZeroGPU fails)
142
  if self.use_local_models and self.local_loader:
143
  try:
144
+ logger.info("→ Loading local model as fallback (ZeroGPU unavailable)...")
145
  # Handle embedding generation separately
146
  if task_type == "embedding_generation":
147
  result = await self._call_local_embedding(model_config, prompt, **kwargs)
148
  else:
149
  result = await self._call_local_model(model_config, prompt, task_type, **kwargs)
150
 
151
+ if result is not None and result.strip(): # Check for non-empty result
152
+ logger.info(f"✓ Inference complete for {task_type} (local model fallback)")
153
  return result
154
  else:
155
+ logger.warning("Local model returned empty result, falling back to HF API")
156
  except Exception as e:
157
+ logger.warning(f"Local model inference failed: {e}")
158
+ logger.info("→ Falling back to HF Inference API (final fallback)...")
159
  logger.debug("Exception details:", exc_info=True)
160
 
161
+ # PRIORITY 3: Final fallback to HF Inference API (only if ZeroGPU and local models fail)
162
+ logger.info("→ Using HF Inference API as final fallback...")
163
  # Health check and fallback logic
164
  if not await self._is_model_healthy(model_config["model_id"]):
165
  logger.warning(f"Model unhealthy, using fallback")