Commit ca77f38 · Parent: fd88fa8
Add logging for context window configuration and improve max_tokens calculation debugging

1 changed file: src/llm_router.py (+17 -10)
src/llm_router.py CHANGED

@@ -54,7 +54,8 @@ class LLMRouter:
             logger.info("Novita AI API client initialized")
             logger.info(f"Base URL: {self.settings.novita_base_url}")
             logger.info(f"Model: {self.settings.novita_model}")
+            logger.info(f"Context Window: {self.settings.novita_model_context_window} tokens")
         except Exception as e:
             logger.error(f"Failed to initialize Novita AI client: {e}")
             raise RuntimeError(f"Could not initialize Novita AI API client: {e}") from e
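The new log line reads a novita_model_context_window value off the settings object; that field is defined elsewhere in the repo and is not part of this diff. A minimal sketch of how such a field might be declared, assuming a pydantic-settings configuration (the class shape and every default below are assumptions, not taken from this commit):

# Hypothetical settings declaration; the real Settings class is not shown in this diff.
from pydantic_settings import BaseSettings

class Settings(BaseSettings):
    novita_base_url: str = "https://api.example.invalid/v1"  # assumed placeholder
    novita_model: str = "some-model-id"                      # assumed placeholder
    novita_model_context_window: int = 32768                 # assumed default, in tokens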
@@ -149,7 +150,7 @@ class LLMRouter:
                 response_text = self._clean_reasoning_tags(response_text)
                 logger.info(f"Novita AI API generated response (length: {len(response_text)})")
                 return response_text
-
+            else:
                 # Handle non-streaming response
                 response = self.novita_client.chat.completions.create(**request_params)
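The added else: makes the streaming and non-streaming paths explicit alternatives rather than relying on the streaming branch's early return to skip the code below it. A stripped-down sketch of the pattern, assuming an OpenAI-compatible client (chat.completions.create and the delta/message fields follow the standard openai SDK; the helper name and surrounding structure are illustrative, not from this repo):

# Hypothetical helper illustrating the branch structure after this commit.
def _call_model(self, request_params: dict, stream: bool):
    if stream:
        # Streaming: accumulate delta chunks as they arrive
        parts = []
        for chunk in self.novita_client.chat.completions.create(stream=True, **request_params):
            delta = chunk.choices[0].delta.content
            if delta:
                parts.append(delta)
        return "".join(parts)
    else:
        # Non-streaming: one blocking call, full message in the response
        response = self.novita_client.chat.completions.create(**request_params)
        return response.choices[0].message.content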
@@ -159,14 +160,14 @@ class LLMRouter:
                 result = self._clean_reasoning_tags(result)
                 logger.info(f"Novita AI API generated response (length: {len(result)})")
                 return result
-
+            else:
                 logger.error("Novita AI API returned empty response")
                 return None

         except Exception as e:
             logger.error(f"Error calling Novita AI API: {e}", exc_info=True)
-
-
+            raise
+
     def _calculate_safe_max_tokens(self, prompt: str, requested_max_tokens: int) -> int:
         """
         Calculate safe max_tokens based on input token count and model context window.
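Re-raising in the except block changes the method's failure contract: before, an API error fell through to an implicit return None, so callers could not tell a transport failure from a genuinely empty response; after, None only ever means the API answered with no content. A hypothetical call site showing the distinction (router.generate stands in for whatever public method wraps this code; not from this diff):

try:
    text = router.generate(prompt, max_tokens=1024)
except Exception:
    # Network/API error: retry, fall back, or surface it
    raise
else:
    if text is None:
        # The call succeeded but the model returned nothing
        ...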
@@ -182,9 +183,14 @@ class LLMRouter:
         # For more accuracy, you could use tiktoken if available
         input_tokens = len(prompt) // 4

-        # Get model context window
+        # Get model context window from settings
         context_window = self.settings.novita_model_context_window

+        logger.debug(
+            f"Calculating safe max_tokens: input ~{input_tokens} tokens, "
+            f"context_window={context_window}, requested={requested_max_tokens}"
+        )
+
         # Reserve minimum 100 tokens for safety margin
         available_tokens = context_window - input_tokens - 100
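The input estimate uses the rough 1 token ≈ 4 characters rule (len(prompt) // 4). As the adjacent comment suggests, tiktoken can give an exact count when installed; a hedged sketch of that swap (cl100k_base is an assumed encoding, and a Novita-hosted model may tokenize differently):

# Optional exact count via tiktoken, falling back to the 4-chars-per-token heuristic.
try:
    import tiktoken
    encoding = tiktoken.get_encoding("cl100k_base")  # assumed encoding
    input_tokens = len(encoding.encode(prompt))
except ImportError:
    input_tokens = len(prompt) // 4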
@@ -197,7 +203,8 @@ class LLMRouter:
         if safe_max_tokens < requested_max_tokens:
             logger.warning(
                 f"Reduced max_tokens from {requested_max_tokens} to {safe_max_tokens} "
-                f"(input: ~{input_tokens} tokens, context window: {context_window} tokens)"
+                f"(input: ~{input_tokens} tokens, context window: {context_window} tokens, "
+                f"available: {available_tokens} tokens)"
             )

         return safe_max_tokens
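Pieced together from the hunks above, _calculate_safe_max_tokens plausibly reads as follows after this commit; the clamping line sits between the two visible hunks, so the min/max floor below is an assumption. Worked example: with context_window = 32768 and a 20,000-character prompt, input_tokens = 5000 and available_tokens = 32768 - 5000 - 100 = 27668, so a request for 4096 tokens passes through unchanged while a request for 30000 would be reduced to 27668 with the warning logged.

import logging

logger = logging.getLogger(__name__)

def _calculate_safe_max_tokens(self, prompt: str, requested_max_tokens: int) -> int:
    """Calculate safe max_tokens based on input token count and model context window."""
    # Rough estimate: 1 token ~= 4 characters
    input_tokens = len(prompt) // 4

    # Get model context window from settings
    context_window = self.settings.novita_model_context_window

    logger.debug(
        f"Calculating safe max_tokens: input ~{input_tokens} tokens, "
        f"context_window={context_window}, requested={requested_max_tokens}"
    )

    # Reserve minimum 100 tokens for safety margin
    available_tokens = context_window - input_tokens - 100

    # Assumed clamp; the exact line is not visible in this diff
    safe_max_tokens = max(1, min(requested_max_tokens, available_tokens))

    if safe_max_tokens < requested_max_tokens:
        logger.warning(
            f"Reduced max_tokens from {requested_max_tokens} to {safe_max_tokens} "
            f"(input: ~{input_tokens} tokens, context window: {context_window} tokens, "
            f"available: {available_tokens} tokens)"
        )
    return safe_max_tokens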
@@ -375,7 +382,7 @@ class LLMRouter:
     def _truncate_to_tokens(self, content: str, max_tokens: int) -> str:
         """Truncate content to fit within token limit"""
         # Simple character-based truncation (1 token ≈ 4 chars)
-
-
-
+        max_chars = max_tokens * 4
+        if len(content) <= max_chars:
+            return content
         return content[:max_chars - 3] + "..."
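In the new body, max_chars - 3 leaves room for the appended "...", so the result never exceeds max_chars. A standalone copy with a quick sanity check (module-level rewrite for illustration only):

def truncate_to_tokens(content: str, max_tokens: int) -> str:
    """Standalone version of _truncate_to_tokens for illustration."""
    max_chars = max_tokens * 4  # 1 token ~= 4 chars heuristic
    if len(content) <= max_chars:
        return content
    return content[:max_chars - 3] + "..."

assert truncate_to_tokens("abc", max_tokens=10) == "abc"        # short input returned as-is
assert len(truncate_to_tokens("x" * 100, max_tokens=10)) == 40  # capped at 10 * 4 chars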