Spaces:

JatinAutonomousLabs
/

Research_AI_Assistant

Sleeping

App Files Files Community

Research_AI_Assistant / src /llm_router.py

JatsTheAIGen

cumulative upgrade - context + safety + response length v2

a5d9083 about 2 months ago

raw

history blame

12.4 kB

	# llm_router.py - FIXED VERSION
	import logging
	import asyncio
	from .models_config import LLM_CONFIG

	logger = logging.getLogger(__name__)

	class LLMRouter:
	    """Route inference requests to task-specific Hugging Face models.

	    Model settings are read from ``LLM_CONFIG["models"]``; every entry used
	    here must provide a ``"model_id"``.  Requests are sent to the Hugging Face
	    chat completions endpoint with retry, exponential backoff and a single
	    fallback-model attempt.
	    """

	    # Chat completions endpoint used for every model call.
	    API_URL = "https://router.huggingface.co/v1/chat/completions"

	    # Config key used when a task type is not listed in the tables below.
	    _DEFAULT_MODEL_KEY = "reasoning_primary"

	    # task type -> key in LLM_CONFIG["models"] for the primary model.
	    _TASK_MODEL_KEYS = {
	        "intent_classification": "classification_specialist",
	        "embedding_generation": "embedding_specialist",
	        "safety_check": "safety_checker",
	        "general_reasoning": "reasoning_primary",
	        "response_synthesis": "reasoning_primary",
	    }

	    # task type -> key in LLM_CONFIG["models"] for the fallback model.
	    # Several tasks fall back to the same model they already use, which is
	    # why _call_fallback refuses to retry an identical model_id.
	    _TASK_FALLBACK_KEYS = {
	        "intent_classification": "reasoning_primary",
	        "embedding_generation": "embedding_specialist",
	        "safety_check": "reasoning_primary",
	        "general_reasoning": "reasoning_primary",
	        "response_synthesis": "reasoning_primary",
	    }

	    def __init__(self, hf_token):
	        """Store the API token and start with an empty health cache.

	        Args:
	            hf_token: Hugging Face API token; may be falsy, in which case
	                requests are sent without an Authorization header.
	        """
	        self.hf_token = hf_token
	        # model_id -> bool, filled lazily by _is_model_healthy.
	        self.health_status = {}
	        logger.info("LLMRouter initialized")
	        if hf_token:
	            logger.info("HF token available")
	        else:
	            logger.warning("No HF token provided")

	    async def route_inference(self, task_type: str, prompt: str, **kwargs):
	        """Pick a model for ``task_type`` and run ``prompt`` through it.

	        Args:
	            task_type: Task name used to look up the model configuration.
	            prompt: Text sent as the single user message.
	            **kwargs: Forwarded unchanged to ``_call_hf_endpoint``
	                (``max_tokens``, ``temperature``, ``max_retries``,
	                ``initial_delay``, ``max_delay``, ``timeout``).

	        Returns:
	            The generated text, or ``None`` when the call failed.
	        """
	        logger.info(f"Routing inference for task: {task_type}")
	        model_config = self._select_model(task_type)
	        logger.info(f"Selected model: {model_config['model_id']}")

	        # Swap to the fallback model up front if the cache says unhealthy.
	        if not await self._is_model_healthy(model_config["model_id"]):
	            logger.warning("Model unhealthy, using fallback")
	            model_config = self._get_fallback_model(task_type)
	            logger.info(f"Fallback model: {model_config['model_id']}")

	        result = await self._call_hf_endpoint(model_config, prompt, task_type, **kwargs)
	        logger.info(f"Inference complete for {task_type}")
	        return result

	    def _select_model(self, task_type: str) -> dict:
	        """Return the primary model configuration for ``task_type``.

	        Unknown task types use the ``reasoning_primary`` entry.  Only the
	        entry actually needed is read from ``LLM_CONFIG``.
	        """
	        config_key = self._TASK_MODEL_KEYS.get(task_type, self._DEFAULT_MODEL_KEY)
	        return LLM_CONFIG["models"][config_key]

	    async def _is_model_healthy(self, model_id: str) -> bool:
	        """Return the cached health flag for ``model_id``.

	        A model seen for the first time is recorded as healthy.  No method of
	        this class ever stores ``False``, so a model only reports unhealthy if
	        ``health_status`` was modified from outside.
	        """
	        return self.health_status.setdefault(model_id, True)

	    def _get_fallback_model(self, task_type: str) -> dict:
	        """Return the fallback model configuration for ``task_type``.

	        Unknown task types use the ``reasoning_primary`` entry.
	        """
	        config_key = self._TASK_FALLBACK_KEYS.get(task_type, self._DEFAULT_MODEL_KEY)
	        return LLM_CONFIG["models"][config_key]

	    @staticmethod
	    def _backoff_delay(attempt: int, initial_delay: float, max_delay: float) -> float:
	        """Return the wait before retry number ``attempt`` (1-based), capped at ``max_delay``."""
	        return min(initial_delay * (2 ** (attempt - 1)), max_delay)

	    @staticmethod
	    def _is_distinct_fallback(fallback_config: dict, failed_model_id: str) -> bool:
	        """Return True when the fallback is a different model than the one that failed."""
	        return fallback_config.get("model_id") != failed_model_id

	    def _build_headers(self) -> dict:
	        """Build the HTTP headers, adding Authorization only when a token is set."""
	        headers = {"Content-Type": "application/json"}
	        if self.hf_token:
	            headers["Authorization"] = f"Bearer {self.hf_token}"
	        return headers

	    def _log_request(self, model_id: str, task_type: str, prompt: str) -> None:
	        """Log the outgoing request, including the complete prompt text."""
	        logger.info(f"Calling HF Chat Completions API for model: {model_id}")
	        logger.debug(f"Prompt length: {len(prompt)}")
	        logger.info("=" * 80)
	        logger.info("LLM API REQUEST - COMPLETE PROMPT:")
	        logger.info("=" * 80)
	        logger.info(f"Model: {model_id}")
	        logger.info(f"Task Type: {task_type}")
	        logger.info(f"Prompt Length: {len(prompt)} characters")
	        logger.info("-" * 40)
	        logger.info("FULL PROMPT CONTENT:")
	        logger.info("-" * 40)
	        logger.info(prompt)
	        logger.info("-" * 40)
	        logger.info("END OF PROMPT")
	        logger.info("=" * 80)

	    def _log_response(self, model_id: str, task_type: str, generated_text: str) -> None:
	        """Log the complete text returned by the model."""
	        logger.info(f"HF API returned response (length: {len(generated_text)})")
	        logger.info("=" * 80)
	        logger.info("COMPLETE LLM API RESPONSE:")
	        logger.info("=" * 80)
	        logger.info(f"Model: {model_id}")
	        logger.info(f"Task Type: {task_type}")
	        logger.info(f"Response Length: {len(generated_text)} characters")
	        logger.info("-" * 40)
	        logger.info("FULL RESPONSE CONTENT:")
	        logger.info("-" * 40)
	        logger.info(generated_text)
	        logger.info("-" * 40)
	        logger.info("END OF LLM RESPONSE")
	        logger.info("=" * 80)

	    async def _call_fallback(self, failed_model_id: str, prompt: str, task_type: str, **kwargs):
	        """Retry the request on the fallback model, at most once per distinct model.

	        Returns ``None`` without calling the API when the fallback is the very
	        model that just failed; recursing on it would never terminate while
	        the failure persists.
	        """
	        fallback_config = self._get_fallback_model(task_type)
	        if not self._is_distinct_fallback(fallback_config, failed_model_id):
	            logger.error(
	                f"No alternative fallback model for task {task_type} "
	                f"(fallback is the failed model {failed_model_id}); giving up"
	            )
	            return None
	        return await self._call_hf_endpoint(fallback_config, prompt, task_type, **kwargs)

	    async def _call_hf_endpoint(self, model_config: dict, prompt: str, task_type: str, **kwargs):
	        """Call the Hugging Face chat completions API with retry and backoff.

	        Args:
	            model_config: Model settings; ``"model_id"`` is required.
	            prompt: Text sent as the single user message.
	            task_type: Task name, used for logging and fallback lookup.
	            **kwargs: Optional settings: ``max_retries`` (default 3),
	                ``initial_delay`` (1.0 s), ``max_delay`` (16.0 s),
	                ``timeout`` (30 s), ``max_tokens`` (512), ``temperature`` (0.7).

	        Returns:
	            The generated text; ``None`` on an empty/unexpected response, a
	            non-retryable HTTP status, exhausted retries without a usable
	            fallback, or any unexpected error; a ``"[Mock] ..."`` string when
	            the ``requests`` package cannot be imported.
	        """
	        max_retries = kwargs.get('max_retries', 3)
	        initial_delay = kwargs.get('initial_delay', 1.0)  # first retry waits 1 second
	        max_delay = kwargs.get('max_delay', 16.0)  # backoff is capped at 16 seconds
	        timeout = kwargs.get('timeout', 30)

	        try:
	            import requests
	            from requests.exceptions import RequestException, Timeout
	        except ImportError:
	            logger.warning("requests library not available, using mock response")
	            return f"[Mock] Response to: {prompt[:100]}..."

	        try:
	            model_id = model_config["model_id"]
	            self._log_request(model_id, task_type, prompt)

	            payload = {
	                "model": model_id,
	                "messages": [{"role": "user", "content": prompt}],
	                "max_tokens": kwargs.get('max_tokens', 512),
	                "temperature": kwargs.get('temperature', 0.7),
	                "stream": False,
	            }
	            headers = self._build_headers()
	            api_url = self.API_URL
	            loop = asyncio.get_running_loop()

	            last_exception = None
	            for attempt in range(max_retries + 1):
	                try:
	                    if attempt > 0:
	                        delay = self._backoff_delay(attempt, initial_delay, max_delay)
	                        logger.warning(f"Retry attempt {attempt}/{max_retries} after {delay:.1f}s delay (exponential backoff)")
	                        await asyncio.sleep(delay)

	                    logger.info(f"Sending request to: {api_url} (attempt {attempt + 1}/{max_retries + 1})")
	                    logger.debug(f"Payload: {payload}")

	                    # requests is blocking; run it in the default executor so
	                    # the event loop stays responsive while waiting.
	                    response = await loop.run_in_executor(
	                        None,
	                        lambda: requests.post(api_url, json=payload, headers=headers, timeout=timeout),
	                    )

	                    if response.status_code == 200:
	                        result = response.json()
	                        logger.debug(f"Raw response: {result}")

	                        if 'choices' not in result or len(result['choices']) == 0:
	                            logger.error(f"Unexpected response format: {result}")
	                            return None

	                        generated_text = result['choices'][0]['message']['content']
	                        if not generated_text or generated_text.strip() == "":
	                            logger.warning("Empty or invalid response, using fallback")
	                            return None

	                        if attempt > 0:
	                            logger.info(f"Successfully retrieved response after {attempt} retry attempts")
	                        self._log_response(model_id, task_type, generated_text)
	                        return generated_text

	                    if response.status_code == 503:
	                        # Model is still loading - the only retryable status.
	                        if attempt < max_retries:
	                            logger.warning(f"Model loading (503), will retry (attempt {attempt + 1}/{max_retries + 1})")
	                            last_exception = Exception("Model loading (503)")
	                            continue
	                        logger.warning(f"Model loading (503) after {max_retries} retries, trying fallback model")
	                        return await self._call_fallback(model_id, prompt, task_type, **kwargs)

	                    # Every other status is treated as non-retryable.
	                    logger.error(f"HF API error: {response.status_code} - {response.text}")
	                    return None

	                except Timeout as e:
	                    last_exception = e
	                    if attempt < max_retries:
	                        logger.warning(f"Request timeout (attempt {attempt + 1}/{max_retries + 1}): {str(e)}")
	                        continue
	                    logger.error(f"Request timeout after {max_retries} retries: {str(e)}")
	                    logger.warning("Attempting fallback model due to persistent timeout")
	                    return await self._call_fallback(model_id, prompt, task_type, **kwargs)

	                except RequestException as e:
	                    # Covers connection errors and every other requests failure.
	                    last_exception = e
	                    if attempt < max_retries:
	                        logger.warning(f"Connection error (attempt {attempt + 1}/{max_retries + 1}): {str(e)}")
	                        continue
	                    logger.error(f"Connection error after {max_retries} retries: {str(e)}")
	                    logger.warning("Attempting fallback model due to persistent connection error")
	                    return await self._call_fallback(model_id, prompt, task_type, **kwargs)

	            # Reached only when the loop ends without returning
	            # (e.g. a negative max_retries gives zero iterations).
	            if last_exception:
	                logger.error(f"Failed after {max_retries} retries. Last error: {last_exception}")
	            return None

	        except Exception as e:
	            # Best-effort boundary: callers receive None instead of an exception.
	            logger.error(f"Error calling HF endpoint: {e}", exc_info=True)
	            return None

	    async def get_available_models(self):
	        """Return the list of model names configured in ``LLM_CONFIG["models"]``."""
	        return list(LLM_CONFIG["models"].keys())

	    async def health_check(self):
	        """Return ``{model_name: {"model_id": ..., "healthy": ...}}`` for every configured model."""
	        health_status = {}
	        for model_name, model_config in LLM_CONFIG["models"].items():
	            model_id = model_config["model_id"]
	            health_status[model_name] = {
	                "model_id": model_id,
	                "healthy": await self._is_model_healthy(model_id),
	            }
	        return health_status