Spaces:

JatinAutonomousLabs
/

Research_AI_Assistant

Sleeping

File size: 12,389 Bytes

# llm_router.py - FIXED VERSION
import logging
import asyncio
from .models_config import LLM_CONFIG

# Module-level logger, named after this module; LLMRouter writes its log output through it.
logger = logging.getLogger(__name__)

class LLMRouter:
    def __init__(self, hf_token):
        """
        Remember the Hugging Face token and start with an empty health cache.

        A falsy token is accepted; it is only reported with a warning.
        """
        self.hf_token = hf_token
        # model_id -> bool, filled lazily by _is_model_healthy
        self.health_status = {}
        logger.info("LLMRouter initialized")
        if not hf_token:
            logger.warning("No HF token provided")
            return
        logger.info("HF token available")
        
    async def route_inference(self, task_type: str, prompt: str, **kwargs):
        """
        Run ``prompt`` on the model assigned to ``task_type``.

        The model chosen by ``_select_model`` is used unless
        ``_is_model_healthy`` reports it unhealthy, in which case the model
        from ``_get_fallback_model`` is used instead. Extra keyword arguments
        are forwarded untouched to ``_call_hf_endpoint``, whose result is
        returned as-is.
        """
        logger.info(f"Routing inference for task: {task_type}")
        chosen = self._select_model(task_type)
        logger.info(f"Selected model: {chosen['model_id']}")

        # Swap to the fallback configuration when the cached status says unhealthy
        healthy = await self._is_model_healthy(chosen["model_id"])
        if not healthy:
            logger.warning("Model unhealthy, using fallback")
            chosen = self._get_fallback_model(task_type)
            logger.info(f"Fallback model: {chosen['model_id']}")

        # task_type travels along so the endpoint call can log it and pick a fallback
        output = await self._call_hf_endpoint(chosen, prompt, task_type, **kwargs)
        logger.info(f"Inference complete for {task_type}")
        return output
    
    def _select_model(self, task_type: str) -> dict:
        """
        Return the model configuration assigned to ``task_type``.

        Task types without an entry in the routing table get the
        ``reasoning_primary`` configuration.
        """
        models = LLM_CONFIG["models"]
        classifier = models["classification_specialist"]
        embedder = models["embedding_specialist"]
        safety = models["safety_checker"]
        primary = models["reasoning_primary"]

        routes = {
            "intent_classification": classifier,
            "embedding_generation": embedder,
            "safety_check": safety,
            "general_reasoning": primary,
            "response_synthesis": primary,
        }
        return routes.get(task_type, primary)
    
    async def _is_model_healthy(self, model_id: str) -> bool:
        """
        Check if the model is healthy and available
        Mark models as healthy by default - actual availability checked at API call time
        """
        # Check cached health status
        if model_id in self.health_status:
            return self.health_status[model_id]
        
        # All models marked healthy initially - real check happens during API call
        self.health_status[model_id] = True
        return True
    
    def _get_fallback_model(self, task_type: str) -> dict:
        """
        Return the fallback model configuration for ``task_type``.

        Every task falls back to ``reasoning_primary`` except
        ``embedding_generation``, which stays on ``embedding_specialist``.
        Task types without an entry also get ``reasoning_primary``.
        """
        models = LLM_CONFIG["models"]
        primary = models["reasoning_primary"]
        embedder = models["embedding_specialist"]

        fallbacks = {
            "intent_classification": primary,
            "embedding_generation": embedder,
            "safety_check": primary,
            "general_reasoning": primary,
            "response_synthesis": primary,
        }
        if task_type in fallbacks:
            return fallbacks[task_type]
        return primary
    
    async def _call_hf_endpoint(self, model_config: dict, prompt: str, task_type: str, **kwargs):
        """
        Send ``prompt`` to the Hugging Face chat completions endpoint and return the reply text.

        The request is retried with exponential backoff on HTTP 503, timeouts
        and other ``requests`` errors. When the retries are used up, the
        task's fallback model is tried, but only one hop deep and only when it
        differs from the model that just failed, so a failing fallback cannot
        recurse without end. The blocking ``requests.post`` call runs in the
        event loop's default executor so other coroutines keep running while
        the request is in flight.

        Args:
            model_config: Model configuration; only ``model_config["model_id"]`` is read.
            prompt: Text sent as the single ``user`` message.
            task_type: Task name, used for logging and for the fallback lookup.
            **kwargs: Optional tuning values: ``max_retries`` (default 3),
                ``initial_delay`` (1.0 s), ``max_delay`` (16.0 s),
                ``timeout`` (30 s), ``max_tokens`` (512) and
                ``temperature`` (0.7). ``_is_fallback`` is set internally to
                mark a call that already runs on a fallback model.

        Returns:
            The generated text on success, a ``"[Mock] ..."`` string when the
            ``requests`` library cannot be imported, or ``None`` on any failure.
        """
        # Retry configuration
        max_retries = kwargs.get('max_retries', 3)
        initial_delay = kwargs.get('initial_delay', 1.0)  # Start with 1 second
        max_delay = kwargs.get('max_delay', 16.0)  # Cap at 16 seconds
        timeout = kwargs.get('timeout', 30)

        try:
            import requests
            from requests.exceptions import Timeout, RequestException, ConnectionError as RequestsConnectionError

            model_id = model_config["model_id"]

            # Use the chat completions endpoint
            api_url = "https://router.huggingface.co/v1/chat/completions"

            # NOTE(review): the full prompt and response are written to the log at
            # INFO level - confirm this is acceptable for the data being processed.
            logger.info(f"Calling HF Chat Completions API for model: {model_id}")
            logger.debug(f"Prompt length: {len(prompt)}")
            logger.info("=" * 80)
            logger.info("LLM API REQUEST - COMPLETE PROMPT:")
            logger.info("=" * 80)
            logger.info(f"Model: {model_id}")
            logger.info(f"Task Type: {task_type}")
            logger.info(f"Prompt Length: {len(prompt)} characters")
            logger.info("-" * 40)
            logger.info("FULL PROMPT CONTENT:")
            logger.info("-" * 40)
            logger.info(prompt)
            logger.info("-" * 40)
            logger.info("END OF PROMPT")
            logger.info("=" * 80)

            # Prepare the request payload
            payload = {
                "model": model_id,
                "messages": [
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
                "max_tokens": kwargs.get('max_tokens', 512),
                "temperature": kwargs.get('temperature', 0.7),
                "stream": False
            }

            headers = {
                "Authorization": f"Bearer {self.hf_token}",
                "Content-Type": "application/json"
            }

            def send_request():
                # Blocking HTTP call; executed in a worker thread, never on the event loop
                return requests.post(api_url, json=payload, headers=headers, timeout=timeout)

            loop = asyncio.get_running_loop()

            # Retry logic with exponential backoff
            last_exception = None
            for attempt in range(max_retries + 1):
                try:
                    if attempt > 0:
                        # Delay doubles with every retry and is capped at max_delay
                        delay = min(initial_delay * (2 ** (attempt - 1)), max_delay)
                        logger.warning(f"Retry attempt {attempt}/{max_retries} after {delay:.1f}s delay (exponential backoff)")
                        await asyncio.sleep(delay)

                    logger.info(f"Sending request to: {api_url} (attempt {attempt + 1}/{max_retries + 1})")
                    logger.debug(f"Payload: {payload}")

                    # Exceptions raised inside the worker thread are re-raised by the await
                    response = await loop.run_in_executor(None, send_request)

                    if response.status_code == 200:
                        result = response.json()
                        logger.debug(f"Raw response: {result}")

                        if 'choices' in result and len(result['choices']) > 0:
                            generated_text = result['choices'][0]['message']['content']

                            if not generated_text or generated_text.strip() == "":
                                logger.warning("Empty or invalid response, using fallback")
                                return None

                            if attempt > 0:
                                logger.info(f"Successfully retrieved response after {attempt} retry attempts")

                            logger.info(f"HF API returned response (length: {len(generated_text)})")
                            logger.info("=" * 80)
                            logger.info("COMPLETE LLM API RESPONSE:")
                            logger.info("=" * 80)
                            logger.info(f"Model: {model_id}")
                            logger.info(f"Task Type: {task_type}")
                            logger.info(f"Response Length: {len(generated_text)} characters")
                            logger.info("-" * 40)
                            logger.info("FULL RESPONSE CONTENT:")
                            logger.info("-" * 40)
                            logger.info(generated_text)
                            logger.info("-" * 40)
                            logger.info("END OF LLM RESPONSE")
                            logger.info("=" * 80)
                            return generated_text
                        else:
                            logger.error(f"Unexpected response format: {result}")
                            return None
                    elif response.status_code == 503:
                        # Model is loading - this is retryable
                        if attempt < max_retries:
                            logger.warning(f"Model loading (503), will retry (attempt {attempt + 1}/{max_retries + 1})")
                            last_exception = Exception("Model loading (503)")
                            continue
                        else:
                            # After max retries, try fallback model
                            logger.warning(f"Model loading (503) after {max_retries} retries, trying fallback model")
                            return await self._call_fallback_model(model_id, prompt, task_type, **kwargs)
                    else:
                        # Non-retryable HTTP errors
                        logger.error(f"HF API error: {response.status_code} - {response.text}")
                        return None

                except Timeout as e:
                    last_exception = e
                    if attempt < max_retries:
                        logger.warning(f"Request timeout (attempt {attempt + 1}/{max_retries + 1}): {str(e)}")
                        continue
                    else:
                        logger.error(f"Request timeout after {max_retries} retries: {str(e)}")
                        # Try fallback model on final timeout
                        logger.warning("Attempting fallback model due to persistent timeout")
                        return await self._call_fallback_model(model_id, prompt, task_type, **kwargs)

                except (RequestsConnectionError, RequestException) as e:
                    last_exception = e
                    if attempt < max_retries:
                        logger.warning(f"Connection error (attempt {attempt + 1}/{max_retries + 1}): {str(e)}")
                        continue
                    else:
                        logger.error(f"Connection error after {max_retries} retries: {str(e)}")
                        # Try fallback model on final connection error
                        logger.warning("Attempting fallback model due to persistent connection error")
                        return await self._call_fallback_model(model_id, prompt, task_type, **kwargs)

            # Only reached when the loop finished without returning
            if last_exception:
                logger.error(f"Failed after {max_retries} retries. Last error: {last_exception}")
            return None

        except ImportError:
            logger.warning("requests library not available, using mock response")
            return f"[Mock] Response to: {prompt[:100]}..."
        except Exception as e:
            logger.error(f"Error calling HF endpoint: {e}", exc_info=True)
            return None

    async def _call_fallback_model(self, failed_model_id: str, prompt: str, task_type: str, **kwargs):
        """
        Retry the request on the task's fallback model, at most one hop deep.

        Returns ``None`` without making a request when the current call is
        already a fallback call, or when the fallback model is the same model
        that just failed; otherwise returns the result of ``_call_hf_endpoint``
        for the fallback model.
        """
        if kwargs.get('_is_fallback', False):
            logger.error(f"Fallback model {failed_model_id} also failed, giving up")
            return None

        fallback_config = self._get_fallback_model(task_type)
        if fallback_config["model_id"] == failed_model_id:
            # The fallback table maps some tasks onto the model they already use
            logger.error(f"No alternative fallback model for task {task_type}, giving up")
            return None

        # Mark the nested call so it cannot trigger another fallback hop
        fallback_kwargs = dict(kwargs, _is_fallback=True)
        return await self._call_hf_endpoint(fallback_config, prompt, task_type, **fallback_kwargs)
    
    async def get_available_models(self):
        """
        Return the names under which models are registered in ``LLM_CONFIG["models"]``.
        """
        registry = LLM_CONFIG["models"]
        return [name for name in registry.keys()]
    
    async def health_check(self):
        """
        Build a health report for every configured model.

        Returns a dict keyed by configuration name; each value holds the
        model's ``model_id`` and the ``healthy`` flag reported by
        ``_is_model_healthy``.
        """
        report = {}
        for name, cfg in LLM_CONFIG["models"].items():
            mid = cfg["model_id"]
            healthy = await self._is_model_healthy(mid)
            report[name] = {"model_id": mid, "healthy": healthy}
        return report