HonestAI

Paused

JatsTheAIGen committed on Nov 4

Commit

b3aba24

1 Parent(s): 79ea999

Update model IDs to use Cerebras deployment and add gated repository error handling

- Updated model IDs to use meta-llama/Llama-3.1-8B-Instruct:cerebras across all model configurations
- Added comprehensive GatedRepoError handling in local_model_loader.py
- Added GatedRepoError handling in llm_router.py with fallback model support
- Implemented API suffix stripping (:cerebras) for local model loading
- Updated default model configurations in config.py
- Added helpful error messages with links to request repository access

Files changed (4) hide show

src/config.py +4 -4
src/llm_router.py +49 -6
src/local_model_loader.py +82 -28
src/models_config.py +3 -3

src/config.py CHANGED Viewed

@@ -169,8 +169,8 @@ class Settings(BaseSettings):
     # ==================== Model Configuration ====================
     default_model: str = Field(
-        default="meta-llama/Llama-3.1-8B-Instruct",
-        description="Primary model for reasoning tasks (upgraded with 4-bit quantization)"
     )
     embedding_model: str = Field(
@@ -179,8 +179,8 @@ class Settings(BaseSettings):
     )
     classification_model: str = Field(
-        default="meta-llama/Llama-3.1-8B-Instruct",
-        description="Model for classification tasks"
     )
     # ==================== Performance Configuration ====================

     # ==================== Model Configuration ====================
     default_model: str = Field(
+        default="meta-llama/Llama-3.1-8B-Instruct:cerebras",
+        description="Primary model for reasoning tasks (Cerebras deployment with 4-bit quantization)"
     )
     embedding_model: str = Field(
     )
     classification_model: str = Field(
+        default="meta-llama/Llama-3.1-8B-Instruct:cerebras",
+        description="Model for classification tasks (Cerebras deployment)"
     )
     # ==================== Performance Configuration ====================

src/llm_router.py CHANGED Viewed

@@ -4,6 +4,13 @@ import asyncio
 from typing import Dict, Optional
 from .models_config import LLM_CONFIG
 logger = logging.getLogger(__name__)
 class LLMRouter:
@@ -96,11 +103,34 @@ class LLMRouter:
                     use_4bit = quantization_config.get("default_4bit", True)
                     use_8bit = quantization_config.get("default_8bit", False)
-                self.local_loader.load_chat_model(
-                    model_id,
-                    load_in_8bit=use_8bit,
-                    load_in_4bit=use_4bit
-                )
             # Format as chat messages if needed
             messages = [{"role": "user", "content": prompt}]
@@ -131,6 +161,9 @@ class LLMRouter:
             return result
         except Exception as e:
             logger.error(f"Error calling local model: {e}", exc_info=True)
             return None
@@ -146,7 +179,13 @@ class LLMRouter:
             # Ensure model is loaded
             if model_id not in self.local_loader.loaded_embedding_models:
                 logger.info(f"Loading embedding model {model_id} on demand...")
-                self.local_loader.load_embedding_model(model_id)
             # Generate embedding
             embedding = await asyncio.to_thread(
@@ -395,6 +434,10 @@ class LLMRouter:
             if not hasattr(self, 'tokenizer'):
                 try:
                     self.tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
                 except Exception as e:
                     logger.warning(f"Could not load tokenizer: {e}, using character count estimation")
                     self.tokenizer = None

 from typing import Dict, Optional
 from .models_config import LLM_CONFIG
+# Import GatedRepoError for handling gated repositories
+try:
+    from huggingface_hub.exceptions import GatedRepoError
+except ImportError:
+    # Fallback if huggingface_hub is not available
+    GatedRepoError = Exception
 logger = logging.getLogger(__name__)
 class LLMRouter:
                     use_4bit = quantization_config.get("default_4bit", True)
                     use_8bit = quantization_config.get("default_8bit", False)
+                try:
+                    self.local_loader.load_chat_model(
+                        model_id,
+                        load_in_8bit=use_8bit,
+                        load_in_4bit=use_4bit
+                    )
+                except GatedRepoError as e:
+                    logger.error(f"❌ Cannot access gated repository {model_id}")
+                    logger.error(f"   Visit https://huggingface.co/{model_id.split(':')[0] if ':' in model_id else model_id} to request access.")
+                    # Try fallback model if available
+                    fallback_model_id = model_config.get("fallback")
+                    if fallback_model_id:
+                        logger.warning(f"Attempting fallback model: {fallback_model_id}")
+                        try:
+                            # Create fallback config
+                            fallback_config = model_config.copy()
+                            fallback_config["model_id"] = fallback_model_id
+                            # Retry with fallback model
+                            return await self._call_local_model(fallback_config, prompt, task_type, **kwargs)
+                        except Exception as fallback_error:
+                            logger.error(f"Fallback model also failed: {fallback_error}")
+                            logger.warning("Falling back to HF Inference API")
+                            return None
+                    else:
+                        logger.warning("No fallback model configured, falling back to HF Inference API")
+                        return None
             # Format as chat messages if needed
             messages = [{"role": "user", "content": prompt}]
             return result
+        except GatedRepoError:
+            # Already handled above, return None to fall back to API
+            return None
         except Exception as e:
             logger.error(f"Error calling local model: {e}", exc_info=True)
             return None
             # Ensure model is loaded
             if model_id not in self.local_loader.loaded_embedding_models:
                 logger.info(f"Loading embedding model {model_id} on demand...")
+                try:
+                    self.local_loader.load_embedding_model(model_id)
+                except GatedRepoError as e:
+                    logger.error(f"❌ Cannot access gated repository {model_id}")
+                    logger.error(f"   Visit https://huggingface.co/{model_id.split(':')[0] if ':' in model_id else model_id} to request access.")
+                    logger.warning("Falling back to HF Inference API")
+                    return None
             # Generate embedding
             embedding = await asyncio.to_thread(
             if not hasattr(self, 'tokenizer'):
                 try:
                     self.tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
+                except GatedRepoError as e:
+                    logger.warning(f"Gated repository error loading tokenizer: {e}")
+                    logger.warning("Using character count estimation instead")
+                    self.tokenizer = None
                 except Exception as e:
                     logger.warning(f"Could not load tokenizer: {e}, using character count estimation")
                     self.tokenizer = None

src/local_model_loader.py CHANGED Viewed

@@ -7,6 +7,13 @@ from typing import Optional, Dict, Any
 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
 from sentence_transformers import SentenceTransformer
 logger = logging.getLogger(__name__)
 class LocalModelLoader:
@@ -56,11 +63,27 @@ class LocalModelLoader:
         try:
             logger.info(f"Loading model {model_id} on {self.device}...")
             # Load tokenizer
-            tokenizer = AutoTokenizer.from_pretrained(
-                model_id,
-                trust_remote_code=True
-            )
             # Determine quantization config
             if load_in_4bit and self.device == "cuda":
@@ -86,28 +109,38 @@ class LocalModelLoader:
                 quantization_config = None
             # Load model with GPU optimization
-            if self.device == "cuda":
-                model = AutoModelForCausalLM.from_pretrained(
-                    model_id,
-                    device_map="auto",  # Automatically uses GPU
-                    torch_dtype=torch.float16,  # Use FP16 for memory efficiency
-                    trust_remote_code=True,
-                    **(quantization_config if isinstance(quantization_config, dict) else {}),
-                    **({"quantization_config": quantization_config} if quantization_config and not isinstance(quantization_config, dict) else {})
-                )
-            else:
-                model = AutoModelForCausalLM.from_pretrained(
-                    model_id,
-                    torch_dtype=torch.float32,
-                    trust_remote_code=True
-                )
-                model = model.to(self.device)
             # Ensure padding token is set
             if tokenizer.pad_token is None:
                 tokenizer.pad_token = tokenizer.eos_token
-            # Cache models
             self.loaded_models[model_id] = model
             self.loaded_tokenizers[model_id] = tokenizer
@@ -117,9 +150,12 @@ class LocalModelLoader:
                 reserved = torch.cuda.memory_reserved(0) / 1024**3
                 logger.info(f"GPU Memory - Allocated: {allocated:.2f} GB, Reserved: {reserved:.2f} GB")
-            logger.info(f"✓ Model {model_id} loaded successfully on {self.device}")
             return model, tokenizer
         except Exception as e:
             logger.error(f"Error loading model {model_id}: {e}", exc_info=True)
             raise
@@ -141,18 +177,36 @@ class LocalModelLoader:
         try:
             logger.info(f"Loading embedding model {model_id}...")
             # SentenceTransformer automatically handles GPU
-            model = SentenceTransformer(
-                model_id,
-                device=self.device
-            )
-            # Cache model
             self.loaded_embedding_models[model_id] = model
-            logger.info(f"✓ Embedding model {model_id} loaded successfully on {self.device}")
             return model
         except Exception as e:
             logger.error(f"Error loading embedding model {model_id}: {e}", exc_info=True)
             raise

 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
 from sentence_transformers import SentenceTransformer
+# Import GatedRepoError for handling gated repositories
+try:
+    from huggingface_hub.exceptions import GatedRepoError
+except ImportError:
+    # Fallback if huggingface_hub is not available
+    GatedRepoError = Exception
 logger = logging.getLogger(__name__)
 class LocalModelLoader:
         try:
             logger.info(f"Loading model {model_id} on {self.device}...")
+            # Strip API-specific suffixes (e.g., :cerebras, :novita) for local loading
+            # These suffixes are typically used for API endpoints, not local model identifiers
+            base_model_id = model_id.split(':')[0] if ':' in model_id else model_id
+            if base_model_id != model_id:
+                logger.info(f"Stripping API suffix from {model_id}, using base model: {base_model_id}")
             # Load tokenizer
+            try:
+                tokenizer = AutoTokenizer.from_pretrained(
+                    base_model_id,
+                    trust_remote_code=True
+                )
+            except GatedRepoError as e:
+                logger.error(f"❌ Gated Repository Error: Cannot access gated repo {base_model_id}")
+                logger.error(f"   Access to model {base_model_id} is restricted and you are not in the authorized list.")
+                logger.error(f"   Visit https://huggingface.co/{base_model_id} to request access.")
+                logger.error(f"   Error details: {e}")
+                raise GatedRepoError(
+                    f"Cannot access gated repository {base_model_id}. "
+                    f"Visit https://huggingface.co/{base_model_id} to request access."
+                ) from e
             # Determine quantization config
             if load_in_4bit and self.device == "cuda":
                 quantization_config = None
             # Load model with GPU optimization
+            try:
+                if self.device == "cuda":
+                    model = AutoModelForCausalLM.from_pretrained(
+                        base_model_id,
+                        device_map="auto",  # Automatically uses GPU
+                        torch_dtype=torch.float16,  # Use FP16 for memory efficiency
+                        trust_remote_code=True,
+                        **(quantization_config if isinstance(quantization_config, dict) else {}),
+                        **({"quantization_config": quantization_config} if quantization_config and not isinstance(quantization_config, dict) else {})
+                    )
+                else:
+                    model = AutoModelForCausalLM.from_pretrained(
+                        base_model_id,
+                        torch_dtype=torch.float32,
+                        trust_remote_code=True
+                    )
+                    model = model.to(self.device)
+            except GatedRepoError as e:
+                logger.error(f"❌ Gated Repository Error: Cannot access gated repo {base_model_id}")
+                logger.error(f"   Access to model {base_model_id} is restricted and you are not in the authorized list.")
+                logger.error(f"   Visit https://huggingface.co/{base_model_id} to request access.")
+                logger.error(f"   Error details: {e}")
+                raise GatedRepoError(
+                    f"Cannot access gated repository {base_model_id}. "
+                    f"Visit https://huggingface.co/{base_model_id} to request access."
+                ) from e
             # Ensure padding token is set
             if tokenizer.pad_token is None:
                 tokenizer.pad_token = tokenizer.eos_token
+            # Cache models (use original model_id for cache key to maintain API compatibility)
             self.loaded_models[model_id] = model
             self.loaded_tokenizers[model_id] = tokenizer
                 reserved = torch.cuda.memory_reserved(0) / 1024**3
                 logger.info(f"GPU Memory - Allocated: {allocated:.2f} GB, Reserved: {reserved:.2f} GB")
+            logger.info(f"✓ Model {model_id} (base: {base_model_id}) loaded successfully on {self.device}")
             return model, tokenizer
+        except GatedRepoError:
+            # Re-raise GatedRepoError to be handled by caller
+            raise
         except Exception as e:
             logger.error(f"Error loading model {model_id}: {e}", exc_info=True)
             raise
         try:
             logger.info(f"Loading embedding model {model_id}...")
+            # Strip API-specific suffixes for local loading
+            base_model_id = model_id.split(':')[0] if ':' in model_id else model_id
+            if base_model_id != model_id:
+                logger.info(f"Stripping API suffix from {model_id}, using base model: {base_model_id}")
             # SentenceTransformer automatically handles GPU
+            try:
+                model = SentenceTransformer(
+                    base_model_id,
+                    device=self.device
+                )
+            except GatedRepoError as e:
+                logger.error(f"❌ Gated Repository Error: Cannot access gated repo {base_model_id}")
+                logger.error(f"   Access to model {base_model_id} is restricted and you are not in the authorized list.")
+                logger.error(f"   Visit https://huggingface.co/{base_model_id} to request access.")
+                logger.error(f"   Error details: {e}")
+                raise GatedRepoError(
+                    f"Cannot access gated repository {base_model_id}. "
+                    f"Visit https://huggingface.co/{base_model_id} to request access."
+                ) from e
+            # Cache model (use original model_id for cache key)
             self.loaded_embedding_models[model_id] = model
+            logger.info(f"✓ Embedding model {model_id} (base: {base_model_id}) loaded successfully on {self.device}")
             return model
+        except GatedRepoError:
+            # Re-raise GatedRepoError to be handled by caller
+            raise
         except Exception as e:
             logger.error(f"Error loading embedding model {model_id}: {e}", exc_info=True)
             raise

src/models_config.py CHANGED Viewed

@@ -4,7 +4,7 @@ LLM_CONFIG = {
     "primary_provider": "huggingface",
     "models": {
         "reasoning_primary": {
-            "model_id": "meta-llama/Llama-3.1-8B-Instruct",  # Upgraded: Excellent reasoning with 4-bit quantization
             "task": "general_reasoning",
             "max_tokens": 10000,
             "temperature": 0.7,
@@ -23,7 +23,7 @@ LLM_CONFIG = {
             "is_chat_model": False
         },
         "classification_specialist": {
-            "model_id": "meta-llama/Llama-3.1-8B-Instruct",  # Use same chat model for classification (better than specialized models)
             "task": "intent_classification",
             "max_length": 512,
             "specialization": "fast_inference",
@@ -32,7 +32,7 @@ LLM_CONFIG = {
             "use_4bit_quantization": True
         },
         "safety_checker": {
-            "model_id": "meta-llama/Llama-3.1-8B-Instruct",  # Use same chat model for safety
             "task": "content_moderation",
             "confidence_threshold": 0.85,
             "purpose": "bias_detection",

     "primary_provider": "huggingface",
     "models": {
         "reasoning_primary": {
+            "model_id": "meta-llama/Llama-3.1-8B-Instruct:cerebras",  # Cerebras deployment
             "task": "general_reasoning",
             "max_tokens": 10000,
             "temperature": 0.7,
             "is_chat_model": False
         },
         "classification_specialist": {
+            "model_id": "meta-llama/Llama-3.1-8B-Instruct:cerebras",  # Cerebras deployment for classification
             "task": "intent_classification",
             "max_length": 512,
             "specialization": "fast_inference",
             "use_4bit_quantization": True
         },
         "safety_checker": {
+            "model_id": "meta-llama/Llama-3.1-8B-Instruct:cerebras",  # Cerebras deployment for safety
             "task": "content_moderation",
             "confidence_threshold": 0.85,
             "purpose": "bias_detection",