Fix: Cache directory permissions and gated repository handling
CRITICAL FIXES:
- Fixed cache directory permission errors in Docker containers
- Added HF_TOKEN authentication for gated repository access
- Added non-gated fallback model (Mistral-7B-Instruct-v0.2)
- Improved Docker detection to prefer /tmp over ~/.cache
Changes:
- src/local_model_loader.py:
- Pass cache_dir to all from_pretrained calls
- Set HF_HOME and TRANSFORMERS_CACHE environment variables
- Authenticate with HF_TOKEN for gated repositories
- Use cache_dir from settings config
- src/config.py:
- Improved Docker detection for cache directory selection
- Prefer /tmp in Docker containers to avoid permission issues
- src/models_config.py:
- Added mistralai/Mistral-7B-Instruct-v0.2 as fallback model
- All text tasks now have non-gated fallback option
Fixes:
- PermissionError: [Errno 13] Permission denied: '/.cache'
- Gated repository access errors (now handled with a non-gated fallback)
- Missing HF_TOKEN authentication for gated models
Ready for production testing.
- src/config.py +11 -3
- src/local_model_loader.py +67 -9
- src/models_config.py +5 -3
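The cache variables are what the permission fix relies on in Docker, and HF_TOKEN is only needed for gated checkpoints. A minimal sketch of the runtime environment this change assumes (paths and values are illustrative, not taken from the deployment config):

# Illustrative only: the environment this change expects inside the container.
import os

# HF_HOME/TRANSFORMERS_CACHE may be unset and ~ may resolve to /, so a writable
# /tmp path is used; the loader also sets these itself if they are missing.
os.environ.setdefault("HF_HOME", "/tmp/huggingface_cache")
os.environ.setdefault("TRANSFORMERS_CACHE", "/tmp/huggingface_cache")

# HF_TOKEN is only required for gated repositories; without it, text tasks fall
# back to the non-gated mistralai/Mistral-7B-Instruct-v0.2.
if not os.getenv("HF_TOKEN"):
    print("HF_TOKEN not set: gated models will use the non-gated fallback")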
src/config.py
@@ -61,12 +61,20 @@ class CacheDirectoryManager:
         Returns:
             str: Path to writable cache directory
         """
+        # Priority order for cache directory
+        # In Docker, ~ may resolve to / which causes permission issues
+        # So we prefer /tmp over ~/.cache in containerized environments
+        is_docker = os.path.exists("/.dockerenv") or os.path.exists("/tmp")
+
         cache_candidates = [
             os.getenv("HF_HOME"),
             os.getenv("TRANSFORMERS_CACHE"),
-
-
-            "
+            # In Docker, prefer /tmp over ~/.cache
+            "/tmp/huggingface_cache" if is_docker else None,
+            os.path.join(os.path.expanduser("~"), ".cache", "huggingface") if os.path.expanduser("~") and not is_docker else None,
+            os.path.join(os.path.expanduser("~"), ".cache", "huggingface_fallback") if os.path.expanduser("~") and not is_docker else None,
+            "/tmp/huggingface_cache" if not is_docker else None,
+            "/tmp/huggingface"  # Final fallback
         ]
 
         for cache_dir in cache_candidates:
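The selection loop that consumes cache_candidates sits outside this hunk. A minimal sketch of how such a loop might pick the first writable candidate, assuming writability is probed by creating the directory and a throwaway file (the helper name _first_writable is illustrative, not from the repo):

# Illustrative only: the candidate-selection loop is outside this hunk.
import os

def _first_writable(cache_candidates):
    for cache_dir in cache_candidates:
        if not cache_dir:
            continue  # skip the None placeholders produced by the conditionals above
        try:
            os.makedirs(cache_dir, exist_ok=True)
            probe = os.path.join(cache_dir, ".write_test")
            with open(probe, "w") as fh:
                fh.write("ok")
            os.remove(probe)
            return cache_dir  # first candidate we can actually write to
        except OSError:
            continue  # e.g. PermissionError: [Errno 13] Permission denied: '/.cache'
    return "/tmp/huggingface"  # mirror the list's final fallback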
src/local_model_loader.py
@@ -2,6 +2,7 @@
 # Local GPU-based model loading for NVIDIA T4 Medium (16GB VRAM)
 # Optimized with 4-bit quantization to fit larger models
 import logging
+import os
 import torch
 from typing import Optional, Dict, Any
 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
@@ -10,9 +11,20 @@ from sentence_transformers import SentenceTransformer
 # Import GatedRepoError for handling gated repositories
 try:
     from huggingface_hub.exceptions import GatedRepoError
+    from huggingface_hub import login as hf_login
 except ImportError:
     # Fallback if huggingface_hub is not available
     GatedRepoError = Exception
+    hf_login = None
+
+# Import settings for cache directory and HF token
+try:
+    from .config import settings
+except ImportError:
+    try:
+        from config import settings
+    except ImportError:
+        settings = None
 
 logger = logging.getLogger(__name__)
 
@@ -39,6 +51,34 @@ class LocalModelLoader:
         self.device = device
         self.device_name = device
 
+        # Get cache directory from settings
+        if settings:
+            self.cache_dir = settings.hf_cache_dir
+            self.hf_token = settings.hf_token
+        else:
+            # Fallback to environment variables
+            self.cache_dir = os.getenv("HF_HOME") or os.getenv("TRANSFORMERS_CACHE") or "/tmp/huggingface"
+            self.hf_token = os.getenv("HF_TOKEN", "")
+
+        # Ensure cache directory exists and is writable
+        os.makedirs(self.cache_dir, exist_ok=True)
+
+        # Set environment variables for transformers/huggingface_hub
+        if not os.getenv("HF_HOME"):
+            os.environ["HF_HOME"] = self.cache_dir
+        if not os.getenv("TRANSFORMERS_CACHE"):
+            os.environ["TRANSFORMERS_CACHE"] = self.cache_dir
+
+        logger.info(f"Cache directory: {self.cache_dir}")
+
+        # Login to Hugging Face if token is provided (needed for gated repositories)
+        if self.hf_token and hf_login:
+            try:
+                hf_login(token=self.hf_token, add_to_git_credential=False)
+                logger.info("✓ HF_TOKEN authenticated for gated model access")
+            except Exception as e:
+                logger.warning(f"HF_TOKEN login failed (may not be needed): {e}")
+
         # Model cache
         self.loaded_models: Dict[str, Any] = {}
         self.loaded_tokenizers: Dict[str, Any] = {}
@@ -69,10 +109,12 @@ class LocalModelLoader:
         if base_model_id != model_id:
             logger.info(f"Stripping API suffix from {model_id}, using base model: {base_model_id}")
 
-        # Load tokenizer
+        # Load tokenizer with cache directory
         try:
             tokenizer = AutoTokenizer.from_pretrained(
                 base_model_id,
+                cache_dir=self.cache_dir,
+                token=self.hf_token if self.hf_token else None,
                 trust_remote_code=True
             )
         except GatedRepoError as e:
@@ -108,22 +150,36 @@ class LocalModelLoader:
         else:
             quantization_config = None
 
-        # Load model with GPU optimization
+        # Load model with GPU optimization and cache directory
        try:
+            load_kwargs = {
+                "cache_dir": self.cache_dir,
+                "token": self.hf_token if self.hf_token else None,
+                "trust_remote_code": True
+            }
+
            if self.device == "cuda":
+                load_kwargs.update({
+                    "device_map": "auto",  # Automatically uses GPU
+                    "torch_dtype": torch.float16,  # Use FP16 for memory efficiency
+                })
+                if quantization_config:
+                    if isinstance(quantization_config, dict):
+                        load_kwargs.update(quantization_config)
+                    else:
+                        load_kwargs["quantization_config"] = quantization_config
+
                model = AutoModelForCausalLM.from_pretrained(
                    base_model_id,
-
-                    torch_dtype=torch.float16,  # Use FP16 for memory efficiency
-                    trust_remote_code=True,
-                    **(quantization_config if isinstance(quantization_config, dict) else {}),
-                    **({"quantization_config": quantization_config} if quantization_config and not isinstance(quantization_config, dict) else {})
+                    **load_kwargs
                )
            else:
+                load_kwargs.update({
+                    "torch_dtype": torch.float32,
+                })
                model = AutoModelForCausalLM.from_pretrained(
                    base_model_id,
-
-                    trust_remote_code=True
+                    **load_kwargs
                )
                model = model.to(self.device)
        except GatedRepoError as e:
@@ -183,6 +239,8 @@ class LocalModelLoader:
             logger.info(f"Stripping API suffix from {model_id}, using base model: {base_model_id}")
 
         # SentenceTransformer automatically handles GPU
+        # Note: SentenceTransformer uses cache_dir from environment or default location
+        # We can't directly pass cache_dir, but we've set HF_HOME and TRANSFORMERS_CACHE
         try:
             model = SentenceTransformer(
                 base_model_id,
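The except GatedRepoError bodies are outside these hunks. A minimal sketch of how a gated checkpoint could be retried with the configured non-gated fallback, reusing the module's own import guard (load_with_fallback is an illustrative helper, not the repo's actual handler):

# Illustrative only: not the repo's actual except-GatedRepoError handler.
import logging
from transformers import AutoModelForCausalLM

try:
    from huggingface_hub.exceptions import GatedRepoError  # same guard as the module above
except ImportError:
    GatedRepoError = Exception

logger = logging.getLogger(__name__)

def load_with_fallback(model_id: str, fallback_id: str, **load_kwargs):
    """Try the configured model; retry with the non-gated fallback if the repo is gated."""
    try:
        return AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
    except GatedRepoError:
        logger.warning(f"{model_id} is gated and no valid HF_TOKEN was accepted; "
                       f"falling back to {fallback_id}")
        return AutoModelForCausalLM.from_pretrained(fallback_id, **load_kwargs)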
src/models_config.py
@@ -9,7 +9,7 @@ LLM_CONFIG = {
         "task": "general_reasoning",
         "max_tokens": 8000,  # Reduced from 10000
         "temperature": 0.7,
-        "fallback":
+        "fallback": "mistralai/Mistral-7B-Instruct-v0.2",  # Non-gated fallback model
         "is_chat_model": True,
         "use_4bit_quantization": True,  # Enable 4-bit quantization for 16GB T4
         "use_8bit_quantization": False
@@ -28,7 +28,8 @@ LLM_CONFIG = {
         "specialization": "fast_inference",
         "latency_target": "<100ms",
         "is_chat_model": True,
-        "use_4bit_quantization": True
+        "use_4bit_quantization": True,
+        "fallback": "mistralai/Mistral-7B-Instruct-v0.2"  # Non-gated fallback
     },
     "safety_checker": {
         "model_id": "Qwen/Qwen2.5-7B-Instruct",  # Same model for all text tasks
@@ -36,7 +37,8 @@ LLM_CONFIG = {
         "confidence_threshold": 0.85,
         "purpose": "bias_detection",
         "is_chat_model": True,
-        "use_4bit_quantization": True
+        "use_4bit_quantization": True,
+        "fallback": "mistralai/Mistral-7B-Instruct-v0.2"  # Non-gated fallback
     }
 },
 "routing_logic": {
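How consuming code reads these entries is not shown here. A minimal sketch, assuming each task config is a flat dict with the keys visible in this diff (resolve_model is an illustrative helper, not from the repo):

# Illustrative only: resolve a task's primary model and optional non-gated fallback.
from typing import Optional, Tuple

def resolve_model(task_cfg: dict) -> Tuple[str, Optional[str]]:
    """Return (primary_model_id, fallback_model_id_or_None)."""
    return task_cfg["model_id"], task_cfg.get("fallback")

# Example with the safety_checker entry from this diff:
safety_checker = {
    "model_id": "Qwen/Qwen2.5-7B-Instruct",
    "fallback": "mistralai/Mistral-7B-Instruct-v0.2",
}
primary, fallback = resolve_model(safety_checker)
print(primary, fallback)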