Fix: DynamicCache compatibility, dependencies, and Docker configuration
CRITICAL FIXES:
- Fixed DynamicCache.seen_tokens AttributeError for Phi-3 models
- Pinned transformers to a compatible range (>=4.36.0,<4.41.0)
- Added multi-level fallback chain (Mistral -> Phi-3)
- Fixed Docker user creation to prevent getpwuid errors
- Enhanced error handling for model generation
Changes:
- src/local_model_loader.py:
  - Added DynamicCache compatibility fix for Phi-3 models
  - Automatic use_cache=False for Phi-3 to avoid seen_tokens error
  - Retry logic with cache disabled if DynamicCache error occurs
  - Better error messages and logging
- src/llm_router.py:
  - Enhanced fallback chain to support fallback + fallback2
  - Sequential fallback attempts (Mistral -> Phi-3)
  - Better error handling for multi-level fallbacks
- src/models_config.py:
  - Updated fallback chain: Mistral (primary fallback) -> Phi-3 (secondary)
  - Mistral is non-gated and has no DynamicCache issues
  - Phi-3 as secondary fallback with DynamicCache workaround
- requirements.txt:
  - Pinned transformers>=4.36.0,<4.41.0 (Phi-3 compatibility)
  - Pinned torch>=2.0.0,<2.2.0 (avoid breaking changes)
  - Pinned numpy>=1.24.0,<2.0.0 (avoid numpy 2.0 issues)
  - Pinned bitsandbytes>=0.41.0,<0.43.0 (quantization stability)
  - Added flash-attention as optional (commented out)
- Dockerfile:
  - Creates appuser with UID 1000 (fixes getpwuid errors)
  - Proper directory permissions for cache and logs
  - Graceful user creation (handles existing users)
Fixes:
- AttributeError: 'DynamicCache' object has no attribute 'seen_tokens'
- KeyError: 'getpwuid(): uid not found: 1000'
- Model compatibility issues with Phi-3
- Version conflicts in dependencies
Ready for production deployment with improved stability.
- Dockerfile +14 -2
- requirements.txt +15 -5
- src/llm_router.py +45 -23
- src/local_model_loader.py +47 -6
- src/models_config.py +8 -3
Dockerfile
@@ -16,12 +16,20 @@ RUN apt-get update && apt-get install -y \
     curl \
     && rm -rf /var/lib/apt/lists/*
 
+# Create app user with UID 1000 (fixes getpwuid(): uid not found: 1000 error)
+# Note: Hugging Face Spaces may run as root, but creating user prevents errors
+# Use || true to allow graceful failure if user already exists
+RUN (useradd -u 1000 -m -s /bin/bash appuser 2>/dev/null) || \
+    (groupadd -g 1000 appuser 2>/dev/null && useradd -u 1000 -g appuser -m -s /bin/bash appuser 2>/dev/null) || \
+    echo "User creation skipped (may already exist)"
+
 # Create cache directories with proper permissions
-# Hugging Face Spaces runs as root,
+# Hugging Face Spaces runs as root, but we ensure appuser can access
 RUN mkdir -p /tmp/huggingface_cache && \
     chmod 777 /tmp/huggingface_cache && \
     mkdir -p /tmp/logs && \
-    chmod 777 /tmp/logs
+    chmod 777 /tmp/logs && \
+    (chown -R appuser:appuser /tmp/huggingface_cache /tmp/logs 2>/dev/null || true)
 
 # Copy requirements file first (for better caching)
 COPY requirements.txt .
@@ -33,6 +41,10 @@ RUN pip install --no-cache-dir --upgrade pip && \
 # Copy application code
 COPY . .
 
+# Set ownership of application files (if running as root, this ensures appuser can access)
+# Use || true to allow graceful failure if chown not needed
+RUN chown -R appuser:appuser /app 2>/dev/null || true
+
 # Expose port 7860 (HF Spaces standard)
 EXPOSE 7860
 
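As a quick sanity check for the getpwuid fix, a short script along these lines can be run inside the built image; it is illustrative only (not part of this commit) and exercises the same pwd.getpwuid lookup that previously raised KeyError when UID 1000 had no passwd entry.

# check_user.py - hypothetical verification script, not included in the repo
import os
import pwd

uid = os.getuid()
try:
    entry = pwd.getpwuid(uid)  # the lookup that failed with "getpwuid(): uid not found: 1000" before the fix
    print(f"uid {uid} -> user '{entry.pw_name}', home '{entry.pw_dir}'")
except KeyError:
    print(f"uid {uid} has no passwd entry; the appuser creation step did not take effect")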
requirements.txt
@@ -6,21 +6,25 @@
 
 # PyTorch with CUDA support (for GPU inference)
 # Note: HF Spaces provides torch, but we ensure GPU support
+# Pin to avoid breaking changes with newer versions
+torch>=2.0.0,<2.2.0
 
 # Web Framework & Interface
 aiohttp>=3.9.0
 httpx>=0.25.0
 
 # Hugging Face Ecosystem
+# PINNED for Phi-3 and DynamicCache compatibility
+# transformers 4.36.0+ has better Phi-3 support, but <4.41.0 to avoid breaking changes
+transformers>=4.36.0,<4.41.0
+accelerate>=0.24.0,<0.28.0
 tokenizers>=0.15.0
 sentence-transformers>=2.2.0
 
 # Vector Database & Search
 faiss-cpu>=1.7.4
-numpy
+# Pin numpy to avoid compatibility issues with numpy 2.0
+numpy>=1.24.0,<2.0.0
 scipy>=1.11.0
 
 # Data Processing & Utilities
@@ -86,7 +90,13 @@ gradio-pdf>=0.0.6
 
 # Model-specific dependencies
 safetensors>=0.4.0
+# Pin bitsandbytes to avoid breaking changes with quantization
+bitsandbytes>=0.41.0,<0.43.0  # Required for 4-bit and 8-bit quantization on GPU
+
+# Optional: Flash Attention (for better performance with transformer models)
+# Uncomment if you want flash attention (requires CUDA 11.8+ and compatible GPU)
+# Note: Improves performance but adds build complexity
+# flash-attn>=2.3.0  # Optional - improves performance but requires CUDA 11.8+
 
 # Development/debugging
 ipython>=8.17.0
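To confirm that the resolved versions actually fall inside the new pins after pip install -r requirements.txt, a minimal check like the following can be used; this is a sketch (the package names match the pins above, but the script itself is not part of the commit):

# check_pins.py - hypothetical helper that prints installed versions against the pinned ranges
from importlib.metadata import PackageNotFoundError, version

PINS = {
    "transformers": ">=4.36.0,<4.41.0",
    "accelerate": ">=0.24.0,<0.28.0",
    "torch": ">=2.0.0,<2.2.0",
    "numpy": ">=1.24.0,<2.0.0",
    "bitsandbytes": ">=0.41.0,<0.43.0",
}

for package, expected in PINS.items():
    try:
        print(f"{package}: installed {version(package)} (pinned {expected})")
    except PackageNotFoundError:
        print(f"{package}: not installed")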
src/llm_router.py
@@ -144,29 +144,51 @@ class LLMRouter:
                 logger.error("❌ Fallback model also failed with gated repository error")
                 raise RuntimeError("Both primary and fallback models are gated repositories") from e
 
-            # Try fallback
+            # Try fallback models in order (fallback, then fallback2)
+            fallback_chain = []
+            if model_config.get("fallback") and model_config.get("fallback") != model_id:
+                fallback_chain.append(model_config.get("fallback"))
+            if model_config.get("fallback2") and model_config.get("fallback2") != model_id:
+                fallback_chain.append(model_config.get("fallback2"))
+
+            if fallback_chain:
+                last_error = e
+                for fallback_idx, fallback_model_id in enumerate(fallback_chain):
+                    logger.warning(f"Attempting fallback model {fallback_idx + 1}/{len(fallback_chain)}: {fallback_model_id}")
+                    try:
+                        # Create fallback config
+                        fallback_config = model_config.copy()
+                        fallback_config["model_id"] = fallback_model_id
+                        # Remove this fallback and subsequent ones to prevent infinite recursion
+                        fallback_config.pop("fallback", None)
+                        fallback_config.pop("fallback2", None)
+
+                        # Retry with fallback model (mark as fallback attempt if this is the last fallback)
+                        is_last_fallback = (fallback_idx == len(fallback_chain) - 1)
+                        return await self._call_local_model(
+                            fallback_config,
+                            prompt,
+                            task_type,
+                            **{**kwargs, '_is_fallback': is_last_fallback}
+                        )
+                    except GatedRepoError as fallback_gated_error:
+                        logger.error(f"❌ Fallback model {fallback_model_id} is also gated")
+                        last_error = fallback_gated_error
+                        if fallback_idx == len(fallback_chain) - 1:
+                            # Last fallback failed
+                            raise RuntimeError("All models (primary and fallbacks) are gated repositories") from fallback_gated_error
+                        # Continue to next fallback
+                        continue
+                    except Exception as fallback_error:
+                        logger.error(f"Fallback model {fallback_model_id} failed: {fallback_error}")
+                        last_error = fallback_error
+                        if fallback_idx == len(fallback_chain) - 1:
+                            # Last fallback failed
+                            raise
+                        # Continue to next fallback
+                        continue
+                # All fallbacks exhausted
+                raise RuntimeError(f"All models failed. Last error: {last_error}") from last_error
             else:
                 raise RuntimeError(f"Model {model_id} is a gated repository and no fallback available") from e
         except (RuntimeError, ModuleNotFoundError, ImportError) as e:
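The ordering logic above can be shown in isolation. The helper below is a hypothetical standalone sketch (build_fallback_chain is not a function in the repo); it mirrors how fallback and fallback2 are collected and de-duplicated against the failing model before the sequential retries:

# standalone sketch of the fallback ordering used in _call_local_model above
def build_fallback_chain(model_config: dict, failed_model_id: str) -> list:
    """Return fallback candidates in order (fallback, then fallback2), skipping the model that just failed."""
    chain = []
    for key in ("fallback", "fallback2"):
        candidate = model_config.get(key)
        if candidate and candidate != failed_model_id:
            chain.append(candidate)
    return chain

example_config = {
    "model_id": "Qwen/Qwen2.5-7B-Instruct",
    "fallback": "mistralai/Mistral-7B-Instruct-v0.2",
    "fallback2": "microsoft/Phi-3-mini-4k-instruct",
}
print(build_fallback_chain(example_config, example_config["model_id"]))
# -> ['mistralai/Mistral-7B-Instruct-v0.2', 'microsoft/Phi-3-mini-4k-instruct']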
src/local_model_loader.py
@@ -369,16 +369,30 @@ class LocalModelLoader:
             # Tokenize input
             inputs = tokenizer(prompt, return_tensors="pt").to(self.device)
 
+            # Prepare generation kwargs
+            generation_kwargs = {
+                "max_new_tokens": max_tokens,
+                "temperature": temperature,
+                "do_sample": True,
+                "pad_token_id": tokenizer.pad_token_id,
+                "eos_token_id": tokenizer.eos_token_id,
+            }
+
+            # Add compatibility fix for Phi-3 DynamicCache issues
+            # Phi-3 models may use DynamicCache which doesn't have seen_tokens in some versions
+            if "phi" in model_id.lower() or "phi3" in model_id.lower() or "phi-3" in model_id.lower():
+                # Use cache=False as workaround for DynamicCache.seen_tokens AttributeError
+                generation_kwargs["use_cache"] = False
+                logger.debug(f"Using use_cache=False for Phi-3 model to avoid DynamicCache compatibility issues")
+
+            # Merge additional kwargs (may override above settings)
+            generation_kwargs.update(kwargs)
+
             # Generate
             with torch.no_grad():
                 outputs = model.generate(
                     **inputs,
-                    temperature=temperature,
-                    do_sample=True,
-                    pad_token_id=tokenizer.pad_token_id,
-                    eos_token_id=tokenizer.eos_token_id,
-                    **kwargs
+                    **generation_kwargs
                 )
 
             # Decode
@@ -390,6 +404,33 @@ class LocalModelLoader:
 
             return generated_text
 
+        except AttributeError as e:
+            # Handle DynamicCache.seen_tokens AttributeError specifically
+            if "seen_tokens" in str(e) or "DynamicCache" in str(e):
+                logger.warning(f"DynamicCache compatibility issue detected ({e}), retrying without cache")
+                try:
+                    # Retry without cache to avoid DynamicCache issues
+                    with torch.no_grad():
+                        outputs = model.generate(
+                            **inputs,
+                            max_new_tokens=max_tokens,
+                            temperature=temperature,
+                            do_sample=True,
+                            use_cache=False,  # Disable cache to avoid DynamicCache issues
+                            pad_token_id=tokenizer.pad_token_id,
+                            eos_token_id=tokenizer.eos_token_id,
+                            **{k: v for k, v in kwargs.items() if k != "use_cache"}  # Remove use_cache from kwargs
+                        )
+                    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+                    if generated_text.startswith(prompt):
+                        generated_text = generated_text[len(prompt):].strip()
+                    logger.info("✓ Generation successful after DynamicCache workaround")
+                    return generated_text
+                except Exception as retry_error:
+                    logger.error(f"Retry without cache also failed: {retry_error}", exc_info=True)
+                    raise RuntimeError(f"Generation failed even with cache disabled: {retry_error}") from retry_error
+            # Re-raise if it's a different AttributeError
+            raise
         except Exception as e:
             logger.error(f"Error generating text: {e}", exc_info=True)
             raise
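The same retry pattern, factored into a small helper for clarity; this is a sketch assuming a standard transformers model/tokenizer pair (generate_with_cache_fallback is hypothetical and not part of the module):

# minimal sketch of the DynamicCache workaround: try normal generation, retry once with use_cache=False
import torch

def generate_with_cache_fallback(model, tokenizer, prompt, **gen_kwargs):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    try:
        with torch.no_grad():
            outputs = model.generate(**inputs, **gen_kwargs)
    except AttributeError as err:
        if "seen_tokens" not in str(err) and "DynamicCache" not in str(err):
            raise  # unrelated AttributeError, surface it unchanged
        retry_kwargs = {k: v for k, v in gen_kwargs.items() if k != "use_cache"}
        with torch.no_grad():
            outputs = model.generate(**inputs, use_cache=False, **retry_kwargs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)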
src/models_config.py
@@ -5,11 +5,14 @@ LLM_CONFIG = {
     "primary_provider": "local",
     "models": {
         "reasoning_primary": {
+            # Primary: Qwen (gated, requires access) - Fallback: Mistral (non-gated, stable)
             "model_id": "Qwen/Qwen2.5-7B-Instruct",  # Single primary model for all text tasks
             "task": "general_reasoning",
             "max_tokens": 8000,  # Reduced from 10000
             "temperature": 0.7,
+            # Fallback to Mistral (non-gated, no DynamicCache issues) before Phi-3
+            "fallback": "mistralai/Mistral-7B-Instruct-v0.2",  # Non-gated, stable, no DynamicCache issues
+            "fallback2": "microsoft/Phi-3-mini-4k-instruct",  # Secondary fallback (3.8B, has DynamicCache workaround)
             "is_chat_model": True,
             "use_4bit_quantization": True,  # Enable 4-bit quantization for 16GB T4
             "use_8bit_quantization": False
@@ -29,7 +32,8 @@ LLM_CONFIG = {
             "latency_target": "<100ms",
             "is_chat_model": True,
             "use_4bit_quantization": True,
-            "fallback": "
+            "fallback": "mistralai/Mistral-7B-Instruct-v0.2",  # Non-gated, stable
+            "fallback2": "microsoft/Phi-3-mini-4k-instruct"  # Secondary fallback with DynamicCache workaround
         },
         "safety_checker": {
             "model_id": "Qwen/Qwen2.5-7B-Instruct",  # Same model for all text tasks
@@ -38,7 +42,8 @@ LLM_CONFIG = {
             "purpose": "bias_detection",
             "is_chat_model": True,
             "use_4bit_quantization": True,
-            "fallback": "
+            "fallback": "mistralai/Mistral-7B-Instruct-v0.2",  # Non-gated, stable
+            "fallback2": "microsoft/Phi-3-mini-4k-instruct"  # Secondary fallback with DynamicCache workaround
         }
     },
     "routing_logic": {
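Since every text task now carries up to two fallbacks, a small enumeration like the one below lists every model the router could attempt, which helps when pre-accepting gated repos or budgeting cache space. This is a hypothetical sketch that assumes src/models_config.py exposes LLM_CONFIG as shown above:

# sketch: collect every primary and fallback model id referenced by the config
from src.models_config import LLM_CONFIG  # assumed import path, matching the repo layout above

def candidate_models(config: dict) -> set:
    models = set()
    for entry in config.get("models", {}).values():
        for key in ("model_id", "fallback", "fallback2"):
            if entry.get(key):
                models.add(entry[key])
    return models

print(sorted(candidate_models(LLM_CONFIG)))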