Fix: DynamicCache compatibility, dependencies, and Docker configuration
CRITICAL FIXES:
- Fixed DynamicCache.seen_tokens AttributeError for Phi-3 models
- Pinned transformers to a compatible range (>=4.36.0,<4.41.0)
- Added multi-level fallback chain (Mistral -> Phi-3)
- Fixed Docker user creation to prevent getpwuid errors
- Enhanced error handling for model generation
Changes:
- src/local_model_loader.py:
  - Added DynamicCache compatibility fix for Phi-3 models
  - Automatic use_cache=False for Phi-3 to avoid seen_tokens error
  - Retry logic with cache disabled if DynamicCache error occurs
  - Better error messages and logging
- src/llm_router.py:
  - Enhanced fallback chain to support fallback + fallback2
  - Sequential fallback attempts (Mistral -> Phi-3)
  - Better error handling for multi-level fallbacks
- src/models_config.py:
  - Updated fallback chain: Mistral (primary fallback) -> Phi-3 (secondary)
  - Mistral is non-gated and has no DynamicCache issues
  - Phi-3 as secondary fallback with DynamicCache workaround
- requirements.txt:
  - Pinned transformers>=4.36.0,<4.41.0 (Phi-3 compatibility)
  - Pinned torch>=2.0.0,<2.2.0 (avoid breaking changes)
  - Pinned numpy>=1.24.0,<2.0.0 (avoid numpy 2.0 issues)
  - Pinned bitsandbytes>=0.41.0,<0.43.0 (quantization stability)
  - Added flash-attention as optional (commented out)
- Dockerfile:
  - Creates appuser with UID 1000 (fixes getpwuid errors)
  - Proper directory permissions for cache and logs
  - Graceful user creation (handles existing users)
Fixes:
- AttributeError: 'DynamicCache' object has no attribute 'seen_tokens'
- KeyError: 'getpwuid(): uid not found: 1000'
- Model compatibility issues with Phi-3
- Version conflicts in dependencies
Ready for production deployment with improved stability.
- Dockerfile +14 -2
- requirements.txt +15 -5
- src/llm_router.py +45 -23
- src/local_model_loader.py +47 -6
- src/models_config.py +8 -3
Dockerfile
@@ -16,12 +16,20 @@ RUN apt-get update && apt-get install -y \
     curl \
     && rm -rf /var/lib/apt/lists/*
 
+# Create app user with UID 1000 (fixes getpwuid(): uid not found: 1000 error)
+# Note: Hugging Face Spaces may run as root, but creating user prevents errors
+# Use || true to allow graceful failure if user already exists
+RUN (useradd -u 1000 -m -s /bin/bash appuser 2>/dev/null) || \
+    (groupadd -g 1000 appuser 2>/dev/null && useradd -u 1000 -g appuser -m -s /bin/bash appuser 2>/dev/null) || \
+    echo "User creation skipped (may already exist)"
+
 # Create cache directories with proper permissions
-# Hugging Face Spaces runs as root,
+# Hugging Face Spaces runs as root, but we ensure appuser can access
 RUN mkdir -p /tmp/huggingface_cache && \
     chmod 777 /tmp/huggingface_cache && \
     mkdir -p /tmp/logs && \
-    chmod 777 /tmp/logs
+    chmod 777 /tmp/logs && \
+    (chown -R appuser:appuser /tmp/huggingface_cache /tmp/logs 2>/dev/null || true)
 
 # Copy requirements file first (for better caching)
 COPY requirements.txt .
@@ -33,6 +41,10 @@ RUN pip install --no-cache-dir --upgrade pip && \
 # Copy application code
 COPY . .
 
+# Set ownership of application files (if running as root, this ensures appuser can access)
+# Use || true to allow graceful failure if chown not needed
+RUN chown -R appuser:appuser /app 2>/dev/null || true
+
 # Expose port 7860 (HF Spaces standard)
 EXPOSE 7860
 
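As a quick sanity check for the getpwuid fix, a short script along these lines can be run inside the built image; it is illustrative only (not part of this commit) and exercises the same pwd.getpwuid lookup that previously raised KeyError when UID 1000 had no passwd entry.

# check_user.py - hypothetical verification script, not included in the repo
import os
import pwd

uid = os.getuid()
try:
    entry = pwd.getpwuid(uid)  # the lookup that failed with "getpwuid(): uid not found: 1000" before the fix
    print(f"uid {uid} -> user '{entry.pw_name}', home '{entry.pw_dir}'")
except KeyError:
    print(f"uid {uid} has no passwd entry; the appuser creation step did not take effect")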
requirements.txt
@@ -6,21 +6,25 @@
 
 # PyTorch with CUDA support (for GPU inference)
 # Note: HF Spaces provides torch, but we ensure GPU support
+# Pin to avoid breaking changes with newer versions
+torch>=2.0.0,<2.2.0
 
 # Web Framework & Interface
 aiohttp>=3.9.0
 httpx>=0.25.0
 
 # Hugging Face Ecosystem
+# PINNED for Phi-3 and DynamicCache compatibility
+# transformers 4.36.0+ has better Phi-3 support, but <4.41.0 to avoid breaking changes
+transformers>=4.36.0,<4.41.0
+accelerate>=0.24.0,<0.28.0
 tokenizers>=0.15.0
 sentence-transformers>=2.2.0
 
 # Vector Database & Search
 faiss-cpu>=1.7.4
-numpy
+# Pin numpy to avoid compatibility issues with numpy 2.0
+numpy>=1.24.0,<2.0.0
 scipy>=1.11.0
 
 # Data Processing & Utilities
@@ -86,7 +90,13 @@ gradio-pdf>=0.0.6
 
 # Model-specific dependencies
 safetensors>=0.4.0
+# Pin bitsandbytes to avoid breaking changes with quantization
+bitsandbytes>=0.41.0,<0.43.0  # Required for 4-bit and 8-bit quantization on GPU
+
+# Optional: Flash Attention (for better performance with transformer models)
+# Uncomment if you want flash attention (requires CUDA 11.8+ and compatible GPU)
+# Note: Improves performance but adds build complexity
+# flash-attn>=2.3.0  # Optional - improves performance but requires CUDA 11.8+
 
 # Development/debugging
 ipython>=8.17.0
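To confirm that the resolved versions actually fall inside the new pins after pip install -r requirements.txt, a minimal check like the following can be used; this is a sketch (the package names match the pins above, but the script itself is not part of the commit):

# check_pins.py - hypothetical helper that prints installed versions against the pinned ranges
from importlib.metadata import PackageNotFoundError, version

PINS = {
    "transformers": ">=4.36.0,<4.41.0",
    "accelerate": ">=0.24.0,<0.28.0",
    "torch": ">=2.0.0,<2.2.0",
    "numpy": ">=1.24.0,<2.0.0",
    "bitsandbytes": ">=0.41.0,<0.43.0",
}

for package, expected in PINS.items():
    try:
        print(f"{package}: installed {version(package)} (pinned {expected})")
    except PackageNotFoundError:
        print(f"{package}: not installed")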
src/llm_router.py
@@ -144,29 +144,51 @@ class LLMRouter:
                 logger.error("❌ Fallback model also failed with gated repository error")
                 raise RuntimeError("Both primary and fallback models are gated repositories") from e
 
-            # Try fallback
+            # Try fallback models in order (fallback, then fallback2)
+            fallback_chain = []
+            if model_config.get("fallback") and model_config.get("fallback") != model_id:
+                fallback_chain.append(model_config.get("fallback"))
+            if model_config.get("fallback2") and model_config.get("fallback2") != model_id:
+                fallback_chain.append(model_config.get("fallback2"))
+
+            if fallback_chain:
+                last_error = e
+                for fallback_idx, fallback_model_id in enumerate(fallback_chain):
+                    logger.warning(f"Attempting fallback model {fallback_idx + 1}/{len(fallback_chain)}: {fallback_model_id}")
+                    try:
+                        # Create fallback config
+                        fallback_config = model_config.copy()
+                        fallback_config["model_id"] = fallback_model_id
+                        # Remove this fallback and subsequent ones to prevent infinite recursion
+                        fallback_config.pop("fallback", None)
+                        fallback_config.pop("fallback2", None)
+
+                        # Retry with fallback model (mark as fallback attempt if this is the last fallback)
+                        is_last_fallback = (fallback_idx == len(fallback_chain) - 1)
+                        return await self._call_local_model(
+                            fallback_config,
+                            prompt,
+                            task_type,
+                            **{**kwargs, '_is_fallback': is_last_fallback}
+                        )
+                    except GatedRepoError as fallback_gated_error:
+                        logger.error(f"❌ Fallback model {fallback_model_id} is also gated")
+                        last_error = fallback_gated_error
+                        if fallback_idx == len(fallback_chain) - 1:
+                            # Last fallback failed
+                            raise RuntimeError("All models (primary and fallbacks) are gated repositories") from fallback_gated_error
+                        # Continue to next fallback
+                        continue
+                    except Exception as fallback_error:
+                        logger.error(f"Fallback model {fallback_model_id} failed: {fallback_error}")
+                        last_error = fallback_error
+                        if fallback_idx == len(fallback_chain) - 1:
+                            # Last fallback failed
+                            raise
+                        # Continue to next fallback
+                        continue
+                # All fallbacks exhausted
+                raise RuntimeError(f"All models failed. Last error: {last_error}") from last_error
             else:
                 raise RuntimeError(f"Model {model_id} is a gated repository and no fallback available") from e
         except (RuntimeError, ModuleNotFoundError, ImportError) as e:
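The ordering logic above can be shown in isolation. The helper below is a hypothetical standalone sketch (build_fallback_chain is not a function in the repo); it mirrors how fallback and fallback2 are collected and de-duplicated against the failing model before the sequential retries:

# standalone sketch of the fallback ordering used in _call_local_model above
def build_fallback_chain(model_config: dict, failed_model_id: str) -> list:
    """Return fallback candidates in order (fallback, then fallback2), skipping the model that just failed."""
    chain = []
    for key in ("fallback", "fallback2"):
        candidate = model_config.get(key)
        if candidate and candidate != failed_model_id:
            chain.append(candidate)
    return chain

example_config = {
    "model_id": "Qwen/Qwen2.5-7B-Instruct",
    "fallback": "mistralai/Mistral-7B-Instruct-v0.2",
    "fallback2": "microsoft/Phi-3-mini-4k-instruct",
}
print(build_fallback_chain(example_config, example_config["model_id"]))
# -> ['mistralai/Mistral-7B-Instruct-v0.2', 'microsoft/Phi-3-mini-4k-instruct']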
src/local_model_loader.py
@@ -369,16 +369,30 @@ class LocalModelLoader:
             # Tokenize input
             inputs = tokenizer(prompt, return_tensors="pt").to(self.device)
 
+            # Prepare generation kwargs
+            generation_kwargs = {
+                "max_new_tokens": max_tokens,
+                "temperature": temperature,
+                "do_sample": True,
+                "pad_token_id": tokenizer.pad_token_id,
+                "eos_token_id": tokenizer.eos_token_id,
+            }
+
+            # Add compatibility fix for Phi-3 DynamicCache issues
+            # Phi-3 models may use DynamicCache which doesn't have seen_tokens in some versions
+            if "phi" in model_id.lower() or "phi3" in model_id.lower() or "phi-3" in model_id.lower():
+                # Use cache=False as workaround for DynamicCache.seen_tokens AttributeError
+                generation_kwargs["use_cache"] = False
+                logger.debug(f"Using use_cache=False for Phi-3 model to avoid DynamicCache compatibility issues")
+
+            # Merge additional kwargs (may override above settings)
+            generation_kwargs.update(kwargs)
+
             # Generate
             with torch.no_grad():
                 outputs = model.generate(
                     **inputs,
-                    temperature=temperature,
-                    do_sample=True,
-                    pad_token_id=tokenizer.pad_token_id,
-                    eos_token_id=tokenizer.eos_token_id,
-                    **kwargs
+                    **generation_kwargs
                 )
 
             # Decode
@@ -390,6 +404,33 @@ class LocalModelLoader:
 
             return generated_text
 
+        except AttributeError as e:
+            # Handle DynamicCache.seen_tokens AttributeError specifically
+            if "seen_tokens" in str(e) or "DynamicCache" in str(e):
+                logger.warning(f"DynamicCache compatibility issue detected ({e}), retrying without cache")
+                try:
+                    # Retry without cache to avoid DynamicCache issues
+                    with torch.no_grad():
+                        outputs = model.generate(
+                            **inputs,
+                            max_new_tokens=max_tokens,
+                            temperature=temperature,
+                            do_sample=True,
+                            use_cache=False,  # Disable cache to avoid DynamicCache issues
+                            pad_token_id=tokenizer.pad_token_id,
+                            eos_token_id=tokenizer.eos_token_id,
+                            **{k: v for k, v in kwargs.items() if k != "use_cache"}  # Remove use_cache from kwargs
+                        )
+                    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+                    if generated_text.startswith(prompt):
+                        generated_text = generated_text[len(prompt):].strip()
+                    logger.info("✓ Generation successful after DynamicCache workaround")
+                    return generated_text
+                except Exception as retry_error:
+                    logger.error(f"Retry without cache also failed: {retry_error}", exc_info=True)
+                    raise RuntimeError(f"Generation failed even with cache disabled: {retry_error}") from retry_error
+            # Re-raise if it's a different AttributeError
+            raise
         except Exception as e:
             logger.error(f"Error generating text: {e}", exc_info=True)
             raise
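The same retry pattern, factored into a small helper for clarity; this is a sketch assuming a standard transformers model/tokenizer pair (generate_with_cache_fallback is hypothetical and not part of the module):

# minimal sketch of the DynamicCache workaround: try normal generation, retry once with use_cache=False
import torch

def generate_with_cache_fallback(model, tokenizer, prompt, **gen_kwargs):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    try:
        with torch.no_grad():
            outputs = model.generate(**inputs, **gen_kwargs)
    except AttributeError as err:
        if "seen_tokens" not in str(err) and "DynamicCache" not in str(err):
            raise  # unrelated AttributeError, surface it unchanged
        retry_kwargs = {k: v for k, v in gen_kwargs.items() if k != "use_cache"}
        with torch.no_grad():
            outputs = model.generate(**inputs, use_cache=False, **retry_kwargs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)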
src/models_config.py
@@ -5,11 +5,14 @@ LLM_CONFIG = {
     "primary_provider": "local",
     "models": {
         "reasoning_primary": {
+            # Primary: Qwen (gated, requires access) - Fallback: Mistral (non-gated, stable)
             "model_id": "Qwen/Qwen2.5-7B-Instruct",  # Single primary model for all text tasks
             "task": "general_reasoning",
             "max_tokens": 8000,  # Reduced from 10000
             "temperature": 0.7,
+            # Fallback to Mistral (non-gated, no DynamicCache issues) before Phi-3
+            "fallback": "mistralai/Mistral-7B-Instruct-v0.2",  # Non-gated, stable, no DynamicCache issues
+            "fallback2": "microsoft/Phi-3-mini-4k-instruct",  # Secondary fallback (3.8B, has DynamicCache workaround)
             "is_chat_model": True,
             "use_4bit_quantization": True,  # Enable 4-bit quantization for 16GB T4
             "use_8bit_quantization": False
@@ -29,7 +32,8 @@ LLM_CONFIG = {
             "latency_target": "<100ms",
             "is_chat_model": True,
             "use_4bit_quantization": True,
-            "fallback": "
+            "fallback": "mistralai/Mistral-7B-Instruct-v0.2",  # Non-gated, stable
+            "fallback2": "microsoft/Phi-3-mini-4k-instruct"  # Secondary fallback with DynamicCache workaround
         },
         "safety_checker": {
             "model_id": "Qwen/Qwen2.5-7B-Instruct",  # Same model for all text tasks
@@ -38,7 +42,8 @@ LLM_CONFIG = {
             "purpose": "bias_detection",
             "is_chat_model": True,
             "use_4bit_quantization": True,
-            "fallback": "
+            "fallback": "mistralai/Mistral-7B-Instruct-v0.2",  # Non-gated, stable
+            "fallback2": "microsoft/Phi-3-mini-4k-instruct"  # Secondary fallback with DynamicCache workaround
         }
     },
     "routing_logic": {
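Since every text task now carries up to two fallbacks, a small enumeration like the one below lists every model the router could attempt, which helps when pre-accepting gated repos or budgeting cache space. This is a hypothetical sketch that assumes src/models_config.py exposes LLM_CONFIG as shown above:

# sketch: collect every primary and fallback model id referenced by the config
from src.models_config import LLM_CONFIG  # assumed import path, matching the repo layout above

def candidate_models(config: dict) -> set:
    models = set()
    for entry in config.get("models", {}).values():
        for key in ("model_id", "fallback", "fallback2"):
            if entry.get(key):
                models.add(entry[key])
    return models

print(sorted(candidate_models(LLM_CONFIG)))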