remove Phi-4 and DeepSeek Lite, change model keys for GGUFs
app.py
CHANGED
@@ -35,36 +35,26 @@ os.makedirs("performance_metrics", exist_ok=True)
 
 # Model configuration dictionary
 MODEL_CONFIG = {
-    "Llama 2 Chat": {
+    "Llama 2 Chat GGUF": {
         "name": "TheBloke/Llama-2-7B-Chat-GGUF",
         "description": "Llama 2 7B Chat model with good general performance",
         "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
     },
-    "TinyLlama Chat": {
+    "TinyLlama Chat GGUF": {
         "name": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
         "description": "Lightweight model with 1.1B parameters, fast and efficient",
         "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
     },
-    "Mistral Instruct": {
+    "Mistral Instruct GGUF": {
         "name": "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
         "description": "7B instruction-tuned model with excellent reasoning",
         "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
     },
-    "Phi-4 Mini Instruct": {
-        "name": "microsoft/Phi-4-mini-instruct",
-        "description": "Lightweight model from Microsoft suitable for instructional tasks",
-        "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
-    },
     "DeepSeek Coder Instruct": {
         "name": "deepseek-ai/deepseek-coder-1.3b-instruct",
         "description": "1.3B model for code and data analysis",
         "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
     },
-    "DeepSeek Lite Chat": {
-        "name": "deepseek-ai/DeepSeek-V2-Lite-Chat",
-        "description": "Light but powerful chat model from DeepSeek",
-        "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
-    },
     "Qwen2.5 Coder Instruct": {
         "name": "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF",
         "description": "3B model specialized for code and technical applications",
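The renamed keys now make it explicit that these entries point at GGUF repositories, which the app loads along a separate code path (the cache carries an "is_gguf" flag). As a rough illustration only, here is a minimal sketch of how such a key could be resolved to a quantized file and loaded with llama-cpp-python; the backend choice, the load_gguf_model helper, the quantization filename, and the n_ctx/n_threads values are assumptions, not code from app.py.

# Minimal sketch (not from app.py): resolve a GGUF model key to a local file
# and load it with llama-cpp-python. The filename is a hypothetical quant choice.
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

def load_gguf_model(model_key: str, model_config: dict) -> Llama:
    repo_id = model_config[model_key]["name"]  # e.g. "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
    gguf_file = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"  # hypothetical quantization file
    local_path = hf_hub_download(repo_id=repo_id, filename=gguf_file)
    # Small context window and thread count, sized for a CPU-only Space
    return Llama(model_path=local_path, n_ctx=2048, n_threads=2)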
@@ -174,70 +164,6 @@ def initialize_model_once(model_key):
         )
         MODEL_CACHE["is_gguf"] = False
 
-    # For Phi-4 specifically
-    elif "Phi-4" in model_key:
-        MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-        # Load model with optimized memory
-        try:
-            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                device_map="cpu",  # Force CPU explicitly
-                torch_dtype=torch.float32,  # Use float32 for CPU
-                low_cpu_mem_usage=True,
-                trust_remote_code=True,
-                offload_folder="model_offload",
-                offload_state_dict=True,
-                max_memory={"cpu": "1.7GiB"}  # Limit memory usage
-            )
-
-        except Exception as e:
-            print(f"Error loading Phi-4 with full settings: {str(e)}")
-            print("Trying with minimal configuration...")
-
-            # Fallback with minimum configuration
-            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                device_map="cpu",
-                torch_dtype=torch.float32,
-                trust_remote_code=True,
-                offload_folder="model_offload",
-                low_cpu_mem_usage=True
-            )
-
-        MODEL_CACHE["is_gguf"] = False
-
-    # Special handling for DeepSeek Lite Chat
-    elif model_key == "DeepSeek Lite Chat":
-        MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-        # Load model with optimized memory
-        try:
-            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                device_map="cpu",  # Force CPU
-                torch_dtype=torch.float32,  # Use float32 for CPU
-                low_cpu_mem_usage=True,
-                trust_remote_code=True,
-                max_memory={"cpu": "1.7GiB"}
-            )
-        except Exception as e:
-            print(f"Error loading DeepSeek with full settings: {str(e)}")
-            print("Trying with lightweight approach...")
-
-            # Fallback to lighter approach
-            import torch.nn as nn
-            from transformers import PreTrainedModel
-
-            # Trying to load model with smaller fraction
-            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                device_map="cpu",
-                torch_dtype=torch.float32,
-                trust_remote_code=True,
-                low_cpu_mem_usage=True
-            )
-
-            MODEL_CACHE["is_gguf"] = False
-
     # Handle standard HF models
     else:
         # Only use quantization if CUDA is available
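With the Phi-4 and DeepSeek Lite branches deleted, every remaining non-GGUF key falls through to the standard Hugging Face branch, which, per the surviving comment, only quantizes when CUDA is available. The sketch below shows that pattern under stated assumptions: the load_standard_hf_model helper, the 4-bit bitsandbytes settings, and the CPU fallback values are illustrative, since the actual settings in app.py lie outside this diff.

# Minimal sketch (not from app.py): quantize only when CUDA is available,
# otherwise load in float32 on CPU.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

def load_standard_hf_model(model_name: str):
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    if torch.cuda.is_available():
        # Illustrative 4-bit quantization config for GPU
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            quantization_config=quant_config,
            trust_remote_code=True,
        )
    else:
        # CPU-only fallback: no quantization, keep memory use low
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="cpu",
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )
    return tokenizer, model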
@@ -296,28 +222,13 @@ def initialize_model_once(model_key):
 def get_fallback_model(current_model):
     """Get appropriate fallback model for problematic models"""
     fallback_map = {
-        "
-        "DeepSeek Lite Chat": "DeepSeek Coder Instruct",
-        "Flan T5 Small": "Llama 2 Chat"
+        "Flan T5 Small": "Llama 2 Chat GGUF"
     }
-    return fallback_map.get(current_model, "Llama 2 Chat")
+    return fallback_map.get(current_model, "Llama 2 Chat GGUF")
 
-# Optimized pipeline for
+# Optimized pipeline for models
 def create_optimized_pipeline(model, tokenizer, model_key):
-    """Optimized pipeline for
+    """Optimized pipeline for models"""
-    if model_key == "Phi-4 Mini Instruct" or model_key == "DeepSeek Lite Chat":
-        # Use minimal parameters
-        pipe = pipeline(
-            "text-generation",
-            model=model,
-            tokenizer=tokenizer,
-            max_new_tokens=128,  # Reduce the number of generated tokens
-            temperature=0.3,
-            top_p=0.9,
-            return_full_text=False,
-        )
-        return HuggingFacePipeline(pipeline=pipe)
-    else:
     # Default pipeline for other models
     pipe = pipeline(
         "text-generation",
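Because the special-case branch for Phi-4 and DeepSeek Lite is gone, create_optimized_pipeline is left with only its default path: build a transformers text-generation pipeline and wrap it in HuggingFacePipeline, which the removed code already did. A minimal sketch of what the simplified function plausibly reduces to follows; the generation parameters and the langchain_community import path are assumptions rather than lines taken from app.py.

# Minimal sketch (not the exact app.py code): the default pipeline path only.
from transformers import pipeline
from langchain_community.llms import HuggingFacePipeline  # assumed import path

def create_optimized_pipeline(model, tokenizer, model_key):
    """Optimized pipeline for models"""
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,      # placeholder token budget
        temperature=0.3,
        top_p=0.9,
        return_full_text=False,  # return only the newly generated text
    )
    return HuggingFacePipeline(pipeline=pipe)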
@@ -428,12 +339,12 @@ def handle_model_loading_error(model_key, session_id):
     # Regular suggestion logic for when fallbacks don't work or aren't applicable
     suggested_models = [
         "DeepSeek Coder Instruct",  # 1.3B model
-        "TinyLlama Chat",  # 1.1B model
+        "TinyLlama Chat GGUF",  # 1.1B model
         "Qwen2.5 Coder Instruct"  # Another option
     ]
 
     # Remove problematic models and current model from suggestions
-    problem_models = ["
+    problem_models = ["Flan T5 Small"]
     suggested_models = [m for m in suggested_models if m not in problem_models and m != model_key]
 
     suggestions = ", ".join(suggested_models[:3])  # Only show top 3 suggestions
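After the commit, the error handler suggests only models that remain in MODEL_CONFIG, filtering out the single remaining problem model and the key that just failed. The snippet below is a self-contained, hypothetical recombination of the fallback map and the suggestion filter; suggest_alternatives and its message wording do not exist in app.py.

# Hypothetical helper (not in app.py) combining fallback choice and suggestions.
FALLBACK_MAP = {"Flan T5 Small": "Llama 2 Chat GGUF"}

def suggest_alternatives(model_key: str) -> str:
    fallback = FALLBACK_MAP.get(model_key, "Llama 2 Chat GGUF")
    suggested = ["DeepSeek Coder Instruct", "TinyLlama Chat GGUF", "Qwen2.5 Coder Instruct"]
    problem_models = ["Flan T5 Small"]
    # Drop problem models and the model that just failed to load
    suggested = [m for m in suggested if m not in problem_models and m != model_key]
    return f"Could not load {model_key}; trying {fallback}. Other options: {', '.join(suggested[:3])}."

print(suggest_alternatives("Mistral Instruct GGUF"))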