remove Phi-4 and DeepSeek Lite, change model keys for GGUFs
app.py
CHANGED
@@ -35,36 +35,26 @@ os.makedirs("performance_metrics", exist_ok=True)
 
 # Model configuration dictionary
 MODEL_CONFIG = {
-    "Llama 2 Chat": {
+    "Llama 2 Chat GGUF": {
         "name": "TheBloke/Llama-2-7B-Chat-GGUF",
         "description": "Llama 2 7B Chat model with good general performance",
         "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
     },
-    "TinyLlama Chat": {
+    "TinyLlama Chat GGUF": {
         "name": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
         "description": "Lightweight model with 1.1B parameters, fast and efficient",
         "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
     },
-    "Mistral Instruct": {
+    "Mistral Instruct GGUF": {
         "name": "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
         "description": "7B instruction-tuned model with excellent reasoning",
         "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
     },
-    "Phi-4 Mini Instruct": {
-        "name": "microsoft/Phi-4-mini-instruct",
-        "description": "Lightweight model from Microsoft suitable for instructional tasks",
-        "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
-    },
     "DeepSeek Coder Instruct": {
         "name": "deepseek-ai/deepseek-coder-1.3b-instruct",
         "description": "1.3B model for code and data analysis",
         "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
     },
-    "DeepSeek Lite Chat": {
-        "name": "deepseek-ai/DeepSeek-V2-Lite-Chat",
-        "description": "Light but powerful chat model from DeepSeek",
-        "dtype": torch.float16 if torch.cuda.is_available() else torch.float32
-    },
     "Qwen2.5 Coder Instruct": {
         "name": "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF",
         "description": "3B model specialized for code and technical applications",
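The renamed keys now make it explicit that these entries point at GGUF repositories, which the app loads along a separate code path (the cache carries an "is_gguf" flag). As a rough illustration only, here is a minimal sketch of how such a key could be resolved to a quantized file and loaded with llama-cpp-python; the backend choice, the load_gguf_model helper, the quantization filename, and the n_ctx/n_threads values are assumptions, not code from app.py.

# Minimal sketch (not from app.py): resolve a GGUF model key to a local file
# and load it with llama-cpp-python. The filename is a hypothetical quant choice.
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

def load_gguf_model(model_key: str, model_config: dict) -> Llama:
    repo_id = model_config[model_key]["name"]  # e.g. "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
    gguf_file = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"  # hypothetical quantization file
    local_path = hf_hub_download(repo_id=repo_id, filename=gguf_file)
    # Small context window and thread count, sized for a CPU-only Space
    return Llama(model_path=local_path, n_ctx=2048, n_threads=2)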
@@ -174,70 +164,6 @@ def initialize_model_once(model_key):
         )
         MODEL_CACHE["is_gguf"] = False
 
-    # For Phi-4 specifically
-    elif "Phi-4" in model_key:
-        MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-        # Load model with optimized memory
-        try:
-            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                device_map="cpu",  # Force CPU explicitly
-                torch_dtype=torch.float32,  # Use float32 for CPU
-                low_cpu_mem_usage=True,
-                trust_remote_code=True,
-                offload_folder="model_offload",
-                offload_state_dict=True,
-                max_memory={"cpu": "1.7GiB"}  # Limit memory usage
-            )
-
-        except Exception as e:
-            print(f"Error loading Phi-4 with full settings: {str(e)}")
-            print("Trying with minimal configuration...")
-
-            # Fallback with minimum configuration
-            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                device_map="cpu",
-                torch_dtype=torch.float32,
-                trust_remote_code=True,
-                offload_folder="model_offload",
-                low_cpu_mem_usage=True
-            )
-
-        MODEL_CACHE["is_gguf"] = False
-
-    # Special handling for DeepSeek Lite Chat
-    elif model_key == "DeepSeek Lite Chat":
-        MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-        # Load model with optimized memory
-        try:
-            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                device_map="cpu",  # Force CPU
-                torch_dtype=torch.float32,  # Use float32 for CPU
-                low_cpu_mem_usage=True,
-                trust_remote_code=True,
-                max_memory={"cpu": "1.7GiB"}
-            )
-        except Exception as e:
-            print(f"Error loading DeepSeek with full settings: {str(e)}")
-            print("Trying with lightweight approach...")
-
-            # Fallback to lighter approach
-            import torch.nn as nn
-            from transformers import PreTrainedModel
-
-            # Trying to load model with smaller fraction
-            MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                device_map="cpu",
-                torch_dtype=torch.float32,
-                trust_remote_code=True,
-                low_cpu_mem_usage=True
-            )
-
-            MODEL_CACHE["is_gguf"] = False
-
     # Handle standard HF models
     else:
         # Only use quantization if CUDA is available
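With the Phi-4 and DeepSeek Lite branches deleted, every remaining non-GGUF key falls through to the standard Hugging Face branch, which, per the surviving comment, only quantizes when CUDA is available. The sketch below shows that pattern under stated assumptions: the load_standard_hf_model helper, the 4-bit bitsandbytes settings, and the CPU fallback values are illustrative, since the actual settings in app.py lie outside this diff.

# Minimal sketch (not from app.py): quantize only when CUDA is available,
# otherwise load in float32 on CPU.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

def load_standard_hf_model(model_name: str):
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    if torch.cuda.is_available():
        # Illustrative 4-bit quantization config for GPU
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            quantization_config=quant_config,
            trust_remote_code=True,
        )
    else:
        # CPU-only fallback: no quantization, keep memory use low
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="cpu",
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )
    return tokenizer, model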
@@ -296,28 +222,13 @@ def initialize_model_once(model_key):
 def get_fallback_model(current_model):
     """Get appropriate fallback model for problematic models"""
     fallback_map = {
-        "
-        "DeepSeek Lite Chat": "DeepSeek Coder Instruct",
-        "Flan T5 Small": "Llama 2 Chat"
+        "Flan T5 Small": "Llama 2 Chat GGUF"
     }
-    return fallback_map.get(current_model, "Llama 2 Chat")
+    return fallback_map.get(current_model, "Llama 2 Chat GGUF")
 
-# Optimized pipeline for
+# Optimized pipeline for models
 def create_optimized_pipeline(model, tokenizer, model_key):
-    """Optimized pipeline for
+    """Optimized pipeline for models"""
-    if model_key == "Phi-4 Mini Instruct" or model_key == "DeepSeek Lite Chat":
-        # Use minimal parameters
-        pipe = pipeline(
-            "text-generation",
-            model=model,
-            tokenizer=tokenizer,
-            max_new_tokens=128,  # Reduce the number of generated tokens
-            temperature=0.3,
-            top_p=0.9,
-            return_full_text=False,
-        )
-        return HuggingFacePipeline(pipeline=pipe)
-    else:
     # Default pipeline for other models
     pipe = pipeline(
         "text-generation",
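Because the special-case branch for Phi-4 and DeepSeek Lite is gone, create_optimized_pipeline is left with only its default path: build a transformers text-generation pipeline and wrap it in HuggingFacePipeline, which the removed code already did. A minimal sketch of what the simplified function plausibly reduces to follows; the generation parameters and the langchain_community import path are assumptions rather than lines taken from app.py.

# Minimal sketch (not the exact app.py code): the default pipeline path only.
from transformers import pipeline
from langchain_community.llms import HuggingFacePipeline  # assumed import path

def create_optimized_pipeline(model, tokenizer, model_key):
    """Optimized pipeline for models"""
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,      # placeholder token budget
        temperature=0.3,
        top_p=0.9,
        return_full_text=False,  # return only the newly generated text
    )
    return HuggingFacePipeline(pipeline=pipe)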
@@ -428,12 +339,12 @@ def handle_model_loading_error(model_key, session_id):
     # Regular suggestion logic for when fallbacks don't work or aren't applicable
     suggested_models = [
         "DeepSeek Coder Instruct",  # 1.3B model
-        "TinyLlama Chat",  # 1.1B model
+        "TinyLlama Chat GGUF",  # 1.1B model
         "Qwen2.5 Coder Instruct"  # Another option
     ]
 
     # Remove problematic models and current model from suggestions
-    problem_models = ["
+    problem_models = ["Flan T5 Small"]
     suggested_models = [m for m in suggested_models if m not in problem_models and m != model_key]
 
     suggestions = ", ".join(suggested_models[:3])  # Only show top 3 suggestions
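After the commit, the error handler suggests only models that remain in MODEL_CONFIG, filtering out the single remaining problem model and the key that just failed. The snippet below is a self-contained, hypothetical recombination of the fallback map and the suggestion filter; suggest_alternatives and its message wording do not exist in app.py.

# Hypothetical helper (not in app.py) combining fallback choice and suggestions.
FALLBACK_MAP = {"Flan T5 Small": "Llama 2 Chat GGUF"}

def suggest_alternatives(model_key: str) -> str:
    fallback = FALLBACK_MAP.get(model_key, "Llama 2 Chat GGUF")
    suggested = ["DeepSeek Coder Instruct", "TinyLlama Chat GGUF", "Qwen2.5 Coder Instruct"]
    problem_models = ["Flan T5 Small"]
    # Drop problem models and the model that just failed to load
    suggested = [m for m in suggested if m not in problem_models and m != model_key]
    return f"Could not load {model_key}; trying {fallback}. Other options: {', '.join(suggested[:3])}."

print(suggest_alternatives("Mistral Instruct GGUF"))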