Spaces:

hmrizal
/

CSVBot-OpenSource

Sleeping

App Files Files Community

hmrizal committed on May 15

Commit

872a597

verified ·

1 Parent(s): 3dda2b6

fix Phi-4 and DeepSeek Lite Chat by limiting max new tokens and max memory and optimizing pipeline creation

Browse files

Files changed (1) hide show

app.py +78 -18

app.py CHANGED Viewed

@@ -177,25 +177,65 @@ def initialize_model_once(model_key):
                 # For Phi-4 specifically
                 elif "Phi-4" in model_key:
                     MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-                    MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
                         model_name,
                         device_map="cpu",  # Force CPU explicitly
                         torch_dtype=torch.float32,  # Use float32 for CPU
                         low_cpu_mem_usage=True,
-                        trust_remote_code=True
-                    )
                     MODEL_CACHE["is_gguf"] = False
                 # Special handling for DeepSeek Lite Chat
                 elif model_key == "DeepSeek Lite Chat":
                     MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-                    MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
                         model_name,
                         device_map="cpu",  # Force CPU
                         torch_dtype=torch.float32,  # Use float32 for CPU
                         low_cpu_mem_usage=True,
-                        trust_remote_code=True
-                    )
                     MODEL_CACHE["is_gguf"] = False
                 # Handle standard HF models
@@ -262,6 +302,36 @@ def get_fallback_model(current_model):
     }
     return fallback_map.get(current_model, "Llama 2 Chat")
 def create_llm_pipeline(model_key):
     """Create a new pipeline using the specified model with better error handling"""
     try:
@@ -310,18 +380,8 @@ def create_llm_pipeline(model_key):
                 # Remove return_full_text parameter for T5 models
             )
         else:
-            print("Creating causal LM pipeline")
-            pipe = pipeline(
-                "text-generation",
-                model=model,
-                tokenizer=tokenizer,
-                max_new_tokens=256,  # Increased for more comprehensive answers
-                temperature=0.3,
-                top_p=0.9,
-                top_k=30,
-                repetition_penalty=1.2,
-                return_full_text=False,
-            )
         print("Pipeline created successfully")
         return HuggingFacePipeline(pipeline=pipe)

                 # For Phi-4 specifically
                 elif "Phi-4" in model_key:
                     MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+                    # Load model with optimized memory
+                    try:
+                        MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
                         model_name,
                         device_map="cpu",  # Force CPU explicitly
                         torch_dtype=torch.float32,  # Use float32 for CPU
                         low_cpu_mem_usage=True,
+                        trust_remote_code=True,
+                        offload_folder="model_offload",
+                        offload_state_dict=True,
+                        max_memory={"cpu": "1.7GiB"}  # Limit memory usage
+                        )
+                    except Exception as e:
+                        print(f"Error loading Phi-4 with full settings: {str(e)}")
+                        print("Trying with minimal configuration...")
+                        # Fallback with minimum configuration
+                        MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
+                            model_name,
+                            device_map="cpu",
+                            torch_dtype=torch.float32,
+                            trust_remote_code=True,
+                            offload_folder="model_offload",
+                            low_cpu_mem_usage=True
+                        )
                     MODEL_CACHE["is_gguf"] = False
                 # Special handling for DeepSeek Lite Chat
                 elif model_key == "DeepSeek Lite Chat":
                     MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+                    # Load model with optimized memory
+                    try:
+                        MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
                         model_name,
                         device_map="cpu",  # Force CPU
                         torch_dtype=torch.float32,  # Use float32 for CPU
                         low_cpu_mem_usage=True,
+                        trust_remote_code=True,
+                        max_memory={"cpu": "1.7GiB"}
+                        )
+                    except Exception as e:
+                        print(f"Error loading DeepSeek with full settings: {str(e)}")
+                        print("Trying with lightweight approach...")
+                        # Fallback to lighter approach
+                        import torch.nn as nn
+                        from transformers import PreTrainedModel
+                        # Retry the same load without the max_memory cap
+                        MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
+                            model_name,
+                            device_map="cpu",
+                            torch_dtype=torch.float32,
+                            trust_remote_code=True,
+                            low_cpu_mem_usage=True
+                        )
                     MODEL_CACHE["is_gguf"] = False
                 # Handle standard HF models
     }
     return fallback_map.get(current_model, "Llama 2 Chat")
+# Build a text-generation pipeline wrapped in HuggingFacePipeline, with reduced settings for the "problematic" models (Phi-4 Mini Instruct, DeepSeek Lite Chat)
+def create_optimized_pipeline(model, tokenizer, model_key):  # model/tokenizer are passed straight to pipeline(); model_key selects the generation settings
+    """Optimized pipeline for problematic models"""
+    if model_key == "Phi-4 Mini Instruct" or model_key == "DeepSeek Lite Chat":
+        # Reduced settings: half the token budget (128 vs 256) and no top_k or repetition_penalty
+        pipe = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            max_new_tokens=128,        # Reduce the number of generated tokens
+            temperature=0.3,
+            top_p=0.9,
+            return_full_text=False,
+        )
+        return HuggingFacePipeline(pipeline=pipe)
+    else:
+        # Default settings for every other model key (same values as the inline pipeline this commit removes from create_llm_pipeline)
+        pipe = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            max_new_tokens=256,
+            temperature=0.3,
+            top_p=0.9,
+            top_k=30,
+            repetition_penalty=1.2,
+            return_full_text=False,
+        )
+        return HuggingFacePipeline(pipeline=pipe)
 def create_llm_pipeline(model_key):
     """Create a new pipeline using the specified model with better error handling"""
     try:
                 # Remove return_full_text parameter for T5 models
             )
         else:
+            # Use optimized pipeline for problematic model
+            return create_optimized_pipeline(model, tokenizer, model_key)
         print("Pipeline created successfully")
         return HuggingFacePipeline(pipeline=pipe)