add reset_model_cache to prevent memory leak, force cpu_only and disable 8-bit quant for Phi-4
app.py CHANGED
@@ -107,20 +107,24 @@ performance_tracker = PerformanceTracker()
 
 def initialize_model_once(model_key):
     with MODEL_CACHE["init_lock"]:
+        try:
+            current_model = MODEL_CACHE["model_name"]
+            if MODEL_CACHE["model"] is None or current_model != model_key:
+                # Clear previous model
+                if MODEL_CACHE["model"] is not None:
+                    del MODEL_CACHE["model"]
+                if MODEL_CACHE["tokenizer"] is not None:
+                    del MODEL_CACHE["tokenizer"]
+
+                # Force garbage collection
+                gc.collect()
                 torch.cuda.empty_cache() if torch.cuda.is_available() else None
+                time.sleep(1)  # Give system time to release memory
 
+            model_info = MODEL_CONFIG[model_key]
+            model_name = model_info["name"]
+            MODEL_CACHE["model_name"] = model_key
 
-        try:
             print(f"Loading model: {model_name}")
 
             # Check if this is a GGUF model
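Note on the hunk above: the new guard only reloads when the requested model differs from the one already cached, and it drops the old weights before loading. A minimal, self-contained sketch of that pattern follows; the cache keys mirror the ones app.py manipulates, but load_weights is a hypothetical stand-in for the Space's actual loader, not its real code.

import gc
import threading
import time

# Stand-in cache with the same keys the diff touches (assumed to mirror app.py's MODEL_CACHE).
MODEL_CACHE = {"model": None, "tokenizer": None, "model_name": None, "init_lock": threading.Lock()}

def load_weights(model_key):
    # Hypothetical loader; the real app calls AutoTokenizer/AutoModelForCausalLM here.
    return f"tokenizer-for-{model_key}", f"model-for-{model_key}"

def get_model(model_key):
    """Return cached weights, reloading only when a different model is requested."""
    with MODEL_CACHE["init_lock"]:
        if MODEL_CACHE["model"] is None or MODEL_CACHE["model_name"] != model_key:
            # Drop references to the previous model so gc can reclaim the memory.
            MODEL_CACHE["model"] = None
            MODEL_CACHE["tokenizer"] = None
            gc.collect()
            time.sleep(1)  # give the OS a moment to release memory, as the diff does

            MODEL_CACHE["tokenizer"], MODEL_CACHE["model"] = load_weights(model_key)
            MODEL_CACHE["model_name"] = model_key
        return MODEL_CACHE["tokenizer"], MODEL_CACHE["model"]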
@@ -169,22 +173,30 @@ def initialize_model_once(model_key):
                     low_cpu_mem_usage=True
                 )
                 MODEL_CACHE["is_gguf"] = False
-                # Reduce memory footprint
-                os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"
-                # For CPU-only environments, load with 8-bit quantization
+
+            # For Phi-4 specifically
+            elif "Phi-4" in model_key:
                 MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
                 MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
                     model_name,
+                    device_map="cpu",  # Force CPU explicitly
+                    torch_dtype=torch.float32,  # Use float32 for CPU
+                    low_cpu_mem_usage=True,
+                    trust_remote_code=True
+                )
+                MODEL_CACHE["is_gguf"] = False
+
+            # Special handling for DeepSeek Lite Chat
+            elif model_key == "DeepSeek Lite Chat":
+                MODEL_CACHE["tokenizer"] = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+                MODEL_CACHE["model"] = AutoModelForCausalLM.from_pretrained(
+                    model_name,
+                    device_map="cpu",  # Force CPU
+                    torch_dtype=torch.float32,  # Use float32 for CPU
                     low_cpu_mem_usage=True,
                     trust_remote_code=True
                 )
-                MODEL_CACHE["is_gguf"] = False
+                MODEL_CACHE["is_gguf"] = False
 
             # Handle standard HF models
             else:
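Note on the hunk above: both new branches load their model pinned to the CPU in float32, replacing the earlier 8-bit path (the removed PYTORCH_CUDA_ALLOC_CONF and quantization lines). A hedged sketch of that call pattern with the transformers API is below; the model id is a placeholder, not whatever MODEL_CONFIG actually maps these keys to.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_cpu_only(model_name: str):
    """Load a causal LM entirely on CPU in float32, without 8-bit quantization."""
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="cpu",           # keep every layer on the CPU
        torch_dtype=torch.float32,  # float32 is the safe dtype for CPU inference
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )
    return tokenizer, model

# Placeholder id; substitute the checkpoint MODEL_CONFIG points at.
# tokenizer, model = load_cpu_only("org/model-checkpoint")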
@@ -219,19 +231,26 @@ def initialize_model_once(model_key):
                 MODEL_CACHE["is_gguf"] = False
 
             print(f"Model {model_name} loaded successfully")
+
+            # Final verification that model loaded correctly
+            if MODEL_CACHE["model"] is None:
+                print(f"WARNING: Model {model_name} appears to be None after loading")
+                # Try to free memory before returning
+                torch.cuda.empty_cache() if torch.cuda.is_available() else None
+                gc.collect()
+
+        except Exception as e:
+            # Reset model cache on error
+            MODEL_CACHE["model"] = None
+            MODEL_CACHE["tokenizer"] = None
+            # Force garbage collection
             gc.collect()
+            torch.cuda.empty_cache() if torch.cuda.is_available() else None
+            import traceback
+            print(f"Error loading model {model_key}: {str(e)}")
+            print(traceback.format_exc())
+            raise RuntimeError(f"Failed to load model {model_key}: {str(e)}")
+
         return MODEL_CACHE["tokenizer"], MODEL_CACHE["model"], MODEL_CACHE.get("is_gguf", False)
 
 def get_fallback_model(current_model):
@@ -312,6 +331,22 @@ def create_llm_pipeline(model_key):
         print(traceback.format_exc())
         raise RuntimeError(f"Failed to create pipeline: {str(e)}")
 
+# add a reset function to clear models between sessions
+def reset_model_cache():
+    """Force clear all model cache"""
+    with MODEL_CACHE["init_lock"]:
+        if MODEL_CACHE["model"] is not None:
+            del MODEL_CACHE["model"]
+        if MODEL_CACHE["tokenizer"] is not None:
+            del MODEL_CACHE["tokenizer"]
+        MODEL_CACHE["model"] = None
+        MODEL_CACHE["tokenizer"] = None
+        MODEL_CACHE["model_name"] = None
+        MODEL_CACHE["is_gguf"] = False
+        gc.collect()
+        torch.cuda.empty_cache() if torch.cuda.is_available() else None
+        time.sleep(1)
+
 # Modified handle_model_loading_error function
 def handle_model_loading_error(model_key, session_id):
     """Handle model loading errors by providing alternative model suggestions or fallbacks"""
@@ -724,6 +759,7 @@ def create_gradio_interface():
 
     # Reset handler - enables model selection again
     def reset_session():
+        reset_model_cache()  # call reset model cache
         return None, False, [], gr.update(interactive=True)
 
     reset_button.click(
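The last hunk cuts off at reset_button.click(, so the wiring itself is not visible on this page. A hedged sketch of how such a reset handler is typically registered in Gradio Blocks is below; the component names and dropdown choices are assumptions for illustration, not the Space's actual variables.

import gradio as gr

with gr.Blocks() as demo:
    model_dropdown = gr.Dropdown(["Phi-4", "DeepSeek Lite Chat"], label="Model")  # assumed choices
    chatbot = gr.Chatbot()
    session_id = gr.State(None)
    model_loaded = gr.State(False)
    reset_button = gr.Button("Reset")

    def reset_session():
        # app.py additionally calls reset_model_cache() here to drop the cached weights.
        return None, False, [], gr.update(interactive=True)

    # The four outputs line up with reset_session's four return values.
    reset_button.click(
        fn=reset_session,
        inputs=[],
        outputs=[session_id, model_loaded, chatbot, model_dropdown],
    )

demo.launch()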