ocr_mcp_1

Sleeping

App Files Files Community

vachaspathi committed 28 days ago

Commit

ad08316

verified ·

1 Parent(s): c5b3162

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -17

app.py CHANGED Viewed

@@ -11,6 +11,7 @@ import re
 import logging
 import asyncio
 import gc
 # --- Import OCR Engine & Prompts ---
 try:
@@ -54,7 +55,7 @@ def _normalize_local_path_args(args: Any) -> Any:
         args["file_url"] = f"file://{fp}"
     return args
-# --- Model Loading (Lazy & Light) ---
 def init_local_model():
     global LLM_PIPELINE, TOKENIZER
     if LLM_PIPELINE is not None: return
@@ -64,14 +65,11 @@ def init_local_model():
         logger.info(f"Loading lighter model: {LOCAL_MODEL}...")
         TOKENIZER = AutoTokenizer.from_pretrained(LOCAL_MODEL)
-        # Load model (Standard load is fine for Qwen on CPU)
         model = AutoModelForCausalLM.from_pretrained(
             LOCAL_MODEL,
             device_map="auto",
             torch_dtype="auto"
         )
         LLM_PIPELINE = pipeline("text-generation", model=model, tokenizer=TOKENIZER)
         logger.info("Model loaded.")
     except Exception as e:
@@ -85,13 +83,12 @@ def local_llm_generate(prompt: str, max_tokens: int = 512) -> Dict[str, Any]:
         return {"text": "Model not loaded.", "raw": None}
     try:
-        # Standard generation (Qwen is robust, no cache hacks needed)
         out = LLM_PIPELINE(
             prompt,
             max_new_tokens=max_tokens,
             return_full_text=False,
-            do_sample=False, # Deterministic for tools
-            temperature=0.0
         )
         text = out[0]["generated_text"] if out else ""
         return {"text": text, "raw": out}
@@ -114,7 +111,6 @@ def create_record(module_name: str, record_data: dict) -> str:
     if not h: return "Auth Failed"
     r = requests.post(f"{API_BASE}/{module_name}", headers=h, json={"data": [record_data]})
     if r.status_code in (200, 201):
-        # Extract ID for downstream use
         try:
             d = r.json().get("data", [{}])[0].get("details", {})
             return json.dumps({"status": "success", "id": d.get("id"), "response": r.json()})
@@ -132,7 +128,9 @@ def create_invoice(data: dict) -> str:
 @mcp.tool()
 def process_document(file_path: str, target_module: Optional[str] = "Contacts") -> dict:
-    if not os.path.exists(file_path): return {"error": "File not found"}
     # 1. OCR
     raw_text = extract_text_from_file(file_path)
@@ -154,11 +152,9 @@ def parse_and_execute(model_text: str, history: list) -> str:
     payload = extract_json_safely(model_text)
     if not payload: return "No valid tool call found."
-    # Normalize
     cmds = [payload] if isinstance(payload, dict) else payload
     results = []
-    # Context State
     last_contact_id = None
     for cmd in cmds:
@@ -169,7 +165,6 @@ def parse_and_execute(model_text: str, history: list) -> str:
         if tool == "create_record":
             res = create_record(args.get("module", "Contacts"), args)
             results.append(f"Record: {res}")
-            # Try capture ID
             try:
                 rj = json.loads(res)
                 if isinstance(rj, dict) and "id" in rj:
@@ -177,11 +172,9 @@ def parse_and_execute(model_text: str, history: list) -> str:
             except: pass
         elif tool == "create_invoice":
-            # Auto-fill contact_id if we just created one
             if not args.get("customer_id") and last_contact_id:
                 args["customer_id"] = last_contact_id
-            # Map Items
             items = []
             for it in args.get("line_items", []):
                 items.append({
@@ -197,6 +190,7 @@ def parse_and_execute(model_text: str, history: list) -> str:
             results.append(f"Invoice: {res}")
         elif tool == "process_document":
             res = process_document(args.get("file_path"))
             results.append(f"Processed: {res}")
@@ -204,9 +198,11 @@ def parse_and_execute(model_text: str, history: list) -> str:
 # --- Chat Core ---
 def chat_logic(message: str, file_path: str, history: list) -> str:
-    # 1. Ingest File
     file_context = ""
     if file_path:
         doc = process_document(file_path)
         if doc.get("status") == "success":
             file_context = json.dumps(doc["extracted_data"])
@@ -214,12 +210,14 @@ def chat_logic(message: str, file_path: str, history: list) -> str:
         else:
             return f"OCR Failed: {doc}"
-    # 2. Decision
     hist_txt = "\n".join([f"U: {h[0]}\nA: {h[1]}" for h in history])
     prompt = get_agent_prompt(hist_txt, file_context, message)
     # 3. Gen & Execute
     gen = local_llm_generate(prompt, max_tokens=200)
     tool_data = extract_json_safely(gen["text"])
     if tool_data:
@@ -233,6 +231,9 @@ def chat_handler(msg, hist):
     files = msg.get("files", [])
     path = files[0] if files else None
     # Direct path bypass for debugging
     if not path and txt.startswith("/mnt/data"):
         return str(process_document(txt))
@@ -241,6 +242,5 @@ def chat_handler(msg, hist):
 if __name__ == "__main__":
     gc.collect()
-    # Lazy init will happen on first request, saving startup memory
     demo = gr.ChatInterface(fn=chat_handler, multimodal=True)
     demo.launch(server_name="0.0.0.0", server_port=7860)

 import logging
 import asyncio
 import gc
+import shutil
 # --- Import OCR Engine & Prompts ---
 try:
         args["file_url"] = f"file://{fp}"
     return args
+# --- Model Loading ---
 def init_local_model():
     global LLM_PIPELINE, TOKENIZER
     if LLM_PIPELINE is not None: return
         logger.info(f"Loading lighter model: {LOCAL_MODEL}...")
         TOKENIZER = AutoTokenizer.from_pretrained(LOCAL_MODEL)
         model = AutoModelForCausalLM.from_pretrained(
             LOCAL_MODEL,
             device_map="auto",
             torch_dtype="auto"
         )
         LLM_PIPELINE = pipeline("text-generation", model=model, tokenizer=TOKENIZER)
         logger.info("Model loaded.")
     except Exception as e:
         return {"text": "Model not loaded.", "raw": None}
     try:
+        # FIX: Removed invalid flags 'temperature', 'top_p', etc. when do_sample is False
         out = LLM_PIPELINE(
             prompt,
             max_new_tokens=max_tokens,
             return_full_text=False,
+            do_sample=False  # Deterministic
         )
         text = out[0]["generated_text"] if out else ""
         return {"text": text, "raw": out}
     if not h: return "Auth Failed"
     r = requests.post(f"{API_BASE}/{module_name}", headers=h, json={"data": [record_data]})
     if r.status_code in (200, 201):
         try:
             d = r.json().get("data", [{}])[0].get("details", {})
             return json.dumps({"status": "success", "id": d.get("id"), "response": r.json()})
 @mcp.tool()
 def process_document(file_path: str, target_module: Optional[str] = "Contacts") -> dict:
+    if not os.path.exists(file_path):
+        logger.error(f"process_document: File not found at {file_path}")
+        return {"error": f"File not found at path: {file_path}"}
     # 1. OCR
     raw_text = extract_text_from_file(file_path)
     payload = extract_json_safely(model_text)
     if not payload: return "No valid tool call found."
     cmds = [payload] if isinstance(payload, dict) else payload
     results = []
     last_contact_id = None
     for cmd in cmds:
         if tool == "create_record":
             res = create_record(args.get("module", "Contacts"), args)
             results.append(f"Record: {res}")
             try:
                 rj = json.loads(res)
                 if isinstance(rj, dict) and "id" in rj:
             except: pass
         elif tool == "create_invoice":
             if not args.get("customer_id") and last_contact_id:
                 args["customer_id"] = last_contact_id
             items = []
             for it in args.get("line_items", []):
                 items.append({
             results.append(f"Invoice: {res}")
         elif tool == "process_document":
+            # NOTE: Prompts try to prevent this, but if it happens, we rely on args being correct
             res = process_document(args.get("file_path"))
             results.append(f"Processed: {res}")
 # --- Chat Core ---
 def chat_logic(message: str, file_path: str, history: list) -> str:
+    # 1. Ingest File IMMEDIATELY
     file_context = ""
     if file_path:
+        logger.info(f"Ingesting file from path: {file_path}")
         doc = process_document(file_path)
         if doc.get("status") == "success":
             file_context = json.dumps(doc["extracted_data"])
         else:
             return f"OCR Failed: {doc}"
+    # 2. Decision Prompt (With context injected)
     hist_txt = "\n".join([f"U: {h[0]}\nA: {h[1]}" for h in history])
     prompt = get_agent_prompt(hist_txt, file_context, message)
     # 3. Gen & Execute
     gen = local_llm_generate(prompt, max_tokens=200)
+    logger.info(f"LLM Decision: {gen['text']}")
     tool_data = extract_json_safely(gen["text"])
     if tool_data:
     files = msg.get("files", [])
     path = files[0] if files else None
+    if path:
+        logger.info(f"UI received file: {path}")
     # Direct path bypass for debugging
     if not path and txt.startswith("/mnt/data"):
         return str(process_document(txt))
 if __name__ == "__main__":
     gc.collect()
     demo = gr.ChatInterface(fn=chat_handler, multimodal=True)
     demo.launch(server_name="0.0.0.0", server_port=7860)