IFMedTechdemo committed on
Commit c2a331b · verified · 1 Parent(s): 068d019

Update app.py

Files changed (1):
  1. app.py +78 -62

app.py CHANGED
@@ -21,30 +21,40 @@ from transformers import AutoTokenizer, AutoModelForTokenClassification, pipelin
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Choose best attention implementation based on device
 if device == "cuda":
-    attn_implementation = "sdpa"
+    attn_implementation = "sdpa"
     dtype = torch.bfloat16
     print("Using sdpa for GPU")
 else:
-    attn_implementation = "eager"  # Best for CPU
+    attn_implementation = "eager"
     dtype = torch.float32
     print("Using eager attention for CPU")
 
-# Initialize the LightOnOCR model and processor
-print(f"Loading model on {device} with {attn_implementation} attention...")
-model = LightOnOCRForConditionalGeneration.from_pretrained(
+print(f"Loading LightOnOCR model on {device} with {attn_implementation} attention...")
+ocr_model = LightOnOCRForConditionalGeneration.from_pretrained(
     "lightonai/LightOnOCR-1B-1025",
     attn_implementation=attn_implementation,
     torch_dtype=dtype,
-    trust_remote_code=True
+    trust_remote_code=True,
 ).to(device).eval()
 
 processor = LightOnOCRProcessor.from_pretrained(
     "lightonai/LightOnOCR-1B-1025",
-    trust_remote_code=True
+    trust_remote_code=True,
 )
-print("Model loaded successfully!")
+print("LightOnOCR model loaded successfully!")
+
+# -------- Clinical NER models (load ONCE) --------
+print("Loading clinical NER model...")
+ner_tokenizer = AutoTokenizer.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
+ner_model = AutoModelForTokenClassification.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
+ner_pipeline = pipeline(
+    "ner",
+    model=ner_model,
+    tokenizer=ner_tokenizer,
+    aggregation_strategy="simple",
+)
+print("Clinical NER model loaded successfully!")
 
 
 def render_pdf_page(page, max_resolution=1540, scale=2.77):
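
The hunk above moves the clinical NER models to module scope, so they load once at startup instead of on every request. A minimal sketch of what the resulting ner_pipeline returns with aggregation_strategy="simple" (the sample text and printed scores are illustrative; problem/test/treatment are the i2b2-style groups this model is described with):

    # Aggregated NER output: one dict per merged entity span.
    entities = ner_pipeline("Patient was started on metformin for type 2 diabetes.")
    for ent in entities:
        # Keys produced by aggregation_strategy="simple":
        # entity_group, score, word, start, end
        print(ent["entity_group"], ent["word"], round(float(ent["score"]), 3))
    # A "treatment" entity_group is what the extraction step below filters on.
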
@@ -99,34 +109,38 @@ def clean_output_text(text):
 
 @spaces.GPU
 def extract_text_from_image(image, temperature=0.2, stream=False):
-    """Extract text from image using LightOnOCR model."""
+    """Extract text from image using LightOnOCR model, and run clinical NER."""
     # Prepare the chat format
     chat = [
         {
             "role": "user",
             "content": [
-                {"type": "image", "url": image},
+                {"type": "image", "url": image},  # adjust to {"type": "image", "image": image} if LightOnOCR expects that
             ],
         }
     ]
-
-    # Apply chat template and tokenize
+
+    # Tokenize
     inputs = processor.apply_chat_template(
         chat,
         add_generation_prompt=True,
         tokenize=True,
         return_dict=True,
-        return_tensors="pt"
+        return_tensors="pt",
     )
-
-    # Move inputs to device AND convert to the correct dtype
+
+    # Move inputs to device
     inputs = {
-        k: v.to(device=device, dtype=dtype) if isinstance(v, torch.Tensor) and v.dtype in [torch.float32, torch.float16, torch.bfloat16]
-        else v.to(device) if isinstance(v, torch.Tensor)
-        else v
+        k: (
+            v.to(device=device, dtype=dtype)
+            if isinstance(v, torch.Tensor) and v.dtype in [torch.float32, torch.float16, torch.bfloat16]
+            else v.to(device)
+            if isinstance(v, torch.Tensor)
+            else v
+        )
         for k, v in inputs.items()
     }
-
+
     generation_kwargs = dict(
         **inputs,
         max_new_tokens=2048,
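
The rewritten comprehension above casts only floating-point tensors (such as pixel_values) to the model dtype and moves everything else unchanged: integer tensors like input_ids and attention_mask must keep their integer dtype, or embedding lookup would fail. A hypothetical helper expressing the same rule more explicitly (the name and structure are illustrative, not part of the app):

    import torch

    def move_inputs(inputs: dict, device: str, dtype: torch.dtype) -> dict:
        # Float tensors -> target device AND target dtype (fp32/fp16/bf16);
        # other tensors (input_ids, attention_mask) -> device only;
        # non-tensors pass through untouched.
        moved = {}
        for k, v in inputs.items():
            if isinstance(v, torch.Tensor) and v.is_floating_point():
                moved[k] = v.to(device=device, dtype=dtype)
            elif isinstance(v, torch.Tensor):
                moved[k] = v.to(device)
            else:
                moved[k] = v
        return moved
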
@@ -134,49 +148,39 @@ def extract_text_from_image(image, temperature=0.2, stream=False):
         use_cache=True,
         do_sample=temperature > 0,
     )
-
+
     if stream:
-        # Setup streamer for streaming generation
+        # Streaming generation
         streamer = TextIteratorStreamer(
             processor.tokenizer,
             skip_prompt=True,
-            skip_special_tokens=True
+            skip_special_tokens=True,
         )
         generation_kwargs["streamer"] = streamer
-
-        # Run generation in a separate thread
-        thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
+
+        thread = threading.Thread(target=ocr_model.generate, kwargs=generation_kwargs)
         thread.start()
-
-        # Yield chunks as they arrive
+
         full_text = ""
         for new_text in streamer:
             full_text += new_text
-            # Clean the accumulated text
             cleaned_text = clean_output_text(full_text)
-            yield cleaned_text
-
+
+            # For streaming, we’ll only show text progressively,
+            # and keep medications empty (or compute at the end if you prefer).
+            yield cleaned_text, ""
+
         thread.join()
     else:
         # Non-streaming generation
         with torch.no_grad():
-            outputs = model.generate(**generation_kwargs)
-
-        # Decode the output
+            outputs = ocr_model.generate(**generation_kwargs)
+
         output_text = processor.decode(outputs[0], skip_special_tokens=True)
-
-        # Clean the output
         cleaned_text = clean_output_text(output_text)
 
-        ######### clinical NER ##############
-
-        tokenizer = AutoTokenizer.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
-        model = AutoModelForTokenClassification.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
-        ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
-
-
-        #Clinical NER process
-        entities = ner(cleaned_text)
+        # Clinical NER on the full cleaned text
+        entities = ner_pipeline(cleaned_text)
         medications = []
         for ent in entities:
             if ent["entity_group"] == "treatment":
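
The streaming branch in the hunk above uses the standard TextIteratorStreamer pattern: generate() blocks, so it runs in a worker thread while the caller drains the streamer, which is an iterator over decoded text chunks. A condensed sketch of the pattern in isolation (assumes any Hugging Face model/tokenizer pair; the generation arguments are illustrative):

    import threading
    from transformers import TextIteratorStreamer

    def stream_text(model, tokenizer, inputs, max_new_tokens=2048):
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        thread = threading.Thread(
            target=model.generate,
            kwargs={**inputs, "max_new_tokens": max_new_tokens, "streamer": streamer},
        )
        thread.start()
        text = ""
        for chunk in streamer:  # blocks until generate() pushes the next decoded piece
            text += chunk
            yield text          # cumulative text, like full_text in the hunk above
        thread.join()
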
@@ -185,50 +189,62 @@
                     medications[-1] += word[2:]
                 else:
                     medications.append(word)
+
         medications_str = ", ".join(set(medications)) if medications else "None detected"
-
-        yield cleaned_text
-        yield medications_s
+
+        yield cleaned_text, medications_str
+
 
 
 
-
 def process_input(file_input, temperature, page_num, enable_streaming):
     """Process uploaded file (image or PDF) and extract text with optional streaming."""
     if file_input is None:
-        yield "Please upload an image or PDF first.", "", "", None, gr.update()
+        # 6 outputs: [output_text, medications_output, raw_output, page_info, rendered_image, num_pages]
+        yield "Please upload an image or PDF first.", "", "", "", None, 1
         return
-
+
     image_to_process = None
     page_info = ""
-
+    slider_value = page_num
+
     file_path = file_input if isinstance(file_input, str) else file_input.name
-
+
     # Handle PDF files
-    if file_path.lower().endswith('.pdf'):
+    if file_path.lower().endswith(".pdf"):
         try:
             image_to_process, total_pages, actual_page = process_pdf(file_path, int(page_num))
             page_info = f"Processing page {actual_page} of {total_pages}"
+            slider_value = actual_page
         except Exception as e:
-            yield f"Error processing PDF: {str(e)}", "", "", None, gr.update()
+            msg = f"Error processing PDF: {str(e)}"
+            yield msg, "", msg, "", None, slider_value
             return
-    # Handle image files
     else:
+        # Handle image files
         try:
            image_to_process = Image.open(file_path)
            page_info = "Processing image"
         except Exception as e:
-            yield f"Error opening image: {str(e)}", "", "", None, gr.update()
+            msg = f"Error opening image: {str(e)}"
+            yield msg, "", msg, "", None, slider_value
             return
-
+
     try:
         # Extract text using LightOnOCR with optional streaming
-        for extracted_text, medications in extract_text_from_image(image_to_process, temperature, stream=enable_streaming):
-            yield extracted_text, medications, page_info, image_to_process, gr.update()
-
+        for extracted_text, medications in extract_text_from_image(
+            image_to_process, temperature, stream=enable_streaming
+        ):
+            raw_md = extracted_text  # or you can keep a different raw version
+            # 6 outputs: markdown_text, medications, raw_output, page_info, image, slider
+            yield extracted_text, medications, raw_md, page_info, image_to_process, gr.update(
+                value=slider_value
+            )
+
     except Exception as e:
         error_msg = f"Error during text extraction: {str(e)}"
-        yield error_msg, error_msg, page_info, image_to_process, gr.update()
+        # 6 outputs
+        yield error_msg, "", error_msg, page_info, image_to_process, gr.update(value=slider_value)
 
 
 def update_slider(file_input):
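
The medication loop spanning the last two hunks re-joins BERT WordPiece subwords: an aggregated token starting with "##" continues the previous word. A self-contained sketch of that merge (the entity dicts are hand-written for illustration):

    def merge_treatment_words(entities):
        medications = []
        for ent in entities:
            if ent["entity_group"] != "treatment":
                continue
            word = ent["word"]
            if word.startswith("##") and medications:
                medications[-1] += word[2:]  # drop "##", glue onto the previous piece
            else:
                medications.append(word)
        return medications

    pieces = [
        {"entity_group": "treatment", "word": "metf"},
        {"entity_group": "treatment", "word": "##ormin"},
        {"entity_group": "problem", "word": "diabetes"},
    ]
    print(merge_treatment_words(pieces))  # ['metformin']
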
 
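Every yield in the new process_input is a 6-tuple, so the Gradio event that calls it must list six output components in the same order. A hypothetical wiring sketch (component names and widget choices are illustrative, not taken from the app):

    import gradio as gr

    with gr.Blocks() as demo:
        file_input = gr.File(label="Image or PDF")
        temperature = gr.Slider(0.0, 1.0, value=0.2, label="Temperature")
        page_num = gr.Slider(1, 50, value=1, step=1, label="PDF page")
        enable_streaming = gr.Checkbox(label="Stream output")
        run_btn = gr.Button("Extract")

        output_text = gr.Markdown(label="Extracted text")
        medications_output = gr.Textbox(label="Medications")
        raw_output = gr.Textbox(label="Raw output")
        page_info = gr.Textbox(label="Page info")
        rendered_image = gr.Image(label="Rendered page")

        # process_input is a generator: Gradio streams each yielded 6-tuple
        # into these outputs in order; the slider receives gr.update(value=...).
        run_btn.click(
            process_input,
            inputs=[file_input, temperature, page_num, enable_streaming],
            outputs=[output_text, medications_output, raw_output, page_info, rendered_image, page_num],
        )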