LightOnOCR

Paused

App Files Files Community

IFMedTechdemo committed 19 days ago

Commit

aff30bc

verified ·

1 Parent(s): ad78c1f

Update app.py

Browse files

Files changed (1) hide show

app.py +335 -87

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
-#################################################################################################
 import spaces
 import gradio as gr
@@ -7,86 +8,109 @@ import numpy as np
 import cv2
 import re
-import re
 def extract_medication_lines(text):
     """
-    Extracts medication/drug lines from text using flexible regex.
-    Supports tablet, capsule, syrup, drops, injection, ointment, cream, gel, patch, solution, etc.
-    Matches dose like '1/2/10/250/500 mg/ml/mcg/g/kg' or concentration '1%/2%/0.2%/0.5%/10%' w/w, w/v, v/v.
     """
     form_pattern = r"(TAB(L?ET)?|CAP(SULE)?|SYRUP|SYP|DROP(S)?|INJ(CTION)?|OINTMENT|CREAM|GEL|PATCH|SOL(UTION)?|ORAL)"
-    # Drug name: starts with a word (alphanumeric, maybe a hyphen), up to 4 words (spaces, hyphens or slash)
     name_pattern = r"([A-Z0-9\-/]+(?:\s+[A-Z0-9\-/]+){0,4})"
-    # Dose: e.g., 250mg, 10ml, 0.5%, 10 mcg, 150mcg, etc. and concentration/w/w/w/v/etc.
     dose_pattern = r"(\d{1,4}\s*(mg|ml|mcg|g|kg|units|IU)|\d{1,2}\s*%(\s*w\/w|\s*w\/v|\s*v\/v)?)"
-    # concentration can appear for creams/gels: e.g. "1% w/w", "2%"
-    # Main pattern: will attempt to capture form anywhere, then name, then dose/concentration
     main_pattern = (
-        r"(?:" + form_pattern + r"\s+)?" +          # Form prefix optional
-        name_pattern + r"\s*" +
-        r"(?:" + form_pattern + r"\s*)?" +          # Form mid/suffix optional
-        r"(?:" + dose_pattern + r")"                # Dose/concentration required
     )
     med_regex = re.compile(main_pattern, re.IGNORECASE)
     meds = []
     for line in text.split('\n'):
         line_stripped = line.strip()
         match = med_regex.search(line_stripped)
         if match:
-            meds.append(line_stripped)
     return '\n'.join(meds)
-def preprocess_image_for_ocr(image):
-    image_rgb = image.convert("RGB")
-    img_np = np.array(image_rgb)
-    gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
-    adaptive_threshold = cv2.adaptiveThreshold(
-        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 85,35,
     )
-    preprocessed_pil = Image.fromarray(adaptive_threshold)
-    return preprocessed_pil
 @spaces.GPU
-def extract_text_from_image(image, temperature=0.2, use_ner=False):
-    # Import and load within GPU context!
     import torch
-    from transformers import (
-        LightOnOCRForConditionalGeneration,
-        LightOnOCRProcessor,
-        AutoTokenizer, AutoModelForTokenClassification, pipeline,
-    )
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    attn_implementation = "sdpa" if device == "cuda" else "eager"
     dtype = torch.bfloat16 if device == "cuda" else torch.float32
     ocr_model = LightOnOCRForConditionalGeneration.from_pretrained(
         "lightonai/LightOnOCR-1B-1025",
-        attn_implementation=attn_implementation,
         torch_dtype=dtype,
         trust_remote_code=True,
     ).to(device).eval()
     processor = LightOnOCRProcessor.from_pretrained(
         "lightonai/LightOnOCR-1B-1025",
         trust_remote_code=True,
     )
-    # NER only if requested
-    if use_ner:
-        ner_tokenizer = AutoTokenizer.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
-        ner_model = AutoModelForTokenClassification.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
-        ner_pipeline = pipeline(
-            "ner", model=ner_model, tokenizer=ner_tokenizer, aggregation_strategy="simple"
-        )
     processed_img = preprocess_image_for_ocr(image)
     chat = [
         {
             "role": "user",
@@ -122,38 +146,27 @@ def extract_text_from_image(image, temperature=0.2, use_ner=False):
         outputs = ocr_model.generate(**generation_kwargs)
     output_text = processor.decode(outputs[0], skip_special_tokens=True)
-    cleaned_text = output_text.strip()
-    # Extract medicines
-    if use_ner:
-        entities = ner_pipeline(cleaned_text)
-        meds = []
-        for ent in entities:
-            if ent["entity_group"] == "treatment":
-                word = ent["word"]
-                if word.startswith("##") and meds:
-                    meds[-1] += word[2:]
-                else:
-                    meds.append(word)
-        result_meds = ", ".join(set(meds)) if meds else "None detected"
-    else:
-        result_meds = extract_medication_lines(cleaned_text) or "None detected"
-    yield result_meds, processed_img  # Only medicines and processed image
-def process_input(file_input, temperature, page_num, extraction_mode):
     if file_input is None:
-        yield "Please upload an image or PDF first.", None
         return
-    image_to_process = Image.open(file_input) if not str(file_input).lower().endswith(".pdf") else None  # simplify to image only
-    use_ner = extraction_mode == "Clinical NER"
-    for meds_out, processed_img in extract_text_from_image(image_to_process, temperature, use_ner):
-        yield meds_out, processed_img
 with gr.Blocks(title="💊 Medicine Extraction", theme=gr.themes.Soft()) as demo:
     file_input = gr.File(
-        label="🖼️ Upload Image",
-        file_types=[".png", ".jpg", ".jpeg"],
         type="filepath"
     )
     temperature = gr.Slider(
@@ -164,39 +177,274 @@ with gr.Blocks(title="💊 Medicine Extraction", theme=gr.themes.Soft()) as demo
         label="Temperature"
     )
     extraction_mode = gr.Radio(
-        choices=["Clinical NER", "Regex"],
         value="Regex",
-        label="Extraction Method",
-        info="Clinical NER uses ML, Regex uses rules"
     )
     medicines_output = gr.Textbox(
-        label="💊 Extracted Medicines/Drugs",
-        placeholder="Medicine/drug names will appear here...",
-        lines=2,
-        max_lines=10,
         interactive=False,
         show_copy_button=True
     )
     rendered_image = gr.Image(
-        label="Processed Image (Adaptive Thresholded for OCR)",
         interactive=False
     )
     submit_btn = gr.Button("Extract Medicines", variant="primary")
-    page_slider = gr.Slider(minimum=1, maximum=20, value=1, step=1, label="Page Number")
     submit_btn.click(
-    fn=process_input,
-    inputs=[file_input, temperature, page_slider, extraction_mode],
-    outputs=[medicines_output, rendered_image]
-)
 if __name__ == "__main__":
     demo.launch()
 ####################################################  running code only NER #######################
 #!/usr/bin/env python3

+######################################   version  4  NER change done   #######################################################
 import spaces
 import gradio as gr
 import cv2
 import re
+def preprocess_image_for_ocr(image):
+    """
+    Binarize a PIL image for OCR using Gaussian adaptive thresholding.
+
+    The input is converted to RGB, reduced to grayscale, thresholded with
+    block size 85 and constant 11, and handed back as a new PIL image.
+    """
+    rgb_pixels = np.array(image.convert("RGB"))
+    grayscale = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2GRAY)
+    binarized = cv2.adaptiveThreshold(
+        grayscale,
+        255,
+        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+        cv2.THRESH_BINARY,
+        85,
+        11,
+    )
+    return Image.fromarray(binarized)
 def extract_medication_lines(text):
     """
+    Extract medication entries as "FORM NAME DOSE", one per matching line.
+
+    Each line of *text* is searched (case-insensitively) for a dosage form
+    (TAB, CAP, SYRUP, ...), then a drug name of one to five tokens, then a
+    dose or concentration at most 50 characters further on without crossing
+    a '|' or ','. Lines lacking any of the three parts are dropped.
+
+    Returns the cleaned entries joined by newlines ('' when nothing matches).
     """
+    # Dosage form. INJ(?:ECTION)? fixes the INJ(CTION)? typo that kept
+    # "INJECTION ..." lines from matching; the optional S lets TABLETS and
+    # CAPSULES reach the separator that must follow the form.
+    form_pattern = r"(?P<form>TAB(?:L?ET)?S?|CAP(?:SULE)?S?|SYRUP|SYP|DROPS?|INJ(?:ECTION)?|OINTMENT|CREAM|GEL|PATCH|SOL(?:UTION)?|ORAL)"
+    # Name: one to five tokens made of letters, digits, hyphen or slash.
+    name_pattern = r"(?P<name>[A-Z0-9\-/]+(?:\s+[A-Z0-9\-/]+){0,4})"
+    # Dose/concentration with optional decimals. The lookbehind forbids
+    # starting in the middle of a number: without it the greedy name kept
+    # "500 mg"'s leading digits and the dose came out as "0 mg".
+    dose_pattern = (
+        r"(?P<dose>(?<![\w.])(?:"
+        r"\d{1,4}(?:\.\d+)?\s*(?:mg|ml|mcg|g|kg|units|IU)"
+        r"|\d{1,2}(?:\.\d+)?\s*%(?:\s*(?:w/w|w/v|v/v))?"
+        r"))"
+    )
+    # Fixed order: form, separator (space, dot or hyphen), name, short gap, dose.
     main_pattern = (
+        r"(?<!\w)" + form_pattern + r"[\s.\-]+"
+        + name_pattern
+        + r"[^|,\n]{0,50}?"
+        + dose_pattern
     )
     med_regex = re.compile(main_pattern, re.IGNORECASE)
     meds = []
     for line in text.split('\n'):
         line_stripped = line.strip()
         match = med_regex.search(line_stripped)
         if match:
+            # Named groups: the numeric indexes used before pointed at the
+            # wrong captures and produced output such as "TAB TAB None".
+            meds.append(
+                f"{match.group('form').upper()} "
+                f"{match.group('name').upper()} "
+                f"{match.group('dose')}"
+            )
     return '\n'.join(meds)
+def clinical_ner_extract(text, use_gpu=False):
+    """
+    Extract medicines via clinical NER, then attach form/dose from the source line.
+
+    Runs the "samrawal/bert-base-uncased_clinical-ner" token-classification
+    pipeline over *text*. For every entity grouped as "treatment", the first
+    line of *text* containing the entity (case-insensitive) is searched for a
+    dosage form and a dose, and "FORM ENTITY DOSE" is emitted with whichever
+    of the form and dose were found.
+
+    Args:
+        text: OCR output, possibly multi-line.
+        use_gpu: Run the pipeline on CUDA device 0 when CUDA is available.
+
+    Returns:
+        Unique entries in first-seen order joined by newlines, or
+        "None detected" when no treatment entity could be tied to a line.
+    """
+    if not text or not text.strip():
+        # Nothing to analyse: return before the model is loaded.
+        return "None detected"
+    # Imported inside the function, after the cheap guard above.
+    import torch
+    from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
+    # Compiled once rather than for every entity/line pair. The lookbehind
+    # stops forms matching inside words ("TAB" in "stable"), and
+    # INJ(ECTION)? fixes the INJ(CTION)? typo.
+    form_regex = re.compile(
+        r"(?<!\w)(TAB(L?ET)?|CAP(SULE)?|SYRUP|SYP|DROP(S)?|INJ(ECTION)?|OINTMENT|CREAM|GEL|PATCH|SOL(UTION)?|ORAL)",
+        re.IGNORECASE,
+    )
+    dose_regex = re.compile(
+        r"(\d{1,4} ?(mg|ml|mcg|g|kg|units|IU)|\d{1,2} ?%( ?w\/w| ?w\/v| ?v\/v)?)",
+        re.IGNORECASE,
+    )
+    device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
+    tokenizer = AutoTokenizer.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
+    model = AutoModelForTokenClassification.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
+    ner_pipeline = pipeline(
+        "ner",
+        model=model,
+        tokenizer=tokenizer,
+        aggregation_strategy="simple",
+        device=0 if device == "cuda" else -1,
     )
+    text_lines = text.split('\n')
+    meds = []
+    for ent in ner_pipeline(text):
+        if ent["entity_group"] != "treatment":
+            continue
+        entity_name = ent["word"].lower()
+        # Only the first line mentioning the entity supplies form/dose context.
+        source_line = next(
+            (ln for ln in text_lines if entity_name in ln.lower()), None
+        )
+        if source_line is None:
+            continue
+        form_match = form_regex.search(source_line)
+        dose_match = dose_regex.search(source_line)
+        tokens = []
+        if form_match:
+            tokens.append(form_match.group(0).upper())
+        tokens.append(ent["word"].upper())
+        if dose_match:
+            tokens.append(dose_match.group(0))
+        meds.append(" ".join(tokens).strip())
+    # dict.fromkeys de-duplicates while keeping first-seen order; set() made
+    # the order of the returned lines vary between runs.
+    return '\n'.join(dict.fromkeys(meds)) if meds else "None detected"
 @spaces.GPU
+def run_ocr_and_extract(image, temperature=0.2, extraction_mode="Regex"):
+    # Load OCR model ONLY in GPU context!
     import torch
+    from transformers import LightOnOCRForConditionalGeneration, LightOnOCRProcessor
     device = "cuda" if torch.cuda.is_available() else "cpu"
+    attn = "sdpa" if device == "cuda" else "eager"
     dtype = torch.bfloat16 if device == "cuda" else torch.float32
     ocr_model = LightOnOCRForConditionalGeneration.from_pretrained(
         "lightonai/LightOnOCR-1B-1025",
+        attn_implementation=attn,
         torch_dtype=dtype,
         trust_remote_code=True,
     ).to(device).eval()
     processor = LightOnOCRProcessor.from_pretrained(
         "lightonai/LightOnOCR-1B-1025",
         trust_remote_code=True,
     )
     processed_img = preprocess_image_for_ocr(image)
     chat = [
         {
             "role": "user",
         outputs = ocr_model.generate(**generation_kwargs)
     output_text = processor.decode(outputs[0], skip_special_tokens=True)
+    raw_text = output_text.strip()
+    # Clean medicines using selected extraction method
+    if extraction_mode == "Clinical NER":
+        meds = clinical_ner_extract(raw_text, use_gpu=(device=="cuda"))
+    else: # Regex
+        meds = extract_medication_lines(raw_text)
+    yield meds, raw_text, processed_img
+def process_input(file_input, temperature, extraction_mode):
+    """
+    Gradio click handler: open the uploaded file and stream extraction results.
+
+    Yields (medicines, raw_ocr_text, processed_image) tuples. When nothing
+    was uploaded, a single prompt tuple is yielded instead.
+    """
     if file_input is None:
+        yield "Please upload an image/PDF.", "", None
         return
+    # The context manager closes the file handle that Image.open keeps open;
+    # previously it was never released.
+    with Image.open(file_input) as image_to_process:
+        yield from run_ocr_and_extract(image_to_process, temperature, extraction_mode)
 with gr.Blocks(title="💊 Medicine Extraction", theme=gr.themes.Soft()) as demo:
     file_input = gr.File(
+        label="Upload Image (or PDF first page for OCR)",
+        file_types=[".png", ".jpg", ".jpeg"], # PDF support: requires render as image first
         type="filepath"
     )
     temperature = gr.Slider(
         label="Temperature"
     )
     extraction_mode = gr.Radio(
+        choices=["Regex", "Clinical NER"],
         value="Regex",
+        label="Extraction Method"
     )
     medicines_output = gr.Textbox(
+        label="💊 Cleaned Medicines",
+        lines=10,
+        interactive=False,
+        show_copy_button=True
+    )
+    raw_output = gr.Textbox(
+        label="Raw OCR Output",
+        lines=10,
         interactive=False,
         show_copy_button=True
     )
     rendered_image = gr.Image(
+        label="Processed Image (Thresholded for OCR)",
         interactive=False
     )
     submit_btn = gr.Button("Extract Medicines", variant="primary")
     submit_btn.click(
+        fn=process_input,
+        inputs=[file_input, temperature, extraction_mode],
+        outputs=[medicines_output, raw_output, rendered_image]
+    )
 if __name__ == "__main__":
     demo.launch()
+#####################################    version  3  NER modification to be done  ############################################################
+# import spaces
+# import gradio as gr
+# from PIL import Image
+# import numpy as np
+# import cv2
+# import re
+# import re
+# def extract_medication_lines(text):
+#     """
+#     Extracts medication/drug lines from text using flexible regex.
+#     Supports tablet, capsule, syrup, drops, injection, ointment, cream, gel, patch, solution, etc.
+#     Matches dose like '1/2/10/250/500 mg/ml/mcg/g/kg' or concentration '1%/2%/0.2%/0.5%/10%' w/w, w/v, v/v.
+#     """
+#     form_pattern = r"(TAB(L?ET)?|CAP(SULE)?|SYRUP|SYP|DROP(S)?|INJ(CTION)?|OINTMENT|CREAM|GEL|PATCH|SOL(UTION)?|ORAL)"
+#     # Drug name: starts with a word (alphanumeric, maybe a hyphen), up to 4 words (spaces, hyphens or slash)
+#     name_pattern = r"([A-Z0-9\-/]+(?:\s+[A-Z0-9\-/]+){0,4})"
+#     # Dose: e.g., 250mg, 10ml, 0.5%, 10 mcg, 150mcg, etc. and concentration/w/w/w/v/etc.
+#     dose_pattern = r"(\d{1,4}\s*(mg|ml|mcg|g|kg|units|IU)|\d{1,2}\s*%(\s*w\/w|\s*w\/v|\s*v\/v)?)"
+#     # concentration can appear for creams/gels: e.g. "1% w/w", "2%"
+#     # Main pattern: will attempt to capture form anywhere, then name, then dose/concentration
+#     main_pattern = (
+#         r"(?:" + form_pattern + r"\s+)?" +          # Form prefix optional
+#         name_pattern + r"\s*" +
+#         r"(?:" + form_pattern + r"\s*)?" +          # Form mid/suffix optional
+#         r"(?:" + dose_pattern + r")"                # Dose/concentration required
+#     )
+#     med_regex = re.compile(main_pattern, re.IGNORECASE)
+#     meds = []
+#     for line in text.split('\n'):
+#         line_stripped = line.strip()
+#         match = med_regex.search(line_stripped)
+#         if match:
+#             meds.append(line_stripped)
+#     return '\n'.join(meds)
+# ###########################    added NER modification to be done ###################################
+# def get_medicine_context(entities, text_lines):
+#     """
+#     For each medicine entity detected by NER, find its form and dose context from its source line.
+#     Returns list of strings like 'TAB ALDACTONE 25MG'.
+#     """
+#     output = []
+#     for ent in entities:
+#         if ent["entity_group"] == "treatment":
+#             # Find line containing the entity's word (robust for multiline output)
+#             for line in text_lines:
+#                 if ent["word"].lower() in line.lower():
+#                     # Search line for context
+#                     match = re.search(r"((TAB(L?ET)?|CAP(SULE)?|SYRUP|SYP|DROP(S)?|INJ(CTION)?|OINTMENT|CREAM|GEL|PATCH|SOL(UTION)?|ORAL).{0,40})", line, re.IGNORECASE)
+#                     dose = re.search(r"\d{1,4}\s*(mg|ml|mcg|g|kg|units|IU)|\d{1,2}\s*%(\s*w\/w|\s*w\/v|\s*v\/v)?", line, re.IGNORECASE)
+#                     info = []
+#                     if match:
+#                         info.append(match.group(0).strip())
+#                     else:
+#                         info.append(ent["word"].strip())
+#                     if dose:
+#                         info.append(dose.group(0).strip())
+#                     output.append(" ".join(info))
+#                     break
+#     return "\n".join(set(output)) if output else "None detected"
+# ################################
+# def preprocess_image_for_ocr(image):
+#     image_rgb = image.convert("RGB")
+#     img_np = np.array(image_rgb)
+#     gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
+#     adaptive_threshold = cv2.adaptiveThreshold(
+#         gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 85,35,
+#     )
+#     preprocessed_pil = Image.fromarray(adaptive_threshold)
+#     return preprocessed_pil
+# @spaces.GPU
+# def extract_text_from_image(image, temperature=0.2, use_ner=False):
+#     # Import and load within GPU context!
+#     import torch
+#     from transformers import (
+#         LightOnOCRForConditionalGeneration,
+#         LightOnOCRProcessor,
+#         AutoTokenizer, AutoModelForTokenClassification, pipeline,
+#     )
+#     device = "cuda" if torch.cuda.is_available() else "cpu"
+#     attn_implementation = "sdpa" if device == "cuda" else "eager"
+#     dtype = torch.bfloat16 if device == "cuda" else torch.float32
+#     ocr_model = LightOnOCRForConditionalGeneration.from_pretrained(
+#         "lightonai/LightOnOCR-1B-1025",
+#         attn_implementation=attn_implementation,
+#         torch_dtype=dtype,
+#         trust_remote_code=True,
+#     ).to(device).eval()
+#     processor = LightOnOCRProcessor.from_pretrained(
+#         "lightonai/LightOnOCR-1B-1025",
+#         trust_remote_code=True,
+#     )
+#     # NER only if requested
+#     if use_ner:
+#         ner_tokenizer = AutoTokenizer.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
+#         ner_model = AutoModelForTokenClassification.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
+#         ner_pipeline = pipeline(
+#             "ner", model=ner_model, tokenizer=ner_tokenizer, aggregation_strategy="simple"
+#         )
+#     processed_img = preprocess_image_for_ocr(image)
+#     chat = [
+#         {
+#             "role": "user",
+#             "content": [
+#                 {"type": "image", "image": processed_img}
+#             ],
+#         }
+#     ]
+#     inputs = processor.apply_chat_template(
+#         chat,
+#         add_generation_prompt=True,
+#         tokenize=True,
+#         return_dict=True,
+#         return_tensors="pt",
+#     )
+#     inputs = {
+#         k: (v.to(device=device, dtype=dtype)
+#             if isinstance(v, torch.Tensor) and v.dtype in [torch.float32, torch.float16, torch.bfloat16]
+#             else v.to(device)
+#             if isinstance(v, torch.Tensor)
+#             else v)
+#         for k, v in inputs.items()
+#     }
+#     generation_kwargs = dict(
+#         **inputs,
+#         max_new_tokens=2048,
+#         temperature=temperature if temperature > 0 else 0.0,
+#         use_cache=True,
+#         do_sample=temperature > 0,
+#     )
+#     with torch.no_grad():
+#         outputs = ocr_model.generate(**generation_kwargs)
+#     output_text = processor.decode(outputs[0], skip_special_tokens=True)
+#     cleaned_text = output_text.strip()
+#     # Extract medicines
+#     if use_ner:
+#         entities = ner_pipeline(cleaned_text)
+#         meds = []
+#         for ent in entities:
+#             if ent["entity_group"] == "treatment":
+#                 word = ent["word"]
+#                 if word.startswith("##") and meds:
+#                     meds[-1] += word[2:]
+#                 else:
+#                     meds.append(word)
+#         result_meds = ", ".join(set(meds)) if meds else "None detected"
+#     else:
+#         result_meds = extract_medication_lines(cleaned_text) or "None detected"
+#     yield result_meds, processed_img  # Only medicines and processed image
+# def process_input(file_input, temperature, page_num, extraction_mode):
+#     if file_input is None:
+#         yield "Please upload an image or PDF first.", None
+#         return
+#     image_to_process = Image.open(file_input) if not str(file_input).lower().endswith(".pdf") else None  # simplify to image only
+#     use_ner = extraction_mode == "Clinical NER"
+#     for meds_out, processed_img in extract_text_from_image(image_to_process, temperature, use_ner):
+#         yield meds_out, processed_img
+# with gr.Blocks(title="💊 Medicine Extraction", theme=gr.themes.Soft()) as demo:
+#     file_input = gr.File(
+#         label="🖼️ Upload Image",
+#         file_types=[".png", ".jpg", ".jpeg"],
+#         type="filepath"
+#     )
+#     temperature = gr.Slider(
+#         minimum=0.0,
+#         maximum=1.0,
+#         value=0.2,
+#         step=0.05,
+#         label="Temperature"
+#     )
+#     extraction_mode = gr.Radio(
+#         choices=["Clinical NER", "Regex"],
+#         value="Regex",
+#         label="Extraction Method",
+#         info="Clinical NER uses ML, Regex uses rules"
+#     )
+#     medicines_output = gr.Textbox(
+#         label="💊 Extracted Medicines/Drugs",
+#         placeholder="Medicine/drug names will appear here...",
+#         lines=2,
+#         max_lines=10,
+#         interactive=False,
+#         show_copy_button=True
+#     )
+#     rendered_image = gr.Image(
+#         label="Processed Image (Adaptive Thresholded for OCR)",
+#         interactive=False
+#     )
+#     submit_btn = gr.Button("Extract Medicines", variant="primary")
+#     page_slider = gr.Slider(minimum=1, maximum=20, value=1, step=1, label="Page Number")
+#     submit_btn.click(
+#     fn=process_input,
+#     inputs=[file_input, temperature, page_slider, extraction_mode],
+#     outputs=[medicines_output, rendered_image]
+# )
+# if __name__ == "__main__":
+#     demo.launch()
 ####################################################  running code only NER #######################
 #!/usr/bin/env python3