Update app.py
app.py CHANGED
@@ -129,6 +129,7 @@ MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
 processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
 model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_V,
+    attn_implementation="flash_attention_2",
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
@@ -138,6 +139,7 @@ MODEL_ID_X = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
 model_x = Qwen2VLForConditionalGeneration.from_pretrained(
     MODEL_ID_X,
+    attn_implementation="flash_attention_2",
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
@@ -147,6 +149,7 @@ MODEL_ID_A = "CohereForAI/aya-vision-8b"
 processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
 model_a = AutoModelForImageTextToText.from_pretrained(
     MODEL_ID_A,
+    attn_implementation="flash_attention_2",
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
@@ -156,6 +159,7 @@ MODEL_ID_W = "allenai/olmOCR-7B-0725"
 processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True)
 model_w = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_W,
+    attn_implementation="flash_attention_2",
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
@@ -165,6 +169,7 @@ MODEL_ID_M = "reducto/RolmOCR"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
 model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_M,
+    attn_implementation="flash_attention_2",
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
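The five hunks above all make the same change: they insert attn_implementation="flash_attention_2" into each from_pretrained call so every vision-language model runs on the FlashAttention-2 kernels. FlashAttention-2 needs the flash-attn package installed and generally an Ampere-or-newer GPU. Below is a minimal sketch of a guarded variant of that pattern; the load_vlm helper and its try/except fallback are illustrative assumptions, not part of this commit.

import torch
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"

def load_vlm(model_id):
    # Hypothetical helper, not in app.py: request FlashAttention-2 and fall
    # back to the default attention backend if flash-attn is missing or the
    # GPU does not support it.
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    try:
        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            model_id,
            attn_implementation="flash_attention_2",
            trust_remote_code=True,
            torch_dtype=torch.float16,
        )
    except (ImportError, ValueError):
        # transformers raises when flash_attention_2 is requested but unavailable.
        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            model_id,
            trust_remote_code=True,
            torch_dtype=torch.float16,
        )
    return processor, model.to(device).eval()

processor_v, model_v = load_vlm("nanonets/Nanonets-OCR2-3B")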
@@ -245,8 +250,8 @@ image_examples = [
 ]
 
 # Create the Gradio Interface
-with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
-    gr.Markdown("# **Multimodal
+with gr.Blocks() as demo:
+    gr.Markdown("# **Multimodal OCR**", elem_id="main-title")
     with gr.Row():
         with gr.Column(scale=2):
             image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
@@ -267,7 +272,7 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
 
         with gr.Column(scale=3):
             gr.Markdown("## Output", elem_id="output-title")
-            output = gr.Textbox(label="Raw Output Stream", interactive=
+            output = gr.Textbox(label="Raw Output Stream", interactive=True, lines=11)
             with gr.Accordion("(Result.md)", open=False):
                 markdown_output = gr.Markdown(label="(Result.Md)")
 
@@ -285,4 +290,4 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     )
 
 if __name__ == "__main__":
-    demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)
+    demo.queue(max_size=50).launch(css=css, theme=steel_blue_theme, mcp_server=True, ssr_mode=False, show_error=True)
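The UI hunks move css and theme out of the gr.Blocks() constructor and onto launch(), make the raw-output Textbox editable with a fixed height (interactive=True, lines=11), and keep the request queue capped at 50 pending jobs. A self-contained sketch of that layout follows, with a stand-in streaming handler: echo_stream and the Run button are assumptions for illustration, and the real app's model callbacks, css, and theme are omitted.

import time
import gradio as gr

def echo_stream(query: str):
    # Hypothetical handler, not in app.py: streams the query back word by
    # word, the way the app streams raw model output into the Textbox.
    buffer = ""
    for word in query.split():
        buffer += word + " "
        time.sleep(0.05)
        yield buffer

with gr.Blocks() as demo:
    gr.Markdown("# **Multimodal OCR**", elem_id="main-title")
    with gr.Row():
        with gr.Column(scale=2):
            image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
            run = gr.Button("Run")
        with gr.Column(scale=3):
            gr.Markdown("## Output", elem_id="output-title")
            output = gr.Textbox(label="Raw Output Stream", interactive=True, lines=11)
            with gr.Accordion("(Result.md)", open=False):
                markdown_output = gr.Markdown()  # filled by the real app's handlers
    run.click(echo_stream, inputs=image_query, outputs=output)

if __name__ == "__main__":
    # Generator handlers stream through the queue; max_size bounds pending jobs.
    demo.queue(max_size=50).launch(show_error=True)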