Spaces:

rt4u
/

marker

Sleeping

App Files Files Community

Vik Paruchuri committed on Aug 4

Commit

f0d72f0

1 Parent(s): 277c09f

Remove format lines option

Browse files

Files changed (13) hide show

README.md +5 -6
benchmarks/throughput/main.py +0 -5
marker/builders/line.py +17 -48
marker/builders/ocr.py +1 -1
marker/converters/ocr.py +1 -1
marker/processors/table.py +1 -6
marker/scripts/extraction_app.py +0 -6
marker/scripts/streamlit_app.py +0 -6
poetry.lock +4 -4
pyproject.toml +1 -1
tests/builders/test_line_builder.py +0 -24
tests/builders/test_merged_lines.py +0 -18
tests/config/test_config.py +1 -2

README.md CHANGED Viewed

@@ -80,8 +80,8 @@ pip install marker-pdf[full]
 First, some configuration:
 - Your torch device will be automatically detected, but you can override this.  For example, `TORCH_DEVICE=cuda`.
-- Some PDFs, even digital ones, have bad text in them.  Set the `format_lines` flag to ensure the bad lines are fixed and formatted. You can also set `--force_ocr` to force OCR on all lines, or the `strip_existing_ocr` to keep all digital text, and strip out any existing OCR text.
-- If you care about inline math, set `format_lines` to automatically convert inline math to LaTeX.
 ## Interactive App
@@ -106,8 +106,7 @@ Options:
 - `--output_dir PATH`: Directory where output files will be saved. Defaults to the value specified in settings.OUTPUT_DIR.
 - `--paginate_output`: Paginates the output, using `\n\n{PAGE_NUMBER}` followed by `-` * 48, then `\n\n`
 - `--use_llm`: Uses an LLM to improve accuracy.  You will need to configure the LLM backend - see [below](#llm-services).
-- `--format_lines`: Reformat all lines using a local OCR model (inline math, underlines, bold, etc.).  This will give very good quality math output.
-- `--force_ocr`: Force OCR processing on the entire document, even for pages that might contain extractable text.
 - `--block_correction_prompt`: if LLM mode is active, an optional prompt that will be used to correct the output of marker.  This is useful for custom formatting or logic that you want to apply to the output.
 - `--strip_existing_ocr`: Remove all existing OCR text in the document and re-OCR with surya.
 - `--redo_inline_math`: If you want the absolute highest quality inline math conversion, use this along with `--use_llm`.
@@ -232,7 +231,7 @@ marker_single FILENAME --use_llm --force_layout_block Table --converter_cls mark
 ### OCR Only
-If you only want to run OCR, you can also do that through the `OCRConverter`.  Set `--keep_chars` to keep individual characters and bounding boxes.  You can also set `--force_ocr` and `--format_lines` with this converter.
 ```python
 from marker.converters.ocr import OCRConverter
@@ -556,4 +555,4 @@ PDF is a tricky format, so marker will not always work perfectly.  Here are some
 - Very complex layouts, with nested tables and forms, may not work
 - Forms may not be rendered well
-Note: Passing the `--use_llm` and `--format_lines` flags will mostly solve these issues.

 First, some configuration:
 - Your torch device will be automatically detected, but you can override this.  For example, `TORCH_DEVICE=cuda`.
+- Some PDFs, even digital ones, have bad text in them.  Set `--force_ocr` to force OCR on all lines, or set `--strip_existing_ocr` to keep all digital text and strip out any existing OCR text.
+- If you care about inline math, set `--force_ocr` to convert inline math to LaTeX.
 ## Interactive App
 - `--output_dir PATH`: Directory where output files will be saved. Defaults to the value specified in settings.OUTPUT_DIR.
 - `--paginate_output`: Paginates the output, using `\n\n{PAGE_NUMBER}` followed by `-` * 48, then `\n\n`
 - `--use_llm`: Uses an LLM to improve accuracy.  You will need to configure the LLM backend - see [below](#llm-services).
+- `--force_ocr`: Force OCR processing on the entire document, even for pages that might contain extractable text.  This will also format inline math properly.
 - `--block_correction_prompt`: if LLM mode is active, an optional prompt that will be used to correct the output of marker.  This is useful for custom formatting or logic that you want to apply to the output.
 - `--strip_existing_ocr`: Remove all existing OCR text in the document and re-OCR with surya.
 - `--redo_inline_math`: If you want the absolute highest quality inline math conversion, use this along with `--use_llm`.
 ### OCR Only
+If you only want to run OCR, you can also do that through the `OCRConverter`.  Set `--keep_chars` to keep individual characters and bounding boxes.
 ```python
 from marker.converters.ocr import OCRConverter
 - Very complex layouts, with nested tables and forms, may not work
 - Forms may not be rendered well
+Note: Passing the `--use_llm` and `--force_ocr` flags will mostly solve these issues.

benchmarks/throughput/main.py CHANGED Viewed

@@ -25,7 +25,6 @@ def get_next_pdf(ds: datasets.Dataset, i: int):
 def single_batch(
     batch_size: int,
-    format_lines: bool,
     num_threads: int,
     force_ocr: bool,
     quantize: bool,
@@ -83,7 +82,6 @@ def single_batch(
                     artifact_dict=model_dict,
                     config={
                         "disable_tqdm": worker_id > 0,
-                        "format_lines": format_lines,
                         "page_range": page_range,
                         "force_ocr": force_ocr,
                     },
@@ -104,14 +102,12 @@ def single_batch(
 @click.command(help="Benchmark PDF to MD conversion throughput.")
 @click.option("--workers", default=1, help="Number of workers to use.")
 @click.option("--batch_size", default=1, help="Batch size for inference.")
-@click.option("--format_lines", is_flag=True, help="Format lines in the output.")
 @click.option("--force_ocr", is_flag=True, help="Force OCR on all pages.")
 @click.option("--quantize", is_flag=True, help="Use quantized model.")
 @click.option("--compile", is_flag=True, help="Use compiled model.")
 def main(
     workers: int,
     batch_size: int,
-    format_lines: bool,
     force_ocr: bool,
     quantize: bool,
     compile: bool,
@@ -127,7 +123,6 @@ def main(
             executor.submit(
                 single_batch,
                 batch_size,
-                format_lines,
                 cpus_per_worker,
                 force_ocr,
                 quantize,

 def single_batch(
     batch_size: int,
     num_threads: int,
     force_ocr: bool,
     quantize: bool,
                     artifact_dict=model_dict,
                     config={
                         "disable_tqdm": worker_id > 0,
                         "page_range": page_range,
                         "force_ocr": force_ocr,
                     },
 @click.command(help="Benchmark PDF to MD conversion throughput.")
 @click.option("--workers", default=1, help="Number of workers to use.")
 @click.option("--batch_size", default=1, help="Batch size for inference.")
 @click.option("--force_ocr", is_flag=True, help="Force OCR on all pages.")
 @click.option("--quantize", is_flag=True, help="Use quantized model.")
 @click.option("--compile", is_flag=True, help="Use compiled model.")
 def main(
     workers: int,
     batch_size: int,
     force_ocr: bool,
     quantize: bool,
     compile: bool,
             executor.submit(
                 single_batch,
                 batch_size,
                 cpus_per_worker,
                 force_ocr,
                 quantize,

marker/builders/line.py CHANGED Viewed

@@ -42,9 +42,6 @@ class LineBuilder(BaseBuilder):
         bool,
         "Whether to skip OCR on tables.  The TableProcessor will re-OCR them.  Only enable if the TableProcessor is not running.",
     ] = False
-    format_lines: Annotated[
-        bool, "Enable good provider lines to be checked and fixed by the OCR model"
-    ] = False
     layout_coverage_min_lines: Annotated[
         int,
         "The minimum number of PdfProvider lines that must be covered by the layout model",
@@ -173,7 +170,7 @@ class LineBuilder(BaseBuilder):
         if sum(layout_good) > len(document.pages) * self.min_document_ocr_threshold:
             layout_good = [True] * len(document.pages)
-        run_detection = [(not good or self.format_lines) for good in layout_good]
         page_images = [
             page.get_image(highres=False, remove_blocks=self.ocr_remove_blocks)
             for page, good in zip(document.pages, run_detection)
@@ -222,14 +219,16 @@ class LineBuilder(BaseBuilder):
                     )
                 )
-                # If fixing lines, mark every line to be passed to the OCR model
-                for provider_line in merged_provider_lines:
-                    provider_line.line.text_extraction_method = (
-                        "hybrid" if self.format_lines else "pdftext"
-                    )
-                page_lines[document_page.page_id] = (
-                    merged_provider_lines + detected_only_lines
-                )
             else:
                 document_page.text_extraction_method = "surya"
                 boxes_to_ocr[document_page.page_id].extend(detection_boxes)
@@ -403,31 +402,13 @@ class LineBuilder(BaseBuilder):
         page_size,
         page_id,
     ):
-        # If no lines detected, skip the merging
         if not detected_lines:
-            return provider_lines, []
-        # If no provider lines, return all detected text lines
         if not provider_lines:
-            detected_only_lines = []
-            LineClass: Line = get_block_class(BlockTypes.Line)
-            for detected_line in detected_lines:
-                detected_line_polygon = PolygonBox(
-                    polygon=detected_line.polygon
-                ).rescale(image_size, page_size)
-                detected_only_lines.append(
-                    ProviderOutput(
-                        line=LineClass(
-                            polygon=detected_line_polygon,
-                            page_id=page_id,
-                            text_extraction_method="surya",
-                        ),
-                        spans=[],
-                        chars=[],
-                    )
-                )
-            return [], detected_only_lines
         out_provider_lines = []
         horizontal_provider_lines = []
@@ -614,25 +595,13 @@ class LineBuilder(BaseBuilder):
         out_provider_lines = [p for _, p in out_provider_lines]
         # Detected lines that do not overlap with any provider lines should be outputted as-is
-        detected_only_lines = []
-        LineClass: Line = get_block_class(BlockTypes.Line)
         for j in range(len(detected_line_boxes)):
             # Ensure we don't do max on an empty array
             if provider_detected_overlaps[:, j].size == 0:
                 continue
             if np.max(provider_detected_overlaps[:, j]) == 0:
-                detected_line_polygon = PolygonBox.from_bbox(detected_line_boxes[j])
-                detected_only_lines.append(
-                    ProviderOutput(
-                        line=LineClass(
-                            polygon=detected_line_polygon,
-                            page_id=page_id,
-                            text_extraction_method="surya",
-                        ),
-                        spans=[],
-                        chars=[],
-                    )
-                )
         return out_provider_lines, detected_only_lines

         bool,
         "Whether to skip OCR on tables.  The TableProcessor will re-OCR them.  Only enable if the TableProcessor is not running.",
     ] = False
     layout_coverage_min_lines: Annotated[
         int,
         "The minimum number of PdfProvider lines that must be covered by the layout model",
         if sum(layout_good) > len(document.pages) * self.min_document_ocr_threshold:
             layout_good = [True] * len(document.pages)
+        run_detection = [not good for good in layout_good]
         page_images = [
             page.get_image(highres=False, remove_blocks=self.ocr_remove_blocks)
             for page, good in zip(document.pages, run_detection)
                     )
                 )
+                if detected_only_lines:
+                    # If not all the lines are captured, then make sure we OCR the page
+                    document_page.text_extraction_method = "surya"
+                    boxes_to_ocr[document_page.page_id].extend(detection_boxes)
+                else:
+                    # Mark extraction method as pdftext, since all lines are good
+                    for provider_line in merged_provider_lines:
+                        provider_line.line.text_extraction_method = "pdftext"
+                    page_lines[document_page.page_id] = merged_provider_lines
             else:
                 document_page.text_extraction_method = "surya"
                 boxes_to_ocr[document_page.page_id].extend(detection_boxes)
         page_size,
         page_id,
     ):
+        # If no lines detected, skip page OCR
         if not detected_lines:
+            return provider_lines, False
+        # If no provider lines, ensure we OCR the page
         if not provider_lines:
+            return [], True
         out_provider_lines = []
         horizontal_provider_lines = []
         out_provider_lines = [p for _, p in out_provider_lines]
         # Detected lines that do not overlap with any provider lines should be outputted as-is
+        detected_only_lines = False
         for j in range(len(detected_line_boxes)):
             # Ensure we don't do max on an empty array
             if provider_detected_overlaps[:, j].size == 0:
                 continue
             if np.max(provider_detected_overlaps[:, j]) == 0:
+                detected_only_lines = True
         return out_provider_lines, detected_only_lines

marker/builders/ocr.py CHANGED Viewed

@@ -108,7 +108,7 @@ class OcrBuilder(BaseBuilder):
                 block_lines_to_ocr = [
                     block_line
                     for block_line in block_lines
-                    if block_line.text_extraction_method in ["surya", "hybrid"]
                 ]
                 # Set extraction method of OCR-only pages

                 block_lines_to_ocr = [
                     block_line
                     for block_line in block_lines
+                    if block_line.text_extraction_method == "surya"
                 ]
                 # Set extraction method of OCR-only pages

marker/converters/ocr.py CHANGED Viewed

@@ -19,7 +19,7 @@ class OCRConverter(PdfConverter):
         if not self.config:
             self.config = {}
-        self.config["format_lines"] = True
         self.renderer = OCRJSONRenderer
     def build_document(self, filepath: str):

         if not self.config:
             self.config = {}
+        self.config["force_ocr"] = True
         self.renderer = OCRJSONRenderer
     def build_document(self, filepath: str):

marker/processors/table.py CHANGED Viewed

@@ -64,10 +64,6 @@ class TableProcessor(BaseProcessor):
         bool,
         "Whether to disable the tqdm progress bar.",
     ] = False
-    format_lines: Annotated[
-        bool,
-        "Whether to format the lines.",
-    ] = False
     drop_repeated_text: Annotated[bool, "Drop repeated text in OCR results."] = False
     def __init__(
@@ -104,9 +100,8 @@ class TableProcessor(BaseProcessor):
                         "img_size": page.get_image(highres=True).size,
                         "ocr_block": any(
                             [
-                                page.text_extraction_method in ["surya", "hybrid"],
                                 page.ocr_errors_detected,
-                                self.format_lines,
                             ]
                         ),
                     }

         bool,
         "Whether to disable the tqdm progress bar.",
     ] = False
     drop_repeated_text: Annotated[bool, "Drop repeated text in OCR results."] = False
     def __init__(
                         "img_size": page.get_image(highres=True).size,
                         "ocr_block": any(
                             [
+                                page.text_extraction_method in ["surya"],
                                 page.ocr_errors_detected,
                             ]
                         ),
                     }

marker/scripts/extraction_app.py CHANGED Viewed

@@ -192,11 +192,6 @@ strip_existing_ocr = st.sidebar.checkbox(
     help="Strip existing OCR text from the PDF and re-OCR.",
     value=False,
 )
-format_lines = st.sidebar.checkbox(
-    "Format lines",
-    help="Format lines in the document with OCR model",
-    value=False,
-)
 # Check if schema is provided before running
 if run_marker:
@@ -217,7 +212,6 @@ if run_marker:
                 "force_ocr": force_ocr,
                 "use_llm": use_llm,
                 "strip_existing_ocr": strip_existing_ocr,
-                "format_lines": format_lines,
             }
         )

     help="Strip existing OCR text from the PDF and re-OCR.",
     value=False,
 )
 # Check if schema is provided before running
 if run_marker:
                 "force_ocr": force_ocr,
                 "use_llm": use_llm,
                 "strip_existing_ocr": strip_existing_ocr,
             }
         )

marker/scripts/streamlit_app.py CHANGED Viewed

@@ -108,11 +108,6 @@ strip_existing_ocr = st.sidebar.checkbox(
     value=False,
 )
 debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False)
-format_lines = st.sidebar.checkbox(
-    "Format lines",
-    help="Format lines in the document with OCR model",
-    value=False,
-)
 disable_ocr_math = st.sidebar.checkbox(
     "Disable math",
     help="Disable math in OCR output - no inline math",
@@ -137,7 +132,6 @@ with tempfile.TemporaryDirectory() as tmp_dir:
             "output_dir": settings.DEBUG_DATA_FOLDER if debug else None,
             "use_llm": use_llm,
             "strip_existing_ocr": strip_existing_ocr,
-            "format_lines": format_lines,
             "disable_ocr_math": disable_ocr_math,
         }
     )

     value=False,
 )
 debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False)
 disable_ocr_math = st.sidebar.checkbox(
     "Disable math",
     help="Disable math in OCR output - no inline math",
             "output_dir": settings.DEBUG_DATA_FOLDER if debug else None,
             "use_llm": use_llm,
             "strip_existing_ocr": strip_existing_ocr,
             "disable_ocr_math": disable_ocr_math,
         }
     )

poetry.lock CHANGED Viewed

@@ -5390,14 +5390,14 @@ snowflake = ["snowflake-connector-python (>=3.3.0) ; python_version < \"3.12\"",
 [[package]]
 name = "surya-ocr"
-version = "0.15.0"
 description = "OCR, layout, reading order, and table recognition in 90+ languages"
 optional = false
 python-versions = "<4.0,>=3.10"
 groups = ["main"]
 files = [
-    {file = "surya_ocr-0.15.0-py3-none-any.whl", hash = "sha256:d29630d90a9b8d62c53a2852b1825948ee3eb60500f747159dcff285d1e8a50c"},
-    {file = "surya_ocr-0.15.0.tar.gz", hash = "sha256:d9bef35f0607181a5d1f6cd08c2e4c6431d86698305dc50db6ffef4a196fbb34"},
 ]
 [package.dependencies]
@@ -6505,4 +6505,4 @@ full = ["ebooklib", "mammoth", "openpyxl", "python-pptx", "weasyprint"]
 [metadata]
 lock-version = "2.1"
 python-versions = "^3.10"
-content-hash = "33961d3942c2009c7e08dc10610e71997b8709ee8d34a9e6b4ad607b0db59009"

 [[package]]
 name = "surya-ocr"
+version = "0.15.1"
 description = "OCR, layout, reading order, and table recognition in 90+ languages"
 optional = false
 python-versions = "<4.0,>=3.10"
 groups = ["main"]
 files = [
+    {file = "surya_ocr-0.15.1-py3-none-any.whl", hash = "sha256:1551831ec43550e2f5cc6cb7f7c8ef8c4ea3654551e904ee3e1f8d47023b05da"},
+    {file = "surya_ocr-0.15.1.tar.gz", hash = "sha256:ec8b1a5c99bb8b265289ad75c6037af09e2f6894b4104a66faec63733ecdcf6d"},
 ]
 [package.dependencies]
 [metadata]
 lock-version = "2.1"
 python-versions = "^3.10"
+content-hash = "ad384e1af2795ac728e705765d1368aa6556a34d8e3d928abcc0a8098cc3a5dd"

pyproject.toml CHANGED Viewed

@@ -26,7 +26,7 @@ torch = "^2.7.0"
 tqdm = "^4.66.1"
 ftfy = "^6.1.1"
 rapidfuzz = "^3.8.1"
-surya-ocr = "^0.15.0"
 regex = "^2024.4.28"
 pdftext = "~0.6.3"
 markdownify = "^1.1.0"

 tqdm = "^4.66.1"
 ftfy = "^6.1.1"
 rapidfuzz = "^3.8.1"
+surya-ocr = "^0.15.1"
 regex = "^2024.4.28"
 pdftext = "~0.6.3"
 markdownify = "^1.1.0"

tests/builders/test_line_builder.py DELETED Viewed

@@ -1,24 +0,0 @@
-import pytest
-from marker.schema import BlockTypes
-# Page contains provider lines that are longer than detected lines
-# Any bad merging will cause broken final OCR results with format lines
-@pytest.mark.filename("mixed_eng_hindi.pdf")
-@pytest.mark.config({"page_range": [2], "format_lines": True})
-def test_provider_detected_line_merge(pdf_document):
-    page = pdf_document.pages[0]
-    text_lines = page.contained_blocks(pdf_document, (BlockTypes.Line,))
-    # This count includes detected lines merged in with provider lines
-    assert len(text_lines) == 83
-# Page provider lines only contain english, while the hindi is missing
-# format_lines should fill in the missing lines
-@pytest.mark.filename("mixed_eng_hindi.pdf")
-@pytest.mark.config({"page_range": [0], "format_lines": True})
-def test_fill_missing_provider_lines(pdf_document):
-    page = pdf_document.pages[0]
-    raw_text = page.raw_text(pdf_document)
-    assert "प्राधिकार से प्रकाशित" in raw_text
-    assert "खान मंत्रालय" in raw_text

tests/builders/test_merged_lines.py DELETED Viewed

@@ -1,18 +0,0 @@
-import pytest
-from marker.schema import BlockTypes
-@pytest.mark.config({"page_range": [6], "format_lines": True, "disable_ocr": True})
-@pytest.mark.filename("bad_math.pdf")
-def test_keep_ocr(pdf_document):
-    contained_lines = pdf_document.pages[0].contained_blocks(
-        pdf_document, [BlockTypes.Line]
-    )
-    # Check that we grabbed the right text
-    assert "Lemma" in contained_lines[-1].formatted_text(pdf_document)
-    assert "distribution" in contained_lines[-2].formatted_text(pdf_document)
-    # Line 2 comes after line 1
-    assert contained_lines[-1].polygon.bbox[1] > contained_lines[-2].polygon.bbox[3]

tests/config/test_config.py CHANGED Viewed

@@ -67,10 +67,9 @@ def test_config_llm():
 def test_config_force_ocr():
-    kwargs = capture_kwargs(["test", "--force_ocr", "--format_lines"])
     parser = ConfigParser(kwargs)
     config_dict = parser.generate_config_dict()
     # Validate kwarg capturing
     assert config_dict["force_ocr"]
-    assert config_dict["format_lines"]

 def test_config_force_ocr():
+    kwargs = capture_kwargs(["test", "--force_ocr"])
     parser = ConfigParser(kwargs)
     config_dict = parser.generate_config_dict()
     # Validate kwarg capturing
     assert config_dict["force_ocr"]