Vik Paruchuri committed on
Commit
f0d72f0
·
1 Parent(s): 277c09f

Remove format lines option

Browse files
README.md CHANGED
@@ -80,8 +80,8 @@ pip install marker-pdf[full]
80
  First, some configuration:
81
 
82
  - Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`.
83
- - Some PDFs, even digital ones, have bad text in them. Set the `format_lines` flag to ensure the bad lines are fixed and formatted. You can also set `--force_ocr` to force OCR on all lines, or the `strip_existing_ocr` to keep all digital text, and strip out any existing OCR text.
84
- - If you care about inline math, set `format_lines` to automatically convert inline math to LaTeX.
85
 
86
  ## Interactive App
87
 
@@ -106,8 +106,7 @@ Options:
106
  - `--output_dir PATH`: Directory where output files will be saved. Defaults to the value specified in settings.OUTPUT_DIR.
107
  - `--paginate_output`: Paginates the output, using `\n\n{PAGE_NUMBER}` followed by `-` * 48, then `\n\n`
108
  - `--use_llm`: Uses an LLM to improve accuracy. You will need to configure the LLM backend - see [below](#llm-services).
109
- - `--format_lines`: Reformat all lines using a local OCR model (inline math, underlines, bold, etc.). This will give very good quality math output.
110
- - `--force_ocr`: Force OCR processing on the entire document, even for pages that might contain extractable text.
111
  - `--block_correction_prompt`: if LLM mode is active, an optional prompt that will be used to correct the output of marker. This is useful for custom formatting or logic that you want to apply to the output.
112
  - `--strip_existing_ocr`: Remove all existing OCR text in the document and re-OCR with surya.
113
  - `--redo_inline_math`: If you want the absolute highest quality inline math conversion, use this along with `--use_llm`.
@@ -232,7 +231,7 @@ marker_single FILENAME --use_llm --force_layout_block Table --converter_cls mark
232
 
233
  ### OCR Only
234
 
235
- If you only want to run OCR, you can also do that through the `OCRConverter`. Set `--keep_chars` to keep individual characters and bounding boxes. You can also set `--force_ocr` and `--format_lines` with this converter.
236
 
237
  ```python
238
  from marker.converters.ocr import OCRConverter
@@ -556,4 +555,4 @@ PDF is a tricky format, so marker will not always work perfectly. Here are some
556
  - Very complex layouts, with nested tables and forms, may not work
557
  - Forms may not be rendered well
558
 
559
- Note: Passing the `--use_llm` and `--format_lines` flags will mostly solve these issues.
 
80
  First, some configuration:
81
 
82
  - Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`.
83
+ - Some PDFs, even digital ones, have bad text in them. Set `--force_ocr` to force OCR on all lines, or `--strip_existing_ocr` to keep all digital text and strip out any existing OCR text.
84
+ - If you care about inline math, set `--force_ocr` to convert inline math to LaTeX.
85
 
86
  ## Interactive App
87
 
 
106
  - `--output_dir PATH`: Directory where output files will be saved. Defaults to the value specified in settings.OUTPUT_DIR.
107
  - `--paginate_output`: Paginates the output, using `\n\n{PAGE_NUMBER}` followed by `-` * 48, then `\n\n`
108
  - `--use_llm`: Uses an LLM to improve accuracy. You will need to configure the LLM backend - see [below](#llm-services).
109
+ - `--force_ocr`: Force OCR processing on the entire document, even for pages that might contain extractable text. This will also format inline math properly.
 
110
  - `--block_correction_prompt`: if LLM mode is active, an optional prompt that will be used to correct the output of marker. This is useful for custom formatting or logic that you want to apply to the output.
111
  - `--strip_existing_ocr`: Remove all existing OCR text in the document and re-OCR with surya.
112
  - `--redo_inline_math`: If you want the absolute highest quality inline math conversion, use this along with `--use_llm`.
 
231
 
232
  ### OCR Only
233
 
234
+ If you only want to run OCR, you can also do that through the `OCRConverter`. Set `--keep_chars` to keep individual characters and bounding boxes.
235
 
236
  ```python
237
  from marker.converters.ocr import OCRConverter
 
555
  - Very complex layouts, with nested tables and forms, may not work
556
  - Forms may not be rendered well
557
 
558
+ Note: Passing the `--use_llm` and `--force_ocr` flags will mostly solve these issues.
benchmarks/throughput/main.py CHANGED
@@ -25,7 +25,6 @@ def get_next_pdf(ds: datasets.Dataset, i: int):
25
 
26
  def single_batch(
27
  batch_size: int,
28
- format_lines: bool,
29
  num_threads: int,
30
  force_ocr: bool,
31
  quantize: bool,
@@ -83,7 +82,6 @@ def single_batch(
83
  artifact_dict=model_dict,
84
  config={
85
  "disable_tqdm": worker_id > 0,
86
- "format_lines": format_lines,
87
  "page_range": page_range,
88
  "force_ocr": force_ocr,
89
  },
@@ -104,14 +102,12 @@ def single_batch(
104
  @click.command(help="Benchmark PDF to MD conversion throughput.")
105
  @click.option("--workers", default=1, help="Number of workers to use.")
106
  @click.option("--batch_size", default=1, help="Batch size for inference.")
107
- @click.option("--format_lines", is_flag=True, help="Format lines in the output.")
108
  @click.option("--force_ocr", is_flag=True, help="Force OCR on all pages.")
109
  @click.option("--quantize", is_flag=True, help="Use quantized model.")
110
  @click.option("--compile", is_flag=True, help="Use compiled model.")
111
  def main(
112
  workers: int,
113
  batch_size: int,
114
- format_lines: bool,
115
  force_ocr: bool,
116
  quantize: bool,
117
  compile: bool,
@@ -127,7 +123,6 @@ def main(
127
  executor.submit(
128
  single_batch,
129
  batch_size,
130
- format_lines,
131
  cpus_per_worker,
132
  force_ocr,
133
  quantize,
 
25
 
26
  def single_batch(
27
  batch_size: int,
 
28
  num_threads: int,
29
  force_ocr: bool,
30
  quantize: bool,
 
82
  artifact_dict=model_dict,
83
  config={
84
  "disable_tqdm": worker_id > 0,
 
85
  "page_range": page_range,
86
  "force_ocr": force_ocr,
87
  },
 
102
  @click.command(help="Benchmark PDF to MD conversion throughput.")
103
  @click.option("--workers", default=1, help="Number of workers to use.")
104
  @click.option("--batch_size", default=1, help="Batch size for inference.")
 
105
  @click.option("--force_ocr", is_flag=True, help="Force OCR on all pages.")
106
  @click.option("--quantize", is_flag=True, help="Use quantized model.")
107
  @click.option("--compile", is_flag=True, help="Use compiled model.")
108
  def main(
109
  workers: int,
110
  batch_size: int,
 
111
  force_ocr: bool,
112
  quantize: bool,
113
  compile: bool,
 
123
  executor.submit(
124
  single_batch,
125
  batch_size,
 
126
  cpus_per_worker,
127
  force_ocr,
128
  quantize,
marker/builders/line.py CHANGED
@@ -42,9 +42,6 @@ class LineBuilder(BaseBuilder):
42
  bool,
43
  "Whether to skip OCR on tables. The TableProcessor will re-OCR them. Only enable if the TableProcessor is not running.",
44
  ] = False
45
- format_lines: Annotated[
46
- bool, "Enable good provider lines to be checked and fixed by the OCR model"
47
- ] = False
48
  layout_coverage_min_lines: Annotated[
49
  int,
50
  "The minimum number of PdfProvider lines that must be covered by the layout model",
@@ -173,7 +170,7 @@ class LineBuilder(BaseBuilder):
173
  if sum(layout_good) > len(document.pages) * self.min_document_ocr_threshold:
174
  layout_good = [True] * len(document.pages)
175
 
176
- run_detection = [(not good or self.format_lines) for good in layout_good]
177
  page_images = [
178
  page.get_image(highres=False, remove_blocks=self.ocr_remove_blocks)
179
  for page, good in zip(document.pages, run_detection)
@@ -222,14 +219,16 @@ class LineBuilder(BaseBuilder):
222
  )
223
  )
224
 
225
- # If fixing lines, mark every line to be passed to the OCR model
226
- for provider_line in merged_provider_lines:
227
- provider_line.line.text_extraction_method = (
228
- "hybrid" if self.format_lines else "pdftext"
229
- )
230
- page_lines[document_page.page_id] = (
231
- merged_provider_lines + detected_only_lines
232
- )
 
 
233
  else:
234
  document_page.text_extraction_method = "surya"
235
  boxes_to_ocr[document_page.page_id].extend(detection_boxes)
@@ -403,31 +402,13 @@ class LineBuilder(BaseBuilder):
403
  page_size,
404
  page_id,
405
  ):
406
- # If no lines detected, skip the merging
407
  if not detected_lines:
408
- return provider_lines, []
409
 
410
- # If no provider lines, return all detected text lines
411
  if not provider_lines:
412
- detected_only_lines = []
413
- LineClass: Line = get_block_class(BlockTypes.Line)
414
- for detected_line in detected_lines:
415
- detected_line_polygon = PolygonBox(
416
- polygon=detected_line.polygon
417
- ).rescale(image_size, page_size)
418
- detected_only_lines.append(
419
- ProviderOutput(
420
- line=LineClass(
421
- polygon=detected_line_polygon,
422
- page_id=page_id,
423
- text_extraction_method="surya",
424
- ),
425
- spans=[],
426
- chars=[],
427
- )
428
- )
429
-
430
- return [], detected_only_lines
431
 
432
  out_provider_lines = []
433
  horizontal_provider_lines = []
@@ -614,25 +595,13 @@ class LineBuilder(BaseBuilder):
614
  out_provider_lines = [p for _, p in out_provider_lines]
615
 
616
  # Detected lines that do not overlap with any provider lines should be output as-is
617
- detected_only_lines = []
618
- LineClass: Line = get_block_class(BlockTypes.Line)
619
  for j in range(len(detected_line_boxes)):
620
  # Ensure we don't do max on an empty array
621
  if provider_detected_overlaps[:, j].size == 0:
622
  continue
623
 
624
  if np.max(provider_detected_overlaps[:, j]) == 0:
625
- detected_line_polygon = PolygonBox.from_bbox(detected_line_boxes[j])
626
- detected_only_lines.append(
627
- ProviderOutput(
628
- line=LineClass(
629
- polygon=detected_line_polygon,
630
- page_id=page_id,
631
- text_extraction_method="surya",
632
- ),
633
- spans=[],
634
- chars=[],
635
- )
636
- )
637
 
638
  return out_provider_lines, detected_only_lines
 
42
  bool,
43
  "Whether to skip OCR on tables. The TableProcessor will re-OCR them. Only enable if the TableProcessor is not running.",
44
  ] = False
 
 
 
45
  layout_coverage_min_lines: Annotated[
46
  int,
47
  "The minimum number of PdfProvider lines that must be covered by the layout model",
 
170
  if sum(layout_good) > len(document.pages) * self.min_document_ocr_threshold:
171
  layout_good = [True] * len(document.pages)
172
 
173
+ run_detection = [not good for good in layout_good]
174
  page_images = [
175
  page.get_image(highres=False, remove_blocks=self.ocr_remove_blocks)
176
  for page, good in zip(document.pages, run_detection)
 
219
  )
220
  )
221
 
222
+ if detected_only_lines:
223
+ # If not all the lines are captured, then make sure we OCR the page
224
+ document_page.text_extraction_method = "surya"
225
+ boxes_to_ocr[document_page.page_id].extend(detection_boxes)
226
+ else:
227
+ # Mark extraction method as pdftext, since all lines are good
228
+ for provider_line in merged_provider_lines:
229
+ provider_line.line.text_extraction_method = "pdftext"
230
+
231
+ page_lines[document_page.page_id] = merged_provider_lines
232
  else:
233
  document_page.text_extraction_method = "surya"
234
  boxes_to_ocr[document_page.page_id].extend(detection_boxes)
 
402
  page_size,
403
  page_id,
404
  ):
405
+ # If no lines detected, skip page OCR
406
  if not detected_lines:
407
+ return provider_lines, False
408
 
409
+ # If no provider lines, ensure we OCR the page
410
  if not provider_lines:
411
+ return [], True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
412
 
413
  out_provider_lines = []
414
  horizontal_provider_lines = []
 
595
  out_provider_lines = [p for _, p in out_provider_lines]
596
 
597
  # Detected lines that do not overlap with any provider lines should be output as-is
598
+ detected_only_lines = False
 
599
  for j in range(len(detected_line_boxes)):
600
  # Ensure we don't do max on an empty array
601
  if provider_detected_overlaps[:, j].size == 0:
602
  continue
603
 
604
  if np.max(provider_detected_overlaps[:, j]) == 0:
605
+ detected_only_lines = True
 
 
 
 
 
 
 
 
 
 
 
606
 
607
  return out_provider_lines, detected_only_lines
marker/builders/ocr.py CHANGED
@@ -108,7 +108,7 @@ class OcrBuilder(BaseBuilder):
108
  block_lines_to_ocr = [
109
  block_line
110
  for block_line in block_lines
111
- if block_line.text_extraction_method in ["surya", "hybrid"]
112
  ]
113
 
114
  # Set extraction method of OCR-only pages
 
108
  block_lines_to_ocr = [
109
  block_line
110
  for block_line in block_lines
111
+ if block_line.text_extraction_method == "surya"
112
  ]
113
 
114
  # Set extraction method of OCR-only pages
marker/converters/ocr.py CHANGED
@@ -19,7 +19,7 @@ class OCRConverter(PdfConverter):
19
  if not self.config:
20
  self.config = {}
21
 
22
- self.config["format_lines"] = True
23
  self.renderer = OCRJSONRenderer
24
 
25
  def build_document(self, filepath: str):
 
19
  if not self.config:
20
  self.config = {}
21
 
22
+ self.config["force_ocr"] = True
23
  self.renderer = OCRJSONRenderer
24
 
25
  def build_document(self, filepath: str):
marker/processors/table.py CHANGED
@@ -64,10 +64,6 @@ class TableProcessor(BaseProcessor):
64
  bool,
65
  "Whether to disable the tqdm progress bar.",
66
  ] = False
67
- format_lines: Annotated[
68
- bool,
69
- "Whether to format the lines.",
70
- ] = False
71
  drop_repeated_text: Annotated[bool, "Drop repeated text in OCR results."] = False
72
 
73
  def __init__(
@@ -104,9 +100,8 @@ class TableProcessor(BaseProcessor):
104
  "img_size": page.get_image(highres=True).size,
105
  "ocr_block": any(
106
  [
107
- page.text_extraction_method in ["surya", "hybrid"],
108
  page.ocr_errors_detected,
109
- self.format_lines,
110
  ]
111
  ),
112
  }
 
64
  bool,
65
  "Whether to disable the tqdm progress bar.",
66
  ] = False
 
 
 
 
67
  drop_repeated_text: Annotated[bool, "Drop repeated text in OCR results."] = False
68
 
69
  def __init__(
 
100
  "img_size": page.get_image(highres=True).size,
101
  "ocr_block": any(
102
  [
103
+ page.text_extraction_method in ["surya"],
104
  page.ocr_errors_detected,
 
105
  ]
106
  ),
107
  }
marker/scripts/extraction_app.py CHANGED
@@ -192,11 +192,6 @@ strip_existing_ocr = st.sidebar.checkbox(
192
  help="Strip existing OCR text from the PDF and re-OCR.",
193
  value=False,
194
  )
195
- format_lines = st.sidebar.checkbox(
196
- "Format lines",
197
- help="Format lines in the document with OCR model",
198
- value=False,
199
- )
200
 
201
  # Check if schema is provided before running
202
  if run_marker:
@@ -217,7 +212,6 @@ if run_marker:
217
  "force_ocr": force_ocr,
218
  "use_llm": use_llm,
219
  "strip_existing_ocr": strip_existing_ocr,
220
- "format_lines": format_lines,
221
  }
222
  )
223
 
 
192
  help="Strip existing OCR text from the PDF and re-OCR.",
193
  value=False,
194
  )
 
 
 
 
 
195
 
196
  # Check if schema is provided before running
197
  if run_marker:
 
212
  "force_ocr": force_ocr,
213
  "use_llm": use_llm,
214
  "strip_existing_ocr": strip_existing_ocr,
 
215
  }
216
  )
217
 
marker/scripts/streamlit_app.py CHANGED
@@ -108,11 +108,6 @@ strip_existing_ocr = st.sidebar.checkbox(
108
  value=False,
109
  )
110
  debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False)
111
- format_lines = st.sidebar.checkbox(
112
- "Format lines",
113
- help="Format lines in the document with OCR model",
114
- value=False,
115
- )
116
  disable_ocr_math = st.sidebar.checkbox(
117
  "Disable math",
118
  help="Disable math in OCR output - no inline math",
@@ -137,7 +132,6 @@ with tempfile.TemporaryDirectory() as tmp_dir:
137
  "output_dir": settings.DEBUG_DATA_FOLDER if debug else None,
138
  "use_llm": use_llm,
139
  "strip_existing_ocr": strip_existing_ocr,
140
- "format_lines": format_lines,
141
  "disable_ocr_math": disable_ocr_math,
142
  }
143
  )
 
108
  value=False,
109
  )
110
  debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False)
 
 
 
 
 
111
  disable_ocr_math = st.sidebar.checkbox(
112
  "Disable math",
113
  help="Disable math in OCR output - no inline math",
 
132
  "output_dir": settings.DEBUG_DATA_FOLDER if debug else None,
133
  "use_llm": use_llm,
134
  "strip_existing_ocr": strip_existing_ocr,
 
135
  "disable_ocr_math": disable_ocr_math,
136
  }
137
  )
poetry.lock CHANGED
@@ -5390,14 +5390,14 @@ snowflake = ["snowflake-connector-python (>=3.3.0) ; python_version < \"3.12\"",
5390
 
5391
  [[package]]
5392
  name = "surya-ocr"
5393
- version = "0.15.0"
5394
  description = "OCR, layout, reading order, and table recognition in 90+ languages"
5395
  optional = false
5396
  python-versions = "<4.0,>=3.10"
5397
  groups = ["main"]
5398
  files = [
5399
- {file = "surya_ocr-0.15.0-py3-none-any.whl", hash = "sha256:d29630d90a9b8d62c53a2852b1825948ee3eb60500f747159dcff285d1e8a50c"},
5400
- {file = "surya_ocr-0.15.0.tar.gz", hash = "sha256:d9bef35f0607181a5d1f6cd08c2e4c6431d86698305dc50db6ffef4a196fbb34"},
5401
  ]
5402
 
5403
  [package.dependencies]
@@ -6505,4 +6505,4 @@ full = ["ebooklib", "mammoth", "openpyxl", "python-pptx", "weasyprint"]
6505
  [metadata]
6506
  lock-version = "2.1"
6507
  python-versions = "^3.10"
6508
- content-hash = "33961d3942c2009c7e08dc10610e71997b8709ee8d34a9e6b4ad607b0db59009"
 
5390
 
5391
  [[package]]
5392
  name = "surya-ocr"
5393
+ version = "0.15.1"
5394
  description = "OCR, layout, reading order, and table recognition in 90+ languages"
5395
  optional = false
5396
  python-versions = "<4.0,>=3.10"
5397
  groups = ["main"]
5398
  files = [
5399
+ {file = "surya_ocr-0.15.1-py3-none-any.whl", hash = "sha256:1551831ec43550e2f5cc6cb7f7c8ef8c4ea3654551e904ee3e1f8d47023b05da"},
5400
+ {file = "surya_ocr-0.15.1.tar.gz", hash = "sha256:ec8b1a5c99bb8b265289ad75c6037af09e2f6894b4104a66faec63733ecdcf6d"},
5401
  ]
5402
 
5403
  [package.dependencies]
 
6505
  [metadata]
6506
  lock-version = "2.1"
6507
  python-versions = "^3.10"
6508
+ content-hash = "ad384e1af2795ac728e705765d1368aa6556a34d8e3d928abcc0a8098cc3a5dd"
pyproject.toml CHANGED
@@ -26,7 +26,7 @@ torch = "^2.7.0"
26
  tqdm = "^4.66.1"
27
  ftfy = "^6.1.1"
28
  rapidfuzz = "^3.8.1"
29
- surya-ocr = "^0.15.0"
30
  regex = "^2024.4.28"
31
  pdftext = "~0.6.3"
32
  markdownify = "^1.1.0"
 
26
  tqdm = "^4.66.1"
27
  ftfy = "^6.1.1"
28
  rapidfuzz = "^3.8.1"
29
+ surya-ocr = "^0.15.1"
30
  regex = "^2024.4.28"
31
  pdftext = "~0.6.3"
32
  markdownify = "^1.1.0"
tests/builders/test_line_builder.py DELETED
@@ -1,24 +0,0 @@
1
- import pytest
2
-
3
- from marker.schema import BlockTypes
4
-
5
- # Page contains provider lines that are longer than detected lines
6
- # Any bad merging will cause broken final OCR results with format lines
7
- @pytest.mark.filename("mixed_eng_hindi.pdf")
8
- @pytest.mark.config({"page_range": [2], "format_lines": True})
9
- def test_provider_detected_line_merge(pdf_document):
10
- page = pdf_document.pages[0]
11
- text_lines = page.contained_blocks(pdf_document, (BlockTypes.Line,))
12
-
13
- # This count includes detected lines merged in with provider lines
14
- assert len(text_lines) == 83
15
-
16
- # Page provider lines only contain english, while the hindi is missing
17
- # format_lines should fill in the missing lines
18
- @pytest.mark.filename("mixed_eng_hindi.pdf")
19
- @pytest.mark.config({"page_range": [0], "format_lines": True})
20
- def test_fill_missing_provider_lines(pdf_document):
21
- page = pdf_document.pages[0]
22
- raw_text = page.raw_text(pdf_document)
23
- assert "प्राधिकार से प्रकाशित" in raw_text
24
- assert "खान मंत्रालय" in raw_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tests/builders/test_merged_lines.py DELETED
@@ -1,18 +0,0 @@
1
- import pytest
2
-
3
- from marker.schema import BlockTypes
4
-
5
-
6
- @pytest.mark.config({"page_range": [6], "format_lines": True, "disable_ocr": True})
7
- @pytest.mark.filename("bad_math.pdf")
8
- def test_keep_ocr(pdf_document):
9
- contained_lines = pdf_document.pages[0].contained_blocks(
10
- pdf_document, [BlockTypes.Line]
11
- )
12
-
13
- # Check that we grabbed the right text
14
- assert "Lemma" in contained_lines[-1].formatted_text(pdf_document)
15
- assert "distribution" in contained_lines[-2].formatted_text(pdf_document)
16
-
17
- # Line 2 comes after line 1
18
- assert contained_lines[-1].polygon.bbox[1] > contained_lines[-2].polygon.bbox[3]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tests/config/test_config.py CHANGED
@@ -67,10 +67,9 @@ def test_config_llm():
67
 
68
 
69
  def test_config_force_ocr():
70
- kwargs = capture_kwargs(["test", "--force_ocr", "--format_lines"])
71
  parser = ConfigParser(kwargs)
72
  config_dict = parser.generate_config_dict()
73
 
74
  # Validate kwarg capturing
75
  assert config_dict["force_ocr"]
76
- assert config_dict["format_lines"]
 
67
 
68
 
69
  def test_config_force_ocr():
70
+ kwargs = capture_kwargs(["test", "--force_ocr"])
71
  parser = ConfigParser(kwargs)
72
  config_dict = parser.generate_config_dict()
73
 
74
  # Validate kwarg capturing
75
  assert config_dict["force_ocr"]