Vik Paruchuri
committed on
Commit
·
f0d72f0
1
Parent(s):
277c09f
Remove format lines option
Browse files- README.md +5 -6
- benchmarks/throughput/main.py +0 -5
- marker/builders/line.py +17 -48
- marker/builders/ocr.py +1 -1
- marker/converters/ocr.py +1 -1
- marker/processors/table.py +1 -6
- marker/scripts/extraction_app.py +0 -6
- marker/scripts/streamlit_app.py +0 -6
- poetry.lock +4 -4
- pyproject.toml +1 -1
- tests/builders/test_line_builder.py +0 -24
- tests/builders/test_merged_lines.py +0 -18
- tests/config/test_config.py +1 -2
README.md
CHANGED
|
@@ -80,8 +80,8 @@ pip install marker-pdf[full]
|
|
| 80 |
First, some configuration:
|
| 81 |
|
| 82 |
- Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`.
|
| 83 |
-
- Some PDFs, even digital ones, have bad text in them. Set
|
| 84 |
-
- If you care about inline math, set `
|
| 85 |
|
| 86 |
## Interactive App
|
| 87 |
|
|
@@ -106,8 +106,7 @@ Options:
|
|
| 106 |
- `--output_dir PATH`: Directory where output files will be saved. Defaults to the value specified in settings.OUTPUT_DIR.
|
| 107 |
- `--paginate_output`: Paginates the output, using `\n\n{PAGE_NUMBER}` followed by `-` * 48, then `\n\n`
|
| 108 |
- `--use_llm`: Uses an LLM to improve accuracy. You will need to configure the LLM backend - see [below](#llm-services).
|
| 109 |
-
- `--
|
| 110 |
-
- `--force_ocr`: Force OCR processing on the entire document, even for pages that might contain extractable text.
|
| 111 |
- `--block_correction_prompt`: if LLM mode is active, an optional prompt that will be used to correct the output of marker. This is useful for custom formatting or logic that you want to apply to the output.
|
| 112 |
- `--strip_existing_ocr`: Remove all existing OCR text in the document and re-OCR with surya.
|
| 113 |
- `--redo_inline_math`: If you want the absolute highest quality inline math conversion, use this along with `--use_llm`.
|
|
@@ -232,7 +231,7 @@ marker_single FILENAME --use_llm --force_layout_block Table --converter_cls mark
|
|
| 232 |
|
| 233 |
### OCR Only
|
| 234 |
|
| 235 |
-
If you only want to run OCR, you can also do that through the `OCRConverter`. Set `--keep_chars` to keep individual characters and bounding boxes.
|
| 236 |
|
| 237 |
```python
|
| 238 |
from marker.converters.ocr import OCRConverter
|
|
@@ -556,4 +555,4 @@ PDF is a tricky format, so marker will not always work perfectly. Here are some
|
|
| 556 |
- Very complex layouts, with nested tables and forms, may not work
|
| 557 |
- Forms may not be rendered well
|
| 558 |
|
| 559 |
-
Note: Passing the `--use_llm` and `--
|
|
|
|
| 80 |
First, some configuration:
|
| 81 |
|
| 82 |
- Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda`.
|
| 83 |
+
- Some PDFs, even digital ones, have bad text in them. Set `--force_ocr` to force OCR on all lines, or `--strip_existing_ocr` to keep all digital text and strip out any existing OCR text.
|
| 84 |
+
- If you care about inline math, set `--force_ocr` to convert inline math to LaTeX.
|
| 85 |
|
| 86 |
## Interactive App
|
| 87 |
|
|
|
|
| 106 |
- `--output_dir PATH`: Directory where output files will be saved. Defaults to the value specified in settings.OUTPUT_DIR.
|
| 107 |
- `--paginate_output`: Paginates the output, using `\n\n{PAGE_NUMBER}` followed by `-` * 48, then `\n\n`
|
| 108 |
- `--use_llm`: Uses an LLM to improve accuracy. You will need to configure the LLM backend - see [below](#llm-services).
|
| 109 |
+
- `--force_ocr`: Force OCR processing on the entire document, even for pages that might contain extractable text. This will also format inline math properly.
|
|
|
|
| 110 |
- `--block_correction_prompt`: if LLM mode is active, an optional prompt that will be used to correct the output of marker. This is useful for custom formatting or logic that you want to apply to the output.
|
| 111 |
- `--strip_existing_ocr`: Remove all existing OCR text in the document and re-OCR with surya.
|
| 112 |
- `--redo_inline_math`: If you want the absolute highest quality inline math conversion, use this along with `--use_llm`.
|
|
|
|
| 231 |
|
| 232 |
### OCR Only
|
| 233 |
|
| 234 |
+
If you only want to run OCR, you can also do that through the `OCRConverter`. Set `--keep_chars` to keep individual characters and bounding boxes.
|
| 235 |
|
| 236 |
```python
|
| 237 |
from marker.converters.ocr import OCRConverter
|
|
|
|
| 555 |
- Very complex layouts, with nested tables and forms, may not work
|
| 556 |
- Forms may not be rendered well
|
| 557 |
|
| 558 |
+
Note: Passing the `--use_llm` and `--force_ocr` flags will mostly solve these issues.
|
benchmarks/throughput/main.py
CHANGED
|
@@ -25,7 +25,6 @@ def get_next_pdf(ds: datasets.Dataset, i: int):
|
|
| 25 |
|
| 26 |
def single_batch(
|
| 27 |
batch_size: int,
|
| 28 |
-
format_lines: bool,
|
| 29 |
num_threads: int,
|
| 30 |
force_ocr: bool,
|
| 31 |
quantize: bool,
|
|
@@ -83,7 +82,6 @@ def single_batch(
|
|
| 83 |
artifact_dict=model_dict,
|
| 84 |
config={
|
| 85 |
"disable_tqdm": worker_id > 0,
|
| 86 |
-
"format_lines": format_lines,
|
| 87 |
"page_range": page_range,
|
| 88 |
"force_ocr": force_ocr,
|
| 89 |
},
|
|
@@ -104,14 +102,12 @@ def single_batch(
|
|
| 104 |
@click.command(help="Benchmark PDF to MD conversion throughput.")
|
| 105 |
@click.option("--workers", default=1, help="Number of workers to use.")
|
| 106 |
@click.option("--batch_size", default=1, help="Batch size for inference.")
|
| 107 |
-
@click.option("--format_lines", is_flag=True, help="Format lines in the output.")
|
| 108 |
@click.option("--force_ocr", is_flag=True, help="Force OCR on all pages.")
|
| 109 |
@click.option("--quantize", is_flag=True, help="Use quantized model.")
|
| 110 |
@click.option("--compile", is_flag=True, help="Use compiled model.")
|
| 111 |
def main(
|
| 112 |
workers: int,
|
| 113 |
batch_size: int,
|
| 114 |
-
format_lines: bool,
|
| 115 |
force_ocr: bool,
|
| 116 |
quantize: bool,
|
| 117 |
compile: bool,
|
|
@@ -127,7 +123,6 @@ def main(
|
|
| 127 |
executor.submit(
|
| 128 |
single_batch,
|
| 129 |
batch_size,
|
| 130 |
-
format_lines,
|
| 131 |
cpus_per_worker,
|
| 132 |
force_ocr,
|
| 133 |
quantize,
|
|
|
|
| 25 |
|
| 26 |
def single_batch(
|
| 27 |
batch_size: int,
|
|
|
|
| 28 |
num_threads: int,
|
| 29 |
force_ocr: bool,
|
| 30 |
quantize: bool,
|
|
|
|
| 82 |
artifact_dict=model_dict,
|
| 83 |
config={
|
| 84 |
"disable_tqdm": worker_id > 0,
|
|
|
|
| 85 |
"page_range": page_range,
|
| 86 |
"force_ocr": force_ocr,
|
| 87 |
},
|
|
|
|
| 102 |
@click.command(help="Benchmark PDF to MD conversion throughput.")
|
| 103 |
@click.option("--workers", default=1, help="Number of workers to use.")
|
| 104 |
@click.option("--batch_size", default=1, help="Batch size for inference.")
|
|
|
|
| 105 |
@click.option("--force_ocr", is_flag=True, help="Force OCR on all pages.")
|
| 106 |
@click.option("--quantize", is_flag=True, help="Use quantized model.")
|
| 107 |
@click.option("--compile", is_flag=True, help="Use compiled model.")
|
| 108 |
def main(
|
| 109 |
workers: int,
|
| 110 |
batch_size: int,
|
|
|
|
| 111 |
force_ocr: bool,
|
| 112 |
quantize: bool,
|
| 113 |
compile: bool,
|
|
|
|
| 123 |
executor.submit(
|
| 124 |
single_batch,
|
| 125 |
batch_size,
|
|
|
|
| 126 |
cpus_per_worker,
|
| 127 |
force_ocr,
|
| 128 |
quantize,
|
marker/builders/line.py
CHANGED
|
@@ -42,9 +42,6 @@ class LineBuilder(BaseBuilder):
|
|
| 42 |
bool,
|
| 43 |
"Whether to skip OCR on tables. The TableProcessor will re-OCR them. Only enable if the TableProcessor is not running.",
|
| 44 |
] = False
|
| 45 |
-
format_lines: Annotated[
|
| 46 |
-
bool, "Enable good provider lines to be checked and fixed by the OCR model"
|
| 47 |
-
] = False
|
| 48 |
layout_coverage_min_lines: Annotated[
|
| 49 |
int,
|
| 50 |
"The minimum number of PdfProvider lines that must be covered by the layout model",
|
|
@@ -173,7 +170,7 @@ class LineBuilder(BaseBuilder):
|
|
| 173 |
if sum(layout_good) > len(document.pages) * self.min_document_ocr_threshold:
|
| 174 |
layout_good = [True] * len(document.pages)
|
| 175 |
|
| 176 |
-
run_detection = [
|
| 177 |
page_images = [
|
| 178 |
page.get_image(highres=False, remove_blocks=self.ocr_remove_blocks)
|
| 179 |
for page, good in zip(document.pages, run_detection)
|
|
@@ -222,14 +219,16 @@ class LineBuilder(BaseBuilder):
|
|
| 222 |
)
|
| 223 |
)
|
| 224 |
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
|
|
|
|
|
|
| 233 |
else:
|
| 234 |
document_page.text_extraction_method = "surya"
|
| 235 |
boxes_to_ocr[document_page.page_id].extend(detection_boxes)
|
|
@@ -403,31 +402,13 @@ class LineBuilder(BaseBuilder):
|
|
| 403 |
page_size,
|
| 404 |
page_id,
|
| 405 |
):
|
| 406 |
-
# If no lines detected, skip
|
| 407 |
if not detected_lines:
|
| 408 |
-
return provider_lines,
|
| 409 |
|
| 410 |
-
# If no provider lines,
|
| 411 |
if not provider_lines:
|
| 412 |
-
|
| 413 |
-
LineClass: Line = get_block_class(BlockTypes.Line)
|
| 414 |
-
for detected_line in detected_lines:
|
| 415 |
-
detected_line_polygon = PolygonBox(
|
| 416 |
-
polygon=detected_line.polygon
|
| 417 |
-
).rescale(image_size, page_size)
|
| 418 |
-
detected_only_lines.append(
|
| 419 |
-
ProviderOutput(
|
| 420 |
-
line=LineClass(
|
| 421 |
-
polygon=detected_line_polygon,
|
| 422 |
-
page_id=page_id,
|
| 423 |
-
text_extraction_method="surya",
|
| 424 |
-
),
|
| 425 |
-
spans=[],
|
| 426 |
-
chars=[],
|
| 427 |
-
)
|
| 428 |
-
)
|
| 429 |
-
|
| 430 |
-
return [], detected_only_lines
|
| 431 |
|
| 432 |
out_provider_lines = []
|
| 433 |
horizontal_provider_lines = []
|
|
@@ -614,25 +595,13 @@ class LineBuilder(BaseBuilder):
|
|
| 614 |
out_provider_lines = [p for _, p in out_provider_lines]
|
| 615 |
|
| 616 |
# Detected lines that do not overlap with any provider lines should be output as-is
|
| 617 |
-
detected_only_lines =
|
| 618 |
-
LineClass: Line = get_block_class(BlockTypes.Line)
|
| 619 |
for j in range(len(detected_line_boxes)):
|
| 620 |
# Ensure we don't do max on an empty array
|
| 621 |
if provider_detected_overlaps[:, j].size == 0:
|
| 622 |
continue
|
| 623 |
|
| 624 |
if np.max(provider_detected_overlaps[:, j]) == 0:
|
| 625 |
-
|
| 626 |
-
detected_only_lines.append(
|
| 627 |
-
ProviderOutput(
|
| 628 |
-
line=LineClass(
|
| 629 |
-
polygon=detected_line_polygon,
|
| 630 |
-
page_id=page_id,
|
| 631 |
-
text_extraction_method="surya",
|
| 632 |
-
),
|
| 633 |
-
spans=[],
|
| 634 |
-
chars=[],
|
| 635 |
-
)
|
| 636 |
-
)
|
| 637 |
|
| 638 |
return out_provider_lines, detected_only_lines
|
|
|
|
| 42 |
bool,
|
| 43 |
"Whether to skip OCR on tables. The TableProcessor will re-OCR them. Only enable if the TableProcessor is not running.",
|
| 44 |
] = False
|
|
|
|
|
|
|
|
|
|
| 45 |
layout_coverage_min_lines: Annotated[
|
| 46 |
int,
|
| 47 |
"The minimum number of PdfProvider lines that must be covered by the layout model",
|
|
|
|
| 170 |
if sum(layout_good) > len(document.pages) * self.min_document_ocr_threshold:
|
| 171 |
layout_good = [True] * len(document.pages)
|
| 172 |
|
| 173 |
+
run_detection = [not good for good in layout_good]
|
| 174 |
page_images = [
|
| 175 |
page.get_image(highres=False, remove_blocks=self.ocr_remove_blocks)
|
| 176 |
for page, good in zip(document.pages, run_detection)
|
|
|
|
| 219 |
)
|
| 220 |
)
|
| 221 |
|
| 222 |
+
if detected_only_lines:
|
| 223 |
+
# If not all the lines are captured, then make sure we OCR the page
|
| 224 |
+
document_page.text_extraction_method = "surya"
|
| 225 |
+
boxes_to_ocr[document_page.page_id].extend(detection_boxes)
|
| 226 |
+
else:
|
| 227 |
+
# Mark extraction method as pdftext, since all lines are good
|
| 228 |
+
for provider_line in merged_provider_lines:
|
| 229 |
+
provider_line.line.text_extraction_method = "pdftext"
|
| 230 |
+
|
| 231 |
+
page_lines[document_page.page_id] = merged_provider_lines
|
| 232 |
else:
|
| 233 |
document_page.text_extraction_method = "surya"
|
| 234 |
boxes_to_ocr[document_page.page_id].extend(detection_boxes)
|
|
|
|
| 402 |
page_size,
|
| 403 |
page_id,
|
| 404 |
):
|
| 405 |
+
# If no lines detected, skip page OCR
|
| 406 |
if not detected_lines:
|
| 407 |
+
return provider_lines, False
|
| 408 |
|
| 409 |
+
# If no provider lines, ensure we OCR the page
|
| 410 |
if not provider_lines:
|
| 411 |
+
return [], True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
|
| 413 |
out_provider_lines = []
|
| 414 |
horizontal_provider_lines = []
|
|
|
|
| 595 |
out_provider_lines = [p for _, p in out_provider_lines]
|
| 596 |
|
| 597 |
# Detected lines that do not overlap with any provider lines should be output as-is
|
| 598 |
+
detected_only_lines = False
|
|
|
|
| 599 |
for j in range(len(detected_line_boxes)):
|
| 600 |
# Ensure we don't do max on an empty array
|
| 601 |
if provider_detected_overlaps[:, j].size == 0:
|
| 602 |
continue
|
| 603 |
|
| 604 |
if np.max(provider_detected_overlaps[:, j]) == 0:
|
| 605 |
+
detected_only_lines = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 606 |
|
| 607 |
return out_provider_lines, detected_only_lines
|
marker/builders/ocr.py
CHANGED
|
@@ -108,7 +108,7 @@ class OcrBuilder(BaseBuilder):
|
|
| 108 |
block_lines_to_ocr = [
|
| 109 |
block_line
|
| 110 |
for block_line in block_lines
|
| 111 |
-
if block_line.text_extraction_method
|
| 112 |
]
|
| 113 |
|
| 114 |
# Set extraction method of OCR-only pages
|
|
|
|
| 108 |
block_lines_to_ocr = [
|
| 109 |
block_line
|
| 110 |
for block_line in block_lines
|
| 111 |
+
if block_line.text_extraction_method == "surya"
|
| 112 |
]
|
| 113 |
|
| 114 |
# Set extraction method of OCR-only pages
|
marker/converters/ocr.py
CHANGED
|
@@ -19,7 +19,7 @@ class OCRConverter(PdfConverter):
|
|
| 19 |
if not self.config:
|
| 20 |
self.config = {}
|
| 21 |
|
| 22 |
-
self.config["
|
| 23 |
self.renderer = OCRJSONRenderer
|
| 24 |
|
| 25 |
def build_document(self, filepath: str):
|
|
|
|
| 19 |
if not self.config:
|
| 20 |
self.config = {}
|
| 21 |
|
| 22 |
+
self.config["force_ocr"] = True
|
| 23 |
self.renderer = OCRJSONRenderer
|
| 24 |
|
| 25 |
def build_document(self, filepath: str):
|
marker/processors/table.py
CHANGED
|
@@ -64,10 +64,6 @@ class TableProcessor(BaseProcessor):
|
|
| 64 |
bool,
|
| 65 |
"Whether to disable the tqdm progress bar.",
|
| 66 |
] = False
|
| 67 |
-
format_lines: Annotated[
|
| 68 |
-
bool,
|
| 69 |
-
"Whether to format the lines.",
|
| 70 |
-
] = False
|
| 71 |
drop_repeated_text: Annotated[bool, "Drop repeated text in OCR results."] = False
|
| 72 |
|
| 73 |
def __init__(
|
|
@@ -104,9 +100,8 @@ class TableProcessor(BaseProcessor):
|
|
| 104 |
"img_size": page.get_image(highres=True).size,
|
| 105 |
"ocr_block": any(
|
| 106 |
[
|
| 107 |
-
page.text_extraction_method in ["surya"
|
| 108 |
page.ocr_errors_detected,
|
| 109 |
-
self.format_lines,
|
| 110 |
]
|
| 111 |
),
|
| 112 |
}
|
|
|
|
| 64 |
bool,
|
| 65 |
"Whether to disable the tqdm progress bar.",
|
| 66 |
] = False
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
drop_repeated_text: Annotated[bool, "Drop repeated text in OCR results."] = False
|
| 68 |
|
| 69 |
def __init__(
|
|
|
|
| 100 |
"img_size": page.get_image(highres=True).size,
|
| 101 |
"ocr_block": any(
|
| 102 |
[
|
| 103 |
+
page.text_extraction_method in ["surya"],
|
| 104 |
page.ocr_errors_detected,
|
|
|
|
| 105 |
]
|
| 106 |
),
|
| 107 |
}
|
marker/scripts/extraction_app.py
CHANGED
|
@@ -192,11 +192,6 @@ strip_existing_ocr = st.sidebar.checkbox(
|
|
| 192 |
help="Strip existing OCR text from the PDF and re-OCR.",
|
| 193 |
value=False,
|
| 194 |
)
|
| 195 |
-
format_lines = st.sidebar.checkbox(
|
| 196 |
-
"Format lines",
|
| 197 |
-
help="Format lines in the document with OCR model",
|
| 198 |
-
value=False,
|
| 199 |
-
)
|
| 200 |
|
| 201 |
# Check if schema is provided before running
|
| 202 |
if run_marker:
|
|
@@ -217,7 +212,6 @@ if run_marker:
|
|
| 217 |
"force_ocr": force_ocr,
|
| 218 |
"use_llm": use_llm,
|
| 219 |
"strip_existing_ocr": strip_existing_ocr,
|
| 220 |
-
"format_lines": format_lines,
|
| 221 |
}
|
| 222 |
)
|
| 223 |
|
|
|
|
| 192 |
help="Strip existing OCR text from the PDF and re-OCR.",
|
| 193 |
value=False,
|
| 194 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
|
| 196 |
# Check if schema is provided before running
|
| 197 |
if run_marker:
|
|
|
|
| 212 |
"force_ocr": force_ocr,
|
| 213 |
"use_llm": use_llm,
|
| 214 |
"strip_existing_ocr": strip_existing_ocr,
|
|
|
|
| 215 |
}
|
| 216 |
)
|
| 217 |
|
marker/scripts/streamlit_app.py
CHANGED
|
@@ -108,11 +108,6 @@ strip_existing_ocr = st.sidebar.checkbox(
|
|
| 108 |
value=False,
|
| 109 |
)
|
| 110 |
debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False)
|
| 111 |
-
format_lines = st.sidebar.checkbox(
|
| 112 |
-
"Format lines",
|
| 113 |
-
help="Format lines in the document with OCR model",
|
| 114 |
-
value=False,
|
| 115 |
-
)
|
| 116 |
disable_ocr_math = st.sidebar.checkbox(
|
| 117 |
"Disable math",
|
| 118 |
help="Disable math in OCR output - no inline math",
|
|
@@ -137,7 +132,6 @@ with tempfile.TemporaryDirectory() as tmp_dir:
|
|
| 137 |
"output_dir": settings.DEBUG_DATA_FOLDER if debug else None,
|
| 138 |
"use_llm": use_llm,
|
| 139 |
"strip_existing_ocr": strip_existing_ocr,
|
| 140 |
-
"format_lines": format_lines,
|
| 141 |
"disable_ocr_math": disable_ocr_math,
|
| 142 |
}
|
| 143 |
)
|
|
|
|
| 108 |
value=False,
|
| 109 |
)
|
| 110 |
debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
disable_ocr_math = st.sidebar.checkbox(
|
| 112 |
"Disable math",
|
| 113 |
help="Disable math in OCR output - no inline math",
|
|
|
|
| 132 |
"output_dir": settings.DEBUG_DATA_FOLDER if debug else None,
|
| 133 |
"use_llm": use_llm,
|
| 134 |
"strip_existing_ocr": strip_existing_ocr,
|
|
|
|
| 135 |
"disable_ocr_math": disable_ocr_math,
|
| 136 |
}
|
| 137 |
)
|
poetry.lock
CHANGED
|
@@ -5390,14 +5390,14 @@ snowflake = ["snowflake-connector-python (>=3.3.0) ; python_version < \"3.12\"",
|
|
| 5390 |
|
| 5391 |
[[package]]
|
| 5392 |
name = "surya-ocr"
|
| 5393 |
-
version = "0.15.
|
| 5394 |
description = "OCR, layout, reading order, and table recognition in 90+ languages"
|
| 5395 |
optional = false
|
| 5396 |
python-versions = "<4.0,>=3.10"
|
| 5397 |
groups = ["main"]
|
| 5398 |
files = [
|
| 5399 |
-
{file = "surya_ocr-0.15.
|
| 5400 |
-
{file = "surya_ocr-0.15.
|
| 5401 |
]
|
| 5402 |
|
| 5403 |
[package.dependencies]
|
|
@@ -6505,4 +6505,4 @@ full = ["ebooklib", "mammoth", "openpyxl", "python-pptx", "weasyprint"]
|
|
| 6505 |
[metadata]
|
| 6506 |
lock-version = "2.1"
|
| 6507 |
python-versions = "^3.10"
|
| 6508 |
-
content-hash = "
|
|
|
|
| 5390 |
|
| 5391 |
[[package]]
|
| 5392 |
name = "surya-ocr"
|
| 5393 |
+
version = "0.15.1"
|
| 5394 |
description = "OCR, layout, reading order, and table recognition in 90+ languages"
|
| 5395 |
optional = false
|
| 5396 |
python-versions = "<4.0,>=3.10"
|
| 5397 |
groups = ["main"]
|
| 5398 |
files = [
|
| 5399 |
+
{file = "surya_ocr-0.15.1-py3-none-any.whl", hash = "sha256:1551831ec43550e2f5cc6cb7f7c8ef8c4ea3654551e904ee3e1f8d47023b05da"},
|
| 5400 |
+
{file = "surya_ocr-0.15.1.tar.gz", hash = "sha256:ec8b1a5c99bb8b265289ad75c6037af09e2f6894b4104a66faec63733ecdcf6d"},
|
| 5401 |
]
|
| 5402 |
|
| 5403 |
[package.dependencies]
|
|
|
|
| 6505 |
[metadata]
|
| 6506 |
lock-version = "2.1"
|
| 6507 |
python-versions = "^3.10"
|
| 6508 |
+
content-hash = "ad384e1af2795ac728e705765d1368aa6556a34d8e3d928abcc0a8098cc3a5dd"
|
pyproject.toml
CHANGED
|
@@ -26,7 +26,7 @@ torch = "^2.7.0"
|
|
| 26 |
tqdm = "^4.66.1"
|
| 27 |
ftfy = "^6.1.1"
|
| 28 |
rapidfuzz = "^3.8.1"
|
| 29 |
-
surya-ocr = "^0.15.
|
| 30 |
regex = "^2024.4.28"
|
| 31 |
pdftext = "~0.6.3"
|
| 32 |
markdownify = "^1.1.0"
|
|
|
|
| 26 |
tqdm = "^4.66.1"
|
| 27 |
ftfy = "^6.1.1"
|
| 28 |
rapidfuzz = "^3.8.1"
|
| 29 |
+
surya-ocr = "^0.15.1"
|
| 30 |
regex = "^2024.4.28"
|
| 31 |
pdftext = "~0.6.3"
|
| 32 |
markdownify = "^1.1.0"
|
tests/builders/test_line_builder.py
DELETED
|
@@ -1,24 +0,0 @@
|
|
| 1 |
-
import pytest
|
| 2 |
-
|
| 3 |
-
from marker.schema import BlockTypes
|
| 4 |
-
|
| 5 |
-
# Page contains provider lines that are longer than detected lines
|
| 6 |
-
# Any bad merging will cause broken final OCR results with format lines
|
| 7 |
-
@pytest.mark.filename("mixed_eng_hindi.pdf")
|
| 8 |
-
@pytest.mark.config({"page_range": [2], "format_lines": True})
|
| 9 |
-
def test_provider_detected_line_merge(pdf_document):
|
| 10 |
-
page = pdf_document.pages[0]
|
| 11 |
-
text_lines = page.contained_blocks(pdf_document, (BlockTypes.Line,))
|
| 12 |
-
|
| 13 |
-
# This count includes detected lines merged in with provider lines
|
| 14 |
-
assert len(text_lines) == 83
|
| 15 |
-
|
| 16 |
-
# Page provider lines only contain english, while the hindi is missing
|
| 17 |
-
# format_lines should fill in the missing lines
|
| 18 |
-
@pytest.mark.filename("mixed_eng_hindi.pdf")
|
| 19 |
-
@pytest.mark.config({"page_range": [0], "format_lines": True})
|
| 20 |
-
def test_fill_missing_provider_lines(pdf_document):
|
| 21 |
-
page = pdf_document.pages[0]
|
| 22 |
-
raw_text = page.raw_text(pdf_document)
|
| 23 |
-
assert "प्राधिकार से प्रकाशित" in raw_text
|
| 24 |
-
assert "खान मंत्रालय" in raw_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/builders/test_merged_lines.py
DELETED
|
@@ -1,18 +0,0 @@
|
|
| 1 |
-
import pytest
|
| 2 |
-
|
| 3 |
-
from marker.schema import BlockTypes
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
@pytest.mark.config({"page_range": [6], "format_lines": True, "disable_ocr": True})
|
| 7 |
-
@pytest.mark.filename("bad_math.pdf")
|
| 8 |
-
def test_keep_ocr(pdf_document):
|
| 9 |
-
contained_lines = pdf_document.pages[0].contained_blocks(
|
| 10 |
-
pdf_document, [BlockTypes.Line]
|
| 11 |
-
)
|
| 12 |
-
|
| 13 |
-
# Check that we grabbed the right text
|
| 14 |
-
assert "Lemma" in contained_lines[-1].formatted_text(pdf_document)
|
| 15 |
-
assert "distribution" in contained_lines[-2].formatted_text(pdf_document)
|
| 16 |
-
|
| 17 |
-
# Line 2 comes after line 1
|
| 18 |
-
assert contained_lines[-1].polygon.bbox[1] > contained_lines[-2].polygon.bbox[3]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/config/test_config.py
CHANGED
|
@@ -67,10 +67,9 @@ def test_config_llm():
|
|
| 67 |
|
| 68 |
|
| 69 |
def test_config_force_ocr():
|
| 70 |
-
kwargs = capture_kwargs(["test", "--force_ocr"
|
| 71 |
parser = ConfigParser(kwargs)
|
| 72 |
config_dict = parser.generate_config_dict()
|
| 73 |
|
| 74 |
# Validate kwarg capturing
|
| 75 |
assert config_dict["force_ocr"]
|
| 76 |
-
assert config_dict["format_lines"]
|
|
|
|
| 67 |
|
| 68 |
|
| 69 |
def test_config_force_ocr():
|
| 70 |
+
kwargs = capture_kwargs(["test", "--force_ocr"])
|
| 71 |
parser = ConfigParser(kwargs)
|
| 72 |
config_dict = parser.generate_config_dict()
|
| 73 |
|
| 74 |
# Validate kwarg capturing
|
| 75 |
assert config_dict["force_ocr"]
|
|
|