|
|
from unittest.mock import MagicMock, Mock |
|
|
|
|
|
import pytest |
|
|
from marker.processors.llm.llm_complex import LLMComplexRegionProcessor |
|
|
from marker.processors.llm.llm_equation import LLMEquationProcessor |
|
|
|
|
|
from marker.processors.llm.llm_form import LLMFormProcessor |
|
|
from marker.processors.llm.llm_image_description import LLMImageDescriptionProcessor |
|
|
from marker.processors.llm.llm_meta import LLMSimpleBlockMetaProcessor |
|
|
from marker.processors.llm.llm_table import LLMTableProcessor |
|
|
from marker.processors.table import TableProcessor |
|
|
from marker.renderers.markdown import MarkdownRenderer |
|
|
from marker.schema import BlockTypes |
|
|
from marker.schema.blocks import ComplexRegion |
|
|
|
|
|
|
|
|
@pytest.mark.filename("form_1040.pdf") |
|
|
@pytest.mark.config({"page_range": [0]}) |
|
|
def test_llm_form_processor_no_config(pdf_document, llm_service): |
|
|
processor_lst = [LLMFormProcessor()] |
|
|
processor = LLMSimpleBlockMetaProcessor(processor_lst, llm_service) |
|
|
processor(pdf_document) |
|
|
|
|
|
forms = pdf_document.contained_blocks((BlockTypes.Form,)) |
|
|
assert forms[0].html is None |
|
|
|
|
|
|
|
|
@pytest.mark.filename("form_1040.pdf") |
|
|
@pytest.mark.config({"page_range": [0]}) |
|
|
def test_llm_form_processor_no_cells(pdf_document, llm_service): |
|
|
config = {"use_llm": True, "gemini_api_key": "test"} |
|
|
processor_lst = [LLMFormProcessor(config)] |
|
|
processor = LLMSimpleBlockMetaProcessor(processor_lst, llm_service, config) |
|
|
processor(pdf_document) |
|
|
|
|
|
forms = pdf_document.contained_blocks((BlockTypes.Form,)) |
|
|
assert forms[0].html is None |
|
|
|
|
|
|
|
|
@pytest.mark.filename("form_1040.pdf") |
|
|
@pytest.mark.config({"page_range": [0]}) |
|
|
def test_llm_form_processor(pdf_document, table_rec_model, recognition_model, detection_model): |
|
|
corrected_html = "<em>This is corrected markdown.</em>\n" * 100 |
|
|
corrected_html = "<p>" + corrected_html.strip() + "</p>\n" |
|
|
|
|
|
mock_cls = Mock() |
|
|
mock_cls.return_value = {"corrected_html": corrected_html} |
|
|
|
|
|
cell_processor = TableProcessor(recognition_model, table_rec_model, detection_model) |
|
|
cell_processor(pdf_document) |
|
|
|
|
|
config = {"use_llm": True, "gemini_api_key": "test"} |
|
|
processor_lst = [LLMFormProcessor(config)] |
|
|
processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config) |
|
|
processor(pdf_document) |
|
|
|
|
|
forms = pdf_document.contained_blocks((BlockTypes.Form,)) |
|
|
assert forms[0].html == corrected_html.strip() |
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.filename("table_ex2.pdf") |
|
|
@pytest.mark.config({"page_range": [0]}) |
|
|
def test_llm_table_processor(pdf_document, table_rec_model, recognition_model, detection_model): |
|
|
corrected_html = """ |
|
|
<table> |
|
|
<tr> |
|
|
<td>Column 1</td> |
|
|
<td>Column 2</td> |
|
|
<td>Column 3</td> |
|
|
<td>Column 4</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Value 1 <math>x</math></td> |
|
|
<td>Value 2</td> |
|
|
<td>Value 3</td> |
|
|
<td>Value 4</td> |
|
|
</tr> |
|
|
<tr> |
|
|
<td>Value 5</td> |
|
|
<td>Value 6</td> |
|
|
<td>Value 7</td> |
|
|
<td>Value 8</td> |
|
|
</tr> |
|
|
</table> |
|
|
""".strip() |
|
|
|
|
|
mock_cls = Mock() |
|
|
mock_cls.return_value = {"corrected_html": corrected_html} |
|
|
|
|
|
cell_processor = TableProcessor(recognition_model, table_rec_model, detection_model) |
|
|
cell_processor(pdf_document) |
|
|
|
|
|
processor = LLMTableProcessor(mock_cls, {"use_llm": True, "gemini_api_key": "test"}) |
|
|
processor(pdf_document) |
|
|
|
|
|
tables = pdf_document.contained_blocks((BlockTypes.Table,)) |
|
|
table_cells = tables[0].contained_blocks(pdf_document, (BlockTypes.TableCell,)) |
|
|
assert table_cells[0].text == "Column 1" |
|
|
|
|
|
markdown = MarkdownRenderer()(pdf_document).markdown |
|
|
assert "Value 1 $x$" in markdown |
|
|
|
|
|
|
|
|
@pytest.mark.filename("A17_FlightPlan.pdf") |
|
|
@pytest.mark.config({"page_range": [0]}) |
|
|
def test_llm_caption_processor_disabled(pdf_document): |
|
|
config = {"use_llm": True, "gemini_api_key": "test"} |
|
|
mock_cls = MagicMock() |
|
|
processor_lst = [LLMImageDescriptionProcessor(config)] |
|
|
processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config) |
|
|
processor(pdf_document) |
|
|
|
|
|
contained_pictures = pdf_document.contained_blocks((BlockTypes.Picture, BlockTypes.Figure)) |
|
|
assert all(picture.description is None for picture in contained_pictures) |
|
|
|
|
|
@pytest.mark.filename("A17_FlightPlan.pdf") |
|
|
@pytest.mark.config({"page_range": [0]}) |
|
|
def test_llm_caption_processor(pdf_document): |
|
|
description = "This is an image description." |
|
|
mock_cls = Mock() |
|
|
mock_cls.return_value = {"image_description": description} |
|
|
|
|
|
config = {"use_llm": True, "gemini_api_key": "test", "extract_images": False} |
|
|
processor_lst = [LLMImageDescriptionProcessor(config)] |
|
|
processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config) |
|
|
processor(pdf_document) |
|
|
|
|
|
contained_pictures = pdf_document.contained_blocks((BlockTypes.Picture, BlockTypes.Figure)) |
|
|
assert all(picture.description == description for picture in contained_pictures) |
|
|
|
|
|
|
|
|
renderer = MarkdownRenderer({"extract_images": False}) |
|
|
md = renderer(pdf_document).markdown |
|
|
|
|
|
assert description in md |
|
|
|
|
|
|
|
|
@pytest.mark.filename("A17_FlightPlan.pdf") |
|
|
@pytest.mark.config({"page_range": [0]}) |
|
|
def test_llm_complex_region_processor(pdf_document): |
|
|
md = "This is some *markdown* for a complex region." |
|
|
mock_cls = Mock() |
|
|
mock_cls.return_value = {"corrected_markdown": md * 25} |
|
|
|
|
|
|
|
|
old_block = pdf_document.pages[0].children[0] |
|
|
new_block = ComplexRegion( |
|
|
**old_block.dict(exclude=["id", "block_id", "block_type"]), |
|
|
) |
|
|
pdf_document.pages[0].replace_block(old_block, new_block) |
|
|
|
|
|
|
|
|
config = {"use_llm": True, "gemini_api_key": "test"} |
|
|
processor_lst = [LLMComplexRegionProcessor(config)] |
|
|
processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config) |
|
|
processor(pdf_document) |
|
|
|
|
|
|
|
|
renderer = MarkdownRenderer() |
|
|
rendered_md = renderer(pdf_document).markdown |
|
|
|
|
|
assert md in rendered_md |
|
|
|
|
|
@pytest.mark.filename("adversarial.pdf") |
|
|
@pytest.mark.config({"page_range": [0]}) |
|
|
def test_multi_llm_processors(pdf_document): |
|
|
description = "<math>This is an image description. And here is a lot of writing about it.</math>" * 10 |
|
|
mock_cls = Mock() |
|
|
mock_cls.return_value = {"image_description": description, "corrected_equation": description} |
|
|
|
|
|
config = {"use_llm": True, "gemini_api_key": "test", "extract_images": False, "min_equation_height": .001} |
|
|
processor_lst = [LLMImageDescriptionProcessor(config), LLMEquationProcessor(config)] |
|
|
processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config) |
|
|
processor(pdf_document) |
|
|
|
|
|
contained_pictures = pdf_document.contained_blocks((BlockTypes.Picture, BlockTypes.Figure)) |
|
|
assert all(picture.description == description for picture in contained_pictures) |
|
|
|
|
|
contained_equations = pdf_document.contained_blocks((BlockTypes.Equation,)) |
|
|
print([equation.html for equation in contained_equations]) |
|
|
assert all(equation.html == description for equation in contained_equations) |