Spaces:

rt4u
/

marker

Sleeping

App Files Files Community

marker / tests /processors /test_llm_processors.py

peppermenta

Fix tests - Include detection model in TableProcessor init

f14987e 3 months ago

raw

history blame contribute delete

7.04 kB

	from unittest.mock import MagicMock, Mock

	import pytest
	from marker.processors.llm.llm_complex import LLMComplexRegionProcessor
	from marker.processors.llm.llm_equation import LLMEquationProcessor

	from marker.processors.llm.llm_form import LLMFormProcessor
	from marker.processors.llm.llm_image_description import LLMImageDescriptionProcessor
	from marker.processors.llm.llm_meta import LLMSimpleBlockMetaProcessor
	from marker.processors.llm.llm_table import LLMTableProcessor
	from marker.processors.table import TableProcessor
	from marker.renderers.markdown import MarkdownRenderer
	from marker.schema import BlockTypes
	from marker.schema.blocks import ComplexRegion


	@pytest.mark.filename("form_1040.pdf")
	@pytest.mark.config({"page_range": [0]})
	def test_llm_form_processor_no_config(pdf_document, llm_service):
	processor_lst = [LLMFormProcessor()]
	processor = LLMSimpleBlockMetaProcessor(processor_lst, llm_service)
	processor(pdf_document)

	forms = pdf_document.contained_blocks((BlockTypes.Form,))
	assert forms[0].html is None


	@pytest.mark.filename("form_1040.pdf")
	@pytest.mark.config({"page_range": [0]})
	def test_llm_form_processor_no_cells(pdf_document, llm_service):
	config = {"use_llm": True, "gemini_api_key": "test"}
	processor_lst = [LLMFormProcessor(config)]
	processor = LLMSimpleBlockMetaProcessor(processor_lst, llm_service, config)
	processor(pdf_document)

	forms = pdf_document.contained_blocks((BlockTypes.Form,))
	assert forms[0].html is None


	@pytest.mark.filename("form_1040.pdf")
	@pytest.mark.config({"page_range": [0]})
	def test_llm_form_processor(pdf_document, table_rec_model, recognition_model, detection_model):
	corrected_html = "<em>This is corrected markdown.</em>\n" * 100
	corrected_html = "<p>" + corrected_html.strip() + "</p>\n"

	mock_cls = Mock()
	mock_cls.return_value = {"corrected_html": corrected_html}

	cell_processor = TableProcessor(recognition_model, table_rec_model, detection_model)
	cell_processor(pdf_document)

	config = {"use_llm": True, "gemini_api_key": "test"}
	processor_lst = [LLMFormProcessor(config)]
	processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
	processor(pdf_document)

	forms = pdf_document.contained_blocks((BlockTypes.Form,))
	assert forms[0].html == corrected_html.strip()



	@pytest.mark.filename("table_ex2.pdf")
	@pytest.mark.config({"page_range": [0]})
	def test_llm_table_processor(pdf_document, table_rec_model, recognition_model, detection_model):
	corrected_html = """
	<table>
	<tr>
	<td>Column 1</td>
	<td>Column 2</td>
	<td>Column 3</td>
	<td>Column 4</td>
	</tr>
	<tr>
	<td>Value 1 <math>x</math></td>
	<td>Value 2</td>
	<td>Value 3</td>
	<td>Value 4</td>
	</tr>
	<tr>
	<td>Value 5</td>
	<td>Value 6</td>
	<td>Value 7</td>
	<td>Value 8</td>
	</tr>
	</table>
	""".strip()

	mock_cls = Mock()
	mock_cls.return_value = {"corrected_html": corrected_html}

	cell_processor = TableProcessor(recognition_model, table_rec_model, detection_model)
	cell_processor(pdf_document)

	processor = LLMTableProcessor(mock_cls, {"use_llm": True, "gemini_api_key": "test"})
	processor(pdf_document)

	tables = pdf_document.contained_blocks((BlockTypes.Table,))
	table_cells = tables[0].contained_blocks(pdf_document, (BlockTypes.TableCell,))
	assert table_cells[0].text == "Column 1"

	markdown = MarkdownRenderer()(pdf_document).markdown
	assert "Value 1 $x$" in markdown


	@pytest.mark.filename("A17_FlightPlan.pdf")
	@pytest.mark.config({"page_range": [0]})
	def test_llm_caption_processor_disabled(pdf_document):
	config = {"use_llm": True, "gemini_api_key": "test"}
	mock_cls = MagicMock()
	processor_lst = [LLMImageDescriptionProcessor(config)]
	processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
	processor(pdf_document)

	contained_pictures = pdf_document.contained_blocks((BlockTypes.Picture, BlockTypes.Figure))
	assert all(picture.description is None for picture in contained_pictures)

	@pytest.mark.filename("A17_FlightPlan.pdf")
	@pytest.mark.config({"page_range": [0]})
	def test_llm_caption_processor(pdf_document):
	description = "This is an image description."
	mock_cls = Mock()
	mock_cls.return_value = {"image_description": description}

	config = {"use_llm": True, "gemini_api_key": "test", "extract_images": False}
	processor_lst = [LLMImageDescriptionProcessor(config)]
	processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
	processor(pdf_document)

	contained_pictures = pdf_document.contained_blocks((BlockTypes.Picture, BlockTypes.Figure))
	assert all(picture.description == description for picture in contained_pictures)

	# Ensure the rendering includes the description
	renderer = MarkdownRenderer({"extract_images": False})
	md = renderer(pdf_document).markdown

	assert description in md


	@pytest.mark.filename("A17_FlightPlan.pdf")
	@pytest.mark.config({"page_range": [0]})
	def test_llm_complex_region_processor(pdf_document):
	md = "This is some markdown for a complex region."
	mock_cls = Mock()
	mock_cls.return_value = {"corrected_markdown": md * 25}

	# Replace the block with a complex region
	old_block = pdf_document.pages[0].children[0]
	new_block = ComplexRegion(
	**old_block.dict(exclude=["id", "block_id", "block_type"]),
	)
	pdf_document.pages[0].replace_block(old_block, new_block)

	# Test processor
	config = {"use_llm": True, "gemini_api_key": "test"}
	processor_lst = [LLMComplexRegionProcessor(config)]
	processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
	processor(pdf_document)

	# Ensure the rendering includes the description
	renderer = MarkdownRenderer()
	rendered_md = renderer(pdf_document).markdown

	assert md in rendered_md

	@pytest.mark.filename("adversarial.pdf")
	@pytest.mark.config({"page_range": [0]})
	def test_multi_llm_processors(pdf_document):
	description = "<math>This is an image description. And here is a lot of writing about it.</math>" * 10
	mock_cls = Mock()
	mock_cls.return_value = {"image_description": description, "corrected_equation": description}

	config = {"use_llm": True, "gemini_api_key": "test", "extract_images": False, "min_equation_height": .001}
	processor_lst = [LLMImageDescriptionProcessor(config), LLMEquationProcessor(config)]
	processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
	processor(pdf_document)

	contained_pictures = pdf_document.contained_blocks((BlockTypes.Picture, BlockTypes.Figure))
	assert all(picture.description == description for picture in contained_pictures)

	contained_equations = pdf_document.contained_blocks((BlockTypes.Equation,))
	print([equation.html for equation in contained_equations])
	assert all(equation.html == description for equation in contained_equations)