Spaces:

CerealDev
/

Docling-UI

Configuration error

Docling-UI / docling_serve /docling_conversion.py

Tiago Santana

feat: expose options for new features (#92)

405b27f unverified 10 months ago

7.97 kB

	import hashlib
	import json
	import logging
	from collections.abc import Iterable, Iterator
	from pathlib import Path
	from typing import Any, Optional, Union

	from fastapi import HTTPException

	from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
	from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
	from docling.backend.pdf_backend import PdfDocumentBackend
	from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
	from docling.datamodel.base_models import DocumentStream, InputFormat
	from docling.datamodel.document import ConversionResult
	from docling.datamodel.pipeline_options import (
	EasyOcrOptions,
	OcrEngine,
	OcrOptions,
	PdfBackend,
	PdfPipelineOptions,
	RapidOcrOptions,
	TableFormerMode,
	TesseractOcrOptions,
	)
	from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
	from docling_core.types.doc import ImageRefMode

	from docling_serve.datamodel.convert import ConvertDocumentsOptions
	from docling_serve.helper_functions import _to_list_of_strings
	from docling_serve.settings import docling_serve_settings

	_log = logging.getLogger(__name__)


	# Document converters will be preloaded and stored in a dictionary
	converters: dict[bytes, DocumentConverter] = {}


	# Custom serializer for PdfFormatOption
	# (model_dump_json does not work with some classes)
	def _serialize_pdf_format_option(pdf_format_option: PdfFormatOption) -> str:
	data = pdf_format_option.model_dump()

	# pipeline_options are not fully serialized by model_dump, dedicated pass
	if pdf_format_option.pipeline_options:
	data["pipeline_options"] = pdf_format_option.pipeline_options.model_dump()

	# Replace `artifacts_path` with a string representation
	data["pipeline_options"]["artifacts_path"] = repr(
	data["pipeline_options"]["artifacts_path"]
	)

	# Replace `pipeline_cls` with a string representation
	data["pipeline_cls"] = repr(data["pipeline_cls"])

	# Replace `backend` with a string representation
	data["backend"] = repr(data["backend"])

	# Handle `device` in `accelerator_options`
	if "accelerator_options" in data and "device" in data["accelerator_options"]:
	data["accelerator_options"]["device"] = repr(
	data["accelerator_options"]["device"]
	)

	# Serialize the dictionary to JSON with sorted keys to have consistent hashes
	return json.dumps(data, sort_keys=True)


	# Computes the PDF pipeline options and returns the PdfFormatOption and its hash
	def get_pdf_pipeline_opts( # noqa: C901
	request: ConvertDocumentsOptions,
	) -> tuple[PdfFormatOption, bytes]:
	if request.ocr_engine == OcrEngine.EASYOCR:
	try:
	import easyocr # noqa: F401
	except ImportError:
	raise HTTPException(
	status_code=400,
	detail="The requested OCR engine"
	f" (ocr_engine={request.ocr_engine.value})"
	" is not available on this system. Please choose another OCR engine "
	"or contact your system administrator.",
	)
	ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=request.force_ocr)
	elif request.ocr_engine == OcrEngine.TESSERACT:
	try:
	import tesserocr # noqa: F401
	except ImportError:
	raise HTTPException(
	status_code=400,
	detail="The requested OCR engine"
	f" (ocr_engine={request.ocr_engine.value})"
	" is not available on this system. Please choose another OCR engine "
	"or contact your system administrator.",
	)
	ocr_options = TesseractOcrOptions(force_full_page_ocr=request.force_ocr)
	elif request.ocr_engine == OcrEngine.RAPIDOCR:
	try:
	from rapidocr_onnxruntime import RapidOCR # noqa: F401
	except ImportError:
	raise HTTPException(
	status_code=400,
	detail="The requested OCR engine"
	f" (ocr_engine={request.ocr_engine.value})"
	" is not available on this system. Please choose another OCR engine "
	"or contact your system administrator.",
	)
	ocr_options = RapidOcrOptions(force_full_page_ocr=request.force_ocr)
	else:
	raise RuntimeError(f"Unexpected OCR engine type {request.ocr_engine}")

	if request.ocr_lang is not None:
	if isinstance(request.ocr_lang, str):
	ocr_options.lang = _to_list_of_strings(request.ocr_lang)
	else:
	ocr_options.lang = request.ocr_lang

	pipeline_options = PdfPipelineOptions(
	do_ocr=request.do_ocr,
	ocr_options=ocr_options,
	do_table_structure=request.do_table_structure,
	do_code_enrichment=request.do_code_enrichment,
	do_formula_enrichment=request.do_formula_enrichment,
	do_picture_classification=request.do_picture_classification,
	do_picture_description=request.do_picture_description,
	)
	pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
	pipeline_options.table_structure_options.mode = TableFormerMode(request.table_mode)

	if request.image_export_mode != ImageRefMode.PLACEHOLDER:
	pipeline_options.generate_page_images = True
	if request.images_scale:
	pipeline_options.images_scale = request.images_scale

	if request.pdf_backend == PdfBackend.DLPARSE_V1:
	backend: type[PdfDocumentBackend] = DoclingParseDocumentBackend
	elif request.pdf_backend == PdfBackend.DLPARSE_V2:
	backend = DoclingParseV2DocumentBackend
	elif request.pdf_backend == PdfBackend.PYPDFIUM2:
	backend = PyPdfiumDocumentBackend
	else:
	raise RuntimeError(f"Unexpected PDF backend type {request.pdf_backend}")

	if docling_serve_settings.artifacts_path is not None:
	if str(docling_serve_settings.artifacts_path.absolute()) == "":
	_log.info(
	"artifacts_path is an empty path, model weights will be dowloaded "
	"at runtime."
	)
	pipeline_options.artifacts_path = None
	elif docling_serve_settings.artifacts_path.is_dir():
	_log.info(
	"artifacts_path is set to a valid directory. "
	"No model weights will be downloaded at runtime."
	)
	pipeline_options.artifacts_path = docling_serve_settings.artifacts_path
	else:
	_log.warning(
	"artifacts_path is set to an invalid directory. "
	"The system will download the model weights at runtime."
	)
	pipeline_options.artifacts_path = None
	else:
	_log.info(
	"artifacts_path is unset. "
	"The system will download the model weights at runtime."
	)

	pdf_format_option = PdfFormatOption(
	pipeline_options=pipeline_options,
	backend=backend,
	)

	serialized_data = _serialize_pdf_format_option(pdf_format_option)

	options_hash = hashlib.sha1(serialized_data.encode()).digest()

	return pdf_format_option, options_hash


	def convert_documents(
	sources: Iterable[Union[Path, str, DocumentStream]],
	options: ConvertDocumentsOptions,
	headers: Optional[dict[str, Any]] = None,
	):
	pdf_format_option, options_hash = get_pdf_pipeline_opts(options)

	if options_hash not in converters:
	format_options: dict[InputFormat, FormatOption] = {
	InputFormat.PDF: pdf_format_option,
	InputFormat.IMAGE: pdf_format_option,
	}

	converters[options_hash] = DocumentConverter(format_options=format_options)
	_log.info(f"We now have {len(converters)} converters in memory.")

	results: Iterator[ConversionResult] = converters[options_hash].convert_all(
	sources,
	headers=headers,
	)

	return results