ocr_mcp_1

Running

App Files Files Community

ocr_mcp_1 / ocr_engine.py

vachaspathi

Update ocr_engine.py

274213e verified 4 days ago

raw

history blame contribute delete

3.05 kB

	# ocr_engine.py
	# Lightweight Tesseract + pdf2image OCR wrapper that returns text and average confidence (0-100).
	# Fixes: proper handling of tesseract confidences which are strings, ignore '-1', robust error logging.

	import pytesseract
	from pytesseract import Output
	from pdf2image import convert_from_path
	from PIL import Image
	import os
	import logging
	from typing import Tuple

	# Module-wide logger registered under the fixed name "ocr_engine".
	# Its own threshold is INFO, so the per-page logger.debug() calls in this
	# module are filtered out unless the level is lowered elsewhere.
	logger = logging.getLogger("ocr_engine")
	logger.setLevel(level=logging.INFO)

	def _load_page_images(file_path):
	    """Load the page images to OCR; return None if the file cannot be loaded.

	    Paths ending in ".pdf" (case-insensitive) are rasterised with
	    pdf2image's convert_from_path. Any other path is opened with PIL and
	    converted to RGB, giving a one-element list. Failures are logged with a
	    traceback and reported to the caller as None.
	    """
	    if file_path.lower().endswith('.pdf'):
	        try:
	            return convert_from_path(file_path)
	        except Exception as e:
	            logger.exception("PDF convert_from_path error: %s", e)
	            return None

	    try:
	        # convert() returns an independent in-memory copy, so the source
	        # image (and its underlying file handle) can be closed right away
	        # instead of lingering until garbage collection.
	        with Image.open(file_path) as source_image:
	            return [source_image.convert("RGB")]
	    except Exception as e:
	        logger.exception("Image open error: %s", e)
	        return None


	def _page_confidences(image) -> list:
	    """Return the usable confidence values Tesseract reports for one image.

	    Each entry of the "conf" list from image_to_data may be a string, an int
	    or a float depending on the pytesseract version, so every value is
	    normalised with int(float(...)). Non-numeric entries and negative
	    values (the '-1' placeholder) are dropped.
	    """
	    data = pytesseract.image_to_data(image, output_type=Output.DICT)
	    usable = []
	    for raw_conf in data.get("conf", []):
	        try:
	            score = int(float(raw_conf))
	        except (TypeError, ValueError, OverflowError):
	            # Non-numeric, NaN or infinite entry: ignore it.
	            continue
	        if score >= 0:
	            usable.append(score)
	    return usable


	def extract_text_and_conf(file_path: str) -> Tuple[str, float]:
	    """Extract text and an average OCR confidence from a PDF or image file.

	    Every page is OCR'd twice: image_to_string supplies the page text and
	    image_to_data supplies the confidence values. Page texts are joined
	    under "--- Page N ---" headers (N starts at 1).

	    Args:
	        file_path: Path to the input file. A path ending in ".pdf"
	            (case-insensitive) is treated as a PDF; anything else is opened
	            as a single image.

	    Returns:
	        A (text, average_confidence) tuple. The confidence is the mean of
	        all usable confidence values across all pages, rounded to two
	        decimals, or 0.0 when there are none. ("", 0.0) is returned when
	        the path is empty/None or missing, when the file cannot be loaded,
	        or when an unexpected error occurs during OCR. Failures are logged
	        rather than raised.
	    """
	    # The falsy check keeps None from reaching os.path.exists(), which would
	    # raise TypeError outside the try block below.
	    if not file_path or not os.path.exists(file_path):
	        logger.error("extract_text_and_conf: file not found: %s", file_path)
	        return "", 0.0

	    text_content = []
	    confidences = []

	    try:
	        images = _load_page_images(file_path)
	        if images is None:
	            return "", 0.0

	        for page_number, image in enumerate(images, start=1):
	            # Page-level text (layout preserved)
	            page_text = pytesseract.image_to_string(image)
	            text_content.append(f"--- Page {page_number} ---\n{page_text}\n")

	            # Confidence extraction is best-effort: a failure here is
	            # logged and the page's text is still kept.
	            try:
	                page_conf_vals = _page_confidences(image)
	            except Exception as e:
	                logger.exception("image_to_data failed on page %d: %s", page_number, e)
	                continue

	            if page_conf_vals:
	                confidences.extend(page_conf_vals)
	                avg_page = sum(page_conf_vals) / len(page_conf_vals)
	                logger.debug("OCR page %d avg confidence: %.2f", page_number, avg_page)

	        combined_text = "\n".join(text_content).strip()
	        avg_conf = float(round(sum(confidences) / len(confidences), 2)) if confidences else 0.0
	        logger.info("OCR extracted text length=%d, avg_conf=%.2f", len(combined_text), avg_conf)
	        return combined_text, avg_conf

	    except Exception as e:
	        # Top-level boundary: any other failure (e.g. image_to_string
	        # raising) is logged and reported as an empty result.
	        logger.exception("OCR critical error: %s", e)
	        return "", 0.0