# ocr_mcp_1 / ocr_engine.py
# vachaspathi's picture
# Update ocr_engine.py
# 274213e verified
# ocr_engine.py
# Lightweight Tesseract + pdf2image OCR wrapper that returns text and average confidence (0-100).
# Fixes: proper handling of tesseract confidences which are strings, ignore '-1', robust error logging.
import pytesseract
from pytesseract import Output
from pdf2image import convert_from_path
from PIL import Image
import os
import logging
from typing import Tuple
# Module-level logger; INFO is enabled so the per-document summary line is emitted.
logger = logging.getLogger("ocr_engine")
logger.setLevel(logging.INFO)


def _load_page_images(file_path: str):
    """Load *file_path* as a list of page images, or return None on failure.

    Paths ending in ``.pdf`` (case-insensitive) are rasterised with
    ``convert_from_path``; anything else is opened with Pillow and converted
    to a single RGB image. Failures are logged here so the caller only has to
    check for ``None``.
    """
    if file_path.lower().endswith(".pdf"):
        try:
            return convert_from_path(file_path)
        except Exception as e:
            logger.exception("PDF convert_from_path error: %s", e)
            return None

    try:
        # Image.open() is lazy and keeps the underlying file open. convert()
        # loads the pixels into a new, independent image, so the source can be
        # closed as soon as the copy exists instead of leaking the handle.
        with Image.open(file_path) as source:
            return [source.convert("RGB")]
    except Exception as e:
        logger.exception("Image open error: %s", e)
        return None


def _valid_confidences(raw_confs) -> list:
    """Return the non-negative integer confidences found in *raw_confs*.

    Values may arrive as strings, ints or floats. Negative values (the '-1'
    entries) and anything that cannot be parsed as a number are dropped.
    """
    values = []
    for raw in raw_confs:
        try:
            value = int(float(raw))
        except (TypeError, ValueError, OverflowError):
            # Non-numeric, None, NaN or infinite entry: not a usable score.
            continue
        if value >= 0:
            values.append(value)
    return values


def extract_text_and_conf(file_path: str) -> Tuple[str, float]:
    """
    Extract text and an average confidence score from a PDF or image file.

    Args:
        file_path: Path to the input. A path ending in ``.pdf`` is treated as
            a PDF (one image per page); any other path is opened as an image.

    Returns:
        ``(text_content, average_confidence_0_to_100)``. The text holds one
        ``--- Page N ---`` section per page. The confidence is the mean of all
        valid per-entry confidences across pages, rounded to 2 decimals, or
        ``0.0`` when none are available. On any failure (missing file,
        unreadable input, OCR error) the result is ``("", 0.0)`` and the error
        is logged; no exception is raised for those cases.
    """
    # isfile() rather than exists(): a directory can never be OCR'd, so reject
    # it here instead of failing later inside the image loader.
    if not os.path.isfile(file_path):
        logger.error("extract_text_and_conf: file not found: %s", file_path)
        return "", 0.0

    try:
        images = _load_page_images(file_path)
        if images is None:
            return "", 0.0

        text_content = []
        confidences = []
        for page_no, image in enumerate(images, start=1):
            # Page-level text (layout preserved)
            page_text = pytesseract.image_to_string(image)
            text_content.append(f"--- Page {page_no} ---\n{page_text}\n")

            # Confidence data is best-effort: a failure here must not discard
            # the text that was already extracted for this or other pages.
            try:
                data = pytesseract.image_to_data(image, output_type=Output.DICT)
                page_conf_vals = _valid_confidences(data.get("conf", []))
            except Exception as e:
                logger.exception("image_to_data failed on page %d: %s", page_no, e)
                continue

            if page_conf_vals:
                confidences.extend(page_conf_vals)
                avg_page = sum(page_conf_vals) / len(page_conf_vals)
                logger.debug("OCR page %d avg confidence: %.2f", page_no, avg_page)

        combined_text = "\n".join(text_content).strip()
        avg_conf = float(round(sum(confidences) / len(confidences), 2)) if confidences else 0.0
        logger.info("OCR extracted text length=%d, avg_conf=%.2f", len(combined_text), avg_conf)
        return combined_text, avg_conf
    except Exception as e:
        # Top-level boundary: callers rely on a ("", 0.0) result, not a raise.
        logger.exception("OCR critical error: %s", e)
        return "", 0.0