# ocr_engine.py
# Lightweight Tesseract + pdf2image OCR wrapper that returns text and
# average confidence (0-100).
# Fixes: proper handling of tesseract confidences which are strings,
# ignore '-1', robust error logging.

import logging
import os
from typing import List, Tuple

import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from pytesseract import Output

logger = logging.getLogger("ocr_engine")
logger.setLevel(logging.INFO)


def _valid_confidences(raw_confs) -> List[int]:
    """Return the non-negative integer confidences found in *raw_confs*.

    Values may arrive as strings, ints or floats depending on the
    pytesseract version, so each one is normalised via ``int(float(v))``.
    Entries that cannot be parsed, and negative values such as the ``-1``
    entries, are dropped.
    """
    valid = []
    for raw in raw_confs:
        try:
            value = int(float(raw))
        except (TypeError, ValueError, OverflowError):
            # None, non-numeric text, NaN or infinity: not a usable score.
            continue
        if value >= 0:
            valid.append(value)
    return valid


def extract_text_and_conf(file_path: str) -> Tuple[str, float]:
    """Extract text and an average confidence score from a PDF or image.

    Paths ending in ``.pdf`` (case-insensitive) are rasterised with
    ``convert_from_path``; anything else is opened as a single image and
    converted to RGB. Every page is passed to Tesseract twice: once for
    the page text and once for per-word confidences.

    Args:
        file_path: Path to the PDF or image file to OCR.

    Returns:
        A ``(text_content, average_confidence_0_to_100)`` tuple. The text
        is the pages joined with ``--- Page N ---`` headers; the
        confidence is the mean of all valid word confidences across all
        pages, rounded to two decimals, or ``0.0`` when none were found.

        ``("", 0.0)`` is returned when the file does not exist, cannot be
        loaded, or text extraction fails. This function logs errors
        instead of raising them.
    """
    if not os.path.exists(file_path):
        logger.error("extract_text_and_conf: file not found: %s", file_path)
        return "", 0.0

    text_content = []
    confidences = []

    try:
        images = []
        if file_path.lower().endswith('.pdf'):
            try:
                images = convert_from_path(file_path)
            except Exception as e:
                logger.exception("PDF convert_from_path error: %s", e)
                return "", 0.0
        else:
            try:
                # Image.open keeps the underlying file open; convert()
                # returns an independent in-memory copy, so the source
                # handle can be released as soon as the copy exists.
                with Image.open(file_path) as source_image:
                    images = [source_image.convert("RGB")]
            except Exception as e:
                logger.exception("Image open error: %s", e)
                return "", 0.0

        for i, image in enumerate(images):
            # Page-level text (layout preserved)
            page_text = pytesseract.image_to_string(image)
            text_content.append(f"--- Page {i+1} ---\n{page_text}\n")

            # Per-word confidence info. A failure here is logged but must
            # not discard the text that was already extracted.
            try:
                data = pytesseract.image_to_data(image, output_type=Output.DICT)
                page_conf_vals = _valid_confidences(data.get("conf", []))
                if page_conf_vals:
                    confidences.extend(page_conf_vals)
                    avg_page = sum(page_conf_vals) / len(page_conf_vals)
                    logger.debug(
                        "OCR page %d avg confidence: %.2f", i+1, avg_page
                    )
            except Exception as e:
                logger.exception("image_to_data failed on page %d: %s", i+1, e)
                # continue; don't fail entire extraction

        combined_text = "\n".join(text_content).strip()
        if confidences:
            avg_conf = float(round(sum(confidences) / len(confidences), 2))
        else:
            avg_conf = 0.0
        logger.info(
            "OCR extracted text length=%d, avg_conf=%.2f",
            len(combined_text),
            avg_conf,
        )
        return combined_text, avg_conf

    except Exception as e:
        # Top-level boundary: any unexpected OCR failure becomes an empty
        # result so callers never have to handle exceptions.
        logger.exception("OCR critical error: %s", e)
        return "", 0.0