Spaces:
Running
Running
| # ocr_engine.py | |
| # Lightweight Tesseract + pdf2image OCR wrapper that returns text and average confidence (0-100). | |
| # Fixes: proper handling of tesseract confidences which are strings, ignore '-1', robust error logging. | |
| import pytesseract | |
| from pytesseract import Output | |
| from pdf2image import convert_from_path | |
| from PIL import Image | |
| import os | |
| import logging | |
| from typing import Tuple | |
# Module-level logger; the level is pinned to INFO here, so the per-page
# debug messages below are only emitted if a caller lowers the level.
logger = logging.getLogger("ocr_engine")
logger.setLevel(logging.INFO)


def _load_images(file_path: str):
    """Load the page images for *file_path*, or return None on failure.

    Paths ending in ``.pdf`` (case-insensitive) go through
    ``convert_from_path``; anything else is opened with PIL and converted
    to RGB. Failures are logged with a traceback and signalled with
    ``None`` so the caller can tell "could not load" from "zero pages".
    """
    if file_path.lower().endswith(".pdf"):
        try:
            return convert_from_path(file_path)
        except Exception as e:
            logger.exception("PDF convert_from_path error: %s", e)
            return None

    try:
        # Open in a context manager so the underlying file handle is
        # released; the converted copy is what we keep and return.
        with Image.open(file_path) as source:
            return [source.convert("RGB")]
    except Exception as e:
        logger.exception("Image open error: %s", e)
        return None


def _page_confidences(image, page_number: int) -> list:
    """Return the usable confidence values reported for one page image.

    Values are cast with ``int(float(...))`` because they may arrive as
    strings or numbers. Negative values (the '-1' entries) and
    non-numeric entries are skipped. Any failure is logged and yields an
    empty list so that one bad page does not abort the whole extraction.
    """
    try:
        data = pytesseract.image_to_data(image, output_type=Output.DICT)
        values = []
        for raw in data.get("conf", []):
            try:
                value = int(float(raw))
            except (TypeError, ValueError, OverflowError):
                # Non-numeric, None, NaN or infinite confidence: ignore it.
                continue
            if value >= 0:
                values.append(value)
        if values:
            logger.debug(
                "OCR page %d avg confidence: %.2f",
                page_number,
                sum(values) / len(values),
            )
        return values
    except Exception as e:
        logger.exception("image_to_data failed on page %d: %s", page_number, e)
        return []


def extract_text_and_conf(file_path: str) -> Tuple[str, float]:
    """Extract text and an average confidence score from a PDF or image.

    Args:
        file_path: Path to the input file. A ``.pdf`` suffix selects the
            PDF path; every other file is opened as an image.

    Returns:
        ``(text_content, average_confidence_0_to_100)``. The text is the
        per-page OCR output, each page prefixed with a ``--- Page N ---``
        marker. The confidence is the mean of all usable confidence
        values across all pages, rounded to two decimals. ``("", 0.0)``
        is returned when the file does not exist, cannot be loaded, or
        OCR fails; the confidence is ``0.0`` when no usable values were
        reported.

    Errors are logged rather than raised.
    """
    if not os.path.exists(file_path):
        logger.error("extract_text_and_conf: file not found: %s", file_path)
        return "", 0.0

    try:
        images = _load_images(file_path)
        if images is None:
            return "", 0.0

        text_content = []
        confidences = []
        for page_number, image in enumerate(images, start=1):
            # Page-level text (layout preserved).
            page_text = pytesseract.image_to_string(image)
            text_content.append(f"--- Page {page_number} ---\n{page_text}\n")
            # Confidence extraction is best-effort and never fails the page.
            confidences.extend(_page_confidences(image, page_number))

        combined_text = "\n".join(text_content).strip()
        avg_conf = (
            round(sum(confidences) / len(confidences), 2) if confidences else 0.0
        )
        logger.info(
            "OCR extracted text length=%d, avg_conf=%.2f",
            len(combined_text),
            avg_conf,
        )
        return combined_text, avg_conf
    except Exception as e:
        logger.exception("OCR critical error: %s", e)
        return "", 0.0