# ocr_engine.py
# Lightweight Tesseract + pdf2image OCR wrapper that returns text and average confidence (0-100).
# Fixes: proper handling of tesseract confidences which are strings, ignore '-1', robust error logging.

import pytesseract
from pytesseract import Output
from pdf2image import convert_from_path
from PIL import Image
import os
import logging
from typing import Tuple

# Module logger. The INFO level is set here at import time, so the per-page
# logger.debug() message below is only emitted if the application lowers this
# logger's level afterwards.
logger = logging.getLogger("ocr_engine")
logger.setLevel(logging.INFO)


def _valid_confidences(raw_confs) -> list:
    """Return the usable values from Tesseract's per-box ``conf`` column.

    Entries may arrive as ints, floats, or numeric strings depending on the
    pytesseract/Tesseract versions in use. Anything that cannot be parsed as
    a number is skipped, as are negative values (``-1`` is what the ``conf``
    column carries for boxes without a word confidence) and NaN/infinity.
    Fractional parts are kept so the caller's average is not biased low.
    """
    values = []
    for raw in raw_confs:
        try:
            value = float(raw)
        except (TypeError, ValueError):
            # Non-numeric entry (e.g. empty string or None): ignore it.
            continue
        # NaN fails both comparisons; +inf fails the upper bound.
        if 0.0 <= value < float("inf"):
            values.append(value)
    return values


def extract_text_and_conf(file_path: str) -> Tuple[str, float]:
    """
    Extract OCR text and an average word confidence from a PDF or image file.

    A path ending in ``.pdf`` (case-insensitive) is rendered to page images
    with ``convert_from_path``; any other path is opened as a single image
    and converted to RGB. Each page is passed to
    ``pytesseract.image_to_string`` for the text and to
    ``pytesseract.image_to_data`` for the per-word confidences.

    Args:
        file_path: Path to the input file. ``str`` is the documented type;
            ``os.PathLike`` objects and ``bytes`` paths are accepted as well.

    Returns:
        ``(text_content, average_confidence_0_to_100)``. The text is the
        concatenation of every page, each preceded by a ``--- Page N ---``
        header. The confidence is the mean over all valid word confidences
        of all pages, rounded to two decimals, or ``0.0`` when none were
        collected.

        This function does not raise: on an invalid or missing path, a file
        that cannot be opened/converted, or an unexpected OCR error it logs
        the problem and returns ``("", 0.0)``.
    """
    # Normalise the argument first so that None or other non-path objects
    # produce the documented ("", 0.0) result instead of a TypeError.
    try:
        path = os.fspath(file_path)
    except TypeError:
        logger.error("extract_text_and_conf: invalid file path: %r", file_path)
        return "", 0.0
    if isinstance(path, bytes):
        path = os.fsdecode(path)

    if not os.path.exists(path):
        logger.error("extract_text_and_conf: file not found: %s", path)
        return "", 0.0

    page_texts = []
    confidences = []

    try:
        if path.lower().endswith(".pdf"):
            try:
                images = convert_from_path(path)
            except Exception as e:
                logger.exception("PDF convert_from_path error: %s", e)
                return "", 0.0
        else:
            try:
                # Image.open keeps the file open lazily; convert() loads the
                # pixel data into a new image, so the handle can be released
                # as soon as the conversion is done.
                with Image.open(path) as source_image:
                    images = [source_image.convert("RGB")]
            except Exception as e:
                logger.exception("Image open error: %s", e)
                return "", 0.0

        for page_number, image in enumerate(images, start=1):
            # Page-level text (layout preserved)
            page_text = pytesseract.image_to_string(image)
            page_texts.append(f"--- Page {page_number} ---\n{page_text}\n")

            # Per-word confidence info. A failure here only costs this page's
            # confidences; the text already extracted is kept.
            try:
                data = pytesseract.image_to_data(image, output_type=Output.DICT)
                page_confs = _valid_confidences(data.get("conf", []))
            except Exception as e:
                logger.exception("image_to_data failed on page %d: %s", page_number, e)
                continue

            if page_confs:
                confidences.extend(page_confs)
                avg_page = sum(page_confs) / len(page_confs)
                logger.debug("OCR page %d avg confidence: %.2f", page_number, avg_page)

        combined_text = "\n".join(page_texts).strip()
        avg_conf = float(round(sum(confidences) / len(confidences), 2)) if confidences else 0.0
        logger.info("OCR extracted text length=%d, avg_conf=%.2f", len(combined_text), avg_conf)
        return combined_text, avg_conf

    except Exception as e:
        # Last-resort boundary (e.g. image_to_string failing): the whole
        # extraction is reported as empty rather than propagating the error.
        logger.exception("OCR critical error: %s", e)
        return "", 0.0