File size: 3,054 Bytes
274213e
 
 
 
35dd4bd
 
 
 
 
 
274213e
35dd4bd
 
274213e
35dd4bd
274213e
35dd4bd
 
 
 
 
274213e
35dd4bd
 
274213e
35dd4bd
274213e
35dd4bd
 
 
 
 
 
274213e
35dd4bd
274213e
35dd4bd
274213e
35dd4bd
274213e
35dd4bd
274213e
35dd4bd
274213e
35dd4bd
274213e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35dd4bd
 
274213e
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# ocr_engine.py
# Lightweight Tesseract + pdf2image OCR wrapper that returns text and average confidence (0-100).
# Fixes: proper handling of tesseract confidences which are strings, ignore '-1', robust error logging.

import pytesseract
from pytesseract import Output
from pdf2image import convert_from_path
from PIL import Image
import os
import logging
from typing import Tuple

# Module-wide logger registered under a fixed name, so an application can reach
# it with logging.getLogger("ocr_engine"). The threshold is pinned at INFO here,
# which means logger.debug() calls are dropped unless the level is lowered.
_LOGGER_NAME = "ocr_engine"
logger = logging.getLogger(_LOGGER_NAME)
logger.setLevel(logging.INFO)

def _load_page_images(path: str, is_pdf: bool) -> list:
    """Return the list of page images to OCR for *path*.

    PDFs are rasterised with ``convert_from_path``; any other file is opened
    with PIL and converted to RGB, giving a single-element list.
    """
    if is_pdf:
        return convert_from_path(path)
    # Image.open() is lazy and keeps the underlying file open. convert()
    # returns an independent in-memory copy, so the source handle can be
    # released as soon as the copy exists instead of waiting for GC.
    with Image.open(path) as source:
        return [source.convert("RGB")]


def _valid_confidences(raw_confs) -> list:
    """Return the non-negative integer confidences found in *raw_confs*.

    Entries may be strings, ints or floats. Anything that cannot be read as a
    number, and any negative value (e.g. the '-1' placeholder), is skipped.
    """
    values = []
    for raw in raw_confs:
        try:
            conf = int(float(raw))
        except (TypeError, ValueError, OverflowError):
            # Non-numeric, None, NaN or infinite entry: ignore it.
            continue
        if conf >= 0:
            values.append(conf)
    return values


def extract_text_and_conf(file_path: str) -> Tuple[str, float]:
    """
    Extracts text AND confidence score from a PDF or Image.

    A path ending in '.pdf' (case-insensitive) is rasterised page by page;
    anything else is opened as a single image. Each page's text is prefixed
    with a '--- Page N ---' header and the pages are joined with newlines.

    Args:
        file_path: Path to the input file. ``bytes`` and ``os.PathLike``
            values are accepted as well and normalised to ``str``.

    Returns:
        (text_content, average_confidence_0_to_100). The average is taken
        over every valid confidence value from all pages and rounded to two
        decimals; it is 0.0 when no valid value was collected.

        ("", 0.0) is returned when the argument is not a usable path, the
        file does not exist, the file cannot be loaded, or text extraction
        fails. Failures are logged, not raised. A failure of the confidence
        pass on a page is logged and only drops that page's confidences.
    """
    try:
        path = os.fsdecode(file_path)
    except TypeError:
        # None or another non-path object: report it instead of raising.
        logger.error("extract_text_and_conf: invalid path argument: %r", file_path)
        return "", 0.0

    if not os.path.exists(path):
        logger.error("extract_text_and_conf: file not found: %s", path)
        return "", 0.0

    is_pdf = path.lower().endswith('.pdf')
    try:
        images = _load_page_images(path, is_pdf)
    except Exception as e:
        if is_pdf:
            logger.exception("PDF convert_from_path error: %s", e)
        else:
            logger.exception("Image open error: %s", e)
        return "", 0.0

    text_content = []
    confidences = []

    try:
        for page_no, image in enumerate(images, start=1):
            # Page-level text (layout preserved)
            page_text = pytesseract.image_to_string(image)
            text_content.append(f"--- Page {page_no} ---\n{page_text}\n")

            # Confidence pass is best-effort: a failure here must not discard
            # the text that was already extracted.
            try:
                data = pytesseract.image_to_data(image, output_type=Output.DICT)
                page_conf_vals = _valid_confidences(data.get("conf", []))
            except Exception as e:
                logger.exception("image_to_data failed on page %d: %s", page_no, e)
                continue

            if page_conf_vals:
                confidences.extend(page_conf_vals)
                avg_page = sum(page_conf_vals) / len(page_conf_vals)
                logger.debug("OCR page %d avg confidence: %.2f", page_no, avg_page)

        combined_text = "\n".join(text_content).strip()
        avg_conf = round(sum(confidences) / len(confidences), 2) if confidences else 0.0
        logger.info("OCR extracted text length=%d, avg_conf=%.2f", len(combined_text), avg_conf)
        return combined_text, avg_conf

    except Exception as e:
        logger.exception("OCR critical error: %s", e)
        return "", 0.0