Spaces:
Running
Running
File size: 3,054 Bytes
274213e 35dd4bd 274213e 35dd4bd 274213e 35dd4bd 274213e 35dd4bd 274213e 35dd4bd 274213e 35dd4bd 274213e 35dd4bd 274213e 35dd4bd 274213e 35dd4bd 274213e 35dd4bd 274213e 35dd4bd 274213e 35dd4bd 274213e 35dd4bd 274213e 35dd4bd 274213e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
# ocr_engine.py
# Lightweight Tesseract + pdf2image OCR wrapper that returns text and average confidence (0-100).
# Notes: confidence values are parsed numerically (Tesseract may report them as strings), negative values such as '-1' are ignored, and failures are logged with tracebacks instead of being raised.
import pytesseract
from pytesseract import Output
from pdf2image import convert_from_path
from PIL import Image
import os
import logging
from typing import Tuple
# Shared logger for this module, looked up under the fixed name "ocr_engine".
# Its own threshold is INFO, so logger.debug(...) records are dropped unless the level is lowered.
logger = logging.getLogger("ocr_engine")
logger.setLevel(level=logging.INFO)
def _load_images(file_path: str):
    """Load the page images for *file_path*.

    Returns a list of images (one per PDF page, or a single RGB image for any
    other file), or ``None`` when loading failed; the failure is logged.
    """
    if file_path.lower().endswith('.pdf'):
        try:
            return convert_from_path(file_path)
        except Exception as e:
            logger.exception("PDF convert_from_path error: %s", e)
            return None
    try:
        # convert() yields an independent, fully loaded copy, so the source
        # image (and its underlying file handle) can be closed immediately
        # instead of lingering until garbage collection.
        with Image.open(file_path) as source_image:
            return [source_image.convert("RGB")]
    except Exception as e:
        logger.exception("Image open error: %s", e)
        return None


def _page_confidences(image) -> list:
    """Return the non-negative per-word confidences Tesseract reports for *image*."""
    data = pytesseract.image_to_data(image, output_type=Output.DICT)
    values = []
    for raw in data.get("conf", []):
        try:
            # Values may arrive as strings, ints or floats; normalise to int.
            conf = int(float(raw))
        except (TypeError, ValueError, OverflowError):
            # Non-numeric (or NaN/inf) entries carry no usable confidence.
            continue
        if conf >= 0:  # negative values such as -1 are skipped
            values.append(conf)
    return values


def extract_text_and_conf(file_path: str) -> Tuple[str, float]:
    """
    Extract text AND an average confidence score from a PDF or image file.

    Files whose name ends with ``.pdf`` (case-insensitive) are rendered to
    page images with pdf2image; anything else is opened as a single image
    with PIL. Each page is OCR'd twice: once for layout-preserving text and
    once for per-word confidence data.

    Args:
        file_path: Path to the PDF or image file.

    Returns:
        ``(text_content, average_confidence_0_to_100)``. The text contains a
        ``--- Page N ---`` header before each page. The confidence is the mean
        of all non-negative word confidences across pages, rounded to two
        decimals, or ``0.0`` when none were collected.

    This function does not raise: a missing file, a load failure or an
    unexpected OCR error is logged and reported as ``("", 0.0)``. A failure
    while collecting confidences for one page is logged and only that page's
    confidences are skipped.
    """
    if not os.path.exists(file_path):
        logger.error("extract_text_and_conf: file not found: %s", file_path)
        return "", 0.0

    text_content = []
    confidences = []
    try:
        images = _load_images(file_path)
        if images is None:
            return "", 0.0

        for page_no, image in enumerate(images, start=1):
            # Page-level text (layout preserved)
            page_text = pytesseract.image_to_string(image)
            text_content.append(f"--- Page {page_no} ---\n{page_text}\n")

            # Per-word confidence info; a failure here must not discard the
            # text that was already extracted for this or earlier pages.
            try:
                page_conf_vals = _page_confidences(image)
            except Exception as e:
                logger.exception("image_to_data failed on page %d: %s", page_no, e)
                continue

            if page_conf_vals:
                confidences.extend(page_conf_vals)
                avg_page = sum(page_conf_vals) / len(page_conf_vals)
                logger.debug("OCR page %d avg confidence: %.2f", page_no, avg_page)

        combined_text = "\n".join(text_content).strip()
        avg_conf = float(round(sum(confidences) / len(confidences), 2)) if confidences else 0.0
        logger.info("OCR extracted text length=%d, avg_conf=%.2f", len(combined_text), avg_conf)
        return combined_text, avg_conf
    except Exception as e:
        # Top-level boundary: callers rely on a ("", 0.0) result, not an exception.
        logger.exception("OCR critical error: %s", e)
        return "", 0.0
|