Spaces:

lucasgagneten
/

layoutlmv3-facturas-extractor

Sleeping

File size: 14,698 Bytes

# validator.py
"""
Validación y corrección de etiquetas extraídas de facturas
"""

import re
from datetime import datetime
from typing import Dict, List, Tuple, Optional


class InvoiceValidator:
    """Validate and correct data extracted from invoices."""
    
    # Required labels, in the order in which they must appear in the output table
    REQUIRED_LABELS = [
        'PROVEEDOR_RAZON_SOCIAL',
        'PROVEEDOR_CUIT',
        'COMPROBANTE_NUMERO',
        'FECHA',
        'JURISDICCION_GASTO',
        'TIPO',
        'CONCEPTO_GASTO',
        'ALICUOTA',
        'IVA',
        'NETO',
        'TOTAL'
    ]
    
    def __init__(self):
        """Inicializa el validador."""
        self.validation_errors = {}
    
    def validate_and_correct(self, ner_results: List[Dict], ocr_text: List[str] = None) -> Tuple[List[List], Dict]:
        """
        Valida y corrige los resultados de NER.
        
        Args:
            ner_results: Lista de diccionarios con 'etiqueta' y 'valor'
            ocr_text: Lista opcional de palabras extraídas por OCR
            
        Returns:
            tuple: (tabla_corregida, errores_validacion)
                - tabla_corregida: Lista de [etiqueta, valor] (sin columna de validación)
                - errores_validacion: Dict con etiquetas que tienen errores
        """
        # Convertir resultados NER a diccionario
        ner_dict = {item['etiqueta']: item['valor'] for item in ner_results}
        
        print(f"\n=== VALIDACIÓN ===")
        print(f"Etiquetas detectadas: {list(ner_dict.keys())}")
        print(f"Total palabras OCR: {len(ocr_text) if ocr_text else 0}")
        
        # Resetear errores
        self.validation_errors = {}
        
        # Crear tabla con todas las etiquetas requeridas
        corrected_table = []
        
        for label in self.REQUIRED_LABELS:
            value = ner_dict.get(label, '')
            corrected_value, is_valid = self._validate_label(label, value, ner_dict, ocr_text)
            
            # Solo agregar [etiqueta, valor], sin la columna de estado
            corrected_table.append([label, corrected_value])
            
            print(f"{label}: '{value}' -> '{corrected_value}' (válido: {is_valid})")
            
            if not is_valid:
                self.validation_errors[label] = corrected_value
        
        print(f"Total campos inválidos: {len(self.validation_errors)}")
        print("==================\n")
        
        return corrected_table, self.validation_errors
    
    def _validate_label(self, label: str, value: str, all_values: Dict, ocr_text: List[str]) -> Tuple[str, bool]:
        """
        Valida y corrige un valor específico según su etiqueta.
        
        Args:
            label: Nombre de la etiqueta
            value: Valor a validar
            all_values: Diccionario con todos los valores NER
            ocr_text: Lista de palabras del OCR
            
        Returns:
            tuple: (valor_corregido, es_valido)
        """
        validators = {
            'ALICUOTA': self._validate_alicuota,
            'COMPROBANTE_NUMERO': self._validate_comprobante,
            'CONCEPTO_GASTO': self._validate_concepto_gasto,
            'FECHA': self._validate_fecha,
            'IVA': self._validate_iva,
            'JURISDICCION_GASTO': self._validate_jurisdiccion,
            'NETO': self._validate_neto,
            'PROVEEDOR_CUIT': self._validate_cuit,
            'PROVEEDOR_RAZON_SOCIAL': self._validate_razon_social,
            'TIPO': self._validate_tipo,
            'TOTAL': self._validate_total
        }
        
        validator = validators.get(label)
        if validator:
            return validator(value, all_values, ocr_text)
        
        return value, True
    
    def _validate_alicuota(self, value: str, all_values: Dict, ocr_text: List[str]) -> Tuple[str, bool]:
        """Valida ALICUOTA: debe ser '21.00' o '10.5'."""
        value_clean = value.strip().replace(',', '.')
        
        if '21' in value_clean:
            return '21.00', True
        elif '10.5' in value_clean or '10,5' in value:
            return '10.5', True
        else:
            # Por defecto 21%
            return '21.00', False
    
    def _validate_comprobante(self, value: str, all_values: Dict, ocr_text: List[str]) -> Tuple[str, bool]:
        """Valida COMPROBANTE_NUMERO: formato #####-########."""
        # Buscar patrón correcto
        pattern = r'\d{4,5}-\d{8}'
        
        # Si el valor tiene el patrón, extraerlo
        match = re.search(pattern, value)
        
        if match:
            extracted = match.group(0)
            # Si después de extraer tiene el formato correcto, es válido
            return extracted, True
        
        # Si no coincide, buscar números y formatear
        numbers = re.findall(r'\d+', value)
        if len(numbers) >= 2:
            num1 = numbers[0].zfill(5)[:5]
            num2 = numbers[1].zfill(8)[:8]
            formatted = f"{num1}-{num2}"
            # Verificar si el valor formateado cumple con el patrón
            if re.match(pattern, formatted):
                return formatted, True
            return formatted, False
        
        return '00000-00000000', False
    
    def _validate_concepto_gasto(self, value: str, all_values: Dict, ocr_text: List[str]) -> Tuple[str, bool]:
        """Valida CONCEPTO_GASTO: cualquier texto es válido."""
        return value.strip() if value else '', True
    
    def _validate_fecha(self, value: str, all_values: Dict, ocr_text: List[str]) -> Tuple[str, bool]:
        """Valida FECHA: debe tener formato de fecha válido."""
        if not value:
            return datetime.now().strftime('%d/%m/%Y'), False
        
        # Intentar parsear diferentes formatos de fecha
        date_patterns = [
            r'(\d{1,2})[/-](\d{1,2})[/-](\d{4})',  # dd/mm/yyyy o dd-mm-yyyy
            r'(\d{1,2})[/-](\d{1,2})[/-](\d{2})',   # dd/mm/yy
            r'(\d{4})[/-](\d{1,2})[/-](\d{1,2})'    # yyyy/mm/dd
        ]
        
        for pattern in date_patterns:
            match = re.search(pattern, value)
            if match:
                try:
                    groups = match.groups()
                    if len(groups[2]) == 2:  # año con 2 dígitos
                        year = '20' + groups[2]
                        date_str = f"{groups[0]}/{groups[1]}/{year}"
                    else:
                        date_str = f"{groups[0]}/{groups[1]}/{groups[2]}"
                    
                    # Validar que sea una fecha válida
                    datetime.strptime(date_str, '%d/%m/%Y')
                    return date_str, True
                except ValueError:
                    continue
        
        # Si no se puede parsear, usar fecha actual
        return datetime.now().strftime('%d/%m/%Y'), False
    
    def _validate_total(self, value: str, all_values: Dict, ocr_text: List[str]) -> Tuple[str, bool]:
        """Valida TOTAL: debe ser un número, eliminar símbolo $."""
        if not value:
            # Buscar el número más alto con decimales en OCR
            if ocr_text:
                max_number = self._find_max_decimal_in_ocr(ocr_text)
                if max_number:
                    return max_number, False
            return '0.00', False
        
        # Limpiar valor
        clean_value = self._clean_currency(value)
        
        try:
            float(clean_value)
            return clean_value, True
        except ValueError:
            return '0.00', False
    
    def _validate_iva(self, value: str, all_values: Dict, ocr_text: List[str]) -> Tuple[str, bool]:
        """Valida IVA: debe ser un número, calcular (TOTAL/1.21)*0.21 si no existe."""
        if not value:
            # Calcular 21% del TOTAL
            total = all_values.get('TOTAL', '0')
            clean_total = self._clean_currency(total)
            
            try:
                total_num = float(clean_total)
                iva_calculated = round(total_num * 0.17355372, 2)
                return f"{iva_calculated:.2f}", False
            except ValueError:
                return '0.00', False
        
        # Limpiar valor
        clean_value = self._clean_currency(value)
        
        try:
            float(clean_value)
            return clean_value, True
        except ValueError:
            return '0.00', False
    
    def _validate_neto(self, value: str, all_values: Dict, ocr_text: List[str]) -> Tuple[str, bool]:
        """Valida NETO: debe ser un número, calcular TOTAL/1.21 si no existe."""
        if not value:
            # Calcular 79% del TOTAL (o TOTAL - IVA)
            total = all_values.get('TOTAL', '0')
            clean_total = self._clean_currency(total)
            
            try:
                total_num = float(clean_total)
                neto_calculated = round(total_num * 0.82644628, 2)
                return f"{neto_calculated:.2f}", False
            except ValueError:
                return '0.00', False
        
        # Limpiar valor
        clean_value = self._clean_currency(value)
        
        try:
            float(clean_value)
            return clean_value, True
        except ValueError:
            return '0.00', False
    
    def _validate_jurisdiccion(self, value: str, all_values: Dict, ocr_text: List[str]) -> Tuple[str, bool]:
        """Valida JURISDICCION_GASTO: texto de localidad."""
        return value.strip() if value else '', True
    
    def _validate_cuit(self, value: str, all_values: Dict, ocr_text: List[str]) -> Tuple[str, bool]:
        """Valida PROVEEDOR_CUIT: formato ##-########-#."""
        if not value:
            # Si no hay valor, buscar en OCR
            if ocr_text:
                ocr_combined = ' '.join(ocr_text)
                # Buscar patrón ##-########-#
                pattern = r'\d{2}-\d{8}-\d{1}'
                match = re.search(pattern, ocr_combined)
                if match:
                    return match.group(0), False
                
                # Buscar patrón sin guiones: 11 dígitos consecutivos
                pattern_no_dash = r'\b\d{11}\b'
                match = re.search(pattern_no_dash, ocr_combined)
                if match:
                    cuit = match.group(0)
                    formatted_cuit = f"{cuit[:2]}-{cuit[2:10]}-{cuit[10]}"
                    return formatted_cuit, False
            return '00-00000000-0', False
        
        # Limpiar valor: quitar TODO excepto números y guiones
        clean_value = re.sub(r'[^\d\-]', '', value)
        
        # Buscar patrón ##-########-# en el valor limpio
        pattern = r'\d{2}-\d{8}-\d{1}'
        match = re.search(pattern, clean_value)
        
        if match:
            extracted = match.group(0)
            # Si el valor extraído cumple con el formato, es válido
            return extracted, True
        
        # Si hay números pero no el formato correcto, intentar extraerlos y formatear
        numbers_only = re.sub(r'[^\d]', '', clean_value)
        if len(numbers_only) == 11:
            formatted_cuit = f"{numbers_only[:2]}-{numbers_only[2:10]}-{numbers_only[10]}"
            # El CUIT formateado es válido
            return formatted_cuit, True
        elif len(numbers_only) > 11:
            # Tomar los primeros 11 dígitos
            formatted_cuit = f"{numbers_only[:2]}-{numbers_only[2:10]}-{numbers_only[10]}"
            return formatted_cuit, True
        
        # Buscar en OCR si no se puede extraer del valor
        if ocr_text:
            ocr_combined = ' '.join(ocr_text)
            # Buscar con formato
            match = re.search(pattern, ocr_combined)
            if match:
                return match.group(0), False
            
            # Buscar sin guiones
            pattern_no_dash = r'\b\d{11}\b'
            match = re.search(pattern_no_dash, ocr_combined)
            if match:
                cuit = match.group(0)
                formatted_cuit = f"{cuit[:2]}-{cuit[2:10]}-{cuit[10]}"
                return formatted_cuit, False
        
        return '00-00000000-0', False
    
    def _validate_razon_social(self, value: str, all_values: Dict, ocr_text: List[str]) -> Tuple[str, bool]:
        """Valida PROVEEDOR_RAZON_SOCIAL: cualquier texto es válido."""
        return value.strip() if value else '', True
    
    def _validate_tipo(self, value: str, all_values: Dict, ocr_text: List[str]) -> Tuple[str, bool]:
        """Valida TIPO: debe ser A, B, C, M, E o T."""
        # Eliminar la palabra "factura"
        clean_value = re.sub(r'factura\s*', '', value, flags=re.IGNORECASE).strip().upper()
        
        # Validar tipos permitidos
        valid_types = ['A', 'B', 'C', 'M', 'E', 'T']
        
        # Buscar tipo en el valor limpio
        for tipo in valid_types:
            if tipo in clean_value:
                return tipo, True
        
        # Por defecto, tipo A
        return 'A', False
    
    def _clean_currency(self, value: str) -> str:
        """Limpia valores monetarios: elimina $, normaliza decimales."""
        if not value:
            return '0.00'
        
        # Eliminar símbolos de moneda y espacios
        clean = re.sub(r'[$\s]', '', value)
        
        # Normalizar separadores decimales (argentinos usan , o .)
        # Si hay tanto punto como coma, el último es el decimal
        if '.' in clean and ',' in clean:
            if clean.rindex('.') > clean.rindex(','):
                # Punto es decimal
                clean = clean.replace(',', '')
            else:
                # Coma es decimal
                clean = clean.replace('.', '').replace(',', '.')
        elif ',' in clean:
            # Solo coma: es decimal
            clean = clean.replace(',', '.')
        
        try:
            num = float(clean)
            return f"{num:.2f}"
        except ValueError:
            return '0.00'
    
    def _find_max_decimal_in_ocr(self, ocr_text: List[str]) -> Optional[str]:
        """Encuentra el número más alto con decimales en el texto OCR."""
        max_value = 0.0
        found = False
        
        for word in ocr_text:
            # Buscar números con decimales (con punto o coma)
            if '.' in word or ',' in word:
                clean = self._clean_currency(word)
                try:
                    num = float(clean)
                    if num > max_value:
                        max_value = num
                        found = True
                except ValueError:
                    continue
        
        return f"{max_value:.2f}" if found else None