Update app.py
Browse files
app.py
CHANGED
|
@@ -383,39 +383,53 @@ except Exception as e:
|
|
| 383 |
|
| 384 |
# ------------------ GECToR LOADING (Neural GEC) ------------------
|
| 385 |
GEC_MODEL = None
|
| 386 |
-
GEC_TOKENIZER = None
|
| 387 |
-
GEC_ENCODE = None
|
| 388 |
-
GEC_DECODE = None
|
| 389 |
-
GEC_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 390 |
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 407 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
|
| 409 |
def gector_correct(text: str):
|
| 410 |
"""
|
| 411 |
Run neural grammatical error correction using GECToR.
|
| 412 |
-
|
| 413 |
-
- Trims to 1000 words (server-side safety).
|
| 414 |
-
- Splits into sentences, runs GECToR, then joins back.
|
| 415 |
"""
|
| 416 |
-
|
| 417 |
-
|
|
|
|
|
|
|
| 418 |
|
|
|
|
| 419 |
parts = text.strip().split()
|
| 420 |
if len(parts) > 1000:
|
| 421 |
text_proc = " ".join(parts[:1000])
|
|
@@ -425,33 +439,38 @@ def gector_correct(text: str):
|
|
| 425 |
if not text_proc:
|
| 426 |
return text_proc, 0, 0
|
| 427 |
|
|
|
|
|
|
|
| 428 |
sentences = re.split(r"(?<=[.!?])\s+", text_proc)
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
|
|
|
|
|
|
|
|
|
| 455 |
|
| 456 |
|
| 457 |
# ------------------ FILE EXTRACTION HELPERS ------------------
|
|
|
|
# ------------------ GECToR LOADING (Neural GEC) ------------------
# Module-level, best-effort initialization: on any failure GEC_MODEL stays
# None and gector_correct() degrades to a no-op passthrough.
GEC_MODEL = None

try:
    # Import specific classes from the installed library
    from gector.gec_model import GecBERTModel
    from gector.utils.helpers import load_verb_dict

    print("[GECToR] Initializing model... (This may take 30s)")

    # 1. Build the model.
    # NOTE(review): the pasted diff showed these keyword arguments with no
    # callee at all — `GEC_MODEL = GecBERTModel(` restored here; confirm the
    # argument list against the installed gector version.
    GEC_MODEL = GecBERTModel(
        vocab_path="/app/data",  # Directory containing verb-form-vocab.txt
        model_paths=["/app/data/gector_model.th"],
        model_name='roberta-base',
        max_len=50,
        min_len=3,
        iterations=5,
        min_error_probability=0.0,
        lowercase_tokens=0,
        special_tokens_fix=1,
        log=False,
        is_ensemble=0,
        weigths=None,  # sic: the GECToR library itself spells this "weigths"
        confidence=0,
        del_confidence=0
    )

    # 2. Load and Attach the Verb Dictionary
    # This maps verb forms (e.g., "go" -> "gone")
    encode, decode = load_verb_dict("/app/data/verb-form-vocab.txt")
    GEC_MODEL.encode = encode
    GEC_MODEL.decode = decode

    print(f"[GECToR] Model & Verb Dict Loaded Successfully!")

except Exception as e:
    # Deliberate broad catch: model loading is optional; the app must still
    # start (gector_correct checks GEC_MODEL is None and skips).
    GEC_MODEL = None
    print(f"[GECToR] Failed to load. Error: {e}")
    print("[GECToR] Ensure you updated Dockerfile to download 'gector_model.th'")
| 423 |
def gector_correct(text: str):
|
| 424 |
"""
|
| 425 |
Run neural grammatical error correction using GECToR.
|
|
|
|
|
|
|
|
|
|
| 426 |
"""
|
| 427 |
+
# 1. Check if model is loaded
|
| 428 |
+
if GEC_MODEL is None:
|
| 429 |
+
print("[GECToR] Model not loaded, skipping.")
|
| 430 |
+
return text, 0, len(text.split())
|
| 431 |
|
| 432 |
+
# 2. Safety Truncate (Server protection)
|
| 433 |
parts = text.strip().split()
|
| 434 |
if len(parts) > 1000:
|
| 435 |
text_proc = " ".join(parts[:1000])
|
|
|
|
| 439 |
if not text_proc:
|
| 440 |
return text_proc, 0, 0
|
| 441 |
|
| 442 |
+
# 3. Split into sentences and then tokens
|
| 443 |
+
# GECToR expects a list of token lists: [['Hello', 'world'], ['How', 'are', 'you']]
|
| 444 |
sentences = re.split(r"(?<=[.!?])\s+", text_proc)
|
| 445 |
+
batch = [s.strip().split() for s in sentences if s.strip()]
|
| 446 |
+
|
| 447 |
+
if not batch:
|
| 448 |
+
return text_proc, 0, 0
|
| 449 |
+
|
| 450 |
+
try:
|
| 451 |
+
# 4. Run Prediction
|
| 452 |
+
# We pass the encode/decode maps we loaded earlier
|
| 453 |
+
final_batch, total_updates = GEC_MODEL.handle_batch(
|
| 454 |
+
batch,
|
| 455 |
+
encode_mapping=GEC_MODEL.encode,
|
| 456 |
+
decode_mapping=GEC_MODEL.decode
|
| 457 |
+
)
|
| 458 |
+
|
| 459 |
+
# 5. Reconstruct Text
|
| 460 |
+
corrected_sentences = [" ".join(tokens) for tokens in final_batch]
|
| 461 |
+
corrected_text = " ".join(corrected_sentences)
|
| 462 |
+
|
| 463 |
+
# 6. Count Corrections
|
| 464 |
+
# Simple word-by-word comparison
|
| 465 |
+
original_words = len(text_proc.split())
|
| 466 |
+
corrections = sum(1 for a, b in zip(text_proc.split(), corrected_text.split()) if a != b)
|
| 467 |
+
|
| 468 |
+
return corrected_text, corrections, original_words
|
| 469 |
+
|
| 470 |
+
except Exception as e:
|
| 471 |
+
print(f"[GECToR] Prediction error: {e}")
|
| 472 |
+
# Fallback to original text if crash
|
| 473 |
+
return text_proc, 0, len(text_proc.split())
|
| 474 |
|
| 475 |
|
| 476 |
# ------------------ FILE EXTRACTION HELPERS ------------------
|