GopalKrushnaMahapatra committed on
Commit 2152439 · verified · 1 Parent(s): 44e9376

Update app.py

Files changed (1)
  1. app.py +70 -51
app.py CHANGED
@@ -383,39 +383,53 @@ except Exception as e:
 
 # ------------------ GECToR LOADING (Neural GEC) ------------------
 GEC_MODEL = None
-GEC_TOKENIZER = None
-GEC_ENCODE = None
-GEC_DECODE = None
-GEC_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-if GECToR is not None and gector_predict is not None and load_verb_dict is not None:
-    try:
-        GEC_MODEL_ID = os.getenv("GEC_MODEL_ID", "gotutiyan/gector-roberta-base-5k")
-        GEC_VERB_FILE = os.getenv("GEC_VERB_FILE", "data/verb-form-vocab.txt")
-        GEC_MODEL = GECToR.from_pretrained(GEC_MODEL_ID).to(GEC_DEVICE)
-        GEC_TOKENIZER = AutoTokenizer.from_pretrained(GEC_MODEL_ID)
-        GEC_ENCODE, GEC_DECODE = load_verb_dict(GEC_VERB_FILE)
-        print(f"[GECToR] Loaded model {GEC_MODEL_ID} on {GEC_DEVICE}")
-    except Exception as e:
-        GEC_MODEL = None
-        GEC_TOKENIZER = None
-        GEC_ENCODE = None
-        GEC_DECODE = None
-        print("[GECToR] Failed to load — falling back to LanguageTool/heuristics. Error:", e)
-else:
-    print("[GECToR] gector library not available; using LanguageTool/heuristics for grammar.")
 
 
 def gector_correct(text: str):
     """
     Run neural grammatical error correction using GECToR.
-
-    - Trims to 1000 words (server-side safety).
-    - Splits into sentences, runs GECToR, then joins back.
     """
-    if not (GEC_MODEL and GEC_TOKENIZER and GEC_ENCODE and GEC_DECODE):
-        raise RuntimeError("GECToR model not loaded")
 
     parts = text.strip().split()
     if len(parts) > 1000:
         text_proc = " ".join(parts[:1000])
@@ -425,33 +439,38 @@ def gector_correct(text: str):
     if not text_proc:
         return text_proc, 0, 0
 
     sentences = re.split(r"(?<=[.!?])\s+", text_proc)
-    sentences = [s for s in sentences if s.strip()]
-    if not sentences:
-        sentences = [text_proc]
-
-    keep_conf = float(os.getenv("GEC_KEEP_CONFIDENCE", "0.0"))
-    min_err_prob = float(os.getenv("GEC_MIN_ERROR_PROB", "0.0"))
-    n_iter = int(os.getenv("GEC_N_ITER", "5"))
-    batch_size = int(os.getenv("GEC_BATCH_SIZE", "8"))
-
-    corrected_sentences = gector_predict(
-        GEC_MODEL,
-        GEC_TOKENIZER,
-        sentences,
-        GEC_ENCODE,
-        GEC_DECODE,
-        keep_confidence=keep_conf,
-        min_error_prob=min_err_prob,
-        n_iteration=n_iter,
-        batch_size=batch_size,
-    )
-    corrected_text = " ".join(corrected_sentences)
-    original_words = len(text_proc.split())
-    corrections = sum(
-        1 for a, b in zip(text_proc.split(), corrected_text.split()) if a != b
-    )
-    return corrected_text, corrections, original_words
 
 
 # ------------------ FILE EXTRACTION HELPERS ------------------
 
 
 # ------------------ GECToR LOADING (Neural GEC) ------------------
 GEC_MODEL = None
 
+try:
+    # Import specific classes from the installed library
+    from gector.gec_model import GecBERTModel
+    from gector.utils.helpers import load_verb_dict
+
+    print("[GECToR] Initializing model... (This may take 30s)")
+    GEC_MODEL = GecBERTModel(
+        vocab_path="/app/data",  # Directory containing verb-form-vocab.txt
+        model_paths=["/app/data/gector_model.th"],
+        model_name='roberta-base',
+        max_len=50,
+        min_len=3,
+        iterations=5,
+        min_error_probability=0.0,
+        lowercase_tokens=0,
+        special_tokens_fix=1,
+        log=False,
+        is_ensemble=0,
+        weigths=None,  # parameter name as spelled in the gector API
+        confidence=0,
+        del_confidence=0
+    )
 
+    # 2. Load and Attach the Verb Dictionary
+    # This maps verb forms (e.g., "go" -> "gone")
+    encode, decode = load_verb_dict("/app/data/verb-form-vocab.txt")
+    GEC_MODEL.encode = encode
+    GEC_MODEL.decode = decode
+
+    print(f"[GECToR] Model & Verb Dict Loaded Successfully!")
+
+except Exception as e:
+    GEC_MODEL = None
+    print(f"[GECToR] Failed to load. Error: {e}")
+    print("[GECToR] Ensure you updated Dockerfile to download 'gector_model.th'")
 
 def gector_correct(text: str):
     """
     Run neural grammatical error correction using GECToR.
     """
+    # 1. Check if model is loaded
+    if GEC_MODEL is None:
+        print("[GECToR] Model not loaded, skipping.")
+        return text, 0, len(text.split())
 
+    # 2. Safety Truncate (Server protection)
     parts = text.strip().split()
     if len(parts) > 1000:
         text_proc = " ".join(parts[:1000])
 
     if not text_proc:
         return text_proc, 0, 0
 
+    # 3. Split into sentences and then tokens
+    # GECToR expects a list of token lists: [['Hello', 'world'], ['How', 'are', 'you']]
     sentences = re.split(r"(?<=[.!?])\s+", text_proc)
+    batch = [s.strip().split() for s in sentences if s.strip()]
+
+    if not batch:
+        return text_proc, 0, 0
+
+    try:
+        # 4. Run Prediction
+        # We pass the encode/decode maps we loaded earlier
+        final_batch, total_updates = GEC_MODEL.handle_batch(
+            batch,
+            encode_mapping=GEC_MODEL.encode,
+            decode_mapping=GEC_MODEL.decode
+        )
+
+        # 5. Reconstruct Text
+        corrected_sentences = [" ".join(tokens) for tokens in final_batch]
+        corrected_text = " ".join(corrected_sentences)
+
+        # 6. Count Corrections
+        # Simple word-by-word comparison
+        original_words = len(text_proc.split())
+        corrections = sum(1 for a, b in zip(text_proc.split(), corrected_text.split()) if a != b)
+
+        return corrected_text, corrections, original_words
+
+    except Exception as e:
+        print(f"[GECToR] Prediction error: {e}")
+        # Fallback to original text if crash
+        return text_proc, 0, len(text_proc.split())
 
 
 # ------------------ FILE EXTRACTION HELPERS ------------------
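
For a quick local check of the new GECToR path, the snippet below is a minimal sketch rather than part of the commit: it assumes app.py can be imported as a module named app without launching the web server, and the sample text and printed values are purely illustrative.

# Hypothetical smoke test for gector_correct (illustrative sketch, not in this commit).
# Assumes importing app runs the GECToR loading block above; if loading failed,
# gector_correct simply returns the input text with zero corrections.
import app

sample = "She go to school yesterday. He have two book."
corrected, n_corrections, n_words = app.gector_correct(sample)

print("Corrected text :", corrected)
print("Changed words  :", n_corrections)  # word positions that differ from the input
print("Words processed:", n_words)        # input length after the 1000-word safety cap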