Update app.py
Browse files
app.py
CHANGED
|
@@ -383,39 +383,53 @@ except Exception as e:
|
|
| 383 |
|
| 384 |
# ------------------ GECToR LOADING (Neural GEC) ------------------
|
| 385 |
GEC_MODEL = None
|
| 386 |
-
GEC_TOKENIZER = None
|
| 387 |
-
GEC_ENCODE = None
|
| 388 |
-
GEC_DECODE = None
|
| 389 |
-
GEC_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 390 |
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 407 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
|
| 409 |
def gector_correct(text: str):
|
| 410 |
"""
|
| 411 |
Run neural grammatical error correction using GECToR.
|
| 412 |
-
|
| 413 |
-
- Trims to 1000 words (server-side safety).
|
| 414 |
-
- Splits into sentences, runs GECToR, then joins back.
|
| 415 |
"""
|
| 416 |
-
|
| 417 |
-
|
|
|
|
|
|
|
| 418 |
|
|
|
|
| 419 |
parts = text.strip().split()
|
| 420 |
if len(parts) > 1000:
|
| 421 |
text_proc = " ".join(parts[:1000])
|
|
@@ -425,33 +439,38 @@ def gector_correct(text: str):
|
|
| 425 |
if not text_proc:
|
| 426 |
return text_proc, 0, 0
|
| 427 |
|
|
|
|
|
|
|
| 428 |
sentences = re.split(r"(?<=[.!?])\s+", text_proc)
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
|
|
|
|
|
|
|
|
|
| 455 |
|
| 456 |
|
| 457 |
# ------------------ FILE EXTRACTION HELPERS ------------------
|
|
|
|
# ------------------ GECToR LOADING (Neural GEC) ------------------
# Module-level, best-effort initialization: on any failure GEC_MODEL stays
# None and gector_correct() degrades to a no-op passthrough.
GEC_MODEL = None

try:
    # Import specific classes from the installed library
    from gector.gec_model import GecBERTModel
    from gector.utils.helpers import load_verb_dict

    print("[GECToR] Initializing model... (This may take 30s)")

    # 1. Build the model.
    # NOTE(review): the pasted diff showed these keyword arguments with no
    # callee at all — `GEC_MODEL = GecBERTModel(` restored here; confirm the
    # argument list against the installed gector version.
    GEC_MODEL = GecBERTModel(
        vocab_path="/app/data",  # Directory containing verb-form-vocab.txt
        model_paths=["/app/data/gector_model.th"],
        model_name='roberta-base',
        max_len=50,
        min_len=3,
        iterations=5,
        min_error_probability=0.0,
        lowercase_tokens=0,
        special_tokens_fix=1,
        log=False,
        is_ensemble=0,
        weigths=None,  # sic: the GECToR library itself spells this "weigths"
        confidence=0,
        del_confidence=0
    )

    # 2. Load and Attach the Verb Dictionary
    # This maps verb forms (e.g., "go" -> "gone")
    encode, decode = load_verb_dict("/app/data/verb-form-vocab.txt")
    GEC_MODEL.encode = encode
    GEC_MODEL.decode = decode

    print(f"[GECToR] Model & Verb Dict Loaded Successfully!")

except Exception as e:
    # Deliberate broad catch: model loading is optional; the app must still
    # start (gector_correct checks GEC_MODEL is None and skips).
    GEC_MODEL = None
    print(f"[GECToR] Failed to load. Error: {e}")
    print("[GECToR] Ensure you updated Dockerfile to download 'gector_model.th'")
| 423 |
def gector_correct(text: str):
|
| 424 |
"""
|
| 425 |
Run neural grammatical error correction using GECToR.
|
|
|
|
|
|
|
|
|
|
| 426 |
"""
|
| 427 |
+
# 1. Check if model is loaded
|
| 428 |
+
if GEC_MODEL is None:
|
| 429 |
+
print("[GECToR] Model not loaded, skipping.")
|
| 430 |
+
return text, 0, len(text.split())
|
| 431 |
|
| 432 |
+
# 2. Safety Truncate (Server protection)
|
| 433 |
parts = text.strip().split()
|
| 434 |
if len(parts) > 1000:
|
| 435 |
text_proc = " ".join(parts[:1000])
|
|
|
|
| 439 |
if not text_proc:
|
| 440 |
return text_proc, 0, 0
|
| 441 |
|
| 442 |
+
# 3. Split into sentences and then tokens
|
| 443 |
+
# GECToR expects a list of token lists: [['Hello', 'world'], ['How', 'are', 'you']]
|
| 444 |
sentences = re.split(r"(?<=[.!?])\s+", text_proc)
|
| 445 |
+
batch = [s.strip().split() for s in sentences if s.strip()]
|
| 446 |
+
|
| 447 |
+
if not batch:
|
| 448 |
+
return text_proc, 0, 0
|
| 449 |
+
|
| 450 |
+
try:
|
| 451 |
+
# 4. Run Prediction
|
| 452 |
+
# We pass the encode/decode maps we loaded earlier
|
| 453 |
+
final_batch, total_updates = GEC_MODEL.handle_batch(
|
| 454 |
+
batch,
|
| 455 |
+
encode_mapping=GEC_MODEL.encode,
|
| 456 |
+
decode_mapping=GEC_MODEL.decode
|
| 457 |
+
)
|
| 458 |
+
|
| 459 |
+
# 5. Reconstruct Text
|
| 460 |
+
corrected_sentences = [" ".join(tokens) for tokens in final_batch]
|
| 461 |
+
corrected_text = " ".join(corrected_sentences)
|
| 462 |
+
|
| 463 |
+
# 6. Count Corrections
|
| 464 |
+
# Simple word-by-word comparison
|
| 465 |
+
original_words = len(text_proc.split())
|
| 466 |
+
corrections = sum(1 for a, b in zip(text_proc.split(), corrected_text.split()) if a != b)
|
| 467 |
+
|
| 468 |
+
return corrected_text, corrections, original_words
|
| 469 |
+
|
| 470 |
+
except Exception as e:
|
| 471 |
+
print(f"[GECToR] Prediction error: {e}")
|
| 472 |
+
# Fallback to original text if crash
|
| 473 |
+
return text_proc, 0, len(text_proc.split())
|
| 474 |
|
| 475 |
|
| 476 |
# ------------------ FILE EXTRACTION HELPERS ------------------
|