GopalKrushnaMahapatra committed on
Commit e2458a1 · verified · 1 Parent(s): ddc6c8a

Update app.py

Files changed (1)
  1. app.py +946 -946
app.py CHANGED
@@ -1,947 +1,947 @@
1
- # backend/main.py
2
- import os
3
- import re
4
- import io
5
- import sqlite3
6
- from datetime import datetime, timezone
7
-
8
- from dotenv import load_dotenv
9
- from fastapi import FastAPI, HTTPException, status, Header, Depends, File, UploadFile
10
- from fastapi.middleware.cors import CORSMiddleware
11
- from pydantic import BaseModel, EmailStr
12
- from passlib.context import CryptContext
13
- import jwt
14
-
15
- # File parsing libs
16
- from docx import Document as DocxDocument
17
- import PyPDF2
18
-
19
- # ML / NLP libs
20
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
21
- import torch
22
- import numpy as np
23
-
24
- # TF-IDF
25
- from sklearn.feature_extraction.text import TfidfVectorizer
26
- from sklearn.metrics.pairwise import cosine_similarity
27
-
28
- # Semantic embeddings for plagiarism (combined approach)
29
- try:
30
- from sentence_transformers import SentenceTransformer
31
- except Exception:
32
- SentenceTransformer = None
33
-
34
- # LanguageTool (may require Java)
35
- try:
36
- import language_tool_python
37
- except Exception:
38
- language_tool_python = None
39
-
40
- # GECToR (neural grammatical error correction)
41
- try:
42
- from gector import GECToR, predict as gector_predict, load_verb_dict
43
- except Exception:
44
- GECToR = None
45
- gector_predict = None
46
- load_verb_dict = None
47
-
48
- # ------------------ ENV & DB SETUP ------------------
49
- load_dotenv()
50
-
51
- JWT_SECRET = os.getenv("JWT_SECRET", "super_secret_key_change_this")
52
- JWT_ALGO = os.getenv("JWT_ALGO", "HS256")
53
- DB_PATH = os.getenv("DB_PATH", "truewrite.db")
54
- CORPUS_DIR = os.getenv("CORPUS_DIR", "corpus")
55
- CORPUS_RAW = os.getenv("CORPUS_RAW", "corpus_raw")
56
-
57
- # Combined plagiarism weights
58
- PLAG_ALPHA = float(os.getenv("PLAG_ALPHA", "0.4")) # TF-IDF weight; (1-alpha) for embeddings
59
-
60
- pwd_context = CryptContext(schemes=["pbkdf2_sha256"], deprecated="auto")
61
-
62
- # SQLite DB (simple demo)
63
- conn = sqlite3.connect(DB_PATH, check_same_thread=False)
64
- conn.row_factory = sqlite3.Row
65
- cur = conn.cursor()
66
-
67
- # Create tables if not exist
68
- cur.execute("""
69
- CREATE TABLE IF NOT EXISTS users (
70
- id INTEGER PRIMARY KEY AUTOINCREMENT,
71
- name TEXT NOT NULL,
72
- email TEXT NOT NULL UNIQUE,
73
- password_hash TEXT NOT NULL,
74
- created_at TEXT NOT NULL
75
- )
76
- """)
77
-
78
- cur.execute("""
79
- CREATE TABLE IF NOT EXISTS history (
80
- id INTEGER PRIMARY KEY AUTOINCREMENT,
81
- user_id INTEGER NOT NULL,
82
- tool TEXT NOT NULL,
83
- input_text TEXT,
84
- result_summary TEXT,
85
- created_at TEXT NOT NULL,
86
- FOREIGN KEY (user_id) REFERENCES users(id)
87
- )
88
- """)
89
-
90
- conn.commit()
91
-
92
- # ------------------ FASTAPI APP ------------------
93
- app = FastAPI(title="TrueWrite Scan (Python Backend)")
94
-
95
- app.add_middleware(
96
- CORSMiddleware,
97
- allow_origins=["*"],
98
- allow_credentials=True,
99
- allow_methods=["*"],
100
- allow_headers=["*"],
101
- )
102
-
103
- # ------------------ MODELS ------------------
104
- class SignupRequest(BaseModel):
105
- name: str
106
- email: EmailStr
107
- password: str
108
-
109
-
110
- class LoginRequest(BaseModel):
111
- email: EmailStr
112
- password: str
113
-
114
-
115
- class TextRequest(BaseModel):
116
- text: str
117
-
118
-
119
- # ------------------ AUTH HELPERS ------------------
120
- def hash_password(pw: str) -> str:
121
- return pwd_context.hash(pw)
122
-
123
-
124
- def verify_password(plain: str, hashed: str) -> bool:
125
- return pwd_context.verify(plain, hashed)
126
-
127
-
128
- def create_token(user_id: int, email: str) -> str:
129
- payload = {"user_id": user_id, "email": email}
130
- token = jwt.encode(payload, JWT_SECRET, algorithm=JWT_ALGO)
131
- if isinstance(token, bytes):
132
- token = token.decode("utf-8")
133
- return token
134
-
135
-
136
- def decode_token(token: str):
137
- try:
138
- payload = jwt.decode(token, JWT_SECRET, algorithms=[JWT_ALGO])
139
- return payload
140
- except jwt.PyJWTError:
141
- raise HTTPException(
142
- status_code=status.HTTP_401_UNAUTHORIZED,
143
- detail="Invalid token"
144
- )
145
-
146
-
147
- def get_current_user(authorization: str = Header(None)):
148
- if not authorization or not authorization.startswith("Bearer "):
149
- raise HTTPException(
150
- status_code=status.HTTP_401_UNAUTHORIZED,
151
- detail="Missing token"
152
- )
153
- token = authorization.split(" ", 1)[1]
154
- payload = decode_token(token)
155
- user_id = payload.get("user_id")
156
- cur.execute("SELECT * FROM users WHERE id = ?", (user_id,))
157
- row = cur.fetchone()
158
- if not row:
159
- raise HTTPException(
160
- status_code=status.HTTP_401_UNAUTHORIZED,
161
- detail="User not found"
162
- )
163
- return {"id": row["id"], "name": row["name"], "email": row["email"]}
164
-
165
-
166
- def now_iso():
167
- return datetime.now(timezone.utc).isoformat()
168
-
169
-
170
- def save_history(user_id: int, tool: str, input_text: str, summary: str):
171
- trimmed = (input_text[:500] + "...") if len(input_text) > 500 else input_text
172
- cur.execute(
173
- "INSERT INTO history (user_id, tool, input_text, result_summary, created_at) VALUES (?, ?, ?, ?, ?)",
174
- (user_id, tool, trimmed, summary, now_iso()),
175
- )
176
- conn.commit()
177
-
178
-
179
- # ------------------ TEXT HELPERS ------------------
180
- def count_words(text: str) -> int:
181
- tokens = text.strip().split()
182
- return len(tokens) if text.strip() else 0
183
-
184
-
185
- def simple_grammar_correct(text: str):
186
- """Old heuristic grammar fixer (kept as fallback)."""
187
- corrections = 0
188
- original_words = count_words(text)
189
-
190
- before = text
191
- text = re.sub(r"\s{2,}", " ", text)
192
- if text != before:
193
- corrections += 1
194
-
195
- before = text
196
- text = re.sub(r"\bi\b", "I", text)
197
- if text != before:
198
- corrections += 1
199
-
200
- def cap_match(m):
201
- return m.group(0).upper()
202
-
203
- before = text
204
- text = re.sub(r"(^\s*\w|[.!?]\s+\w)", cap_match, text)
205
- if text != before:
206
- corrections += 1
207
-
208
- if text.strip() and not re.search(r"[.!?]\s*$", text.strip()):
209
- text = text.strip() + "."
210
- corrections += 1
211
-
212
- return text, corrections, original_words
213
-
214
-
215
- # ------------------ CORPUS BUILDING (from corpus_raw -> corpus) ------------------
216
- def extract_from_docx_path(path: str) -> str:
217
- doc = DocxDocument(path)
218
- paragraphs = [p.text for p in doc.paragraphs]
219
- return "\n".join(paragraphs)
220
-
221
-
222
- def extract_from_pdf_path(path: str) -> str:
223
- with open(path, "rb") as f:
224
- reader = PyPDF2.PdfReader(f)
225
- texts = []
226
- for pg in range(len(reader.pages)):
227
- try:
228
- texts.append(reader.pages[pg].extract_text() or "")
229
- except Exception:
230
- texts.append("")
231
- return "\n".join(texts)
232
-
233
-
234
- def build_corpus_from_raw(raw_dir: str = CORPUS_RAW, out_dir: str = CORPUS_DIR):
235
- """
236
- Convert any .pdf / .docx / .txt files from corpus_raw/ into .txt files in corpus/.
237
- This mirrors your build_corpus.py logic but is called automatically at startup.
238
- """
239
- os.makedirs(raw_dir, exist_ok=True)
240
- os.makedirs(out_dir, exist_ok=True)
241
-
242
- for fname in os.listdir(raw_dir):
243
- inpath = os.path.join(raw_dir, fname)
244
- if not os.path.isfile(inpath):
245
- continue
246
- outname = os.path.splitext(fname)[0] + ".txt"
247
- outpath = os.path.join(out_dir, outname)
248
- try:
249
- ext = fname.lower()
250
- if ext.endswith(".docx"):
251
- text = extract_from_docx_path(inpath)
252
- elif ext.endswith(".pdf"):
253
- text = extract_from_pdf_path(inpath)
254
- elif ext.endswith(".txt"):
255
- with open(inpath, "r", encoding="utf-8", errors="ignore") as f:
256
- text = f.read()
257
- else:
258
- print("[CorpusRaw] Skipping unsupported:", fname)
259
- continue
260
-
261
- text = text.strip()
262
- with open(outpath, "w", encoding="utf-8") as fo:
263
- fo.write(text)
264
- print("[CorpusRaw] Wrote:", outpath)
265
- except Exception as e:
266
- print("[CorpusRaw] Failed", fname, "->", e)
267
-
268
-
269
- # ------------------ TF-IDF CORPUS LOADING ------------------
270
- vectorizer = None
271
- corpus_tfidf = None
272
- corpus_titles = []
273
- corpus_texts = []
274
-
275
-
276
- def load_corpus(corpus_dir=CORPUS_DIR):
277
- """
278
- Load .txt corpus files from CORPUS_DIR, build TF-IDF index.
279
- Semantic embeddings are built separately in load_embeddings().
280
- """
281
- global vectorizer, corpus_tfidf, corpus_titles, corpus_texts
282
- corpus_titles = []
283
- corpus_texts = []
284
- if not os.path.isdir(corpus_dir):
285
- os.makedirs(corpus_dir, exist_ok=True)
286
- print("[Corpus] Created empty corpus directory:", corpus_dir)
287
- vectorizer = None
288
- corpus_tfidf = None
289
- return
290
-
291
- for fname in os.listdir(corpus_dir):
292
- if fname.lower().endswith(".txt"):
293
- path = os.path.join(corpus_dir, fname)
294
- try:
295
- with open(path, "r", encoding="utf-8", errors="ignore") as f:
296
- txt = f.read()
297
- corpus_titles.append(fname)
298
- corpus_texts.append(txt)
299
- except Exception as e:
300
- print(f"[Corpus] Failed to read {path}: {e}")
301
-
302
- if corpus_texts:
303
- try:
304
- vectorizer = TfidfVectorizer(
305
- ngram_range=(1, 3),
306
- stop_words="english",
307
- max_features=50000
308
- )
309
- corpus_tfidf = vectorizer.fit_transform(corpus_texts)
310
- print(f"[Corpus] Loaded {len(corpus_texts)} documents into TF-IDF index")
311
- except Exception as e:
312
- print("[Corpus] TF-IDF build failed:", e)
313
- vectorizer = None
314
- corpus_tfidf = None
315
- else:
316
- vectorizer = None
317
- corpus_tfidf = None
318
- print("[Corpus] No .txt documents found in", corpus_dir)
319
-
320
-
321
- # ------------------ SEMANTIC EMBEDDINGS (SentenceTransformers) ------------------
322
- emb_model = None
323
- corpus_emb = None
324
- EMB_MODEL_NAME = os.getenv("PLAG_EMB_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
325
-
326
-
327
- def load_embeddings():
328
- """
329
- Build semantic embedding index for plagiarism using sentence-transformers.
330
- """
331
- global emb_model, corpus_emb
332
- if SentenceTransformer is None:
333
- print("[Embeddings] sentence-transformers not installed; skipping semantic index.")
334
- emb_model = None
335
- corpus_emb = None
336
- return
337
-
338
- if not corpus_texts:
339
- print("[Embeddings] No corpus texts available; semantic index not built.")
340
- emb_model = None
341
- corpus_emb = None
342
- return
343
-
344
- try:
345
- emb_model = SentenceTransformer(EMB_MODEL_NAME)
346
- corpus_emb = emb_model.encode(
347
- corpus_texts,
348
- convert_to_numpy=True,
349
- show_progress_bar=False,
350
- normalize_embeddings=True,
351
- )
352
- print(f"[Embeddings] Loaded '{EMB_MODEL_NAME}' and encoded {len(corpus_texts)} corpus docs.")
353
- except Exception as e:
354
- emb_model = None
355
- corpus_emb = None
356
- print("[Embeddings] Failed to load or encode corpus:", e)
357
-
358
-
359
- # Build corpus & embeddings at startup
360
- build_corpus_from_raw()
361
- load_corpus()
362
- load_embeddings()
363
-
364
- # ------------------ HF MODEL LOADING (AI Detector) ------------------
365
- AI_DETECTOR_MODEL = "openai-community/roberta-base-openai-detector"
366
- tokenizer = None
367
- model = None
368
- device = None
369
-
370
- try:
371
- tokenizer = AutoTokenizer.from_pretrained(AI_DETECTOR_MODEL)
372
- model = AutoModelForSequenceClassification.from_pretrained(AI_DETECTOR_MODEL)
373
- model.eval()
374
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
375
- model.to(device)
376
- print(f"[AI Detector] Loaded {AI_DETECTOR_MODEL} on {device}")
377
- except Exception as e:
378
- tokenizer = None
379
- model = None
380
- device = None
381
- print("[AI Detector] Failed to load HF model — using heuristic fallback. Error:", e)
382
-
383
- # ------------------ GECToR LOADING (Neural GEC) ------------------
384
- GEC_MODEL = None
385
- GEC_TOKENIZER = None
386
- GEC_ENCODE = None
387
- GEC_DECODE = None
388
- GEC_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
389
-
390
- if GECToR is not None and gector_predict is not None and load_verb_dict is not None:
391
- try:
392
- GEC_MODEL_ID = os.getenv("GEC_MODEL_ID", "gotutiyan/gector-roberta-base-5k")
393
- GEC_VERB_FILE = os.getenv("GEC_VERB_FILE", "data/verb-form-vocab.txt")
394
- GEC_MODEL = GECToR.from_pretrained(GEC_MODEL_ID).to(GEC_DEVICE)
395
- GEC_TOKENIZER = AutoTokenizer.from_pretrained(GEC_MODEL_ID)
396
- GEC_ENCODE, GEC_DECODE = load_verb_dict(GEC_VERB_FILE)
397
- print(f"[GECToR] Loaded model {GEC_MODEL_ID} on {GEC_DEVICE}")
398
- except Exception as e:
399
- GEC_MODEL = None
400
- GEC_TOKENIZER = None
401
- GEC_ENCODE = None
402
- GEC_DECODE = None
403
- print("[GECToR] Failed to load — falling back to LanguageTool/heuristics. Error:", e)
404
- else:
405
- print("[GECToR] gector library not available; using LanguageTool/heuristics for grammar.")
406
-
407
-
408
- def gector_correct(text: str):
409
- """
410
- Run neural grammatical error correction using GECToR.
411
-
412
- - Trims to 1000 words (server-side safety).
413
- - Splits into sentences, runs GECToR, then joins back.
414
- """
415
- if not (GEC_MODEL and GEC_TOKENIZER and GEC_ENCODE and GEC_DECODE):
416
- raise RuntimeError("GECToR model not loaded")
417
-
418
- parts = text.strip().split()
419
- if len(parts) > 1000:
420
- text_proc = " ".join(parts[:1000])
421
- else:
422
- text_proc = text.strip()
423
-
424
- if not text_proc:
425
- return text_proc, 0, 0
426
-
427
- sentences = re.split(r"(?<=[.!?])\s+", text_proc)
428
- sentences = [s for s in sentences if s.strip()]
429
- if not sentences:
430
- sentences = [text_proc]
431
-
432
- keep_conf = float(os.getenv("GEC_KEEP_CONFIDENCE", "0.0"))
433
- min_err_prob = float(os.getenv("GEC_MIN_ERROR_PROB", "0.0"))
434
- n_iter = int(os.getenv("GEC_N_ITER", "5"))
435
- batch_size = int(os.getenv("GEC_BATCH_SIZE", "8"))
436
-
437
- corrected_sentences = gector_predict(
438
- GEC_MODEL,
439
- GEC_TOKENIZER,
440
- sentences,
441
- GEC_ENCODE,
442
- GEC_DECODE,
443
- keep_confidence=keep_conf,
444
- min_error_prob=min_err_prob,
445
- n_iteration=n_iter,
446
- batch_size=batch_size,
447
- )
448
- corrected_text = " ".join(corrected_sentences)
449
- original_words = len(text_proc.split())
450
- corrections = sum(
451
- 1 for a, b in zip(text_proc.split(), corrected_text.split()) if a != b
452
- )
453
- return corrected_text, corrections, original_words
454
-
455
-
456
- # ------------------ FILE EXTRACTION HELPERS ------------------
457
- MAX_FILE_SIZE = 15 * 1024 * 1024 # 15 MB
458
-
459
-
460
- def extract_text_from_upload(upload: UploadFile) -> str:
461
- filename = (upload.filename or "").lower()
462
- content_type = (upload.content_type or "").lower()
463
- data = upload.file.read()
464
- try:
465
- upload.file.seek(0)
466
- except Exception:
467
- pass
468
-
469
- if len(data) > MAX_FILE_SIZE:
470
- raise HTTPException(status_code=413, detail="File too large (max 15MB)")
471
-
472
- # TXT
473
- if filename.endswith(".txt") or content_type == "text/plain":
474
- try:
475
- try:
476
- return data.decode("utf-8")
477
- except UnicodeDecodeError:
478
- return data.decode("latin-1")
479
- except Exception as e:
480
- raise HTTPException(status_code=400, detail=f"Failed to decode text file: {e}")
481
-
482
- # DOCX
483
- if filename.endswith(".docx") or "wordprocessingml" in content_type:
484
- # Basic sanity check: valid .docx is a ZIP (PK header)
485
- if not data.startswith(b"PK"):
486
- raise HTTPException(
487
- status_code=400,
488
- detail="Uploaded file is not a valid .docx package (it might be an old .doc file or a corrupted document). "
489
- "Please open it in Word/Google Docs and re-save as .docx or export as PDF, then upload again."
490
- )
491
- try:
492
- f = io.BytesIO(data)
493
- doc = DocxDocument(f)
494
- paragraphs = [p.text for p in doc.paragraphs]
495
- text = "\n".join(paragraphs).strip()
496
- if not text:
497
- raise ValueError("DOCX contained no readable text.")
498
- return text
499
- except Exception as e:
500
- raise HTTPException(
501
- status_code=400,
502
- detail=f"Failed to parse docx file: {e}. Try opening it in Word/Google Docs and exporting again as .docx or PDF."
503
- )
504
-
505
- # PDF
506
- if filename.endswith(".pdf") or "pdf" in content_type:
507
- try:
508
- f = io.BytesIO(data)
509
- reader = PyPDF2.PdfReader(f)
510
- texts = []
511
- for pg in range(len(reader.pages)):
512
- try:
513
- txt = reader.pages[pg].extract_text() or ""
514
- except Exception:
515
- txt = ""
516
- texts.append(txt)
517
- return "\n".join(texts)
518
- except Exception as e:
519
- raise HTTPException(status_code=400, detail=f"Failed to parse PDF file: {e}")
520
-
521
- raise HTTPException(
522
- status_code=415,
523
- detail="Unsupported file type. Use .txt, .pdf, or .docx",
524
- )
525
-
526
-
527
- # ------------------ GRAMMAR (LANGUAGETOOL INTEGRATION) ------------------
528
- lt_tool = None
529
- if language_tool_python is not None:
530
- try:
531
- lt_tool = language_tool_python.LanguageTool("en-US")
532
- print("[LanguageTool] Loaded (local Java-backed checker)")
533
- except Exception as e:
534
- lt_tool = None
535
- print("[LanguageTool] Could not start local LanguageTool — falling back. Error:", e)
536
- else:
537
- print("[LanguageTool] library not installed; falling back to heuristics.")
538
-
539
-
540
- def grammar_with_languagetool(text: str):
541
- parts = text.strip().split()
542
- if len(parts) > 1000:
543
- text_proc = " ".join(parts[:1000])
544
- else:
545
- text_proc = text.strip()
546
-
547
- matches = lt_tool.check(text_proc)
548
- corrected = language_tool_python.utils.correct(text_proc, matches)
549
- corrections = len(matches)
550
- return corrected, corrections, len(text_proc.split())
551
-
552
-
553
- # ------------------ PLAGIARISM HELPERS (COMBINED ENGINE) ------------------
554
- def _clean_for_jaccard(t: str):
555
- t = t.lower()
556
- t = re.sub(r"[^a-z0-9\s]", " ", t)
557
- return [w for w in t.split() if w]
558
-
559
-
560
- def _jaccard_similarity(a, b):
561
- sa = set(a)
562
- sb = set(b)
563
- if not sa or not sb:
564
- return 0.0
565
- return len(sa & sb) / len(sa | sb)
566
-
567
-
568
- def demo_plagiarism_fallback(text: str):
569
- """
570
- Simple Jaccard-based fallback using a tiny built-in sample set.
571
- Used when no TF-IDF / semantic corpus is available.
572
- """
573
- SAMPLE_DOCS = [
574
- {"title": "AI for Social Good",
575
- "text": "Artificial intelligence is transforming multiple industries by automating routine tasks and enabling data driven decision making for social impact and efficiency."},
576
- {"title": "IoT in Smart Cities",
577
- "text": "The Internet of Things connects sensors, devices, and cloud platforms to enable real time monitoring and control in smart cities including lighting, traffic, and waste management."},
578
- {"title": "Climate & Renewable Energy",
579
- "text": "Climate change is a critical global challenge that demands renewable energy, efficient resource management, and international cooperation to ensure a sustainable future."},
580
- ]
581
-
582
- input_words = _clean_for_jaccard(text)
583
- best_score = 0.0
584
- matches = []
585
- for doc in SAMPLE_DOCS:
586
- doc_words = _clean_for_jaccard(doc["text"])
587
- score = _jaccard_similarity(input_words, doc_words)
588
- matches.append({"title": doc["title"], "score": round(score * 100, 2)})
589
- if score > best_score:
590
- best_score = score
591
-
592
- matches.sort(key=lambda x: x["score"], reverse=True)
593
- plagiarism_percent = round(best_score * 100, 2)
594
- summary = f"Plagiarism estimate (demo Jaccard): {plagiarism_percent}%"
595
- return {"plagiarism_percent": plagiarism_percent, "matches": matches[:5], "summary": summary}
596
-
597
-
598
- def corpus_plagiarism_combined(text: str):
599
- """
600
- Combined plagiarism score using:
601
- - TF-IDF cosine similarity
602
- - Semantic embedding cosine similarity (SentenceTransformers)
603
-
604
- Returns dict matching API schema:
605
- { plagiarism_percent, matches, summary }
606
- """
607
- if not corpus_texts:
608
- raise ValueError("No corpus texts loaded")
609
-
610
- sims_tfidf = None
611
- sims_emb = None
612
-
613
- words = text.split()
614
- if len(words) > 3000:
615
- text_proc = " ".join(words[:3000])
616
- else:
617
- text_proc = text
618
-
619
- # TF-IDF similarity
620
- if vectorizer is not None and corpus_tfidf is not None:
621
- q = vectorizer.transform([text_proc])
622
- sims_tfidf = cosine_similarity(q, corpus_tfidf)[0]
623
-
624
- # Semantic similarity
625
- if emb_model is not None and corpus_emb is not None:
626
- q_emb = emb_model.encode(
627
- [text_proc],
628
- convert_to_numpy=True,
629
- normalize_embeddings=True,
630
- show_progress_bar=False,
631
- )[0]
632
- sims_emb = corpus_emb @ q_emb # normalized → dot = cosine
633
-
634
- if sims_tfidf is None and sims_emb is None:
635
- raise ValueError("No plagiarism backends (TF-IDF / embeddings) are available")
636
-
637
- n_docs = len(corpus_texts)
638
- combined_rows = []
639
- alpha = PLAG_ALPHA # TF-IDF weight
640
-
641
- for i in range(n_docs):
642
- tf = float(sims_tfidf[i]) if sims_tfidf is not None else None
643
- se = float(sims_emb[i]) if sims_emb is not None else None
644
- if tf is None and se is None:
645
- continue
646
-
647
- if tf is not None and se is not None:
648
- score = alpha * tf + (1.0 - alpha) * se
649
- elif tf is not None:
650
- score = tf
651
- else:
652
- score = se
653
-
654
- combined_rows.append({
655
- "index": i,
656
- "combined": score,
657
- "tfidf": tf,
658
- "semantic": se,
659
- })
660
-
661
- if not combined_rows:
662
- raise ValueError("No scores computed for corpus documents")
663
-
664
- combined_rows.sort(key=lambda x: x["combined"], reverse=True)
665
- top = combined_rows[:10]
666
-
667
- best = top[0]["combined"]
668
- plagiarism_percent = round(best * 100, 2)
669
-
670
- matches = []
671
- for row in top:
672
- matches.append({
673
- "title": corpus_titles[row["index"]],
674
-
675
-
676
- "score": round(row["combined"] * 100, 2),
677
- "tfidf_score": round(row["tfidf"] * 100, 2) if row["tfidf"] is not None else None,
678
- "semantic_score": round(row["semantic"] * 100, 2) if row["semantic"] is not None else None,
679
- })
680
-
681
- components = []
682
- if sims_tfidf is not None:
683
- components.append("TF-IDF")
684
- if sims_emb is not None:
685
- components.append("semantic embeddings")
686
- comp_str = " + ".join(components)
687
-
688
- summary = f"Plagiarism estimate (combined {comp_str}): {plagiarism_percent}%"
689
- return {"plagiarism_percent": plagiarism_percent, "matches": matches, "summary": summary}
690
-
691
-
692
- # ------------------ ENDPOINTS ------------------
693
-
694
- @app.post("/api/signup")
695
- def signup(req: SignupRequest):
696
- cur.execute("SELECT id FROM users WHERE email = ?", (req.email,))
697
- if cur.fetchone():
698
- raise HTTPException(status_code=400, detail="Email already registered")
699
-
700
- pw_hash = hash_password(req.password)
701
- created_at = now_iso()
702
- cur.execute(
703
- "INSERT INTO users (name, email, password_hash, created_at) VALUES (?, ?, ?, ?)",
704
- (req.name, req.email, pw_hash, created_at),
705
- )
706
- conn.commit()
707
- user_id = cur.lastrowid
708
- token = create_token(user_id, req.email)
709
-
710
- return {
711
- "message": "Signup successful",
712
- "token": token,
713
- "name": req.name,
714
- "email": req.email,
715
- }
716
-
717
-
718
- @app.post("/api/login")
719
- def login(req: LoginRequest):
720
- cur.execute("SELECT * FROM users WHERE email = ?", (req.email,))
721
- row = cur.fetchone()
722
- if not row or not verify_password(req.password, row["password_hash"]):
723
- raise HTTPException(status_code=401, detail="Invalid email or password")
724
-
725
- token = create_token(row["id"], row["email"])
726
- return {
727
- "message": "Login successful",
728
- "token": token,
729
- "name": row["name"],
730
- "email": row["email"],
731
- }
732
-
733
-
734
- @app.post("/api/grammar-check")
735
- def api_grammar_check(req: TextRequest, user=Depends(get_current_user)):
736
- text = req.text or ""
737
- if not text.strip():
738
- raise HTTPException(status_code=400, detail="Text is required")
739
-
740
- # Prefer GECToR → LanguageTool → heuristics
741
- if GEC_MODEL is not None:
742
- corrected, corrections, original_words = gector_correct(text)
743
- summary = f"GECToR neural GEC: {corrections} edits; words analysed: {original_words}"
744
- elif lt_tool is not None:
745
- corrected, corrections, original_words = grammar_with_languagetool(text)
746
- summary = f"LanguageTool corrections: {corrections}; words analysed: {original_words}"
747
- else:
748
- corrected, corrections, original_words = simple_grammar_correct(text)
749
- summary = f"HEURISTIC corrections: {corrections}; words analysed: {original_words}"
750
-
751
- save_history(user["id"], "grammar", text, summary)
752
-
753
- return {
754
- "original_words": original_words,
755
- "corrections": corrections,
756
- "corrected_text": corrected,
757
- "summary": summary,
758
- }
759
-
760
-
761
- @app.post("/api/grammar-check-file")
762
- def api_grammar_check_file(file: UploadFile = File(...), user=Depends(get_current_user)):
763
- text = extract_text_from_upload(file).strip()
764
- if not text:
765
- raise HTTPException(status_code=400, detail="Uploaded file contains no text")
766
-
767
- if GEC_MODEL is not None:
768
- corrected, corrections, original_words = gector_correct(text)
769
- summary = f"GECToR neural GEC: {corrections} edits; words analysed: {original_words}"
770
- elif lt_tool is not None:
771
- corrected, corrections, original_words = grammar_with_languagetool(text)
772
- summary = f"LanguageTool corrections: {corrections}; words analysed: {original_words}"
773
- else:
774
- parts = text.strip().split()
775
- if len(parts) > 1000:
776
- text = " ".join(parts[:1000])
777
- corrected, corrections, original_words = simple_grammar_correct(text)
778
- summary = f"HEURISTIC corrections: {corrections}; words analysed: {original_words}"
779
-
780
- save_history(user["id"], "grammar", text, summary)
781
-
782
- return {
783
- "original_words": original_words,
784
- "corrections": corrections,
785
- "corrected_text": corrected,
786
- "summary": summary,
787
- }
788
-
789
-
790
- # ------------------ PLAGIARISM ENDPOINTS (COMBINED) ------------------
791
- @app.post("/api/plagiarism-check")
792
- def api_plagiarism_check(req: TextRequest, user=Depends(get_current_user)):
793
- text = req.text or ""
794
- if not text.strip():
795
- raise HTTPException(status_code=400, detail="Text is required")
796
-
797
- # First try full combined engine (TF-IDF + embeddings) with corpus
798
- try:
799
- result = corpus_plagiarism_combined(text)
800
- save_history(user["id"], "plagiarism", text, result["summary"])
801
- return result
802
- except Exception as e:
803
- print("[Plagiarism] Combined corpus engine failed, falling back to demo:", e)
804
-
805
- # Fallback: small Jaccard demo
806
- result = demo_plagiarism_fallback(text)
807
- save_history(user["id"], "plagiarism", text, result["summary"])
808
- return result
809
-
810
-
811
- @app.post("/api/plagiarism-check-file")
812
- def api_plagiarism_check_file(file: UploadFile = File(...), user=Depends(get_current_user)):
813
- text = extract_text_from_upload(file).strip()
814
- if not text:
815
- raise HTTPException(status_code=400, detail="Uploaded file contains no text")
816
-
817
- try:
818
- result = corpus_plagiarism_combined(text)
819
- save_history(user["id"], "plagiarism", text, result["summary"])
820
- return result
821
- except Exception as e:
822
- print("[Plagiarism-file] Combined corpus engine failed, falling back to demo:", e)
823
-
824
- # Fallback to demo if corpus/engines unavailable
825
- result = demo_plagiarism_fallback(text)
826
- save_history(user["id"], "plagiarism", text, result["summary"])
827
- return result
828
-
829
-
830
- # ------------------ AI CHECK (TEXT & FILE) ------------------
831
- def heuristic_ai_score(text: str):
832
- words = re.sub(r"[^a-z0-9\s]", " ", text.lower()).split()
833
- word_count = len(words)
834
- unique_ratio = len(set(words)) / (word_count or 1)
835
- sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
836
- avg_sentence_length = word_count / (len(sentences) or 1)
837
-
838
- ai_score = 0
839
- if unique_ratio < 0.45:
840
- ai_score += 40
841
- elif unique_ratio < 0.6:
842
- ai_score += 20
843
-
844
- if avg_sentence_length > 25:
845
- ai_score += 40
846
- elif avg_sentence_length > 18:
847
- ai_score += 25
848
-
849
- if word_count > 400:
850
- ai_score += 10
851
-
852
- ai_score = min(100, round(ai_score))
853
- human_score = 100 - ai_score
854
- return ai_score, human_score, word_count, avg_sentence_length, unique_ratio
855
-
856
-
857
- @app.post("/api/ai-check")
858
- def api_ai_check(req: TextRequest, user=Depends(get_current_user)):
859
- text = (req.text or "").strip()
860
- if not text:
861
- raise HTTPException(status_code=400, detail="Text is required")
862
-
863
- if model is not None and tokenizer is not None:
864
- try:
865
- max_len = getattr(tokenizer, "model_max_length", 512)
866
- if max_len is None or max_len > 1024:
867
- max_len = 512
868
-
869
- words = text.split()
870
- chunk_size = min(400, max_len - 10)
871
- chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
872
- probs = []
873
- for chunk in chunks:
874
- inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=max_len)
875
- inputs = {k: v.to(device) for k, v in inputs.items()}
876
- with torch.no_grad():
877
- outputs = model(**inputs)
878
- logits = outputs.logits
879
- p = torch.softmax(logits, dim=1).cpu().numpy()[0]
880
- ai_prob = float(p[1]) if p.shape[0] > 1 else float(p[0])
881
- probs.append(ai_prob)
882
- avg_ai_prob = float(np.mean(probs)) if probs else 0.0
883
- ai_percent = round(avg_ai_prob * 100, 2)
884
- human_percent = round(100 - ai_percent, 2)
885
- words_count = len(words)
886
- sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
887
- avg_sentence_len = round(words_count / (len(sentences) or 1), 2)
888
- summary = f"Model: {AI_DETECTOR_MODEL}; AI probability: {ai_percent}%"
889
- save_history(user["id"], "ai", text, summary)
890
- return {
891
- "ai_percent": ai_percent,
892
- "human_percent": human_percent,
893
- "word_count": words_count,
894
- "avg_sentence_length": avg_sentence_len,
895
- "summary": summary,
896
- }
897
- except Exception as e:
898
- print("[AI-check] model inference failed:", e)
899
-
900
- ai_percent, human_percent, wc, avg_len, uniq = heuristic_ai_score(text)
901
- summary = f"HEURISTIC fallback — AI probability: {ai_percent}%"
902
- save_history(user["id"], "ai", text, summary)
903
- return {
904
- "ai_percent": ai_percent,
905
- "human_percent": human_percent,
906
- "word_count": wc,
907
- "avg_sentence_length": avg_len,
908
- "unique_ratio": round(uniq, 3),
909
- "summary": summary,
910
- }
911
-
912
-
913
- @app.post("/api/ai-check-file")
914
- def api_ai_check_file(file: UploadFile = File(...), user=Depends(get_current_user)):
915
- text = extract_text_from_upload(file).strip()
916
- if not text:
917
- raise HTTPException(status_code=400, detail="Uploaded file contains no text")
918
- return api_ai_check.__wrapped__(TextRequest(text=text), user)
919
-
920
-
921
- # ------------------ HISTORY ------------------
922
- @app.get("/api/history")
923
- def api_history(user=Depends(get_current_user)):
924
- cur.execute(
925
- "SELECT id, tool, input_text, result_summary, created_at "
926
- "FROM history WHERE user_id = ? "
927
- "ORDER BY created_at DESC LIMIT 50",
928
- (user["id"],),
929
- )
930
- rows = cur.fetchall()
931
- items = []
932
- for r in rows:
933
- items.append(
934
- {
935
- "id": r["id"],
936
- "tool": r["tool"],
937
- "input_text": r["input_text"],
938
- "summary": r["result_summary"],
939
- "created_at": r["created_at"],
940
- }
941
- )
942
- return {"items": items}
943
-
944
-
945
- @app.get("/")
946
- def read_root():
947
  return {"status": "Backend is running with 16GB RAM!"}
 
1
+ # backend/main.py
2
+ import os
3
+ import re
4
+ import io
5
+ import sqlite3
6
+ from datetime import datetime, timezone
7
+
8
+ from dotenv import load_dotenv
9
+ from fastapi import FastAPI, HTTPException, status, Header, Depends, File, UploadFile
10
+ from fastapi.middleware.cors import CORSMiddleware
11
+ from pydantic import BaseModel, EmailStr
12
+ from passlib.context import CryptContext
13
+ import jwt
14
+
15
+ # File parsing libs
16
+ from docx import Document as DocxDocument
17
+ import PyPDF2
18
+
19
+ # ML / NLP libs
20
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
21
+ import torch
22
+ import numpy as np
23
+
24
+ # TF-IDF
25
+ from sklearn.feature_extraction.text import TfidfVectorizer
26
+ from sklearn.metrics.pairwise import cosine_similarity
27
+
28
+ # Semantic embeddings for plagiarism (combined approach)
29
+ try:
30
+ from sentence_transformers import SentenceTransformer
31
+ except Exception:
32
+ SentenceTransformer = None
33
+
34
+ # LanguageTool (may require Java)
35
+ try:
36
+ import language_tool_python
37
+ except Exception:
38
+ language_tool_python = None
39
+
40
+ # GECToR (neural grammatical error correction)
41
+ try:
42
+ from gector import GECToR, predict as gector_predict, load_verb_dict
43
+ except Exception:
44
+ GECToR = None
45
+ gector_predict = None
46
+ load_verb_dict = None
47
+
48
+ # ------------------ ENV & DB SETUP ------------------
49
+ load_dotenv()
50
+
51
+ JWT_SECRET = os.getenv("JWT_SECRET", "super_secret_key_change_this")
52
+ JWT_ALGO = os.getenv("JWT_ALGO", "HS256")
53
+ DB_PATH = os.getenv("DB_PATH", "truewrite.db")
54
+ CORPUS_DIR = os.getenv("CORPUS_DIR", "corpus")
55
+ CORPUS_RAW = os.getenv("CORPUS_RAW", "corpus_raw")
56
+
57
+ # Combined plagiarism weights
58
+ PLAG_ALPHA = float(os.getenv("PLAG_ALPHA", "0.4")) # TF-IDF weight; (1-alpha) for embeddings
59
+
60
+ pwd_context = CryptContext(schemes=["pbkdf2_sha256"], deprecated="auto")
61
+
62
+ # SQLite DB (simple demo)
63
+ conn = sqlite3.connect(DB_PATH, check_same_thread=False)
64
+ conn.row_factory = sqlite3.Row
65
+ cur = conn.cursor()
66
+
67
+ # Create tables if not exist
68
+ cur.execute("""
69
+ CREATE TABLE IF NOT EXISTS users (
70
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
71
+ name TEXT NOT NULL,
72
+ email TEXT NOT NULL UNIQUE,
73
+ password_hash TEXT NOT NULL,
74
+ created_at TEXT NOT NULL
75
+ )
76
+ """)
77
+
78
+ cur.execute("""
79
+ CREATE TABLE IF NOT EXISTS history (
80
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
81
+ user_id INTEGER NOT NULL,
82
+ tool TEXT NOT NULL,
83
+ input_text TEXT,
84
+ result_summary TEXT,
85
+ created_at TEXT NOT NULL,
86
+ FOREIGN KEY (user_id) REFERENCES users(id)
87
+ )
88
+ """)
89
+
90
+ conn.commit()
91
+
92
+ # ------------------ FASTAPI APP ------------------
93
+ app = FastAPI(title="TrueWrite Scan (Python Backend)")
94
+
95
+ app.add_middleware(
96
+ CORSMiddleware,
97
+ allow_origins=["https://true-write-scan.vercel.app/"],
98
+ allow_credentials=True,
99
+ allow_methods=["*"],
100
+ allow_headers=["*"],
101
+ )
102
+
103
+ # ------------------ MODELS ------------------
104
+ class SignupRequest(BaseModel):
105
+ name: str
106
+ email: EmailStr
107
+ password: str
108
+
109
+
110
+ class LoginRequest(BaseModel):
111
+ email: EmailStr
112
+ password: str
113
+
114
+
115
+ class TextRequest(BaseModel):
116
+ text: str
117
+
118
+
119
+ # ------------------ AUTH HELPERS ------------------
120
+ def hash_password(pw: str) -> str:
121
+ return pwd_context.hash(pw)
122
+
123
+
124
+ def verify_password(plain: str, hashed: str) -> bool:
125
+ return pwd_context.verify(plain, hashed)
126
+
127
+
128
+ def create_token(user_id: int, email: str) -> str:
129
+ payload = {"user_id": user_id, "email": email}
130
+ token = jwt.encode(payload, JWT_SECRET, algorithm=JWT_ALGO)
131
+ if isinstance(token, bytes):
132
+ token = token.decode("utf-8")
133
+ return token
134
+
135
+
136
+ def decode_token(token: str):
137
+ try:
138
+ payload = jwt.decode(token, JWT_SECRET, algorithms=[JWT_ALGO])
139
+ return payload
140
+ except jwt.PyJWTError:
141
+ raise HTTPException(
142
+ status_code=status.HTTP_401_UNAUTHORIZED,
143
+ detail="Invalid token"
144
+ )
145
+
146
+
147
+ def get_current_user(authorization: str = Header(None)):
148
+ if not authorization or not authorization.startswith("Bearer "):
149
+ raise HTTPException(
150
+ status_code=status.HTTP_401_UNAUTHORIZED,
151
+ detail="Missing token"
152
+ )
153
+ token = authorization.split(" ", 1)[1]
154
+ payload = decode_token(token)
155
+ user_id = payload.get("user_id")
156
+ cur.execute("SELECT * FROM users WHERE id = ?", (user_id,))
157
+ row = cur.fetchone()
158
+ if not row:
159
+ raise HTTPException(
160
+ status_code=status.HTTP_401_UNAUTHORIZED,
161
+ detail="User not found"
162
+ )
163
+ return {"id": row["id"], "name": row["name"], "email": row["email"]}
164
+
165
+
166
+ def now_iso():
167
+ return datetime.now(timezone.utc).isoformat()
168
+
169
+
170
+ def save_history(user_id: int, tool: str, input_text: str, summary: str):
171
+ trimmed = (input_text[:500] + "...") if len(input_text) > 500 else input_text
172
+ cur.execute(
173
+ "INSERT INTO history (user_id, tool, input_text, result_summary, created_at) VALUES (?, ?, ?, ?, ?)",
174
+ (user_id, tool, trimmed, summary, now_iso()),
175
+ )
176
+ conn.commit()
177
+
178
+
179
+ # ------------------ TEXT HELPERS ------------------
180
+ def count_words(text: str) -> int:
181
+ tokens = text.strip().split()
182
+ return len(tokens) if text.strip() else 0
183
+
184
+
185
+ def simple_grammar_correct(text: str):
186
+ """Old heuristic grammar fixer (kept as fallback)."""
187
+ corrections = 0
188
+ original_words = count_words(text)
189
+
190
+ before = text
191
+ text = re.sub(r"\s{2,}", " ", text)
192
+ if text != before:
193
+ corrections += 1
194
+
195
+ before = text
196
+ text = re.sub(r"\bi\b", "I", text)
197
+ if text != before:
198
+ corrections += 1
199
+
200
+ def cap_match(m):
201
+ return m.group(0).upper()
202
+
203
+ before = text
204
+ text = re.sub(r"(^\s*\w|[.!?]\s+\w)", cap_match, text)
205
+ if text != before:
206
+ corrections += 1
207
+
208
+ if text.strip() and not re.search(r"[.!?]\s*$", text.strip()):
209
+ text = text.strip() + "."
210
+ corrections += 1
211
+
212
+ return text, corrections, original_words
213
+
214
+
215
+ # ------------------ CORPUS BUILDING (from corpus_raw -> corpus) ------------------
216
+ def extract_from_docx_path(path: str) -> str:
217
+ doc = DocxDocument(path)
218
+ paragraphs = [p.text for p in doc.paragraphs]
219
+ return "\n".join(paragraphs)
220
+
221
+
222
+ def extract_from_pdf_path(path: str) -> str:
223
+ with open(path, "rb") as f:
224
+ reader = PyPDF2.PdfReader(f)
225
+ texts = []
226
+ for pg in range(len(reader.pages)):
227
+ try:
228
+ texts.append(reader.pages[pg].extract_text() or "")
229
+ except Exception:
230
+ texts.append("")
231
+ return "\n".join(texts)
232
+
233
+
234
+ def build_corpus_from_raw(raw_dir: str = CORPUS_RAW, out_dir: str = CORPUS_DIR):
235
+ """
236
+ Convert any .pdf / .docx / .txt files from corpus_raw/ into .txt files in corpus/.
237
+ This mirrors your build_corpus.py logic but is called automatically at startup.
238
+ """
239
+ os.makedirs(raw_dir, exist_ok=True)
240
+ os.makedirs(out_dir, exist_ok=True)
241
+
242
+ for fname in os.listdir(raw_dir):
243
+ inpath = os.path.join(raw_dir, fname)
244
+ if not os.path.isfile(inpath):
245
+ continue
246
+ outname = os.path.splitext(fname)[0] + ".txt"
247
+ outpath = os.path.join(out_dir, outname)
248
+ try:
249
+ ext = fname.lower()
250
+ if ext.endswith(".docx"):
251
+ text = extract_from_docx_path(inpath)
252
+ elif ext.endswith(".pdf"):
253
+ text = extract_from_pdf_path(inpath)
254
+ elif ext.endswith(".txt"):
255
+ with open(inpath, "r", encoding="utf-8", errors="ignore") as f:
256
+ text = f.read()
257
+ else:
258
+ print("[CorpusRaw] Skipping unsupported:", fname)
259
+ continue
260
+
261
+ text = text.strip()
262
+ with open(outpath, "w", encoding="utf-8") as fo:
263
+ fo.write(text)
264
+ print("[CorpusRaw] Wrote:", outpath)
265
+ except Exception as e:
266
+ print("[CorpusRaw] Failed", fname, "->", e)
267
+
268
+
269
+ # ------------------ TF-IDF CORPUS LOADING ------------------
270
+ vectorizer = None
271
+ corpus_tfidf = None
272
+ corpus_titles = []
273
+ corpus_texts = []
274
+
275
+
276
+ def load_corpus(corpus_dir=CORPUS_DIR):
277
+ """
278
+ Load .txt corpus files from CORPUS_DIR, build TF-IDF index.
279
+ Semantic embeddings are built separately in load_embeddings().
280
+ """
281
+ global vectorizer, corpus_tfidf, corpus_titles, corpus_texts
282
+ corpus_titles = []
283
+ corpus_texts = []
284
+ if not os.path.isdir(corpus_dir):
285
+ os.makedirs(corpus_dir, exist_ok=True)
286
+ print("[Corpus] Created empty corpus directory:", corpus_dir)
287
+ vectorizer = None
288
+ corpus_tfidf = None
289
+ return
290
+
291
+ for fname in os.listdir(corpus_dir):
292
+ if fname.lower().endswith(".txt"):
293
+ path = os.path.join(corpus_dir, fname)
294
+ try:
295
+ with open(path, "r", encoding="utf-8", errors="ignore") as f:
296
+ txt = f.read()
297
+ corpus_titles.append(fname)
298
+ corpus_texts.append(txt)
299
+ except Exception as e:
300
+ print(f"[Corpus] Failed to read {path}: {e}")
301
+
302
+ if corpus_texts:
303
+ try:
304
+ vectorizer = TfidfVectorizer(
305
+ ngram_range=(1, 3),
306
+ stop_words="english",
307
+ max_features=50000
308
+ )
309
+ corpus_tfidf = vectorizer.fit_transform(corpus_texts)
310
+ print(f"[Corpus] Loaded {len(corpus_texts)} documents into TF-IDF index")
311
+ except Exception as e:
312
+ print("[Corpus] TF-IDF build failed:", e)
313
+ vectorizer = None
314
+ corpus_tfidf = None
315
+ else:
316
+ vectorizer = None
317
+ corpus_tfidf = None
318
+ print("[Corpus] No .txt documents found in", corpus_dir)
319
+
320
+
321
+ # ------------------ SEMANTIC EMBEDDINGS (SentenceTransformers) ------------------
322
+ emb_model = None
323
+ corpus_emb = None
324
+ EMB_MODEL_NAME = os.getenv("PLAG_EMB_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
325
+
326
+
327
+ def load_embeddings():
328
+ """
329
+ Build semantic embedding index for plagiarism using sentence-transformers.
330
+ """
331
+ global emb_model, corpus_emb
332
+ if SentenceTransformer is None:
333
+ print("[Embeddings] sentence-transformers not installed; skipping semantic index.")
334
+ emb_model = None
335
+ corpus_emb = None
336
+ return
337
+
338
+ if not corpus_texts:
339
+ print("[Embeddings] No corpus texts available; semantic index not built.")
340
+ emb_model = None
341
+ corpus_emb = None
342
+ return
343
+
344
+ try:
345
+ emb_model = SentenceTransformer(EMB_MODEL_NAME)
346
+ corpus_emb = emb_model.encode(
347
+ corpus_texts,
348
+ convert_to_numpy=True,
349
+ show_progress_bar=False,
350
+ normalize_embeddings=True,
351
+ )
352
+ print(f"[Embeddings] Loaded '{EMB_MODEL_NAME}' and encoded {len(corpus_texts)} corpus docs.")
353
+ except Exception as e:
354
+ emb_model = None
355
+ corpus_emb = None
356
+ print("[Embeddings] Failed to load or encode corpus:", e)
357
+
358
+
359
+ # Build corpus & embeddings at startup
360
+ build_corpus_from_raw()
361
+ load_corpus()
362
+ load_embeddings()
363
+
364
+ # ------------------ HF MODEL LOADING (AI Detector) ------------------
365
+ AI_DETECTOR_MODEL = "openai-community/roberta-base-openai-detector"
366
+ tokenizer = None
367
+ model = None
368
+ device = None
369
+
370
+ try:
371
+ tokenizer = AutoTokenizer.from_pretrained(AI_DETECTOR_MODEL)
372
+ model = AutoModelForSequenceClassification.from_pretrained(AI_DETECTOR_MODEL)
373
+ model.eval()
374
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
375
+ model.to(device)
376
+ print(f"[AI Detector] Loaded {AI_DETECTOR_MODEL} on {device}")
377
+ except Exception as e:
378
+ tokenizer = None
379
+ model = None
380
+ device = None
381
+ print("[AI Detector] Failed to load HF model — using heuristic fallback. Error:", e)
382
+
383
+ # ------------------ GECToR LOADING (Neural GEC) ------------------
384
+ GEC_MODEL = None
385
+ GEC_TOKENIZER = None
386
+ GEC_ENCODE = None
387
+ GEC_DECODE = None
388
+ GEC_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
389
+
390
+ if GECToR is not None and gector_predict is not None and load_verb_dict is not None:
391
+ try:
392
+ GEC_MODEL_ID = os.getenv("GEC_MODEL_ID", "gotutiyan/gector-roberta-base-5k")
393
+ GEC_VERB_FILE = os.getenv("GEC_VERB_FILE", "data/verb-form-vocab.txt")
394
+ GEC_MODEL = GECToR.from_pretrained(GEC_MODEL_ID).to(GEC_DEVICE)
395
+ GEC_TOKENIZER = AutoTokenizer.from_pretrained(GEC_MODEL_ID)
396
+ GEC_ENCODE, GEC_DECODE = load_verb_dict(GEC_VERB_FILE)
397
+ print(f"[GECToR] Loaded model {GEC_MODEL_ID} on {GEC_DEVICE}")
398
+ except Exception as e:
399
+ GEC_MODEL = None
400
+ GEC_TOKENIZER = None
401
+ GEC_ENCODE = None
402
+ GEC_DECODE = None
403
+ print("[GECToR] Failed to load — falling back to LanguageTool/heuristics. Error:", e)
404
+ else:
405
+ print("[GECToR] gector library not available; using LanguageTool/heuristics for grammar.")
406
+
407
+
408
+ def gector_correct(text: str):
409
+ """
410
+ Run neural grammatical error correction using GECToR.
411
+
412
+ - Trims to 1000 words (server-side safety).
413
+ - Splits into sentences, runs GECToR, then joins back.
414
+ """
415
+ if not (GEC_MODEL and GEC_TOKENIZER and GEC_ENCODE and GEC_DECODE):
416
+ raise RuntimeError("GECToR model not loaded")
417
+
418
+ parts = text.strip().split()
419
+ if len(parts) > 1000:
420
+ text_proc = " ".join(parts[:1000])
421
+ else:
422
+ text_proc = text.strip()
423
+
424
+ if not text_proc:
425
+ return text_proc, 0, 0
426
+
427
+ sentences = re.split(r"(?<=[.!?])\s+", text_proc)
428
+ sentences = [s for s in sentences if s.strip()]
429
+ if not sentences:
430
+ sentences = [text_proc]
431
+
432
+ keep_conf = float(os.getenv("GEC_KEEP_CONFIDENCE", "0.0"))
433
+ min_err_prob = float(os.getenv("GEC_MIN_ERROR_PROB", "0.0"))
434
+ n_iter = int(os.getenv("GEC_N_ITER", "5"))
435
+ batch_size = int(os.getenv("GEC_BATCH_SIZE", "8"))
436
+
437
+ corrected_sentences = gector_predict(
438
+ GEC_MODEL,
439
+ GEC_TOKENIZER,
440
+ sentences,
441
+ GEC_ENCODE,
442
+ GEC_DECODE,
443
+ keep_confidence=keep_conf,
444
+ min_error_prob=min_err_prob,
445
+ n_iteration=n_iter,
446
+ batch_size=batch_size,
447
+ )
448
+ corrected_text = " ".join(corrected_sentences)
449
+ original_words = len(text_proc.split())
450
+ corrections = sum(
451
+ 1 for a, b in zip(text_proc.split(), corrected_text.split()) if a != b
452
+ )
453
+ return corrected_text, corrections, original_words
454
+
455
+
456
+ # ------------------ FILE EXTRACTION HELPERS ------------------
457
+ MAX_FILE_SIZE = 15 * 1024 * 1024 # 15 MB
458
+
459
+
460
+ def extract_text_from_upload(upload: UploadFile) -> str:
461
+ filename = (upload.filename or "").lower()
462
+ content_type = (upload.content_type or "").lower()
463
+ data = upload.file.read()
464
+ try:
465
+ upload.file.seek(0)
466
+ except Exception:
467
+ pass
468
+
469
+ if len(data) > MAX_FILE_SIZE:
470
+ raise HTTPException(status_code=413, detail="File too large (max 15MB)")
471
+
472
+ # TXT
473
+ if filename.endswith(".txt") or content_type == "text/plain":
474
+ try:
475
+ try:
476
+ return data.decode("utf-8")
477
+ except UnicodeDecodeError:
478
+ return data.decode("latin-1")
479
+ except Exception as e:
480
+ raise HTTPException(status_code=400, detail=f"Failed to decode text file: {e}")
481
+
482
+ # DOCX
483
+ if filename.endswith(".docx") or "wordprocessingml" in content_type:
484
+ # Basic sanity check: valid .docx is a ZIP (PK header)
485
+ if not data.startswith(b"PK"):
486
+ raise HTTPException(
487
+ status_code=400,
488
+ detail="Uploaded file is not a valid .docx package (it might be an old .doc file or a corrupted document). "
489
+ "Please open it in Word/Google Docs and re-save as .docx or export as PDF, then upload again."
490
+ )
491
+ try:
492
+ f = io.BytesIO(data)
493
+ doc = DocxDocument(f)
494
+ paragraphs = [p.text for p in doc.paragraphs]
495
+ text = "\n".join(paragraphs).strip()
496
+ if not text:
497
+ raise ValueError("DOCX contained no readable text.")
498
+ return text
499
+ except Exception as e:
500
+ raise HTTPException(
501
+ status_code=400,
502
+ detail=f"Failed to parse docx file: {e}. Try opening it in Word/Google Docs and exporting again as .docx or PDF."
503
+ )
504
+
505
+ # PDF
506
+ if filename.endswith(".pdf") or "pdf" in content_type:
507
+ try:
508
+ f = io.BytesIO(data)
509
+ reader = PyPDF2.PdfReader(f)
510
+ texts = []
511
+ for pg in range(len(reader.pages)):
512
+ try:
513
+ txt = reader.pages[pg].extract_text() or ""
514
+ except Exception:
515
+ txt = ""
516
+ texts.append(txt)
517
+ return "\n".join(texts)
518
+ except Exception as e:
519
+ raise HTTPException(status_code=400, detail=f"Failed to parse PDF file: {e}")
520
+
521
+ raise HTTPException(
522
+ status_code=415,
523
+ detail="Unsupported file type. Use .txt, .pdf, or .docx",
524
+ )
525
+
526
+
527
+ # ------------------ GRAMMAR (LANGUAGETOOL INTEGRATION) ------------------
528
+ lt_tool = None
529
+ if language_tool_python is not None:
530
+ try:
531
+ lt_tool = language_tool_python.LanguageTool("en-US")
532
+ print("[LanguageTool] Loaded (local Java-backed checker)")
533
+ except Exception as e:
534
+ lt_tool = None
535
+ print("[LanguageTool] Could not start local LanguageTool — falling back. Error:", e)
536
+ else:
537
+ print("[LanguageTool] library not installed; falling back to heuristics.")
538
+
539
+
540
+ def grammar_with_languagetool(text: str):
541
+ parts = text.strip().split()
542
+ if len(parts) > 1000:
543
+ text_proc = " ".join(parts[:1000])
544
+ else:
545
+ text_proc = text.strip()
546
+
547
+ matches = lt_tool.check(text_proc)
548
+ corrected = language_tool_python.utils.correct(text_proc, matches)
549
+ corrections = len(matches)
550
+ return corrected, corrections, len(text_proc.split())
551
+
552
+
553
+ # ------------------ PLAGIARISM HELPERS (COMBINED ENGINE) ------------------
554
+ def _clean_for_jaccard(t: str):
555
+ t = t.lower()
556
+ t = re.sub(r"[^a-z0-9\s]", " ", t)
557
+ return [w for w in t.split() if w]
558
+
559
+
560
+ def _jaccard_similarity(a, b):
561
+ sa = set(a)
562
+ sb = set(b)
563
+ if not sa or not sb:
564
+ return 0.0
565
+ return len(sa & sb) / len(sa | sb)
566
+
567
+
568
+ def demo_plagiarism_fallback(text: str):
569
+ """
570
+ Simple Jaccard-based fallback using a tiny built-in sample set.
571
+ Used when no TF-IDF / semantic corpus is available.
572
+ """
573
+ SAMPLE_DOCS = [
574
+ {"title": "AI for Social Good",
575
+ "text": "Artificial intelligence is transforming multiple industries by automating routine tasks and enabling data driven decision making for social impact and efficiency."},
576
+ {"title": "IoT in Smart Cities",
577
+ "text": "The Internet of Things connects sensors, devices, and cloud platforms to enable real time monitoring and control in smart cities including lighting, traffic, and waste management."},
578
+ {"title": "Climate & Renewable Energy",
579
+ "text": "Climate change is a critical global challenge that demands renewable energy, efficient resource management, and international cooperation to ensure a sustainable future."},
580
+ ]
581
+
582
+ input_words = _clean_for_jaccard(text)
583
+ best_score = 0.0
584
+ matches = []
585
+ for doc in SAMPLE_DOCS:
586
+ doc_words = _clean_for_jaccard(doc["text"])
587
+ score = _jaccard_similarity(input_words, doc_words)
588
+ matches.append({"title": doc["title"], "score": round(score * 100, 2)})
589
+ if score > best_score:
590
+ best_score = score
591
+
592
+ matches.sort(key=lambda x: x["score"], reverse=True)
593
+ plagiarism_percent = round(best_score * 100, 2)
594
+ summary = f"Plagiarism estimate (demo Jaccard): {plagiarism_percent}%"
595
+ return {"plagiarism_percent": plagiarism_percent, "matches": matches[:5], "summary": summary}
596
+
597
+
598
+ def corpus_plagiarism_combined(text: str):
599
+ """
600
+ Combined plagiarism score using:
601
+ - TF-IDF cosine similarity
602
+ - Semantic embedding cosine similarity (SentenceTransformers)
603
+
604
+ Returns dict matching API schema:
605
+ { plagiarism_percent, matches, summary }
606
+ """
607
+ if not corpus_texts:
608
+ raise ValueError("No corpus texts loaded")
609
+
610
+ sims_tfidf = None
611
+ sims_emb = None
612
+
613
+ words = text.split()
614
+ if len(words) > 3000:
615
+ text_proc = " ".join(words[:3000])
616
+ else:
617
+ text_proc = text
618
+
619
+ # TF-IDF similarity
620
+ if vectorizer is not None and corpus_tfidf is not None:
621
+ q = vectorizer.transform([text_proc])
622
+ sims_tfidf = cosine_similarity(q, corpus_tfidf)[0]
623
+
624
+ # Semantic similarity
625
+ if emb_model is not None and corpus_emb is not None:
626
+ q_emb = emb_model.encode(
627
+ [text_proc],
628
+ convert_to_numpy=True,
629
+ normalize_embeddings=True,
630
+ show_progress_bar=False,
631
+ )[0]
632
+ sims_emb = corpus_emb @ q_emb # normalized → dot = cosine
633
+
634
+ if sims_tfidf is None and sims_emb is None:
635
+ raise ValueError("No plagiarism backends (TF-IDF / embeddings) are available")
636
+
637
+ n_docs = len(corpus_texts)
638
+ combined_rows = []
639
+ alpha = PLAG_ALPHA # TF-IDF weight
640
+
641
+ for i in range(n_docs):
642
+ tf = float(sims_tfidf[i]) if sims_tfidf is not None else None
643
+ se = float(sims_emb[i]) if sims_emb is not None else None
644
+ if tf is None and se is None:
645
+ continue
646
+
647
+ if tf is not None and se is not None:
648
+ score = alpha * tf + (1.0 - alpha) * se
649
+ elif tf is not None:
650
+ score = tf
651
+ else:
652
+ score = se
653
+
654
+ combined_rows.append({
655
+ "index": i,
656
+ "combined": score,
657
+ "tfidf": tf,
658
+ "semantic": se,
659
+ })
660
+
661
+ if not combined_rows:
662
+ raise ValueError("No scores computed for corpus documents")
663
+
664
+ combined_rows.sort(key=lambda x: x["combined"], reverse=True)
665
+ top = combined_rows[:10]
666
+
667
+ best = top[0]["combined"]
668
+ plagiarism_percent = round(best * 100, 2)
669
+
670
+ matches = []
671
+ for row in top:
672
+ matches.append({
673
+ "title": corpus_titles[row["index"]],
674
+
675
+
676
+ "score": round(row["combined"] * 100, 2),
677
+ "tfidf_score": round(row["tfidf"] * 100, 2) if row["tfidf"] is not None else None,
678
+ "semantic_score": round(row["semantic"] * 100, 2) if row["semantic"] is not None else None,
679
+ })
680
+
681
+ components = []
682
+ if sims_tfidf is not None:
683
+ components.append("TF-IDF")
684
+ if sims_emb is not None:
685
+ components.append("semantic embeddings")
686
+ comp_str = " + ".join(components)
687
+
688
+ summary = f"Plagiarism estimate (combined {comp_str}): {plagiarism_percent}%"
689
+ return {"plagiarism_percent": plagiarism_percent, "matches": matches, "summary": summary}
690
+
691
+
692
+ # ------------------ ENDPOINTS ------------------
693
+
694
+ @app.post("/api/signup")
695
+ def signup(req: SignupRequest):
696
+ cur.execute("SELECT id FROM users WHERE email = ?", (req.email,))
697
+ if cur.fetchone():
698
+ raise HTTPException(status_code=400, detail="Email already registered")
699
+
700
+ pw_hash = hash_password(req.password)
701
+ created_at = now_iso()
702
+ cur.execute(
703
+ "INSERT INTO users (name, email, password_hash, created_at) VALUES (?, ?, ?, ?)",
704
+ (req.name, req.email, pw_hash, created_at),
705
+ )
706
+ conn.commit()
707
+ user_id = cur.lastrowid
708
+ token = create_token(user_id, req.email)
709
+
710
+ return {
711
+ "message": "Signup successful",
712
+ "token": token,
713
+ "name": req.name,
714
+ "email": req.email,
715
+ }
716
+
717
+
718
+ @app.post("/api/login")
719
+ def login(req: LoginRequest):
720
+ cur.execute("SELECT * FROM users WHERE email = ?", (req.email,))
721
+ row = cur.fetchone()
722
+ if not row or not verify_password(req.password, row["password_hash"]):
723
+ raise HTTPException(status_code=401, detail="Invalid email or password")
724
+
725
+ token = create_token(row["id"], row["email"])
726
+ return {
727
+ "message": "Login successful",
728
+ "token": token,
729
+ "name": row["name"],
730
+ "email": row["email"],
731
+ }
732
+
733
+
734
+ @app.post("/api/grammar-check")
735
+ def api_grammar_check(req: TextRequest, user=Depends(get_current_user)):
736
+ text = req.text or ""
737
+ if not text.strip():
738
+ raise HTTPException(status_code=400, detail="Text is required")
739
+
740
+ # Prefer GECToR → LanguageTool → heuristics
741
+ if GEC_MODEL is not None:
742
+ corrected, corrections, original_words = gector_correct(text)
743
+ summary = f"GECToR neural GEC: {corrections} edits; words analysed: {original_words}"
744
+ elif lt_tool is not None:
745
+ corrected, corrections, original_words = grammar_with_languagetool(text)
746
+ summary = f"LanguageTool corrections: {corrections}; words analysed: {original_words}"
747
+ else:
748
+ corrected, corrections, original_words = simple_grammar_correct(text)
749
+ summary = f"HEURISTIC corrections: {corrections}; words analysed: {original_words}"
750
+
751
+ save_history(user["id"], "grammar", text, summary)
752
+
753
+ return {
754
+ "original_words": original_words,
755
+ "corrections": corrections,
756
+ "corrected_text": corrected,
757
+ "summary": summary,
758
+ }
759
+
760
+
761
+ @app.post("/api/grammar-check-file")
762
+ def api_grammar_check_file(file: UploadFile = File(...), user=Depends(get_current_user)):
763
+ text = extract_text_from_upload(file).strip()
764
+ if not text:
765
+ raise HTTPException(status_code=400, detail="Uploaded file contains no text")
766
+
767
+ if GEC_MODEL is not None:
768
+ corrected, corrections, original_words = gector_correct(text)
769
+ summary = f"GECToR neural GEC: {corrections} edits; words analysed: {original_words}"
770
+ elif lt_tool is not None:
771
+ corrected, corrections, original_words = grammar_with_languagetool(text)
772
+ summary = f"LanguageTool corrections: {corrections}; words analysed: {original_words}"
773
+ else:
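+ # Heuristic path only: keep just the first 1000 words of very long uploads
+ # before running the simple fallback corrector.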
774
+ parts = text.strip().split()
775
+ if len(parts) > 1000:
776
+ text = " ".join(parts[:1000])
777
+ corrected, corrections, original_words = simple_grammar_correct(text)
778
+ summary = f"HEURISTIC corrections: {corrections}; words analysed: {original_words}"
779
+
780
+ save_history(user["id"], "grammar", text, summary)
781
+
782
+ return {
783
+ "original_words": original_words,
784
+ "corrections": corrections,
785
+ "corrected_text": corrected,
786
+ "summary": summary,
787
+ }
788
+
789
+
790
+ # ------------------ PLAGIARISM ENDPOINTS (COMBINED) ------------------
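+ # Hypothetical client call (sketch only — host/port and the Bearer-token header
+ # depend on how the app is served and how get_current_user reads credentials):
+ #   curl -X POST http://localhost:8000/api/plagiarism-check \
+ #     -H "Authorization: Bearer <token>" -H "Content-Type: application/json" \
+ #     -d '{"text": "Paste the passage to check here."}'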
791
+ @app.post("/api/plagiarism-check")
792
+ def api_plagiarism_check(req: TextRequest, user=Depends(get_current_user)):
793
+ text = req.text or ""
794
+ if not text.strip():
795
+ raise HTTPException(status_code=400, detail="Text is required")
796
+
797
+ # First try full combined engine (TF-IDF + embeddings) with corpus
798
+ try:
799
+ result = corpus_plagiarism_combined(text)
800
+ save_history(user["id"], "plagiarism", text, result["summary"])
801
+ return result
802
+ except Exception as e:
803
+ print("[Plagiarism] Combined corpus engine failed, falling back to demo:", e)
804
+
805
+ # Fallback: small Jaccard demo
806
+ result = demo_plagiarism_fallback(text)
807
+ save_history(user["id"], "plagiarism", text, result["summary"])
808
+ return result
809
+
810
+
811
+ @app.post("/api/plagiarism-check-file")
812
+ def api_plagiarism_check_file(file: UploadFile = File(...), user=Depends(get_current_user)):
813
+ text = extract_text_from_upload(file).strip()
814
+ if not text:
815
+ raise HTTPException(status_code=400, detail="Uploaded file contains no text")
816
+
817
+ try:
818
+ result = corpus_plagiarism_combined(text)
819
+ save_history(user["id"], "plagiarism", text, result["summary"])
820
+ return result
821
+ except Exception as e:
822
+ print("[Plagiarism-file] Combined corpus engine failed, falling back to demo:", e)
823
+
824
+ # Fallback to demo if corpus/engines unavailable
825
+ result = demo_plagiarism_fallback(text)
826
+ save_history(user["id"], "plagiarism", text, result["summary"])
827
+ return result
828
+
829
+
830
+ # ------------------ AI CHECK (TEXT & FILE) ------------------
831
+ def heuristic_ai_score(text: str):
832
+ words = re.sub(r"[^a-z0-9\s]", " ", text.lower()).split()
833
+ word_count = len(words)
834
+ unique_ratio = len(set(words)) / (word_count or 1)
835
+ sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
836
+ avg_sentence_length = word_count / (len(sentences) or 1)
837
+
838
+ ai_score = 0
839
+ if unique_ratio < 0.45:
840
+ ai_score += 40
841
+ elif unique_ratio < 0.6:
842
+ ai_score += 20
843
+
844
+ if avg_sentence_length > 25:
845
+ ai_score += 40
846
+ elif avg_sentence_length > 18:
847
+ ai_score += 25
848
+
849
+ if word_count > 400:
850
+ ai_score += 10
851
+
852
+ ai_score = min(100, round(ai_score))
853
+ human_score = 100 - ai_score
854
+ return ai_score, human_score, word_count, avg_sentence_length, unique_ratio
855
+
856
+
857
+ @app.post("/api/ai-check")
858
+ def api_ai_check(req: TextRequest, user=Depends(get_current_user)):
859
+ text = (req.text or "").strip()
860
+ if not text:
861
+ raise HTTPException(status_code=400, detail="Text is required")
862
+
863
+ if model is not None and tokenizer is not None:
864
+ try:
865
+ max_len = getattr(tokenizer, "model_max_length", 512)
866
+ if max_len is None or max_len > 1024:
867
+ max_len = 512
868
+
869
+ words = text.split()
870
+ chunk_size = min(400, max_len - 10)
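+ # Split long inputs into ~400-word chunks so each piece stays within the
+ # detector's context window; tokenizer truncation below is the safety net
+ # if a chunk still overflows.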
871
+ chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
872
+ probs = []
873
+ for chunk in chunks:
874
+ inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=max_len)
875
+ inputs = {k: v.to(device) for k, v in inputs.items()}
876
+ with torch.no_grad():
877
+ outputs = model(**inputs)
878
+ logits = outputs.logits
879
+ p = torch.softmax(logits, dim=1).cpu().numpy()[0]
880
+ ai_prob = float(p[1]) if p.shape[0] > 1 else float(p[0])
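+ # Assumes class index 1 is the "AI-generated" label for the configured
+ # AI_DETECTOR_MODEL; single-logit heads fall back to the only probability.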
881
+ probs.append(ai_prob)
882
+ avg_ai_prob = float(np.mean(probs)) if probs else 0.0
883
+ ai_percent = round(avg_ai_prob * 100, 2)
884
+ human_percent = round(100 - ai_percent, 2)
885
+ words_count = len(words)
886
+ sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
887
+ avg_sentence_len = round(words_count / (len(sentences) or 1), 2)
888
+ summary = f"Model: {AI_DETECTOR_MODEL}; AI probability: {ai_percent}%"
889
+ save_history(user["id"], "ai", text, summary)
890
+ return {
891
+ "ai_percent": ai_percent,
892
+ "human_percent": human_percent,
893
+ "word_count": words_count,
894
+ "avg_sentence_length": avg_sentence_len,
895
+ "summary": summary,
896
+ }
897
+ except Exception as e:
898
+ print("[AI-check] model inference failed:", e)
899
+
900
+ ai_percent, human_percent, wc, avg_len, uniq = heuristic_ai_score(text)
901
+ summary = f"HEURISTIC fallback — AI probability: {ai_percent}%"
902
+ save_history(user["id"], "ai", text, summary)
903
+ return {
904
+ "ai_percent": ai_percent,
905
+ "human_percent": human_percent,
906
+ "word_count": wc,
907
+ "avg_sentence_length": avg_len,
908
+ "unique_ratio": round(uniq, 3),
909
+ "summary": summary,
910
+ }
911
+
912
+
913
+ @app.post("/api/ai-check-file")
914
+ def api_ai_check_file(file: UploadFile = File(...), user=Depends(get_current_user)):
915
+ text = extract_text_from_upload(file).strip()
916
+ if not text:
917
+ raise HTTPException(status_code=400, detail="Uploaded file contains no text")
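+ # Reuse the text endpoint's logic directly; FastAPI's route decorator returns
+ # the original function, so it can be called like a plain function here.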
918
+ return api_ai_check(TextRequest(text=text), user)
919
+
920
+
921
+ # ------------------ HISTORY ------------------
922
+ @app.get("/api/history")
923
+ def api_history(user=Depends(get_current_user)):
924
+ cur.execute(
925
+ "SELECT id, tool, input_text, result_summary, created_at "
926
+ "FROM history WHERE user_id = ? "
927
+ "ORDER BY created_at DESC LIMIT 50",
928
+ (user["id"],),
929
+ )
930
+ rows = cur.fetchall()
931
+ items = []
932
+ for r in rows:
933
+ items.append(
934
+ {
935
+ "id": r["id"],
936
+ "tool": r["tool"],
937
+ "input_text": r["input_text"],
938
+ "summary": r["result_summary"],
939
+ "created_at": r["created_at"],
940
+ }
941
+ )
942
+ return {"items": items}
943
+
944
+
945
+ @app.get("/")
946
+ def read_root():
947
  return {"status": "Backend is running with 16GB RAM!"}