Spaces:

GopalKrushnaMahapatra
/

TrueWrite-Scan-Backend

Running

App Files Files Community

TrueWrite-Scan-Backend / app.py

GopalKrushnaMahapatra

Update app.py

a9d08a9 verified 5 days ago

raw

history blame contribute delete

49.4 kB

	# app.py (was: backend/main.py)
	import os
	import re
	import io
	import sqlite3
	from datetime import datetime, timezone

	from dotenv import load_dotenv
	from fastapi import FastAPI, HTTPException, status, Header, Depends, File, UploadFile, Form
	from fastapi.middleware.cors import CORSMiddleware
	from fastapi.responses import FileResponse
	from pydantic import BaseModel, EmailStr
	from passlib.context import CryptContext
	import jwt

	# File parsing libs
	from docx import Document as DocxDocument
	import PyPDF2

	# ML / NLP libs
	from transformers import AutoTokenizer, AutoModelForSequenceClassification
	import torch
	import numpy as np

	# TF-IDF
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity

	# Semantic embeddings for plagiarism (combined approach)
	try:
	from sentence_transformers import SentenceTransformer
	except Exception:
	SentenceTransformer = None

	# LanguageTool (may require Java)
	try:
	import language_tool_python
	except Exception:
	language_tool_python = None

	# GECToR (neural grammatical error correction)
	try:
	# This is the official import path from gotutiyan/gector README
	from gector import GECToR, predict as gector_predict, load_verb_dict
	except Exception:
	GECToR = None
	gector_predict = None
	load_verb_dict = None

	# PDF generator (the new file)
	from pdf_reports import generate_report

	# ------------------ ENV & DB SETUP ------------------
	# Pull settings from a local .env file (if any) before the os.getenv reads below.
	load_dotenv()

	# NOTE(review): when JWT_SECRET is unset, tokens are signed with this placeholder
	# value that is visible in the source; set JWT_SECRET in every real deployment.
	JWT_SECRET = os.getenv("JWT_SECRET", "super_secret_key_change_this")
	JWT_ALGO = os.getenv("JWT_ALGO", "HS256")
	DB_PATH = os.getenv("DB_PATH", "truewrite.db")
	# CORPUS_RAW holds source documents; CORPUS_DIR holds the .txt files built from them.
	CORPUS_DIR = os.getenv("CORPUS_DIR", "corpus")
	CORPUS_RAW = os.getenv("CORPUS_RAW", "corpus_raw")

	# Combined plagiarism weights
	PLAG_ALPHA = float(os.getenv("PLAG_ALPHA", "0.4")) # TF-IDF weight; (1-alpha) for embeddings

	# Password hashing context (pbkdf2_sha256 scheme).
	pwd_context = CryptContext(schemes=["pbkdf2_sha256"], deprecated="auto")

	# SQLite DB (simple demo)
	# check_same_thread=False allows this single connection to be used from any thread.
	# NOTE(review): `cur` is one module-level cursor reused by every function that
	# references it; concurrent use can interleave execute()/fetch calls.
	conn = sqlite3.connect(DB_PATH, check_same_thread=False)
	conn.row_factory = sqlite3.Row
	cur = conn.cursor()

	# Create tables if not exist
	# users: one row per account; email is UNIQUE.
	cur.execute("""
	CREATE TABLE IF NOT EXISTS users (
	id INTEGER PRIMARY KEY AUTOINCREMENT,
	name TEXT NOT NULL,
	email TEXT NOT NULL UNIQUE,
	password_hash TEXT NOT NULL,
	created_at TEXT NOT NULL
	)
	""")

	# history: one row per tool run, linked to users(id).
	cur.execute("""
	CREATE TABLE IF NOT EXISTS history (
	id INTEGER PRIMARY KEY AUTOINCREMENT,
	user_id INTEGER NOT NULL,
	tool TEXT NOT NULL,
	input_text TEXT,
	result_summary TEXT,
	created_at TEXT NOT NULL,
	FOREIGN KEY (user_id) REFERENCES users(id)
	)
	""")

	conn.commit()

	# ------------------ FASTAPI APP ------------------
	app = FastAPI(title="TrueWrite Scan (Python Backend)")

	# NOTE(review): every http/https origin is accepted while credentials are also
	# allowed; confirm this is intended before exposing the API publicly.
	app.add_middleware(
	    CORSMiddleware,
	    # This regex allows ANY URL (HTTP or HTTPS) to connect
	    allow_origin_regex=r"https?://.*",
	    allow_credentials=True,
	    allow_methods=["*"],
	    allow_headers=["*"],
	)

	# ------------------ MODELS ------------------
	class SignupRequest(BaseModel):
	    """Request body for POST /api/signup."""

	    name: str
	    email: EmailStr
	    # Plain-text password; it is hashed by the signup handler before storage.
	    password: str


	class LoginRequest(BaseModel):
	    """Request body for POST /api/login."""

	    email: EmailStr
	    password: str


	class TextRequest(BaseModel):
	    """Request body carrying the raw text for the text-based check endpoints."""

	    text: str


	# ------------------ AUTH HELPERS ------------------
	def hash_password(pw: str) -> str:
	    """Return the hash of `pw` produced by the module's password context."""
	    digest = pwd_context.hash(pw)
	    return digest


	def verify_password(plain: str, hashed: str) -> bool:
	    """Return whether `plain` matches the stored hash `hashed`."""
	    is_match = pwd_context.verify(plain, hashed)
	    return is_match


	def create_token(user_id: int, email: str) -> str:
	    """Return a signed JWT carrying `user_id` and `email`.

	    NOTE(review): no `exp` claim is set, so issued tokens do not expire.
	    """
	    encoded = jwt.encode(
	        {"user_id": user_id, "email": email},
	        JWT_SECRET,
	        algorithm=JWT_ALGO,
	    )
	    # jwt.encode may hand back bytes (presumably older PyJWT); always return str.
	    return encoded.decode("utf-8") if isinstance(encoded, bytes) else encoded


	def decode_token(token: str):
	    """Verify `token` and return its claims.

	    Raises:
	        HTTPException: 401 "Invalid token" when PyJWT rejects the token.
	    """
	    try:
	        claims = jwt.decode(token, JWT_SECRET, algorithms=[JWT_ALGO])
	    except jwt.PyJWTError:
	        raise HTTPException(
	            status_code=status.HTTP_401_UNAUTHORIZED,
	            detail="Invalid token",
	        )
	    return claims


	def get_current_user(authorization: str = Header(None)):
	    """FastAPI dependency: resolve the caller from an `Authorization: Bearer` header.

	    Returns:
	        dict with the user's `id`, `name` and `email`.

	    Raises:
	        HTTPException: 401 when the header is missing or malformed, the token is
	            invalid, or the referenced user no longer exists.
	    """
	    if not authorization or not authorization.startswith("Bearer "):
	        raise HTTPException(
	            status_code=status.HTTP_401_UNAUTHORIZED,
	            detail="Missing token",
	        )
	    token = authorization.split(" ", 1)[1]
	    payload = decode_token(token)
	    user_id = payload.get("user_id")

	    # conn.execute() creates a fresh cursor for this call, so a concurrent request
	    # cannot overwrite the result set between execute() and fetchone().
	    row = conn.execute("SELECT * FROM users WHERE id = ?", (user_id,)).fetchone()
	    if not row:
	        raise HTTPException(
	            status_code=status.HTTP_401_UNAUTHORIZED,
	            detail="User not found",
	        )
	    return {"id": row["id"], "name": row["name"], "email": row["email"]}


	def now_iso():
	    """Return the current UTC time as a timezone-aware ISO-8601 string."""
	    moment = datetime.now(tz=timezone.utc)
	    return moment.isoformat()


	def save_history(user_id: int, tool: str, input_text: str, summary: str):
	    """Insert one history row for `user_id` and commit it.

	    Only the first 500 characters of `input_text` are stored (followed by "..."
	    when truncated). A `None` input is stored as an empty string.
	    """
	    text = input_text or ""
	    trimmed = (text[:500] + "...") if len(text) > 500 else text
	    # conn.execute() uses its own cursor instead of the shared module-level one,
	    # so concurrent requests do not interfere with each other's statements.
	    conn.execute(
	        "INSERT INTO history (user_id, tool, input_text, result_summary, created_at) VALUES (?, ?, ?, ?, ?)",
	        (user_id, tool, trimmed, summary, now_iso()),
	    )
	    conn.commit()


	# ------------------ TEXT HELPERS ------------------
	def count_words(text: str) -> int:
	    """Return the number of whitespace-separated tokens in `text` (0 if blank)."""
	    # str.split() with no separator already ignores leading/trailing whitespace
	    # and yields an empty list for blank input.
	    return len(text.split())


	def simple_grammar_correct(text: str):
	    """Old heuristic grammar fixer (kept as fallback).

	    Applies, in order: collapse runs of whitespace to one space, capitalise the
	    standalone pronoun "i", capitalise the first letter of the text and of each
	    sentence, and append a final period when terminal punctuation is missing.

	    Returns:
	        (corrected_text, corrections, original_words) where `corrections` counts
	        how many of the four rules changed the text and `original_words` is the
	        whitespace-token count of the input.
	    """
	    original_words = len(text.split())
	    corrections = 0

	    rewrites = (
	        (r"\s{2,}", " "),
	        (r"\bi\b", "I"),
	        # Alternation must be a bare `|`; an escaped `\|` matches a literal pipe
	        # and would never capitalise anything.
	        (r"(^\s*\w|[.!?]\s+\w)", lambda m: m.group(0).upper()),
	    )
	    for pattern, replacement in rewrites:
	        updated = re.sub(pattern, replacement, text)
	        if updated != text:
	            corrections += 1
	        text = updated

	    stripped = text.strip()
	    if stripped and not re.search(r"[.!?]\s*$", stripped):
	        text = stripped + "."
	        corrections += 1

	    return text, corrections, original_words


	# ------------------ CORPUS BUILDING (from corpus_raw -> corpus) ------------------
	def extract_from_docx_path(path: str) -> str:
	    """Return the paragraph texts of the .docx at `path`, joined by newlines."""
	    document = DocxDocument(path)
	    return "\n".join(para.text for para in document.paragraphs)


	def extract_from_pdf_path(path: str) -> str:
	    """Return the text of every page of the PDF at `path`, joined by newlines.

	    A page whose extraction fails contributes an empty string.
	    """
	    page_texts = []
	    with open(path, "rb") as handle:
	        pdf = PyPDF2.PdfReader(handle)
	        page_total = len(pdf.pages)
	        for page_no in range(page_total):
	            try:
	                extracted = pdf.pages[page_no].extract_text() or ""
	            except Exception:
	                extracted = ""
	            page_texts.append(extracted)
	    return "\n".join(page_texts)


	def build_corpus_from_raw(raw_dir: str = CORPUS_RAW, out_dir: str = CORPUS_DIR):
	    """Convert .pdf / .docx / .txt files in `raw_dir` into .txt files in `out_dir`.

	    Both directories are created if missing. Each output file is named after
	    the source file's stem. Unsupported files are skipped and per-file failures
	    are printed, not raised.
	    """
	    for folder in (raw_dir, out_dir):
	        os.makedirs(folder, exist_ok=True)

	    for entry in os.listdir(raw_dir):
	        source = os.path.join(raw_dir, entry)
	        if not os.path.isfile(source):
	            continue

	        stem = os.path.splitext(entry)[0]
	        target = os.path.join(out_dir, stem + ".txt")
	        lowered = entry.lower()
	        try:
	            if lowered.endswith(".docx"):
	                content = extract_from_docx_path(source)
	            elif lowered.endswith(".pdf"):
	                content = extract_from_pdf_path(source)
	            elif lowered.endswith(".txt"):
	                with open(source, "r", encoding="utf-8", errors="ignore") as src:
	                    content = src.read()
	            else:
	                print("[CorpusRaw] Skipping unsupported:", entry)
	                continue

	            cleaned = content.strip()
	            with open(target, "w", encoding="utf-8") as sink:
	                sink.write(cleaned)
	            print("[CorpusRaw] Wrote:", target)
	        except Exception as e:
	            print("[CorpusRaw] Failed", entry, "->", e)


	# ------------------ TF-IDF CORPUS LOADING ------------------
	# Module state filled by load_corpus(): fitted vectorizer, corpus matrix,
	# and the parallel lists of file names and file contents.
	vectorizer = None
	corpus_tfidf = None
	corpus_titles = []
	corpus_texts = []


	def load_corpus(corpus_dir=CORPUS_DIR):
	    """Read every .txt file in `corpus_dir` and fit the TF-IDF index.

	    Rebinds the module globals `vectorizer`, `corpus_tfidf`, `corpus_titles`
	    and `corpus_texts`. The vectorizer and matrix are left as None when the
	    directory had to be created, holds no .txt files, or the fit fails.
	    Semantic embeddings are built separately in load_embeddings().
	    """
	    global vectorizer, corpus_tfidf, corpus_titles, corpus_texts
	    corpus_titles, corpus_texts = [], []

	    if not os.path.isdir(corpus_dir):
	        os.makedirs(corpus_dir, exist_ok=True)
	        print("[Corpus] Created empty corpus directory:", corpus_dir)
	        vectorizer = None
	        corpus_tfidf = None
	        return

	    for fname in os.listdir(corpus_dir):
	        if not fname.lower().endswith(".txt"):
	            continue
	        path = os.path.join(corpus_dir, fname)
	        try:
	            with open(path, "r", encoding="utf-8", errors="ignore") as handle:
	                body = handle.read()
	            corpus_titles.append(fname)
	            corpus_texts.append(body)
	        except Exception as e:
	            print(f"[Corpus] Failed to read {path}: {e}")

	    if not corpus_texts:
	        vectorizer = None
	        corpus_tfidf = None
	        print("[Corpus] No .txt documents found in", corpus_dir)
	        return

	    try:
	        vectorizer = TfidfVectorizer(
	            ngram_range=(1, 3),
	            stop_words="english",
	            max_features=50000,
	        )
	        corpus_tfidf = vectorizer.fit_transform(corpus_texts)
	        print(f"[Corpus] Loaded {len(corpus_texts)} documents into TF-IDF index")
	    except Exception as e:
	        print("[Corpus] TF-IDF build failed:", e)
	        vectorizer = None
	        corpus_tfidf = None


	# ------------------ SEMANTIC EMBEDDINGS (SentenceTransformers) ------------------
	# Module state filled by load_embeddings(): the encoder and the corpus vectors.
	emb_model = None
	corpus_emb = None
	EMB_MODEL_NAME = os.getenv("PLAG_EMB_MODEL", "sentence-transformers/all-MiniLM-L6-v2")


	def load_embeddings():
	    """Encode `corpus_texts` with sentence-transformers for semantic matching.

	    Rebinds the module globals `emb_model` and `corpus_emb`. Both are left as
	    None when the library is missing, the corpus is empty, or loading/encoding
	    fails.
	    """
	    global emb_model, corpus_emb

	    skip_message = None
	    if SentenceTransformer is None:
	        skip_message = "[Embeddings] sentence-transformers not installed; skipping semantic index."
	    elif not corpus_texts:
	        skip_message = "[Embeddings] No corpus texts available; semantic index not built."

	    if skip_message is not None:
	        print(skip_message)
	        emb_model = None
	        corpus_emb = None
	        return

	    try:
	        encoder = SentenceTransformer(EMB_MODEL_NAME)
	        vectors = encoder.encode(
	            corpus_texts,
	            convert_to_numpy=True,
	            show_progress_bar=False,
	            normalize_embeddings=True,
	        )
	    except Exception as e:
	        emb_model = None
	        corpus_emb = None
	        print("[Embeddings] Failed to load or encode corpus:", e)
	        return

	    emb_model = encoder
	    corpus_emb = vectors
	    print(f"[Embeddings] Loaded '{EMB_MODEL_NAME}' and encoded {len(corpus_texts)} corpus docs.")


	# Build corpus & embeddings at startup
	# These run at import time, in this order: convert raw files to .txt, fit the
	# TF-IDF index on them, then encode the same texts for semantic matching.
	build_corpus_from_raw()
	load_corpus()
	load_embeddings()

	# ------------------ HF MODEL LOADING (AI Detector) ------------------
	# Hugging Face model id used by the AI-text detector endpoints.
	AI_DETECTOR_MODEL = "openai-community/roberta-base-openai-detector"
	# All three stay None when loading fails; api_ai_check then uses its heuristic.
	tokenizer = None
	model = None
	device = None

	try:
	    tokenizer = AutoTokenizer.from_pretrained(AI_DETECTOR_MODEL)
	    model = AutoModelForSequenceClassification.from_pretrained(AI_DETECTOR_MODEL)
	    # Switch to inference mode.
	    model.eval()
	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	    model.to(device)
	    print(f"[AI Detector] Loaded {AI_DETECTOR_MODEL} on {device}")
	except Exception as e:
	    # Reset everything so a partially loaded state is never used.
	    tokenizer = None
	    model = None
	    device = None
	    print("[AI Detector] Failed to load HF model — using heuristic fallback. Error:", e)

	# ------------------ GECToR LOADING (Neural GEC) ------------------
	# GECToR state; all four stay None unless the library imported and loading succeeds.
	GEC_MODEL = None
	GEC_TOKENIZER = None
	GEC_ENCODE = None
	GEC_DECODE = None
	GEC_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	if GECToR is not None and gector_predict is not None and load_verb_dict is not None:
	    try:
	        print("[GECToR] Initializing model... (This may take a bit on first run)")
	        # Model id and verb-dictionary path can be overridden via the environment.
	        GEC_MODEL_ID = os.getenv("GEC_MODEL_ID", "gotutiyan/gector-roberta-base-5k")
	        VERB_DICT_PATH = os.getenv("GEC_VERB_DICT", "/app/data/verb-form-vocab.txt")

	        GEC_MODEL = GECToR.from_pretrained(GEC_MODEL_ID).to(GEC_DEVICE)
	        GEC_TOKENIZER = AutoTokenizer.from_pretrained(GEC_MODEL_ID)
	        GEC_ENCODE, GEC_DECODE = load_verb_dict(VERB_DICT_PATH)

	        print(f"[GECToR] Model & verb dict loaded: {GEC_MODEL_ID}")
	    except Exception as e:
	        print(f"[GECToR] Failed to load. Error: {e}")
	        # Reset everything so gector_correct() sees a consistent "not loaded" state.
	        GEC_MODEL = None
	        GEC_TOKENIZER = None
	        GEC_ENCODE = None
	        GEC_DECODE = None
	else:
	    print("[GECToR] Library not available; skipping neural GEC.")


	def gector_correct(text: str):
	    """Run neural grammatical error correction using GECToR.

	    Only the first 1000 whitespace tokens of `text` are processed; anything
	    beyond that is dropped from the returned text.

	    Returns:
	        (corrected_text, corrections, original_words). `corrections` is the
	        number of token-level edits between the processed input and the model
	        output. If the model is not loaded the input is returned unchanged with
	        0 corrections; if prediction fails the processed input is returned with
	        0 corrections.
	    """
	    import difflib

	    if GEC_MODEL is None or GEC_TOKENIZER is None or GEC_ENCODE is None or GEC_DECODE is None:
	        print("[GECToR] Model not loaded, skipping.")
	        return text, 0, len(text.split())

	    stripped = text.strip()
	    parts = stripped.split()
	    # Safety truncate (protect server)
	    text_proc = " ".join(parts[:1000]) if len(parts) > 1000 else stripped

	    if not text_proc:
	        return text_proc, 0, 0

	    try:
	        corrected_list = gector_predict(
	            GEC_MODEL,
	            GEC_TOKENIZER,
	            [text_proc],
	            GEC_ENCODE,
	            GEC_DECODE,
	            keep_confidence=0.0,
	            min_error_prob=0.0,
	            n_iteration=5,
	            batch_size=2,
	        )
	        corrected_text = corrected_list[0]

	        orig_tokens = text_proc.split()
	        corr_tokens = corrected_text.split()
	        # Align the two token sequences before counting. A position-by-position
	        # comparison marks every token after an insertion or deletion as changed
	        # and ignores any length difference.
	        matcher = difflib.SequenceMatcher(a=orig_tokens, b=corr_tokens, autojunk=False)
	        corrections = sum(
	            max(i2 - i1, j2 - j1)
	            for tag, i1, i2, j1, j2 in matcher.get_opcodes()
	            if tag != "equal"
	        )
	        return corrected_text, corrections, len(orig_tokens)

	    except Exception as e:
	        print(f"[GECToR] Prediction error: {e}")
	        return text_proc, 0, len(text_proc.split())


	# ------------------ FILE EXTRACTION HELPERS ------------------
	MAX_FILE_SIZE = 15 * 1024 * 1024 # 15 MB


	def extract_text_from_upload(upload: UploadFile) -> str:
	    """Return the text content of an uploaded .txt, .docx or .pdf file.

	    The type is chosen from the file name extension or the content type.

	    Raises:
	        HTTPException: 413 when the upload exceeds MAX_FILE_SIZE, 400 when the
	            file cannot be parsed (or a .docx has no text), 415 for any other
	            file type.
	    """
	    filename = (upload.filename or "").lower()
	    content_type = (upload.content_type or "").lower()

	    # Read one byte past the limit at most: enough to detect an oversized
	    # upload without loading an arbitrarily large file into memory.
	    data = upload.file.read(MAX_FILE_SIZE + 1)
	    try:
	        upload.file.seek(0)
	    except Exception:
	        # Best effort only; rewinding is not required for extraction.
	        pass

	    if len(data) > MAX_FILE_SIZE:
	        raise HTTPException(status_code=413, detail="File too large (max 15MB)")

	    # TXT
	    if filename.endswith(".txt") or content_type == "text/plain":
	        try:
	            return data.decode("utf-8")
	        except UnicodeDecodeError:
	            # latin-1 maps every byte value, so this decode cannot fail.
	            return data.decode("latin-1")

	    # DOCX
	    if filename.endswith(".docx") or "wordprocessingml" in content_type:
	        # Basic sanity check: valid .docx is a ZIP (PK header)
	        if not data.startswith(b"PK"):
	            raise HTTPException(
	                status_code=400,
	                detail="Uploaded file is not a valid .docx package (it might be an old .doc file or a corrupted document). "
	                "Please open it in Word/Google Docs and re-save as .docx or export as PDF, then upload again."
	            )
	        try:
	            doc = DocxDocument(io.BytesIO(data))
	            text = "\n".join(p.text for p in doc.paragraphs).strip()
	            if not text:
	                raise ValueError("DOCX contained no readable text.")
	            return text
	        except Exception as e:
	            raise HTTPException(
	                status_code=400,
	                detail=f"Failed to parse docx file: {e}. Try opening it in Word/Google Docs and exporting again as .docx or PDF."
	            ) from e

	    # PDF
	    if filename.endswith(".pdf") or "pdf" in content_type:
	        try:
	            reader = PyPDF2.PdfReader(io.BytesIO(data))
	            texts = []
	            for pg in range(len(reader.pages)):
	                try:
	                    txt = reader.pages[pg].extract_text() or ""
	                except Exception:
	                    # A page that cannot be extracted contributes no text.
	                    txt = ""
	                texts.append(txt)
	            return "\n".join(texts)
	        except Exception as e:
	            raise HTTPException(status_code=400, detail=f"Failed to parse PDF file: {e}") from e

	    raise HTTPException(
	        status_code=415,
	        detail="Unsupported file type. Use .txt, .pdf, or .docx",
	    )


	# ------------------ GRAMMAR (LANGUAGETOOL INTEGRATION) ------------------
	# Shared LanguageTool instance; stays None when the library is missing or the
	# checker cannot be started, in which case callers use another grammar engine.
	lt_tool = None
	if language_tool_python is not None:
	    try:
	        lt_tool = language_tool_python.LanguageTool("en-US")
	        print("[LanguageTool] Loaded (local Java-backed checker)")
	    except Exception as e:
	        lt_tool = None
	        print("[LanguageTool] Could not start local LanguageTool — falling back. Error:", e)
	else:
	    print("[LanguageTool] library not installed; falling back to heuristics.")


	def grammar_with_languagetool(text: str):
	    """Correct `text` with the shared LanguageTool instance.

	    Only the first 1000 whitespace tokens are checked. Callers must make sure
	    `lt_tool` is loaded before calling.

	    Returns:
	        (corrected_text, number_of_matches, words_analysed).
	    """
	    cleaned = text.strip()
	    tokens = cleaned.split()
	    text_proc = " ".join(tokens[:1000]) if len(tokens) > 1000 else cleaned

	    found = lt_tool.check(text_proc)
	    fixed = language_tool_python.utils.correct(text_proc, found)
	    return fixed, len(found), len(text_proc.split())


	# ------------------ PLAGIARISM HELPERS (COMBINED ENGINE) ------------------
	def _clean_for_jaccard(t: str):
	t = t.lower()
	t = re.sub(r"[^a-z0-9\s]", " ", t)
	return [w for w in t.split() if w]


	def _jaccard_similarity(a, b):
	sa = set(a)
	sb = set(b)
	if not sa or not sb:
	return 0.0
	return len(sa & sb) / len(sa \| sb)


	def demo_plagiarism_fallback(text: str):
	    """Score `text` against three built-in sample documents with Jaccard similarity.

	    Used when no TF-IDF / semantic corpus is available.

	    Returns:
	        dict with `plagiarism_percent` (best score, 0-100), `matches` (sorted by
	        score, best first) and `summary`.
	    """
	    SAMPLE_DOCS = [
	        {"title": "AI for Social Good",
	         "text": "Artificial intelligence is transforming multiple industries by automating routine tasks and enabling data driven decision making for social impact and efficiency."},
	        {"title": "IoT in Smart Cities",
	         "text": "The Internet of Things connects sensors, devices, and cloud platforms to enable real time monitoring and control in smart cities including lighting, traffic, and waste management."},
	        {"title": "Climate & Renewable Energy",
	         "text": "Climate change is a critical global challenge that demands renewable energy, efficient resource management, and international cooperation to ensure a sustainable future."},
	    ]

	    query_words = _clean_for_jaccard(text)
	    scored = [
	        (sample["title"], _jaccard_similarity(query_words, _clean_for_jaccard(sample["text"])))
	        for sample in SAMPLE_DOCS
	    ]

	    best_score = max((score for _, score in scored), default=0.0)
	    matches = sorted(
	        ({"title": title, "score": round(score * 100, 2)} for title, score in scored),
	        key=lambda item: item["score"],
	        reverse=True,
	    )

	    plagiarism_percent = round(best_score * 100, 2)
	    summary = f"Plagiarism estimate (demo Jaccard): {plagiarism_percent}%"
	    return {"plagiarism_percent": plagiarism_percent, "matches": matches[:5], "summary": summary}


	def corpus_plagiarism_combined(text: str):
	    """Score `text` against the loaded corpus.

	    Combines whichever backends are available:
	    - TF-IDF cosine similarity
	    - Semantic embedding cosine similarity (SentenceTransformers)

	    When both are available the score is PLAG_ALPHA * tfidf + (1 - PLAG_ALPHA)
	    * semantic. The combined score is clamped to [0, 1] so the reported
	    percentage always lies in 0-100. Only the first 3000 whitespace tokens of
	    `text` are used.

	    Returns:
	        dict matching the API schema: { plagiarism_percent, matches, summary },
	        with up to 10 matches, best first.

	    Raises:
	        ValueError: when no corpus is loaded or no backend is available.
	    """
	    if not corpus_texts:
	        raise ValueError("No corpus texts loaded")

	    words = text.split()
	    text_proc = " ".join(words[:3000]) if len(words) > 3000 else text

	    sims_tfidf = None
	    sims_emb = None

	    # TF-IDF similarity
	    if vectorizer is not None and corpus_tfidf is not None:
	        q = vectorizer.transform([text_proc])
	        sims_tfidf = cosine_similarity(q, corpus_tfidf)[0]

	    # Semantic similarity
	    if emb_model is not None and corpus_emb is not None:
	        q_emb = emb_model.encode(
	            [text_proc],
	            convert_to_numpy=True,
	            normalize_embeddings=True,
	            show_progress_bar=False,
	        )[0]
	        sims_emb = corpus_emb @ q_emb # normalized → dot = cosine

	    if sims_tfidf is None and sims_emb is None:
	        raise ValueError("No plagiarism backends (TF-IDF / embeddings) are available")

	    alpha = PLAG_ALPHA # TF-IDF weight
	    combined_rows = []
	    for i in range(len(corpus_texts)):
	        tf = float(sims_tfidf[i]) if sims_tfidf is not None else None
	        se = float(sims_emb[i]) if sims_emb is not None else None

	        if tf is not None and se is not None:
	            raw = alpha * tf + (1.0 - alpha) * se
	        elif tf is not None:
	            raw = tf
	        else:
	            raw = se

	        combined_rows.append({
	            "index": i,
	            # Embedding cosine can be negative and float error can exceed 1.0;
	            # clamp so the percentage stays meaningful.
	            "combined": min(1.0, max(0.0, raw)),
	            "tfidf": tf,
	            "semantic": se,
	        })

	    if not combined_rows:
	        raise ValueError("No scores computed for corpus documents")

	    combined_rows.sort(key=lambda x: x["combined"], reverse=True)
	    top = combined_rows[:10]

	    plagiarism_percent = round(top[0]["combined"] * 100, 2)

	    matches = [
	        {
	            "title": corpus_titles[row["index"]],
	            "score": round(row["combined"] * 100, 2),
	            "tfidf_score": round(row["tfidf"] * 100, 2) if row["tfidf"] is not None else None,
	            "semantic_score": round(row["semantic"] * 100, 2) if row["semantic"] is not None else None,
	        }
	        for row in top
	    ]

	    components = []
	    if sims_tfidf is not None:
	        components.append("TF-IDF")
	    if sims_emb is not None:
	        components.append("semantic embeddings")
	    comp_str = " + ".join(components)

	    summary = f"Plagiarism estimate (combined {comp_str}): {plagiarism_percent}%"
	    return {"plagiarism_percent": plagiarism_percent, "matches": matches, "summary": summary}


	# ------------------ ENDPOINTS ------------------

	@app.post("/api/signup")
	def signup(req: SignupRequest):
	    """Create a user account and return a token for it.

	    Raises:
	        HTTPException: 400 when the email is already registered.
	    """
	    # A cursor private to this request: `lastrowid` below must refer to this
	    # INSERT, which a cursor shared with other requests cannot guarantee.
	    db = conn.cursor()
	    db.execute("SELECT id FROM users WHERE email = ?", (req.email,))
	    if db.fetchone():
	        raise HTTPException(status_code=400, detail="Email already registered")

	    pw_hash = hash_password(req.password)
	    created_at = now_iso()
	    try:
	        db.execute(
	            "INSERT INTO users (name, email, password_hash, created_at) VALUES (?, ?, ?, ?)",
	            (req.name, req.email, pw_hash, created_at),
	        )
	    except sqlite3.IntegrityError:
	        # Another request registered the same email between the check and the
	        # insert; the UNIQUE constraint on users.email rejects the duplicate.
	        raise HTTPException(status_code=400, detail="Email already registered") from None
	    conn.commit()
	    user_id = db.lastrowid
	    token = create_token(user_id, req.email)

	    return {
	        "message": "Signup successful",
	        "token": token,
	        "name": req.name,
	        "email": req.email,
	    }


	@app.post("/api/login")
	def login(req: LoginRequest):
	    """Check the credentials and return a token for the matching user.

	    Raises:
	        HTTPException: 401 when the email is unknown or the password is wrong
	            (the same message is used for both cases).
	    """
	    # conn.execute() uses a cursor private to this call, so another request
	    # cannot replace the result set before fetchone() runs.
	    row = conn.execute("SELECT * FROM users WHERE email = ?", (req.email,)).fetchone()
	    if not row or not verify_password(req.password, row["password_hash"]):
	        raise HTTPException(status_code=401, detail="Invalid email or password")

	    token = create_token(row["id"], row["email"])
	    return {
	        "message": "Login successful",
	        "token": token,
	        "name": row["name"],
	        "email": row["email"],
	    }


	@app.post("/api/grammar-check")
	def api_grammar_check(req: TextRequest, user=Depends(get_current_user)):
	    """Grammar-check the posted text and record the run in the user's history.

	    Engine preference: GECToR, then LanguageTool, then the heuristic fixer.

	    Raises:
	        HTTPException: 400 when the text is empty or whitespace only.
	    """
	    text = req.text or ""
	    if not text.strip():
	        raise HTTPException(status_code=400, detail="Text is required")

	    if GEC_MODEL is not None:
	        fixed, n_edits, n_words = gector_correct(text)
	        summary = f"GECToR neural GEC: {n_edits} edits; words analysed: {n_words}"
	    elif lt_tool is not None:
	        fixed, n_edits, n_words = grammar_with_languagetool(text)
	        summary = f"LanguageTool corrections: {n_edits}; words analysed: {n_words}"
	    else:
	        fixed, n_edits, n_words = simple_grammar_correct(text)
	        summary = f"HEURISTIC corrections: {n_edits}; words analysed: {n_words}"

	    save_history(user["id"], "grammar", text, summary)

	    response = {
	        "original_words": n_words,
	        "corrections": n_edits,
	        "corrected_text": fixed,
	        "summary": summary,
	    }
	    return response


	@app.post("/api/grammar-check-file")
	def api_grammar_check_file(file: UploadFile = File(...), user=Depends(get_current_user)):
	    """Grammar-check an uploaded file and record the run in the user's history.

	    Engine preference: GECToR, then LanguageTool, then the heuristic fixer
	    (which here is limited to the first 1000 words).

	    Raises:
	        HTTPException: 400 when the file yields no text.
	    """
	    text = extract_text_from_upload(file).strip()
	    if not text:
	        raise HTTPException(status_code=400, detail="Uploaded file contains no text")

	    if GEC_MODEL is not None:
	        fixed, n_edits, n_words = gector_correct(text)
	        summary = f"GECToR neural GEC: {n_edits} edits; words analysed: {n_words}"
	    elif lt_tool is not None:
	        fixed, n_edits, n_words = grammar_with_languagetool(text)
	        summary = f"LanguageTool corrections: {n_edits}; words analysed: {n_words}"
	    else:
	        tokens = text.strip().split()
	        if len(tokens) > 1000:
	            # The truncated text is also what gets written to history below.
	            text = " ".join(tokens[:1000])
	        fixed, n_edits, n_words = simple_grammar_correct(text)
	        summary = f"HEURISTIC corrections: {n_edits}; words analysed: {n_words}"

	    save_history(user["id"], "grammar", text, summary)

	    response = {
	        "original_words": n_words,
	        "corrections": n_edits,
	        "corrected_text": fixed,
	        "summary": summary,
	    }
	    return response


	# ------------------ PLAGIARISM ENDPOINTS (COMBINED) ------------------
	@app.post("/api/plagiarism-check")
	def api_plagiarism_check(req: TextRequest, user=Depends(get_current_user)):
	    """Run the plagiarism check on the posted text and record it in history.

	    Uses the combined corpus engine (TF-IDF + embeddings) and falls back to the
	    small Jaccard demo when that engine fails.

	    Raises:
	        HTTPException: 400 when the text is empty or whitespace only.
	    """
	    text = req.text or ""
	    if not text.strip():
	        raise HTTPException(status_code=400, detail="Text is required")

	    # Only the engine call is guarded: a failure while writing history must not
	    # be reported as an engine failure or replace a real result with the demo.
	    try:
	        result = corpus_plagiarism_combined(text)
	    except Exception as e:
	        print("[Plagiarism] Combined corpus engine failed, falling back to demo:", e)
	        # Fallback: small Jaccard demo
	        result = demo_plagiarism_fallback(text)

	    save_history(user["id"], "plagiarism", text, result["summary"])
	    return result


	@app.post("/api/plagiarism-check-file")
	def api_plagiarism_check_file(file: UploadFile = File(...), user=Depends(get_current_user)):
	    """Run the plagiarism check on an uploaded file and record it in history.

	    Uses the combined corpus engine and falls back to the small Jaccard demo
	    when that engine fails.

	    Raises:
	        HTTPException: 400 when the file yields no text.
	    """
	    text = extract_text_from_upload(file).strip()
	    if not text:
	        raise HTTPException(status_code=400, detail="Uploaded file contains no text")

	    # Only the engine call is guarded: a failure while writing history must not
	    # be reported as an engine failure or replace a real result with the demo.
	    try:
	        result = corpus_plagiarism_combined(text)
	    except Exception as e:
	        print("[Plagiarism-file] Combined corpus engine failed, falling back to demo:", e)
	        # Fallback to demo if corpus/engines unavailable
	        result = demo_plagiarism_fallback(text)

	    save_history(user["id"], "plagiarism", text, result["summary"])
	    return result


	# ------------------ AI CHECK (TEXT & FILE) ------------------
	def heuristic_ai_score(text: str):
	    """Estimate an "AI-likeness" score from simple text statistics.

	    Points are added for low vocabulary variety, long average sentences and
	    texts over 400 words; the total is capped at 100.

	    Returns:
	        (ai_score, human_score, word_count, avg_sentence_length, unique_ratio)
	        where human_score is 100 - ai_score.
	    """
	    tokens = re.sub(r"[^a-z0-9\s]", " ", text.lower()).split()
	    word_count = len(tokens)
	    unique_ratio = len(set(tokens)) / (word_count or 1)

	    sentence_total = sum(1 for piece in re.split(r"[.!?]+", text) if piece.strip())
	    avg_sentence_length = word_count / (sentence_total or 1)

	    variety_points = 40 if unique_ratio < 0.45 else (20 if unique_ratio < 0.6 else 0)
	    length_points = 40 if avg_sentence_length > 25 else (25 if avg_sentence_length > 18 else 0)
	    volume_points = 10 if word_count > 400 else 0

	    ai_score = min(100, round(variety_points + length_points + volume_points))
	    return ai_score, 100 - ai_score, word_count, avg_sentence_length, unique_ratio


	@app.post("/api/ai-check")
	def api_ai_check(req: TextRequest, user=Depends(get_current_user)):
	    """Estimate how likely the posted text is AI-generated and record the run.

	    With the detector model loaded, the text is scored in chunks of up to 400
	    words and the per-chunk probabilities are averaged. If the model is
	    unavailable or inference fails, heuristic_ai_score() is used instead.

	    Raises:
	        HTTPException: 400 when the text is empty or whitespace only.
	    """
	    text = (req.text or "").strip()
	    if not text:
	        raise HTTPException(status_code=400, detail="Text is required")

	    if model is not None and tokenizer is not None:
	        try:
	            max_len = getattr(tokenizer, "model_max_length", 512)
	            if max_len is None or max_len > 1024:
	                max_len = 512

	            # Take the "AI" class index from the model's own label map rather
	            # than assuming it is index 1; keep index 1 when the labels do not
	            # say which class is the generated one.
	            ai_index = 1
	            ai_labels = {"fake", "ai", "ai-generated", "generated", "machine", "machine-generated"}
	            id2label = getattr(getattr(model, "config", None), "id2label", None) or {}
	            for idx, label in id2label.items():
	                if str(label).strip().lower() in ai_labels:
	                    ai_index = int(idx)
	                    break

	            words = text.split()
	            chunk_size = min(400, max_len - 10)
	            probs = []
	            for start in range(0, len(words), chunk_size):
	                chunk = " ".join(words[start:start + chunk_size])
	                inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=max_len)
	                inputs = {k: v.to(device) for k, v in inputs.items()}
	                with torch.no_grad():
	                    outputs = model(**inputs)
	                p = torch.softmax(outputs.logits, dim=1).cpu().numpy()[0]
	                probs.append(float(p[ai_index]) if p.shape[0] > ai_index else float(p[0]))

	            avg_ai_prob = float(np.mean(probs)) if probs else 0.0
	            ai_percent = round(avg_ai_prob * 100, 2)
	            human_percent = round(100 - ai_percent, 2)
	            words_count = len(words)
	            sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
	            avg_sentence_len = round(words_count / (len(sentences) or 1), 2)
	            summary = f"Model: {AI_DETECTOR_MODEL}; AI probability: {ai_percent}%"
	        except Exception as e:
	            print("[AI-check] model inference failed:", e)
	        else:
	            # Kept outside the try so a history-write failure is not mistaken
	            # for an inference failure.
	            save_history(user["id"], "ai", text, summary)
	            return {
	                "ai_percent": ai_percent,
	                "human_percent": human_percent,
	                "word_count": words_count,
	                "avg_sentence_length": avg_sentence_len,
	                "summary": summary,
	            }

	    ai_percent, human_percent, wc, avg_len, uniq = heuristic_ai_score(text)
	    summary = f"HEURISTIC fallback — AI probability: {ai_percent}%"
	    save_history(user["id"], "ai", text, summary)
	    return {
	        "ai_percent": ai_percent,
	        "human_percent": human_percent,
	        "word_count": wc,
	        "avg_sentence_length": avg_len,
	        "unique_ratio": round(uniq, 3),
	        "summary": summary,
	    }


	@app.post("/api/ai-check-file")
	def api_ai_check_file(file: UploadFile = File(...), user=Depends(get_current_user)):
	    """Run the AI check on an uploaded file by delegating to api_ai_check().

	    Raises:
	        HTTPException: 400 when the file yields no text.
	    """
	    text = extract_text_from_upload(file).strip()
	    if not text:
	        raise HTTPException(status_code=400, detail="Uploaded file contains no text")
	    # The route decorator may return the handler unchanged, in which case it has
	    # no `__wrapped__` attribute; fall back to calling the handler itself.
	    handler = getattr(api_ai_check, "__wrapped__", api_ai_check)
	    return handler(TextRequest(text=text), user)


	# ------------------ HISTORY ------------------
	@app.get("/api/history")
	def api_history(user=Depends(get_current_user)):
	    """Return the caller's 50 most recent history rows, newest first."""
	    # conn.execute() uses a cursor private to this call, so another request
	    # cannot replace the result set before fetchall() runs.
	    rows = conn.execute(
	        "SELECT id, tool, input_text, result_summary, created_at "
	        "FROM history WHERE user_id = ? "
	        "ORDER BY created_at DESC LIMIT 50",
	        (user["id"],),
	    ).fetchall()
	    items = [
	        {
	            "id": r["id"],
	            "tool": r["tool"],
	            "input_text": r["input_text"],
	            "summary": r["result_summary"],
	            "created_at": r["created_at"],
	        }
	        for r in rows
	    ]
	    return {"items": items}


	# ------------------ NEW: PDF REPORT ENDPOINTS ------------------
	# These endpoints run the same checks you already have and format results into the PDF template
	# using pdf_reports.generate_report (pixel-perfect Duplichecker-style).

	@app.get("/report/grammar")
	def report_grammar_get(text: str = "", user=Depends(get_current_user)):
	    """Build a Grammar Report PDF for the `text` query parameter and return it.

	    Runs the same engine preference as the grammar-check endpoints (GECToR,
	    then LanguageTool, then heuristics) and records the run in history.
	    For file uploads use POST /report/grammar-file.

	    NOTE(review): the PDF is written under /tmp and is not removed here.

	    Raises:
	        HTTPException: 400 when `text` is empty or whitespace only.
	    """
	    if not text.strip():
	        raise HTTPException(status_code=400, detail="Text is required for report")

	    if GEC_MODEL is not None:
	        fixed, n_edits, n_words = gector_correct(text)
	        summary = f"GECToR neural GEC: {n_edits} edits; words analysed: {n_words}"
	    elif lt_tool is not None:
	        fixed, n_edits, n_words = grammar_with_languagetool(text)
	        summary = f"LanguageTool corrections: {n_edits}; words analysed: {n_words}"
	    else:
	        fixed, n_edits, n_words = simple_grammar_correct(text)
	        summary = f"HEURISTIC corrections: {n_edits}; words analysed: {n_words}"

	    save_history(user["id"], "grammar", text, summary)

	    # Headline tiles; only the error count is computed, the rest are placeholders.
	    tile_specs = (
	        (str(n_edits), "Errors"),
	        ("-", "Warnings"),
	        ("-", "Suggestions"),
	        ("—", "Readability"),
	    )
	    tiles = [{"value": value, "label": label} for value, label in tile_specs]

	    sentence_total = len([s for s in re.split(r"[.!?]+", text) if s.strip()])
	    paragraph_total = len([p for p in text.split("\n") if p.strip()])
	    counts = {
	        "Words": str(n_words),
	        "Characters": str(len(text)),
	        "Sentences": str(sentence_total),
	        "Paragraphs": str(paragraph_total),
	        "Read Time": f"{max(1, n_words//200)} minute(s)",
	    }

	    summary_paragraphs = [
	        "This Grammar Report lists detected grammar issues and suggestions.",
	        {"text": f"Corrections suggested: {n_edits}", "highlight": "yellow" if n_edits > 0 else None},
	        {"text": fixed, "highlight": None},
	    ]
	    sections = [
	        {"heading": "Summary", "paragraphs": summary_paragraphs},
	        {"heading": "Document", "paragraphs": [text]},
	    ]

	    pdf_path = generate_report(
	        "grammar",
	        out_dir="/tmp",
	        title_text="Grammar Report",
	        tiles=tiles,
	        counts=counts,
	        sections=sections,
	        matched_sources=[],  # grammar report does not need matched sources
	        footer_text="Grammar suggestions are automated. Review before applying changes.",
	    )

	    return FileResponse(pdf_path, media_type="application/pdf", filename="TrueWrite_GrammarReport.pdf")


	@app.post("/report/grammar-file")
	def report_grammar_file(file: UploadFile = File(...), user=Depends(get_current_user)):
	    """Build a Grammar Report PDF for an uploaded file and return it.

	    Runs the same engine preference as the grammar-check endpoints (GECToR,
	    then LanguageTool, then heuristics) and records the run in history.

	    NOTE(review): the PDF is written under /tmp and is not removed here.

	    Raises:
	        HTTPException: 400 when the file yields no text.
	    """
	    text = extract_text_from_upload(file).strip()
	    if not text:
	        raise HTTPException(status_code=400, detail="Uploaded file contains no text")

	    if GEC_MODEL is not None:
	        fixed, n_edits, n_words = gector_correct(text)
	        summary = f"GECToR neural GEC: {n_edits} edits; words analysed: {n_words}"
	    elif lt_tool is not None:
	        fixed, n_edits, n_words = grammar_with_languagetool(text)
	        summary = f"LanguageTool corrections: {n_edits}; words analysed: {n_words}"
	    else:
	        fixed, n_edits, n_words = simple_grammar_correct(text)
	        summary = f"HEURISTIC corrections: {n_edits}; words analysed: {n_words}"

	    save_history(user["id"], "grammar", text, summary)

	    # Headline tiles; only the error count is computed, the rest are placeholders.
	    tile_specs = (
	        (str(n_edits), "Errors"),
	        ("-", "Warnings"),
	        ("-", "Suggestions"),
	        ("—", "Readability"),
	    )
	    tiles = [{"value": value, "label": label} for value, label in tile_specs]

	    sentence_total = len([s for s in re.split(r"[.!?]+", text) if s.strip()])
	    paragraph_total = len([p for p in text.split("\n") if p.strip()])
	    counts = {
	        "Words": str(n_words),
	        "Characters": str(len(text)),
	        "Sentences": str(sentence_total),
	        "Paragraphs": str(paragraph_total),
	        "Read Time": f"{max(1, n_words//200)} minute(s)",
	    }

	    summary_paragraphs = [
	        "This Grammar Report lists detected grammar issues and suggestions.",
	        {"text": f"Corrections suggested: {n_edits}", "highlight": "yellow" if n_edits > 0 else None},
	        {"text": fixed, "highlight": None},
	    ]
	    sections = [
	        {"heading": "Summary", "paragraphs": summary_paragraphs},
	        {"heading": "Document", "paragraphs": [text]},
	    ]

	    pdf_path = generate_report(
	        "grammar",
	        out_dir="/tmp",
	        title_text="Grammar Report",
	        tiles=tiles,
	        counts=counts,
	        sections=sections,
	        matched_sources=[],
	        footer_text="Grammar suggestions are automated. Review before applying changes.",
	    )

	    return FileResponse(pdf_path, media_type="application/pdf", filename="TrueWrite_GrammarReport.pdf")


	@app.get("/report/plagiarism")
	def report_plagiarism_get(text: str = "", user=Depends(get_current_user)):
	    """
	    Generate Plagiarism Report PDF from query param text.
	    If you prefer file upload, use POST /report/plagiarism-file

	    Raises HTTPException(400) when `text` is empty or whitespace-only.
	    Saves a history entry for the user and returns the PDF written by
	    generate_report as a FileResponse.
	    """
	    if not text.strip():
	        raise HTTPException(status_code=400, detail="Text is required for report")

	    # reuse the plagiarism check logic
	    try:
	        result = corpus_plagiarism_combined(text)
	    except Exception as e:
	        # Best-effort fallback is kept, but the failure is no longer silent.
	        print("[plagiarism-report] corpus check failed:", e)
	        result = demo_plagiarism_fallback(text)

	    save_history(user["id"], "plagiarism", text, result.get("summary", ""))

	    # Build tiles and matched_sources for PDF
	    plag_value = result.get('plagiarism_percent', 0)
	    # Round the complement: raw float subtraction can yield values such as
	    # 35.650000000000006, which would be printed verbatim on the tile.
	    unique_value = round(100 - float(plag_value), 2)
	    top_matches = (result.get("matches") or [])[:5]
	    # .get() keeps this consistent with the matched_sources loop below and
	    # avoids a KeyError when a match carries no score.
	    top_score = f"{top_matches[0].get('score', 0)}%" if top_matches else '0%'

	    tiles = [
	        {'value': f"{plag_value}%", 'label': 'Plagiarism'},
	        {'value': top_score, 'label': 'Top Match'},
	        {'value': '-', 'label': 'Partial Match'},
	        {'value': f"{unique_value}%", 'label': 'Unique'},
	    ]

	    word_total = count_words(text)
	    counts = {
	        'Words': str(word_total),
	        'Characters': str(len(text)),
	        'Sentences': str(len([s for s in re.split(r"[.!?]+", text) if s.strip()])),
	        'Paragraphs': str(len([p for p in text.split("\n") if p.strip()])),
	        'Read Time': f"{max(1, word_total // 200)} minute(s)",
	    }

	    sections = [
	        {'heading': 'Summary', 'paragraphs': [
	            result.get("summary", "Plagiarism analysis completed."),
	            "Top matches are listed below.",
	        ]},
	        {'heading': 'Document', 'paragraphs': [text]},
	    ]

	    matched_sources = [
	        {
	            'title': m.get('title') or m.get('source', 'Source'),
	            'url': m.get('url') or '',
	            'similarity': f"{m.get('score', 0)}%",
	        }
	        for m in top_matches
	    ]

	    footer = "Plagiarism detection results are estimates. Review sources for exact matches."

	    pdf_path = generate_report("plagiarism", out_dir="/tmp",
	                               title_text="Plagiarism Report",
	                               tiles=tiles, counts=counts, sections=sections,
	                               matched_sources=matched_sources, footer_text=footer)

	    return FileResponse(pdf_path, media_type="application/pdf", filename="TrueWrite_PlagiarismReport.pdf")


	@app.post("/report/plagiarism-file")
	def report_plagiarism_file(file: UploadFile = File(...), user=Depends(get_current_user)):
	    """
	    Generate Plagiarism Report PDF from an uploaded file.

	    The upload is converted to text with extract_text_from_upload.
	    Raises HTTPException(400) when the upload yields no text.
	    Saves a history entry for the user and returns the PDF written by
	    generate_report as a FileResponse.
	    """
	    text = extract_text_from_upload(file).strip()
	    if not text:
	        raise HTTPException(status_code=400, detail="Uploaded file contains no text")

	    try:
	        result = corpus_plagiarism_combined(text)
	    except Exception as e:
	        # Best-effort fallback is kept, but the failure is no longer silent.
	        print("[plagiarism-report-file] corpus check failed:", e)
	        result = demo_plagiarism_fallback(text)

	    save_history(user["id"], "plagiarism", text, result.get("summary", ""))

	    plag_value = result.get('plagiarism_percent', 0)
	    # Round the complement: raw float subtraction can yield values such as
	    # 35.650000000000006, which would be printed verbatim on the tile.
	    unique_value = round(100 - float(plag_value), 2)
	    top_matches = (result.get("matches") or [])[:5]
	    # .get() keeps this consistent with the matched_sources loop below and
	    # avoids a KeyError when a match carries no score.
	    top_score = f"{top_matches[0].get('score', 0)}%" if top_matches else '0%'

	    tiles = [
	        {'value': f"{plag_value}%", 'label': 'Plagiarism'},
	        {'value': top_score, 'label': 'Top Match'},
	        {'value': '-', 'label': 'Partial Match'},
	        {'value': f"{unique_value}%", 'label': 'Unique'},
	    ]

	    word_total = count_words(text)
	    counts = {
	        'Words': str(word_total),
	        'Characters': str(len(text)),
	        'Sentences': str(len([s for s in re.split(r"[.!?]+", text) if s.strip()])),
	        'Paragraphs': str(len([p for p in text.split("\n") if p.strip()])),
	        'Read Time': f"{max(1, word_total // 200)} minute(s)",
	    }

	    sections = [
	        {'heading': 'Summary', 'paragraphs': [
	            result.get("summary", "Plagiarism analysis completed."),
	            "Top matches are listed below.",
	        ]},
	        {'heading': 'Document', 'paragraphs': [text]},
	    ]

	    matched_sources = [
	        {
	            'title': m.get('title') or m.get('source', 'Source'),
	            'url': m.get('url') or '',
	            'similarity': f"{m.get('score', 0)}%",
	        }
	        for m in top_matches
	    ]

	    footer = "Plagiarism detection results are estimates. Review sources for exact matches."

	    pdf_path = generate_report("plagiarism", out_dir="/tmp",
	                               title_text="Plagiarism Report",
	                               tiles=tiles, counts=counts, sections=sections,
	                               matched_sources=matched_sources, footer_text=footer)

	    return FileResponse(pdf_path, media_type="application/pdf", filename="TrueWrite_PlagiarismReport.pdf")


	@app.get("/report/ai")
	def report_ai_get(text: str = "", user=Depends(get_current_user)):
	    """
	    Generate AI Content Report PDF from query param text.
	    If you prefer file upload, use POST /report/ai-file

	    Raises HTTPException(400) when `text` is empty or whitespace-only.
	    Scores the text with the loaded classifier when `model` and `tokenizer`
	    are available; if they are not, or inference raises, falls back to
	    heuristic_ai_score. Saves a history entry and returns the PDF written
	    by generate_report as a FileResponse.
	    """
	    if not text.strip():
	        raise HTTPException(status_code=400, detail="Text is required for report")

	    # Sentence split is needed by both the model path and the PDF counts.
	    sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]

	    # Reuse ai-check logic to compute ai_percent etc.
	    model_scored = False
	    if model is not None and tokenizer is not None:
	        try:
	            max_len = getattr(tokenizer, "model_max_length", 512)
	            if max_len is None or max_len > 1024:
	                max_len = 512

	            # Score the text in word chunks and average the per-chunk results.
	            words = text.split()
	            chunk_size = min(400, max_len - 10)
	            chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
	            probs = []
	            for chunk in chunks:
	                inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=max_len)
	                inputs = {k: v.to(device) for k, v in inputs.items()}
	                with torch.no_grad():
	                    outputs = model(**inputs)
	                logits = outputs.logits
	                p = torch.softmax(logits, dim=1).cpu().numpy()[0]
	                # NOTE(review): assumes class index 1 is the "AI" label — confirm
	                # against the id2label mapping of AI_DETECTOR_MODEL.
	                ai_prob = float(p[1]) if p.shape[0] > 1 else float(p[0])
	                probs.append(ai_prob)
	            avg_ai_prob = float(np.mean(probs)) if probs else 0.0
	            ai_percent = round(avg_ai_prob * 100, 2)
	            human_percent = round(100 - ai_percent, 2)
	            words_count = len(words)
	            avg_sentence_len = round(words_count / (len(sentences) or 1), 2)
	            summary = f"Model: {AI_DETECTOR_MODEL}; AI probability: {ai_percent}%"
	            model_scored = True
	        except Exception as e:
	            print("[AI-report] model inference failed:", e)

	    # Single fallback path for "no model loaded" and "inference failed".
	    if not model_scored:
	        ai_percent, human_percent, words_count, avg_sentence_len, _ = heuristic_ai_score(text)
	        summary = f"HEURISTIC fallback — AI probability: {ai_percent}%"

	    save_history(user["id"], "ai", text, summary)

	    tiles = [
	        {'value': f"{ai_percent}%", 'label': 'AI Likelihood'},
	        {'value': '-', 'label': 'Plagiarism'},
	        {'value': '-', 'label': 'Human-Like'},
	        {'value': f"{human_percent}%", 'label': 'Human Likelihood'},
	    ]
	    counts = {
	        'Words': str(words_count),
	        'Characters': str(len(text)),
	        'Sentences': str(len(sentences)),
	        'Paragraphs': str(len([p for p in text.split("\n") if p.strip()])),
	        'Avg Sentence Len': str(avg_sentence_len),
	    }
	    sections = [
	        {'heading': 'Executive Summary', 'paragraphs': [
	            summary,
	            {'text': "This AI Content Report analyses the likelihood that portions of the submitted text were generated by AI.", 'highlight': None},
	        ]},
	        {'heading': 'Document Body', 'paragraphs': [text]},
	    ]
	    matched_sources = []  # optional for AI report; kept empty here
	    footer = "AI detection is probabilistic. Use results as guidance."

	    pdf_path = generate_report("ai", out_dir="/tmp",
	                               title_text="AI Content Report",
	                               tiles=tiles, counts=counts, sections=sections,
	                               matched_sources=matched_sources, footer_text=footer)

	    return FileResponse(pdf_path, media_type="application/pdf", filename="TrueWrite_AiReport.pdf")


	@app.post("/report/ai-file")
	def report_ai_file(file: UploadFile = File(...), user=Depends(get_current_user)):
	    """
	    Generate AI Content Report PDF from an uploaded file.

	    The upload is converted to text with extract_text_from_upload.
	    Raises HTTPException(400) when the upload yields no text.
	    Scores the text with the loaded classifier when `model` and `tokenizer`
	    are available; if they are not, or inference raises, falls back to
	    heuristic_ai_score. Saves a history entry and returns the PDF written
	    by generate_report as a FileResponse.
	    """
	    text = extract_text_from_upload(file).strip()
	    if not text:
	        raise HTTPException(status_code=400, detail="Uploaded file contains no text")

	    # Sentence split is needed by both the model path and the PDF counts.
	    sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]

	    # Model scoring first; heuristic scoring if it is unavailable or fails.
	    model_scored = False
	    if model is not None and tokenizer is not None:
	        try:
	            max_len = getattr(tokenizer, "model_max_length", 512)
	            if max_len is None or max_len > 1024:
	                max_len = 512

	            # Score the text in word chunks and average the per-chunk results.
	            words = text.split()
	            chunk_size = min(400, max_len - 10)
	            chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
	            probs = []
	            for chunk in chunks:
	                inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=max_len)
	                inputs = {k: v.to(device) for k, v in inputs.items()}
	                with torch.no_grad():
	                    outputs = model(**inputs)
	                logits = outputs.logits
	                p = torch.softmax(logits, dim=1).cpu().numpy()[0]
	                # NOTE(review): assumes class index 1 is the "AI" label — confirm
	                # against the id2label mapping of AI_DETECTOR_MODEL.
	                ai_prob = float(p[1]) if p.shape[0] > 1 else float(p[0])
	                probs.append(ai_prob)
	            avg_ai_prob = float(np.mean(probs)) if probs else 0.0
	            ai_percent = round(avg_ai_prob * 100, 2)
	            human_percent = round(100 - ai_percent, 2)
	            words_count = len(words)
	            avg_sentence_len = round(words_count / (len(sentences) or 1), 2)
	            summary = f"Model: {AI_DETECTOR_MODEL}; AI probability: {ai_percent}%"
	            model_scored = True
	        except Exception as e:
	            print("[AI-report-file] model inference failed:", e)

	    # Single fallback path for "no model loaded" and "inference failed".
	    if not model_scored:
	        ai_percent, human_percent, words_count, avg_sentence_len, _ = heuristic_ai_score(text)
	        summary = f"HEURISTIC fallback — AI probability: {ai_percent}%"

	    save_history(user["id"], "ai", text, summary)

	    tiles = [
	        {'value': f"{ai_percent}%", 'label': 'AI Likelihood'},
	        {'value': '-', 'label': 'Plagiarism'},
	        {'value': '-', 'label': 'Human-Like'},
	        {'value': f"{human_percent}%", 'label': 'Human Likelihood'},
	    ]
	    counts = {
	        'Words': str(words_count),
	        'Characters': str(len(text)),
	        'Sentences': str(len(sentences)),
	        'Paragraphs': str(len([p for p in text.split("\n") if p.strip()])),
	        'Avg Sentence Len': str(avg_sentence_len),
	    }
	    sections = [
	        {'heading': 'Executive Summary', 'paragraphs': [
	            summary,
	            {'text': "This AI Content Report analyses the likelihood that portions of the submitted text were generated by AI.", 'highlight': None},
	        ]},
	        {'heading': 'Document Body', 'paragraphs': [text]},
	    ]
	    matched_sources = []
	    footer = "AI detection is probabilistic. Use results as guidance."

	    pdf_path = generate_report("ai", out_dir="/tmp",
	                               title_text="AI Content Report",
	                               tiles=tiles, counts=counts, sections=sections,
	                               matched_sources=matched_sources, footer_text=footer)

	    return FileResponse(pdf_path, media_type="application/pdf", filename="TrueWrite_AiReport.pdf")


	@app.get("/")
	def read_root():
	    """Root endpoint: return a static status payload showing the backend is up."""
	    status_payload = {"status": "Backend is running with GECToR + 16GB RAM!"}
	    return status_payload