GopalKrushnaMahapatra committed on
Commit e2458a1 · verified · 1 Parent(s): ddc6c8a

Update app.py

Files changed (1)
  1. app.py +946 -946
app.py CHANGED
@@ -1,947 +1,947 @@
1
- # backend/main.py
2
- import os
3
- import re
4
- import io
5
- import sqlite3
6
- from datetime import datetime, timezone
7
-
8
- from dotenv import load_dotenv
9
- from fastapi import FastAPI, HTTPException, status, Header, Depends, File, UploadFile
10
- from fastapi.middleware.cors import CORSMiddleware
11
- from pydantic import BaseModel, EmailStr
12
- from passlib.context import CryptContext
13
- import jwt
14
-
15
- # File parsing libs
16
- from docx import Document as DocxDocument
17
- import PyPDF2
18
-
19
- # ML / NLP libs
20
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
21
- import torch
22
- import numpy as np
23
-
24
- # TF-IDF
25
- from sklearn.feature_extraction.text import TfidfVectorizer
26
- from sklearn.metrics.pairwise import cosine_similarity
27
-
28
- # Semantic embeddings for plagiarism (combined approach)
29
- try:
30
- from sentence_transformers import SentenceTransformer
31
- except Exception:
32
- SentenceTransformer = None
33
-
34
- # LanguageTool (may require Java)
35
- try:
36
- import language_tool_python
37
- except Exception:
38
- language_tool_python = None
39
-
40
- # GECToR (neural grammatical error correction)
41
- try:
42
- from gector import GECToR, predict as gector_predict, load_verb_dict
43
- except Exception:
44
- GECToR = None
45
- gector_predict = None
46
- load_verb_dict = None
47
-
48
- # ------------------ ENV & DB SETUP ------------------
49
- load_dotenv()
50
-
51
- JWT_SECRET = os.getenv("JWT_SECRET", "super_secret_key_change_this")
52
- JWT_ALGO = os.getenv("JWT_ALGO", "HS256")
53
- DB_PATH = os.getenv("DB_PATH", "truewrite.db")
54
- CORPUS_DIR = os.getenv("CORPUS_DIR", "corpus")
55
- CORPUS_RAW = os.getenv("CORPUS_RAW", "corpus_raw")
56
-
57
- # Combined plagiarism weights
58
- PLAG_ALPHA = float(os.getenv("PLAG_ALPHA", "0.4")) # TF-IDF weight; (1-alpha) for embeddings
59
-
60
- pwd_context = CryptContext(schemes=["pbkdf2_sha256"], deprecated="auto")
61
-
62
- # SQLite DB (simple demo)
63
- conn = sqlite3.connect(DB_PATH, check_same_thread=False)
64
- conn.row_factory = sqlite3.Row
65
- cur = conn.cursor()
66
-
67
- # Create tables if not exist
68
- cur.execute("""
69
- CREATE TABLE IF NOT EXISTS users (
70
- id INTEGER PRIMARY KEY AUTOINCREMENT,
71
- name TEXT NOT NULL,
72
- email TEXT NOT NULL UNIQUE,
73
- password_hash TEXT NOT NULL,
74
- created_at TEXT NOT NULL
75
- )
76
- """)
77
-
78
- cur.execute("""
79
- CREATE TABLE IF NOT EXISTS history (
80
- id INTEGER PRIMARY KEY AUTOINCREMENT,
81
- user_id INTEGER NOT NULL,
82
- tool TEXT NOT NULL,
83
- input_text TEXT,
84
- result_summary TEXT,
85
- created_at TEXT NOT NULL,
86
- FOREIGN KEY (user_id) REFERENCES users(id)
87
- )
88
- """)
89
-
90
- conn.commit()
91
-
92
- # ------------------ FASTAPI APP ------------------
93
- app = FastAPI(title="TrueWrite Scan (Python Backend)")
94
-
95
- app.add_middleware(
96
- CORSMiddleware,
97
- allow_origins=["*"],
98
- allow_credentials=True,
99
- allow_methods=["*"],
100
- allow_headers=["*"],
101
- )
102
-
103
- # ------------------ MODELS ------------------
104
- class SignupRequest(BaseModel):
105
- name: str
106
- email: EmailStr
107
- password: str
108
-
109
-
110
- class LoginRequest(BaseModel):
111
- email: EmailStr
112
- password: str
113
-
114
-
115
- class TextRequest(BaseModel):
116
- text: str
117
-
118
-
119
- # ------------------ AUTH HELPERS ------------------
120
- def hash_password(pw: str) -> str:
121
- return pwd_context.hash(pw)
122
-
123
-
124
- def verify_password(plain: str, hashed: str) -> bool:
125
- return pwd_context.verify(plain, hashed)
126
-
127
-
128
- def create_token(user_id: int, email: str) -> str:
129
- payload = {"user_id": user_id, "email": email}
130
- token = jwt.encode(payload, JWT_SECRET, algorithm=JWT_ALGO)
131
- if isinstance(token, bytes):
132
- token = token.decode("utf-8")
133
- return token
134
-
135
-
136
- def decode_token(token: str):
137
- try:
138
- payload = jwt.decode(token, JWT_SECRET, algorithms=[JWT_ALGO])
139
- return payload
140
- except jwt.PyJWTError:
141
- raise HTTPException(
142
- status_code=status.HTTP_401_UNAUTHORIZED,
143
- detail="Invalid token"
144
- )
145
-
146
-
147
- def get_current_user(authorization: str = Header(None)):
148
- if not authorization or not authorization.startswith("Bearer "):
149
- raise HTTPException(
150
- status_code=status.HTTP_401_UNAUTHORIZED,
151
- detail="Missing token"
152
- )
153
- token = authorization.split(" ", 1)[1]
154
- payload = decode_token(token)
155
- user_id = payload.get("user_id")
156
- cur.execute("SELECT * FROM users WHERE id = ?", (user_id,))
157
- row = cur.fetchone()
158
- if not row:
159
- raise HTTPException(
160
- status_code=status.HTTP_401_UNAUTHORIZED,
161
- detail="User not found"
162
- )
163
- return {"id": row["id"], "name": row["name"], "email": row["email"]}
164
-
165
-
166
- def now_iso():
167
- return datetime.now(timezone.utc).isoformat()
168
-
169
-
170
- def save_history(user_id: int, tool: str, input_text: str, summary: str):
171
- trimmed = (input_text[:500] + "...") if len(input_text) > 500 else input_text
172
- cur.execute(
173
- "INSERT INTO history (user_id, tool, input_text, result_summary, created_at) VALUES (?, ?, ?, ?, ?)",
174
- (user_id, tool, trimmed, summary, now_iso()),
175
- )
176
- conn.commit()
177
-
178
-
179
- # ------------------ TEXT HELPERS ------------------
180
- def count_words(text: str) -> int:
181
- tokens = text.strip().split()
182
- return len(tokens) if text.strip() else 0
183
-
184
-
185
- def simple_grammar_correct(text: str):
186
- """Old heuristic grammar fixer (kept as fallback)."""
187
- corrections = 0
188
- original_words = count_words(text)
189
-
190
- before = text
191
- text = re.sub(r"\s{2,}", " ", text)
192
- if text != before:
193
- corrections += 1
194
-
195
- before = text
196
- text = re.sub(r"\bi\b", "I", text)
197
- if text != before:
198
- corrections += 1
199
-
200
- def cap_match(m):
201
- return m.group(0).upper()
202
-
203
- before = text
204
- text = re.sub(r"(^\s*\w|[.!?]\s+\w)", cap_match, text)
205
- if text != before:
206
- corrections += 1
207
-
208
- if text.strip() and not re.search(r"[.!?]\s*$", text.strip()):
209
- text = text.strip() + "."
210
- corrections += 1
211
-
212
- return text, corrections, original_words
213
-
214
-
215
- # ------------------ CORPUS BUILDING (from corpus_raw -> corpus) ------------------
216
- def extract_from_docx_path(path: str) -> str:
217
- doc = DocxDocument(path)
218
- paragraphs = [p.text for p in doc.paragraphs]
219
- return "\n".join(paragraphs)
220
-
221
-
222
- def extract_from_pdf_path(path: str) -> str:
223
- with open(path, "rb") as f:
224
- reader = PyPDF2.PdfReader(f)
225
- texts = []
226
- for pg in range(len(reader.pages)):
227
- try:
228
- texts.append(reader.pages[pg].extract_text() or "")
229
- except Exception:
230
- texts.append("")
231
- return "\n".join(texts)
232
-
233
-
234
- def build_corpus_from_raw(raw_dir: str = CORPUS_RAW, out_dir: str = CORPUS_DIR):
235
- """
236
- Convert any .pdf / .docx / .txt files from corpus_raw/ into .txt files in corpus/.
237
- This mirrors your build_corpus.py logic but is called automatically at startup.
238
- """
239
- os.makedirs(raw_dir, exist_ok=True)
240
- os.makedirs(out_dir, exist_ok=True)
241
-
242
- for fname in os.listdir(raw_dir):
243
- inpath = os.path.join(raw_dir, fname)
244
- if not os.path.isfile(inpath):
245
- continue
246
- outname = os.path.splitext(fname)[0] + ".txt"
247
- outpath = os.path.join(out_dir, outname)
248
- try:
249
- ext = fname.lower()
250
- if ext.endswith(".docx"):
251
- text = extract_from_docx_path(inpath)
252
- elif ext.endswith(".pdf"):
253
- text = extract_from_pdf_path(inpath)
254
- elif ext.endswith(".txt"):
255
- with open(inpath, "r", encoding="utf-8", errors="ignore") as f:
256
- text = f.read()
257
- else:
258
- print("[CorpusRaw] Skipping unsupported:", fname)
259
- continue
260
-
261
- text = text.strip()
262
- with open(outpath, "w", encoding="utf-8") as fo:
263
- fo.write(text)
264
- print("[CorpusRaw] Wrote:", outpath)
265
- except Exception as e:
266
- print("[CorpusRaw] Failed", fname, "->", e)
267
-
268
-
269
- # ------------------ TF-IDF CORPUS LOADING ------------------
270
- vectorizer = None
271
- corpus_tfidf = None
272
- corpus_titles = []
273
- corpus_texts = []
274
-
275
-
276
- def load_corpus(corpus_dir=CORPUS_DIR):
277
- """
278
- Load .txt corpus files from CORPUS_DIR, build TF-IDF index.
279
- Semantic embeddings are built separately in load_embeddings().
280
- """
281
- global vectorizer, corpus_tfidf, corpus_titles, corpus_texts
282
- corpus_titles = []
283
- corpus_texts = []
284
- if not os.path.isdir(corpus_dir):
285
- os.makedirs(corpus_dir, exist_ok=True)
286
- print("[Corpus] Created empty corpus directory:", corpus_dir)
287
- vectorizer = None
288
- corpus_tfidf = None
289
- return
290
-
291
- for fname in os.listdir(corpus_dir):
292
- if fname.lower().endswith(".txt"):
293
- path = os.path.join(corpus_dir, fname)
294
- try:
295
- with open(path, "r", encoding="utf-8", errors="ignore") as f:
296
- txt = f.read()
297
- corpus_titles.append(fname)
298
- corpus_texts.append(txt)
299
- except Exception as e:
300
- print(f"[Corpus] Failed to read {path}: {e}")
301
-
302
- if corpus_texts:
303
- try:
304
- vectorizer = TfidfVectorizer(
305
- ngram_range=(1, 3),
306
- stop_words="english",
307
- max_features=50000
308
- )
309
- corpus_tfidf = vectorizer.fit_transform(corpus_texts)
310
- print(f"[Corpus] Loaded {len(corpus_texts)} documents into TF-IDF index")
311
- except Exception as e:
312
- print("[Corpus] TF-IDF build failed:", e)
313
- vectorizer = None
314
- corpus_tfidf = None
315
- else:
316
- vectorizer = None
317
- corpus_tfidf = None
318
- print("[Corpus] No .txt documents found in", corpus_dir)
319
-
320
-
321
- # ------------------ SEMANTIC EMBEDDINGS (SentenceTransformers) ------------------
322
- emb_model = None
323
- corpus_emb = None
324
- EMB_MODEL_NAME = os.getenv("PLAG_EMB_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
325
-
326
-
327
- def load_embeddings():
328
- """
329
- Build semantic embedding index for plagiarism using sentence-transformers.
330
- """
331
- global emb_model, corpus_emb
332
- if SentenceTransformer is None:
333
- print("[Embeddings] sentence-transformers not installed; skipping semantic index.")
334
- emb_model = None
335
- corpus_emb = None
336
- return
337
-
338
- if not corpus_texts:
339
- print("[Embeddings] No corpus texts available; semantic index not built.")
340
- emb_model = None
341
- corpus_emb = None
342
- return
343
-
344
- try:
345
- emb_model = SentenceTransformer(EMB_MODEL_NAME)
346
- corpus_emb = emb_model.encode(
347
- corpus_texts,
348
- convert_to_numpy=True,
349
- show_progress_bar=False,
350
- normalize_embeddings=True,
351
- )
352
- print(f"[Embeddings] Loaded '{EMB_MODEL_NAME}' and encoded {len(corpus_texts)} corpus docs.")
353
- except Exception as e:
354
- emb_model = None
355
- corpus_emb = None
356
- print("[Embeddings] Failed to load or encode corpus:", e)
357
-
358
-
359
- # Build corpus & embeddings at startup
360
- build_corpus_from_raw()
361
- load_corpus()
362
- load_embeddings()
363
-
364
- # ------------------ HF MODEL LOADING (AI Detector) ------------------
365
- AI_DETECTOR_MODEL = "openai-community/roberta-base-openai-detector"
366
- tokenizer = None
367
- model = None
368
- device = None
369
-
370
- try:
371
- tokenizer = AutoTokenizer.from_pretrained(AI_DETECTOR_MODEL)
372
- model = AutoModelForSequenceClassification.from_pretrained(AI_DETECTOR_MODEL)
373
- model.eval()
374
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
375
- model.to(device)
376
- print(f"[AI Detector] Loaded {AI_DETECTOR_MODEL} on {device}")
377
- except Exception as e:
378
- tokenizer = None
379
- model = None
380
- device = None
381
- print("[AI Detector] Failed to load HF model — using heuristic fallback. Error:", e)
382
-
383
- # ------------------ GECToR LOADING (Neural GEC) ------------------
384
- GEC_MODEL = None
385
- GEC_TOKENIZER = None
386
- GEC_ENCODE = None
387
- GEC_DECODE = None
388
- GEC_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
389
-
390
- if GECToR is not None and gector_predict is not None and load_verb_dict is not None:
391
- try:
392
- GEC_MODEL_ID = os.getenv("GEC_MODEL_ID", "gotutiyan/gector-roberta-base-5k")
393
- GEC_VERB_FILE = os.getenv("GEC_VERB_FILE", "data/verb-form-vocab.txt")
394
- GEC_MODEL = GECToR.from_pretrained(GEC_MODEL_ID).to(GEC_DEVICE)
395
- GEC_TOKENIZER = AutoTokenizer.from_pretrained(GEC_MODEL_ID)
396
- GEC_ENCODE, GEC_DECODE = load_verb_dict(GEC_VERB_FILE)
397
- print(f"[GECToR] Loaded model {GEC_MODEL_ID} on {GEC_DEVICE}")
398
- except Exception as e:
399
- GEC_MODEL = None
400
- GEC_TOKENIZER = None
401
- GEC_ENCODE = None
402
- GEC_DECODE = None
403
- print("[GECToR] Failed to load — falling back to LanguageTool/heuristics. Error:", e)
404
- else:
405
- print("[GECToR] gector library not available; using LanguageTool/heuristics for grammar.")
406
-
407
-
408
- def gector_correct(text: str):
409
- """
410
- Run neural grammatical error correction using GECToR.
411
-
412
- - Trims to 1000 words (server-side safety).
413
- - Splits into sentences, runs GECToR, then joins back.
414
- """
415
- if not (GEC_MODEL and GEC_TOKENIZER and GEC_ENCODE and GEC_DECODE):
416
- raise RuntimeError("GECToR model not loaded")
417
-
418
- parts = text.strip().split()
419
- if len(parts) > 1000:
420
- text_proc = " ".join(parts[:1000])
421
- else:
422
- text_proc = text.strip()
423
-
424
- if not text_proc:
425
- return text_proc, 0, 0
426
-
427
- sentences = re.split(r"(?<=[.!?])\s+", text_proc)
428
- sentences = [s for s in sentences if s.strip()]
429
- if not sentences:
430
- sentences = [text_proc]
431
-
432
- keep_conf = float(os.getenv("GEC_KEEP_CONFIDENCE", "0.0"))
433
- min_err_prob = float(os.getenv("GEC_MIN_ERROR_PROB", "0.0"))
434
- n_iter = int(os.getenv("GEC_N_ITER", "5"))
435
- batch_size = int(os.getenv("GEC_BATCH_SIZE", "8"))
436
-
437
- corrected_sentences = gector_predict(
438
- GEC_MODEL,
439
- GEC_TOKENIZER,
440
- sentences,
441
- GEC_ENCODE,
442
- GEC_DECODE,
443
- keep_confidence=keep_conf,
444
- min_error_prob=min_err_prob,
445
- n_iteration=n_iter,
446
- batch_size=batch_size,
447
- )
448
- corrected_text = " ".join(corrected_sentences)
449
- original_words = len(text_proc.split())
450
- corrections = sum(
451
- 1 for a, b in zip(text_proc.split(), corrected_text.split()) if a != b
452
- )
453
- return corrected_text, corrections, original_words
454
-
455
-
456
- # ------------------ FILE EXTRACTION HELPERS ------------------
457
- MAX_FILE_SIZE = 15 * 1024 * 1024 # 15 MB
458
-
459
-
460
- def extract_text_from_upload(upload: UploadFile) -> str:
461
- filename = (upload.filename or "").lower()
462
- content_type = (upload.content_type or "").lower()
463
- data = upload.file.read()
464
- try:
465
- upload.file.seek(0)
466
- except Exception:
467
- pass
468
-
469
- if len(data) > MAX_FILE_SIZE:
470
- raise HTTPException(status_code=413, detail="File too large (max 15MB)")
471
-
472
- # TXT
473
- if filename.endswith(".txt") or content_type == "text/plain":
474
- try:
475
- try:
476
- return data.decode("utf-8")
477
- except UnicodeDecodeError:
478
- return data.decode("latin-1")
479
- except Exception as e:
480
- raise HTTPException(status_code=400, detail=f"Failed to decode text file: {e}")
481
-
482
- # DOCX
483
- if filename.endswith(".docx") or "wordprocessingml" in content_type:
484
- # Basic sanity check: valid .docx is a ZIP (PK header)
485
- if not data.startswith(b"PK"):
486
- raise HTTPException(
487
- status_code=400,
488
- detail="Uploaded file is not a valid .docx package (it might be an old .doc file or a corrupted document). "
489
- "Please open it in Word/Google Docs and re-save as .docx or export as PDF, then upload again."
490
- )
491
- try:
492
- f = io.BytesIO(data)
493
- doc = DocxDocument(f)
494
- paragraphs = [p.text for p in doc.paragraphs]
495
- text = "\n".join(paragraphs).strip()
496
- if not text:
497
- raise ValueError("DOCX contained no readable text.")
498
- return text
499
- except Exception as e:
500
- raise HTTPException(
501
- status_code=400,
502
- detail=f"Failed to parse docx file: {e}. Try opening it in Word/Google Docs and exporting again as .docx or PDF."
503
- )
504
-
505
- # PDF
506
- if filename.endswith(".pdf") or "pdf" in content_type:
507
- try:
508
- f = io.BytesIO(data)
509
- reader = PyPDF2.PdfReader(f)
510
- texts = []
511
- for pg in range(len(reader.pages)):
512
- try:
513
- txt = reader.pages[pg].extract_text() or ""
514
- except Exception:
515
- txt = ""
516
- texts.append(txt)
517
- return "\n".join(texts)
518
- except Exception as e:
519
- raise HTTPException(status_code=400, detail=f"Failed to parse PDF file: {e}")
520
-
521
- raise HTTPException(
522
- status_code=415,
523
- detail="Unsupported file type. Use .txt, .pdf, or .docx",
524
- )
525
-
526
-
527
- # ------------------ GRAMMAR (LANGUAGETOOL INTEGRATION) ------------------
528
- lt_tool = None
529
- if language_tool_python is not None:
530
- try:
531
- lt_tool = language_tool_python.LanguageTool("en-US")
532
- print("[LanguageTool] Loaded (local Java-backed checker)")
533
- except Exception as e:
534
- lt_tool = None
535
- print("[LanguageTool] Could not start local LanguageTool — falling back. Error:", e)
536
- else:
537
- print("[LanguageTool] library not installed; falling back to heuristics.")
538
-
539
-
540
- def grammar_with_languagetool(text: str):
541
- parts = text.strip().split()
542
- if len(parts) > 1000:
543
- text_proc = " ".join(parts[:1000])
544
- else:
545
- text_proc = text.strip()
546
-
547
- matches = lt_tool.check(text_proc)
548
- corrected = language_tool_python.utils.correct(text_proc, matches)
549
- corrections = len(matches)
550
- return corrected, corrections, len(text_proc.split())
551
-
552
-
553
- # ------------------ PLAGIARISM HELPERS (COMBINED ENGINE) ------------------
554
- def _clean_for_jaccard(t: str):
555
- t = t.lower()
556
- t = re.sub(r"[^a-z0-9\s]", " ", t)
557
- return [w for w in t.split() if w]
558
-
559
-
560
- def _jaccard_similarity(a, b):
561
- sa = set(a)
562
- sb = set(b)
563
- if not sa or not sb:
564
- return 0.0
565
- return len(sa & sb) / len(sa | sb)
566
-
567
-
568
- def demo_plagiarism_fallback(text: str):
569
- """
570
- Simple Jaccard-based fallback using a tiny built-in sample set.
571
- Used when no TF-IDF / semantic corpus is available.
572
- """
573
- SAMPLE_DOCS = [
574
- {"title": "AI for Social Good",
575
- "text": "Artificial intelligence is transforming multiple industries by automating routine tasks and enabling data driven decision making for social impact and efficiency."},
576
- {"title": "IoT in Smart Cities",
577
- "text": "The Internet of Things connects sensors, devices, and cloud platforms to enable real time monitoring and control in smart cities including lighting, traffic, and waste management."},
578
- {"title": "Climate & Renewable Energy",
579
- "text": "Climate change is a critical global challenge that demands renewable energy, efficient resource management, and international cooperation to ensure a sustainable future."},
580
- ]
581
-
582
- input_words = _clean_for_jaccard(text)
583
- best_score = 0.0
584
- matches = []
585
- for doc in SAMPLE_DOCS:
586
- doc_words = _clean_for_jaccard(doc["text"])
587
- score = _jaccard_similarity(input_words, doc_words)
588
- matches.append({"title": doc["title"], "score": round(score * 100, 2)})
589
- if score > best_score:
590
- best_score = score
591
-
592
- matches.sort(key=lambda x: x["score"], reverse=True)
593
- plagiarism_percent = round(best_score * 100, 2)
594
- summary = f"Plagiarism estimate (demo Jaccard): {plagiarism_percent}%"
595
- return {"plagiarism_percent": plagiarism_percent, "matches": matches[:5], "summary": summary}
596
-
597
-
598
- def corpus_plagiarism_combined(text: str):
599
- """
600
- Combined plagiarism score using:
601
- - TF-IDF cosine similarity
602
- - Semantic embedding cosine similarity (SentenceTransformers)
603
-
604
- Returns dict matching API schema:
605
- { plagiarism_percent, matches, summary }
606
- """
607
- if not corpus_texts:
608
- raise ValueError("No corpus texts loaded")
609
-
610
- sims_tfidf = None
611
- sims_emb = None
612
-
613
- words = text.split()
614
- if len(words) > 3000:
615
- text_proc = " ".join(words[:3000])
616
- else:
617
- text_proc = text
618
-
619
- # TF-IDF similarity
620
- if vectorizer is not None and corpus_tfidf is not None:
621
- q = vectorizer.transform([text_proc])
622
- sims_tfidf = cosine_similarity(q, corpus_tfidf)[0]
623
-
624
- # Semantic similarity
625
- if emb_model is not None and corpus_emb is not None:
626
- q_emb = emb_model.encode(
627
- [text_proc],
628
- convert_to_numpy=True,
629
- normalize_embeddings=True,
630
- show_progress_bar=False,
631
- )[0]
632
- sims_emb = corpus_emb @ q_emb # normalized → dot = cosine
633
-
634
- if sims_tfidf is None and sims_emb is None:
635
- raise ValueError("No plagiarism backends (TF-IDF / embeddings) are available")
636
-
637
- n_docs = len(corpus_texts)
638
- combined_rows = []
639
- alpha = PLAG_ALPHA # TF-IDF weight
640
-
641
- for i in range(n_docs):
642
- tf = float(sims_tfidf[i]) if sims_tfidf is not None else None
643
- se = float(sims_emb[i]) if sims_emb is not None else None
644
- if tf is None and se is None:
645
- continue
646
-
647
- if tf is not None and se is not None:
648
- score = alpha * tf + (1.0 - alpha) * se
649
- elif tf is not None:
650
- score = tf
651
- else:
652
- score = se
653
-
654
- combined_rows.append({
655
- "index": i,
656
- "combined": score,
657
- "tfidf": tf,
658
- "semantic": se,
659
- })
660
-
661
- if not combined_rows:
662
- raise ValueError("No scores computed for corpus documents")
663
-
664
- combined_rows.sort(key=lambda x: x["combined"], reverse=True)
665
- top = combined_rows[:10]
666
-
667
- best = top[0]["combined"]
668
- plagiarism_percent = round(best * 100, 2)
669
-
670
- matches = []
671
- for row in top:
672
- matches.append({
673
- "title": corpus_titles[row["index"]],
674
-
675
-
676
- "score": round(row["combined"] * 100, 2),
677
- "tfidf_score": round(row["tfidf"] * 100, 2) if row["tfidf"] is not None else None,
678
- "semantic_score": round(row["semantic"] * 100, 2) if row["semantic"] is not None else None,
679
- })
680
-
681
- components = []
682
- if sims_tfidf is not None:
683
- components.append("TF-IDF")
684
- if sims_emb is not None:
685
- components.append("semantic embeddings")
686
- comp_str = " + ".join(components)
687
-
688
- summary = f"Plagiarism estimate (combined {comp_str}): {plagiarism_percent}%"
689
- return {"plagiarism_percent": plagiarism_percent, "matches": matches, "summary": summary}
690
-
691
-
692
- # ------------------ ENDPOINTS ------------------
693
-
694
- @app.post("/api/signup")
695
- def signup(req: SignupRequest):
696
- cur.execute("SELECT id FROM users WHERE email = ?", (req.email,))
697
- if cur.fetchone():
698
- raise HTTPException(status_code=400, detail="Email already registered")
699
-
700
- pw_hash = hash_password(req.password)
701
- created_at = now_iso()
702
- cur.execute(
703
- "INSERT INTO users (name, email, password_hash, created_at) VALUES (?, ?, ?, ?)",
704
- (req.name, req.email, pw_hash, created_at),
705
- )
706
- conn.commit()
707
- user_id = cur.lastrowid
708
- token = create_token(user_id, req.email)
709
-
710
- return {
711
- "message": "Signup successful",
712
- "token": token,
713
- "name": req.name,
714
- "email": req.email,
715
- }
716
-
717
-
718
- @app.post("/api/login")
719
- def login(req: LoginRequest):
720
- cur.execute("SELECT * FROM users WHERE email = ?", (req.email,))
721
- row = cur.fetchone()
722
- if not row or not verify_password(req.password, row["password_hash"]):
723
- raise HTTPException(status_code=401, detail="Invalid email or password")
724
-
725
- token = create_token(row["id"], row["email"])
726
- return {
727
- "message": "Login successful",
728
- "token": token,
729
- "name": row["name"],
730
- "email": row["email"],
731
- }
732
-
733
-
734
- @app.post("/api/grammar-check")
735
- def api_grammar_check(req: TextRequest, user=Depends(get_current_user)):
736
- text = req.text or ""
737
- if not text.strip():
738
- raise HTTPException(status_code=400, detail="Text is required")
739
-
740
- # Prefer GECToR → LanguageTool → heuristics
741
- if GEC_MODEL is not None:
742
- corrected, corrections, original_words = gector_correct(text)
743
- summary = f"GECToR neural GEC: {corrections} edits; words analysed: {original_words}"
744
- elif lt_tool is not None:
745
- corrected, corrections, original_words = grammar_with_languagetool(text)
746
- summary = f"LanguageTool corrections: {corrections}; words analysed: {original_words}"
747
- else:
748
- corrected, corrections, original_words = simple_grammar_correct(text)
749
- summary = f"HEURISTIC corrections: {corrections}; words analysed: {original_words}"
750
-
751
- save_history(user["id"], "grammar", text, summary)
752
-
753
- return {
754
- "original_words": original_words,
755
- "corrections": corrections,
756
- "corrected_text": corrected,
757
- "summary": summary,
758
- }
759
-
760
-
761
- @app.post("/api/grammar-check-file")
762
- def api_grammar_check_file(file: UploadFile = File(...), user=Depends(get_current_user)):
763
- text = extract_text_from_upload(file).strip()
764
- if not text:
765
- raise HTTPException(status_code=400, detail="Uploaded file contains no text")
766
-
767
- if GEC_MODEL is not None:
768
- corrected, corrections, original_words = gector_correct(text)
769
- summary = f"GECToR neural GEC: {corrections} edits; words analysed: {original_words}"
770
- elif lt_tool is not None:
771
- corrected, corrections, original_words = grammar_with_languagetool(text)
772
- summary = f"LanguageTool corrections: {corrections}; words analysed: {original_words}"
773
- else:
774
- parts = text.strip().split()
775
- if len(parts) > 1000:
776
- text = " ".join(parts[:1000])
777
- corrected, corrections, original_words = simple_grammar_correct(text)
778
- summary = f"HEURISTIC corrections: {corrections}; words analysed: {original_words}"
779
-
780
- save_history(user["id"], "grammar", text, summary)
781
-
782
- return {
783
- "original_words": original_words,
784
- "corrections": corrections,
785
- "corrected_text": corrected,
786
- "summary": summary,
787
- }
788
-
789
-
790
- # ------------------ PLAGIARISM ENDPOINTS (COMBINED) ------------------
791
- @app.post("/api/plagiarism-check")
792
- def api_plagiarism_check(req: TextRequest, user=Depends(get_current_user)):
793
- text = req.text or ""
794
- if not text.strip():
795
- raise HTTPException(status_code=400, detail="Text is required")
796
-
797
- # First try full combined engine (TF-IDF + embeddings) with corpus
798
- try:
799
- result = corpus_plagiarism_combined(text)
800
- save_history(user["id"], "plagiarism", text, result["summary"])
801
- return result
802
- except Exception as e:
803
- print("[Plagiarism] Combined corpus engine failed, falling back to demo:", e)
804
-
805
- # Fallback: small Jaccard demo
806
- result = demo_plagiarism_fallback(text)
807
- save_history(user["id"], "plagiarism", text, result["summary"])
808
- return result
809
-
810
-
811
- @app.post("/api/plagiarism-check-file")
812
- def api_plagiarism_check_file(file: UploadFile = File(...), user=Depends(get_current_user)):
813
- text = extract_text_from_upload(file).strip()
814
- if not text:
815
- raise HTTPException(status_code=400, detail="Uploaded file contains no text")
816
-
817
- try:
818
- result = corpus_plagiarism_combined(text)
819
- save_history(user["id"], "plagiarism", text, result["summary"])
820
- return result
821
- except Exception as e:
822
- print("[Plagiarism-file] Combined corpus engine failed, falling back to demo:", e)
823
-
824
- # Fallback to demo if corpus/engines unavailable
825
- result = demo_plagiarism_fallback(text)
826
- save_history(user["id"], "plagiarism", text, result["summary"])
827
- return result
828
-
829
-
830
- # ------------------ AI CHECK (TEXT & FILE) ------------------
831
- def heuristic_ai_score(text: str):
832
- words = re.sub(r"[^a-z0-9\s]", " ", text.lower()).split()
833
- word_count = len(words)
834
- unique_ratio = len(set(words)) / (word_count or 1)
835
- sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
836
- avg_sentence_length = word_count / (len(sentences) or 1)
837
-
838
- ai_score = 0
839
- if unique_ratio < 0.45:
840
- ai_score += 40
841
- elif unique_ratio < 0.6:
842
- ai_score += 20
843
-
844
- if avg_sentence_length > 25:
845
- ai_score += 40
846
- elif avg_sentence_length > 18:
847
- ai_score += 25
848
-
849
- if word_count > 400:
850
- ai_score += 10
851
-
852
- ai_score = min(100, round(ai_score))
853
- human_score = 100 - ai_score
854
- return ai_score, human_score, word_count, avg_sentence_length, unique_ratio
855
-
856
-
857
- @app.post("/api/ai-check")
858
- def api_ai_check(req: TextRequest, user=Depends(get_current_user)):
859
- text = (req.text or "").strip()
860
- if not text:
861
- raise HTTPException(status_code=400, detail="Text is required")
862
-
863
- if model is not None and tokenizer is not None:
864
- try:
865
- max_len = getattr(tokenizer, "model_max_length", 512)
866
- if max_len is None or max_len > 1024:
867
- max_len = 512
868
-
869
- words = text.split()
870
- chunk_size = min(400, max_len - 10)
871
- chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
872
- probs = []
873
- for chunk in chunks:
874
- inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=max_len)
875
- inputs = {k: v.to(device) for k, v in inputs.items()}
876
- with torch.no_grad():
877
- outputs = model(**inputs)
878
- logits = outputs.logits
879
- p = torch.softmax(logits, dim=1).cpu().numpy()[0]
880
- ai_prob = float(p[1]) if p.shape[0] > 1 else float(p[0])
881
- probs.append(ai_prob)
882
- avg_ai_prob = float(np.mean(probs)) if probs else 0.0
883
- ai_percent = round(avg_ai_prob * 100, 2)
884
- human_percent = round(100 - ai_percent, 2)
885
- words_count = len(words)
886
- sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
887
- avg_sentence_len = round(words_count / (len(sentences) or 1), 2)
888
- summary = f"Model: {AI_DETECTOR_MODEL}; AI probability: {ai_percent}%"
889
- save_history(user["id"], "ai", text, summary)
890
- return {
891
- "ai_percent": ai_percent,
892
- "human_percent": human_percent,
893
- "word_count": words_count,
894
- "avg_sentence_length": avg_sentence_len,
895
- "summary": summary,
896
- }
897
- except Exception as e:
898
- print("[AI-check] model inference failed:", e)
899
-
900
- ai_percent, human_percent, wc, avg_len, uniq = heuristic_ai_score(text)
901
- summary = f"HEURISTIC fallback — AI probability: {ai_percent}%"
902
- save_history(user["id"], "ai", text, summary)
903
- return {
904
- "ai_percent": ai_percent,
905
- "human_percent": human_percent,
906
- "word_count": wc,
907
- "avg_sentence_length": avg_len,
908
- "unique_ratio": round(uniq, 3),
909
- "summary": summary,
910
- }
911
-
912
-
913
- @app.post("/api/ai-check-file")
914
- def api_ai_check_file(file: UploadFile = File(...), user=Depends(get_current_user)):
915
- text = extract_text_from_upload(file).strip()
916
- if not text:
917
- raise HTTPException(status_code=400, detail="Uploaded file contains no text")
918
- return api_ai_check.__wrapped__(TextRequest(text=text), user)
919
-
920
-
921
- # ------------------ HISTORY ------------------
922
- @app.get("/api/history")
923
- def api_history(user=Depends(get_current_user)):
924
- cur.execute(
925
- "SELECT id, tool, input_text, result_summary, created_at "
926
- "FROM history WHERE user_id = ? "
927
- "ORDER BY created_at DESC LIMIT 50",
928
- (user["id"],),
929
- )
930
- rows = cur.fetchall()
931
- items = []
932
- for r in rows:
933
- items.append(
934
- {
935
- "id": r["id"],
936
- "tool": r["tool"],
937
- "input_text": r["input_text"],
938
- "summary": r["result_summary"],
939
- "created_at": r["created_at"],
940
- }
941
- )
942
- return {"items": items}
943
-
944
-
945
- @app.get("/")
946
- def read_root():
947
  return {"status": "Backend is running with 16GB RAM!"}
 
1
+ # backend/main.py
2
+ import os
3
+ import re
4
+ import io
5
+ import sqlite3
6
+ from datetime import datetime, timezone
7
+
8
+ from dotenv import load_dotenv
9
+ from fastapi import FastAPI, HTTPException, status, Header, Depends, File, UploadFile
10
+ from fastapi.middleware.cors import CORSMiddleware
11
+ from pydantic import BaseModel, EmailStr
12
+ from passlib.context import CryptContext
13
+ import jwt
14
+
15
+ # File parsing libs
16
+ from docx import Document as DocxDocument
17
+ import PyPDF2
18
+
19
+ # ML / NLP libs
20
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
21
+ import torch
22
+ import numpy as np
23
+
24
+ # TF-IDF
25
+ from sklearn.feature_extraction.text import TfidfVectorizer
26
+ from sklearn.metrics.pairwise import cosine_similarity
27
+
28
+ # Semantic embeddings for plagiarism (combined approach)
29
+ try:
30
+ from sentence_transformers import SentenceTransformer
31
+ except Exception:
32
+ SentenceTransformer = None
33
+
34
+ # LanguageTool (may require Java)
35
+ try:
36
+ import language_tool_python
37
+ except Exception:
38
+ language_tool_python = None
39
+
40
+ # GECToR (neural grammatical error correction)
41
+ try:
42
+ from gector import GECToR, predict as gector_predict, load_verb_dict
43
+ except Exception:
44
+ GECToR = None
45
+ gector_predict = None
46
+ load_verb_dict = None
47
+
48
+ # ------------------ ENV & DB SETUP ------------------
49
+ load_dotenv()
50
+
51
+ JWT_SECRET = os.getenv("JWT_SECRET", "super_secret_key_change_this")
52
+ JWT_ALGO = os.getenv("JWT_ALGO", "HS256")
53
+ DB_PATH = os.getenv("DB_PATH", "truewrite.db")
54
+ CORPUS_DIR = os.getenv("CORPUS_DIR", "corpus")
55
+ CORPUS_RAW = os.getenv("CORPUS_RAW", "corpus_raw")
56
+
57
+ # Combined plagiarism weights
58
+ PLAG_ALPHA = float(os.getenv("PLAG_ALPHA", "0.4")) # TF-IDF weight; (1-alpha) for embeddings
59
+
60
+ pwd_context = CryptContext(schemes=["pbkdf2_sha256"], deprecated="auto")
61
+
62
+ # SQLite DB (simple demo)
63
+ conn = sqlite3.connect(DB_PATH, check_same_thread=False)
64
+ conn.row_factory = sqlite3.Row
65
+ cur = conn.cursor()
66
+
67
+ # Create tables if not exist
68
+ cur.execute("""
69
+ CREATE TABLE IF NOT EXISTS users (
70
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
71
+ name TEXT NOT NULL,
72
+ email TEXT NOT NULL UNIQUE,
73
+ password_hash TEXT NOT NULL,
74
+ created_at TEXT NOT NULL
75
+ )
76
+ """)
77
+
78
+ cur.execute("""
79
+ CREATE TABLE IF NOT EXISTS history (
80
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
81
+ user_id INTEGER NOT NULL,
82
+ tool TEXT NOT NULL,
83
+ input_text TEXT,
84
+ result_summary TEXT,
85
+ created_at TEXT NOT NULL,
86
+ FOREIGN KEY (user_id) REFERENCES users(id)
87
+ )
88
+ """)
89
+
90
+ conn.commit()
91
+
92
+ # ------------------ FASTAPI APP ------------------
93
+ app = FastAPI(title="TrueWrite Scan (Python Backend)")
94
+
95
+ app.add_middleware(
96
+ CORSMiddleware,
97
+ allow_origins=["https://true-write-scan.vercel.app/"],
98
+ allow_credentials=True,
99
+ allow_methods=["*"],
100
+ allow_headers=["*"],
101
+ )
102
+
103
+ # ------------------ MODELS ------------------
104
+ class SignupRequest(BaseModel):
105
+ name: str
106
+ email: EmailStr
107
+ password: str
108
+
109
+
110
+ class LoginRequest(BaseModel):
111
+ email: EmailStr
112
+ password: str
113
+
114
+
115
+ class TextRequest(BaseModel):
116
+ text: str
117
+
118
+
119
+ # ------------------ AUTH HELPERS ------------------
120
+ def hash_password(pw: str) -> str:
121
+ return pwd_context.hash(pw)
122
+
123
+
124
+ def verify_password(plain: str, hashed: str) -> bool:
125
+ return pwd_context.verify(plain, hashed)
126
+
127
+
128
+ def create_token(user_id: int, email: str) -> str:
129
+ payload = {"user_id": user_id, "email": email}
130
+ token = jwt.encode(payload, JWT_SECRET, algorithm=JWT_ALGO)
131
+ if isinstance(token, bytes):
132
+ token = token.decode("utf-8")
133
+ return token
134
+
135
+
136
+ def decode_token(token: str):
137
+ try:
138
+ payload = jwt.decode(token, JWT_SECRET, algorithms=[JWT_ALGO])
139
+ return payload
140
+ except jwt.PyJWTError:
141
+ raise HTTPException(
142
+ status_code=status.HTTP_401_UNAUTHORIZED,
143
+ detail="Invalid token"
144
+ )
145
+
146
+
147
+ def get_current_user(authorization: str = Header(None)):
148
+ if not authorization or not authorization.startswith("Bearer "):
149
+ raise HTTPException(
150
+ status_code=status.HTTP_401_UNAUTHORIZED,
151
+ detail="Missing token"
152
+ )
153
+ token = authorization.split(" ", 1)[1]
154
+ payload = decode_token(token)
155
+ user_id = payload.get("user_id")
156
+ cur.execute("SELECT * FROM users WHERE id = ?", (user_id,))
157
+ row = cur.fetchone()
158
+ if not row:
159
+ raise HTTPException(
160
+ status_code=status.HTTP_401_UNAUTHORIZED,
161
+ detail="User not found"
162
+ )
163
+ return {"id": row["id"], "name": row["name"], "email": row["email"]}
164
+
165
+
166
+ def now_iso():
167
+ return datetime.now(timezone.utc).isoformat()
168
+
169
+
170
+ def save_history(user_id: int, tool: str, input_text: str, summary: str):
171
+ trimmed = (input_text[:500] + "...") if len(input_text) > 500 else input_text
172
+ cur.execute(
173
+ "INSERT INTO history (user_id, tool, input_text, result_summary, created_at) VALUES (?, ?, ?, ?, ?)",
174
+ (user_id, tool, trimmed, summary, now_iso()),
175
+ )
176
+ conn.commit()
177
+
178
+
179
+ # ------------------ TEXT HELPERS ------------------
180
+ def count_words(text: str) -> int:
181
+ tokens = text.strip().split()
182
+ return len(tokens) if text.strip() else 0
183
+
184
+
185
+ def simple_grammar_correct(text: str):
186
+ """Old heuristic grammar fixer (kept as fallback)."""
187
+ corrections = 0
188
+ original_words = count_words(text)
189
+
190
+ before = text
191
+ text = re.sub(r"\s{2,}", " ", text)
192
+ if text != before:
193
+ corrections += 1
194
+
195
+ before = text
196
+ text = re.sub(r"\bi\b", "I", text)
197
+ if text != before:
198
+ corrections += 1
199
+
200
+ def cap_match(m):
201
+ return m.group(0).upper()
202
+
203
+ before = text
204
+ text = re.sub(r"(^\s*\w|[.!?]\s+\w)", cap_match, text)
205
+ if text != before:
206
+ corrections += 1
207
+
208
+ if text.strip() and not re.search(r"[.!?]\s*$", text.strip()):
209
+ text = text.strip() + "."
210
+ corrections += 1
211
+
212
+ return text, corrections, original_words
213
+
214
+
215
+ # ------------------ CORPUS BUILDING (from corpus_raw -> corpus) ------------------
216
+ def extract_from_docx_path(path: str) -> str:
217
+ doc = DocxDocument(path)
218
+ paragraphs = [p.text for p in doc.paragraphs]
219
+ return "\n".join(paragraphs)
220
+
221
+
222
+ def extract_from_pdf_path(path: str) -> str:
223
+ with open(path, "rb") as f:
224
+ reader = PyPDF2.PdfReader(f)
225
+ texts = []
226
+ for pg in range(len(reader.pages)):
227
+ try:
228
+ texts.append(reader.pages[pg].extract_text() or "")
229
+ except Exception:
230
+ texts.append("")
231
+ return "\n".join(texts)
232
+
233
+
234
+ def build_corpus_from_raw(raw_dir: str = CORPUS_RAW, out_dir: str = CORPUS_DIR):
235
+ """
236
+ Convert any .pdf / .docx / .txt files from corpus_raw/ into .txt files in corpus/.
237
+ This mirrors your build_corpus.py logic but is called automatically at startup.
238
+ """
239
+ os.makedirs(raw_dir, exist_ok=True)
240
+ os.makedirs(out_dir, exist_ok=True)
241
+
242
+ for fname in os.listdir(raw_dir):
243
+ inpath = os.path.join(raw_dir, fname)
244
+ if not os.path.isfile(inpath):
245
+ continue
246
+ outname = os.path.splitext(fname)[0] + ".txt"
247
+ outpath = os.path.join(out_dir, outname)
248
+ try:
249
+ ext = fname.lower()
250
+ if ext.endswith(".docx"):
251
+ text = extract_from_docx_path(inpath)
252
+ elif ext.endswith(".pdf"):
253
+ text = extract_from_pdf_path(inpath)
254
+ elif ext.endswith(".txt"):
255
+ with open(inpath, "r", encoding="utf-8", errors="ignore") as f:
256
+ text = f.read()
257
+ else:
258
+ print("[CorpusRaw] Skipping unsupported:", fname)
259
+ continue
260
+
261
+ text = text.strip()
262
+ with open(outpath, "w", encoding="utf-8") as fo:
263
+ fo.write(text)
264
+ print("[CorpusRaw] Wrote:", outpath)
265
+ except Exception as e:
266
+ print("[CorpusRaw] Failed", fname, "->", e)
267
+
268
+
269
+ # ------------------ TF-IDF CORPUS LOADING ------------------
270
+ vectorizer = None
271
+ corpus_tfidf = None
272
+ corpus_titles = []
273
+ corpus_texts = []
274
+
275
+
276
+ def load_corpus(corpus_dir=CORPUS_DIR):
277
+ """
278
+ Load .txt corpus files from CORPUS_DIR, build TF-IDF index.
279
+ Semantic embeddings are built separately in load_embeddings().
280
+ """
281
+ global vectorizer, corpus_tfidf, corpus_titles, corpus_texts
282
+ corpus_titles = []
283
+ corpus_texts = []
284
+ if not os.path.isdir(corpus_dir):
285
+ os.makedirs(corpus_dir, exist_ok=True)
286
+ print("[Corpus] Created empty corpus directory:", corpus_dir)
287
+ vectorizer = None
288
+ corpus_tfidf = None
289
+ return
290
+
291
+ for fname in os.listdir(corpus_dir):
292
+ if fname.lower().endswith(".txt"):
293
+ path = os.path.join(corpus_dir, fname)
294
+ try:
295
+ with open(path, "r", encoding="utf-8", errors="ignore") as f:
296
+ txt = f.read()
297
+ corpus_titles.append(fname)
298
+ corpus_texts.append(txt)
299
+ except Exception as e:
300
+ print(f"[Corpus] Failed to read {path}: {e}")
301
+
302
+ if corpus_texts:
303
+ try:
304
+ vectorizer = TfidfVectorizer(
305
+ ngram_range=(1, 3),
306
+ stop_words="english",
307
+ max_features=50000
308
+ )
309
+ corpus_tfidf = vectorizer.fit_transform(corpus_texts)
310
+ print(f"[Corpus] Loaded {len(corpus_texts)} documents into TF-IDF index")
311
+ except Exception as e:
312
+ print("[Corpus] TF-IDF build failed:", e)
313
+ vectorizer = None
314
+ corpus_tfidf = None
315
+ else:
316
+ vectorizer = None
317
+ corpus_tfidf = None
318
+ print("[Corpus] No .txt documents found in", corpus_dir)
319
+
320
+
321
+ # ------------------ SEMANTIC EMBEDDINGS (SentenceTransformers) ------------------
322
+ emb_model = None
323
+ corpus_emb = None
324
+ EMB_MODEL_NAME = os.getenv("PLAG_EMB_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
325
+
326
+
327
+ def load_embeddings():
328
+ """
329
+ Build semantic embedding index for plagiarism using sentence-transformers.
330
+ """
331
+ global emb_model, corpus_emb
332
+ if SentenceTransformer is None:
333
+ print("[Embeddings] sentence-transformers not installed; skipping semantic index.")
334
+ emb_model = None
335
+ corpus_emb = None
336
+ return
337
+
338
+ if not corpus_texts:
339
+ print("[Embeddings] No corpus texts available; semantic index not built.")
340
+ emb_model = None
341
+ corpus_emb = None
342
+ return
343
+
344
+ try:
345
+ emb_model = SentenceTransformer(EMB_MODEL_NAME)
346
+ corpus_emb = emb_model.encode(
347
+ corpus_texts,
348
+ convert_to_numpy=True,
349
+ show_progress_bar=False,
350
+ normalize_embeddings=True,
351
+ )
352
+ print(f"[Embeddings] Loaded '{EMB_MODEL_NAME}' and encoded {len(corpus_texts)} corpus docs.")
353
+ except Exception as e:
354
+ emb_model = None
355
+ corpus_emb = None
356
+ print("[Embeddings] Failed to load or encode corpus:", e)
357
+
358
+
359
+ # Build corpus & embeddings at startup
360
+ build_corpus_from_raw()
361
+ load_corpus()
362
+ load_embeddings()
363
+
364
+ # ------------------ HF MODEL LOADING (AI Detector) ------------------
365
+ AI_DETECTOR_MODEL = "openai-community/roberta-base-openai-detector"
366
+ tokenizer = None
367
+ model = None
368
+ device = None
369
+
370
+ try:
371
+ tokenizer = AutoTokenizer.from_pretrained(AI_DETECTOR_MODEL)
372
+ model = AutoModelForSequenceClassification.from_pretrained(AI_DETECTOR_MODEL)
373
+ model.eval()
374
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
375
+ model.to(device)
376
+ print(f"[AI Detector] Loaded {AI_DETECTOR_MODEL} on {device}")
377
+ except Exception as e:
378
+ tokenizer = None
379
+ model = None
380
+ device = None
381
+ print("[AI Detector] Failed to load HF model — using heuristic fallback. Error:", e)
382
+
383
+ # ------------------ GECToR LOADING (Neural GEC) ------------------
384
+ GEC_MODEL = None
385
+ GEC_TOKENIZER = None
386
+ GEC_ENCODE = None
387
+ GEC_DECODE = None
388
+ GEC_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
389
+
390
+ if GECToR is not None and gector_predict is not None and load_verb_dict is not None:
391
+ try:
392
+ GEC_MODEL_ID = os.getenv("GEC_MODEL_ID", "gotutiyan/gector-roberta-base-5k")
393
+ GEC_VERB_FILE = os.getenv("GEC_VERB_FILE", "data/verb-form-vocab.txt")
394
+ GEC_MODEL = GECToR.from_pretrained(GEC_MODEL_ID).to(GEC_DEVICE)
395
+ GEC_TOKENIZER = AutoTokenizer.from_pretrained(GEC_MODEL_ID)
396
+ GEC_ENCODE, GEC_DECODE = load_verb_dict(GEC_VERB_FILE)
397
+ print(f"[GECToR] Loaded model {GEC_MODEL_ID} on {GEC_DEVICE}")
398
+ except Exception as e:
399
+ GEC_MODEL = None
400
+ GEC_TOKENIZER = None
401
+ GEC_ENCODE = None
402
+ GEC_DECODE = None
403
+ print("[GECToR] Failed to load — falling back to LanguageTool/heuristics. Error:", e)
404
+ else:
405
+ print("[GECToR] gector library not available; using LanguageTool/heuristics for grammar.")
406
+
407
+
408
+ def gector_correct(text: str):
409
+ """
410
+ Run neural grammatical error correction using GECToR.
411
+
412
+ - Trims to 1000 words (server-side safety).
413
+ - Splits into sentences, runs GECToR, then joins back.
414
+ """
415
+ if not (GEC_MODEL and GEC_TOKENIZER and GEC_ENCODE and GEC_DECODE):
416
+ raise RuntimeError("GECToR model not loaded")
417
+
418
+ parts = text.strip().split()
419
+ if len(parts) > 1000:
420
+ text_proc = " ".join(parts[:1000])
421
+ else:
422
+ text_proc = text.strip()
423
+
424
+ if not text_proc:
425
+ return text_proc, 0, 0
426
+
427
+ sentences = re.split(r"(?<=[.!?])\s+", text_proc)
428
+ sentences = [s for s in sentences if s.strip()]
429
+ if not sentences:
430
+ sentences = [text_proc]
431
+
432
+ keep_conf = float(os.getenv("GEC_KEEP_CONFIDENCE", "0.0"))
433
+ min_err_prob = float(os.getenv("GEC_MIN_ERROR_PROB", "0.0"))
434
+ n_iter = int(os.getenv("GEC_N_ITER", "5"))
435
+ batch_size = int(os.getenv("GEC_BATCH_SIZE", "8"))
436
+
437
+ corrected_sentences = gector_predict(
438
+ GEC_MODEL,
439
+ GEC_TOKENIZER,
440
+ sentences,
441
+ GEC_ENCODE,
442
+ GEC_DECODE,
443
+ keep_confidence=keep_conf,
444
+ min_error_prob=min_err_prob,
445
+ n_iteration=n_iter,
446
+ batch_size=batch_size,
447
+ )
448
+ corrected_text = " ".join(corrected_sentences)
449
+ original_words = len(text_proc.split())
450
+ corrections = sum(
451
+ 1 for a, b in zip(text_proc.split(), corrected_text.split()) if a != b
452
+ )
453
+ return corrected_text, corrections, original_words
454
+
455
+
456
+ # ------------------ FILE EXTRACTION HELPERS ------------------
457
+ MAX_FILE_SIZE = 15 * 1024 * 1024 # 15 MB
458
+
459
+
460
+ def extract_text_from_upload(upload: UploadFile) -> str:
461
+ filename = (upload.filename or "").lower()
462
+ content_type = (upload.content_type or "").lower()
463
+ data = upload.file.read()
464
+ try:
465
+ upload.file.seek(0)
466
+ except Exception:
467
+ pass
468
+
469
+ if len(data) > MAX_FILE_SIZE:
470
+ raise HTTPException(status_code=413, detail="File too large (max 15MB)")
471
+
472
+ # TXT
473
+ if filename.endswith(".txt") or content_type == "text/plain":
474
+ try:
475
+ try:
476
+ return data.decode("utf-8")
477
+ except UnicodeDecodeError:
478
+ return data.decode("latin-1")
479
+ except Exception as e:
480
+ raise HTTPException(status_code=400, detail=f"Failed to decode text file: {e}")
481
+
482
+ # DOCX
483
+ if filename.endswith(".docx") or "wordprocessingml" in content_type:
484
+ # Basic sanity check: valid .docx is a ZIP (PK header)
485
+ if not data.startswith(b"PK"):
486
+ raise HTTPException(
487
+ status_code=400,
488
+ detail="Uploaded file is not a valid .docx package (it might be an old .doc file or a corrupted document). "
489
+ "Please open it in Word/Google Docs and re-save as .docx or export as PDF, then upload again."
490
+ )
491
+ try:
492
+ f = io.BytesIO(data)
493
+ doc = DocxDocument(f)
494
+ paragraphs = [p.text for p in doc.paragraphs]
495
+ text = "\n".join(paragraphs).strip()
496
+ if not text:
497
+ raise ValueError("DOCX contained no readable text.")
498
+ return text
499
+ except Exception as e:
500
+ raise HTTPException(
501
+ status_code=400,
502
+ detail=f"Failed to parse docx file: {e}. Try opening it in Word/Google Docs and exporting again as .docx or PDF."
503
+ )
504
+
505
+ # PDF
506
+ if filename.endswith(".pdf") or "pdf" in content_type:
507
+ try:
508
+ f = io.BytesIO(data)
509
+ reader = PyPDF2.PdfReader(f)
510
+ texts = []
511
+ for pg in range(len(reader.pages)):
512
+ try:
513
+ txt = reader.pages[pg].extract_text() or ""
514
+ except Exception:
515
+ txt = ""
516
+ texts.append(txt)
517
+ return "\n".join(texts)
518
+ except Exception as e:
519
+ raise HTTPException(status_code=400, detail=f"Failed to parse PDF file: {e}")
520
+
521
+ raise HTTPException(
522
+ status_code=415,
523
+ detail="Unsupported file type. Use .txt, .pdf, or .docx",
524
+ )
525
+
526
+
527
+ # ------------------ GRAMMAR (LANGUAGETOOL INTEGRATION) ------------------
528
+ lt_tool = None
529
+ if language_tool_python is not None:
530
+ try:
531
+ lt_tool = language_tool_python.LanguageTool("en-US")
532
+ print("[LanguageTool] Loaded (local Java-backed checker)")
533
+ except Exception as e:
534
+ lt_tool = None
535
+ print("[LanguageTool] Could not start local LanguageTool — falling back. Error:", e)
536
+ else:
537
+ print("[LanguageTool] library not installed; falling back to heuristics.")
538
+
539
+
540
+ def grammar_with_languagetool(text: str):
541
+ parts = text.strip().split()
542
+ if len(parts) > 1000:
543
+ text_proc = " ".join(parts[:1000])
544
+ else:
545
+ text_proc = text.strip()
546
+
547
+ matches = lt_tool.check(text_proc)
548
+ corrected = language_tool_python.utils.correct(text_proc, matches)
549
+ corrections = len(matches)
550
+ return corrected, corrections, len(text_proc.split())
551
+
552
+
553
+ # ------------------ PLAGIARISM HELPERS (COMBINED ENGINE) ------------------
554
+ def _clean_for_jaccard(t: str):
555
+ t = t.lower()
556
+ t = re.sub(r"[^a-z0-9\s]", " ", t)
557
+ return [w for w in t.split() if w]
558
+
559
+
560
+ def _jaccard_similarity(a, b):
561
+ sa = set(a)
562
+ sb = set(b)
563
+ if not sa or not sb:
564
+ return 0.0
565
+ return len(sa & sb) / len(sa | sb)
566
+
567
+
568
+ def demo_plagiarism_fallback(text: str):
569
+ """
570
+ Simple Jaccard-based fallback using a tiny built-in sample set.
571
+ Used when no TF-IDF / semantic corpus is available.
572
+ """
573
+ SAMPLE_DOCS = [
574
+ {"title": "AI for Social Good",
575
+ "text": "Artificial intelligence is transforming multiple industries by automating routine tasks and enabling data driven decision making for social impact and efficiency."},
576
+ {"title": "IoT in Smart Cities",
577
+ "text": "The Internet of Things connects sensors, devices, and cloud platforms to enable real time monitoring and control in smart cities including lighting, traffic, and waste management."},
578
+ {"title": "Climate & Renewable Energy",
579
+ "text": "Climate change is a critical global challenge that demands renewable energy, efficient resource management, and international cooperation to ensure a sustainable future."},
580
+ ]
581
+
582
+ input_words = _clean_for_jaccard(text)
583
+ best_score = 0.0
584
+ matches = []
585
+ for doc in SAMPLE_DOCS:
586
+ doc_words = _clean_for_jaccard(doc["text"])
587
+ score = _jaccard_similarity(input_words, doc_words)
588
+ matches.append({"title": doc["title"], "score": round(score * 100, 2)})
589
+ if score > best_score:
590
+ best_score = score
591
+
592
+ matches.sort(key=lambda x: x["score"], reverse=True)
593
+ plagiarism_percent = round(best_score * 100, 2)
594
+ summary = f"Plagiarism estimate (demo Jaccard): {plagiarism_percent}%"
595
+ return {"plagiarism_percent": plagiarism_percent, "matches": matches[:5], "summary": summary}
596
+
597
+
598
+ def corpus_plagiarism_combined(text: str):
599
+ """
600
+ Combined plagiarism score using:
601
+ - TF-IDF cosine similarity
602
+ - Semantic embedding cosine similarity (SentenceTransformers)
603
+
604
+ Returns dict matching API schema:
605
+ { plagiarism_percent, matches, summary }
606
+ """
607
+ if not corpus_texts:
608
+ raise ValueError("No corpus texts loaded")
609
+
610
+ sims_tfidf = None
611
+ sims_emb = None
612
+
613
+ words = text.split()
614
+ if len(words) > 3000:
615
+ text_proc = " ".join(words[:3000])
616
+ else:
617
+ text_proc = text
618
+
619
+ # TF-IDF similarity
620
+ if vectorizer is not None and corpus_tfidf is not None:
621
+ q = vectorizer.transform([text_proc])
622
+ sims_tfidf = cosine_similarity(q, corpus_tfidf)[0]
623
+
624
+ # Semantic similarity
625
+ if emb_model is not None and corpus_emb is not None:
626
+ q_emb = emb_model.encode(
627
+ [text_proc],
628
+ convert_to_numpy=True,
629
+ normalize_embeddings=True,
630
+ show_progress_bar=False,
631
+ )[0]
632
+ sims_emb = corpus_emb @ q_emb # normalized → dot = cosine
633
+
634
+ if sims_tfidf is None and sims_emb is None:
635
+ raise ValueError("No plagiarism backends (TF-IDF / embeddings) are available")
636
+
637
+ n_docs = len(corpus_texts)
638
+ combined_rows = []
639
+ alpha = PLAG_ALPHA # TF-IDF weight
640
+
641
+ for i in range(n_docs):
642
+ tf = float(sims_tfidf[i]) if sims_tfidf is not None else None
643
+ se = float(sims_emb[i]) if sims_emb is not None else None
644
+ if tf is None and se is None:
645
+ continue
646
+
647
+ if tf is not None and se is not None:
648
+ score = alpha * tf + (1.0 - alpha) * se
649
+ elif tf is not None:
650
+ score = tf
651
+ else:
652
+ score = se
653
+
654
+ combined_rows.append({
655
+ "index": i,
656
+ "combined": score,
657
+ "tfidf": tf,
658
+ "semantic": se,
659
+ })
660
+
661
+ if not combined_rows:
662
+ raise ValueError("No scores computed for corpus documents")
663
+
664
+ combined_rows.sort(key=lambda x: x["combined"], reverse=True)
665
+ top = combined_rows[:10]
666
+
667
+ best = top[0]["combined"]
668
+ plagiarism_percent = round(best * 100, 2)
669
+
670
+ matches = []
671
+ for row in top:
672
+ matches.append({
673
+ "title": corpus_titles[row["index"]],
674
+
675
+
676
+ "score": round(row["combined"] * 100, 2),
677
+ "tfidf_score": round(row["tfidf"] * 100, 2) if row["tfidf"] is not None else None,
678
+ "semantic_score": round(row["semantic"] * 100, 2) if row["semantic"] is not None else None,
679
+ })
680
+
681
+ components = []
682
+ if sims_tfidf is not None:
683
+ components.append("TF-IDF")
684
+ if sims_emb is not None:
685
+ components.append("semantic embeddings")
686
+ comp_str = " + ".join(components)
687
+
688
+ summary = f"Plagiarism estimate (combined {comp_str}): {plagiarism_percent}%"
689
+ return {"plagiarism_percent": plagiarism_percent, "matches": matches, "summary": summary}
690
+
691
+
692
+ # ------------------ ENDPOINTS ------------------
693
+
694
+ @app.post("/api/signup")
695
+ def signup(req: SignupRequest):
696
+ cur.execute("SELECT id FROM users WHERE email = ?", (req.email,))
697
+ if cur.fetchone():
698
+ raise HTTPException(status_code=400, detail="Email already registered")
699
+
700
+ pw_hash = hash_password(req.password)
701
+ created_at = now_iso()
702
+ cur.execute(
703
+ "INSERT INTO users (name, email, password_hash, created_at) VALUES (?, ?, ?, ?)",
704
+ (req.name, req.email, pw_hash, created_at),
705
+ )
706
+ conn.commit()
707
+ user_id = cur.lastrowid
708
+ token = create_token(user_id, req.email)
709
+
710
+ return {
711
+ "message": "Signup successful",
712
+ "token": token,
713
+ "name": req.name,
714
+ "email": req.email,
715
+ }
716
+
717
+
718
+ @app.post("/api/login")
719
+ def login(req: LoginRequest):
720
+ cur.execute("SELECT * FROM users WHERE email = ?", (req.email,))
721
+ row = cur.fetchone()
722
+ if not row or not verify_password(req.password, row["password_hash"]):
723
+ raise HTTPException(status_code=401, detail="Invalid email or password")
724
+
725
+ token = create_token(row["id"], row["email"])
726
+ return {
727
+ "message": "Login successful",
728
+ "token": token,
729
+ "name": row["name"],
730
+ "email": row["email"],
731
+ }
732
+
733
+
734
+ @app.post("/api/grammar-check")
735
+ def api_grammar_check(req: TextRequest, user=Depends(get_current_user)):
736
+ text = req.text or ""
737
+ if not text.strip():
738
+ raise HTTPException(status_code=400, detail="Text is required")
739
+
740
+ # Prefer GECToR → LanguageTool → heuristics
741
+ if GEC_MODEL is not None:
742
+ corrected, corrections, original_words = gector_correct(text)
743
+ summary = f"GECToR neural GEC: {corrections} edits; words analysed: {original_words}"
744
+ elif lt_tool is not None:
745
+ corrected, corrections, original_words = grammar_with_languagetool(text)
746
+ summary = f"LanguageTool corrections: {corrections}; words analysed: {original_words}"
747
+ else:
748
+ corrected, corrections, original_words = simple_grammar_correct(text)
749
+ summary = f"HEURISTIC corrections: {corrections}; words analysed: {original_words}"
750
+
751
+ save_history(user["id"], "grammar", text, summary)
752
+
753
+ return {
754
+ "original_words": original_words,
755
+ "corrections": corrections,
756
+ "corrected_text": corrected,
757
+ "summary": summary,
758
+ }
759
+
760
+
761
+ @app.post("/api/grammar-check-file")
762
+ def api_grammar_check_file(file: UploadFile = File(...), user=Depends(get_current_user)):
763
+ text = extract_text_from_upload(file).strip()
764
+ if not text:
765
+ raise HTTPException(status_code=400, detail="Uploaded file contains no text")
766
+
767
+ if GEC_MODEL is not None:
768
+ corrected, corrections, original_words = gector_correct(text)
769
+ summary = f"GECToR neural GEC: {corrections} edits; words analysed: {original_words}"
770
+ elif lt_tool is not None:
771
+ corrected, corrections, original_words = grammar_with_languagetool(text)
772
+ summary = f"LanguageTool corrections: {corrections}; words analysed: {original_words}"
773
+ else:
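+ # Heuristic path only: keep just the first 1000 words of very long uploads
+ # before running the simple fallback corrector.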
774
+ parts = text.strip().split()
775
+ if len(parts) > 1000:
776
+ text = " ".join(parts[:1000])
777
+ corrected, corrections, original_words = simple_grammar_correct(text)
778
+ summary = f"HEURISTIC corrections: {corrections}; words analysed: {original_words}"
779
+
780
+ save_history(user["id"], "grammar", text, summary)
781
+
782
+ return {
783
+ "original_words": original_words,
784
+ "corrections": corrections,
785
+ "corrected_text": corrected,
786
+ "summary": summary,
787
+ }
788
+
789
+
790
+ # ------------------ PLAGIARISM ENDPOINTS (COMBINED) ------------------
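+ # Hypothetical client call (sketch only — host/port and the Bearer-token header
+ # depend on how the app is served and how get_current_user reads credentials):
+ #   curl -X POST http://localhost:8000/api/plagiarism-check \
+ #     -H "Authorization: Bearer <token>" -H "Content-Type: application/json" \
+ #     -d '{"text": "Paste the passage to check here."}'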
791
+ @app.post("/api/plagiarism-check")
792
+ def api_plagiarism_check(req: TextRequest, user=Depends(get_current_user)):
793
+ text = req.text or ""
794
+ if not text.strip():
795
+ raise HTTPException(status_code=400, detail="Text is required")
796
+
797
+ # First try full combined engine (TF-IDF + embeddings) with corpus
798
+ try:
799
+ result = corpus_plagiarism_combined(text)
800
+ save_history(user["id"], "plagiarism", text, result["summary"])
801
+ return result
802
+ except Exception as e:
803
+ print("[Plagiarism] Combined corpus engine failed, falling back to demo:", e)
804
+
805
+ # Fallback: small Jaccard demo
806
+ result = demo_plagiarism_fallback(text)
807
+ save_history(user["id"], "plagiarism", text, result["summary"])
808
+ return result
809
+
810
+
811
+ @app.post("/api/plagiarism-check-file")
812
+ def api_plagiarism_check_file(file: UploadFile = File(...), user=Depends(get_current_user)):
813
+ text = extract_text_from_upload(file).strip()
814
+ if not text:
815
+ raise HTTPException(status_code=400, detail="Uploaded file contains no text")
816
+
817
+ try:
818
+ result = corpus_plagiarism_combined(text)
819
+ save_history(user["id"], "plagiarism", text, result["summary"])
820
+ return result
821
+ except Exception as e:
822
+ print("[Plagiarism-file] Combined corpus engine failed, falling back to demo:", e)
823
+
824
+ # Fallback to demo if corpus/engines unavailable
825
+ result = demo_plagiarism_fallback(text)
826
+ save_history(user["id"], "plagiarism", text, result["summary"])
827
+ return result
828
+
829
+
830
+ # ------------------ AI CHECK (TEXT & FILE) ------------------
831
+ def heuristic_ai_score(text: str):
832
+ words = re.sub(r"[^a-z0-9\s]", " ", text.lower()).split()
833
+ word_count = len(words)
834
+ unique_ratio = len(set(words)) / (word_count or 1)
835
+ sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
836
+ avg_sentence_length = word_count / (len(sentences) or 1)
837
+
838
+ ai_score = 0
839
+ if unique_ratio < 0.45:
840
+ ai_score += 40
841
+ elif unique_ratio < 0.6:
842
+ ai_score += 20
843
+
844
+ if avg_sentence_length > 25:
845
+ ai_score += 40
846
+ elif avg_sentence_length > 18:
847
+ ai_score += 25
848
+
849
+ if word_count > 400:
850
+ ai_score += 10
851
+
852
+ ai_score = min(100, round(ai_score))
853
+ human_score = 100 - ai_score
854
+ return ai_score, human_score, word_count, avg_sentence_length, unique_ratio
855
+
856
+
857
+ @app.post("/api/ai-check")
858
+ def api_ai_check(req: TextRequest, user=Depends(get_current_user)):
859
+ text = (req.text or "").strip()
860
+ if not text:
861
+ raise HTTPException(status_code=400, detail="Text is required")
862
+
863
+ if model is not None and tokenizer is not None:
864
+ try:
865
+ max_len = getattr(tokenizer, "model_max_length", 512)
866
+ if max_len is None or max_len > 1024:
867
+ max_len = 512
868
+
869
+ words = text.split()
870
+ chunk_size = min(400, max_len - 10)
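+ # Split long inputs into ~400-word chunks so each piece stays within the
+ # detector's context window; tokenizer truncation below is the safety net
+ # if a chunk still overflows.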
871
+ chunks = [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
872
+ probs = []
873
+ for chunk in chunks:
874
+ inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=max_len)
875
+ inputs = {k: v.to(device) for k, v in inputs.items()}
876
+ with torch.no_grad():
877
+ outputs = model(**inputs)
878
+ logits = outputs.logits
879
+ p = torch.softmax(logits, dim=1).cpu().numpy()[0]
880
+ ai_prob = float(p[1]) if p.shape[0] > 1 else float(p[0])
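+ # Assumes class index 1 is the "AI-generated" label for the configured
+ # AI_DETECTOR_MODEL; single-logit heads fall back to the only probability.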
881
+ probs.append(ai_prob)
882
+ avg_ai_prob = float(np.mean(probs)) if probs else 0.0
883
+ ai_percent = round(avg_ai_prob * 100, 2)
884
+ human_percent = round(100 - ai_percent, 2)
885
+ words_count = len(words)
886
+ sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
887
+ avg_sentence_len = round(words_count / (len(sentences) or 1), 2)
888
+ summary = f"Model: {AI_DETECTOR_MODEL}; AI probability: {ai_percent}%"
889
+ save_history(user["id"], "ai", text, summary)
890
+ return {
891
+ "ai_percent": ai_percent,
892
+ "human_percent": human_percent,
893
+ "word_count": words_count,
894
+ "avg_sentence_length": avg_sentence_len,
895
+ "summary": summary,
896
+ }
897
+ except Exception as e:
898
+ print("[AI-check] model inference failed:", e)
899
+
900
+ ai_percent, human_percent, wc, avg_len, uniq = heuristic_ai_score(text)
901
+ summary = f"HEURISTIC fallback — AI probability: {ai_percent}%"
902
+ save_history(user["id"], "ai", text, summary)
903
+ return {
904
+ "ai_percent": ai_percent,
905
+ "human_percent": human_percent,
906
+ "word_count": wc,
907
+ "avg_sentence_length": avg_len,
908
+ "unique_ratio": round(uniq, 3),
909
+ "summary": summary,
910
+ }
911
+
912
+
913
+ @app.post("/api/ai-check-file")
914
+ def api_ai_check_file(file: UploadFile = File(...), user=Depends(get_current_user)):
915
+ text = extract_text_from_upload(file).strip()
916
+ if not text:
917
+ raise HTTPException(status_code=400, detail="Uploaded file contains no text")
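+ # Reuse the text endpoint's logic directly; FastAPI's route decorator returns
+ # the original function, so it can be called like a plain function here.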
918
+ return api_ai_check(TextRequest(text=text), user)
919
+
920
+
921
+ # ------------------ HISTORY ------------------
922
+ @app.get("/api/history")
923
+ def api_history(user=Depends(get_current_user)):
924
+ cur.execute(
925
+ "SELECT id, tool, input_text, result_summary, created_at "
926
+ "FROM history WHERE user_id = ? "
927
+ "ORDER BY created_at DESC LIMIT 50",
928
+ (user["id"],),
929
+ )
930
+ rows = cur.fetchall()
931
+ items = []
932
+ for r in rows:
933
+ items.append(
934
+ {
935
+ "id": r["id"],
936
+ "tool": r["tool"],
937
+ "input_text": r["input_text"],
938
+ "summary": r["result_summary"],
939
+ "created_at": r["created_at"],
940
+ }
941
+ )
942
+ return {"items": items}
943
+
944
+
945
+ @app.get("/")
946
+ def read_root():
947
  return {"status": "Backend is running with 16GB RAM!"}