MAS-AI-0000 committed
Commit 8eaaeee · verified · 1 Parent(s): 2e29cb0

Update textPreprocess.py

Files changed (1): textPreprocess.py (+130 -66)
textPreprocess.py CHANGED
@@ -1,93 +1,157 @@
- import os
import torch
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
- from huggingface_hub import snapshot_download # <-- needed to pull the folder
+ import os
+ import sys
+ from pathlib import Path
+ from huggingface_hub import snapshot_download
+
+ # Ensure local detree package is importable
+ # This allows the script to find the 'detree' package if it sits in the same directory
+ current_dir = os.path.dirname(os.path.abspath(__file__))
+ if current_dir not in sys.path:
+     sys.path.append(current_dir)
+
+ try:
+     from detree.inference import Detector
+ except ImportError:
+     # Fallback if detree is not found (e.g. during initial setup check)
+     print("Warning: 'detree' package not found. Please ensure the 'detree' folder is in the same directory.")
+     Detector = None

- # ── 1) PATHS / VARS ────────────────────────────────────────────────────────────
+ # ── 1) Configuration ────────────────────────────────────────────────────────────
REPO_ID = "MAS-AI-0000/Authentica"
TEXT_SUBFOLDER = "Lib/Models/Text" # where config.json/model.safetensors live in the repo
+ EMBEDDING_FILE = "priori1_center10k.pt"
+ MAX_LEN = 512

- # download a local snapshot of just the Text folder and point MODEL_DIR at it
- _snapshot_dir = snapshot_download(
-     repo_id=REPO_ID,
-     allow_patterns=[f"{TEXT_SUBFOLDER}/*"]
- )
- MODEL_DIR = os.path.join(_snapshot_dir, TEXT_SUBFOLDER)
-
- # individual file paths (in case you need them elsewhere)
- CONFIG_PATH = os.path.join(MODEL_DIR, "config.json")
- MODEL_SAFETENSORS_PATH = os.path.join(MODEL_DIR, "model.safetensors")
- TOKENIZER_JSON_PATH = os.path.join(MODEL_DIR, "tokenizer.json")
- TOKENIZER_CONFIG_PATH = os.path.join(MODEL_DIR, "tokenizer_config.json")
- SPECIAL_TOKENS_MAP_PATH = os.path.join(MODEL_DIR, "special_tokens_map.json")
- TRAINING_ARGS_BIN_PATH = os.path.join(MODEL_DIR, "training_args.bin") # optional
- TEXT_TXT_PATH = os.path.join(MODEL_DIR, "text.txt") # optional
+ MODEL_DIR = None

- MAX_LEN = 512
+ try:
+     # download a local snapshot of just the Text folder and point MODEL_DIR at it
+     print(f"Downloading/Checking model from {REPO_ID}...")
+     _snapshot_dir = snapshot_download(
+         repo_id=REPO_ID,
+         allow_patterns=[f"{TEXT_SUBFOLDER}/*"]
+     )
+     MODEL_DIR = os.path.join(_snapshot_dir, TEXT_SUBFOLDER)
+     print(f"Model directory set to: {MODEL_DIR}")
+ except Exception as e:
+     print(f"Error downloading model from Hugging Face: {e}")

# ── 2) Load model & tokenizer ──────────────────────────────────────────────────
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Text prediction device: {device}")

- tokenizer = None
- model = None
- ID2LABEL = {0: "human", 1: "ai"}
+ detector = None

try:
-     # load directly from the local MODEL_DIR
-     config = AutoConfig.from_pretrained(MODEL_DIR)
-     tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
-     model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR, config=config)
-     model.eval().to(device)
+     if Detector and MODEL_DIR:
+         database_path = os.path.join(MODEL_DIR, EMBEDDING_FILE)
+
+         if not os.path.exists(MODEL_DIR):
+             print(f"Warning: Model directory not found at {MODEL_DIR}")
+         if not os.path.exists(database_path):
+             print(f"Warning: Embedding file not found at {database_path}")
+             print(f"Please ensure '{EMBEDDING_FILE}' is present in '{TEXT_SUBFOLDER}' of the Hugging Face repo.")

-     # override labels from config if present
-     if getattr(model.config, "id2label", None):
-         ID2LABEL = {int(k): v for k, v in model.config.id2label.items()}
+         # Initialize DETree Detector
+         # This loads the model from MODEL_DIR and the embeddings from database_path
+         detector = Detector(
+             database_path=database_path,
+             model_name_or_path=MODEL_DIR,
+             device=device,
+             max_length=MAX_LEN,
+             pooling="max" # Default pooling
+         )
+         print(f"Text classification model (DETree) loaded successfully")
+     else:
+         if not Detector:
+             print("DETree detector could not be initialized due to missing package.")
+         if not MODEL_DIR:
+             print("DETree detector could not be initialized due to missing model directory.")

-     print("Text classification model loaded successfully")
-     print("MODEL_DIR:", MODEL_DIR)
-     print("Labels:", ID2LABEL)
except Exception as e:
    print(f"Error loading text model: {e}")
    print("Text prediction will return fallback responses")

- # ── 3) Inference ───────────────────────────────────────────────────────────────
- @torch.inference_mode()
- def predict_text(text: str, max_length: int | None = None):
-     if model is None or tokenizer is None:
-         print("Issue 1")
+ # ── 3) Inference function ──────────────────────────────────────────────────────
+ def predict_text(text: str, max_length: int = None):
+     """
+     Predict whether the given text is human-written or AI-generated using DETree.
+
+     Args:
+         text (str): The text to classify
+         max_length (int): Ignored in this implementation as DETree handles it globally,
+                           but kept for compatibility.
+
+     Returns:
+         dict: Contains predicted_class and confidence
+     """
+     if detector is None:
        return {"predicted_class": "Human", "confidence": -100.0}
-
-     if max_length is None:
-         max_length = MAX_LEN
-
+
    try:
-         enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
-         enc = {k: v.to(device) for k, v in enc.items()}
-         logits = model(**enc).logits
-         probs = torch.softmax(logits, dim=-1).squeeze(0).detach().cpu().numpy()
-         pred_id = int(probs.argmax(-1))
-         label = ID2LABEL.get(pred_id, str(pred_id)).capitalize()
-         return {"predicted_class": label, "confidence": float(probs[pred_id])}
+         # detector.predict expects a list of strings
+         predictions = detector.predict([text])
+         if not predictions:
+             return {"predicted_class": "Human", "confidence": -100.0}
+
+         pred = predictions[0]
+         # pred.label is "Human" or "AI"
+         # Map to "Human" or "Ai" to match previous API
+         label = pred.label
+         if label == "AI":
+             label = "Ai"
+
+         # Confidence logic:
+         # If label is Human, use probability_human
+         # If label is Ai, use probability_ai
+         confidence = pred.probability_human if label == "Human" else pred.probability_ai
+
+         return {
+             "predicted_class": label,
+             "confidence": float(confidence)
+         }
    except Exception as e:
        print(f"Error during text prediction: {e}")
        return {"predicted_class": "Human", "confidence": -100.0}

- # ── 4) Batch (optional) ────────────────────────────────────────────────────────
- @torch.inference_mode()
+ # ── 4) Batch prediction ────────────────────────────────────────────────────────
def predict_batch(texts, batch_size=16):
-     if model is None or tokenizer is None:
-         print("Issue 2")
+     """
+     Predict multiple texts in batches.
+
+     Args:
+         texts (list): List of text strings to classify
+         batch_size (int): Batch size for processing
+
+     Returns:
+         list: List of prediction dictionaries
+     """
+     if detector is None:
        return [{"predicted_class": "Human", "confidence": -100.0} for _ in texts]
-
-     results = []
-     for i in range(0, len(texts), batch_size):
-         chunk = texts[i:i+batch_size]
-         enc = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=MAX_LEN, padding=True)
-         enc = {k: v.to(device) for k, v in enc.items()}
-         probs = torch.softmax(model(**enc).logits, dim=-1).detach().cpu().numpy()
-         ids = probs.argmax(-1)
-         for t, pid, p in zip(chunk, ids, probs):
-             label = ID2LABEL.get(int(pid), str(int(pid))).capitalize()
-             results.append({"text": t, "predicted_class": label, "confidence": float(p[int(pid)])})
-     return results
+
+     # Temporarily update batch size if needed, or just use the detector's default
+     # We'll update it to respect the argument
+     original_batch_size = detector.batch_size
+     detector.batch_size = batch_size
+
+     try:
+         predictions = detector.predict(texts)
+         results = []
+         for text, pred in zip(texts, predictions):
+             label = pred.label
+             if label == "AI":
+                 label = "Ai"
+             confidence = pred.probability_human if label == "Human" else pred.probability_ai
+
+             results.append({
+                 "text": text,
+                 "predicted_class": label,
+                 "confidence": float(confidence)
+             })
+         return results
+     except Exception as e:
+         print(f"Error during batch prediction: {e}")
+         return [{"predicted_class": "Human", "confidence": -100.0} for _ in texts]
+     finally:
+         detector.batch_size = original_batch_size
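
For reference, a minimal usage sketch of the updated module follows; it is not part of the commit. It assumes the file is importable as textPreprocess, that the detree package and the Hugging Face snapshot are available at import time, and that predict_text / predict_batch return {"predicted_class": "Human" or "Ai", "confidence": float} with -100.0 marking the fallback path, as in the diff above. The sample strings are made up.

# Hypothetical usage sketch (assumes textPreprocess.py sits next to the 'detree'
# package and that the model snapshot downloads successfully on import).
import textPreprocess

# Single text: returns {"predicted_class": "Human" | "Ai", "confidence": float}
single = textPreprocess.predict_text("The quick brown fox jumps over the lazy dog.")
print(single)

# Batch of texts: each result also carries the original text
batch = textPreprocess.predict_batch(
    ["First sample text.", "Second sample text."],
    batch_size=8,
)
for item in batch:
    print(item["predicted_class"], round(item["confidence"], 3), item["text"][:40])

# A confidence of -100.0 indicates the detector failed to load or predict
if single["confidence"] == -100.0:
    print("Detector unavailable; fallback response returned.")

Note that predict_batch temporarily overrides detector.batch_size with the batch_size argument and restores the original value in a finally block, so a per-call batch size does not leak into later calls.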