Update textPreprocess.py

textPreprocess.py  +45 -77  CHANGED
@@ -1,123 +1,91 @@

Before (lines marked "-" were removed):

 import torch
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
-import os

-# ── 1) Configuration ─────────────────────────────────────────────────────────
-BASE_DIR = "MAS-AI-0000/Authentica/tree/main"
-MODEL_DIR = os.path.join(BASE_DIR, "Lib/Models/Text")  # Update this path to your model location
 MAX_LEN = 512

 # ── 2) Load model & tokenizer ────────────────────────────────────────────────
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Text prediction device: {device}")

-# Global variables for model and tokenizer
 tokenizer = None
 model = None
 ID2LABEL = {0: "human", 1: "ai"}

 try:
-    #
     config = AutoConfig.from_pretrained(MODEL_DIR)
-
-    # Loads tokenizer.json + special_tokens_map.json automatically
     tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
-
-    # Loads model.safetensors automatically (no extra flags needed)
     model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR, config=config)
     model.eval().to(device)
-
-    #
-
-
-
     print("Labels:", ID2LABEL)
 except Exception as e:
     print(f"Error loading text model: {e}")
     print("Text prediction will return fallback responses")

-# ── 3) Inference
 @torch.inference_mode()
-def predict_text(text: str, max_length: int = None):
-    """
-    Predict whether the given text is human-written or AI-generated.
-
-    Args:
-        text (str): The text to classify
-        max_length (int): Maximum sequence length for tokenization (defaults to MAX_LEN)
-
-    Returns:
-        dict: Contains predicted_class and confidence
-    """
     if model is None or tokenizer is None:
-        return {"predicted_class": "Human", "confidence": 0}
-
     if max_length is None:
         max_length = MAX_LEN
-
     try:
-
-        enc = tokenizer(
-            text,
-            return_tensors="pt",
-            truncation=True,
-            max_length=max_length,
-        )
         enc = {k: v.to(device) for k, v in enc.items()}
-
-        # Get predictions
         logits = model(**enc).logits
         probs = torch.softmax(logits, dim=-1).squeeze(0).detach().cpu().numpy()
         pred_id = int(probs.argmax(-1))
-
-
-        label = ID2LABEL.get(pred_id, str(pred_id))
-        label = label.capitalize()  # "human" -> "Human", "ai" -> "Ai"
-
-        return {
-            "predicted_class": label,
-            "confidence": float(probs[pred_id])
-        }
     except Exception as e:
         print(f"Error during text prediction: {e}")
-        return {"predicted_class": "Human", "confidence": 0}

-# ── 4) Batch
 @torch.inference_mode()
 def predict_batch(texts, batch_size=16):
-    """
-    Predict multiple texts in batches.
-
-    Args:
-        texts (list): List of text strings to classify
-        batch_size (int): Batch size for processing
-
-    Returns:
-        list: List of prediction dictionaries
-    """
     if model is None or tokenizer is None:
-        return [{"predicted_class": "Human", "confidence": 0} for _ in texts]
-
     results = []
     for i in range(0, len(texts), batch_size):
         chunk = texts[i:i+batch_size]
-        enc = tokenizer(
-            chunk,
-            return_tensors="pt",
-            truncation=True,
-            max_length=MAX_LEN,
-            padding=True,
-        )
         enc = {k: v.to(device) for k, v in enc.items()}
-
-        probs = torch.softmax(logits, dim=-1).detach().cpu().numpy()
         ids = probs.argmax(-1)
-
         for t, pid, p in zip(chunk, ids, probs):
             label = ID2LABEL.get(int(pid), str(int(pid))).capitalize()
-            results.append({
-                "text": t,
-                "predicted_class": label,
-                "confidence": float(p[int(pid)])
-            })
     return results

After (lines marked "+" were added):

+import os
 import torch
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
+from huggingface_hub import snapshot_download  # <-- needed to pull the folder
+
+# ── 1) PATHS / VARS ──────────────────────────────────────────────────────────
+REPO_ID = "MAS-AI-0000/Authentica"
+TEXT_SUBFOLDER = "Lib/Models/Text"  # where config.json/model.safetensors live in the repo
+
+# download a local snapshot of just the Text folder and point MODEL_DIR at it
+_snapshot_dir = snapshot_download(
+    repo_id=REPO_ID,
+    allow_patterns=[f"{TEXT_SUBFOLDER}/*"]
+)
+MODEL_DIR = os.path.join(_snapshot_dir, TEXT_SUBFOLDER)
+
+# individual file paths (in case you need them elsewhere)
+CONFIG_PATH = os.path.join(MODEL_DIR, "config.json")
+MODEL_SAFETENSORS_PATH = os.path.join(MODEL_DIR, "model.safetensors")
+TOKENIZER_JSON_PATH = os.path.join(MODEL_DIR, "tokenizer.json")
+TOKENIZER_CONFIG_PATH = os.path.join(MODEL_DIR, "tokenizer_config.json")
+SPECIAL_TOKENS_MAP_PATH = os.path.join(MODEL_DIR, "special_tokens_map.json")
+TRAINING_ARGS_BIN_PATH = os.path.join(MODEL_DIR, "training_args.bin")  # optional
+TEXT_TXT_PATH = os.path.join(MODEL_DIR, "text.txt")  # optional

 MAX_LEN = 512
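
Note on this hunk: snapshot_download(..., allow_patterns=[...]) fetches only the matching files into the local Hub cache and returns the snapshot root, so joining TEXT_SUBFOLDER onto it reproduces the repo layout. A minimal sanity-check sketch using the constants above (a sketch, not part of the commit):

    import os

    # Confirm the snapshot contains the files the loader below will need.
    for path in (CONFIG_PATH, MODEL_SAFETENSORS_PATH, TOKENIZER_JSON_PATH):
        status = "ok" if os.path.isfile(path) else "MISSING"
        print(f"{status:7} {path}")
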
 # ── 2) Load model & tokenizer ────────────────────────────────────────────────
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Text prediction device: {device}")

 tokenizer = None
 model = None
 ID2LABEL = {0: "human", 1: "ai"}

 try:
+    # load directly from the local MODEL_DIR
     config = AutoConfig.from_pretrained(MODEL_DIR)
     tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
     model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR, config=config)
     model.eval().to(device)
+
+    # override labels from config if present
+    if getattr(model.config, "id2label", None):
+        ID2LABEL = {int(k): v for k, v in model.config.id2label.items()}
+
+    print("Text classification model loaded successfully")
+    print("MODEL_DIR:", MODEL_DIR)
     print("Labels:", ID2LABEL)
 except Exception as e:
     print(f"Error loading text model: {e}")
     print("Text prediction will return fallback responses")
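
One subtlety in the id2label override: in a saved config.json the id2label keys are JSON strings. Recent transformers versions normalize them back to ints on load, but the explicit int(k) cast keeps integer lookups like ID2LABEL.get(pred_id) safe either way. A standalone illustration (not from the commit):

    import json

    # A JSON round-trip, as in config.json, turns integer keys into strings.
    id2label = json.loads(json.dumps({0: "human", 1: "ai"}))
    print(id2label)                                  # {'0': 'human', '1': 'ai'}
    print({int(k): v for k, v in id2label.items()})  # {0: 'human', 1: 'ai'}
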
+
# ββ 3) Inference βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 55 |
@torch.inference_mode()
|
| 56 |
+
def predict_text(text: str, max_length: int | None = None):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
if model is None or tokenizer is None:
|
| 58 |
+
return {"predicted_class": "Human", "confidence": 0.0}
|
| 59 |
+
|
| 60 |
if max_length is None:
|
| 61 |
max_length = MAX_LEN
|
| 62 |
+
|
| 63 |
try:
|
| 64 |
+
enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
enc = {k: v.to(device) for k, v in enc.items()}
|
|
|
|
|
|
|
| 66 |
logits = model(**enc).logits
|
| 67 |
probs = torch.softmax(logits, dim=-1).squeeze(0).detach().cpu().numpy()
|
| 68 |
pred_id = int(probs.argmax(-1))
|
| 69 |
+
label = ID2LABEL.get(pred_id, str(pred_id)).capitalize()
|
| 70 |
+
return {"predicted_class": label, "confidence": float(probs[pred_id])}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
except Exception as e:
|
| 72 |
print(f"Error during text prediction: {e}")
|
| 73 |
+
return {"predicted_class": "Human", "confidence": 0.0}
|
| 74 |
|
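
Usage sketch for predict_text, assuming this file is importable as textPreprocess (the module name is an assumption, not shown in the commit). Note that .capitalize() renders the "ai" label as "Ai":

    from textPreprocess import predict_text

    result = predict_text("The mitochondria is the powerhouse of the cell.")
    print(result["predicted_class"], round(result["confidence"], 3))
    # e.g. Human 0.874 or Ai 0.912, depending on the model
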
+# ── 4) Batch (optional) ──────────────────────────────────────────────────────
 @torch.inference_mode()
 def predict_batch(texts, batch_size=16):
     if model is None or tokenizer is None:
+        return [{"predicted_class": "Human", "confidence": 0.0} for _ in texts]
+
     results = []
     for i in range(0, len(texts), batch_size):
         chunk = texts[i:i+batch_size]
+        enc = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=MAX_LEN, padding=True)
         enc = {k: v.to(device) for k, v in enc.items()}
+        probs = torch.softmax(model(**enc).logits, dim=-1).detach().cpu().numpy()
         ids = probs.argmax(-1)
         for t, pid, p in zip(chunk, ids, probs):
             label = ID2LABEL.get(int(pid), str(int(pid))).capitalize()
+            results.append({"text": t, "predicted_class": label, "confidence": float(p[int(pid)])})
     return results
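
This hunk also fixes a real bug: the removed batch code read logits without ever assigning it, which would raise NameError on the first chunk (predict_batch has no try/except to catch it); the new inline model(**enc).logits computes the probabilities directly. A batch usage sketch under the same module-name assumption as above:

    from textPreprocess import predict_batch

    texts = [
        "I wandered lonely as a cloud.",
        "As an AI language model, I can certainly help with that.",
    ]
    for r in predict_batch(texts, batch_size=2):
        print(f'{r["predicted_class"]:>5} {r["confidence"]:.3f}  {r["text"][:40]}')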