MAS-AI-0000 committed
Commit b4746b6 · verified · 1 Parent(s): bb0d669

Update textPreprocess.py

Files changed (1):
  1. textPreprocess.py +45 -77
textPreprocess.py CHANGED
@@ -1,123 +1,91 @@
+import os
 import torch
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
-import os
+from huggingface_hub import snapshot_download  # <-- needed to pull the folder
 
-# ── 1) Configuration ────────────────────────────────────────────────────────────
-BASE_DIR = "MAS-AI-0000/Authentica/tree/main"
-MODEL_DIR = os.path.join(BASE_DIR, "Lib/Models/Text")  # Update this path to your model location
+# ── 1) PATHS / VARS ────────────────────────────────────────────────────────────
+REPO_ID = "MAS-AI-0000/Authentica"
+TEXT_SUBFOLDER = "Lib/Models/Text"  # where config.json/model.safetensors live in the repo
+
+# download a local snapshot of just the Text folder and point MODEL_DIR at it
+_snapshot_dir = snapshot_download(
+    repo_id=REPO_ID,
+    allow_patterns=[f"{TEXT_SUBFOLDER}/*"]
+)
+MODEL_DIR = os.path.join(_snapshot_dir, TEXT_SUBFOLDER)
+
+# individual file paths (in case you need them elsewhere)
+CONFIG_PATH = os.path.join(MODEL_DIR, "config.json")
+MODEL_SAFETENSORS_PATH = os.path.join(MODEL_DIR, "model.safetensors")
+TOKENIZER_JSON_PATH = os.path.join(MODEL_DIR, "tokenizer.json")
+TOKENIZER_CONFIG_PATH = os.path.join(MODEL_DIR, "tokenizer_config.json")
+SPECIAL_TOKENS_MAP_PATH = os.path.join(MODEL_DIR, "special_tokens_map.json")
+TRAINING_ARGS_BIN_PATH = os.path.join(MODEL_DIR, "training_args.bin")  # optional
+TEXT_TXT_PATH = os.path.join(MODEL_DIR, "text.txt")  # optional
 
 MAX_LEN = 512
 
 # ── 2) Load model & tokenizer ──────────────────────────────────────────────────
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Text prediction device: {device}")
 
-# Global variables for model and tokenizer
 tokenizer = None
 model = None
 ID2LABEL = {0: "human", 1: "ai"}
 
 try:
-    # Config carries id2label/label2id if you saved them
+    # load directly from the local MODEL_DIR
     config = AutoConfig.from_pretrained(MODEL_DIR)
-
-    # Loads tokenizer.json + special_tokens_map.json automatically
     tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
-
-    # Loads model.safetensors automatically (no extra flags needed)
     model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR, config=config)
     model.eval().to(device)
-
-    # Update label mapping from config if available
-    ID2LABEL = model.config.id2label if getattr(model.config, "id2label", None) else {0: "human", 1: "ai"}
-
-    print(f"Text classification model loaded successfully")
+
+    # override labels from config if present
+    if getattr(model.config, "id2label", None):
+        ID2LABEL = {int(k): v for k, v in model.config.id2label.items()}
+
+    print("Text classification model loaded successfully")
+    print("MODEL_DIR:", MODEL_DIR)
     print("Labels:", ID2LABEL)
 except Exception as e:
     print(f"Error loading text model: {e}")
     print("Text prediction will return fallback responses")
 
-# ── 3) Inference function ──────────────────────────────────────────────────────
+# ── 3) Inference ───────────────────────────────────────────────────────────────
 @torch.inference_mode()
-def predict_text(text: str, max_length: int = None):
-    """
-    Predict whether the given text is human-written or AI-generated.
-
-    Args:
-        text (str): The text to classify
-        max_length (int): Maximum sequence length for tokenization (defaults to MAX_LEN)
-
-    Returns:
-        dict: Contains predicted_class and confidence
-    """
+def predict_text(text: str, max_length: int | None = None):
     if model is None or tokenizer is None:
-        return {"predicted_class": "Human", "confidence": 0}
+        return {"predicted_class": "Human", "confidence": 0.0}
 
     if max_length is None:
         max_length = MAX_LEN
 
     try:
-        # Tokenize input
-        enc = tokenizer(
-            text,
-            return_tensors="pt",
-            truncation=True,
-            max_length=max_length,
-        )
+        enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
        enc = {k: v.to(device) for k, v in enc.items()}
-
-        # Get predictions
         logits = model(**enc).logits
         probs = torch.softmax(logits, dim=-1).squeeze(0).detach().cpu().numpy()
         pred_id = int(probs.argmax(-1))
-
-        # Get label (capitalize first letter for consistency)
-        label = ID2LABEL.get(pred_id, str(pred_id))
-        label = label.capitalize()  # "human" -> "Human", "ai" -> "Ai"
-
-        return {
-            "predicted_class": label,
-            "confidence": float(probs[pred_id])
-        }
+        label = ID2LABEL.get(pred_id, str(pred_id)).capitalize()
+        return {"predicted_class": label, "confidence": float(probs[pred_id])}
     except Exception as e:
         print(f"Error during text prediction: {e}")
-        return {"predicted_class": "Human", "confidence": 0}
+        return {"predicted_class": "Human", "confidence": 0.0}
 
-# ── 4) Batch prediction (optional, for future use) ─────────────────────────────
+# ── 4) Batch (optional) ────────────────────────────────────────────────────────
 @torch.inference_mode()
 def predict_batch(texts, batch_size=16):
-    """
-    Predict multiple texts in batches.
-
-    Args:
-        texts (list): List of text strings to classify
-        batch_size (int): Batch size for processing
-
-    Returns:
-        list: List of prediction dictionaries
-    """
     if model is None or tokenizer is None:
-        return [{"predicted_class": "Human", "confidence": 0} for _ in texts]
+        return [{"predicted_class": "Human", "confidence": 0.0} for _ in texts]
 
     results = []
     for i in range(0, len(texts), batch_size):
         chunk = texts[i:i+batch_size]
-        enc = tokenizer(
-            chunk,
-            return_tensors="pt",
-            truncation=True,
-            max_length=MAX_LEN,
-            padding=True,
-        )
+        enc = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=MAX_LEN, padding=True)
         enc = {k: v.to(device) for k, v in enc.items()}
-        logits = model(**enc).logits
-        probs = torch.softmax(logits, dim=-1).detach().cpu().numpy()
+        probs = torch.softmax(model(**enc).logits, dim=-1).detach().cpu().numpy()
         ids = probs.argmax(-1)
-
         for t, pid, p in zip(chunk, ids, probs):
             label = ID2LABEL.get(int(pid), str(int(pid))).capitalize()
-            results.append({
-                "text": t,
-                "predicted_class": label,
-                "confidence": float(p[int(pid)])
-            })
+            results.append({"text": t, "predicted_class": label, "confidence": float(p[int(pid)])})
     return results
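
A note on the new loading path: `snapshot_download` now runs at import time, so the first import blocks while the `Lib/Models/Text` folder is fetched; later runs reuse the local Hugging Face cache. Below is a minimal sanity-check sketch for the downloaded snapshot; it assumes `textPreprocess.py` is importable as a module, and the check itself is illustrative rather than part of the commit:

import os

# importing textPreprocess triggers the snapshot download and the model load
from textPreprocess import CONFIG_PATH, MODEL_SAFETENSORS_PATH, TOKENIZER_JSON_PATH, MODEL_DIR

# illustrative check: confirm the core files landed where MODEL_DIR points
required = (CONFIG_PATH, MODEL_SAFETENSORS_PATH, TOKENIZER_JSON_PATH)
missing = [p for p in required if not os.path.isfile(p)]
if missing:
    print("Snapshot incomplete, missing:", missing)
else:
    print("Snapshot OK at", MODEL_DIR)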
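
And a short usage sketch of the two public functions, again assuming the module is importable as `textPreprocess`; the sample strings are invented, but the return shapes match the code above:

from textPreprocess import predict_text, predict_batch

# single text -> {"predicted_class": "Human" | "Ai", "confidence": float}
print(predict_text("The quick brown fox jumps over the lazy dog."))

# batch -> one dict per input, each echoing the original text
for r in predict_batch(["First sample text.", "Second sample text."], batch_size=8):
    print(r["predicted_class"], round(r["confidence"], 3))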