MAS-AI-0000 committed
Commit 677a506 · verified
1 Parent(s): bbbfa2c

Update textPreprocess.py

Files changed (1)
  1. textPreprocess.py +47 -42
textPreprocess.py CHANGED
@@ -38,6 +38,7 @@ try:
 except Exception as e:
     print(f"Error downloading model from Hugging Face: {e}")
 
+
 # ── 2) Load model & tokenizer ──────────────────────────────────────────────────
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Text prediction device: {device}")
@@ -45,14 +46,13 @@ print(f"Text prediction device: {device}")
 detector = None
 
 try:
-    if Detector and MODEL_DIR:
+    if Detector:
         database_path = os.path.join(MODEL_DIR, EMBEDDING_FILE)
 
         if not os.path.exists(MODEL_DIR):
             print(f"Warning: Model directory not found at {MODEL_DIR}")
         if not os.path.exists(database_path):
             print(f"Warning: Embedding file not found at {database_path}")
-            print(f"Please ensure '{EMBEDDING_FILE}' is present in '{TEXT_SUBFOLDER}' of the Hugging Face repo.")
 
         # Initialize DETree Detector
         # This loads the model from MODEL_DIR and the embeddings from database_path
@@ -65,10 +65,7 @@ try:
         )
         print(f"Text classification model (DETree) loaded successfully")
     else:
-        if not Detector:
-            print("DETree detector could not be initialized due to missing package.")
-        if not MODEL_DIR:
-            print("DETree detector could not be initialized due to missing model directory.")
+        print("DETree detector could not be initialized due to missing package.")
 
 except Exception as e:
     print(f"Error loading text model: {e}")
@@ -88,37 +85,43 @@ def predict_text(text: str, max_length: int = None):
         dict: Contains predicted_class and confidence
     """
     if detector is None:
-        return {"predicted_class": "Human", "confidence": -100.0}
+        return {
+            "predicted_class": "Human",
+            "confidence_ai": -100.0,
+            "confidence_human": -100.0
+        }
 
     try:
         # detector.predict expects a list of strings
         predictions = detector.predict([text])
+
+        print(f"DETree prediction output: {predictions}")
+
         if not predictions:
-            return {"predicted_class": "Human", "confidence": -100.0}
+            return {
+                "predicted_class": "Human",
+                "confidence_ai": -100.0,
+                "confidence_human": -100.0
+            }
 
         pred = predictions[0]
-        # pred.label is "Human" or "AI"
-        # Map to "Human" or "Ai" to match previous API
-        # Determine label based on higher confidence
-        if pred.probability_ai > pred.probability_human:
-            label = "AI"
-            confidence = pred.probability_ai
-        else:
-            label = "Human"
-            confidence = pred.probability_human
-
-        # Confidence logic:
-        # If label is Human, use probability_human
-        # If label is Ai, use probability_ai
-        confidence = pred.probability_human if label == "Human" else pred.probability_ai
+
+        # Determine predicted_class based on higher confidence
+        predicted_class = "AI" if pred.probability_ai > pred.probability_human else "Human"
 
         return {
-            "predicted_class": label,
-            "confidence": float(confidence)
+            "predicted_class": predicted_class,
+            "confidence_ai": float(pred.probability_ai),
+            "confidence_human": float(pred.probability_human)
        }
+
     except Exception as e:
         print(f"Error during text prediction: {e}")
-        return {"predicted_class": "Human", "confidence": -100.0}
+        return {
+            "predicted_class": "Human",
+            "confidence_ai": -100.0,
+            "confidence_human": -100.0
+        }
 
 # ── 4) Batch prediction ────────────────────────────────────────────────────────
 def predict_batch(texts, batch_size=16):
@@ -128,12 +131,16 @@ def predict_batch(texts, batch_size=16):
     Args:
         texts (list): List of text strings to classify
         batch_size (int): Batch size for processing
-
-    Returns:
+    if detector is None:
+        return [{
+            "predicted_class": "Human",
+            "confidence_ai": -100.0,
+            "confidence_human": -100.0
+        } for _ in texts]
         list: List of prediction dictionaries
     """
     if detector is None:
-        return [{"predicted_class": "Human", "confidence": -100.0} for _ in texts]
+        return [{"predicted_class": "Human", "confidence": 0} for _ in texts]
 
     # Temporarily update batch size if needed, or just use the detector's default
     # We'll update it to respect the argument
@@ -144,25 +151,23 @@ def predict_batch(texts, batch_size=16):
         predictions = detector.predict(texts)
         results = []
         for text, pred in zip(texts, predictions):
-            label = pred.label
-
-            # Determine label based on higher confidence
-            if pred.probability_ai > pred.probability_human:
-                label = "AI"
-                confidence = pred.probability_ai
-            else:
-                label = "Human"
-                confidence = pred.probability_human
-
+            # Determine predicted_class based on higher confidence
+            predicted_class = "AI" if pred.probability_ai > pred.probability_human else "Human"
 
             results.append({
                 "text": text,
-                "predicted_class": label,
-                "confidence": float(confidence)
+                "predicted_class": predicted_class,
+                "confidence_ai": float(pred.probability_ai),
+                "confidence_human": float(pred.probability_human)
             })
         return results
+
     except Exception as e:
         print(f"Error during batch prediction: {e}")
-        return [{"predicted_class": "Human", "confidence": -100.0} for _ in texts]
+        return [{
+            "predicted_class": "Human",
+            "confidence_ai": 0.0,
+            "confidence_human": 0.0
+        } for _ in texts]
     finally:
-        detector.batch_size = original_batch_size
+        detector.batch_size = original_batch_size
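
The net effect of the diff is a new response schema: both predict_text and predict_batch now report confidence_ai and confidence_human separately instead of a single confidence value. A minimal caller sketch, assuming the module is importable as textPreprocess; the sample texts and printing logic are illustrative, not part of the commit:

# Hypothetical usage sketch for the new response schema.
# Assumes textPreprocess is importable; sample inputs are illustrative.
from textPreprocess import predict_text, predict_batch

result = predict_text("An example passage to classify.")
# predict_text returns -100.0 in both confidence fields when the detector
# failed to load, prediction raised, or no prediction came back.
if result["confidence_ai"] == -100.0 and result["confidence_human"] == -100.0:
    print("Detector unavailable or prediction failed; result is a fallback.")
else:
    print(f"{result['predicted_class']}: "
          f"AI={result['confidence_ai']:.3f}, "
          f"Human={result['confidence_human']:.3f}")

# predict_batch echoes each input text. Note that its detector-is-None
# fallback still returns the old single "confidence" key, so .get() is
# used here to avoid a KeyError on that path.
for row in predict_batch(["first sample text", "second sample text"], batch_size=8):
    print(row["text"][:40], row["predicted_class"], row.get("confidence_ai"))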