Update textPreprocess.py

textPreprocess.py  +47 -42
@@ -38,6 +38,7 @@ try:
 except Exception as e:
     print(f"Error downloading model from Hugging Face: {e}")
 
+
 # ── 2) Load model & tokenizer ──────────────────────────────────────────────────
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Text prediction device: {device}")
@@ -45,14 +46,13 @@ print(f"Text prediction device: {device}")
 detector = None
 
 try:
-    if Detector
+    if Detector:
         database_path = os.path.join(MODEL_DIR, EMBEDDING_FILE)
 
         if not os.path.exists(MODEL_DIR):
             print(f"Warning: Model directory not found at {MODEL_DIR}")
         if not os.path.exists(database_path):
             print(f"Warning: Embedding file not found at {database_path}")
-            print(f"Please ensure '{EMBEDDING_FILE}' is present in '{TEXT_SUBFOLDER}' of the Hugging Face repo.")
 
         # Initialize DETree Detector
         # This loads the model from MODEL_DIR and the embeddings from database_path
@@ -65,10 +65,7 @@ try:
         )
         print(f"Text classification model (DETree) loaded successfully")
     else:
-
-        print("DETree detector could not be initialized due to missing package.")
-        if not MODEL_DIR:
-            print("DETree detector could not be initialized due to missing model directory.")
+        print("DETree detector could not be initialized due to missing package.")
 
 except Exception as e:
     print(f"Error loading text model: {e}")
@@ -88,37 +85,43 @@ def predict_text(text: str, max_length: int = None):
         dict: Contains predicted_class and confidence
     """
     if detector is None:
-        return {"predicted_class": "Human", "confidence": 0}
+        return {
+            "predicted_class": "Human",
+            "confidence_ai": -100.0,
+            "confidence_human": -100.0
+        }
 
     try:
         # detector.predict expects a list of strings
         predictions = detector.predict([text])
+
+        print(f"DETree prediction output: {predictions}")
+
        if not predictions:
-            return {"predicted_class": "Human", "confidence": 0}
+            return {
+                "predicted_class": "Human",
+                "confidence_ai": -100.0,
+                "confidence_human": -100.0
+            }
 
         pred = predictions[0]
-
-        # Determine label based on higher confidence
-
-        if pred.probability_ai > pred.probability_human:
-            label = "AI"
-            confidence = pred.probability_ai
-        else:
-            label = "Human"
-            confidence = pred.probability_human
-
-        # Confidence logic:
-        # If label is Human, use probability_human
-        # If label is Ai, use probability_ai
-        confidence = pred.probability_human if label == "Human" else pred.probability_ai
+
+        # Determine predicted_class based on higher confidence
+        predicted_class = "AI" if pred.probability_ai > pred.probability_human else "Human"
 
         return {
-            "predicted_class": label,
-            "confidence": confidence
+            "predicted_class": predicted_class,
+            "confidence_ai": float(pred.probability_ai),
+            "confidence_human": float(pred.probability_human)
         }
+
     except Exception as e:
         print(f"Error during text prediction: {e}")
-        return {"predicted_class": "Human", "confidence": 0}
+        return {
+            "predicted_class": "Human",
+            "confidence_ai": -100.0,
+            "confidence_human": -100.0
+        }
 
 # ── 4) Batch prediction ────────────────────────────────────────────────────────
 def predict_batch(texts, batch_size=16):
@@ -128,12 +131,16 @@ def predict_batch(texts, batch_size=16):
     Args:
         texts (list): List of text strings to classify
         batch_size (int): Batch size for processing
 
     Returns:
         list: List of prediction dictionaries
     """
     if detector is None:
-        return [{"predicted_class": "Human", "confidence": 0} for _ in texts]
+        return [{
+            "predicted_class": "Human",
+            "confidence_ai": -100.0,
+            "confidence_human": -100.0
+        } for _ in texts]
 
     # Temporarily update batch size if needed, or just use the detector's default
     # We'll update it to respect the argument
@@ -144,25 +151,23 @@ def predict_batch(texts, batch_size=16):
         predictions = detector.predict(texts)
         results = []
         for text, pred in zip(texts, predictions):
-
-
-            # Determine label based on higher confidence
-            if pred.probability_ai > pred.probability_human:
-                label = "AI"
-                confidence = pred.probability_ai
-            else:
-                label = "Human"
-                confidence = pred.probability_human
-
+            # Determine predicted_class based on higher confidence
+            predicted_class = "AI" if pred.probability_ai > pred.probability_human else "Human"
 
             results.append({
                 "text": text,
-                "predicted_class": label,
-                "confidence": confidence
+                "predicted_class": predicted_class,
+                "confidence_ai": float(pred.probability_ai),
+                "confidence_human": float(pred.probability_human)
             })
         return results
+
     except Exception as e:
         print(f"Error during batch prediction: {e}")
-        return [{"predicted_class": "Human", "confidence": 0} for _ in texts]
+        return [{
+            "predicted_class": "Human",
+            "confidence_ai": 0.0,
+            "confidence_human": 0.0
+        } for _ in texts]
     finally:
-        detector.batch_size = original_batch_size
+        detector.batch_size = original_batch_size
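A note on the detector block above: the `if Detector:` guard assumes the `Detector` import is wrapped so that a missing package leaves the name bound to `None`. That import sits outside this diff; a typical guard looks like the sketch below, where the package path `detree` is a placeholder rather than something the commit shows.

    # Hypothetical import guard -- the real package/module providing
    # Detector is not visible in this diff, so `detree` is a placeholder.
    try:
        from detree import Detector
    except ImportError:
        Detector = None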
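With this change, `predict_text` and `predict_batch` report per-class scores (`confidence_ai`, `confidence_human`) instead of the old single `confidence` field. A minimal usage sketch, assuming this file is importable as a module named `textPreprocess` (taken from the filename in this commit) and the DETree model loaded successfully:

    from textPreprocess import predict_text, predict_batch

    result = predict_text("A sample paragraph to classify.")
    print(result["predicted_class"])    # "AI" or "Human"
    print(result["confidence_ai"])      # float(pred.probability_ai)
    print(result["confidence_human"])   # float(pred.probability_human)

    for item in predict_batch(["first text", "second text"], batch_size=8):
        print(item["text"], item["predicted_class"])

Batch results also carry the original "text" key, so callers can keep predictions aligned with their inputs.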
print(f"Error during text prediction: {e}")
|
| 120 |
+
return {
|
| 121 |
+
"predicted_class": "Human",
|
| 122 |
+
"confidence_ai": -100.0,
|
| 123 |
+
"confidence_human": -100.0
|
| 124 |
+
}
|
| 125 |
|
| 126 |
# ββ 4) Batch prediction ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 127 |
def predict_batch(texts, batch_size=16):
|
|
|
|
| 131 |
Args:
|
| 132 |
texts (list): List of text strings to classify
|
| 133 |
batch_size (int): Batch size for processing
|
| 134 |
+
if detector is None:
|
| 135 |
+
return [{
|
| 136 |
+
"predicted_class": "Human",
|
| 137 |
+
"confidence_ai": -100.0,
|
| 138 |
+
"confidence_human": -100.0
|
| 139 |
+
} for _ in texts]
|
| 140 |
list: List of prediction dictionaries
|
| 141 |
"""
|
| 142 |
if detector is None:
|
| 143 |
+
return [{"predicted_class": "Human", "confidence": 0} for _ in texts]
|
| 144 |
|
| 145 |
# Temporarily update batch size if needed, or just use the detector's default
|
| 146 |
# We'll update it to respect the argument
|
|
|
|
| 151 |
predictions = detector.predict(texts)
|
| 152 |
results = []
|
| 153 |
for text, pred in zip(texts, predictions):
|
| 154 |
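Callers also need to distinguish real predictions from the fallback dictionaries: the single-prediction paths return -100.0 in both confidence fields when the detector is unavailable or prediction fails, while the batch error path returns 0.0. A small guard sketch; the helper name `is_valid_prediction` is mine, not part of the commit:

    def is_valid_prediction(result: dict) -> bool:
        # -100.0 marks the unavailable/failed single-prediction paths;
        # the batch error path fills both fields with 0.0 instead.
        if result["confidence_ai"] < 0.0 or result["confidence_human"] < 0.0:
            return False
        return result["confidence_ai"] + result["confidence_human"] > 0.0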