Spaces:

xxmaranxx
/

clustering-test

Sleeping

App Files Files Community

xxmaranxx committed on Nov 2

Commit

8f028d3

verified ·

1 Parent(s): 233841c

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -10

app.py CHANGED Viewed

@@ -1,8 +1,18 @@
-import pickle, numpy as np
 from fastapi import FastAPI
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline
 lw = pickle.load(open("predictor.pkl", "rb"))  # NOTE(review): the file handle is never closed; only unpickle trusted artifacts
 sbert = SentenceTransformer(lw["model_name"])  # encoder name comes from the pickled artifact
@@ -13,34 +23,59 @@ cids = sorted(centroides.keys())
 meta = lw.get("meta", {})  # optional per-cluster metadata; empty dict when the artifact has none
-sentiment = pipeline("text-classification", model="UMUTeam/roberta-spanish-sentiment-analysis")  # sentiment classifier (pre-commit form, default pipeline arguments)
-emotion = pipeline("zero-shot-classification", model="MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7")  # zero-shot emotion classifier; this line is removed by the commit
 EMOTIONS = ["alegría","tristeza","ira","asco","miedo","sorpresa","neutral"]
 app = FastAPI()
-def _encode(text):  # Embed `text` with the module-level `sbert` encoder.
     emb = sbert.encode(text, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)  # normalized embedding, cast to float32
     return emb[None, :] if emb.ndim == 1 else emb  # a 1-D vector gains a leading axis -> shape (1, dim); other shapes pass through unchanged
-def _assign(vec):  # Return the id from `cids` whose centroid is closest to `vec`.
     dists = [np.linalg.norm(vec - centroides[c]) for c in cids]  # one np.linalg.norm distance per centroid, in `cids` order
     return cids[int(np.argmin(dists))]  # argmin returns the first index on ties, so the earliest id in `cids` wins
 @app.post("/predict")
 def predict(payload: dict):
     item = payload.get("data", [{}])[0]
-    text = f"{item.get('subject','')} — {item.get('body','')}"
     emb = _encode(text)[0]
     cid = _assign(emb)
-    s = sentiment(text)[0]["label"]
-    e = emotion(text, candidate_labels=EMOTIONS, hypothesis_template="El texto expresa {}.")["labels"][0]
     m = meta.get(str(cid), meta.get(cid, {}))
     return {
-        "subject": item.get("subject",""),
-        "body": item.get("body",""),
         "cluster": cid,
         "cluster_nombre": (m or {}).get("nombre"),
         "cluster_desc": (m or {}).get("descripcion"),

+import os, pickle, numpy as np
 from fastapi import FastAPI
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline
+# ---- Performance flags ----
+os.environ["TOKENIZERS_PARALLELISM"] = "false"  # turn off parallelism in the Hugging Face tokenizers library
+try:
+    import torch
+    torch.set_num_threads(1)  # avoid thread thrashing on a basic CPU instance
+except Exception:
+    pass  # best-effort: the thread cap is simply skipped if torch is missing or the call fails
+# ---- Load artifacts once ----
 lw = pickle.load(open("predictor.pkl", "rb"))  # NOTE(review): the file handle is never closed; only unpickle trusted artifacts
 sbert = SentenceTransformer(lw["model_name"])
 meta = lw.get("meta", {})
+# Sentiment classifier on CPU (device=-1); predict() truncates its input to 256 tokens at call time
+sentiment = pipeline(
+    "text-classification",
+    model="UMUTeam/roberta-spanish-sentiment-analysis",
+    device=-1
+)
 EMOTIONS = ["alegría","tristeza","ira","asco","miedo","sorpresa","neutral"]
+HYP = "El texto expresa {}."  # Spanish hypothesis template: "The text expresses {}."
+# Precompute embeddings of the emotion hypotheses with the same SBERT encoder used for input text
+_emotion_texts = [HYP.format(e) for e in EMOTIONS]
+_emotion_embs = sbert.encode(_emotion_texts, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
 app = FastAPI()
+def _encode(text: str) -> np.ndarray:  # Embed `text` with the module-level `sbert` encoder.
     emb = sbert.encode(text, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)  # normalized embedding, cast to float32
     return emb[None, :] if emb.ndim == 1 else emb  # a 1-D vector gains a leading axis -> shape (1, dim); other shapes pass through unchanged
+def _assign(vec: np.ndarray) -> int:  # Return the id from `cids` whose centroid is closest to `vec`. NOTE(review): `-> int` assumes cluster ids are ints — confirm against the pickled artifact
     dists = [np.linalg.norm(vec - centroides[c]) for c in cids]  # one np.linalg.norm distance per centroid, in `cids` order
     return cids[int(np.argmin(dists))]  # argmin returns the first index on ties, so the earliest id in `cids` wins
+def _truncate_for_classifier(text: str, max_chars: int = 1000) -> str:  # Return `text` unchanged when it fits, otherwise its first `max_chars` characters.
+    # Caps the input so tokenization on CPU stays cheap (author's estimate: ≈256 tokens for the default cap)
+    return text if len(text) <= max_chars else text[:max_chars]
+def _fast_emotion(emb: np.ndarray) -> str:  # Return the label in EMOTIONS whose precomputed hypothesis embedding scores highest against `emb`.
+    # The dot product acts as cosine similarity, assuming `emb` is normalized like `_emotion_embs` (as `_encode` produces)
+    sims = (_emotion_embs @ emb.reshape(-1, 1)).squeeze(-1)  # `emb` is flattened to a column; yields one score per row of `_emotion_embs`
+    return EMOTIONS[int(np.argmax(sims))]  # argmax returns the first index on ties, so the earlier label in EMOTIONS wins
 @app.post("/predict")
 def predict(payload: dict):
     item = payload.get("data", [{}])[0]
+    subject = item.get("subject", "")
+    body = item.get("body", "")
+    text = f"{subject} — {body}"
     emb = _encode(text)[0]
     cid = _assign(emb)
+    # RÁPIDO: sentimiento con truncado
+    s = sentiment(_truncate_for_classifier(text), truncation=True, max_length=256)[0]["label"]
+    # RÁPIDO: emoción por similitud con SBERT (sin segundo Transformer)
+    e = _fast_emotion(emb)
     m = meta.get(str(cid), meta.get(cid, {}))
     return {
+        "subject": subject,
+        "body": body,
         "cluster": cid,
         "cluster_nombre": (m or {}).get("nombre"),
         "cluster_desc": (m or {}).get("descripcion"),