import json
import os

import gradio as gr
import numpy as np
import pandas as pd
import torch
from huggingface_hub import login
from sentence_transformers import SentenceTransformer
from transformers import pipeline
# Sanity Check
hf_token = os.getenv("V2_TOKEN")
if hf_token is None:
    raise RuntimeError("V2_TOKEN environment variable is not set in this Space.")

# Explicit login
login(token=hf_token)
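# V2_TOKEN is expected to be configured as a secret in the Space settings
# (Hugging Face exposes Space secrets as environment variables).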
# --- Configuration ---
print("Loading RAG system on your device...")
# Load Knowledge base
FILE_PATH = "data.jsonl"
PRELOAD_FILE_PATH = "preload-data"
# File path readings
if not os.path.exists(FILE_PATH):
    # Dummy record so the app can still start without the knowledge base;
    # kept as a DataFrame so data["text"] works the same as the JSONL path
    print(f"Warning: {FILE_PATH} not found. Creating dummy data.")
    data = pd.DataFrame(
        [{"text": "To reset your password, visit password.sfu.ca and click 'Forgot Password'."}]
    )
elif os.path.exists(PRELOAD_FILE_PATH):
    print(f"Found Preloaded Data! Using {PRELOAD_FILE_PATH}...")
    with open(PRELOAD_FILE_PATH, "r", encoding="utf-8") as f:
        data = json.load(f)
else:
    print(f"No Preloaded Data Found. Using {FILE_PATH}...")
    with open(FILE_PATH, "r", encoding="utf-8") as f:
        data = pd.read_json(path_or_buf=f, lines=True)
# Cache the document texts so later runs can skip parsing the JSONL
if not os.path.exists(PRELOAD_FILE_PATH):
    documents = list(data["text"])
    print(f"Creating {PRELOAD_FILE_PATH}...")
    with open(PRELOAD_FILE_PATH, "w", encoding="utf-8") as fp:
        json.dump(documents, fp)
else:
    # The preload file already holds a plain list of document strings
    documents = data
# Embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(documents, convert_to_numpy=True)
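# all-MiniLM-L6-v2 produces 384-dim sentence embeddings; encoding happens
# once at startup, so every restart re-embeds the whole corpus.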
# Use pandas dataframe
df = pd.DataFrame(
    {
        "Document": documents,
        "Embedding": list(embeddings),  # store as a list of per-row vectors
    }
)
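# Note: scanning an in-memory DataFrame is fine for a small knowledge base;
# a much larger corpus would likely call for a proper vector index (e.g. FAISS).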
# Load LLM Pipeline
llm = pipeline(
"text-generation",
model="google/flan-t5-xl", # Might not have enough storage ngl
token=hf_token
)
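# Note: no device is set above, so the pipeline defaults to CPU; if the Space
# has GPU hardware, passing device=0 (or device_map="auto" with accelerate
# installed) to pipeline() would place the model on it.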
# Retrieve with pandas
def retrieve_with_pandas(query: str, top_k: int = 10):
    """
    Embed the query, compute cosine similarity to each document,
    and return the top_k most similar documents (as a DataFrame).
    """
    query_embedding = embedding_model.encode([query])[0]

    def cosine_sim(x):
        x = np.array(x)
        return float(
            np.dot(query_embedding, x)
            / (np.linalg.norm(query_embedding) * np.linalg.norm(x))
        )

    # Score on a copy so the global DataFrame is not mutated per request
    scored = df.assign(Similarity=df["Embedding"].apply(cosine_sim))
    results = scored.sort_values(by="Similarity", ascending=False).head(top_k)
    return results[["Document", "Similarity"]]
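# Example (hypothetical query):
#   retrieve_with_pandas("How do I reset my password?", top_k=3)
# -> DataFrame with the 3 best-matching documents and their similarities.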
def generate_with_rag(query, top_k=5):
    # Retrieve the top_k most relevant documents as a DataFrame
    docs = retrieve_with_pandas(query, top_k=top_k)

    # Join the document texts into a single context string,
    # with a divider between articles
    context_str = "\n\n---\n\n".join(docs["Document"].tolist())

    # Build a clean prompt
    input_text = f"""You are an IT helpdesk assistant.
If the user asked a question, answer it with detailed step-by-step instructions, considering all the articles below.
If the user asked a question and the answer is not in the articles, say you don't know and suggest contacting SFU IT.
If the user DID NOT ask a question, be friendly and ask how you can help them.
Question:
{query}
-- Start of Articles --
{context_str}
-- End of Articles --
Answer:"""

    # Call the LLM (text2text-generation returns only the generated answer,
    # so no return_full_text flag is needed)
    response = llm(
        input_text,
        max_new_tokens=1024,
        do_sample=False,
    )
    return response[0]["generated_text"].strip()
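# Example (hypothetical): generate_with_rag("How do I connect to SFU Wi-Fi?")
# retrieves the 5 closest articles and returns the model's answer string.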
def chat_fn(message, history):
    """
    Chat interface callback. The history argument is ignored,
    so each message is answered statelessly.
    """
    answer = generate_with_rag(message, top_k=2)
    return answer
demo = gr.ChatInterface(
    fn=chat_fn,
    title="SFU IT Chatbot",
    description="Enter your question and the SFU IT Chatbot will try to answer using retrieved SFU IT knowledge.",
)
if __name__ == "__main__":
    # Pass share=True to launch() for a public link when running locally
    demo.launch()