"""HTTP search API that blends SBERT-embedding and TF-IDF document rankings."""

from fastapi import FastAPI, Query
from fastapi.responses import JSONResponse
import polars as pl

from src.embeddings_search import create_embeddings_search_function_from_embeddings_df
from src.tfidf_search import create_tfidf_search_function

# Leading portion of each "file" value that is stripped before results are
# returned to the client.
path_prefix = "/Users/wes/Google Drive/Shared drives/datalab/projects/2025_coul_aisearch/data/original_box_download/"

# Artifacts handed to the two search-function factories below.
# NOTE(review): these are relative paths -- presumably resolved against the
# process working directory; confirm against the factory implementations.
block_embeddings_df_path = "block_embeddings/block-embeddings.parquet"
doc_tfidf_df_path = "block_tfidf/TF-IDF-doc-text.parquet"
tfidf_vectorizer_path = "block_tfidf/tfidf_vectorizer_doc_text.joblib"

# Weights applied to each searcher's rank when forming the combined rank, and
# the numerator used to turn a combined rank into a confidence value.
_SBERT_WEIGHT = 0.7
_TFIDF_WEIGHT = 0.3
_CONFIDENCE_SCALE = 20.0

# Both query callables are built once at import time and reused per request.
sbert_query_docs = create_embeddings_search_function_from_embeddings_df(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    embeddings_df_path=block_embeddings_df_path,
    device="cpu",
)

tfidf_query_docs = create_tfidf_search_function(
    dtm_df_path=doc_tfidf_df_path,
    vectorizer_path=tfidf_vectorizer_path,
    model_name="facebook/fasttext-en-vectors",
)

app = FastAPI()


@app.get("/")
def default():
    """Return a fixed status payload with the service version."""
    return {"status": "ok", "version": 0.1}


@app.get("/search", response_class=JSONResponse)
def search(q: str = Query(..., description="Search query")):
    """Run both searchers on ``q`` and return the blended results.

    The two result frames are inner-joined on ``file``, so only files present
    in both appear in the output. The combined rank is a weighted sum of the
    ``rank-sbert`` and ``rank-tfidf`` columns; rows are sorted by it in
    ascending order and ``confidence`` is the scale constant divided by the
    combined rank, rounded to two decimals.

    Args:
        q: The search query (required query-string parameter).

    Returns:
        A list of ``{"file": ..., "confidence": ...}`` dicts, one per row.
    """
    tfidf_hits = tfidf_query_docs(q)
    sbert_hits = sbert_query_docs(q)

    matched = sbert_hits.join(tfidf_hits, on="file", how="inner")

    blended_rank = (
        _SBERT_WEIGHT * pl.col("rank-sbert") + _TFIDF_WEIGHT * pl.col("rank-tfidf")
    ).alias("rank-combined")
    trimmed_file = pl.col("file").str.strip_prefix(path_prefix).alias("file")
    # NOTE(review): a combined rank of 0 would give a non-finite confidence;
    # presumably ranks start at 1 -- confirm against the search helpers.
    confidence = (
        (_CONFIDENCE_SCALE / pl.col("rank-combined")).round(2).alias("confidence")
    )

    ordered = matched.with_columns(blended_rank, trimmed_file).sort("rank-combined")
    scored = ordered.with_columns(confidence)
    return scored.select(["file", "confidence"]).to_dicts()


@app.get("/test")
def echo(query: str):
    """Return the ``query`` parameter back to the caller under the ``echo`` key."""
    return {"echo": query}