import os
import base64
import io
from typing import Optional

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from dash import Dash, html, dcc, Input, Output, State, callback_context
import dash_bootstrap_components as dbc
from dotenv import load_dotenv
from pydantic import Field, SecretStr

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
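
# The imports above assume these packages are installed (typical pip names,
# adjust to your environment): dash, dash-bootstrap-components, pandas, plotly,
# python-dotenv, pydantic, langchain, langchain-core, langchain-community,
# langchain-openai, langchain-huggingface, faiss-cpu, sentence-transformers.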

# Load environment variables (expects OPENROUTER_API_KEY in a .env file)
load_dotenv()

class ChatOpenRouter(ChatOpenAI):
    """ChatOpenAI subclass pointed at the OpenRouter API."""

    def __init__(self,
                 openai_api_key: Optional[str] = None,
                 **kwargs):
        openai_api_key = openai_api_key or os.environ.get("OPENROUTER_API_KEY")
        super().__init__(
            base_url="https://openrouter.ai/api/v1",
            openai_api_key=openai_api_key,
            **kwargs
        )


# Sampling parameters are passed explicitly: top_p, frequency_penalty and
# presence_penalty are declared fields on ChatOpenAI, so recent langchain_openai
# versions reject them inside model_kwargs.
openrouter_model = ChatOpenRouter(
    model="microsoft/phi-4-reasoning-plus",
    temperature=0.3,
    max_tokens=1500,
    top_p=0.9,
    frequency_penalty=0.0,
    presence_penalty=0.0,
    streaming=False
)

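# Quick sanity check (a sketch, not part of the app flow): with a valid
# OPENROUTER_API_KEY you could verify connectivity like this:
#   reply = openrouter_model.invoke("Reply with the single word: ready")
#   print(reply.content)
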
app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
server = app.server

def init_langchain():
    """Initialize LangChain components (embeddings and text splitter)."""
    try:
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'}
        )

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )

        return embeddings, text_splitter
    except Exception as e:
        print(f"Error initializing LangChain: {e}")
        return None, None


embeddings, text_splitter = init_langchain()
vector_store = None

app.layout = dbc.Container([
    dbc.Row([
        dbc.Col([
            html.H1("🤖 AI-Powered Data Analytics", className="text-center mb-4"),
            html.P("Upload data, ask questions, and get AI-powered insights!",
                   className="text-center text-muted"),
            html.Hr(),
        ], width=12)
    ]),

    dbc.Row([
        dbc.Col([
            dbc.Card([
                dbc.CardBody([
                    html.H4("📁 Data Upload", className="card-title"),
                    dcc.Upload(
                        id='upload-data',
                        children=html.Div([
                            'Drag and Drop or ',
                            html.A('Select Files')
                        ]),
                        style={
                            'width': '100%',
                            'height': '60px',
                            'lineHeight': '60px',
                            'borderWidth': '1px',
                            'borderStyle': 'dashed',
                            'borderRadius': '5px',
                            'textAlign': 'center',
                            'margin': '10px'
                        },
                        multiple=False,
                        accept='.csv,.xlsx,.txt'
                    ),

                    html.Div(id='upload-status', className="mt-2"),
                    html.Hr(),

                    html.H4("🤖 AI Assistant", className="card-title"),
                    dbc.InputGroup([
                        dbc.Input(
                            id="ai-question",
                            placeholder="Ask questions about your data...",
                            type="text",
                            style={"fontSize": "14px"}
                        ),
                        dbc.Button(
                            "Ask AI",
                            id="ask-button",
                            color="primary",
                            n_clicks=0
                        )
                    ]),

                    html.Div(id="ai-response", className="mt-3"),
                    html.Hr(),

                    html.H4("📊 Quick Analytics", className="card-title"),
                    dbc.ButtonGroup([
                        dbc.Button("Summary Stats", id="stats-btn", size="sm"),
                        dbc.Button("Correlations", id="corr-btn", size="sm"),
                        dbc.Button("Missing Data", id="missing-btn", size="sm"),
                    ], className="w-100"),

                    html.Div(id="quick-analytics", className="mt-3")
                ])
            ])
        ], width=4),

        dbc.Col([
            dbc.Card([
                dbc.CardBody([
                    html.H4("📈 Visualizations", className="card-title"),
                    dcc.Graph(id='main-graph', style={'height': '400px'}),
                ])
            ]),

            dbc.Card([
                dbc.CardBody([
                    html.H4("📋 Data Explorer", className="card-title"),
                    html.Div(id='data-table')
                ])
            ], className="mt-3")
        ], width=8)
    ], className="mt-4"),

    dcc.Store(id='stored-data'),
    dcc.Store(id='data-context')
], fluid=True)


def create_vector_store(df):
    """Create a FAISS vector store of dataset descriptions from the dataframe."""
    global vector_store

    if embeddings is None:
        return False

    try:
        documents = []

        # Schema: shape, column names, and dtypes
        col_info = f"Dataset has {len(df)} rows and {len(df.columns)} columns.\n"
        col_info += f"Columns: {', '.join(df.columns)}\n"
        col_info += f"Data types: {df.dtypes.to_string()}\n"
        documents.append(Document(page_content=col_info, metadata={"type": "schema"}))

        # Summary statistics
        summary = df.describe().to_string()
        documents.append(Document(page_content=f"Summary statistics:\n{summary}",
                                  metadata={"type": "statistics"}))

        # Sample rows
        sample_data = df.head(10).to_string()
        documents.append(Document(page_content=f"Sample data:\n{sample_data}",
                                  metadata={"type": "sample"}))

        # Correlations between numeric columns (if there are at least two)
        numeric_cols = df.select_dtypes(include=['number']).columns
        if len(numeric_cols) > 1:
            corr = df[numeric_cols].corr().to_string()
            documents.append(Document(page_content=f"Correlations:\n{corr}",
                                      metadata={"type": "correlation"}))

        vector_store = FAISS.from_documents(documents, embeddings)
        return True

    except Exception as e:
        print(f"Error creating vector store: {e}")
        return False

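# Note: get_ai_response below only uses the vector store as a readiness check and
# builds its context directly from the dataframe. If you want true retrieval, a
# minimal sketch would be to pull the most relevant documents first, e.g.:
#   docs = vector_store.similarity_search(question, k=2)
#   retrieved = "\n\n".join(d.page_content for d in docs)
# and include `retrieved` in the prompt context.
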
def get_ai_response(question, df):
    """Get an AI response about the uploaded data via the OpenRouter LLM."""
    global vector_store

    if vector_store is None:
        return "Please upload data first to enable AI features."

    try:
        # Build a compact textual context describing the dataset
        data_context = f"""
Dataset Information:
- Shape: {df.shape[0]} rows × {df.shape[1]} columns
- Columns: {', '.join(df.columns)}
- Data Types: {df.dtypes.to_dict()}
- Missing Values: {df.isnull().sum().to_dict()}

Sample Data (first 5 rows):
{df.head().to_string()}

Summary Statistics:
{df.describe().to_string()}
"""

        prompt_template = PromptTemplate(
            input_variables=["question", "data_context"],
            template="""
You are a professional data analyst AI assistant. Based on the provided dataset information, answer the user's question with clear, actionable insights.

Dataset Context:
{data_context}

User Question: {question}

Please provide a helpful, accurate response with:
1. A direct answer to the question
2. Key insights or patterns you notice
3. Recommendations or next steps, if applicable

Use emojis and markdown formatting to make your response engaging and easy to read.
"""
        )

        llm_chain = LLMChain(
            llm=openrouter_model,
            prompt=prompt_template
        )

        response = llm_chain.run(
            question=question,
            data_context=data_context
        )

        return response

    except Exception as e:
        print(f"OpenRouter error: {e}")
        return f"""🤖 **AI Assistant** (Limited Mode):
I encountered an issue with the AI service. Here's some basic info about your data:

📊 **Quick Summary**:
- Shape: {df.shape[0]} rows × {df.shape[1]} columns
- Columns: {', '.join(df.columns)}
- Missing values: {df.isnull().sum().sum()} total

Please check your OPENROUTER_API_KEY configuration.
"""

def parse_contents(contents, filename):
    """Parse uploaded file contents"""
    content_type, content_string = contents.split(',')
    decoded = base64.b64decode(content_string)

    try:
        if 'csv' in filename:
            df = pd.read_csv(io.StringIO(decoded.decode('utf-8')))
        elif 'xls' in filename:
            df = pd.read_excel(io.BytesIO(decoded))
        else:
            return None, "Unsupported file type"

        return df, None
    except Exception as e:
        return None, f"Error processing file: {str(e)}"

@app.callback(
    [Output('stored-data', 'data'),
     Output('upload-status', 'children'),
     Output('data-table', 'children')],
    [Input('upload-data', 'contents')],
    [State('upload-data', 'filename')]
)
def update_data(contents, filename):
    """Update data when a file is uploaded"""
    if contents is None:
        return None, "", ""

    df, error = parse_contents(contents, filename)

    if error:
        return None, dbc.Alert(error, color="danger"), ""

    # Build the vector store so the AI assistant has dataset context
    vector_success = create_vector_store(df)

    # Preview table of the first rows
    table = dbc.Table.from_dataframe(
        df.head(10),
        striped=True,
        bordered=True,
        hover=True,
        size='sm'
    )

    ai_status = "🤖 AI Ready" if vector_success else "⚠️ AI Limited"

    success_msg = dbc.Alert([
        html.H6(f"✅ File uploaded successfully! {ai_status}"),
        html.P(f"Shape: {df.shape[0]} rows × {df.shape[1]} columns"),
        html.P(f"Columns: {', '.join(df.columns.tolist())}")
    ], color="success")

    return df.to_dict('records'), success_msg, table

@app.callback(
    Output('ai-response', 'children'),
    [Input('ask-button', 'n_clicks')],
    [State('ai-question', 'value'),
     State('stored-data', 'data')]
)
def handle_ai_question(n_clicks, question, data):
    """Handle AI question"""
    if not n_clicks or not question or not data:
        return ""

    df = pd.DataFrame(data)
    response = get_ai_response(question, df)

    return dbc.Alert(
        dcc.Markdown(response),
        color="info"
    )

@app.callback(
    Output('quick-analytics', 'children'),
    [Input('stats-btn', 'n_clicks'),
     Input('corr-btn', 'n_clicks'),
     Input('missing-btn', 'n_clicks')],
    [State('stored-data', 'data')]
)
def quick_analytics(stats_clicks, corr_clicks, missing_clicks, data):
    """Handle quick analytics buttons"""
    if not data:
        return ""

    df = pd.DataFrame(data)
    ctx = callback_context

    if not ctx.triggered:
        return ""

    # Identify which button fired the callback
    button_id = ctx.triggered[0]['prop_id'].split('.')[0]

    if button_id == 'stats-btn':
        stats = df.describe()
        return dbc.Alert([
            html.H6("📊 Summary Statistics"),
            dbc.Table.from_dataframe(stats.reset_index(), size='sm')
        ], color="light")

    elif button_id == 'corr-btn':
        numeric_df = df.select_dtypes(include=['number'])
        if len(numeric_df.columns) > 1:
            corr = numeric_df.corr()
            fig = px.imshow(corr, text_auto=True, aspect="auto",
                            title="Correlation Matrix")
            return dcc.Graph(figure=fig, style={'height': '300px'})
        return dbc.Alert("No numeric columns for correlation analysis", color="warning")

    elif button_id == 'missing-btn':
        missing = df.isnull().sum()
        missing = missing[missing > 0]
        if missing.empty:
            return dbc.Alert("✅ No missing values!", color="success")
        return dbc.Alert([
            html.H6("⚠️ Missing Values"),
            html.Pre(missing.to_string())
        ], color="warning")

    return ""

@app.callback(
    Output('main-graph', 'figure'),
    [Input('stored-data', 'data')]
)
def update_main_graph(data):
    """Update main visualization"""
    if not data:
        return {}

    df = pd.DataFrame(data)

    numeric_cols = df.select_dtypes(include=['number']).columns
    categorical_cols = df.select_dtypes(include=['object']).columns

    if len(numeric_cols) >= 2:
        # Scatter plot of the first two numeric columns
        fig = px.scatter(df, x=numeric_cols[0], y=numeric_cols[1],
                         title=f"Relationship: {numeric_cols[1]} vs {numeric_cols[0]}")
    elif len(numeric_cols) >= 1 and len(categorical_cols) >= 1:
        # Bar chart of a numeric column grouped by a categorical column
        fig = px.bar(df, x=categorical_cols[0], y=numeric_cols[0],
                     title=f"Distribution: {numeric_cols[0]} by {categorical_cols[0]}")
    elif len(numeric_cols) >= 1:
        # Histogram of the single numeric column
        fig = px.histogram(df, x=numeric_cols[0],
                           title=f"Distribution of {numeric_cols[0]}")
    else:
        # No plottable columns: show a placeholder annotation
        fig = go.Figure()
        fig.add_annotation(text="Upload data to see visualizations",
                           x=0.5, y=0.5, showarrow=False)

    fig.update_layout(template="plotly_white")
    return fig

if __name__ == '__main__':
    app.run_server(host='0.0.0.0', port=7860, debug=False)
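
# To try it locally (assumed workflow; the filename app.py is hypothetical):
# put OPENROUTER_API_KEY=... in .env, run `python app.py`, and open
# http://localhost:7860 in a browser.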