File size: 14,769 Bytes
c4848ed
 
 
 
 
 
e153726
c4848ed
b19a39d
 
 
c4848ed
d2ef69d
 
e153726
c4848ed
 
b19a39d
 
 
 
 
 
 
 
 
 
 
 
da23c4f
 
 
 
 
b19a39d
 
 
d2ef69d
b19a39d
 
d2ef69d
 
 
 
 
b19a39d
 
c4848ed
 
 
 
 
e153726
c4848ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b19a39d
c4848ed
 
 
 
 
 
b19a39d
 
 
 
 
 
 
 
 
 
 
 
 
 
c4848ed
b19a39d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c4848ed
b19a39d
c4848ed
b19a39d
 
 
 
 
c4848ed
b19a39d
 
 
 
 
c4848ed
b19a39d
c4848ed
 
b19a39d
 
 
 
 
 
 
 
 
 
 
 
c4848ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
import os
import base64
import io
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from dash import Dash, html, dcc, Input, Output, State, callback_context
import dash_bootstrap_components as dbc
from typing import Optional
from dotenv import load_dotenv
from pydantic import Field, SecretStr

# Fixed Langchain imports (using langchain-huggingface for v0.2+)
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain

# Load environment variables
load_dotenv()

class ChatOpenRouter(ChatOpenAI):
    """`ChatOpenAI` client that targets the OpenRouter endpoint by default."""

    def __init__(self,
                 openai_api_key: Optional[str] = None,
                 **kwargs):
        """Create the client.

        Args:
            openai_api_key: API key to use. When omitted or empty, the
                ``OPENROUTER_API_KEY`` environment variable is read instead.
            **kwargs: Forwarded unchanged to ``ChatOpenAI``. A caller-supplied
                ``base_url`` takes precedence over the OpenRouter default.
        """
        # setdefault instead of a literal keyword: passing base_url explicitly
        # would otherwise collide with **kwargs and raise TypeError.
        kwargs.setdefault("base_url", "https://openrouter.ai/api/v1")
        openai_api_key = openai_api_key or os.environ.get("OPENROUTER_API_KEY")
        super().__init__(openai_api_key=openai_api_key, **kwargs)

# Initialize OpenRouter model
# Created once at import time and reused by get_ai_response(). No API key is
# passed here, so ChatOpenRouter falls back to the OPENROUTER_API_KEY
# environment variable (populated by load_dotenv() above when a .env exists).
# NOTE(review): newer langchain-openai releases appear to expose top_p and the
# penalty settings as explicit parameters — confirm whether they should move
# out of model_kwargs for the installed version.
openrouter_model = ChatOpenRouter(
    model="microsoft/phi-4-reasoning-plus",
    temperature=0.3,
    max_tokens=1500,
    model_kwargs={
        "top_p": 0.9,
        "frequency_penalty": 0.0,
        "presence_penalty": 0.0
    },
    streaming=False        
)

# Initialize Dash app
app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
# Expose the app's underlying server object at module level
# (presumably for an external WSGI host — confirm against the deployment).
server = app.server

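# Initialize Langchain components (embedding model and text splitter).
# Not cached with Streamlit's @st.cache_resource; the result is kept in module globals instead.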
def init_langchain():
    """Build the embedding model and text splitter used by the AI features.

    Returns:
        tuple: ``(embeddings, text_splitter)`` on success, or ``(None, None)``
        when constructing either object raises (the error is printed).
    """
    # Lightweight sentence-transformers model, pinned to CPU
    embedding_options = {
        "model_name": "sentence-transformers/all-MiniLM-L6-v2",
        "model_kwargs": {'device': 'cpu'},
    }
    splitter_options = {"chunk_size": 1000, "chunk_overlap": 200}

    try:
        embedder = HuggingFaceEmbeddings(**embedding_options)
        splitter = RecursiveCharacterTextSplitter(**splitter_options)
    except Exception as e:
        print(f"Error initializing Langchain: {e}")
        return None, None

    return embedder, splitter

# Global variables
# Built once at import time; both are None if init_langchain() failed.
embeddings, text_splitter = init_langchain()
# Module-level FAISS index. Starts empty; create_vector_store() rebinds it
# whenever it manages to build an index from an uploaded DataFrame.
vector_store = None

# App layout
# Page structure: a title row, then a two-column row (controls on the left,
# graph and table preview on the right), then the client-side stores.
# Heading emoji are written as real Unicode characters rather than
# mis-decoded byte sequences.
app.layout = dbc.Container([
    # Title row
    dbc.Row([
        dbc.Col([
            html.H1("🤖 AI-Powered Data Analytics", className="text-center mb-4"),
            html.P("Upload data, ask questions, and get AI-powered insights!",
                   className="text-center text-muted"),
            html.Hr(),
        ], width=12)
    ]),

    dbc.Row([
        # Left column: upload, AI question box, quick-analytics buttons
        dbc.Col([
            dbc.Card([
                dbc.CardBody([
                    html.H4("📁 Data Upload", className="card-title"),
                    dcc.Upload(
                        id='upload-data',
                        children=html.Div([
                            'Drag and Drop or ',
                            html.A('Select Files')
                        ]),
                        style={
                            'width': '100%',
                            'height': '60px',
                            'lineHeight': '60px',
                            'borderWidth': '1px',
                            'borderStyle': 'dashed',
                            'borderRadius': '5px',
                            'textAlign': 'center',
                            'margin': '10px'
                        },
                        multiple=False,
                        accept='.csv,.xlsx,.txt'
                    ),

                    # Filled by update_data() with a success or error alert
                    html.Div(id='upload-status', className="mt-2"),
                    html.Hr(),

                    html.H4("🤖 AI Assistant", className="card-title"),
                    dbc.InputGroup([
                        dbc.Input(
                            id="ai-question",
                            placeholder="Ask questions about your data...",
                            type="text",
                            style={"fontSize": "14px"}
                        ),
                        dbc.Button(
                            "Ask AI",
                            id="ask-button",
                            color="primary",
                            n_clicks=0
                        )
                    ]),

                    # Filled by handle_ai_question()
                    html.Div(id="ai-response", className="mt-3"),
                    html.Hr(),

                    html.H4("📊 Quick Analytics", className="card-title"),
                    dbc.ButtonGroup([
                        dbc.Button("Summary Stats", id="stats-btn", size="sm"),
                        dbc.Button("Correlations", id="corr-btn", size="sm"),
                        dbc.Button("Missing Data", id="missing-btn", size="sm"),
                    ], className="w-100"),

                    # Filled by quick_analytics()
                    html.Div(id="quick-analytics", className="mt-3")
                ])
            ])
        ], width=4),

        # Right column: default chart and a preview of the uploaded rows
        dbc.Col([
            dbc.Card([
                dbc.CardBody([
                    html.H4("📈 Visualizations", className="card-title"),
                    dcc.Graph(id='main-graph', style={'height': '400px'}),
                ])
            ]),

            dbc.Card([
                dbc.CardBody([
                    html.H4("🔍 Data Explorer", className="card-title"),
                    html.Div(id='data-table')
                ])
            ], className="mt-3")
        ], width=8)
    ], className="mt-4"),

    # Store components
    dcc.Store(id='stored-data'),
    dcc.Store(id='data-context')
], fluid=True)

def create_vector_store(df):
    """Build a FAISS index describing ``df`` and store it in ``vector_store``.

    Up to four text documents are indexed: a schema overview, ``describe()``
    output, the first ten rows, and (when there are at least two numeric
    columns) the correlation matrix.

    Args:
        df: The uploaded DataFrame.

    Returns:
        bool: True when the index was built and the module-level
        ``vector_store`` was rebound; False when embeddings are unavailable
        or any step raised (the error is printed).
    """
    global vector_store

    if embeddings is None:
        return False

    try:
        # Column labels are not always strings (e.g. numeric spreadsheet
        # headers); str.join would raise TypeError on them.
        column_names = ', '.join(map(str, df.columns))

        schema_text = (
            f"Dataset has {len(df)} rows and {len(df.columns)} columns.\n"
            f"Columns: {column_names}\n"
            f"Data types: {df.dtypes.to_string()}\n"
        )
        summary = df.describe().to_string()
        sample_data = df.head(10).to_string()

        documents = [
            Document(page_content=schema_text, metadata={"type": "schema"}),
            Document(page_content=f"Summary statistics:\n{summary}",
                     metadata={"type": "statistics"}),
            Document(page_content=f"Sample data:\n{sample_data}",
                     metadata={"type": "sample"}),
        ]

        # A correlation matrix is only meaningful with two or more numeric columns
        numeric_cols = df.select_dtypes(include=['number']).columns
        if len(numeric_cols) > 1:
            corr = df[numeric_cols].corr().to_string()
            documents.append(Document(page_content=f"Correlations:\n{corr}",
                                      metadata={"type": "correlation"}))

        vector_store = FAISS.from_documents(documents, embeddings)
        return True

    except Exception as e:
        print(f"Error creating vector store: {e}")
        return False

def get_ai_response(question, df):
    """Answer ``question`` about ``df`` with the OpenRouter chat model.

    The prompt carries a text profile of ``df``: shape, column names, dtypes,
    per-column missing counts, the first five rows and ``describe()`` output.
    The module-level ``vector_store`` is used only as a readiness gate: it is
    checked for ``None`` but never queried.

    Args:
        question: The user's question, inserted verbatim into the prompt.
        df: DataFrame to describe to the model.

    Returns:
        str: The model's reply; a "please upload" notice when no vector store
        exists; or a Markdown fallback summary when building the prompt or
        calling the model raises.
    """
    if vector_store is None:
        return "Please upload data first to enable AI features."

    # str() each label so non-string column names cannot break the join,
    # including inside the fallback path below.
    column_names = ', '.join(map(str, df.columns))

    try:
        # Create data context for the LLM
        data_context = f"""
Dataset Information:
- Shape: {df.shape[0]} rows × {df.shape[1]} columns
- Columns: {column_names}
- Data Types: {df.dtypes.to_dict()}
- Missing Values: {df.isnull().sum().to_dict()}

Sample Data (first 5 rows):
{df.head().to_string()}

Summary Statistics:
{df.describe().to_string()}
        """

        # Create a prompt template for data analysis
        prompt_template = PromptTemplate(
            input_variables=["question", "data_context"],
            template="""
You are a professional data analyst AI assistant. Based on the provided dataset information, answer the user's question with clear, actionable insights.

Dataset Context:
{data_context}

User Question: {question}

Please provide a helpful, accurate response with:
1. Direct answer to the question
2. Key insights or patterns you notice
3. Recommendations or next steps if applicable

Use emojis and markdown formatting to make your response engaging and easy to read.
            """
        )

        # Pipe the prompt into the model instead of the deprecated
        # LLMChain(...).run(...) wrapper.
        chain = prompt_template | openrouter_model
        result = chain.invoke({
            "question": question,
            "data_context": data_context,
        })

        # Chat models return a message object; hand back only its text.
        return getattr(result, "content", result)

    except Exception as e:
        # Fallback to basic responses if OpenRouter fails
        print(f"OpenRouter error: {e}")
        # Lines are joined flush-left: indented lines after a blank line
        # would be rendered by Markdown as a code block.
        return "\n".join([
            "🤖 **AI Assistant** (Limited Mode):",
            "I encountered an issue with the AI service. Here's basic info about your data:",
            "",
            "📊 **Quick Summary**:",
            "",
            f"- Shape: {df.shape[0]} rows × {df.shape[1]} columns",
            f"- Columns: {column_names}",
            f"- Missing values: {df.isnull().sum().sum()} total",
            "",
            "Please check your OPENROUTER_API_KEY configuration.",
        ])

def parse_contents(contents, filename):
    """Decode an upload payload into a DataFrame.

    The parser is chosen from the file extension, compared case-insensitively:
    ``.csv`` is read as comma-separated text, extensions starting with
    ``.xls`` go to ``pd.read_excel``, and ``.txt`` is read as delimited text
    with the delimiter sniffed by pandas.

    Args:
        contents: Upload string of the form ``"<header>,<base64 data>"``.
        filename: Name of the uploaded file; only its extension is used.

    Returns:
        tuple: ``(df, None)`` on success, or ``(None, message)`` when the
        payload is malformed, the extension is unsupported, or parsing fails.
    """
    # partition (not split) so a missing comma is reported instead of raising
    _header, separator, payload = (contents or "").partition(',')
    if not separator:
        return None, "Error processing file: malformed upload contents"

    # Match on the real extension: substring checks misroute names such as
    # "csv_export.json" and miss upper-case extensions like "DATA.CSV".
    extension = os.path.splitext(filename or "")[1].lower()

    try:
        decoded = base64.b64decode(payload)

        if extension == '.csv':
            # utf-8-sig drops a leading BOM so it does not end up in the
            # first column name
            df = pd.read_csv(io.StringIO(decoded.decode('utf-8-sig')))
        elif extension.startswith('.xls'):
            df = pd.read_excel(io.BytesIO(decoded))
        elif extension == '.txt':
            # The uploader accepts .txt; let pandas detect the delimiter
            df = pd.read_csv(io.StringIO(decoded.decode('utf-8-sig')),
                             sep=None, engine='python')
        else:
            return None, "Unsupported file type"

        return df, None
    except Exception as e:
        return None, f"Error processing file: {str(e)}"

@app.callback(
    [Output('stored-data', 'data'),
     Output('upload-status', 'children'),
     Output('data-table', 'children')],
    [Input('upload-data', 'contents')],
    [State('upload-data', 'filename')]
)
def update_data(contents, filename):
    """Parse a newly uploaded file and refresh the store, status and preview.

    Args:
        contents: Upload payload from ``upload-data``, or None before any upload.
        filename: Name of the uploaded file.

    Returns:
        tuple: ``(records, status, table)`` —
        the rows as a list of dicts for ``stored-data`` (None on failure),
        an alert for ``upload-status`` (empty string when nothing was uploaded),
        and a 10-row preview table for ``data-table`` (empty string on failure).
    """
    if contents is None:
        return None, "", ""

    df, error = parse_contents(contents, filename)
    if error:
        return None, dbc.Alert(error, color="danger"), ""

    # Create vector store for AI; failure only downgrades the status label
    vector_success = create_vector_store(df)

    # Create data table preview
    table = dbc.Table.from_dataframe(
        df.head(10),
        striped=True,
        bordered=True,
        hover=True,
        size='sm'
    )

    ai_status = "🤖 AI Ready" if vector_success else "⚠️ AI Limited"

    # Labels may be non-strings (e.g. numeric spreadsheet headers)
    column_names = ', '.join(map(str, df.columns))

    success_msg = dbc.Alert([
        html.H6(f"✅ File uploaded successfully! {ai_status}"),
        html.P(f"Shape: {df.shape[0]} rows × {df.shape[1]} columns"),
        html.P(f"Columns: {column_names}")
    ], color="success")

    return df.to_dict('records'), success_msg, table

@app.callback(
    Output('ai-response', 'children'),
    [Input('ask-button', 'n_clicks')],
    [State('ai-question', 'value'),
     State('stored-data', 'data')]
)
def handle_ai_question(n_clicks, question, data):
    """Render the AI answer for the typed question.

    Returns an empty string until the button has been clicked, a question
    has been entered and data has been stored; otherwise an info alert
    wrapping the Markdown-rendered response.
    """
    ready = bool(n_clicks) and bool(question) and bool(data)
    if not ready:
        return ""

    answer = get_ai_response(question, pd.DataFrame(data))
    return dbc.Alert(dcc.Markdown(answer), color="info")

@app.callback(
    Output('quick-analytics', 'children'),
    [Input('stats-btn', 'n_clicks'),
     Input('corr-btn', 'n_clicks'),
     Input('missing-btn', 'n_clicks')],
    [State('stored-data', 'data')]
)
def quick_analytics(stats_clicks, corr_clicks, missing_clicks, data):
    """Render the panel for whichever quick-analytics button was pressed.

    The click counters are not read; the pressed button is identified from
    ``callback_context``.

    Returns:
        An empty string when there is no stored data or nothing triggered the
        callback; otherwise a summary-statistics table, a correlation heatmap
        (or a warning when fewer than two numeric columns exist), or a
        missing-values report.
    """
    ctx = callback_context

    # Bail out before building a DataFrame when there is nothing to show
    if not data or not ctx.triggered:
        return ""

    df = pd.DataFrame(data)
    # prop_id has the form "<component-id>.<property>"
    button_id = ctx.triggered[0]['prop_id'].split('.')[0]

    if button_id == 'stats-btn':
        stats = df.describe()
        return dbc.Alert([
            html.H6("📊 Summary Statistics"),
            dbc.Table.from_dataframe(stats.reset_index(), size='sm')
        ], color="light")

    if button_id == 'corr-btn':
        numeric_df = df.select_dtypes(include=['number'])
        if len(numeric_df.columns) > 1:
            corr = numeric_df.corr()
            fig = px.imshow(corr, text_auto=True, aspect="auto",
                            title="Correlation Matrix")
            return dcc.Graph(figure=fig, style={'height': '300px'})
        # Also reached with exactly one numeric column, so say what is needed
        return dbc.Alert("Need at least two numeric columns for correlation analysis",
                         color="warning")

    if button_id == 'missing-btn':
        missing = df.isnull().sum()
        missing = missing[missing > 0]
        if missing.empty:
            return dbc.Alert("✅ No missing values!", color="success")
        return dbc.Alert([
            html.H6("⚠️ Missing Values"),
            html.Pre(missing.to_string())
        ], color="warning")

    return ""

@app.callback(
    Output('main-graph', 'figure'),
    [Input('stored-data', 'data')]
)
def update_main_graph(data):
    """Pick a default chart for the stored data.

    Two or more numeric columns give a scatter of the first two; one numeric
    plus at least one object column gives a bar chart; a lone numeric column
    gives a histogram; anything else gives an empty figure with a hint.
    Returns ``{}`` when no data is stored.
    """
    if not data:
        return {}

    frame = pd.DataFrame(data)
    numeric = frame.select_dtypes(include=['number']).columns
    categorical = frame.select_dtypes(include=['object']).columns
    n_numeric = len(numeric)
    n_categorical = len(categorical)

    if n_numeric >= 2:
        # Scatter of the first two numeric columns
        x_col, y_col = numeric[0], numeric[1]
        figure = px.scatter(frame, x=x_col, y=y_col,
                            title=f"Relationship: {y_col} vs {x_col}")
    elif n_numeric >= 1 and n_categorical >= 1:
        # First numeric column against the first object column
        category, value = categorical[0], numeric[0]
        figure = px.bar(frame, x=category, y=value,
                        title=f"Distribution: {value} by {category}")
    elif n_numeric >= 1:
        # Only one numeric column and nothing to group by
        only_col = numeric[0]
        figure = px.histogram(frame, x=only_col,
                              title=f"Distribution of {only_col}")
    else:
        # No numeric columns: show a placeholder annotation
        figure = go.Figure()
        figure.add_annotation(text="Upload data to see visualizations",
                              x=0.5, y=0.5, showarrow=False)

    figure.update_layout(template="plotly_white")
    return figure

if __name__ == '__main__':
    # Start the built-in server on all interfaces, port 7860, debug off.
    # `app.run` is used because `app.run_server` is the legacy name and is
    # rejected by newer Dash releases.
    app.run(host='0.0.0.0', port=7860, debug=False)