"""
AI Assistant Module for Data Analytics Dashboard

This module contains all AI-related functionality, including:
- LLM integrations (OpenRouter, OpenAI, etc.)
- Data analysis functions
- Natural language processing
- Chart generation from prompts
- Advanced analytics
"""

import os
import base64
import contextlib
import re
import traceback
from io import StringIO, BytesIO
from typing import Optional, Dict, Any, List

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from dotenv import load_dotenv

import matplotlib

matplotlib.use('Agg')  # headless backend; must be selected before pyplot/seaborn are imported
import matplotlib.pyplot as plt
import seaborn as sns

load_dotenv()

try:
    from langchain_core.prompts import PromptTemplate  # noqa: F401 - availability probe

    try:
        from langchain_openai import ChatOpenAI
    except ImportError:
        from langchain_community.chat_models import ChatOpenAI
    LANGCHAIN_AVAILABLE = True
except ImportError:
    print("LangChain not fully available - using demo mode")
    LANGCHAIN_AVAILABLE = False
|
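# Configuration is read from the process environment (loaded above via
# python-dotenv). A minimal .env file would contain a single entry, e.g.:
#
#     OPENROUTER_API_KEY=<your OpenRouter API key>
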

class PythonREPL:
    """Safe Python code execution environment for AI assistant"""

    def __init__(self, dataframe=None):
        self.df = dataframe
        # Whitelisted globals exposed to executed code: analysis/plotting
        # libraries plus a small set of safe builtins.
        self.globals_dict = {
            'pd': pd,
            'np': np,
            'plt': plt,
            'sns': sns,
            'px': px,
            'go': go,
            'len': len,
            'sum': sum,
            'min': min,
            'max': max,
            'abs': abs,
            'round': round,
            'range': range,
            'list': list,
            'dict': dict,
            'tuple': tuple,
            'set': set,
            'str': str,
            'int': int,
            'float': float,
            'bool': bool,
            'print': print,
        }
        if dataframe is not None:
            self.globals_dict['df'] = dataframe

    def execute_code(self, code: str) -> Dict[str, Any]:
        """Execute Python code safely and return results"""
        stdout_capture = StringIO()
        result = {
            'success': False,
            'output': '',
            'error': '',
            'plots': [],
            'returned_value': None,
        }

        try:
            if self._is_code_safe(code):
                with contextlib.redirect_stdout(stdout_capture):
                    # Run against a copy so executed code cannot mutate the
                    # shared whitelist between calls.
                    local_globals = self.globals_dict.copy()
                    exec(code, local_globals)

                result['success'] = True
                result['output'] = stdout_capture.getvalue()

                # Capture any matplotlib figures the code created
                if plt.get_fignums():
                    result['plots'] = self._capture_plots()
            else:
                result['error'] = "Code contains potentially unsafe operations and cannot be executed."
        except Exception as e:
            result['error'] = f"Error: {str(e)}\n{traceback.format_exc()}"

        return result

    def _is_code_safe(self, code: str) -> bool:
        """Check if code is safe to execute"""
        dangerous_patterns = [
            r'import\s+os',
            r'import\s+sys',
            r'import\s+subprocess',
            r'import\s+shutil',
            r'from\s+os',
            r'from\s+sys',
            r'from\s+subprocess',
            r'__import__',
            r'eval\s*\(',
            r'exec\s*\(',
            r'open\s*\(',
            r'file\s*\(',
            r'input\s*\(',
            r'raw_input\s*\(',
            r'exit\s*\(',
            r'quit\s*\(',
            r'del\s+',
            r'globals\s*\(',
            r'locals\s*\(',
            r'vars\s*\(',
            r'reload\s*\(',
            # Block file loading: the dataset is injected as 'df' instead
            r'pd\.read_csv\s*\(',
            r'pd\.read_excel\s*\(',
            r'pd\.read_json\s*\(',
            r'pandas\.read_csv\s*\(',
            r'pandas\.read_excel\s*\(',
            r'pandas\.read_json\s*\(',
            r'["\'][^"\']*\.csv["\']',
            r'["\'][^"\']*\.xlsx["\']',
            r'["\'][^"\']*\.json["\']',
            r'your_data_file',
        ]

        code_lower = code.lower()
        return not any(re.search(pattern, code_lower) for pattern in dangerous_patterns)

    def _capture_plots(self) -> List[str]:
        """Capture matplotlib plots as base64-encoded PNG images"""
        plots = []
        for fig_num in plt.get_fignums():
            fig = plt.figure(fig_num)
            img_buffer = BytesIO()
            fig.savefig(img_buffer, format='png', bbox_inches='tight', dpi=150)
            img_buffer.seek(0)
            plots.append(base64.b64encode(img_buffer.getvalue()).decode())
            plt.close(fig)
        return plots
|
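# Usage sketch (illustrative; `_demo_python_repl` is our own helper, not part
# of the app's API): runs a snippet against a tiny DataFrame and shows the
# safety-rejection path.
def _demo_python_repl() -> None:
    repl = PythonREPL(dataframe=pd.DataFrame({'x': [1, 2, 3]}))
    ok = repl.execute_code("print(df['x'].mean())")
    print(ok['success'], ok['output'])  # -> True 2.0
    blocked = repl.execute_code("import os\nos.listdir('.')")
    print(blocked['error'])  # rejected by _is_code_safe before execution
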

class ChatOpenRouter:
    """Custom ChatOpenRouter class for OpenRouter API integration"""

    def __init__(self, model="google/gemma-3-27b-it:free", temperature=0.3, max_tokens=1500, **kwargs):
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.api_key = os.environ.get("OPENROUTER_API_KEY")

        if not self.api_key:
            raise ValueError("OPENROUTER_API_KEY not found in environment variables")

        if LANGCHAIN_AVAILABLE:
            # OpenRouter exposes an OpenAI-compatible API, so ChatOpenAI can
            # talk to it by pointing base_url at the OpenRouter endpoint.
            self.client = ChatOpenAI(
                base_url="https://openrouter.ai/api/v1",
                api_key=self.api_key,
                model=model,
                temperature=temperature,
                max_tokens=max_tokens,
                **kwargs
            )
        else:
            self.client = None

    def invoke(self, messages):
        """Invoke the model with messages"""
        if self.client:
            return self.client.invoke(messages)
        # Demo-mode stand-in mimicking a response object with a .content attribute
        return type('Response', (), {'content': 'LangChain not available - using demo mode'})()

    def is_available(self):
        """Check if the client is properly initialized"""
        return self.client is not None and self.api_key is not None
|
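# Usage sketch (illustrative; `_demo_chat_openrouter` is our own helper):
# construction raises ValueError when OPENROUTER_API_KEY is missing, so
# callers should guard it. A live call requires a valid key.
def _demo_chat_openrouter() -> None:
    try:
        llm = ChatOpenRouter(temperature=0.0)
        if llm.is_available():
            reply = llm.invoke("Reply with the single word: pong")
            print(getattr(reply, 'content', reply))
    except ValueError as exc:
        print(f"OpenRouter not configured: {exc}")
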

class AIAssistant:
    """Main AI Assistant class that handles various AI-powered data analysis tasks"""

    def __init__(self):
        self.llm_client = None
        self.openrouter_available = self._init_openrouter()
        self.current_dataset = None
        self.dataset_context = {}
        self.python_repl = None

    def _init_openrouter(self) -> bool:
        """Initialize the OpenRouter LLM if an API key is available"""
        try:
            self.llm_client = ChatOpenRouter()
            if self.llm_client.is_available():
                print("✅ OpenRouter initialized successfully")
                return True
            else:
                print("⚠️ OpenRouter client not fully available - using demo mode")
                return False
        except Exception as e:
            print(f"❌ Failed to initialize OpenRouter: {e}")
            print("Using demo mode instead")
            return False

    def set_dataset(self, df: pd.DataFrame, dataset_name: str = "current"):
        """Set the current dataset for AI analysis"""
        self.current_dataset = df
        self.dataset_context[dataset_name] = {
            'dataframe': df,
            'shape': df.shape,
            'columns': df.columns.tolist(),
            'dtypes': df.dtypes.to_dict(),
            'missing_values': df.isnull().sum().to_dict(),
            'numeric_columns': df.select_dtypes(include=['number']).columns.tolist(),
            'categorical_columns': df.select_dtypes(include=['object']).columns.tolist(),
            'summary_stats': df.describe().to_dict() if len(df.select_dtypes(include=['number']).columns) > 0 else {}
        }
        # Rebuild the execution sandbox around the new dataframe
        self.python_repl = PythonREPL(dataframe=df)

    def get_llm_response(self, question: str, df: pd.DataFrame) -> str:
        """Generate an LLM-powered response using OpenRouter"""
        if not self.openrouter_available or not self.llm_client:
            return self.get_basic_response(question, df)

        try:
            # Questions that require computation are routed through code execution
            if self._should_execute_code(question):
                return self._get_code_execution_response(question, df)

            data_context = self._create_data_context(df)

            prompt = f"""You are a professional data analyst AI assistant with Python code execution capabilities. Based on the provided dataset information, answer the user's question with clear, actionable insights.

Dataset Context:
{data_context}

User Question: {question}

Available capabilities:
- You can write and execute Python code to analyze the data
- The dataset is available as the 'df' variable
- Available libraries: pandas (pd), numpy (np), matplotlib (plt), seaborn (sns), plotly (px, go)
- You can create visualizations and perform complex analyses

Response format:
1. Direct answer to the question based on the actual data
2. Key insights or patterns you notice in this specific dataset
3. If analysis requires computation, suggest or provide Python code
4. Practical recommendations or next steps if applicable
5. Use emojis and markdown formatting to make your response engaging and easy to read

Keep your response concise but informative, focusing on actionable insights about this specific dataset.
"""

            response = self.llm_client.invoke(prompt)
            return response.content if hasattr(response, 'content') else str(response)
        except Exception as e:
            print(f"Error getting LLM response: {e}")
            return self.get_basic_response(question, df)
    def _create_data_context(self, df: pd.DataFrame) -> str:
        """Create a comprehensive data context for the LLM"""
        numeric_cols = df.select_dtypes(include=['number']).columns
        categorical_cols = df.select_dtypes(include=['object']).columns

        context = f"""Dataset Information:
- Shape: {df.shape[0]:,} rows × {df.shape[1]} columns
- Columns: {', '.join(df.columns.tolist())}
- Numeric columns ({len(numeric_cols)}): {', '.join(numeric_cols.tolist())}
- Categorical columns ({len(categorical_cols)}): {', '.join(categorical_cols.tolist())}
- Missing values: {df.isnull().sum().sum()} total

Sample Data (first 3 rows):
{df.head(3).to_string()}

Summary Statistics (numeric columns):
{df.describe().to_string() if len(numeric_cols) > 0 else 'No numeric columns for statistics'}

Data Types:
{df.dtypes.to_string()}"""

        return context

    def _should_execute_code(self, question: str) -> bool:
        """Determine if the question requires code execution"""
        code_keywords = [
            'run code', 'execute', 'calculate', 'compute', 'plot', 'visualize', 'graph',
            'correlation matrix', 'regression', 'analysis', 'statistics', 'distribution',
            'histogram', 'scatter plot', 'bar chart', 'create chart', 'show me',
            'python code', 'pandas', 'numpy'
        ]
        question_lower = question.lower()
        return any(keyword in question_lower for keyword in code_keywords)
    def _get_code_execution_response(self, question: str, df: pd.DataFrame) -> str:
        """Generate a response by writing and executing Python code"""
        if not self.python_repl:
            return "Code execution environment not available. Please load a dataset first."

        code_prompt = f"""You are a Python data analyst. Generate Python code to answer this question about the dataset:

Question: {question}

IMPORTANT - Dataset is already loaded:
- The dataset is already loaded and available as the variable 'df'
- DO NOT use pd.read_csv() or any file loading commands
- DO NOT try to load data from files - it's already available as 'df'
- The dataframe 'df' contains {df.shape[0]} rows and {df.shape[1]} columns
- Columns available in df: {df.columns.tolist()}

Sample data from df:
{df.head(3).to_string()}

Requirements:
1. Use the pre-loaded dataframe 'df' directly
2. Write clean, well-commented Python code
3. Use pandas, numpy, matplotlib, seaborn as needed
4. Include print statements to show results
5. Create visualizations if requested
6. DO NOT use plt.show() - plots are automatically captured
7. Only return the Python code, no explanations

Code:"""

        try:
            response = self.llm_client.invoke(code_prompt)
            generated_code = response.content if hasattr(response, 'content') else str(response)

            code = self._extract_code_from_response(generated_code)
            if code:
                result = self.python_repl.execute_code(code)
                return self._format_code_execution_result(question, code, result)
            else:
                return f"I couldn't generate appropriate code for your request: {question}"
        except Exception as e:
            return f"Error generating code execution response: {str(e)}"

    def _extract_code_from_response(self, response: str) -> str:
        """Extract Python code from an LLM response"""
        # Prefer fenced code blocks, falling back to inline backticks
        code_patterns = [
            r'```python\s*\n(.*?)\n```',
            r'```\s*\n(.*?)\n```',
            r'`([^`]+)`'
        ]

        for pattern in code_patterns:
            matches = re.findall(pattern, response, re.DOTALL)
            if matches:
                code_result = matches[0].strip()
                # plt.show() would block under the Agg backend; plots are captured instead
                return re.sub(r'plt\.show\(\)\s*', '', code_result)

        # No code block found: heuristically keep only lines that look like code
        lines = response.strip().split('\n')
        code_lines = []
        for line in lines:
            # Skip prose lines and comment-only lines
            if any(skip in line.lower() for skip in ['here', 'this code', 'explanation', 'result']):
                continue
            if line.strip().startswith(('#', '//', '/*')):
                continue
            code_lines.append(line)

        code_result = '\n'.join(code_lines).strip()
        return re.sub(r'plt\.show\(\)\s*', '', code_result)
    def _format_code_execution_result(self, question: str, code: str, result: Dict[str, Any]) -> str:
        """Format the code execution result for display"""
        response_parts = [
            "## 🔍 **Code Execution Result**",
            f"**Question:** {question}",
            "",
            "### **Code:**",
            "```python",
            code,
            "```",
            ""
        ]

        if result['success']:
            if result['output']:
                response_parts.extend([
                    "### **Output:**",
                    "```",
                    result['output'],
                    "```",
                    ""
                ])

            if result['plots']:
                response_parts.extend([
                    "### **Generated Plots:**",
                    f"📊 {len(result['plots'])} plot(s) created.",
                    ""
                ])
                # Embed each captured plot as an inline base64 image
                for i, plot_base64 in enumerate(result['plots'], 1):
                    response_parts.extend([
                        f"**Plot {i}:**",
                        f"![Plot {i}](data:image/png;base64,{plot_base64})",
                        ""
                    ])
        else:
            response_parts.extend([
                "### **❌ Error:**",
                "```",
                result['error'],
                "```",
                ""
            ])

        return "\n".join(response_parts)
    def get_basic_response(self, question: str, df: pd.DataFrame) -> str:
        """Generate a basic AI response for demo mode"""
        question_lower = question.lower()

        # Keyword routing to the canned analysis generators
        if any(word in question_lower for word in ['overview', 'summary', 'describe', 'about']):
            return self._generate_data_overview(df)
        elif any(word in question_lower for word in ['missing', 'null', 'empty', 'incomplete']):
            return self._generate_missing_data_analysis(df)
        elif any(word in question_lower for word in ['correlation', 'relationship', 'related', 'associated']):
            return self._generate_correlation_analysis(df)
        elif any(word in question_lower for word in ['statistics', 'stats', 'mean', 'average', 'median']):
            return self._generate_statistics_analysis(df)
        elif any(word in question_lower for word in ['chart', 'plot', 'visualize', 'graph']):
            return self._generate_visualization_suggestions(df)
        elif any(word in question_lower for word in ['quality', 'clean', 'issues', 'problems']):
            return self._generate_data_quality_analysis(df)
        else:
            return self._generate_default_response(question, df)
    def _generate_data_overview(self, df: pd.DataFrame) -> str:
        """Generate a data overview response"""
        numeric_cols = len(df.select_dtypes(include=['number']).columns)
        categorical_cols = len(df.select_dtypes(include=['object']).columns)

        return f"""📊 **Data Overview**

**Dataset Summary:**
• Shape: {df.shape[0]:,} rows × {df.shape[1]} columns
• Numeric columns: {numeric_cols}
• Categorical columns: {categorical_cols}
• Total data points: {df.shape[0] * df.shape[1]:,}

**Key Insights:**
• The dataset contains {df.shape[0]:,} observations
• Memory usage: ~{df.memory_usage().sum() / 1024:.1f} KB
• Column diversity: {df.shape[1]} different variables to analyze

💡 **Suggested next steps:** Explore correlations, check data quality, or create visualizations!
"""

    def _generate_missing_data_analysis(self, df: pd.DataFrame) -> str:
        """Generate a missing data analysis response"""
        missing = df.isnull().sum()
        missing_cols = missing[missing > 0]

        if missing_cols.empty:
            return """✅ **Missing Data Analysis**

**Great news!** Your dataset has no missing values. This indicates:
• High data quality
• Complete observations for all variables
• Ready for analysis without imputation

💡 **This makes your analysis more reliable and straightforward!**
"""
        else:
            total_missing = missing_cols.sum()
            missing_percentage = (total_missing / (df.shape[0] * df.shape[1])) * 100

            missing_info = "\n".join([f"• {col}: {count} missing ({count/len(df)*100:.1f}%)"
                                      for col, count in missing_cols.head(5).items()])

            return f"""⚠️ **Missing Data Analysis**

**Missing Data Found:**
{missing_info}

**Impact Assessment:**
• Total missing values: {total_missing:,}
• Percentage of dataset: {missing_percentage:.2f}%
• Affected columns: {len(missing_cols)}

💡 **Recommendations:**
• Consider data imputation strategies
• Analyze patterns in missing data
• Evaluate if missing data is random or systematic
"""
    def _generate_correlation_analysis(self, df: pd.DataFrame) -> str:
        """Generate a correlation analysis response"""
        numeric_cols = df.select_dtypes(include=['number']).columns

        if len(numeric_cols) < 2:
            return """📊 **Correlation Analysis**

**Limited Analysis:** Your dataset has fewer than 2 numeric columns, so correlation analysis isn't applicable.

💡 **Suggestions:**
• Look at categorical relationships instead
• Consider frequency distributions
• Explore data patterns within individual variables
"""

        corr_matrix = df[numeric_cols].corr()

        # Collect moderate-to-strong pairs from the upper triangle only
        strong_corr = []
        for i in range(len(corr_matrix.columns)):
            for j in range(i + 1, len(corr_matrix.columns)):
                corr_val = corr_matrix.iloc[i, j]
                if abs(corr_val) > 0.5:
                    strength = "Strong" if abs(corr_val) > 0.7 else "Moderate"
                    direction = "positive" if corr_val > 0 else "negative"
                    strong_corr.append((corr_matrix.columns[i], corr_matrix.columns[j],
                                        corr_val, strength, direction))

        if strong_corr:
            corr_info = "\n".join([f"• {pair[0]} ↔ {pair[1]}: {pair[2]:.3f} ({pair[3]} {pair[4]})"
                                   for pair in strong_corr[:5]])
            return f"""📊 **Correlation Analysis**

**Strong Relationships Found:**
{corr_info}

**Analysis Summary:**
• {len(strong_corr)} significant correlations detected
• Analyzed {len(numeric_cols)} numeric variables
• Correlation threshold: >0.5

💡 **Insights:** These relationships could be key for predictive modeling or understanding data patterns!
"""
        else:
            return f"""📊 **Correlation Analysis**

**Analysis Results:**
• Analyzed {len(numeric_cols)} numeric variables
• No strong correlations (>0.5) detected
• Variables appear relatively independent

💡 **This suggests:**
• Variables measure different aspects
• Good for diverse analysis approaches
• Fewer multicollinearity concerns
"""
    def _generate_statistics_analysis(self, df: pd.DataFrame) -> str:
        """Generate a statistical analysis response"""
        numeric_cols = df.select_dtypes(include=['number']).columns

        if len(numeric_cols) == 0:
            return """📊 **Statistical Analysis**

**No numeric columns found** for statistical analysis.

💡 **Alternative approaches:**
• Frequency distributions for categorical data
• Mode analysis for text columns
• Data type conversions if needed
"""

        # Summarize at most the first five numeric columns
        stats_summary = []
        for col in numeric_cols[:5]:
            data = df[col]
            stats_summary.append(f"**{col}:**")
            stats_summary.append(f"  • Mean: {data.mean():.2f}")
            stats_summary.append(f"  • Median: {data.median():.2f}")
            stats_summary.append(f"  • Std Dev: {data.std():.2f}")
            stats_summary.append(f"  • Range: {data.min():.2f} to {data.max():.2f}")
            stats_summary.append("")

        return f"""📊 **Statistical Analysis**

{chr(10).join(stats_summary)}

**Key Insights:**
• {len(numeric_cols)} numeric variables analyzed
• Statistical distributions vary across columns
• Ready for advanced analytics

💡 **Next steps:** Consider outlier detection, normalization, or predictive modeling!
"""
    def _generate_visualization_suggestions(self, df: pd.DataFrame) -> str:
        """Generate visualization suggestions"""
        numeric_cols = df.select_dtypes(include=['number']).columns
        categorical_cols = df.select_dtypes(include=['object']).columns

        suggestions = []

        if len(numeric_cols) >= 2:
            suggestions.append("• **Scatter Plot**: Explore relationships between numeric variables")
            suggestions.append("• **Correlation Heatmap**: Visualize all correlations at once")

        if len(numeric_cols) >= 1:
            suggestions.append("• **Histogram**: Show distribution of numeric variables")
            suggestions.append("• **Box Plot**: Identify outliers and quartiles")

        if len(categorical_cols) >= 1:
            suggestions.append("• **Bar Chart**: Compare categories and frequencies")
            suggestions.append("• **Pie Chart**: Show proportions of categories")

        if len(numeric_cols) >= 1 and len(categorical_cols) >= 1:
            suggestions.append("• **Grouped Charts**: Compare numeric values across categories")

        if not suggestions:
            suggestions.append("• **Data Table**: Explore your data structure first")

        return f"""📈 **Visualization Suggestions**

**Recommended Charts for Your Data:**
{chr(10).join(suggestions)}

**Data Composition:**
• Numeric columns: {len(numeric_cols)}
• Categorical columns: {len(categorical_cols)}
• Total observations: {len(df):,}

💡 **Tip:** Start with simple charts and build complexity as you discover patterns!
"""
    def _generate_data_quality_analysis(self, df: pd.DataFrame) -> str:
        """Generate a data quality analysis"""
        quality_issues = []
        quality_score = 100

        # Missing-values penalty (capped at 30 points)
        missing_count = df.isnull().sum().sum()
        if missing_count > 0:
            missing_pct = (missing_count / (df.shape[0] * df.shape[1])) * 100
            quality_issues.append(f"• Missing values: {missing_count:,} ({missing_pct:.1f}% of data)")
            quality_score -= min(missing_pct * 2, 30)

        # Duplicate-rows penalty (capped at 25 points)
        duplicate_count = df.duplicated().sum()
        if duplicate_count > 0:
            duplicate_pct = (duplicate_count / len(df)) * 100
            quality_issues.append(f"• Duplicate rows: {duplicate_count} ({duplicate_pct:.1f}%)")
            quality_score -= min(duplicate_pct * 1.5, 25)

        # Outlier check: IQR rule, flagged when more than 5% of rows fall outside
        numeric_cols = df.select_dtypes(include=['number']).columns
        outlier_cols = []
        for col in numeric_cols:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            outliers = df[(df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))][col].count()
            if outliers > len(df) * 0.05:
                outlier_cols.append((col, outliers))

        if outlier_cols:
            quality_issues.append(f"• Potential outliers detected in {len(outlier_cols)} columns")
            quality_score -= len(outlier_cols) * 5

        quality_score = max(quality_score, 0)

        if not quality_issues:
            return f"""✅ **Data Quality Assessment**

**Excellent Data Quality! Score: {quality_score:.0f}/100**

**Quality Indicators:**
• No missing values detected
• No duplicate rows found
• Outliers within acceptable ranges
• Data ready for analysis

💡 **Your data is clean and analysis-ready!**
"""
        else:
            status_color = "🟢" if quality_score >= 80 else "🟡" if quality_score >= 60 else "🔴"

            return f"""{status_color} **Data Quality Assessment**

**Quality Score: {quality_score:.0f}/100**

**Issues Detected:**
{chr(10).join(quality_issues)}

**Recommendations:**
• Address missing values through imputation or removal
• Consider a duplicate-row handling strategy
• Investigate outliers for business significance

💡 **Data cleaning will improve analysis reliability!**
"""
    def _generate_default_response(self, question: str, df: pd.DataFrame) -> str:
        """Generate a default response with data context"""
        return f"""🤖 **AI Assistant** (Demo Mode)

**Your Question:** "{question}"

📊 **Dataset Context:**
• Shape: {df.shape[0]:,} rows × {df.shape[1]} columns
• Numeric columns: {len(df.select_dtypes(include=['number']).columns)}
• Categorical columns: {len(df.select_dtypes(include=['object']).columns)}

**I can help you with:**
• Data overviews and summaries
• Missing data analysis
• Correlation insights
• Statistical descriptions
• Visualization suggestions
• Data quality assessment

💡 **Try asking:** "What's the data overview?" or "Are there any correlations?"

⚙️ **Note:** Add OPENROUTER_API_KEY for advanced AI capabilities!
"""

# Global AI assistant instance shared by the main app
ai_assistant = AIAssistant()


def get_ai_response(question: str, df: pd.DataFrame) -> str:
    """Main function to get an AI response - can be called from the main app"""
    ai_assistant.set_dataset(df)
    return ai_assistant.get_llm_response(question, df)

def suggest_chart_type(df: pd.DataFrame, x_col: Optional[str] = None, y_col: Optional[str] = None) -> Dict[str, Any]:
    """Suggest the best chart type based on the data types of the chosen columns"""
    suggestions = {
        'recommended': 'scatter',
        'alternatives': [],
        'reasoning': ''
    }

    if x_col and y_col:
        x_dtype = df[x_col].dtype
        y_dtype = df[y_col].dtype

        # Numeric vs numeric
        if pd.api.types.is_numeric_dtype(x_dtype) and pd.api.types.is_numeric_dtype(y_dtype):
            suggestions['recommended'] = 'scatter'
            suggestions['alternatives'] = ['line', 'heatmap']
            suggestions['reasoning'] = 'Both variables are numeric - scatter plot shows relationships best'

        # Numeric vs categorical (either order)
        elif (pd.api.types.is_numeric_dtype(x_dtype) and pd.api.types.is_object_dtype(y_dtype)) or \
             (pd.api.types.is_object_dtype(x_dtype) and pd.api.types.is_numeric_dtype(y_dtype)):
            suggestions['recommended'] = 'bar'
            suggestions['alternatives'] = ['box', 'violin']
            suggestions['reasoning'] = 'Categorical vs numeric - bar chart shows comparisons clearly'

        # Categorical vs categorical
        else:
            suggestions['recommended'] = 'bar'
            suggestions['alternatives'] = ['heatmap']
            suggestions['reasoning'] = 'Both categorical - bar chart shows frequency distributions'

    elif x_col:
        if pd.api.types.is_numeric_dtype(df[x_col].dtype):
            suggestions['recommended'] = 'histogram'
            suggestions['alternatives'] = ['box']
            suggestions['reasoning'] = 'Single numeric variable - histogram shows distribution'
        else:
            suggestions['recommended'] = 'pie'
            suggestions['alternatives'] = ['bar']
            suggestions['reasoning'] = 'Single categorical variable - pie chart shows proportions'

    return suggestions
|
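# Usage sketch (illustrative; the helper name and sample frame are ours):
# suggest_chart_type only inspects dtypes, so a tiny frame is enough to
# exercise the numeric-vs-categorical branch.
def _demo_suggest_chart_type() -> None:
    sample = pd.DataFrame({'region': ['N', 'S', 'N'], 'sales': [10.0, 12.5, 9.0]})
    suggestion = suggest_chart_type(sample, x_col='region', y_col='sales')
    print(suggestion['recommended'], '-', suggestion['reasoning'])  # bar - ...
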

def analyze_data_patterns(df: pd.DataFrame) -> Dict[str, Any]:
    """Analyze patterns in the dataset"""
    patterns = {
        'trends': [],
        'outliers': [],
        'correlations': [],
        'insights': []
    }

    # Flag strongly correlated numeric pairs (|r| > 0.7, upper triangle only)
    numeric_cols = df.select_dtypes(include=['number']).columns

    if len(numeric_cols) >= 2:
        corr_matrix = df[numeric_cols].corr()

        for i in range(len(corr_matrix.columns)):
            for j in range(i + 1, len(corr_matrix.columns)):
                corr_val = corr_matrix.iloc[i, j]
                if abs(corr_val) > 0.7:
                    patterns['correlations'].append({
                        'variables': (corr_matrix.columns[i], corr_matrix.columns[j]),
                        'correlation': corr_val,
                        'strength': 'strong'
                    })

return patterns |
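

# Minimal smoke demo (illustrative sketch; assumes demo mode when no
# OPENROUTER_API_KEY is set - with a key, get_ai_response calls the live model):
if __name__ == "__main__":
    demo_df = pd.DataFrame({
        'category': ['A', 'B', 'A', 'C'],
        'value': [1.0, 2.5, 3.0, 4.5],
        'score': [10, 20, 30, 40],
    })
    print(get_ai_response("Give me a data overview", demo_df))
    print(suggest_chart_type(demo_df, x_col='value', y_col='score'))
    print(analyze_data_patterns(demo_df))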