Spaces:

datajoi
/

Dataset-Test-Workflow

Sleeping

App Files Community

Mustehson committed on Nov 12, 2024

Commit

9ecf6e0

verified ·

1 Parent(s): 4e0396a

Update app.py

Browse files

Files changed (1) hide show

app.py +86 -87

app.py CHANGED Viewed

@@ -5,19 +5,13 @@ import gradio as gr
 import pandas as pd
 import pandera as pa
 from pandera import Column
-import random
-from dataprep.eda import compute
 from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
-from .utils import (
-    format_num_stats, format_cat_stats,
-    format_ov_stats, format_insights
-)
 from langsmith import traceable
 from langchain import hub
 import warnings
 warnings.filterwarnings("ignore", category=DeprecationWarning)
 # Height of the Tabs Text Area
 TAB_LINES = 8
@@ -43,7 +37,7 @@ for model in models:
       print(f"Error for model {model}: {e}")
       continue
-llm = ChatHuggingFace(llm=endpoint).bind(max_tokens=4096)
 #---------------------------------------
 #-----LOAD PROMPT FROM LANCHAIN HUB-----
@@ -69,98 +63,44 @@ def get_tables_names(schema_name):
 def update_table_names(schema_name):
     tables = get_tables_names(schema_name)
     return gr.update(choices=tables)
-# Get Schema
-def get_table_schema(table):
-    result = conn.sql(f"SELECT sql, database_name, schema_name FROM duckdb_tables() where table_name ='{table}';").df()
-    ddl_create = result.iloc[0,0]
-    parent_database = result.iloc[0,1]
-    schema_name = result.iloc[0,2]
-    full_path = f"{parent_database}.{schema_name}.{table}"
-    if schema_name != "main":
-        old_path = f"{schema_name}.{table}"
-    else:
-        old_path = table
-    ddl_create = ddl_create.replace(old_path, full_path)
-    return full_path
 def get_data_df(schema):
     print('Getting Dataframe from the Database')
     return conn.sql(f"SELECT * FROM {schema} LIMIT 1000").df()
-<<<<<<< HEAD
-def calcualte_stats(df):
-    indev_stats = []
-    cols = []
-    _df = df.copy()
-    num_cols = _df.select_dtypes(include=['number'], exclude=['datetime']).columns
-    cat_cols = _df.select_dtypes(include=['object'], exclude=['datetime']).columns
-    _all_stats = compute(_df)
-    all_stats = format_ov_stats(_all_stats['stats'])
-    insights = format_insights(_all_stats['overview_insights'])
-    for i, col in enumerate(random.sample(num_cols.tolist()+cat_cols.tolist(), 2)):
-        _indv_data = compute(_df, col)
-        if col in cat_cols:
-            indev_data_cat = format_cat_stats(_indv_data["data"])
-            indev_stats.append(pd.DataFrame([indev_data_cat['Overview']], index=[f'{col}_stats']).T)
-        elif col in num_cols:
-            try:
-                indev_data_num = format_num_stats(_indv_data["data"])
-            except:
-                indev_data_num = format_cat_stats(_indv_data["data"])
-        indev_stats.append(pd.DataFrame([indev_data_num['Overview']], index=[f'{col}_stats']).T)
-    return {
-        "overall_stats": pd.DataFrame(all_stats[0], index=['Dataset Statistics']).T,
-        "insights": insights,
-        "stats_1": indev_stats[0],
-        "stats_2": indev_stats[1]
-    }
 def df_summary(df):
     summary = []
     for column in df.columns:
         if pd.api.types.is_numeric_dtype(df[column]):
             summary.append({
-                "column": column, "max": df[column].max(), "min": df[column].min(),
-                "count": df[column].count(), "nunique": df[column].nunique(),
-                "dtype": str(df[column].dtype), "top": None
             })
         elif pd.api.types.is_categorical_dtype(df[column]) or pd.api.types.is_object_dtype(df[column]):
             top_value = df[column].mode().iloc[0] if not df[column].mode().empty else None
             summary.append({
-                "column": column, "max": None, "min": None, "count": df[column].count(),
-                "nunique": df[column].nunique(), "dtype": str(df[column].dtype), "top": top_value
             })
     summary_df = pd.DataFrame(summary)
     return summary_df.reset_index(drop=True)
-=======
->>>>>>> parent of 7c2e7ac (Summary Added)
 def format_prompt(df):
-    summary_df = pd.DataFrame({
-        "max": df.max(),
-        "min": df.min(),
-        "top": df.mode().iloc[0],
-        "nunique": df.nunique(),
-        "count": df.count(),
-        "dtype": df.dtypes.astype(str)
-        }).reset_index().rename(columns={"index": "column"})
     return prompt_autogenerate.format_prompt(data=df.head().to_json(orient='records'),
-                                           summary=summary_df.to_json(orient='records'))
 def format_user_prompt(df):
     return prompt_user_input.format_prompt(data=df.head().to_json(orient='records'))
@@ -177,6 +117,33 @@ def run_llm(messages):
   return tests
 def validate_pandera(tests, df):
     validation_results = []
@@ -196,6 +163,41 @@ def validate_pandera(tests, df):
             })
     return pd.DataFrame(validation_results)
 #---------------------------------------
@@ -204,26 +206,22 @@ def validate_pandera(tests, df):
 def main(table):
     schema = get_table_schema(table)
     df = get_data_df(schema)
     messages = format_prompt(df=df)
     tests = run_llm(messages)
     print(tests)
-    stats = calcualte_stats(df)
-    df_insights = stats['insights']
-    df_statistics = stats['overall_stats']
-    df_stat_1 = stats['stats_1']
-    df_stat_2 = stats['stats_2']
     if isinstance(tests, Exception):
         tests = pd.DataFrame([{"error": f"❌ Unable to generate tests. {tests}"}])
-        return df.head(10), df_statistics, df_insights, df_stat_1, df_stat_2, tests, pd.DataFrame([])
     tests_df = pd.DataFrame(tests)
     tests_df.rename(columns={tests_df.columns[0]: 'Column', tests_df.columns[1]: 'Rule Name', tests_df.columns[2]: 'Rules' }, inplace=True)
     pandera_results = validate_pandera(tests, df)
-    return df.head(10), df_statistics, df_insights, df_stat_1, df_stat_2, tests_df, pandera_results
 def user_results(table, text_query):
@@ -328,3 +326,4 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo"
 if __name__ == "__main__":
     demo.launch(debug=True)

 import pandas as pd
 import pandera as pa
 from pandera import Column
+import ydata_profiling as pp
 from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
 from langsmith import traceable
 from langchain import hub
 import warnings
 warnings.filterwarnings("ignore", category=DeprecationWarning)
 # Height of the Tabs Text Area
 TAB_LINES = 8
       print(f"Error for model {model}: {e}")
       continue
+llm = ChatHuggingFace(llm=endpoint).bind(max_tokens=8192)
 #---------------------------------------
 #-----LOAD PROMPT FROM LANCHAIN HUB-----
 def update_table_names(schema_name):
     tables = get_tables_names(schema_name)
     return gr.update(choices=tables)
 def get_data_df(schema):
     print('Getting Dataframe from the Database')
     return conn.sql(f"SELECT * FROM {schema} LIMIT 1000").df()
 def df_summary(df):
     summary = []
     for column in df.columns:
         if pd.api.types.is_numeric_dtype(df[column]):
             summary.append({
+                "column": column,
+                "max": df[column].max(),
+                "min": df[column].min(),
+                "count": df[column].count(),
+                "nunique": df[column].nunique(),
+                "dtype": str(df[column].dtype),
+                "top": None
             })
         elif pd.api.types.is_categorical_dtype(df[column]) or pd.api.types.is_object_dtype(df[column]):
             top_value = df[column].mode().iloc[0] if not df[column].mode().empty else None
             summary.append({
+                "column": column,
+                "max": None,
+                "min": None,
+                "count": df[column].count(),
+                "nunique": df[column].nunique(),
+                "dtype": str(df[column].dtype),
+                "top": top_value
             })
     summary_df = pd.DataFrame(summary)
     return summary_df.reset_index(drop=True)
 def format_prompt(df):
+    summary = df_summary(df)
     return prompt_autogenerate.format_prompt(data=df.head().to_json(orient='records'),
+                                           summary=summary.to_json(orient='records'))
 def format_user_prompt(df):
     return prompt_user_input.format_prompt(data=df.head().to_json(orient='records'))
   return tests
+# Get Schema
+def get_table_schema(table):
+    result = conn.sql(f"SELECT sql, database_name, schema_name FROM duckdb_tables() where table_name ='{table}';").df()
+    ddl_create = result.iloc[0,0]
+    parent_database = result.iloc[0,1]
+    schema_name = result.iloc[0,2]
+    full_path = f"{parent_database}.{schema_name}.{table}"
+    if schema_name != "main":
+        old_path = f"{schema_name}.{table}"
+    else:
+        old_path = table
+    ddl_create = ddl_create.replace(old_path, full_path)
+    return full_path
+def describe(df):
+    numerical_info = pd.DataFrame()
+    categorical_info = pd.DataFrame()
+    if len(df.select_dtypes(include=['number']).columns) >= 1:
+        numerical_info = df.select_dtypes(include=['number']).describe().T.reset_index()
+        numerical_info.rename(columns={'index': 'column'}, inplace=True)
+    if len(df.select_dtypes(include=['object']).columns) >= 1:
+        categorical_info = df.select_dtypes(include=['object']).describe().T.reset_index()
+        categorical_info.rename(columns={'index': 'column'}, inplace=True)
+    return numerical_info, categorical_info
 def validate_pandera(tests, df):
     validation_results = []
             })
     return pd.DataFrame(validation_results)
+def statistics(df):
+    profile = pp.ProfileReport(df)
+    report_dict = profile.get_description()
+    description, alerts = report_dict.table, report_dict.alerts
+    # Statistics
+    mapping = {
+        'n': 'Number of observations',
+        'n_var': 'Number of variables',
+        'n_cells_missing': 'Number of cells missing',
+        'n_vars_with_missing': 'Number of columns with missing data',
+        'n_vars_all_missing': 'Columns with all missing data',
+        'p_cells_missing': 'Missing cells (%)',
+        'n_duplicates': 'Duplicated rows',
+        'p_duplicates': 'Duplicated rows (%)',
+    }
+    updated_data = {mapping.get(k, k): v for k, v in description.items() if k != 'types'}
+    # Add flattened types information
+    if 'Text' in description.get('types', {}):
+            updated_data['Number of text columns'] = description['types']['Text']
+    if 'Categorical' in description.get('types', {}):
+        updated_data['Number of categorical columns'] = description['types']['Categorical']
+    if 'Numeric' in description.get('types', {}):
+        updated_data['Number of numeric columns'] = description['types']['Numeric']
+    if 'DateTime' in description.get('types', {}):
+        updated_data['Number of datetime columns'] = description['types']['DateTime']
+    df_statistics = pd.DataFrame(list(updated_data.items()), columns=['Statistic Description', 'Value'])
+    df_statistics['Value'] = df_statistics['Value'].astype(int)
+    # Alerts
+    alerts_list = [(str(alert).replace('[', '').replace(']', ''), alert.alert_type_name) for alert in alerts]
+    df_alerts = pd.DataFrame(alerts_list, columns=['Data Quality Issue', 'Category'])
+    return df_statistics, df_alerts
 #---------------------------------------
 def main(table):
     schema = get_table_schema(table)
     df = get_data_df(schema)
+    df_statistics, df_alerts = statistics(df)
+    describe_num, describe_cat  = describe(df)
     messages = format_prompt(df=df)
     tests = run_llm(messages)
     print(tests)
     if isinstance(tests, Exception):
         tests = pd.DataFrame([{"error": f"❌ Unable to generate tests. {tests}"}])
+        return df.head(10), df_statistics, df_alerts, describe_cat, describe_num, tests, pd.DataFrame([])
     tests_df = pd.DataFrame(tests)
     tests_df.rename(columns={tests_df.columns[0]: 'Column', tests_df.columns[1]: 'Rule Name', tests_df.columns[2]: 'Rules' }, inplace=True)
     pandera_results = validate_pandera(tests, df)
+    return df.head(10), df_statistics, df_alerts, describe_cat, describe_num, tests_df, pandera_results
 def user_results(table, text_query):
 if __name__ == "__main__":
     demo.launch(debug=True)