import time

import dash_mantine_components as dmc
import duckdb
import pandas as pd
from dash import Dash, html, dcc, Input, Output, State
from dash_iconify import DashIconify

from graphs.leaderboard import button_style, get_top_n_leaderboard, render_table_content

# Initialize the app
app = Dash()
server = app.server


# Query for most recent date in all_downloads
def get_last_updated():
    """Return the newest ``time`` value in the ``all_downloads`` view as text.

    Reads through the module-level DuckDB connection ``con`` (created further
    down in this module, so this must only be called after module setup).

    Returns:
        The date formatted like ``"Oct 08, 2025"``, or ``"N/A"`` when the view
        holds no timestamp or the query fails.
    """
    try:
        result = con.execute(
            "SELECT MAX(time) as max_time FROM all_downloads"
        ).fetchdf()
        max_time = result["max_time"].iloc[0]
        if pd.isnull(max_time):
            return "N/A"
        return pd.to_datetime(max_time).strftime("%b %d, %Y")
    except Exception as exc:
        # Best-effort: the caller only needs a display string, so never crash
        # here -- but report the reason instead of swallowing it silently.
        print(f"Could not determine last-updated date: {exc}")
        return "N/A"


def load_parquet_to_duckdb(con, parquet_url, view_name):
    """
    Loads a parquet file from a remote URL into DuckDB as a view.

    Args:
        con: Open DuckDB connection on which the view is created.
        parquet_url: URL of the parquet file handed to ``read_parquet``.
        view_name: Name of the view to create or replace; must be a plain
            identifier because it is spliced into the SQL text.

    Returns:
        (start_dt, end_dt) for the 'time' column, each converted with
        ``pd.to_datetime``.

    Raises:
        ValueError: If ``view_name`` is not a plain identifier.
    """
    # Identifiers cannot be bound as query parameters, so validate instead.
    if not isinstance(view_name, str) or not view_name.isidentifier():
        raise ValueError(f"Invalid DuckDB view name: {view_name!r}")
    # Double any single quote so the URL cannot terminate the SQL literal.
    quoted_url = str(parquet_url).replace("'", "''")

    # Install and load httpfs extension for remote file access
    con.execute("INSTALL httpfs;")
    con.execute("LOAD httpfs;")

    # Create a view that references the remote parquet file
    con.execute(f"""
        CREATE OR REPLACE VIEW {view_name} AS
        SELECT * FROM read_parquet('{quoted_url}')
    """)

    # Get time range for slider
    time_range = con.execute(
        f"SELECT MIN(time) as min_time, MAX(time) as max_time FROM {view_name}"
    ).fetchdf()

    start_dt = pd.to_datetime(time_range["min_time"].iloc[0])
    end_dt = pd.to_datetime(time_range["max_time"].iloc[0])
    return start_dt, end_dt


# DuckDB connection (global)
con = duckdb.connect(database=":memory:", read_only=False)

# disable all caching so HF Spaces always read latest parquet
con.execute("SET enable_http_metadata_cache = false;")
con.execute("SET enable_object_cache = false;")

# Load parquet files from Hugging Face using DuckDB
HF_DATASET_ID = "mmpr/open_model_evolution_data"
# Both files live in the same dataset repo, so derive the URLs from its id.
_HF_RESOLVE_BASE = f"https://huggingface.co/datasets/{HF_DATASET_ID}/resolve/main"
hf_parquet_url_1 = f"{_HF_RESOLVE_BASE}/all_downloads_with_annotations.parquet"
hf_parquet_url_2 = f"{_HF_RESOLVE_BASE}/one_year_rolling.parquet"
print(f"Attempting to connect to dataset from Hugging Face Hub: {HF_DATASET_ID}")
try:
    overall_start_time = time.time()

    # Load both parquet files as views
    start_dt, end_dt = load_parquet_to_duckdb(con, hf_parquet_url_1, "all_downloads")

    # Load the second parquet file as another view
    start_dt2, end_dt2 = load_parquet_to_duckdb(
        con, hf_parquet_url_2, "one_year_rolling"
    )

    msg = f"Successfully connected to datasets in {time.time() - overall_start_time:.2f}s."
    print(msg)
except Exception as e:
    err_msg = f"Failed to load dataset(s). Error: {e}"
    print(err_msg)
    raise

# MIN/MAX over an empty view come back null; fail with a clear message rather
# than with the opaque error NaT.timestamp() would raise below.
if pd.isnull(start_dt) or pd.isnull(end_dt):
    raise ValueError(
        "The 'all_downloads' view has no usable 'time' values; "
        "cannot build the date range slider."
    )

# Slider bounds as integer Unix timestamps (seconds)
start_ts = int(start_dt.timestamp())
end_ts = int(end_dt.timestamp())


def ordinal(n):
    """Return ``n`` followed by its English ordinal suffix (1 -> "1st")."""
    # 10..20 all take "th", which covers the irregular 11th, 12th and 13th.
    if 10 <= n % 100 <= 20:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
    return f"{n}{suffix}"


def format_date(dt):
    """Format a datetime-like object as e.g. "Oct 8th, 2025"."""
    return dt.strftime("%b") + f" {ordinal(dt.day)}, {dt.year}"


# Slider marks: start label (e.g. "Jan 2020"), one mark per later year
# (e.g. "2021", "2022"), then the end label (e.g. "Dec 2024").
# January 1st of the first year is never marked: it is either the start mark
# itself or lies before the slider's minimum.
start_yr = int(pd.Timestamp(year=start_dt.year, month=1, day=1).timestamp())
marks = [{"value": start_ts, "label": start_dt.strftime("%b %Y")}]
for yr in range(start_dt.year, end_dt.year + 1):
    yr_ts = int(pd.Timestamp(year=yr, month=1, day=1).timestamp())
    if yr_ts != start_yr and yr_ts != end_ts:
        marks.append({"value": yr_ts, "label": str(yr)})
marks.append({"value": end_ts, "label": end_dt.strftime("%b %Y")})


def get_thumb_labels(values):
    """Build the two floating date labels shown on the slider thumbs.

    Args:
        values: ``[start, end]`` Unix timestamps in seconds.

    Returns:
        A list of two ``html.Div`` elements (first thumb, second thumb).
    """
    distance = abs(values[1] - values[0])
    close = distance < 4 * 30 * 86400  # 4 months
    label_style = {
        "background": "#fff",
        "color": "#082030",
        "fontWeight": "bold",
        "fontSize": "13px",
        "borderRadius": "8px",
        "padding": "2px 8px",
        "boxShadow": "0 1px 4px rgba(8,32,48,0.10)",
        "position": "absolute",
        "left": "50%",
        "transform": "translateX(-50%)",
        "whiteSpace": "nowrap",
        "zIndex": 100,
    }
    # When the thumbs are close, lift the first label above the track so the
    # two labels do not overlap; otherwise both sit just below the slider.
    tops = ("-38px" if close else "14px", "14px")
    return [
        html.Div(
            format_date(pd.to_datetime(value, unit="s")),
            style={**label_style, "top": top},
        )
        for value, top in zip(values[:2], tops)
    ]


# Create a dmc range slider for time range selection by year
time_slider = dmc.RangeSlider(
    id="time-slider",
    min=start_ts,
    max=end_ts,
    value=[
        start_ts,
        end_ts,
    ],
    step=24 * 60 * 60,
    color="#AC482A",
    size="md",
    radius="xl",
    marks=marks,
    style={"width": "95%", "paddingLeft": "60px"},  # updated paddingLeft
    label=None,
    showLabelOnHover=False,
    labelTransitionProps={"transition": "fade", "duration": 150},
    thumbChildren=get_thumb_labels([start_ts, end_ts]),
)

# -- layout building blocks --
_PAPER_URL = "https://www.dataprovenance.org/economies-of-open-intelligence.pdf"

# Shared look of the three leaderboard tabs
_TAB_STYLE = {
    "backgroundColor": "transparent",
    "border": "none",
    "padding": "10px 18px",
    "color": "#6B7280",
    "fontWeight": "500",
}
_TAB_SELECTED_STYLE = {
    "backgroundColor": "transparent",
    "border": "none",
    "padding": "10px 18px",
    "fontWeight": "700",
    "borderBottom": "3px solid #082030",
}


def _meta_var(text):
    """Render a metadata field name inside a tab description."""
    return html.Span(text, className="meta-var")


def _tip_highlight(text):
    """Render a highlighted option name inside the tip box."""
    return html.Span(text, className="tip-highlight")


def _leaderboard_tab(label, slug, description):
    """Build one leaderboard tab: description, loading table and toggle button.

    ``slug`` ("countries", "developers" or "models") determines the component
    ids (``top_<slug>-table``, ``top_<slug>-toggle``, ``loading-<slug>``) that
    the callbacks below are wired to.
    """
    return dcc.Tab(
        label=label,
        value=label,
        style=dict(_TAB_STYLE),
        selected_style=dict(_TAB_SELECTED_STYLE),
        children=[
            html.Div(children=description, className="tab-description"),
            html.Div(
                dcc.Loading(
                    id=f"loading-{slug}",
                    type="circle",
                    color="#AC482A",
                    children=html.Div(id=f"top_{slug}-table"),
                ),
                className="responsive-table-wrapper",  # wrapper for scroll
            ),
            html.Button(
                id=f"top_{slug}-toggle",
                children="▼ Show Top 50",
                n_clicks=0,
                style={**button_style, "border": "none"},
            ),
        ],
    )


# Header: live indicator, last-updated date and external links.
# NOTE: get_last_updated() runs once, when the layout is built at import time.
_header = html.Div(
    [
        html.Div(
            [
                html.Span(
                    [
                        html.Span(className="live-dot"),
                        html.Span("LIVE", className="live-label"),
                    ],
                    className="live-row",
                ),
                html.Span(
                    f"Last updated: {get_last_updated()}",
                    className="last-updated",
                ),
            ],
            className="header-status-row",
        ),
        html.Div(
            [
                html.A(
                    children=[
                        html.Img(
                            src="assets/images/dpi.svg",
                            className="header-logo-img",
                        ),
                        "Data Provenance Initiative",
                    ],
                    href="https://www.dataprovenance.org/",
                    target="_blank",
                    className="no-bg-link header-link",
                ),
                html.A(
                    children=[
                        html.Img(
                            src="assets/images/hf.svg",
                            className="header-logo-img",
                        ),
                        html.Span("Hugging Face", className="hf-brand-text"),
                    ],
                    href="https://huggingface.co/",
                    target="_blank",
                    className="no-bg-link header-link",
                ),
                html.A(
                    children=[
                        html.Span("Read the paper", className="paper-text"),
                    ],
                    href=_PAPER_URL,
                    target="_blank",
                    className="no-bg-link header-link paper-link",
                ),
            ],
            className="header-links-row",
        ),
    ],
    style={
        "display": "flex",
        "justifyContent": "space-between",
        "alignItems": "center",
        "padding": "18px 24px",
        "gap": "24px",
        "backgroundColor": "#082030",  # restored dark background
    },
    className="responsive-header",
)

# Data-source notice and page title
_title_row = html.Div(
    children=[
        dmc.Alert(
            # add an icon to the alert
            icon=DashIconify(
                icon="mdi:information-outline",
                width=18,
                height=18,
                style={"color": "#1A5F8D"},
            ),
            children=[
                "Note: This dashboard uses ",
                html.A(
                    "public Hugging Face",
                    href="https://huggingface.co/datasets/hfmlsoc/hub_weekly_snapshots",
                    target="_blank",
                    style={
                        "color": "#1A5F8D",
                        "fontWeight": "bold",
                        "textDecoration": "underline",
                    },
                ),
                " download data, which is less precise than data analyzed in the paper.",
            ],
            color="blue",
            radius="md",
            variant="light",
            withCloseButton=True,
            style={
                "marginTop": "16px",
                "marginBottom": "8px",
                "fontSize": "15px",
                "fontWeight": "500",
                "marginLeft": "auto",
                "marginRight": "auto",
            },
        ),
        html.Span(
            "The Open Model Leaderboard",
            style={
                "fontSize": 40,
                "fontWeight": "700",
                "textAlign": "center",
                "marginTop": "20px",
                "marginBottom": "20px",
            },
        ),
    ],
    style={
        "display": "flex",
        "flexDirection": "column",
        "alignItems": "center",
        "justifyContent": "center",
        "gap": "12px",
        "marginTop": "20px",
        "marginBottom": "20px",
    },
    className="responsive-title-row",
)

# Introductory paragraph
_intro = html.Div(
    children=[
        "This leaderboard assesses concentrations of power in the open model ecosystem through ranking user downloads across three groups: countries, developers, and models. Explore how user downloads of models are distributed among these groups and identify key players shaping the open model ecosystem on Hugging Face. This dashboard accompanies the paper titled ",
        html.A(
            "Economies of Open Intelligence: Tracing Power & Participation in the Model Ecosystem.",
            href=_PAPER_URL,
            target="_blank",
            style={
                "color": "#AC482A",
                "fontWeight": "700",
                "textDecoration": "underline",
            },
        ),
    ],
    style={
        "fontSize": 14,
        "marginTop": 18,
        "marginBottom": 12,
        "marginLeft": 100,
        "marginRight": 100,
        "textAlign": "center",
    },
    className="responsive-intro",
)

# Left panel: download view and model attribution switches
_filters_panel = html.Div(
    [
        html.Div(
            html.Span(
                [
                    "Download View",
                    dmc.HoverCard(
                        width=260,
                        shadow="md",
                        position="top",
                        children=[
                            dmc.HoverCardTarget(
                                html.Span(
                                    DashIconify(
                                        icon="mdi:information-outline",
                                        width=16,
                                        height=16,
                                        style={
                                            "marginLeft": "6px",
                                            "color": "#AC482A",
                                            "verticalAlign": "middle",
                                        },
                                    ),
                                    style={"cursor": "pointer"},
                                )
                            ),
                            dmc.HoverCardDropdown(
                                dmc.Text(
                                    "We believe this filter isolates more authentic usage, mitigating the impact of automatic software downloads for older models.",
                                    size="sm",
                                    style={"maxWidth": "240px"},
                                )
                            ),
                        ],
                    ),
                ],
                className="filter-label-row",
            ),
            className="filter-label-container",
        ),
        html.Div(
            [
                dmc.SegmentedControl(
                    id="segmented",
                    value="all-downloads",
                    color="#AC482A",
                    transitionDuration=200,
                    data=[
                        {
                            "value": "all-downloads",
                            "label": "All Downloads",
                        },
                        {
                            "value": "filtered-downloads",
                            "label": html.Span(["Filtered Downloads"]),
                        },
                    ],
                    mb=10,
                ),
            ],
            className="filter-segmented-row",
        ),
        html.Div(
            "Choose whether to count all downloads, or only downloads up to one year from model creation.",
            className="filter-description",
        ),
        html.Div(
            [
                html.Div("Model Attribution", className="filter-label"),
                dmc.SegmentedControl(
                    id="model-attribution-segmented",
                    value="uploader",
                    color="#AC482A",
                    transitionDuration=200,
                    data=[
                        {
                            "value": "uploader",
                            "label": "Model Uploader",
                        },
                        {
                            "value": "original_creator",
                            "label": "Original Model Creator",
                        },
                    ],
                    mb=10,
                ),
                html.Div(
                    "Toggle between having downloads attributed to the account that uploaded the model, or the account that uploaded the model that this was originally derived from.",
                    className="filter-description",
                ),
            ],
            style={"marginTop": "10px"},
        ),
        html.Span(
            id="global-toggle-status",
            className="global-toggle-status",
        ),
    ],
    className="main-content-left",
)

# Right panel: date range slider and usage tip
_date_range_panel = html.Div(
    [
        html.Div("Download Date Range", className="filter-label"),
        time_slider,
        html.Div(
            "Adjust the time range to filter leaderboard results by when models were downloaded by users.",
            className="filter-description filter-description-margin",
        ),
        html.Div(
            [
                html.Div(
                    [
                        DashIconify(
                            icon="mdi:lightbulb-on-outline",
                            width=20,
                            height=20,
                            style={
                                "marginRight": "8px",
                                "color": "#082030",
                            },
                        ),
                        html.Span("Tip"),
                    ],
                    className="tip-title",
                ),
                html.Div(
                    [
                        "Try switching between ",
                        _tip_highlight("All Downloads"),
                        " and ",
                        _tip_highlight("Filtered Downloads"),
                        " to compare net popularity (but many duplicate, unused downloads) versus more immediate interest as models are released. ",
                        "You can also toggle between ",
                        _tip_highlight("Model Uploader"),
                        " and ",
                        _tip_highlight("Original Model Creator"),
                        " to see how attribution affects perceived popularity.",
                    ],
                    className="tip-description",
                ),
            ],
            className="tip-section",
        ),
    ],
    className="main-content-right",
)

_main_content = html.Div(
    children=[_filters_panel, _date_range_panel],
    style={
        "display": "flex",
        "gap": "24px",
        "padding": "32px",
        "alignItems": "flex-start",
        "marginLeft": "100px",
        "marginRight": "100px",
        "backgroundColor": "#FFFBF9",
        "borderRadius": "18px",
    },
    className="responsive-main-content",
)

# The three leaderboards
_tabs = html.Div(
    [
        dcc.Tabs(
            id="leaderboard-tabs",
            value="Countries",
            children=[
                _leaderboard_tab(
                    "Countries",
                    "countries",
                    [
                        "The country leaderboard shows how downloads are distributed across different nations, highlighting which countries are leading in model usage and adoption. The metadata includes the ",
                        _meta_var("country"),
                        " and number of ",
                        _meta_var("user downloads"),
                        ".",
                    ],
                ),
                _leaderboard_tab(
                    "Developers",
                    "developers",
                    [
                        "The developer leaderboard highlights the most influential model creators on Hugging Face, showcasing which developers have garnered the highest download counts for their models. The metadata includes the ",
                        _meta_var("developer"),
                        ", number of ",
                        _meta_var("user downloads"),
                        ", and ",
                        _meta_var("country"),
                        ".",
                    ],
                ),
                _leaderboard_tab(
                    "Models",
                    "models",
                    [
                        "The model leaderboard ranks individual models based on their download counts, revealing which models are most popular among users on Hugging Face. The metadata includes the ",
                        _meta_var("model name"),
                        ", number of ",
                        _meta_var("user downloads"),
                        ", ",
                        _meta_var("developer"),
                        ", and ",
                        _meta_var("modality"),
                        " (the input and output types of the model).",
                    ],
                ),
            ],
        ),
    ],
    style={
        "borderRadius": "18px",
        "padding": "32px",
        "marginTop": "12px",
        "marginBottom": "12px",
        "marginLeft": "50px",
        "marginRight": "50px",
    },
    className="responsive-tabs",
)

# The dcc.Store components hold the selected view (all_downloads or
# one_year_rolling) and the model attribution type for the callbacks below.
app.layout = dmc.MantineProvider(
    theme={
        "colorScheme": "light",
        "primaryColor": "blue",
        "fontFamily": "Inter, sans-serif",
    },
    children=[
        dcc.Store(id="selected-view", data="all_downloads"),
        dcc.Store(id="model-attribution-type", data="uploader"),
        html.Div(
            [_header, _title_row, _intro, _main_content, _tabs],
            style={
                "fontFamily": "Inter",
                "backgroundColor": "#ffffff",
                "minHeight": "100vh",
            },
        ),
    ],
)


# Callbacks for interactivity
# -- helper utilities to consolidate duplicated callback logic --

# Country normalisation shared by every leaderboard query, so that the group
# key and the reported country can never disagree.
_COUNTRY_CASE_SQL = """CASE
                WHEN org_country_single IN ('HF', 'United States of America') THEN 'United States of America'
                WHEN org_country_single IN ('International', 'Online', 'Online?') THEN 'International/Online'
                ELSE org_country_single
            END"""


def _require_identifier(value, what):
    """Return ``value`` if it is a plain identifier, else raise ValueError.

    View and column names are spliced into SQL text (identifiers cannot be
    bound as parameters), and the view name arrives from a client-side
    ``dcc.Store``, so it must not be trusted.
    """
    if not isinstance(value, str) or not value.isidentifier():
        raise ValueError(f"Invalid {what}: {value!r}")
    return value


def _get_filtered_top_n_from_duckdb(
    slider_value, group_col, top_n, view="all_downloads"
):
    """
    Query DuckDB directly to get top N entries with metadata.
    This minimizes data transfer by doing aggregation in DuckDB.

    Args:
        slider_value: ``[start, end]`` Unix timestamps in seconds; anything
            else (``None``, wrong length) disables the time filter.
        group_col: Column to aggregate by. ``"org_country_single"`` groups by
            the normalised country; ``"derived_author"`` additionally looks up
            the country of the matching ``author``.
        top_n: Maximum number of rows to return.
        view: Name of the DuckDB view to read from.

    Returns:
        A DataFrame ordered by ``total_downloads`` descending with the columns
        ``name``, ``total_downloads``, ``percent_of_total``,
        ``org_country_single``, ``author``, ``derived_author``,
        ``merged_country_groups_single``, ``merged_modality`` and ``model``.

    Raises:
        ValueError: If ``group_col`` or ``view`` is not a plain identifier, or
            ``top_n`` cannot be converted to an integer.
    """
    group_col = _require_identifier(group_col, "group column")
    view = _require_identifier(view, "view name")
    top_n = int(top_n)

    # Build time filter clause. Both bounds are rendered from pandas
    # Timestamps, so no caller-supplied text ever reaches the SQL string.
    time_clause = ""
    if slider_value and len(slider_value) == 2:
        start = pd.to_datetime(slider_value[0], unit="s")
        end = pd.to_datetime(slider_value[1], unit="s")
        time_clause = f"WHERE time >= '{start}' AND time <= '{end}'"

    # If grouping by country, group by the transformed country column
    if group_col == "org_country_single":
        group_expr = _COUNTRY_CASE_SQL
    else:
        group_expr = group_col

    # When grouping by derived_author, report the country of the row where
    # derived_author = author, falling back to any country seen in the group.
    if group_col == "derived_author":
        lookup_cte = """
        -- Create a lookup table for derived_author -> country
        author_country_lookup AS (
            SELECT DISTINCT
                author,
                FIRST_VALUE(org_country_single) OVER (
                    PARTITION BY author ORDER BY downloads DESC
                ) AS author_country
            FROM base_data
            WHERE author IS NOT NULL
        ),"""
        country_expr = "COALESCE(acl.author_country, ANY_VALUE(b.org_country_single))"
        lookup_join = "LEFT JOIN author_country_lookup acl ON b.group_key = acl.author"
        group_by = "b.group_key, acl.author_country, t.total_downloads_all"
    else:
        lookup_cte = ""
        country_expr = "ANY_VALUE(b.org_country_single)"
        lookup_join = ""
        group_by = "b.group_key, t.total_downloads_all"

    query = f"""
        WITH base_data AS (
            SELECT
                {group_expr} AS group_key,
                {_COUNTRY_CASE_SQL} AS org_country_single,
                author,
                derived_author,
                merged_country_groups_single,
                merged_modality,
                downloads,
                model
            FROM {view}
            {time_clause}
        ),
        {lookup_cte}
        total_downloads_cte AS (
            SELECT SUM(downloads) AS total_downloads_all
            FROM base_data
        ),
        top_items AS (
            SELECT
                b.group_key AS name,
                SUM(b.downloads) AS total_downloads,
                ROUND(SUM(b.downloads) * 100.0 / t.total_downloads_all, 2) AS percent_of_total,
                {country_expr} AS org_country_single,
                ANY_VALUE(b.author) AS author,
                ANY_VALUE(b.derived_author) AS derived_author,
                ANY_VALUE(b.merged_country_groups_single) AS merged_country_groups_single,
                ANY_VALUE(b.merged_modality) AS merged_modality,
                ANY_VALUE(b.model) AS model
            FROM base_data b
            CROSS JOIN total_downloads_cte t
            {lookup_join}
            GROUP BY {group_by}
        )
        SELECT * FROM top_items
        ORDER BY total_downloads DESC
        LIMIT {top_n};
    """

    return con.execute(query).fetchdf()


def _was_triggered_by(component_id):
    """Return True if ``component_id`` fired the callback currently running.

    Must be called from inside a Dash callback.
    """
    from dash import callback_context

    return any(
        entry["prop_id"].split(".")[0] == component_id
        for entry in callback_context.triggered
    )


def _next_table_size(current_label, advance):
    """Map the toggle button label to ``(top_n, new_label)``.

    The button label names the *next* size, so it also encodes what is shown
    now: "Show Top 50" -> 10 rows, "Show Top 100" -> 50 rows, anything else
    ("Show Less") -> 100 rows.

    With ``advance`` true the table moves one step along 10 -> 50 -> 100 -> 10;
    otherwise the current size and label are kept.
    """
    if advance:
        if "Show Top 50" in current_label:
            return 50, "▼ Show Top 100"
        if "Show Top 100" in current_label:
            return 100, "▲ Show Less"
        return 10, "▼ Show Top 50"
    if "Show Top 100" in current_label:
        return 50, current_label
    if "Show Less" in current_label:
        return 100, current_label
    return 10, current_label


def _no_data_message():
    """Placeholder shown when the selected date range yields no rows."""
    return html.Div(
        "No data found in this time range. Try broadening the download date range.",
        style={"padding": "18px", "fontSize": "16px", "color": "#082030"},
    )


def _leaderboard_callback_logic(
    n_clicks,
    slider_value,
    current_label,
    group_col,
    filename,
    default_label="▼ Show Top 50",
    chip_color="#F0F9FF",
    view="all_downloads",
    derived_author_toggle=True,
    toggle_clicked=None,
):
    """Shared body of the three leaderboard callbacks.

    Args:
        n_clicks: Click count of the tab's toggle button.
        slider_value: ``[start, end]`` Unix timestamps from the date slider.
        current_label: Current text of the toggle button (``None`` on first
            load, in which case ``default_label`` is used).
        group_col: Column the leaderboard is grouped by.
        filename: Passed through to ``render_table_content``.
        default_label: Label assumed when ``current_label`` is ``None``.
        chip_color: Passed through to ``render_table_content``.
        view: DuckDB view to query.
        derived_author_toggle: Passed through to ``get_top_n_leaderboard``.
        toggle_clicked: Whether the toggle button itself fired this update.
            Only then does the table step to the next size; a slider, view or
            attribution change keeps the current size. ``None`` (default)
            keeps the older behaviour of stepping whenever ``n_clicks`` is
            non-zero.

    Returns:
        ``(table_children, new_button_label)``.
    """
    # Normalize label on first load
    if current_label is None:
        current_label = default_label

    if toggle_clicked is None:
        toggle_clicked = bool(n_clicks)

    # Determine top_n and next label
    top_n, new_label = _next_table_size(
        current_label, advance=bool(n_clicks) and bool(toggle_clicked)
    )

    # Get filtered and aggregated data directly from DuckDB
    df_filtered = _get_filtered_top_n_from_duckdb(
        slider_value, group_col, top_n, view=view
    )

    # If the SQL query returned no rows, ask user to broaden date range
    if df_filtered is None or df_filtered.empty:
        return _no_data_message(), new_label

    # Process the already-filtered data - pass derived_author_toggle
    df, download_df = get_top_n_leaderboard(
        df_filtered, group_col, top_n, derived_author_toggle=derived_author_toggle
    )

    # If processing produced no rows, ask user to broaden date range
    if df is None or (hasattr(df, "empty") and df.empty):
        return _no_data_message(), new_label

    table = render_table_content(
        df, download_df, chip_color=chip_color, filename=filename
    )
    return table, new_label


# -- end helpers --


# --- Callback to store model attribution type ---
@app.callback(
    Output("model-attribution-type", "data"),
    Input("model-attribution-segmented", "value"),
)
def update_model_attribution_type(selected_value):
    """Mirror the attribution switch into the ``model-attribution-type`` store."""
    return selected_value


# Callbacks for interactivity (modularized)
@app.callback(
    Output("top_countries-table", "children"),
    Output("top_countries-toggle", "children"),
    Input("top_countries-toggle", "n_clicks"),
    Input("time-slider", "value"),
    Input("selected-view", "data"),
    Input("model-attribution-type", "data"),
    State("top_countries-toggle", "children"),
)
def update_top_countries(
    n_clicks, slider_value, selected_view, attribution_type, current_label
):
    """Refresh the country leaderboard table and its toggle button label."""
    return _leaderboard_callback_logic(
        n_clicks,
        slider_value,
        current_label,
        group_col="org_country_single",
        filename="top_countries",
        default_label="▼ Show Top 50",
        chip_color="#F0F9FF",
        view=selected_view,
        derived_author_toggle=(attribution_type == "uploader"),
        toggle_clicked=_was_triggered_by("top_countries-toggle"),
    )


@app.callback(
    Output("top_developers-table", "children"),
    Output("top_developers-toggle", "children"),
    Input("top_developers-toggle", "n_clicks"),
    Input("time-slider", "value"),
    Input("selected-view", "data"),
    Input("model-attribution-type", "data"),
    State("top_developers-toggle", "children"),
)
def update_top_developers(
    n_clicks, slider_value, selected_view, attribution_type, current_label
):
    """Refresh the developer leaderboard table and its toggle button label."""
    # Use derived_author if attribution_type == "uploader", else author
    # NOTE(review): confirm this mapping matches the column semantics of the
    # dataset; it is kept exactly as it was.
    group_col = "derived_author" if attribution_type == "uploader" else "author"
    return _leaderboard_callback_logic(
        n_clicks,
        slider_value,
        current_label,
        group_col=group_col,
        filename="top_developers",
        default_label="▼ Show Top 50",
        chip_color="#F0F9FF",
        view=selected_view,
        derived_author_toggle=(attribution_type == "uploader"),
        toggle_clicked=_was_triggered_by("top_developers-toggle"),
    )


@app.callback(
    Output("top_models-table", "children"),
    Output("top_models-toggle", "children"),
    Input("top_models-toggle", "n_clicks"),
    Input("time-slider", "value"),
    Input("selected-view", "data"),
    Input("model-attribution-type", "data"),
    State("top_models-toggle", "children"),
)
def update_top_models(
    n_clicks, slider_value, selected_view, attribution_type, current_label
):
    """Refresh the model leaderboard table and its toggle button label."""
    return _leaderboard_callback_logic(
        n_clicks,
        slider_value,
        current_label,
        group_col="model",
        filename="top_models",
        # Must match the button's initial text so the size cycle starts right.
        default_label="▼ Show Top 50",
        chip_color="#F0F9FF",
        view=selected_view,
        derived_author_toggle=(attribution_type == "uploader"),
        toggle_clicked=_was_triggered_by("top_models-toggle"),
    )


@app.callback(
    Output("time-slider", "thumbChildren"),
    Input("time-slider", "value"),
)
def update_thumb_labels(values):
    """Re-render the thumb date labels whenever the slider moves."""
    return get_thumb_labels(values)


# --- Callback to update selected view based on segmented control ---
@app.callback(
    Output("selected-view", "data"),
    Input("segmented", "value"),
)
def update_selected_view(seg_value):
    """Translate the download-view switch into the DuckDB view name."""
    if seg_value == "filtered-downloads":
        return "one_year_rolling"
    return "all_downloads"


# Run the app
if __name__ == "__main__":
    app.run(debug=True)