import random
from datetime import datetime
import pandas as pd
import gradio as gr
import plotly.graph_objects as go
from gradio_leaderboard import Leaderboard
from src.utils import (
_safe_numeric,
calculate_cumulative_average,
create_dataframe,
get_aggregated_columns,
load_data,
load_model_metadata,
load_raw_model_data,
build_year_column_mapping,
)
from content import (
LLMLAGBENCH_INTRO,
LEADERBOARD_INTRO,
MODEL_COMPARISON_INTRO,
AUTHORS,
CIT_BTN_TEXT,
CIT_BTN_LABEL,
EXEMPLARY_QUESTIONS_INTRO,
EXEMPLARY_QUESTIONS_DATA,
)
### CONFIGURATION
cfg = {
"data_path": "data/leaderboard_graph_data.json",
"metadata_path": "data/model_metadata.json",
"years": ["2021", "2022", "2023", "2024", "2025"],
"months": [f"{i:02d}" for i in range(1, 13)]
}
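# Note: "aggregated_cols_year" and "aggregated_cols_month" are added to cfg later, in the DATA PREP section.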
### CALLBACKS
def update_dashboard(graph_years, graph_model_filter):
"""
Rebuild the leaderboard table and the interactive model-trend plot.
graph_years: list like ["2024", "2025"] from graph_year_selector; empty/None means all years
graph_model_filter: list of models to show on the line plot, or None for all
Returns: (leaderboard DataFrame, Plotly Figure)
"""
# Table always shows all years and all models
table_years = cfg.get("years")
table_model_filter = None
# Default: show all years if none selected for graph
if not graph_years:
graph_years = cfg.get("years")
# keep some necessary metadata columns in the specified order
metadata_cols = ["Model", "Overall Average", "1st Detected cutoff", "2nd Detected cutoff", "Provider cutoff", "Provider", "Release date", "Self-declared cutoff", "trend_changepoints", "Parameters", "Evaluation period"]
cols = metadata_cols.copy()
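# df, year_to_columns and GLOBAL_MODEL_COLORS are module-level globals built in the DATA PREP section below.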
yearly_df = df.copy()
monthly_df = df.copy()
graph_df = df.copy()
# TODO if >1 year - aggregate the values to be per year, not per month
if len(table_years) > 1:
lb_cols = ["Model", "Overall Average", "1st Detected cutoff", "2nd Detected cutoff", "Provider cutoff", "Provider", "Release date", "Self-declared cutoff", "Parameters", "Evaluation period"] + [y for y in cfg.get("aggregated_cols_year") if y in table_years]
yearly_df = yearly_df[lb_cols]
# Expand years into their YYYY_MM columns (for table)
chosen_months = []
for y in table_years:
chosen_months.extend(year_to_columns.get(y, []))
# Sort chronologically using the global aggregated_cols order
# Only include months that actually exist in the dataframe
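# Yearly aggregate columns are listed first, followed by the matching monthly columns.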
chosen_months_with_years = table_years + [c for c in cfg.get("aggregated_cols_month") if c in chosen_months and c in monthly_df.columns]
cols.extend(chosen_months_with_years)
# Filter by models for table if requested
if table_model_filter:
yearly_df = yearly_df[yearly_df["Model"].isin(table_model_filter)]
monthly_df = monthly_df[monthly_df["Model"].isin(table_model_filter)]
# Sort by Overall Average in descending order
yearly_df = yearly_df.sort_values(by="Overall Average", ascending=False)
monthly_df = monthly_df.sort_values(by="Overall Average", ascending=False)
# Reduce columns
monthly_df = monthly_df[cols]
# Filter by models for graph if requested (use separate dataframe)
# Build graph columns based on graph_years
graph_months = []
for y in graph_years:
graph_months.extend(year_to_columns.get(y, []))
graph_months_with_years = graph_years + [c for c in cfg.get("aggregated_cols_month") if c in graph_months and c in graph_df.columns]
graph_cols = metadata_cols + graph_months_with_years
graph_df = graph_df[graph_cols]
if graph_model_filter:
graph_df = graph_df[graph_df["Model"].isin(graph_model_filter)]
# Build tidy dataframe for gr.LinePlot with columns x, y, Model
records = []
# Exclude all metadata columns and yearly aggregates from x_labels - only keep monthly columns
excluded_cols = {"Model", "Overall Average", "Parameters", "1st Detected cutoff", "2nd Detected cutoff", "Provider", "Provider cutoff", "Release date", "Self-declared cutoff", "trend_changepoints", "Evaluation period"}
x_labels = [c for c in graph_cols if c not in excluded_cols and c not in graph_years] # only months for the plot
for _, row in graph_df.iterrows():
for col in x_labels:
y_val = _safe_numeric(row.get(col))
records.append({"x": col, "y": y_val, "Model": row["Model"]})
lineplot_df = pd.DataFrame(records)
# Ensure chronological order using the global sorted list (TODO: verify this is not redundant with the figure's categoryorder below)
chronological_order = [c for c in cfg.get("aggregated_cols_month") if c in lineplot_df["x"].unique()]
lineplot_df["x"] = pd.Categorical(lineplot_df["x"], categories=chronological_order, ordered=True)
lineplot_df = lineplot_df.sort_values(by="x")
# Build Plotly figure
fig = go.Figure()
for _, row in graph_df.iterrows():
model = row["Model"]
color = GLOBAL_MODEL_COLORS[model]
model_data = lineplot_df[lineplot_df["Model"] == model]
fig.add_trace(go.Scatter(
x=model_data["x"],
y=model_data["y"],
mode="lines",
name=model,
line=dict(width=2, color=color),
hovertemplate="Model: %{text}
x=%{x}
y=%{y}",
text=[model] * len(model_data),
showlegend=True,
line_shape='spline'
))
# Highlight changepoints (can be multiple)
changepoints = row.get("trend_changepoints", [])
if isinstance(changepoints, list):
for idx, bp in enumerate(changepoints):
if bp in model_data["x"].values:
cp_row = model_data[model_data["x"] == bp]
# Make first changepoint smaller if there are multiple
marker_size = 12 if idx == 0 else 6
fig.add_trace(go.Scatter(
x=cp_row["x"],
y=cp_row["y"],
mode="markers",
marker=dict(
size=marker_size,
color=color,
symbol="circle-open",
line=dict(width=3, color="white")
),
hovertemplate=f"Trend Changepoint
Model: {model}
x=%{{x}}
y=%{{y}}",
showlegend=False
))
# Style the figure & Lock axis order
fig.update_layout(
xaxis=dict(
categoryorder="array",
categoryarray=chronological_order,
title="Year_Month",
color="#e5e7eb",
gridcolor="#374151",
nticks=30 # Limit number of x-axis ticks displayed
),
yaxis=dict(title="Average Faithfulness (0-2 scale)", color="#e5e7eb", gridcolor="#374151"),
paper_bgcolor="#1f2937",
plot_bgcolor="#1f2937",
font=dict(family="IBM Plex Sans", size=12, color="#e5e7eb"),
hoverlabel=dict(bgcolor="#374151", font=dict(color="#e5e7eb"), bordercolor="#4b5563"),
margin=dict(l=40, r=20, t=60, b=40),
# title=dict(text="Model Comparison with Trend Changepoints", x=0.5, font=dict(color="#e5e7eb")),
showlegend=True,
yaxis_range=[-0.1, 2.1],
xaxis_tickangle=-45
)
if len(table_years) > 1:
return yearly_df, fig
else:
return monthly_df, fig
def create_faithfulness_plot(model_name):
"""
Create a Plotly figure showing faithfulness scores with segments and cumulative refusals.
Args:
model_name: Name of the model to plot
Returns:
Plotly Figure object or None if model not found
"""
if not model_name:
return go.Figure()
# Load raw model data
model_data = load_raw_model_data(cfg.get("data_path"), model_name)
if not model_data:
return go.Figure()
# Extract data
dates = model_data.get('dates', [])
faithfulness = model_data.get('faithfulness', [])
cumulative_refusals = model_data.get('cumulative_refusals', [])
segments = model_data.get('segments', [])
changepoint_dates = model_data.get('changepoint_dates', [])
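# total_observations sets the range of the secondary (refusals) axis; fall back to the last cumulative count if it is missing.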
total_obs = model_data.get('total_observations', max(cumulative_refusals) if cumulative_refusals else 1)
# Calculate cumulative average faithfulness
cumulative_avg_faithfulness = calculate_cumulative_average(faithfulness) if faithfulness else []
# Create figure with secondary y-axis
fig = go.Figure()
# Add faithfulness scatter points
fig.add_trace(go.Scatter(
x=dates,
y=faithfulness,
mode='markers',
name='Faithfulness',
marker=dict(size=4, color='steelblue', opacity=0.6),
hovertemplate='Date: %{x}<br>Faithfulness: %{y}',
yaxis='y'
))
# Add cumulative average faithfulness line (green curve)
if cumulative_avg_faithfulness:
fig.add_trace(go.Scatter(
x=dates,
y=cumulative_avg_faithfulness,
mode='lines',
name='Cumulative Average',
line=dict(color='#22c55e', width=2.5),
hovertemplate='Date: %{x}<br>Cumulative Avg: %{y:.3f}',
yaxis='y'
))
# Add segment mean lines (horizontal lines for each segment)
for seg in segments:
fig.add_trace(go.Scatter(
x=[seg['start_date'], seg['end_date']],
y=[seg['mean_faithfulness'], seg['mean_faithfulness']],
mode='lines',
name=f"Segment Mean ({seg['mean_faithfulness']:.2f})",
line=dict(color='red', width=2),
hovertemplate=f"Mean: {seg['mean_faithfulness']:.2f}
Refusal Rate: {seg['refusal_rate_percent']:.1f}%",
yaxis='y',
showlegend=False
))
# Add changepoint vertical lines
for cp_date in changepoint_dates:
fig.add_vline(
x=cp_date,
line=dict(color='darkred', dash='dash', width=1.5),
opacity=0.7
)
# Add cumulative refusals line (on secondary y-axis)
fig.add_trace(go.Scatter(
x=dates,
y=cumulative_refusals,
mode='lines',
name='Cumulative Refusals',
line=dict(color='darkorange', width=2),
hovertemplate='Date: %{x}<br>Cumulative Refusals: %{y}',
yaxis='y2'
))
# Add refusal rate annotations for each segment
for seg in segments:
# Calculate midpoint date for annotation
start = datetime.strptime(seg['start_date'], '%Y-%m-%d')
end = datetime.strptime(seg['end_date'], '%Y-%m-%d')
mid_date = start + (end - start) / 2
fig.add_annotation(
x=mid_date.strftime('%Y-%m-%d'),
y=1.85,
text=f"{seg['refusal_rate_percent']:.1f}%",
showarrow=False,
font=dict(size=10, color='#fbbf24', family='IBM Plex Sans'),
bgcolor='rgba(55, 65, 81, 0.9)',
bordercolor='#fbbf24',
borderwidth=1,
yref='y'
)
# Update layout with dual y-axes
fig.update_layout(
title=dict(
text=f"{model_name}: Faithfulness with PELT Changepoints",
x=0.5,
font=dict(color='#e5e7eb', size=14, family='IBM Plex Sans')
),
xaxis=dict(
title='Date',
color='#e5e7eb',
gridcolor='#374151',
tickangle=-45
),
yaxis=dict(
title='Faithfulness Score',
color='#e5e7eb',
gridcolor='#374151',
range=[-0.05, 2.05],
side='left'
),
yaxis2=dict(
title='Cumulative Refusals',
color='#fbbf24',
gridcolor='#374151',
range=[0, total_obs],
overlaying='y',
side='right'
),
paper_bgcolor='#1f2937',
plot_bgcolor='#1f2937',
font=dict(family='IBM Plex Sans', size=12, color='#e5e7eb'),
hoverlabel=dict(bgcolor='#374151', font=dict(color='#e5e7eb'), bordercolor='#4b5563'),
margin=dict(l=60, r=60, t=80, b=80),
showlegend=False,
legend=dict(
x=0.02,
y=0.98,
bgcolor='rgba(55, 65, 81, 0.9)',
bordercolor='#4b5563',
borderwidth=1
)
)
return fig
def update_model_comparison(split_enabled, model1, model2):
"""
Update model comparison plots based on split checkbox and model selections.
Args:
split_enabled: Boolean indicating if 2 graphs should be shown
model1: First model name
model2: Second model name (only used if split_enabled)
Returns:
Tuple of (plot1, plot2, col_plot_1 update, col_model_2 update, col_plot_2 update)
"""
if split_enabled:
# Show 2 graphs side by side
plot1 = create_faithfulness_plot(model1) if model1 else go.Figure()
plot2 = create_faithfulness_plot(model2) if model2 else go.Figure()
return plot1, plot2, gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
else:
# Show only 1 graph
plot1 = create_faithfulness_plot(model1) if model1 else go.Figure()
return plot1, go.Figure(), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
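# NOTE: initialize_model_comparison and initialize_main_dashboard below are not wired to any event; page load uses initialize_all_components instead.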
def initialize_model_comparison():
"""
Initialize model comparison section with random model selections on page load.
Returns:
Tuple of (model1_value, model2_value, plot1, plot2, col_plot_1_visible, col_model_2_visible, col_plot_2_visible)
"""
# Select random models for initial display
if len(all_models) >= 2:
random_models = random.sample(all_models, 2)
model1 = random_models[0]
model2 = random_models[1]
elif len(all_models) == 1:
model1 = all_models[0]
model2 = None
else:
model1 = None
model2 = None
# Generate initial plot for model1 only (split_enabled=False by default)
plot1 = create_faithfulness_plot(model1) if model1 else go.Figure()
return (
gr.update(value=model1), # model_dropdown_1
gr.update(value=model2), # model_dropdown_2
plot1, # comparison_plot_1
go.Figure(), # comparison_plot_2 (empty since split is disabled)
gr.update(visible=True), # col_plot_1
gr.update(visible=False), # col_model_2
gr.update(visible=False) # col_plot_2
)
def initialize_main_dashboard(graph_year_selector_value):
"""
Initialize main dashboard with random model selections on page load.
Args:
graph_year_selector_value: Selected years from the graph year selector
Returns:
Tuple of (graph_model_filter_value, leaderboard, line_plot)
"""
# Select random models for initial display (5 models for graph)
num_models = min(5, len(all_models))
if num_models > 0:
random_graph_models = random.sample(all_models, num_models)
else:
random_graph_models = []
# Generate dashboard with random models
leaderboard, line_plot = update_dashboard(graph_year_selector_value, random_graph_models)
return (
gr.update(value=random_graph_models), # graph_model_filter
leaderboard,
line_plot
)
def initialize_all_components(graph_year_selector_value):
"""
Initialize all components on page load: main dashboard and model comparison.
Combining into a single load function to prevent double-rendering issues in HF Spaces.
Args:
graph_year_selector_value: Selected years from the graph year selector
Returns:
Tuple of all outputs for both dashboard and comparison sections
"""
# Initialize main dashboard
num_models = min(5, len(all_models))
if num_models > 0:
random_graph_models = random.sample(all_models, num_models)
else:
random_graph_models = []
leaderboard, line_plot = update_dashboard(graph_year_selector_value, random_graph_models)
# Initialize model comparison with 2 graphs side by side (split_enabled=True)
if len(all_models) >= 2:
random_models = random.sample(all_models, 2)
model1 = random_models[0]
model2 = random_models[1]
elif len(all_models) == 1:
model1 = all_models[0]
model2 = None
else:
model1 = None
model2 = None
# Generate both plots for split view
plot1 = create_faithfulness_plot(model1) if model1 else go.Figure()
plot2 = create_faithfulness_plot(model2) if model2 else go.Figure()
return (
gr.update(value=random_graph_models), # graph_model_filter
leaderboard,
line_plot,
gr.update(value=model1), # model_dropdown_1
gr.update(value=model2), # model_dropdown_2
plot1, # comparison_plot_1
plot2, # comparison_plot_2 (now showing model2)
gr.update(visible=True), # col_plot_1
gr.update(visible=True), # col_model_2 (now visible)
gr.update(visible=True) # col_plot_2 (now visible)
)
### HELPER FUNCTIONS
def generate_distinct_colors(n):
"""
Generate n distinct colors using HSL color space.
Args:
n: Number of distinct colors needed
Returns:
List of hex color strings
"""
colors = []
for i in range(n):
hue = (i * 360 / n) % 360
saturation = 70 + (i % 3) * 10 # Vary saturation slightly for more distinction
lightness = 55 + (i % 2) * 10 # Vary lightness slightly
# Convert HSL to RGB
h = hue / 360
s = saturation / 100
l = lightness / 100
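# Standard HSL -> RGB conversion (same math as colorsys.hls_to_rgb).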
if s == 0:
r = g = b = l
else:
def hue_to_rgb(p, q, t):
if t < 0: t += 1
if t > 1: t -= 1
if t < 1/6: return p + (q - p) * 6 * t
if t < 1/2: return q
if t < 2/3: return p + (q - p) * (2/3 - t) * 6
return p
q = l * (1 + s) if l < 0.5 else l + s - l * s
p = 2 * l - q
r = hue_to_rgb(p, q, h + 1/3)
g = hue_to_rgb(p, q, h)
b = hue_to_rgb(p, q, h - 1/3)
# Convert to hex
hex_color = f"#{int(r*255):02x}{int(g*255):02x}{int(b*255):02x}"
colors.append(hex_color)
return colors
### DATA PREP
# Load data
data = load_data(cfg.get("data_path"))
# Load model metadata
model_metadata = load_model_metadata(cfg.get("metadata_path"))
# Build year to columns mapping
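# Maps each year to its monthly "YYYY_MM" column labels, e.g. "2024" -> ["2024_01", ..., "2024_12"].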
year_to_columns = build_year_column_mapping(cfg.get("years"), cfg.get("months"))
# Create DataFrame (new format doesn't need models_map or metrics)
df = create_dataframe(cfg, data, model_metadata=model_metadata)
# Get aggregated column lists
aggregated_cols_year, aggregated_cols_month = get_aggregated_columns(
cfg.get("years"), year_to_columns
)
cfg["aggregated_cols_year"] = aggregated_cols_year
cfg["aggregated_cols_month"] = aggregated_cols_month
# Generate consistent color mapping for all models (do this once globally)
all_models = sorted(df["Model"].unique().tolist()) # Sort for consistency
colors = generate_distinct_colors(len(all_models))
GLOBAL_MODEL_COLORS = {model: colors[i] for i, model in enumerate(all_models)}
### BUILD UI
theme = gr.themes.Base(
primary_hue="green",
secondary_hue="green",
radius_size="lg",
text_size="sm",
)
# Custom CSS for scrollable dropdown and table styling
custom_css = """
/* Limit the height of selected items in multiselect dropdown */
.scrollable-dropdown .wrap-inner {
max-height: 100px !important;
overflow-y: auto !important;
}
/* Alternative selector for the selected items container */
.scrollable-dropdown div[data-testid="block-label-inner"] ~ div {
max-height: 100px !important;
overflow-y: auto !important;
}
/* Style the leaderboard table background */
.gradio-container .gr-table-wrap,
.gradio-container .gr-dataframe,
.gradio-leaderboard {
background-color: #fafafa !important;
}
.gradio-container table {
background-color: #fafafa !important;
}
/* Header row - gray background */
.gradio-container table thead tr,
.gradio-container table thead th {
background-color: #f3f4f6 !important;
}
/* First column (td:first-child) - gray background for all rows */
.gradio-container table tbody tr td:first-child {
background-color: #f3f4f6 !important;
}
/* Odd rows - very light background (excluding first column) */
.gradio-container table tbody tr:nth-child(odd) td {
background-color: #fafafa !important;
}
/* Even rows - white background (excluding first column) */
.gradio-container table tbody tr:nth-child(even) td {
background-color: white !important;
}
/* Keep first column gray for both odd and even rows */
.gradio-container table tbody tr:nth-child(odd) td:first-child,
.gradio-container table tbody tr:nth-child(even) td:first-child {
background-color: #f3f4f6 !important;
}
/* Hover effect for all rows */
.gradio-container table tbody tr:hover td {
background-color: #f3f4f6 !important;
}
/* Keep first column darker gray on hover */
.gradio-container table tbody tr:hover td:first-child {
background-color: #e5e7eb !important;
}
"""
# JavaScript to force light mode
js_func = """
function refresh() {
const url = new URL(window.location);
if (url.searchParams.get('__theme') !== 'light') {
url.searchParams.set('__theme', 'light');
window.location.href = url.href;
}
}
"""
with gr.Blocks(theme=theme, css=custom_css, js=js_func) as demo:
gr.Markdown(
"""
# 📶 LLMLagBench - All LLMs lag behind
"""
)
gr.Markdown(AUTHORS)
gr.Markdown("
")
gr.Markdown(LLMLAGBENCH_INTRO)
gr.Markdown("
")
with gr.Row():
# Year selector for graph
with gr.Column(scale=1):
graph_year_selector = gr.CheckboxGroup(choices=cfg.get("years"), value=["2021", "2022", "2023", "2024", "2025"], label="Select Years for Graph")
with gr.Column(scale=1):
graph_model_filter = gr.Dropdown(
choices=df["Model"].unique().tolist(),
multiselect=True,
filterable=True,
value=None, # Will be set randomly on page load
label="Select Models for Graph",
elem_classes="scrollable-dropdown"
)
gr.Markdown("## Model Comparison with Trend Changepoints")
line_plot = gr.Plot(label="Model Trends")
gr.Markdown('<br>')
gr.Markdown('<br>')
gr.Markdown(LEADERBOARD_INTRO)
leaderboard = Leaderboard(
value=df,
search_columns=["Model"],
interactive=False,
)
# Wire events — graph inputs update the leaderboard + plot
for comp in (graph_year_selector, graph_model_filter):
comp.change(
fn=update_dashboard,
inputs=[graph_year_selector, graph_model_filter],
outputs=[leaderboard, line_plot],
)
gr.Markdown('<br>')
gr.Markdown('<br>')
# Model comparison section - wrap everything in a container to prevent duplication
with gr.Column():
gr.Markdown("## Model Comparison: Faithfulness to Ideal Answer with PELT Changepoints")
gr.Markdown(MODEL_COMPARISON_INTRO)
with gr.Row():
split_checkbox = gr.Checkbox(
label="Split into 2 segments",
value=True,
info="Enable to compare two models side by side"
)
with gr.Row():
with gr.Column(scale=1):
model_dropdown_1 = gr.Dropdown(
choices=all_models,
value=None, # Will be set randomly on page load
label="Select Model 1",
filterable=True,
elem_classes="scrollable-dropdown"
)
with gr.Column(scale=1, visible=False) as col_model_2:
model_dropdown_2 = gr.Dropdown(
choices=all_models,
value=None, # Will be set randomly on page load
label="Select Model 2",
filterable=True,
elem_classes="scrollable-dropdown"
)
with gr.Row():
with gr.Column(scale=1) as col_plot_1:
comparison_plot_1 = gr.Plot(label="Model Faithfulness Analysis")
with gr.Column(scale=1, visible=False) as col_plot_2:
comparison_plot_2 = gr.Plot(label="Model Faithfulness Analysis")
# Wire model comparison events
split_checkbox.change(
fn=update_model_comparison,
inputs=[split_checkbox, model_dropdown_1, model_dropdown_2],
outputs=[comparison_plot_1, comparison_plot_2, col_plot_1, col_model_2, col_plot_2]
)
model_dropdown_1.change(
fn=update_model_comparison,
inputs=[split_checkbox, model_dropdown_1, model_dropdown_2],
outputs=[comparison_plot_1, comparison_plot_2, col_plot_1, col_model_2, col_plot_2]
)
model_dropdown_2.change(
fn=update_model_comparison,
inputs=[split_checkbox, model_dropdown_1, model_dropdown_2],
outputs=[comparison_plot_1, comparison_plot_2, col_plot_1, col_model_2, col_plot_2]
)
gr.Markdown('<br>')
gr.Markdown('<br>')
# Exemplary Questions section
gr.Markdown(EXEMPLARY_QUESTIONS_INTRO)
exemplary_questions_df = pd.DataFrame(
EXEMPLARY_QUESTIONS_DATA,
columns=["Date", "Question", "Gold Answer", "Possible decision"]
)
gr.Dataframe(
value=exemplary_questions_df,
interactive=False,
wrap=True
)
# Citation
gr.Markdown('<br>')
gr.Markdown('<br>')
with gr.Row():
with gr.Accordion("📙 Citation", open=False):
citation_button = gr.Textbox(
value=CIT_BTN_TEXT,
label=CIT_BTN_LABEL,
lines=20,
elem_id="citation-button",
show_copy_button=True,
)
# Initialize all components on load with a single load call (prevents double-rendering in HF Spaces)
demo.load(
fn=initialize_all_components,
inputs=[graph_year_selector],
outputs=[graph_model_filter, leaderboard, line_plot, model_dropdown_1, model_dropdown_2, comparison_plot_1, comparison_plot_2, col_plot_1, col_model_2, col_plot_2]
)
if __name__ == "__main__":
demo.launch()