Spaces:

michaelsh
/

PwCLeaderboardDisplay

Sleeping

File size: 34,453 Bytes

a8592c4

import hashlib
import json
import pickle
from datetime import datetime
from pathlib import Path

import gradio as gr
import pandas as pd
import plotly.graph_objects as go
from datasets import load_dataset
from tqdm import tqdm

# Cache configuration.
# These are ordinary module-level constants. A `global` statement only has an
# effect inside a function body, so none is needed (or used) here.
CACHE_DIR = Path("./pwc_cache")
CACHE_DIR.mkdir(exist_ok=True)

# Directory structure for disk-based storage
TASKS_INDEX_FILE = CACHE_DIR / "tasks_index.json"  # Small JSON file with task list
TASK_DATA_DIR = CACHE_DIR / "task_data"  # Directory for individual task files
DATASET_DATA_DIR = CACHE_DIR / "dataset_data"  # Directory for individual dataset files
METRICS_INDEX_FILE = CACHE_DIR / "metrics_index.json"  # Metrics metadata

# Create the per-task and per-dataset directories up front so writers can
# assume they exist.
TASK_DATA_DIR.mkdir(exist_ok=True)
DATASET_DATA_DIR.mkdir(exist_ok=True)


def sanitize_filename(name):
    """Return *name* rewritten so that it can be used as a file name.

    Path separators, wildcard/reserved characters, spaces and dots become
    underscores; runs of underscores are collapsed and leading/trailing ones
    are dropped. Results longer than 200 characters are cut to their first
    150 characters followed by ``_`` and 8 hex digits of the MD5 of the
    original *name*.
    """
    unsafe_chars = '/\\:*?"<>| .'
    underscored = name.translate({ord(ch): '_' for ch in unsafe_chars})

    # Splitting on '_' and dropping empty pieces collapses repeats and trims ends.
    pieces = [piece for piece in underscored.split('_') if piece]
    cleaned = '_'.join(pieces)

    if len(cleaned) <= 200:
        return cleaned

    # Too long for comfort: keep a readable prefix and disambiguate with a hash.
    digest = hashlib.md5(name.encode()).hexdigest()
    return f"{cleaned[:150]}_{digest[:8]}"


def get_task_filename(task):
    """Return the pickle path under TASK_DATA_DIR that holds metadata for *task*."""
    return TASK_DATA_DIR.joinpath("task_" + sanitize_filename(task) + ".pkl")


def get_dataset_filename(task, dataset_name):
    """Return the pickle path under DATASET_DATA_DIR for a (task, dataset) pair.

    The file name embeds the sanitized task and dataset names. When that name
    would exceed 255 characters, each part is cut to 50 characters and 8 hex
    digits of the MD5 of ``"{task}||{dataset_name}"`` are appended instead.
    """
    task_part = sanitize_filename(task)
    dataset_part = sanitize_filename(dataset_name)

    candidate = "data_" + task_part + "_" + dataset_part + ".pkl"
    if len(candidate) <= 255:
        return DATASET_DATA_DIR / candidate

    # Name too long: fall back to truncated parts plus a hash of the raw names.
    digest = hashlib.md5(f'{task}||{dataset_name}'.encode()).hexdigest()[:8]
    return DATASET_DATA_DIR / f"data_{task_part[:50]}_{dataset_part[:50]}_{digest}.pkl"


def cache_exists():
    """Report whether both index files are present, printing what was checked."""
    tasks_index_present = TASKS_INDEX_FILE.exists()
    metrics_index_present = METRICS_INDEX_FILE.exists()

    # Diagnostic output: the two index paths followed by their existence flags.
    print(f"TASKS_INDEX_FILE ={TASKS_INDEX_FILE!r}")
    print(f"METRICS_INDEX_FILE ={METRICS_INDEX_FILE!r}")
    print(f"TASKS_INDEX_FILE.exists() ={tasks_index_present!r}")
    print(f"METRICS_INDEX_FILE.exists() ={metrics_index_present!r}")

    return tasks_index_present and metrics_index_present


# Substrings that mark a metric as "lower is better" when found in its name.
_LOWER_IS_BETTER_KEYWORDS = ('error', 'loss', 'time', 'cost')
# Placeholder date for rows whose paper_date is missing or cannot be parsed.
_FALLBACK_RELEASE_DATE = '2020-01-01'


def _load_task_record(task_file):
    """Return the working record for a task, with set-valued fields.

    Records are written to disk with sorted lists, so they are converted back
    to sets here; otherwise the ``.update()`` / ``.add()`` calls made while
    merging a later item for the same task would fail on a list.
    """
    if not task_file.exists():
        return {
            'categories': set(),
            'datasets': set(),
            'date_range': {'min': None, 'max': None},
        }

    # NOTE: pickle is only acceptable here because these files are produced by
    # this module itself; never point the cache directory at untrusted data.
    with open(task_file, 'rb') as f:
        stored = pickle.load(f)

    return {
        'categories': set(stored.get('categories') or ()),
        'datasets': set(stored.get('datasets') or ()),
        'date_range': stored.get('date_range') or {'min': None, 'max': None},
    }


def _parse_release_date(paper_date):
    """Parse *paper_date* into a tz-naive Timestamp, or return the fallback date."""
    fallback = pd.to_datetime(_FALLBACK_RELEASE_DATE)
    if not (paper_date and isinstance(paper_date, str)):
        return fallback
    try:
        parsed = pd.to_datetime(paper_date)
    except (ValueError, TypeError, OverflowError):
        return fallback
    if pd.isna(parsed):
        return fallback
    if parsed.tzinfo is not None:
        # Keep every stored date tz-naive so min/max comparisons cannot raise.
        parsed = parsed.tz_convert(None)
    return parsed


def _collect_row_metrics(row, metrics_index):
    """Return the non-null metrics of *row* and count them in *metrics_index*."""
    metrics = {}
    raw_metrics = row.get('metrics')
    if not isinstance(raw_metrics, dict):
        return metrics

    for metric_name, metric_value in raw_metrics.items():
        if metric_value is None:
            continue
        metrics[metric_name] = metric_value
        if metric_name not in metrics_index:
            lowered = metric_name.lower()
            metrics_index[metric_name] = {
                'count': 0,
                'is_lower_better': any(kw in lowered for kw in _LOWER_IS_BETTER_KEYWORDS),
            }
        metrics_index[metric_name]['count'] += 1
    return metrics


def build_disk_based_cache():
    """Build the on-disk cache from the ``pwc-archive/evaluation-tables`` dataset.

    For every item with a task name this merges the item's categories and
    dataset names into a per-task pickle, and writes one pickled DataFrame per
    (task, dataset) pair that has SOTA rows and no cache file yet. Finally it
    writes the JSON task index and metrics index.

    Only one task record and one dataset table are held at a time, but the
    dataset itself is loaded with ``streaming=False`` (its length is needed
    for the progress bar), so it is not streamed.

    Returns:
        list[str]: the sorted task names that were written to the task index.
    """
    print("=" * 60)
    print("Building disk-based cache (one-time operation)...")
    print("=" * 60)

    # Tracking structures that stay small for the whole run.
    tasks_set = set()
    metrics_index = {}

    print("\n[1/4] Streaming dataset and building cache...")

    ds = load_dataset("pwc-archive/evaluation-tables", split="train", streaming=False)

    dataset_count = 0

    for item in tqdm(ds, total=len(ds)):
        task = item['task']
        if not task:
            continue

        tasks_set.add(task)

        task_file = get_task_filename(task)
        task_data = _load_task_record(task_file)
        date_range = task_data['date_range']

        task_data['categories'].update(item['categories'] or ())

        for dataset in item['datasets'] or ():
            if not isinstance(dataset, dict) or 'dataset' not in dataset:
                continue

            dataset_name = dataset['dataset']
            if not isinstance(dataset_name, str) or not dataset_name:
                # A nameless dataset cannot be mapped to a cache file.
                continue

            task_data['datasets'].add(dataset_name)

            # Skip the expensive part if this dataset was already written.
            dataset_file = get_dataset_filename(task, dataset_name)
            if dataset_file.exists():
                continue

            sota = dataset.get('sota')
            rows = sota.get('rows') if isinstance(sota, dict) else None
            if not rows:
                continue

            models_data = []
            for row in rows:
                if not isinstance(row, dict):
                    continue

                metrics = _collect_row_metrics(row, metrics_index)
                release_date = _parse_release_date(row.get('paper_date'))

                if date_range['min'] is None or release_date < date_range['min']:
                    date_range['min'] = release_date
                if date_range['max'] is None or release_date > date_range['max']:
                    date_range['max'] = release_date

                code_links = row.get('code_links')
                models_data.append({
                    # `or` also covers an explicit None stored under the key.
                    'model_name': row.get('model_name') or 'Unknown Model',
                    'release_date': release_date,
                    'paper_date': row.get('paper_date', ''),  # Raw value, re-parsed at plot time
                    'paper_url': row.get('paper_url', ''),
                    'paper_title': row.get('paper_title', ''),
                    'code_url': code_links[0] if code_links else '',
                    # Metric columns come last; a metric named like one of the
                    # fixed keys above would overwrite it.
                    **metrics,
                })

            if models_data:
                df = pd.DataFrame(models_data).sort_values('release_date')

                # Each dataset gets its own file so it can be loaded on demand.
                with open(dataset_file, 'wb') as f:
                    pickle.dump(df, f, protocol=pickle.HIGHEST_PROTOCOL)

                dataset_count += 1

                # Drop the table before moving on to keep peak memory low.
                del df
            del models_data

        # Persist the merged task record; sets are stored as sorted lists.
        task_data_to_save = {
            'categories': sorted(task_data['categories']),
            'datasets': sorted(task_data['datasets']),
            'date_range': date_range,
        }
        with open(task_file, 'wb') as f:
            pickle.dump(task_data_to_save, f, protocol=pickle.HIGHEST_PROTOCOL)

        del task_data

    print(f"\n✓ Processed {len(tasks_set)} tasks and {dataset_count} datasets")

    print("\n[2/4] Saving index files...")

    # Save tasks index (small file)
    tasks_list = sorted(tasks_set)
    with open(TASKS_INDEX_FILE, 'w') as f:
        json.dump(tasks_list, f)
    print(f"  ✓ Saved tasks index ({len(tasks_list)} tasks)")

    # Save metrics index
    with open(METRICS_INDEX_FILE, 'w') as f:
        json.dump(metrics_index, f, indent=2)
    print(f"  ✓ Saved metrics index ({len(metrics_index)} metrics)")

    print("\n[3/4] Calculating cache statistics...")

    task_files = list(TASK_DATA_DIR.glob("*.pkl"))
    dataset_files = list(DATASET_DATA_DIR.glob("*.pkl"))
    total_size = sum(path.stat().st_size for path in task_files + dataset_files)

    print(f"  ✓ Total cache size: {total_size / 1024 / 1024:.1f} MB")
    print(f"  ✓ Task files: {len(task_files)}")
    print(f"  ✓ Dataset files: {len(dataset_files)}")

    print("\n[4/4] Cache building complete!")
    print("=" * 60)

    return tasks_list


def load_tasks_index():
    """Read the JSON task index file and return its decoded content."""
    index_text = TASKS_INDEX_FILE.read_text()
    return json.loads(index_text)


def load_task_data(task):
    """Unpickle and return the stored record for *task*, or None if it has no file."""
    record_path = get_task_filename(task)
    if not record_path.exists():
        return None
    with record_path.open('rb') as handle:
        return pickle.load(handle)


def load_dataset_data(task, dataset_name):
    """Unpickle the cached table for (task, dataset_name); empty DataFrame if absent."""
    table_path = get_dataset_filename(task, dataset_name)
    if not table_path.exists():
        return pd.DataFrame()
    with table_path.open('rb') as handle:
        return pickle.load(handle)


def load_metrics_index():
    """Return the decoded metrics index JSON, or an empty dict if the file is missing."""
    if not METRICS_INDEX_FILE.exists():
        return {}
    with METRICS_INDEX_FILE.open('r') as handle:
        return json.load(handle)

# Initialize - build the cache on first run, otherwise only read the task index
if not cache_exists():
    TASKS = build_disk_based_cache()
else:
    print("Loading task index from disk...")
    TASKS = load_tasks_index()
    print(f"✓ Loaded {len(TASKS)} tasks")

# The metrics index is small, so it is loaded once and kept in memory
METRICS_INDEX = load_metrics_index()


# Memory-efficient accessor functions
def get_tasks():
    """Return the module-level task list (TASKS)."""
    task_names = TASKS
    return task_names


def get_task_data(task):
    """Fetch the record for *task* from disk each time it is called."""
    record = load_task_data(task)
    return record


def get_categories(task):
    """Return the categories stored for *task*, or an empty list if it has no record."""
    record = get_task_data(task)
    if not record:
        return []
    return record['categories']


def get_datasets_for_task(task):
    """Return the dataset names stored for *task*, or an empty list if it has no record."""
    record = get_task_data(task)
    if not record:
        return []
    return record['datasets']


def get_cached_model_data(task, dataset_name):
    """Fetch the cached model table for (task, dataset_name) from disk."""
    table = load_dataset_data(task, dataset_name)
    return table


def parse_paper_date(paper_date, paper_title="", paper_url=""):
    """Parse a paper's date, falling back to a year found in its title or URL.

    Args:
        paper_date: Raw date value. Only non-blank strings are parsed; they
            are tried against a list of explicit formats first and then with
            pandas' automatic parsing.
        paper_title: Optional title searched for a 4-digit year (1950-2099)
            when *paper_date* yields nothing.
        paper_url: Optional URL searched the same way after the title.

    Returns:
        A pandas Timestamp, or None when no date could be determined. A year
        taken from the title/URL maps to January 1st of that year; when
        several years are present, the latest one is used.
    """
    import re

    # Failures pandas/dateutil raise for unparseable or out-of-range input.
    parse_errors = (ValueError, TypeError, OverflowError)

    if isinstance(paper_date, str) and paper_date.strip():
        raw = paper_date.strip()
        attempts = [
            {'format': '%Y-%m-%d'},
            {'format': '%Y/%m/%d'},
            {'format': '%d-%m-%Y'},
            {'format': '%d/%m/%Y'},
            {'format': '%Y-%m'},
            {'format': '%Y/%m'},
            {'format': '%Y'},
            {},  # last resort: let pandas infer the format
        ]
        for kwargs in attempts:
            try:
                parsed = pd.to_datetime(raw, **kwargs)
            except parse_errors:
                continue
            # A null-like string can parse to NaT; treat that as "no date"
            # so the title/URL fallback below still gets a chance.
            if not pd.isna(parsed):
                return parsed

    # Fallback: look for a plausible year, title first, then URL.
    year_pattern = r'\b(19[5-9]\d|20[0-9]\d)\b'  # Match 1950-2099
    for text in (paper_title, paper_url):
        if not text:
            continue
        years = re.findall(year_pattern, str(text))
        if years:
            # All matches are 4-digit strings, so max() picks the latest year.
            return pd.to_datetime(f'{max(years)}-01-01')

    # No usable date: return None rather than inventing a default year.
    return None


def get_task_statistics(task):
    """Return statistics for *task*; currently always an empty dict."""
    statistics = {}
    return statistics


def _message_figure(message, title, font_size=20):
    """Return an otherwise empty figure showing *message* centred on the canvas."""
    fig = go.Figure()
    fig.add_annotation(
        text=message,
        xref="paper",
        yref="paper",
        x=0.5,
        y=0.5,
        showarrow=False,
        font=dict(size=font_size)
    )
    fig.update_layout(
        title=title,
        height=600,
        plot_bgcolor='white',
        paper_bgcolor='white'
    )
    return fig


def _strip_percent(value):
    """Drop a trailing '%' from string values so they can be parsed as numbers."""
    if isinstance(value, str):
        stripped = value.strip()
        if stripped.endswith("%"):
            return stripped[:-1]
    return value


def create_sota_plot(df, metric):
    """Create a plot showing model performance evolution over time.

    Every model with a numeric value for *metric* and a usable date is drawn
    as a point; points that match the running best (cumulative min for
    lower-is-better metrics, cumulative max otherwise) are highlighted,
    labelled, and connected by the SOTA line.

    Args:
        df: DataFrame with model data. ``model_name`` is required;
            ``release_date``, ``paper_date``, ``paper_title``, ``paper_url``
            and ``code_url`` are used when present.
        metric: Metric name to plot on y-axis

    Returns:
        A plotly Figure. When nothing can be plotted, the figure only
        contains a message explaining why.
    """
    if df.empty or metric not in df.columns:
        return _message_figure("No data available for this metric", "No Data Available")

    # Remove rows where the metric is NaN
    df_clean = df.dropna(subset=[metric]).copy()
    if df_clean.empty:
        return _message_figure("No valid data points for this metric", "No Data Available")

    # Convert the metric column to numeric; values such as "93.1%" lose the
    # percent sign first, anything still non-numeric becomes NaN and is dropped.
    try:
        df_clean[metric] = pd.to_numeric(df_clean[metric].apply(_strip_percent), errors='coerce')
        df_clean = df_clean.dropna(subset=[metric])
    except Exception as e:  # UI boundary: show the problem instead of crashing
        return _message_figure(
            f"Error processing metric data: {str(e)}",
            "Data Processing Error",
            font_size=16,
        )

    if df_clean.empty:
        return _message_figure(
            f"No numeric data available for metric: {metric}",
            "No Numeric Data Available",
        )

    # Recalculate release dates from the raw paper_date where available,
    # falling back to the stored release_date for rows that cannot be parsed.
    df_processed = df_clean.copy()
    if 'release_date' in df_processed.columns:
        stored_dates = df_processed['release_date']
    else:
        stored_dates = pd.Series(pd.NaT, index=df_processed.index)

    if 'paper_date' in df_processed.columns:
        dynamic_dates = df_processed.apply(
            lambda row: parse_paper_date(
                row.get('paper_date', ''),
                row.get('paper_title', ''),
                row.get('paper_url', '')
            ), axis=1
        )
        final_dates = dynamic_dates.fillna(stored_dates)
    else:
        final_dates = stored_dates

    # Normalise to tz-naive datetimes so sorting and date arithmetic cannot
    # fail on a mix of tz-aware and naive values.
    df_processed['final_release_date'] = pd.to_datetime(
        final_dates, errors='coerce', utc=True
    ).dt.tz_localize(None)

    # Filter out rows with no valid date
    df_with_dates = df_processed[df_processed['final_release_date'].notna()]
    if df_with_dates.empty:
        return _message_figure("No valid dates available for this dataset", "No Date Data Available")

    df_sorted = df_with_dates.sort_values('final_release_date').copy()

    # Prefer the cached flag for the metric; otherwise guess from its name.
    if metric in METRICS_INDEX:
        is_lower_better = METRICS_INDEX[metric].get('is_lower_better', False)
    else:
        is_lower_better = any(keyword in metric.lower() for keyword in ['error', 'loss', 'time', 'cost'])

    if is_lower_better:
        df_sorted['cumulative_best'] = df_sorted[metric].cummin()
    else:
        df_sorted['cumulative_best'] = df_sorted[metric].cummax()
    # A model is SOTA when it equals the running best at its position (ties count).
    df_sorted['is_sota'] = df_sorted[metric] == df_sorted['cumulative_best']

    sota_df = df_sorted[df_sorted['is_sota']]

    x_values = df_sorted['final_release_date']
    x_axis_title = 'Release Date'

    fig = go.Figure()

    # Add all models as scatter points
    fig.add_trace(go.Scatter(
        x=x_values,
        y=df_sorted[metric],
        mode='markers',
        name='All models',
        marker=dict(
            color=['#00CED1' if is_sota else 'lightgray'
                   for is_sota in df_sorted['is_sota']],
            size=8,
            opacity=0.7
        ),
        text=df_sorted['model_name'],
        # reindex() tolerates tables that lack some of the optional columns.
        customdata=df_sorted.reindex(columns=['paper_title', 'paper_url', 'code_url']),
        hovertemplate='<b>%{text}</b><br>' +
                      f'{metric}: %{{y:.4f}}<br>' +
                      'Date: %{x}<br>' +
                      'Paper: %{customdata[0]}<br>' +
                      '<extra></extra>'
    ))

    # Add SOTA line
    fig.add_trace(go.Scatter(
        x=x_values,
        y=df_sorted['cumulative_best'],
        mode='lines',
        name=f'SOTA (cumulative {"min" if is_lower_better else "max"})',
        line=dict(color='#00CED1', width=2, dash='solid'),
        hovertemplate=f'SOTA {metric}: %{{y:.4f}}<br>{x_axis_title}: %{{x}}<extra></extra>'
    ))

    # Add labels for SOTA models
    if not sota_df.empty:
        # Offset is 3% of the metric's value range, or 1 when all values are equal.
        # NOTE(review): Plotly reads `ay` in pixels unless `ayref` is set, while
        # this offset is in data units - confirm the intended placement.
        y_range = df_sorted[metric].max() - df_sorted[metric].min()
        label_offset = y_range * 0.03 if y_range > 0 else 1

        # Labels closer than this many days are considered colliding
        # (at least 30 days, or 5% of the plotted date span).
        date_span_days = (x_values.max() - x_values.min()).days
        min_separation = max(30, date_span_days * 0.05)

        # Negative offsets for lower-is-better metrics, positive otherwise.
        direction = -1 if is_lower_better else 1
        base_ay_offset = direction * label_offset
        base_yshift = direction * 8

        labelled_dates = []
        for i, (_, row) in enumerate(sota_df.iterrows()):
            current_x = row['final_release_date']
            collision_detected = any(
                abs((current_x - previous_x).days) < min_separation
                for previous_x in labelled_dates
            )

            if collision_detected:
                # Push every other colliding label further out to reduce overlap.
                ay_offset = base_ay_offset + (direction * label_offset * 0.7 * (i % 2))
                yshift = base_yshift + (direction * 12 * (i % 2))
            else:
                ay_offset = base_ay_offset
                yshift = base_yshift

            # str() guards against non-string names (e.g. None or numbers).
            model_label = str(row['model_name'])
            if len(model_label) > 25:
                model_label = model_label[:25] + '...'

            fig.add_annotation(
                x=current_x,
                y=row[metric],
                text=model_label,
                showarrow=True,
                arrowhead=2,
                arrowsize=1,
                arrowwidth=1,
                arrowcolor='#00CED1',  # Match the SOTA line color
                ax=0,
                ay=ay_offset,
                yshift=yshift,
                font=dict(size=8, color='#333333'),
                bgcolor='rgba(255, 255, 255, 0.9)',  # Semi-transparent background
                borderwidth=0  # Remove border
            )

            labelled_dates.append(current_x)

    # Update layout
    fig.update_layout(
        title=f'SOTA Evolution: {metric}',
        xaxis_title=x_axis_title,
        yaxis_title=metric,
        xaxis=dict(showgrid=True, gridcolor='lightgray'),
        yaxis=dict(showgrid=True, gridcolor='lightgray'),
        plot_bgcolor='white',
        paper_bgcolor='white',
        height=600,
        legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
        hovermode='closest'
    )

    return fig


# Gradio interface
# Gradio interface: everything created inside this context belongs to `demo`.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header and short description.
    gr.Markdown("# 📊 Papers with Code - SOTA Evolution Visualizer")
    gr.Markdown(
        "Navigate through ML tasks and datasets to visualize the evolution of state-of-the-art models over time.")
    gr.Markdown("*Optimized for low memory usage - data is loaded on-demand from disk*")

    # Status banner. The counts are interpolated once, when this module runs,
    # from the TASKS and METRICS_INDEX values loaded above.
    with gr.Row():
        gr.Markdown(f"""
        <div style="background-color: #f0f9ff; border-left: 4px solid #00CED1; padding: 10px; margin: 10px 0;">
        <b>💾 Disk-Based Storage Active</b><br>
        • <b>{len(TASKS)}</b> tasks indexed<br>
        • <b>{len(METRICS_INDEX)}</b> unique metrics tracked<br>
        • Data loaded on-demand to minimize RAM usage
        </div>
        """)

    # State variables: the table of the currently selected dataset and the
    # currently selected task. Both are written by the selection handlers.
    current_df = gr.State(pd.DataFrame())
    current_task = gr.State(None)

    # Navigation dropdowns. Only the task list is known up front; the other
    # dropdowns start empty and are filled by the selection handlers.
    with gr.Row():
        task_dropdown = gr.Dropdown(
            choices=get_tasks(),
            label="Select Task",
            interactive=True
        )
        # Read-only: categories are shown for information, not for filtering.
        category_dropdown = gr.Dropdown(
            choices=[],
            label="Categories (info only)",
            interactive=False
        )

    with gr.Row():
        dataset_dropdown = gr.Dropdown(
            choices=[],
            label="Select Dataset",
            interactive=True
        )
        metric_dropdown = gr.Dropdown(
            choices=[],
            label="Select Metric",
            interactive=True
        )

    # Info display: also receives the status messages of the export and
    # clear-memory buttons.
    info_text = gr.Markdown("👆 Please select a task to begin")

    # Plot
    plot = gr.Plot(label="SOTA Evolution")

    # Data display: action buttons plus a table that is hidden until requested.
    with gr.Row():
        show_data_btn = gr.Button("📋 Show/Hide Model Data")
        export_btn = gr.Button("💾 Export Current Data (CSV)")
        clear_memory_btn = gr.Button("🧹 Clear Memory", variant="secondary")

    df_display = gr.Dataframe(
        label="Model Data",
        visible=False
    )


    # Update functions
    def update_task_selection(task):
        """Refresh the dependent widgets after a task has been (de)selected.

        Returns a 7-tuple matching the handler's outputs: category dropdown,
        dataset dropdown, metric dropdown, info markdown, current DataFrame,
        plot, and current task.
        """
        if not task:
            # Return Dropdown updates rather than bare lists: a bare list is
            # treated as a new *value*, which would leave the stale choices
            # of the previous task selectable.
            return (
                gr.Dropdown(choices=[], value=None),
                gr.Dropdown(choices=[], value=None),
                gr.Dropdown(choices=[], value=None),
                "👆 Please select a task to begin",
                pd.DataFrame(),
                None,
                None,
            )

        # Load task data from disk
        categories = list(get_categories(task) or [])
        datasets = list(get_datasets_for_task(task) or [])

        info = f"### 📂 **Task:** {task}\n"
        if categories:
            info += f"- **Categories:** {', '.join(categories[:3])}{'...' if len(categories) > 3 else ''} ({len(categories)} total)\n"

        return (
            gr.Dropdown(choices=categories, value=categories[0] if categories else None),
            gr.Dropdown(choices=datasets, value=None),
            gr.Dropdown(choices=[], value=None),  # metrics depend on the dataset
            info,
            pd.DataFrame(),  # no dataset selected yet
            None,  # clear the plot
            task  # Store current task
        )


    def update_dataset_selection(task, dataset_name):
        """Load the selected dataset from disk and refresh the metric choices.

        Returns a 4-tuple matching the handler's outputs: metric dropdown,
        info markdown, current DataFrame, and plot.
        """
        # Dropdown updates (not bare lists) so stale metric choices are cleared.
        if not task or not dataset_name:
            return gr.Dropdown(choices=[], value=None), "", pd.DataFrame(), None

        # Load dataset from disk
        df = get_cached_model_data(task, dataset_name)

        if df.empty:
            return (
                gr.Dropdown(choices=[], value=None),
                f"⚠️ No models found for dataset: {dataset_name}",
                df,
                None,
            )

        # Every column that is not bookkeeping is treated as a metric.
        exclude_cols = {'model_name', 'release_date', 'paper_date', 'paper_url', 'paper_title', 'code_url'}
        metric_cols = [col for col in df.columns if col not in exclude_cols]

        info = f"### 📊 **Dataset:** {dataset_name}\n"
        info += f"- **Models:** {len(df)} models\n"
        info += f"- **Metrics:** {len(metric_cols)} metrics available\n"

        # Only report a date range when at least one real date exists;
        # formatting a missing date (NaT) would raise.
        if 'release_date' in df.columns:
            release_dates = df['release_date'].dropna()
            if not release_dates.empty:
                info += f"- **Date Range:** {release_dates.min().strftime('%Y-%m-%d')} to {release_dates.max().strftime('%Y-%m-%d')}\n"

        if metric_cols:
            info += f"- **Available Metrics:** {', '.join(metric_cols[:5])}{'...' if len(metric_cols) > 5 else ''}"

        return (
            gr.Dropdown(choices=metric_cols, value=metric_cols[0] if metric_cols else None),
            info,
            df,
            None
        )


    def update_plot(df, metric):
        """Return the SOTA figure for *metric*, or None when nothing is selected."""
        if not df.empty and metric:
            return create_sota_plot(df, metric)
        return None


    def toggle_dataframe(df):
        """Return a Dataframe update: hidden for an empty table, visible otherwise.

        The visible table shows the model name and release date (formatted as
        YYYY-MM-DD) followed by every metric column.
        """
        if df.empty:
            return gr.Dataframe(value=pd.DataFrame(), visible=False)

        leading_cols = ['model_name', 'release_date']
        non_metric_cols = {'model_name', 'release_date', 'paper_date',
                           'paper_url', 'paper_title', 'code_url'}
        metric_cols = [name for name in df.columns if name not in non_metric_cols]

        table = df[leading_cols + metric_cols].copy()
        table['release_date'] = table['release_date'].dt.strftime('%Y-%m-%d')
        return gr.Dataframe(value=table, visible=True)


    def export_data(df):
        """Write *df* to a timestamped CSV in the working directory; return a status message."""
        if df.empty:
            return "⚠️ No data to export"

        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = "sota_export_" + timestamp + ".csv"
        df.to_csv(filename, index=False)
        return f"✅ Data exported to {filename} ({len(df)} models)"


    def clear_memory():
        """Run a garbage-collection pass and return a confirmation message."""
        from gc import collect
        collect()
        return "✅ Memory cleared"


    # Event handlers
    # Selecting a task resets everything downstream of it: categories,
    # datasets, metrics, info text, cached table, plot and stored task.
    task_dropdown.change(
        fn=update_task_selection,
        inputs=task_dropdown,
        outputs=[category_dropdown, dataset_dropdown,
                 metric_dropdown, info_text, current_df, plot, current_task]
    )

    # Selecting a dataset loads its table and repopulates the metric list.
    # The task is read from the dropdown itself, not from `current_task`.
    dataset_dropdown.change(
        fn=update_dataset_selection,
        inputs=[task_dropdown, dataset_dropdown],
        outputs=[metric_dropdown, info_text, current_df, plot]
    )

    # Selecting a metric redraws the plot from the table held in state.
    metric_dropdown.change(
        fn=update_plot,
        inputs=[current_df, metric_dropdown],
        outputs=plot
    )

    # Show the model table for the table held in state.
    show_data_btn.click(
        fn=toggle_dataframe,
        inputs=current_df,
        outputs=df_display
    )

    # The export and clear-memory handlers report their result in the info text.
    export_btn.click(
        fn=export_data,
        inputs=current_df,
        outputs=info_text
    )

    clear_memory_btn.click(
        fn=clear_memory,
        inputs=[],
        outputs=info_text
    )

    # Static help text shown below the controls.
    gr.Markdown("""
    ---
    ### 📖 How to Use
    1. **Select a Task** from the first dropdown
    2. **Select a Dataset** to analyze
    3. **Select a Metric** to visualize
    4. The plot shows SOTA model evolution over time with dynamically calculated dates

    ### 💾 Memory Optimization
    - Data is stored on disk and loaded on-demand
    - Only the current task and dataset are kept in memory
    - Use "Clear Memory" button if needed
    - Infinite disk space is utilized for permanent caching

    ### 🎨 Plot Features
    - **🔵 Cyan dots**: SOTA models when released
    - **⚪ Gray dots**: Other models
    - **📈 Cyan line**: SOTA progression
    - **🔍 Hover**: View model details
    - **🏷️ Smart Labels**: SOTA model labels positioned close to the line with intelligent collision detection
    """)


def test_sota_label_positioning():
    """Smoke-test create_sota_plot on a handful of small, hand-made tables.

    Each scenario only checks that plotting does not raise; the outcome is
    printed, not asserted. Always returns True.
    """
    print("🧪 Testing SOTA label positioning...")

    import pandas as pd
    from datetime import datetime

    # Four models with one higher-is-better and one lower-is-better metric.
    suffixes = ['A', 'B', 'C', 'D']
    df_test = pd.DataFrame({
        'model_name': [f'Model {s}' for s in suffixes],
        'release_date': [
            datetime(2020, 1, 1),
            datetime(2020, 6, 1),
            datetime(2021, 1, 1),
            datetime(2021, 6, 1)
        ],
        'paper_title': [f'Paper {s}' for s in suffixes],
        'paper_url': [f'http://example.com/{s.lower()}' for s in suffixes],
        'code_url': [f'http://github.com/{s.lower()}' for s in suffixes],
        'accuracy': [0.85, 0.87, 0.90, 0.92],  # Higher-better metric
        'error_rate': [0.15, 0.13, 0.10, 0.08]  # Lower-better metric
    })

    def with_column(column, values):
        """Return a copy of the base table with one extra/replaced column."""
        frame = df_test.copy()
        frame[column] = values
        return frame

    # (what is being tested, result label, suffix of the pass message,
    #  table factory, metric to plot)
    scenarios = [
        ("higher-better metric (accuracy)", "Higher-better metric", "",
         lambda: df_test, 'accuracy'),
        ("lower-better metric (error_rate)", "Lower-better metric", "",
         lambda: df_test, 'error_rate'),
        ("empty dataframe", "Empty data", "",
         lambda: pd.DataFrame(), 'test_metric'),
        ("string metric data", "String metric", " (handled gracefully)",
         lambda: with_column('string_metric', ['low', 'medium', 'high', 'very_high']),
         'string_metric'),
        ("mixed data types", "Mixed data", "",
         lambda: with_column('mixed_metric', [0.85, 'N/A', 0.90, 0.92]),
         'mixed_metric'),
        ("paper_date column", "Paper date parsing", "",
         lambda: with_column('paper_date', ['2015-03-15', '2018-invalid', '2021-12-01', '2022']),
         'accuracy'),
    ]

    for description, label, pass_suffix, make_frame, metric in scenarios:
        print(f"  Testing with {description}...")
        try:
            create_sota_plot(make_frame(), metric)
            print(f"  ✅ {label} test passed{pass_suffix}")
        except Exception as e:
            print(f"  ❌ {label} test failed: {e}")

    print("🎉 SOTA label positioning tests completed!")
    return True

# Start the Gradio app. This call sits at module level, so it runs whenever
# this file is executed or imported.
demo.launch()