#!/usr/bin/env python3
"""
Evals Leaderboard - Gradio app for displaying model evaluation scores.

Reads leaderboard data from the hf-skills/evals-leaderboard dataset.
Run collect_evals.py separately to update the dataset.

Usage:
    python app.py
"""

from __future__ import annotations

import json

import gradio as gr
import requests

# Column titles for the leaderboard table, in display order.
TABLE_HEADERS = [
    "Model",
    "Benchmark",
    "Score",
    "Source",
]

# Gradio datatype for each column above (same order): "markdown" renders the
# model/source links as clickable; "text"/"number" display raw values.
TABLE_DATATYPES = [
    "markdown",
    "text",
    "number",
    "markdown",
]


# HF dataset holding the pre-computed leaderboard (written by collect_evals.py).
DATASET_REPO = "hf-skills/evals-leaderboard"
# Raw-file URLs into that dataset: JSONL score rows and a small metadata JSON.
LEADERBOARD_URL = f"https://huggingface.co/datasets/{DATASET_REPO}/raw/main/data/leaderboard.jsonl"
METADATA_URL = f"https://huggingface.co/datasets/{DATASET_REPO}/raw/main/data/metadata.json"


def format_model_link(model_id: str) -> str:
    """Render a model repo ID as a markdown link to its Hub page."""
    url = f"https://huggingface.co/{model_id}"
    return f"[{model_id}]({url})"


def format_source_link(source_type: str, contributor: str, source_url: str) -> str:
    """Render a score's provenance as '<type> by <contributor link>' markdown."""
    contributor_link = f"[{contributor}]({source_url})"
    return f"{source_type} by {contributor_link}"


def fetch_leaderboard() -> tuple[list[dict], dict]:
    """Download leaderboard rows and metadata from the HF dataset.

    Returns:
        A ``(leaderboard, metadata)`` pair: the parsed JSONL rows as a list
        of dicts, and the metadata JSON object.

    Raises:
        requests.HTTPError: if either download returns an error status.
    """
    # Leaderboard rows: one JSON object per non-empty line.
    rows_resp = requests.get(LEADERBOARD_URL, timeout=30)
    rows_resp.raise_for_status()
    leaderboard = [
        json.loads(line)
        for line in rows_resp.text.strip().split("\n")
        if line
    ]

    # Metadata: a single JSON document.
    meta_resp = requests.get(METADATA_URL, timeout=30)
    meta_resp.raise_for_status()

    return leaderboard, meta_resp.json()


def refresh_handler() -> tuple[str, list[list]]:
    """Refresh the leaderboard data from the dataset.

    Returns:
        ``(status_markdown, table_rows)``. On any failure the rows are empty
        and the status carries the error message, so the UI never crashes.
    """
    try:
        leaderboard, metadata = fetch_leaderboard()

        # One table row per leaderboard entry, in dataset order.
        table_rows = [
            [
                format_model_link(entry["model_id"]),
                entry["benchmark"],
                entry["score"],
                format_source_link(
                    entry["source_type"],
                    entry["contributor"],
                    entry["source_url"],
                ),
            ]
            for entry in leaderboard
        ]

        status_lines = [
            f"**Data from:** [{DATASET_REPO}](https://huggingface.co/datasets/{DATASET_REPO})",
            f"**Last updated:** {metadata.get('generated_at', 'unknown')}",
            f"**Models with scores:** {metadata.get('models_with_scores', 'unknown')}",
            f"**Total entries:** {metadata.get('total_entries', len(leaderboard))}",
        ]
        return "\n".join(status_lines), table_rows

    except Exception as e:  # boundary: surface any failure in the status box
        return f"❌ Failed to load leaderboard: {e}", []


# --- UI layout (built at import time so `demo` is importable by hosts) ---
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # 📊 HF Evaluation Leaderboard
        
        Shows MMLU, BigCodeBench, and ARC MC scores pulled from model-index
        metadata or their pull requests for trending text-generation models.
        """
    )

    # Placeholder until demo.load replaces it with the real status text.
    status_box = gr.Markdown("Loading leaderboard...")

    # Read-only table; headers/datatypes are the module-level constants.
    leaderboard_table = gr.Dataframe(
        headers=TABLE_HEADERS,
        datatype=TABLE_DATATYPES,
        interactive=False,
        wrap=True,
    )

    # Fetch and render the data once when the page loads.
    demo.load(
        refresh_handler,
        outputs=[status_box, leaderboard_table],
    )

    gr.Markdown(
        f"""
        ---
        
        **Links:**
        - [Dataset: {DATASET_REPO}](https://huggingface.co/datasets/{DATASET_REPO})
        - [GitHub Repository](https://github.com/huggingface/skills)
        """
    )


if __name__ == "__main__":
    demo.launch()