#!/usr/bin/env python3
"""
Evals Leaderboard - Gradio app for displaying model evaluation scores.
Reads leaderboard data from the hf-skills/evals-leaderboard dataset.
Run collect_evals.py separately to update the dataset.
Usage:
python app.py
"""
from __future__ import annotations
import json
import gradio as gr
import requests
TABLE_HEADERS = [
    "Model",
    "Benchmark",
    "Score",
    "Source",
]
TABLE_DATATYPES = [
    "markdown",
    "text",
    "number",
    "markdown",
]

DATASET_REPO = "hf-skills/evals-leaderboard"
LEADERBOARD_URL = f"https://huggingface.co/datasets/{DATASET_REPO}/raw/main/data/leaderboard.jsonl"
METADATA_URL = f"https://huggingface.co/datasets/{DATASET_REPO}/raw/main/data/metadata.json"
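
# The raw/main URLs above serve the current file contents from the dataset's
# main branch; the repo is public, so no Hugging Face token should be required.
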
def format_model_link(model_id: str) -> str:
    """Format model ID as a clickable link."""
    return f"[{model_id}](https://huggingface.co/{model_id})"


def format_source_link(source_type: str, contributor: str, source_url: str) -> str:
    """Format source as a clickable link."""
    return f"{source_type} by [{contributor}]({source_url})"
def fetch_leaderboard() -> tuple[list[dict], dict]:
    """Fetch leaderboard data from the HF dataset."""
    # Fetch leaderboard JSONL
    resp = requests.get(LEADERBOARD_URL, timeout=30)
    resp.raise_for_status()
    leaderboard = [json.loads(line) for line in resp.text.strip().split("\n") if line]

    # Fetch metadata
    resp = requests.get(METADATA_URL, timeout=30)
    resp.raise_for_status()
    metadata = resp.json()

    return leaderboard, metadata
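
# Illustrative shape of one leaderboard.jsonl record, inferred from the fields
# consumed in refresh_handler below (example values are hypothetical, not an
# authoritative schema):
#   {"model_id": "org/model", "benchmark": "MMLU", "score": 71.3,
#    "source_type": "model-index", "contributor": "someuser",
#    "source_url": "https://huggingface.co/org/model"}
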
def refresh_handler() -> tuple[str, list[list]]:
    """Refresh the leaderboard data from the dataset."""
    try:
        leaderboard, metadata = fetch_leaderboard()

        # Build table rows
        rows = []
        for entry in leaderboard:
            rows.append(
                [
                    format_model_link(entry["model_id"]),
                    entry["benchmark"],
                    entry["score"],
                    format_source_link(
                        entry["source_type"],
                        entry["contributor"],
                        entry["source_url"],
                    ),
                ]
            )

        # Blank lines between entries so each renders as its own Markdown paragraph
        status = "\n\n".join(
            [
                f"**Data from:** [{DATASET_REPO}](https://huggingface.co/datasets/{DATASET_REPO})",
                f"**Last updated:** {metadata.get('generated_at', 'unknown')}",
                f"**Models with scores:** {metadata.get('models_with_scores', 'unknown')}",
                f"**Total entries:** {metadata.get('total_entries', len(leaderboard))}",
            ]
        )
        return status, rows
    except Exception as e:
        return f"❌ Failed to load leaderboard: {e}", []
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # 📊 HF Evaluation Leaderboard

        Shows MMLU, BigCodeBench, and ARC MC scores for trending
        text-generation models, pulled from each model's model-index
        metadata or its open pull requests.
        """
    )
    status_box = gr.Markdown("Loading leaderboard...")
    leaderboard_table = gr.Dataframe(
        headers=TABLE_HEADERS,
        datatype=TABLE_DATATYPES,
        interactive=False,
        wrap=True,
    )
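
    # Populate the status line and table once the page loads; demo.load runs
    # the handler on every page load in the browser.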
    demo.load(
        refresh_handler,
        outputs=[status_box, leaderboard_table],
    )

    gr.Markdown(
        f"""
        ---
        **Links:**

        - [Dataset: {DATASET_REPO}](https://huggingface.co/datasets/{DATASET_REPO})
        - [GitHub Repository](https://github.com/huggingface/skills)
        """
    )
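
# Running the file directly serves the UI locally; passing share=True to
# demo.launch() (a standard Gradio option) would expose a temporary public link
# if one is needed.
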
if __name__ == "__main__":
    demo.launch()