burtenshaw (HF Staff) committed on
Commit 6932b0c · verified · 1 Parent(s): ecc20dd

Upload folder using huggingface_hub

Files changed (3):
  1. app.py +84 -368
  2. collect_evals.py +480 -0
  3. requirements.txt +5 -0
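The commit message indicates the three files were pushed with the huggingface_hub client. A minimal sketch of how such an upload can be done (the target repo ID below is a placeholder, not taken from this commit):

from huggingface_hub import HfApi

api = HfApi()  # picks up HF_TOKEN from the environment if set
api.upload_folder(
    folder_path=".",                       # local folder holding app.py, collect_evals.py, requirements.txt
    repo_id="<user-or-org>/<space-name>",  # placeholder: the destination repo is not named in this commit
    repo_type="space",
    commit_message="Upload folder using huggingface_hub",
)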
app.py CHANGED
@@ -1,23 +1,21 @@
 from __future__ import annotations

-import re
-from dataclasses import dataclass
-from datetime import datetime, timezone
-from pathlib import Path
-from typing import Any, Dict, List, Optional

 import gradio as gr
 import requests
-import yaml
-from huggingface_hub import hf_hub_download
-from huggingface_hub.utils import HfHubHTTPError

-API_BASE = "https://huggingface.co/api"
-PIPELINE_FILTER = "text-generation"
-TRENDING_LIMIT = 10
-TRENDING_FETCH_LIMIT = 50
-PR_SCAN_LIMIT = 40
-USER_AGENT = "skills-evals-leaderboard/0.2"
 TABLE_HEADERS = [
     "Model",
     "Benchmark",
@@ -26,393 +24,111 @@ TABLE_HEADERS = [
 ]

 TABLE_DATATYPES = [
-    "text",
     "text",
     "number",
     "markdown",
 ]


-def _normalize(text: Optional[str]) -> str:
-    if not text:
-        return ""
-    text = text.lower()
-    text = re.sub(r"[^a-z0-9]+", " ", text)
-    return text.strip()
-
-
-def _coerce_score(value: Any) -> Optional[float]:
-    if value is None:
-        return None
-    if isinstance(value, (int, float)):
-        return float(value)
-    if isinstance(value, str):
-        candidate = value.strip()
-        if candidate.endswith("%"):
-            candidate = candidate[:-1]
-        try:
-            return float(candidate)
-        except ValueError:
-            return None
-    return None
-
-
-@dataclass(frozen=True)
-class BenchmarkSpec:
-    key: str
-    label: str
-    aliases: tuple[str, ...]

-    def matches(self, fields: List[str]) -> bool:
-        for alias in self.aliases:
-            alias_norm = _normalize(alias)
-            if not alias_norm:
-                continue
-            for field in fields:
-                if alias_norm in field:
-                    return True
-        return False


-BENCHMARKS: Dict[str, BenchmarkSpec] = {
-    "mmlu": BenchmarkSpec(
-        key="mmlu",
-        label="MMLU",
-        aliases=("mmlu", "massive multitask language understanding"),
-    ),
-    "bigcodebench": BenchmarkSpec(
-        key="bigcodebench",
-        label="BigCodeBench",
-        aliases=("bigcodebench", "big code bench"),
-    ),
-    "arc_mc": BenchmarkSpec(
-        key="arc_mc",
-        label="ARC MC",
-        aliases=(
-            "arc mc",
-            "arc-challenge",
-            "arc challenge",
-            "arc multiple choice",
-            "arc c",
-        ),
-    ),
-}


-class LeaderboardFetcher:
-    def __init__(self) -> None:
-        self.session = requests.Session()
-        self.session.headers.update({"User-Agent": USER_AGENT})
-        self.logs: List[str] = []

-    def build(self) -> Dict[str, Any]:
-        trending = self._fetch_trending_models()
-        leaders: List[Dict[str, Any]] = []
-        for entry in trending:
-            repo_id = entry.get("modelId") or entry.get("id")
-            if not repo_id:
-                continue
-            scores = self._collect_scores(repo_id)
-            if scores["scores"]:
-                leaders.append(scores)
-        return self._compose_tables(leaders)

-    def log_text(self) -> str:
-        if not self.logs:
-            return "No actions recorded."
-        return "\n".join(self.logs)

-    def _fetch_trending_models(self) -> List[Dict[str, Any]]:
-        params = {"sort": "trendingScore", "limit": TRENDING_FETCH_LIMIT}
-        response = self.session.get(
-            f"{API_BASE}/models",
-            params=params,
-            timeout=30,
-        )
-        response.raise_for_status()
-        data = response.json()
-        if not isinstance(data, list):
-            raise ValueError("Unexpected trending response.")
-        filtered = [
-            model
-            for model in data
-            if (model.get("pipeline_tag") == PIPELINE_FILTER or PIPELINE_FILTER in (model.get("tags") or []))
-        ]
-        if not filtered:
-            self.logs.append("⚠️ No text-generation models in trending feed.")
-            return []
-        limited = filtered[:TRENDING_LIMIT]
-        if len(limited) < TRENDING_LIMIT:
-            self.logs.append(f"⚠️ Only {len(limited)} text-generation models available.")
-        else:
-            self.logs.append(f"🔍 Loaded {TRENDING_LIMIT} trending text-generation models.")
-        return limited
-
-    def _collect_scores(self, repo_id: str) -> Dict[str, Any]:
-        owner = repo_id.split("/")[0]
-        card_meta = self._read_model_card(repo_id)
-        model_index = card_meta.get("model-index")
-        if model_index:
-            self.logs.append(f"✅ {repo_id}: model card metadata found.")
-            scores = self._extract_scores(
-                repo_id=repo_id,
-                model_index=model_index,
-                contributor=owner,
-                source_type="model-card",
-                source_url=f"https://huggingface.co/{repo_id}",
-                revision="main",
-            )
-            if scores:
-                return {"model_id": repo_id, "scores": scores}
-
-        prs = self._fetch_pull_requests(repo_id)
-        for pr in prs:
-            revision = f"refs/pr/{pr['num']}"
-            pr_meta = self._read_model_card(repo_id, revision=revision)
-            pr_index = pr_meta.get("model-index")
-            if not pr_index:
-                continue
-            author_info = pr.get("author", {}) or {}
-            contributor = author_info.get("name") or author_info.get("fullname") or "unknown-author"
-            discussion_path = f"{repo_id}/discussions/{pr['num']}"
-            source_url = f"https://huggingface.co/{discussion_path}"
-            scores = self._extract_scores(
-                repo_id=repo_id,
-                model_index=pr_index,
-                contributor=contributor,
-                source_type="pull-request",
-                source_url=source_url,
-                revision=revision,
-            )
-            if scores:
-                note = f"📝 {repo_id}: PR #{pr['num']} by {contributor}."
-                self.logs.append(note)
-                return {"model_id": repo_id, "scores": scores}

-        self.logs.append(f"⚠️ {repo_id}: no target benchmarks located.")
-        return {"model_id": repo_id, "scores": {}}

-    def _read_model_card(
-        self,
-        repo_id: str,
-        revision: Optional[str] = None,
-    ) -> Dict[str, Any]:
-        try:
-            path = hf_hub_download(
-                repo_id=repo_id,
-                filename="README.md",
-                repo_type="model",
-                revision=revision,
-            )
-        except HfHubHTTPError as err:
-            ctx = f"{repo_id} ({revision or 'main'})"
-            self.logs.append(f"🚫 {ctx}: README download failed ({err}).")
-            return {}
-        text = Path(path).read_text(encoding="utf-8", errors="ignore")
-        return self._parse_front_matter(text)
-
-    @staticmethod
-    def _parse_front_matter(content: str) -> Dict[str, Any]:
-        content = content.lstrip("\ufeff")
-        if not content.startswith("---"):
-            return {}
-        lines = content.splitlines()
-        end_idx = None
-        for idx, line in enumerate(lines[1:], start=1):
-            if line.strip() == "---":
-                end_idx = idx
-                break
-        if end_idx is None:
-            return {}
-        front_matter = "\n".join(lines[1:end_idx])
-        try:
-            data = yaml.safe_load(front_matter) or {}
-            return data if isinstance(data, dict) else {}
-        except yaml.YAMLError:
-            return {}
-
-    def _fetch_pull_requests(self, repo_id: str) -> List[Dict[str, Any]]:
-        url = f"{API_BASE}/models/{repo_id}/discussions"
-        try:
-            response = self.session.get(
-                url,
-                params={"limit": PR_SCAN_LIMIT},
-                timeout=30,
             )
-            response.raise_for_status()
-        except requests.RequestException as err:
-            self.logs.append(f"🚫 {repo_id}: PR list request failed ({err}).")
-            return []

-        payload = response.json()
-        discussions = payload.get("discussions", [])
-        prs = [disc for disc in discussions if disc.get("isPullRequest")]
-        prs.sort(key=lambda item: item.get("createdAt", ""), reverse=True)
-        if prs:
-            self.logs.append(f"📬 {repo_id}: scanning {len(prs)} pull requests.")
-        return prs
-
-    def _extract_scores(
-        self,
-        repo_id: str,
-        model_index: Any,
-        contributor: str,
-        source_type: str,
-        source_url: str,
-        revision: str,
-    ) -> Dict[str, Dict[str, Any]]:
-        if not isinstance(model_index, list):
-            return {}
-        scores: Dict[str, Dict[str, Any]] = {}
-        for entry in model_index:
-            if not isinstance(entry, dict):
-                continue
-            model_name = entry.get("name") or repo_id.split("/")[-1]
-            for result in entry.get("results", []):
-                dataset_info = result.get("dataset") or {}
-                dataset_name = dataset_info.get("name")
-                dataset_type = dataset_info.get("type")
-                task_info = result.get("task") or {}
-                task_type = task_info.get("type")
-                for metric in result.get("metrics", []):
-                    benchmark_key = self._match_benchmark(
-                        dataset_name,
-                        dataset_type,
-                        metric,
-                    )
-                    if not benchmark_key:
-                        continue
-                    raw_value = metric.get("value")
-                    value = _coerce_score(raw_value)
-                    if value is None:
-                        continue
-                    unit = metric.get("unit") or ""
-                    is_pct = isinstance(raw_value, str) and raw_value.strip().endswith("%")
-                    if not unit and is_pct:
-                        unit = "%"
-                    metric_name = metric.get("name") or metric.get("type") or ""
-                    payload = {
-                        "model": repo_id,
-                        "model_name": model_name,
-                        "benchmark_key": benchmark_key,
-                        "benchmark_label": BENCHMARKS[benchmark_key].label,
-                        "value": value,
-                        "unit": unit,
-                        "dataset": dataset_name or dataset_type or "",
-                        "task_type": task_type or "",
-                        "metric_name": metric_name,
-                        "contributor": contributor,
-                        "source_type": source_type,
-                        "source_url": source_url,
-                        "revision": revision,
-                    }
-                    existing = scores.get(benchmark_key)
-                    if not existing or value > existing["value"]:
-                        scores[benchmark_key] = payload
-        return scores
-
-    def _match_benchmark(
-        self,
-        dataset_name: Optional[str],
-        dataset_type: Optional[str],
-        metric: Dict[str, Any],
-    ) -> Optional[str]:
-        fields = [
-            _normalize(dataset_name),
-            _normalize(dataset_type),
-            _normalize(metric.get("name")),
-            _normalize(metric.get("type")),
-        ]
-        fields = [field for field in fields if field]
-        for key, spec in BENCHMARKS.items():
-            if spec.matches(fields):
-                return key
-        return None
-
-    def _compose_tables(self, entries: List[Dict[str, Any]]) -> Dict[str, Any]:
-        all_rows: List[Dict[str, Any]] = []
-        per_benchmark: Dict[str, List[Dict[str, Any]]] = {key: [] for key in BENCHMARKS}
-        for entry in entries:
-            for benchmark_key, payload in entry["scores"].items():
-                row = {
-                    "Model": entry["model_id"],
-                    "Benchmark": BENCHMARKS[benchmark_key].label,
-                    "Score": round(payload["value"], 2),
-                    "Source": f"{payload['source_type']} by [{payload['contributor']}]({payload['source_url']})",
-                }
-                all_rows.append(row)
-                per_benchmark[benchmark_key].append(row)
-
-        for rows in per_benchmark.values():
-            rows.sort(key=lambda r: r["Score"], reverse=True)
-        all_rows.sort(key=lambda r: r["Score"], reverse=True)
-
-        return {
-            "all_rows": all_rows,
-            "per_benchmark": per_benchmark,
-            "stats": {
-                "models_with_scores": len(entries),
-                "row_count": len(all_rows),
-                "generated_at": datetime.now(timezone.utc).isoformat(),
-            },
-        }
-
-
-def _rows_to_matrix(rows: List[Dict[str, Any]]) -> List[List[Any]]:
-    return [[row.get(header, "") for header in TABLE_HEADERS] for row in rows]
-
-
-def refresh_handler() -> List[Any]:
-    fetcher = LeaderboardFetcher()
-    try:
-        result = fetcher.build()
-        stats = result["stats"]
         status = "\n".join(
             [
-                f"Last updated: {stats['generated_at']}",
-                f"Models with scores: {stats['models_with_scores']}",
-                f"Total entries: {stats['row_count']}",
-                "",
-                fetcher.log_text(),
             ]
         )
-        return [
-            status,
-            _rows_to_matrix(result["all_rows"]),
-        ]
-    except Exception as exc:  # pylint: disable=broad-except
-        error = f"❌ Failed to refresh leaderboard: {exc}"
-        empty: List[List[Any]] = []
-        return [error, empty]


-with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
-        # HF Evaluation Leaderboard
         Shows MMLU, BigCodeBench, and ARC MC scores pulled from model-index
-        metadata or their pull requests for the top text-generation models.
         """
     )
-    refresh_button = gr.Button("Refresh", variant="primary")
-    status_box = gr.Markdown("")

-    all_table = gr.Dataframe(headers=TABLE_HEADERS, interactive=False, datatype=TABLE_DATATYPES)

-    refresh_button.click(  # pylint: disable=no-member
-        refresh_handler,
-        inputs=[],
-        outputs=[
-            status_box,
-            all_table,
-        ],
     )
-    demo.load(  # pylint: disable=no-member
         refresh_handler,
-        outputs=[
-            status_box,
-            all_table,
-        ],
     )

+#!/usr/bin/env python3
+"""
+Evals Leaderboard - Gradio app for displaying model evaluation scores.
+
+Reads leaderboard data from the hf-skills/evals-leaderboard dataset.
+Run collect_evals.py separately to update the dataset.
+
+Usage:
+    python app.py
+"""
+
 from __future__ import annotations

+import json

 import gradio as gr
 import requests

 TABLE_HEADERS = [
     "Model",
     "Benchmark",
 ]

 TABLE_DATATYPES = [
+    "markdown",
     "text",
     "number",
     "markdown",
 ]


+DATASET_REPO = "hf-skills/evals-leaderboard"
+LEADERBOARD_URL = f"https://huggingface.co/datasets/{DATASET_REPO}/raw/main/data/leaderboard.jsonl"
+METADATA_URL = f"https://huggingface.co/datasets/{DATASET_REPO}/raw/main/data/metadata.json"
+
+
+def format_model_link(model_id: str) -> str:
+    """Format model ID as a clickable link."""
+    return f"[{model_id}](https://huggingface.co/{model_id})"
+
+
+def format_source_link(source_type: str, contributor: str, source_url: str) -> str:
+    """Format source as a clickable link."""
+    return f"{source_type} by [{contributor}]({source_url})"
+
+
+def fetch_leaderboard() -> tuple[list[dict], dict]:
+    """Fetch leaderboard data from the HF dataset."""
+    # Fetch leaderboard JSONL
+    resp = requests.get(LEADERBOARD_URL, timeout=30)
+    resp.raise_for_status()
+    leaderboard = [json.loads(line) for line in resp.text.strip().split("\n") if line]
+
+    # Fetch metadata
+    resp = requests.get(METADATA_URL, timeout=30)
+    resp.raise_for_status()
+    metadata = resp.json()
+
+    return leaderboard, metadata
+
+
+def refresh_handler() -> tuple[str, list[list]]:
+    """Refresh the leaderboard data from the dataset."""
+    try:
+        leaderboard, metadata = fetch_leaderboard()
+
+        # Build table rows
+        rows = []
+        for entry in leaderboard:
+            rows.append(
+                [
+                    format_model_link(entry["model_id"]),
+                    entry["benchmark"],
+                    entry["score"],
+                    format_source_link(
+                        entry["source_type"],
+                        entry["contributor"],
+                        entry["source_url"],
+                    ),
+                ]
             )

         status = "\n".join(
             [
+                f"**Data from:** [{DATASET_REPO}](https://huggingface.co/datasets/{DATASET_REPO})",
+                f"**Last updated:** {metadata.get('generated_at', 'unknown')}",
+                f"**Models with scores:** {metadata.get('models_with_scores', 'unknown')}",
+                f"**Total entries:** {metadata.get('total_entries', len(leaderboard))}",
             ]
         )
+
+        return status, rows
+
+    except Exception as e:
+        return f"❌ Failed to load leaderboard: {e}", []


+with gr.Blocks() as demo:
     gr.Markdown(
         """
+        # 📊 HF Evaluation Leaderboard
+
         Shows MMLU, BigCodeBench, and ARC MC scores pulled from model-index
+        metadata or their pull requests for trending text-generation models.
         """
     )

+    status_box = gr.Markdown("Loading leaderboard...")

+    leaderboard_table = gr.Dataframe(
+        headers=TABLE_HEADERS,
+        datatype=TABLE_DATATYPES,
+        interactive=False,
+        wrap=True,
     )
+
+    demo.load(
         refresh_handler,
+        outputs=[status_box, leaderboard_table],
+    )
+
+    gr.Markdown(
+        f"""
+        ---
+
+        **Links:**
+        - [Dataset: {DATASET_REPO}](https://huggingface.co/datasets/{DATASET_REPO})
+        - [GitHub Repository](https://github.com/huggingface/skills)
+        """
+    )

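For reference, app.py expects each line of data/leaderboard.jsonl to be a flat JSON record with the fields written by collect_evals.py below (model_id, benchmark, benchmark_key, score, source_type, source_url, contributor, collected_at). After json.loads, each line becomes a dict like this (placeholder values, not real scores):

{
    "model_id": "org/model-name",
    "benchmark": "MMLU",
    "benchmark_key": "mmlu",
    "score": 71.3,
    "source_type": "model-card",
    "source_url": "https://huggingface.co/org/model-name",
    "contributor": "org",
    "collected_at": "2025-01-01T00:00:00+00:00",
}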
collect_evals.py ADDED
@@ -0,0 +1,480 @@
+#!/usr/bin/env python3
+"""
+Collect evaluation scores from trending models' model-index metadata.
+
+Scans trending text-generation models on the Hub and extracts benchmark
+scores from their model-index metadata or open pull requests.
+
+Results are saved to a dataset for the evals leaderboard.
+
+Usage:
+    python collect_evals.py [--push-to-hub]
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import re
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import requests
+import yaml
+from huggingface_hub import hf_hub_download
+from huggingface_hub.utils import HfHubHTTPError
+
+API_BASE = "https://huggingface.co/api"
+PIPELINE_FILTER = "text-generation"
+TRENDING_LIMIT = 50
+TRENDING_FETCH_LIMIT = 100
+PR_SCAN_LIMIT = 40
+USER_AGENT = "skills-evals-leaderboard/0.3"
+
+
+def _normalize(text: Optional[str]) -> str:
+    if not text:
+        return ""
+    text = text.lower()
+    text = re.sub(r"[^a-z0-9]+", " ", text)
+    return text.strip()
+
+
+def _coerce_score(value: Any) -> Optional[float]:
+    if value is None:
+        return None
+    if isinstance(value, (int, float)):
+        return float(value)
+    if isinstance(value, str):
+        candidate = value.strip()
+        if candidate.endswith("%"):
+            candidate = candidate[:-1]
+        try:
+            return float(candidate)
+        except ValueError:
+            return None
+    return None
+
+
+@dataclass(frozen=True)
+class BenchmarkSpec:
+    key: str
+    label: str
+    aliases: tuple[str, ...]
+
+    def matches(self, fields: List[str]) -> bool:
+        for alias in self.aliases:
+            alias_norm = _normalize(alias)
+            if not alias_norm:
+                continue
+            for field in fields:
+                if alias_norm in field:
+                    return True
+        return False
+
+
+BENCHMARKS: Dict[str, BenchmarkSpec] = {
+    "mmlu": BenchmarkSpec(
+        key="mmlu",
+        label="MMLU",
+        aliases=("mmlu", "massive multitask language understanding"),
+    ),
+    "bigcodebench": BenchmarkSpec(
+        key="bigcodebench",
+        label="BigCodeBench",
+        aliases=("bigcodebench", "big code bench"),
+    ),
+    "arc_mc": BenchmarkSpec(
+        key="arc_mc",
+        label="ARC MC",
+        aliases=(
+            "arc mc",
+            "arc-challenge",
+            "arc challenge",
+            "arc multiple choice",
+            "arc c",
+        ),
+    ),
+}
+
+
+class EvalsCollector:
+    """Collects evaluation scores from model-index metadata."""
+
+    def __init__(self, token: str | None = None) -> None:
+        self.token = token
+        self.session = requests.Session()
+        self.session.headers.update({"User-Agent": USER_AGENT})
+        if token:
+            self.session.headers.update({"Authorization": f"Bearer {token}"})
+        self.logs: List[str] = []
+        self.results: List[Dict[str, Any]] = []
+
+    def log(self, message: str) -> None:
+        """Add a log message."""
+        print(message)
+        self.logs.append(message)
+
+    def collect_all(self) -> List[Dict[str, Any]]:
+        """Collect evaluation scores from trending models."""
+        self.log("🔍 Fetching trending text-generation models...")
+        trending = self._fetch_trending_models()
+
+        for entry in trending:
+            repo_id = entry.get("modelId") or entry.get("id")
+            if not repo_id:
+                continue
+            scores = self._collect_scores(repo_id)
+            if scores["scores"]:
+                self.results.extend(self._format_scores(repo_id, scores["scores"]))
+
+        self.log(f"✅ Collected {len(self.results)} evaluation entries")
+        return self.results
+
+    def _fetch_trending_models(self) -> List[Dict[str, Any]]:
+        params = {"sort": "trendingScore", "limit": TRENDING_FETCH_LIMIT}
+        response = self.session.get(
+            f"{API_BASE}/models",
+            params=params,
+            timeout=30,
+        )
+        response.raise_for_status()
+        data = response.json()
+        if not isinstance(data, list):
+            raise ValueError("Unexpected trending response.")
+        filtered = [
+            model
+            for model in data
+            if (model.get("pipeline_tag") == PIPELINE_FILTER or PIPELINE_FILTER in (model.get("tags") or []))
+        ]
+        if not filtered:
+            self.log("⚠️ No text-generation models in trending feed.")
+            return []
+        limited = filtered[:TRENDING_LIMIT]
+        self.log(f"📊 Found {len(limited)} trending text-generation models")
+        return limited
+
+    def _collect_scores(self, repo_id: str) -> Dict[str, Any]:
+        owner = repo_id.split("/")[0]
+        card_meta = self._read_model_card(repo_id)
+        model_index = card_meta.get("model-index")
+        if model_index:
+            self.log(f"✅ {repo_id}: model card metadata found.")
+            scores = self._extract_scores(
+                repo_id=repo_id,
+                model_index=model_index,
+                contributor=owner,
+                source_type="model-card",
+                source_url=f"https://huggingface.co/{repo_id}",
+                revision="main",
+            )
+            if scores:
+                return {"model_id": repo_id, "scores": scores}
+
+        prs = self._fetch_pull_requests(repo_id)
+        for pr in prs:
+            revision = f"refs/pr/{pr['num']}"
+            pr_meta = self._read_model_card(repo_id, revision=revision)
+            pr_index = pr_meta.get("model-index")
+            if not pr_index:
+                continue
+            author_info = pr.get("author", {}) or {}
+            contributor = author_info.get("name") or author_info.get("fullname") or "unknown-author"
+            discussion_path = f"{repo_id}/discussions/{pr['num']}"
+            source_url = f"https://huggingface.co/{discussion_path}"
+            scores = self._extract_scores(
+                repo_id=repo_id,
+                model_index=pr_index,
+                contributor=contributor,
+                source_type="pull-request",
+                source_url=source_url,
+                revision=revision,
+            )
+            if scores:
+                note = f"📝 {repo_id}: PR #{pr['num']} by {contributor}."
+                self.log(note)
+                return {"model_id": repo_id, "scores": scores}
+
+        self.log(f"⚠️ {repo_id}: no target benchmarks located.")
+        return {"model_id": repo_id, "scores": {}}
+
+    def _read_model_card(
+        self,
+        repo_id: str,
+        revision: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        try:
+            path = hf_hub_download(
+                repo_id=repo_id,
+                filename="README.md",
+                repo_type="model",
+                revision=revision,
+                token=self.token,
+            )
+        except HfHubHTTPError as err:
+            ctx = f"{repo_id} ({revision or 'main'})"
+            self.log(f"🚫 {ctx}: README download failed ({err}).")
+            return {}
+        text = Path(path).read_text(encoding="utf-8", errors="ignore")
+        return self._parse_front_matter(text)
+
+    @staticmethod
+    def _parse_front_matter(content: str) -> Dict[str, Any]:
+        content = content.lstrip("\ufeff")
+        if not content.startswith("---"):
+            return {}
+        lines = content.splitlines()
+        end_idx = None
+        for idx, line in enumerate(lines[1:], start=1):
+            if line.strip() == "---":
+                end_idx = idx
+                break
+        if end_idx is None:
+            return {}
+        front_matter = "\n".join(lines[1:end_idx])
+        try:
+            data = yaml.safe_load(front_matter) or {}
+            return data if isinstance(data, dict) else {}
+        except yaml.YAMLError:
+            return {}
+
+    def _fetch_pull_requests(self, repo_id: str) -> List[Dict[str, Any]]:
+        url = f"{API_BASE}/models/{repo_id}/discussions"
+        try:
+            response = self.session.get(
+                url,
+                params={"limit": PR_SCAN_LIMIT},
+                timeout=30,
+            )
+            response.raise_for_status()
+        except requests.RequestException as err:
+            self.log(f"🚫 {repo_id}: PR list request failed ({err}).")
+            return []
+
+        payload = response.json()
+        discussions = payload.get("discussions", [])
+        prs = [disc for disc in discussions if disc.get("isPullRequest")]
+        prs.sort(key=lambda item: item.get("createdAt", ""), reverse=True)
+        if prs:
+            self.log(f"📬 {repo_id}: scanning {len(prs)} pull requests.")
+        return prs
+
+    def _extract_scores(
+        self,
+        repo_id: str,
+        model_index: Any,
+        contributor: str,
+        source_type: str,
+        source_url: str,
+        revision: str,
+    ) -> Dict[str, Dict[str, Any]]:
+        if not isinstance(model_index, list):
+            return {}
+        scores: Dict[str, Dict[str, Any]] = {}
+        for entry in model_index:
+            if not isinstance(entry, dict):
+                continue
+            model_name = entry.get("name") or repo_id.split("/")[-1]
+            for result in entry.get("results", []):
+                dataset_info = result.get("dataset") or {}
+                dataset_name = dataset_info.get("name")
+                dataset_type = dataset_info.get("type")
+                task_info = result.get("task") or {}
+                task_type = task_info.get("type")
+                for metric in result.get("metrics", []):
+                    benchmark_key = self._match_benchmark(
+                        dataset_name,
+                        dataset_type,
+                        metric,
+                    )
+                    if not benchmark_key:
+                        continue
+                    raw_value = metric.get("value")
+                    value = _coerce_score(raw_value)
+                    if value is None:
+                        continue
+                    unit = metric.get("unit") or ""
+                    is_pct = isinstance(raw_value, str) and raw_value.strip().endswith("%")
+                    if not unit and is_pct:
+                        unit = "%"
+                    metric_name = metric.get("name") or metric.get("type") or ""
+                    payload = {
+                        "model": repo_id,
+                        "model_name": model_name,
+                        "benchmark_key": benchmark_key,
+                        "benchmark_label": BENCHMARKS[benchmark_key].label,
+                        "value": value,
+                        "unit": unit,
+                        "dataset": dataset_name or dataset_type or "",
+                        "task_type": task_type or "",
+                        "metric_name": metric_name,
+                        "contributor": contributor,
+                        "source_type": source_type,
+                        "source_url": source_url,
+                        "revision": revision,
+                    }
+                    existing = scores.get(benchmark_key)
+                    if not existing or value > existing["value"]:
+                        scores[benchmark_key] = payload
+        return scores
+
+    def _match_benchmark(
+        self,
+        dataset_name: Optional[str],
+        dataset_type: Optional[str],
+        metric: Dict[str, Any],
+    ) -> Optional[str]:
+        fields = [
+            _normalize(dataset_name),
+            _normalize(dataset_type),
+            _normalize(metric.get("name")),
+            _normalize(metric.get("type")),
+        ]
+        fields = [field for field in fields if field]
+        for key, spec in BENCHMARKS.items():
+            if spec.matches(fields):
+                return key
+        return None
+
+    def _format_scores(self, model_id: str, scores: Dict[str, Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Format scores as flat records for the dataset."""
+        rows = []
+        for benchmark_key, payload in scores.items():
+            rows.append(
+                {
+                    "model_id": model_id,
+                    "benchmark": payload["benchmark_label"],
+                    "benchmark_key": benchmark_key,
+                    "score": round(payload["value"], 2),
+                    "source_type": payload["source_type"],
+                    "source_url": payload["source_url"],
+                    "contributor": payload["contributor"],
+                    "collected_at": datetime.now(timezone.utc).isoformat(),
+                }
+            )
+        return rows
+
+    def get_leaderboard(self) -> List[Dict[str, Any]]:
+        """Get results sorted by score descending."""
+        return sorted(self.results, key=lambda x: x["score"], reverse=True)
+
+    def save_json(self, filepath: str) -> None:
+        """Save the leaderboard to a JSON file."""
+        leaderboard = self.get_leaderboard()
+        output = {
+            "generated_at": datetime.now(timezone.utc).isoformat(),
+            "total_entries": len(leaderboard),
+            "benchmarks": list(BENCHMARKS.keys()),
+            "leaderboard": leaderboard,
+        }
+        with open(filepath, "w") as f:
+            json.dump(output, f, indent=2)
+        self.log(f"💾 Saved leaderboard to {filepath}")
+
+    def push_to_hub(self, repo_id: str = "hf-skills/evals-leaderboard") -> None:
+        """Push the leaderboard data to a HF dataset."""
+        try:
+            from huggingface_hub import HfApi
+        except ImportError:
+            self.log("❌ huggingface_hub not installed. Run: pip install huggingface_hub")
+            return
+
+        api = HfApi(token=self.token)
+        leaderboard = self.get_leaderboard()
+
+        # Create dataset as JSONL
+        jsonl_content = "\n".join(json.dumps(row) for row in leaderboard)
+
+        # Create metadata file
+        metadata = {
+            "generated_at": datetime.now(timezone.utc).isoformat(),
+            "total_entries": len(leaderboard),
+            "models_with_scores": len(set(r["model_id"] for r in leaderboard)),
+            "benchmarks": list(BENCHMARKS.keys()),
+        }
+
+        try:
+            # Create repo if it doesn't exist
+            api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
+            self.log(f"📁 Ensured dataset repo exists: {repo_id}")
+
+            # Upload leaderboard data
+            api.upload_file(
+                path_or_fileobj=jsonl_content.encode(),
+                path_in_repo="data/leaderboard.jsonl",
+                repo_id=repo_id,
+                repo_type="dataset",
+                commit_message=f"Update leaderboard - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M')} UTC",
+            )
+
+            # Upload metadata
+            api.upload_file(
+                path_or_fileobj=json.dumps(metadata, indent=2).encode(),
+                path_in_repo="data/metadata.json",
+                repo_id=repo_id,
+                repo_type="dataset",
+                commit_message=f"Update metadata - {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M')} UTC",
+            )
+
+            self.log(f"🚀 Pushed leaderboard to {repo_id}")
+        except Exception as e:
+            self.log(f"❌ Failed to push to hub: {e}")
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Collect evaluation scores from model-index metadata")
+    parser.add_argument(
+        "--push-to-hub",
+        action="store_true",
+        help="Push results to HF dataset",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="leaderboard.json",
+        help="Output JSON file path",
+    )
+    parser.add_argument(
+        "--repo-id",
+        type=str,
+        default="hf-skills/evals-leaderboard",
+        help="HF dataset repo ID for pushing",
+    )
+    args = parser.parse_args()
+
+    token = os.environ.get("HF_TOKEN")
+    if not token:
+        print("⚠️ No HF_TOKEN found. Some requests may be rate-limited.")
+
+    collector = EvalsCollector(token=token)
+    collector.collect_all()
+
+    # Print leaderboard summary
+    print("\n" + "=" * 60)
+    print("📊 EVALUATION LEADERBOARD")
+    print("=" * 60)
+
+    leaderboard = collector.get_leaderboard()
+    for entry in leaderboard[:20]:
+        print(f"{entry['model_id']:40} | {entry['benchmark']:12} | {entry['score']:6.2f}")
+
+    if len(leaderboard) > 20:
+        print(f"  ... and {len(leaderboard) - 20} more entries")
+
+    print("=" * 60)
+    print(f"Total entries: {len(leaderboard)}")
+    print(f"Models with scores: {len(set(r['model_id'] for r in leaderboard))}")
+
+    # Save locally
+    collector.save_json(args.output)
+
+    # Push to hub if requested
+    if args.push_to_hub:
+        collector.push_to_hub(args.repo_id)
+
+
+if __name__ == "__main__":
+    main()
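The collector only reads scores from model-index entries in a model card's README front matter. Once parsed by yaml.safe_load, the structure that _extract_scores and _match_benchmark consume would look like the following Python object (model name and score value are illustrative, not taken from any real card):

{
    "model-index": [
        {
            "name": "example-model",
            "results": [
                {
                    "task": {"type": "text-generation"},
                    "dataset": {"name": "MMLU", "type": "mmlu"},
                    "metrics": [{"name": "accuracy", "type": "accuracy", "value": 71.3}],
                }
            ],
        }
    ]
}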
requirements.txt ADDED
@@ -0,0 +1,5 @@
+gradio
+requests
+pyyaml
+huggingface_hub
+
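After running collect_evals.py, the local leaderboard.json written by save_json can be sanity-checked with a few lines of Python (a sketch, assuming the default --output path):

import json

with open("leaderboard.json") as f:  # default --output path from collect_evals.py
    payload = json.load(f)

print(payload["generated_at"], payload["total_entries"])
for row in payload["leaderboard"][:5]:  # top entries, already sorted by score
    print(row["model_id"], row["benchmark"], row["score"])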