“pangjh3” committed on
Commit
4edba36
·
1 Parent(s): ee81350

modified: .gitattributes

Browse files

new file: .gitignore
new file: .pre-commit-config.yaml
new file: Makefile
new file: app.py
new file: pyproject.toml
new file: requirements.txt
new file: src/about.py
new file: src/display/css_html_js.py
new file: src/display/formatting.py
new file: src/display/utils.py
new file: src/envs.py
new file: src/oss/__init__.py
new file: src/oss/oss_file_manager.py
new file: src/oss/oss_leaderboard_manager.py
new file: src/oss/oss_submission_handler.py
new file: src/populate.py
new file: src/submission/submit.py

.gitattributes CHANGED
@@ -25,7 +25,6 @@
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
  *.tflite filter=lfs diff=lfs merge=lfs -text
30
  *.tgz filter=lfs diff=lfs merge=lfs -text
31
  *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
 
28
  *.tflite filter=lfs diff=lfs merge=lfs -text
29
  *.tgz filter=lfs diff=lfs merge=lfs -text
30
  *.wasm filter=lfs diff=lfs merge=lfs -text
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ auto_evals/
2
+ venv/
3
+ __pycache__/
4
+ .env
5
+ .ipynb_checkpoints
6
+ *ipynb
7
+ .vscode/
8
+
9
+ eval-queue/
10
+ eval-results/
11
+ eval-queue-bk/
12
+ eval-results-bk/
13
+ logs/
14
+
.pre-commit-config.yaml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ default_language_version:
16
+ python: python3
17
+
18
+ ci:
19
+ autofix_prs: true
20
+ autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
21
+ autoupdate_schedule: quarterly
22
+
23
+ repos:
24
+ - repo: https://github.com/pre-commit/pre-commit-hooks
25
+ rev: v4.3.0
26
+ hooks:
27
+ - id: check-yaml
28
+ - id: check-case-conflict
29
+ - id: detect-private-key
30
+ - id: check-added-large-files
31
+ args: ['--maxkb=1000']
32
+ - id: requirements-txt-fixer
33
+ - id: end-of-file-fixer
34
+ - id: trailing-whitespace
35
+
36
+ - repo: https://github.com/PyCQA/isort
37
+ rev: 5.12.0
38
+ hooks:
39
+ - id: isort
40
+ name: Format imports
41
+
42
+ - repo: https://github.com/psf/black
43
+ rev: 22.12.0
44
+ hooks:
45
+ - id: black
46
+ name: Format code
47
+ additional_dependencies: ['click==8.0.2']
48
+
49
+ - repo: https://github.com/charliermarsh/ruff-pre-commit
50
+ # Ruff version.
51
+ rev: 'v0.0.267'
52
+ hooks:
53
+ - id: ruff
54
+
Makefile ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Declare both targets phony so make always runs them (they produce no files).
# Fix: .PHONY previously listed a non-existent `format` target and omitted `quality`.
.PHONY: style quality


# Auto-format the codebase in place (black + isort + ruff autofix).
style:
	python -m black --line-length 119 .
	python -m isort .
	ruff check --fix .


# Check formatting/lint without modifying files (CI-friendly).
quality:
	python -m black --check --line-length 119 .
	python -m isort --check-only .
	ruff check .
app.py ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import datetime
4
+ import requests
5
+ from email.utils import parseaddr
6
+ import gradio as gr
7
+ import pandas as pd
8
+ import numpy as np
9
+
10
+ from src.about import (
11
+ CITATION_BUTTON_LABEL,
12
+ CITATION_BUTTON_TEXT,
13
+ EVALUATION_QUEUE_TEXT,
14
+ INTRODUCTION_TEXT,
15
+ LLM_BENCHMARKS_TEXT,
16
+ TITLE,
17
+ )
18
+ from src.display.css_html_js import custom_css
19
+ from src.display.utils import (
20
+ BENCHMARK_COLS,
21
+ COLS,
22
+ EVAL_COLS,
23
+ EVAL_TYPES,
24
+ AutoEvalColumn,
25
+ ModelType,
26
+ fields,
27
+ WeightType,
28
+ Precision
29
+ )
30
+
31
# ATLAS specific imports - use populate module to avoid transformers dependency.
# The populate module may itself fail to resolve its dependencies, in which case
# it exports None for these names; both the ImportError and the None case are
# folded into the single SAGE_MODULES_AVAILABLE flag checked elsewhere.
try:
    from src.populate import process_sage_results_for_leaderboard, get_sage_leaderboard_df
    SAGE_MODULES_AVAILABLE = process_sage_results_for_leaderboard is not None
    if SAGE_MODULES_AVAILABLE:
        print("✅ ATLAS modules loaded successfully")
    else:
        print("❌ ATLAS modules not available")
except ImportError as e:
    print(f"Warning: ATLAS modules not available: {e}")
    SAGE_MODULES_AVAILABLE = False
42
+
43
+
44
# Configuration
TOKEN = os.environ.get("HF_TOKEN", None)  # HF API token; None when unset (read-only mode)
OWNER = "opencompass"  # NOTE(review): not referenced in this file — confirm it is used by the src/ modules

# OSS submission tracking paths (object-key prefix + file name inside the OSS bucket)
SUBMISSION_TRACKING_PATH = "atlas_eval/submissions/user_tracking/"
SUBMISSION_HISTORY_FILE = "submission_history.json"
51
+
52
def format_error(msg):
    """Render *msg* as a red, centered HTML paragraph (error styling)."""
    style = "color: red; font-size: 20px; text-align: center;"
    return f"<p style='{style}'>{msg}</p>"
54
+
55
def format_warning(msg):
    """Render *msg* as an orange, centered HTML paragraph (warning styling)."""
    style = "color: orange; font-size: 20px; text-align: center;"
    return f"<p style='{style}'>{msg}</p>"
57
+
58
def format_log(msg):
    """Render *msg* as a green, centered HTML paragraph (success/info styling)."""
    style = "color: green; font-size: 20px; text-align: center;"
    return f"<p style='{style}'>{msg}</p>"
60
+
61
def model_hyperlink(link, model_name):
    """Return *model_name* wrapped in an anchor to *link*, or plain text.

    Only http(s) URLs become links; a missing or non-http *link* falls back
    to the bare model name.
    """
    if not (link and link.startswith("http")):
        return model_name
    return (
        f'<a target="_blank" href="{link}" '
        f'style="color: var(--link-text-color); text-decoration: underline;'
        f'text-decoration-style: dotted;">{model_name}</a>'
    )
65
+
66
def load_submission_history():
    """Fetch the per-user submission history dict from OSS.

    Returns {} when the tracking file does not exist yet or on any error
    (network, credentials, bad JSON) — callers always get a usable dict.
    """
    try:
        from src.oss.oss_file_manager import OSSFileManager

        # Pull the tracking JSON from OSS (falsy when the object is absent).
        raw = OSSFileManager().download_file_content(
            SUBMISSION_TRACKING_PATH + SUBMISSION_HISTORY_FILE
        )
        if not raw:
            print("📝 Creating new submission history")
            return {}
        return json.loads(raw)

    except Exception as e:
        print(f"⚠️ Failed to load submission history: {e}")
        return {}
86
+
87
def save_submission_history(history):
    """Serialize *history* to JSON and upload it to OSS.

    Returns the uploader's success flag, or False on any exception.
    """
    try:
        from src.oss.oss_file_manager import OSSFileManager

        payload = json.dumps(history, indent=2, ensure_ascii=False)
        return OSSFileManager().upload_file_content(
            content=payload,
            object_key=SUBMISSION_TRACKING_PATH + SUBMISSION_HISTORY_FILE,
        )

    except Exception as e:
        print(f"❌ Failed to save submission history: {e}")
        return False
105
+
106
def check_user_submission_eligibility(profile: gr.OAuthProfile, org_name: str):
    """Check whether *profile* is allowed to submit right now.

    Two gates: the HF account must be older than 60 days, and a user may
    submit at most twice per calendar day (per the OSS-stored history).

    Returns:
        (True, ok_message) when eligible, else (False, reason).

    Note: *org_name* is currently unused; kept for interface stability
    (duplicate-per-organization checks could use it later).
    """
    try:
        # 1. Account age limit (60 days) via the public HF user API.
        #    Fix: added a timeout — without one a stalled request would hang
        #    the Gradio worker indefinitely.
        user_data = requests.get(
            f"https://huggingface.co/api/users/{profile.username}/overview",
            timeout=10,
        )
        if user_data.status_code == 200:
            creation_date = json.loads(user_data.content)["createdAt"]
            # assumes createdAt is ISO-8601 with microseconds and a literal Z — TODO confirm API format
            account_age = datetime.datetime.now() - datetime.datetime.strptime(
                creation_date, '%Y-%m-%dT%H:%M:%S.%fZ'
            )
            if account_age < datetime.timedelta(days=60):
                return False, "This account does not meet the submission requirement. Account age must exceed 60 days."
        else:
            return False, "Unable to verify account information. Please try again later."

        # 2. Daily submission limit (2 per user per calendar day).
        submission_history = load_submission_history()
        user_submissions = submission_history.get(profile.username, [])

        today = datetime.datetime.today().strftime('%Y-%m-%d')
        today_submissions = [s for s in user_submissions if s.get("date", "") == today]

        if len(today_submissions) >= 2:
            return False, "You have already submitted twice today. Please try again tomorrow."

        return True, "Eligibility check passed"

    except Exception as e:
        print(f"❌ User eligibility check failed: {e}")
        return False, f"System check error, please try again later: {str(e)}"
135
+
136
def record_user_submission(profile: gr.OAuthProfile, model_name: str, org_name: str, email: str):
    """Append one submission entry for *profile* and persist the history to OSS.

    Returns the save result (bool); False on any exception.
    """
    try:
        history = load_submission_history()
        # setdefault replaces the explicit "key missing" branch.
        entries = history.setdefault(profile.username, [])

        entries.append(
            {
                "date": datetime.datetime.today().strftime('%Y-%m-%d'),
                "time": datetime.datetime.now().strftime('%H:%M:%S'),
                "model": model_name,
                "organization": org_name,
                "email": email,
                "username": profile.username,
            }
        )

        return save_submission_history(history)

    except Exception as e:
        print(f"❌ Failed to record submission history: {e}")
        return False
162
+
163
def get_leaderboard_dataframe():
    """Build the leaderboard DataFrame from ATLAS results.

    Returns an empty DataFrame when the ATLAS modules are unavailable,
    no results exist, or loading raises.
    """
    print("🔄 Loading ATLAS leaderboard data...")

    if not SAGE_MODULES_AVAILABLE:
        print("❌ ATLAS modules not available")
        return pd.DataFrame()

    try:
        frame = get_sage_leaderboard_df()
        if frame.empty:
            print("❌ No ATLAS results found")
            return pd.DataFrame()

        print(f"✅ Generated dataframe with {len(frame)} rows")
        return frame

    except Exception as e:
        print(f"❌ Error generating leaderboard dataframe: {e}")
        import traceback
        traceback.print_exc()
        return pd.DataFrame()
187
+
188
def refresh_leaderboard():
    """Re-fetch the leaderboard data (thin wrapper used by the refresh button)."""
    print("🔄 Refreshing leaderboard data...")
    refreshed = get_leaderboard_dataframe()
    return refreshed
192
+
193
# Initialize data — load the leaderboard once at import time so the first
# page render already has rows.
print("🚀 Initializing ATLAS leaderboard...")
leaderboard_df = get_leaderboard_dataframe()
print(f"📈 Leaderboard initialized with {len(leaderboard_df)} rows")

# Define column types for the dataframe (Model, Organization, Accuracy, mG-Pass@2, mG-Pass@4, Submission Date)
COLUMN_TYPES = ["markdown", "str", "number", "number", "number", "str"]


# Create Gradio interface
demo = gr.Blocks(css="""
.markdown-text {
    font-size: 16px !important;
}
#citation-button {
    font-family: monospace;
}
""")

with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                lines=10,
                max_lines=10,
                interactive=False
            )

    # Main leaderboard table
    gr.Markdown("## 🏆 ATLAS Benchmark Results", elem_classes="markdown-text")

    # Debug information - dynamic component (updated together with the table)
    results_count = gr.Markdown(f"📊 **Showing {len(leaderboard_df)} results**")

    leaderboard_table = gr.Dataframe(
        value=leaderboard_df,
        datatype=COLUMN_TYPES,
        interactive=False,
        wrap=True,
        column_widths=["30%", "20%", "12%", "12%", "12%", "14%"]
    )

    # Refresh button
    refresh_button = gr.Button("🔄 Refresh Leaderboard")

    def refresh_leaderboard_with_count():
        """Refresh leaderboard and update count display"""
        df = refresh_leaderboard()
        count_text = f"📊 **Showing {len(df)} results**"
        return df, count_text

    refresh_button.click(
        refresh_leaderboard_with_count,
        inputs=[],
        outputs=[leaderboard_table, results_count]
    )

    # Submission section
    with gr.Accordion("📊 Submit Your ATLAS Results", open=False):
        gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

        gr.Markdown("""
### 📋 Submission Requirements
<!--
- Login required: You must log in with a Hugging Face account
- Account age: Account must be older than 60 days
- Submission frequency: Each user can submit up to 2 times per day
-->
- File format: Upload a JSON file in the ATLAS format
- Organization: Provide the exact organization name (shown on the leaderboard)
- Contact email: Provide a valid email for notifications
- Auto evaluation: After submission, the system will run LLM-based evaluation and update the leaderboard

<!--
### 🔐 Security Policy
To prevent spam and ensure evaluation quality, we enforce:
- New accounts must wait 60 days before submitting (prevents abuse)
- Daily submission limits to ensure leaderboard quality and system stability
- Duplicate checks to avoid multiple submissions for the same organization
-->
""", elem_classes="markdown-text")

        with gr.Row():
            with gr.Column():
                model_textbox = gr.Textbox(
                    label="Model Name - will be shown on the leaderboard",
                    placeholder="Your Model Name (e.g., GPT-4, Llama-2-70B)"
                )
                org_textbox = gr.Textbox(
                    label="Organization Name - will be shown on the leaderboard",
                    placeholder="Your Organization"
                )
                email_textbox = gr.Textbox(
                    label="Contact Email - used for contact, not publicly visible",
                    placeholder="contact@example.com"
                )
            with gr.Column():
                file_upload = gr.File(
                    label="Upload ATLAS Results (JSON)",
                    file_types=[".json"],
                    type="filepath"
                )

        # Submit buttons (login gating is partially commented out below)
        with gr.Row():
            login_button = gr.LoginButton("🔐 Login with HuggingFace", size="lg")
            submit_button = gr.Button("Submit Results", variant="primary", size="lg")
        # Login state and user info
        profile_state = gr.State()
        login_status = gr.Markdown(visible=True)

        # def on_login(profile: gr.OAuthProfile):
        #     try:
        #         if profile and getattr(profile, "name", None):
        #             name = profile.name
        #             text = f"✅ Logged in as: **{name}**"
        #         else:
        #             text = "❌ Login failed, please try again"
        #         return profile, text
        #     except Exception:
        #         return None, "❌ Login failed, please try again"
        # login_button.click(on_login, inputs=None, outputs=[profile_state, login_status])

        # Progress display and result display areas
        progress_info = gr.HTML()
        submission_result = gr.HTML()

        def show_progress(step, message, total_steps=4):
            """Show progress information as an HTML progress-bar snippet for *step* of *total_steps*."""
            progress_percentage = int((step / total_steps) * 100)
            progress_html = f"""
            <div style="background-color: #e7f3ff; border: 1px solid #4dabf7; border-radius: 5px; padding: 15px; margin: 10px 0;">
                <div style="display: flex; align-items: center; margin-bottom: 10px;">
                    <h4 style="color: #1971c2; margin: 0; flex-grow: 1;">⏳ Processing submission...</h4>
                    <span style="color: #1971c2; font-weight: bold;">{progress_percentage}%</span>
                </div>
                <p style="color: #1971c2; margin: 5px 0;"><strong>Step {step}/{total_steps}:</strong> {message}</p>
                <div style="background-color: #fff; border-radius: 10px; height: 20px; margin: 10px 0; border: 1px solid #dee2e6;">
                    <div style="background: linear-gradient(90deg, #4dabf7, #74c0fc); height: 100%; width: {progress_percentage}%; border-radius: 10px; transition: width 0.5s ease; display: flex; align-items: center; justify-content: center;">
                        {f'<span style="color: white; font-size: 12px; font-weight: bold;">{progress_percentage}%</span>' if progress_percentage > 20 else ''}
                    </div>
                </div>
                <p style="color: #495057; font-size: 14px; margin: 5px 0;">
                    {'✨ Almost done, please wait...' if step >= total_steps else '📤 Please wait, processing your submission...'}
                </p>
            </div>
            """
            return progress_html

        def handle_submission(file_upload, model_name, org_name, email, user_profile: gr.OAuthProfile):
            """Generator event handler yielding (progress_html, result_html) pairs.

            NOTE(review): user_profile is not listed in the click() inputs —
            presumably Gradio injects it via the gr.OAuthProfile annotation;
            confirm against the Gradio OAuth docs.
            """
            try:
                # Step 1: basic validation
                yield show_progress(1, "Validating submission info"), ""

                # Verify login
                if user_profile is None or getattr(user_profile, "name", None) is None:
                    yield "", format_error("Please log in with Hugging Face before submitting")
                    return
                print(f"user_profile: {user_profile}")
                print(f"user_profile.name: {user_profile.name}")

                if not file_upload:
                    yield "", format_error("Please select a file to upload")
                    return
                if not model_name or not model_name.strip():
                    yield "", format_error("Please enter model name")
                    return
                if not org_name or not org_name.strip():
                    yield "", format_error("Please enter organization name")
                    return
                if not email or not email.strip():
                    yield "", format_error("Please enter email address")
                    return

                # Validate email format
                _, parsed_email = parseaddr(email)
                if "@" not in parsed_email:
                    yield "", format_warning("Please provide a valid email address")
                    return

                # Step 2: file validation and reading
                yield show_progress(2, "Validating file format and content"), ""

                import time
                time.sleep(0.5)  # allow users to see progress update

                # User eligibility check (account age / rate limit / duplicate submissions)
                eligible, msg = check_user_submission_eligibility(user_profile, org_name)
                if not eligible:
                    yield "", format_error(msg)
                    return

                # Step 3: upload to OSS
                yield show_progress(3, "Uploading file to OSS storage"), ""

                # Process the file submission
                from src.submission.submit import process_sage_submission_simple
                result = process_sage_submission_simple(file_upload, model_name, org_name, email)

                # Step 4: done
                yield show_progress(4, "Submission completed, preparing evaluation"), ""

                time.sleep(0.5)  # allow users to see completion state

                # Record submission history (best-effort; a failure must not block the user)
                try:
                    record_user_submission(user_profile, model_name, org_name, email)
                except Exception:
                    pass

                # Build the success message
                success_info = f"""
                <div style="background-color: #d4edda; border: 1px solid #c3e6cb; border-radius: 5px; padding: 15px; margin: 10px 0;">
                    <h4 style="color: #155724; margin-top: 0;">🎉 Submission successful!</h4>
                    <p style="color: #155724; margin: 5px 0;"><strong>Model:</strong> {model_name}</p>
                    <p style="color: #155724; margin: 5px 0;"><strong>Organization:</strong> {org_name}</p>
                    <p style="color: #155724; margin: 5px 0;"><strong>Email:</strong> {email}</p>
                    <p style="color: #155724; margin: 5px 0;"><strong>Submitted at:</strong> {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
                    <p style="color: #155724; margin-bottom: 0;">Your results have been submitted via OSS. LLM evaluation will complete in 5-10 minutes and the leaderboard will be updated.</p>
                </div>
                """

                # Clear the progress bar and show the final result
                yield "", success_info + result

            except ImportError as e:
                yield "", format_error(f"Submission system modules unavailable: {e}")
            except Exception as e:
                import traceback
                traceback.print_exc()
                yield "", format_error(f"An error occurred during submission: {str(e)}")

        submit_button.click(
            handle_submission,
            inputs=[file_upload, model_textbox, org_textbox, email_textbox],  # profile_state
            outputs=[progress_info, submission_result]
        )

# Launch the app
if __name__ == "__main__":
    # Disable SSR mode for better OAuth compatibility
    # Note: OAuth is handled internally via gr.LoginButton, not at launch level
    demo.launch(ssr_mode=False)
442
+
pyproject.toml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.ruff]
2
+ # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
3
+ select = ["E", "F"]
4
+ ignore = ["E501"] # line too long (black is taking care of this)
5
+ line-length = 119
6
+ fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
7
+
8
+ [tool.isort]
9
+ profile = "black"
10
+ line_length = 119
11
+
12
+ [tool.black]
13
+ line-length = 119
14
+
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets
2
+ gradio
3
+ huggingface-hub>=0.18.0
4
+ numpy
5
+ pandas
6
+ python-dateutil
7
+ openai>=1.0.0
8
+ aiohttp
9
+ oss2
10
+ loguru
11
+ tqdm
12
+
src/about.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+
4
@dataclass
class Task:
    """One benchmark task: where to read its score and how to label it."""

    benchmark: str  # task key in the results json file
    metric: str  # metric key in the results json file
    col_name: str  # column name displayed on the leaderboard
9
+
10
+
11
+ # Select your tasks here
12
+ # ---------------------------------------------------
13
class Tasks(Enum):
    """Benchmark tasks shown on the leaderboard (task_key, metric_key, display name)."""

    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    # NOTE(review): these members (incl. Astronomy) don't match the seven domains
    # listed in INTRODUCTION_TEXT (Computer Science / Materials Science missing) —
    # confirm the intended task set.
    sage_overall = Task("sage_overall", "accuracy", "ATLAS Overall")
    sage_math = Task("sage_math", "accuracy", "Mathematics")
    sage_physics = Task("sage_physics", "accuracy", "Physics")
    sage_chemistry = Task("sage_chemistry", "accuracy", "Chemistry")
    sage_biology = Task("sage_biology", "accuracy", "Biology")
    sage_earth_science = Task("sage_earth_science", "accuracy", "Earth Science")
    sage_astronomy = Task("sage_astronomy", "accuracy", "Astronomy")
22
+
23
+ NUM_FEWSHOT = 0 # Change with your few shot
24
+ # ---------------------------------------------------
25
+
26
+
27
+
28
+ # Your leaderboard name
29
+ TITLE = """<h1 align="center" id="space-title">ATLAS: A High-Difficulty, Multidisciplinary Benchmark for Frontier Scientific Reasoning</h1>"""
30
+
31
+ # What does your leaderboard evaluate?
32
+ INTRODUCTION_TEXT = """
33
+ **ATLAS (AGI-Oriented Testbed for Logical Application in Science)** is a large-scale, high-difficulty, cross-disciplinary evaluation suite for assessing the frontier scientific reasoning capabilities of LLMs. Designed to address the challenges of benchmark saturation, narrow disciplinary focus, oversimplified answer formats, and data contamination in existing evaluations, ATLAS serves as a reliable **ruler** for measuring progress toward AGI in the **AI for Science** domain.
34
+
35
+ ## Benchmark Overview
36
+ **ATLAS** evaluates models across seven core scientific fields that are central to AI for Science, encompassing 57 corresponding sub-fields to ensure comprehensive coverage of scientific reasoning requirements:
37
+ - **Mathematics** - Abstract algebra, analysis, differential equations, and computational mathematics
38
+ - **Physics** - Classical mechanics, electrodynamics, quantum mechanics, thermodynamics, and astrophysics
39
+ - **Chemistry** - Physical chemistry, inorganic chemistry, organic chemistry, and analytical chemistry
40
+ - **Biology** - Genetics, immunology, molecular biology, biophysics, and ecology
41
+ - **Computer Science** - Computer architecture, artificial intelligence, and software fundamentals
42
+ - **Earth Science** - Geography, geodesy, atmospheric chemistry, marine science, and geology
43
+ - **Materials Science** - Composite materials, metal materials, organic polymer materials, and material synthesis
44
+
45
+ ## Evaluation Metrics
46
+ - **Accuracy (%)**: Overall correctness of predictions across all domains, judged by LLM-as-Judge (OpenAI o4-mini / Qwen3-235B-A22B)
47
+ - **mG-Pass@2**: Multi-generation Pass rate for 2 predictions (measures consistency of model outputs)
48
+ - **mG-Pass@4**: Multi-generation Pass rate for 4 predictions (measures stability of reasoning capabilities)
49
+ The leaderboard displays model performance sorted by average accuracy, with domain-specific scores reflecting strengths in different scientific fields. All metrics are derived from the ATLAS validation/test set (≈800 expert-created original problems).
50
+ """
51
+
52
+ # Which evaluations are you running? how can people reproduce what you have?
53
+ LLM_BENCHMARKS_TEXT = f"""
54
+ ## How ATLAS Works
55
+
56
+ ATLAS evaluates language models across seven scientific domains through a comprehensive assessment of both content generation and reasoning capabilities.
57
+
58
+ ### Evaluation Process:
59
+ 1. **Multi-domain Assessment**: Models are tested on questions spanning Mathematics, Physics, Chemistry, Biology, Computer Science, Earth Science, and Materials Science
60
+ 2. **Content + Reasoning**: Each submission requires both predicted answers and reasoning explanations
61
+ 3. **Accuracy Scoring**: Performance is measured using accuracy metrics across all domains
62
+ 4. **Comprehensive Reporting**: Results are aggregated to provide both overall and domain-specific scores
63
+
64
+ ### Submission Format:
65
+ Submissions should follow this JSON structure:
66
+ ```json
67
+ {{
68
+ "submission_org": "Your Organization",
69
+ "submission_email": "contact@example.com",
70
+ "predictions": [
71
+ {{
72
+ "original_question_id": 0,
73
+ "content": ["answer1", "answer2", "answer3", "answer4"],
74
+ "reasoning_content": ["reasoning1", "reasoning2", "reasoning3", "reasoning4"]
75
+ }}
76
+ ]
77
+ }}
78
+ ```
79
+
80
+ ## Reproducibility
81
+ To reproduce our evaluation results:
82
+ 1. Download the ATLAS dataset from our repository
83
+ 2. Use the evaluation scripts provided in the benchmark toolkit
84
+ 3. Follow the submission format specifications exactly
85
+ 4. Submit your results through this leaderboard interface
86
+ """
87
+
88
+ EVALUATION_QUEUE_TEXT = """
89
+ ## Submit Your ATLAS Results
90
+
91
+ Results can be submitted as evaluation outputs in JSON format. Each submission should include predictions and reasoning content for all test questions.
92
+
93
+ ### Required JSON Format:
94
+ ```json
95
+ {
96
+ "submission_org": "Your Organization",
97
+ "submission_email": "contact@example.com",
98
+ "predictions": [
99
+ {
100
+ "original_question_id": 0,
101
+ "content": ["answer1", "answer2", "answer3", "answer4"],
102
+ "reasoning_content": ["reasoning1", "reasoning2", "reasoning3", "reasoning4"]
103
+ }
104
+ ]
105
+ }
106
+ ```
107
+
108
+ ### Submission Guidelines:
109
+ - Each prediction must include exactly 4 content items and 4 reasoning items
110
+ - Question IDs should match the official ATLAS test set
111
+ - Provide clear scientific reasoning for each prediction
112
+ - Ensure JSON format is valid and complete
113
+
114
+ Your submission will be automatically evaluated across all scientific domains and added to the leaderboard.
115
+ """
116
+
117
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
118
+ CITATION_BUTTON_TEXT = r"""@article{liu2025atlas,
119
+ title={ATLAS: A High-Difficulty, Multidisciplinary Benchmark for Frontier Scientific Reasoning},
120
+ author={Liu, Hongwei and Liu, Junnan and Liu, Shudong and Duan, Haodong and Li, Yuqiang and Su, Mao and Liu, Xiaohong and Zhai, Guangtao and Fang, Xinyu and Ma, Qianhong and Zhang, Taolin and Ma, Zihan and Zhao, Yufeng and Zhou, Peiheng and Xiao, Linchen and Zhang, Wenlong and Zhou, Shijie and Ma, Xingjian and Sun, Siqi and Ge, Jiaye and Li, Meng and Liu, Yuhong and Dong, Jianxin and Li, Jiaying and Wu, Hui and Liang, Hanwen and Lin, Jintai and Wang, Yanting and Dong, Jie and Zhu, Tong and Fu, Tianfan and He, Conghui and Zhang, Qi and Zhang, Songyang and Bai, Lei and Chen, Kai},
121
+ journal={arXiv preprint arXiv:2511.14366},
122
+ year={2025}
123
+ }"""
src/display/css_html_js.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# CSS injected into the Gradio app; selectors target elem_id/elem_classes set
# by the UI code (leaderboard table, search bar, citation button, filters).
custom_css = """

.markdown-text {
    font-size: 16px !important;
}

#models-to-add-text {
    font-size: 18px !important;
}

#citation-button span {
    font-size: 16px !important;
}

#citation-button textarea {
    font-size: 16px !important;
}

#citation-button > label > button {
    margin: 6px;
    transform: scale(1.3);
}

#leaderboard-table {
    margin-top: 15px
}

#leaderboard-table-lite {
    margin-top: 15px
}

#search-bar-table-box > div:first-child {
    background: none;
    border: none;
}

#search-bar {
    padding: 0px;
}

/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
#leaderboard-table td:nth-child(2),
#leaderboard-table th:nth-child(2) {
    max-width: 400px;
    overflow: auto;
    white-space: nowrap;
}

.tab-buttons button {
    font-size: 20px;
}

#scale-logo {
    border-style: none !important;
    box-shadow: none;
    display: block;
    margin-left: auto;
    margin-right: auto;
    max-width: 600px;
}

#scale-logo .download {
    display: none;
}
#filter_type{
    border: 0;
    padding-left: 0;
    padding-top: 0;
}
#filter_type label {
    display: flex;
}
#filter_type label > span{
    margin-top: var(--spacing-lg);
    margin-right: 0.5em;
}
#filter_type label > .wrap{
    width: 103px;
}
#filter_type label > .wrap .wrap-inner{
    padding: 2px;
}
#filter_type label > .wrap .wrap-inner input{
    width: 1px
}
#filter-columns-type{
    border:0;
    padding:0.5;
}
#filter-columns-size{
    border:0;
    padding:0.5;
}
#box-filter > .form{
    border: 0
}
"""

# JS helper passed to Gradio: returns the page's query-string parameters
# as a plain object (used to restore UI state from the URL).
get_window_url_params = """
function(url_params) {
    const params = new URLSearchParams(window.location.search);
    url_params = Object.fromEntries(params);
    return url_params;
}
"""
src/display/formatting.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def model_hyperlink(link, model_name):
    """Render *model_name* as an HTML anchor opening *link* in a new tab."""
    anchor_style = "color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"
    return f'<a target="_blank" href="{link}" style="{anchor_style}">{model_name}</a>'
3
+
4
+
5
def make_clickable_model(model_name):
    """Return a dotted-underline hyperlink to the model's Hugging Face page."""
    return model_hyperlink(f"https://huggingface.co/{model_name}", model_name)
8
+
9
+
10
def styled_error(error):
    """Wrap *error* in a centered red HTML paragraph."""
    template = "<p style='color: red; font-size: 20px; text-align: center;'>{}</p>"
    return template.format(error)
12
+
13
+
14
def styled_warning(warn):
    """Wrap *warn* in a centered orange HTML paragraph."""
    template = "<p style='color: orange; font-size: 20px; text-align: center;'>{}</p>"
    return template.format(warn)
16
+
17
+
18
def styled_message(message):
    """Wrap *message* in a centered green HTML paragraph."""
    template = "<p style='color: green; font-size: 20px; text-align: center;'>{}</p>"
    return template.format(message)
20
+
21
+
22
def has_no_nan_values(df, columns):
    """Boolean Series: True for rows of *df* with no NaN in *columns*."""
    return ~df[columns].isna().any(axis=1)
24
+
25
+
26
def has_nan_values(df, columns):
    """Boolean Series: True for rows of *df* with at least one NaN in *columns*."""
    return ~df[columns].notna().all(axis=1)
src/display/utils.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, make_dataclass
2
+ from enum import Enum
3
+
4
+ import pandas as pd
5
+
6
+ from src.about import Tasks
7
+
8
def fields(raw_class):
    """Collect the values of every attribute defined on *raw_class* whose
    name neither starts nor ends with a double underscore."""
    collected = []
    for attr_name, attr_value in raw_class.__dict__.items():
        if not (attr_name.startswith("__") or attr_name.endswith("__")):
            collected.append(attr_value)
    return collected
10
+
11
+
12
+ # These classes are for user facing column names,
13
+ # to avoid having to change them all around the code
14
+ # when a modif is needed
15
@dataclass
class ColumnContent:
    """Metadata describing one user-facing leaderboard column."""
    name: str  # column header shown to the user
    type: str  # display type tag, e.g. "str", "number", "markdown", "bool"
    displayed_by_default: bool  # whether the column starts visible
    hidden: bool = False  # excluded from the COLS selection list
    never_hidden: bool = False  # presumably pins the column as always shown — confirm in UI code
22
+
23
## Leaderboard columns
# Each entry is a (field_name, field_type, default_value) triple consumed
# by make_dataclass below.
auto_eval_column_dict = []
# Init: fixed identity columns (type emoji + clickable model name).
auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
# Scores: overall average plus one numeric column per task in src.about.Tasks.
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
for task in Tasks:
    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
# Model information (hidden from the default view).
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])

# We use make_dataclass to dynamically fill the scores from Tasks.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
45
+
46
## For the queue columns in the submission tab
@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    """Columns shown for pending/running submissions in the queue tab."""
    model = ColumnContent("model", "markdown", True)
    revision = ColumnContent("revision", "str", True)
    private = ColumnContent("private", "bool", True)
    precision = ColumnContent("precision", "str", True)
    # NOTE(review): the third argument is displayed_by_default (a bool) but
    # the string "Original" is passed here; it is truthy so it behaves as
    # True — confirm whether a bool was intended.
    weight_type = ColumnContent("weight_type", "str", "Original")
    status = ColumnContent("status", "str", True)
55
+
56
## All the model information that we might need
@dataclass
class ModelDetails:
    """Display metadata for a model category / weight type / precision."""
    name: str  # canonical name, e.g. "pretrained"
    display_name: str = ""  # optional prettier name (not read in this module)
    symbol: str = ""  # emoji
62
+
63
+
64
class ModelType(Enum):
    """Category of a submitted model, each carrying a display emoji."""

    PT = ModelDetails(name="pretrained", symbol="🟢")
    FT = ModelDetails(name="fine-tuned", symbol="🔶")
    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
    RL = ModelDetails(name="RL-tuned", symbol="🟦")
    Unknown = ModelDetails(name="", symbol="?")

    def to_str(self, separator=" "):
        """Return ``"<symbol><separator><name>"`` for display."""
        details = self.value
        return f"{details.symbol}{separator}{details.name}"

    @staticmethod
    def from_str(type):
        """Map a free-form type string (name or emoji) to a ModelType.

        Matches are checked in the same order as the original chain
        (fine-tuned, pretrained, RL-tuned, instruction-tuned); anything
        unrecognised maps to ``ModelType.Unknown``.
        """
        for markers, member in (
            (("fine-tuned", "🔶"), ModelType.FT),
            (("pretrained", "🟢"), ModelType.PT),
            (("RL-tuned", "🟦"), ModelType.RL),
            (("instruction-tuned", "⭕"), ModelType.IFT),
        ):
            if any(marker in type for marker in markers):
                return member
        return ModelType.Unknown
85
+
86
class WeightType(Enum):
    """How the submitted weights relate to the base model."""
    Adapter = ModelDetails("Adapter")
    Original = ModelDetails("Original")
    Delta = ModelDetails("Delta")
90
+
91
class Precision(Enum):
    """Numeric precision a model was evaluated in."""

    float16 = ModelDetails("float16")
    bfloat16 = ModelDetails("bfloat16")
    Unknown = ModelDetails("?")

    # BUGFIX: from_str had no @staticmethod decorator (unlike
    # ModelType.from_str); calling it on an *instance* would have bound the
    # instance as `precision`. The class-level call Precision.from_str(x)
    # is unchanged.
    @staticmethod
    def from_str(precision):
        """Map a precision string (plain or ``torch.``-prefixed) to a member.

        Unrecognised values map to ``Precision.Unknown``.
        """
        if precision in ["torch.float16", "float16"]:
            return Precision.float16
        if precision in ["torch.bfloat16", "bfloat16"]:
            return Precision.bfloat16
        return Precision.Unknown
102
+
103
# Column selection
# Leaderboard columns actually shown (hidden ones filtered out).
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]

# Names and display types for the submission-queue table.
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

# One user-facing column name per benchmark task.
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
110
+
src/envs.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

from huggingface_hub import HfApi

# Info to change for your repository
# ----------------------------------
TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org

OWNER = "demo-leaderboard-backend"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
# ----------------------------------

# Hub repos backing the leaderboard: the Space itself, the request-queue
# dataset and the results dataset.
REPO_ID = f"{OWNER}/leaderboard"
QUEUE_REPO = f"{OWNER}/requests"
RESULTS_REPO = f"{OWNER}/results"

# If you setup a cache later, just change HF_HOME
CACHE_PATH=os.getenv("HF_HOME", ".")

# Local caches (queue/results plus their backend copies).
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

# Hub API client authenticated with TOKEN.
API = HfApi(token=TOKEN)
src/oss/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ OSS模块 - 处理阿里云OSS相关功能
3
+ """
4
+
5
+ from .oss_file_manager import OSSFileManager
6
+ from .oss_leaderboard_manager import OSSLeaderboardManager
7
+ from .oss_submission_handler import OSSSubmissionHandler
8
+
9
+ __all__ = ["OSSFileManager", "OSSLeaderboardManager", "OSSSubmissionHandler"]
src/oss/oss_file_manager.py ADDED
@@ -0,0 +1,432 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import oss2
4
+ import json
5
+ from datetime import datetime
6
+ from pathlib import Path
7
+ from typing import List, Dict, Optional
8
+ from loguru import logger
9
+
10
+
11
class OSSFileManager:
    """Simplified Aliyun OSS file manager.

    Thin convenience wrapper around the ``oss2`` SDK for a single bucket:
    list, upload, download, copy and delete objects.  Credentials and
    endpoint fall back to the ``OSS_*`` environment variables.
    """

    def __init__(
        self,
        oss_access_key_id: str = None,
        oss_access_key_secret: str = None,
        oss_region: str = None,
        oss_bucket_name: str = None
    ):
        """
        Initialize the OSS file manager.

        Args:
            oss_access_key_id: OSS access key ID (default: env OSS_ACCESS_KEY_ID)
            oss_access_key_secret: OSS access key secret (default: env OSS_ACCESS_KEY_SECRET)
            oss_region: OSS region endpoint (default: env OSS_REGION)
            oss_bucket_name: OSS bucket name (default: env OSS_BUCKET_NAME)

        Raises:
            ValueError: if no access key ID/secret can be resolved.
        """
        # Resolve configuration: explicit arguments win over environment.
        self.access_key_id = oss_access_key_id or os.getenv('OSS_ACCESS_KEY_ID')
        self.access_key_secret = oss_access_key_secret or os.getenv('OSS_ACCESS_KEY_SECRET')
        self.region = oss_region or os.getenv('OSS_REGION', 'http://oss-cn-shanghai.aliyuncs.com')
        self.bucket_name = oss_bucket_name or os.getenv('OSS_BUCKET_NAME', 'opencompass')

        if not self.access_key_id or not self.access_key_secret:
            raise ValueError("OSS访问密钥未设置。请设置 OSS_ACCESS_KEY_ID 和 OSS_ACCESS_KEY_SECRET 环境变量。")

        # Build the bucket client shared by every helper below.
        auth = oss2.Auth(self.access_key_id, self.access_key_secret)
        self.bucket = oss2.Bucket(auth, self.region, self.bucket_name)

        logger.info(f"OSS初始化成功: {self.bucket_name} @ {self.region}")

    def list_files(
        self,
        oss_dir: str = "",
        after_date: datetime = None,
        file_extension: str = None
    ) -> List[Dict]:
        """
        List files under an OSS directory.

        Args:
            oss_dir: OSS directory prefix
            after_date: only return files modified after this date
            file_extension: extension filter (e.g. ".json")

        Returns:
            List of file-info dicts (key, name, size, last_modified, etag).

        Raises:
            Re-raises any oss2 error after logging it.
        """
        try:
            files = []

            # Normalise the prefix so it addresses a "directory".
            if oss_dir and not oss_dir.endswith('/'):
                oss_dir += '/'

            for obj in oss2.ObjectIterator(self.bucket, prefix=oss_dir):
                # Skip the directory placeholder object itself.
                if obj.key.endswith('/'):
                    continue

                # Extension filter.
                if file_extension and not obj.key.endswith(file_extension):
                    continue

                # Date filter.
                if after_date and obj.last_modified < after_date:
                    continue

                file_info = {
                    'key': obj.key,
                    'name': os.path.basename(obj.key),
                    'size': obj.size,
                    'last_modified': obj.last_modified,
                    'etag': obj.etag
                }
                files.append(file_info)

            logger.info(f"找到 {len(files)} 个文件在 {oss_dir}")
            return files

        except Exception as e:
            logger.error(f"列出文件失败: {e}")
            raise

    def download_file(self, oss_file_path: str, local_file_path: str) -> bool:
        """
        Download an OSS object to a local file.

        Args:
            oss_file_path: OSS object key
            local_file_path: destination path (parent dirs are created)

        Returns:
            True on success, False on any error (logged, not raised).
        """
        try:
            # Make sure the destination directory exists.
            local_dir = os.path.dirname(local_file_path)
            if local_dir:
                os.makedirs(local_dir, exist_ok=True)

            self.bucket.get_object_to_file(oss_file_path, local_file_path)

            logger.info(f"下载成功: {oss_file_path} -> {local_file_path}")
            return True

        except Exception as e:
            logger.error(f"下载文件失败: {oss_file_path} -> {local_file_path}, 错误: {e}")
            return False

    def upload_file_to_object(
        self,
        local_file_path: str,
        oss_file_path: str,
        replace: bool = False
    ) -> bool:
        """
        Upload a local file to OSS.

        Args:
            local_file_path: local source path
            oss_file_path: OSS object key
            replace: overwrite an existing object

        Returns:
            True on success; False if the source is missing, the target
            exists and replace is False, or the upload fails.
        """
        try:
            if not os.path.exists(local_file_path):
                logger.error(f"本地文件不存在: {local_file_path}")
                return False

            # Refuse to clobber an existing object unless asked to.
            if not replace and self.bucket.object_exists(oss_file_path):
                logger.warning(f"OSS文件已存在: {oss_file_path}")
                return False

            self.bucket.put_object_from_file(oss_file_path, local_file_path)

            logger.info(f"上传成功: {local_file_path} -> {oss_file_path}")
            return True

        except Exception as e:
            logger.error(f"上传文件失败: {local_file_path} -> {oss_file_path}, 错误: {e}")
            return False

    def file_exists(self, oss_file_path: str) -> bool:
        """
        Check whether an OSS object exists.

        Returns:
            True if it exists; False if it does not or the check fails.
        """
        try:
            return self.bucket.object_exists(oss_file_path)
        except Exception as e:
            logger.error(f"检查文件存在性失败: {oss_file_path}, 错误: {e}")
            return False

    def download_file_content(self, oss_file_path: str) -> Optional[bytes]:
        """
        Download an OSS object's content into memory.

        Returns:
            The raw bytes, or None on any error.
        """
        try:
            result = self.bucket.get_object(oss_file_path)
            content = result.read()
            logger.info(f"下载文件内容成功: {oss_file_path} ({len(content)} bytes)")
            return content
        except Exception as e:
            logger.error(f"下载文件内容失败: {oss_file_path}, 错误: {e}")
            return None

    def upload_file_content(self, content: str, object_key: str) -> bool:
        """
        Upload string (or bytes) content directly to OSS.

        Args:
            content: text (UTF-8 encoded on upload) or raw bytes
            object_key: OSS object key

        Returns:
            True on success.
        """
        try:
            # Accept either str or bytes.
            if isinstance(content, str):
                content_bytes = content.encode('utf-8')
            else:
                content_bytes = content

            self.bucket.put_object(object_key, content_bytes)

            logger.info(f"上传内容成功: {object_key} ({len(content_bytes)} bytes)")
            return True

        except Exception as e:
            logger.error(f"上传内容失败: {object_key}, 错误: {e}")
            return False

    def upload_file(self, local_file_path: str, oss_file_path: str) -> bool:
        """
        Upload a local file to OSS, overwriting any existing object.

        Alias for upload_file_to_object(..., replace=True).
        """
        return self.upload_file_to_object(local_file_path, oss_file_path, replace=True)

    def copy_file(self, source_path: str, target_path: str) -> bool:
        """
        Copy an object inside the bucket (server-side copy).

        Returns:
            True on success.
        """
        try:
            self.bucket.copy_object(
                self.bucket_name,  # source bucket
                source_path,       # source object key
                target_path        # target object key
            )
            logger.info(f"文件复制成功: {source_path} -> {target_path}")
            return True
        except Exception as e:
            logger.error(f"文件复制失败: {source_path} -> {target_path}, 错误: {e}")
            return False

    def list_latest_files_by_date(
        self,
        object_dir: str = "",
        max_num_files: int = 100,
        suffix: str = ".json",
        date_pattern: str = r".*",
        file_date_format: str = "%Y-%m-%d"
    ) -> List[str]:
        """
        List OSS files sorted newest-first by filename.

        NOTE: date_pattern and file_date_format are accepted for API
        compatibility but are currently unused — sorting relies on the
        lexicographic order of the filenames (assumed to embed a
        timestamp).

        Args:
            object_dir: OSS directory prefix
            max_num_files: maximum number of files to return
            suffix: file suffix filter
            date_pattern: (unused) date-matching regex
            file_date_format: (unused) date format

        Returns:
            Full OSS object keys, newest first; empty list on error.
        """
        try:
            files = self.list_files(
                oss_dir=object_dir,
                file_extension=suffix
            )

            # Extract the bare filenames.
            filenames = []
            for file_info in files:
                filename = file_info['name']
                # Simple suffix check (no complex regex matching).
                if suffix in filename:
                    filenames.append(filename)

            # Sort by filename (assumes filenames contain a timestamp).
            filenames.sort(reverse=True)

            # Cap the number of results.
            max_num_files = max_num_files or len(filenames)
            filenames = filenames[:max_num_files]

            logger.info(f"找到 {len(filenames)} 个文件,按日期排序")

            # Rebuild full OSS keys.
            # BUGFIX: the joined path previously dropped the filename.
            result = []
            for filename in filenames:
                if object_dir:
                    full_path = f"{object_dir.rstrip('/')}/{filename}"
                else:
                    full_path = filename
                result.append(full_path)

            return result

        except Exception as e:
            logger.error(f"列出最新文件失败: {e}")
            return []

    def download_object_to_file(
        self,
        oss_file_path: str,
        local_file_path: str,
        replace: bool = True,
        make_dir: bool = True
    ) -> bool:
        """
        Download an OSS object to a local file (compatibility method).

        Args:
            oss_file_path: OSS object key
            local_file_path: destination path
            replace: overwrite an existing local file
            make_dir: create the destination directory

        Returns:
            True on success.
        """
        try:
            # Honour the no-overwrite flag.
            if not replace and os.path.exists(local_file_path):
                logger.warning(f"本地文件已存在: {local_file_path}")
                return False

            if make_dir:
                local_dir = os.path.dirname(local_file_path)
                if local_dir:
                    os.makedirs(local_dir, exist_ok=True)

            # Delegate to the core download helper.
            return self.download_file(oss_file_path, local_file_path)

        except Exception as e:
            logger.error(f"下载对象失败: {oss_file_path} -> {local_file_path}, 错误: {e}")
            return False

    def get_file_info(self, oss_file_path: str) -> Optional[Dict]:
        """
        Fetch metadata for an OSS object.

        Returns:
            Dict with key/name/size/last_modified/etag/content_type, or
            None if the object is missing or the request fails.
        """
        try:
            obj = self.bucket.get_object_meta(oss_file_path)

            return {
                'key': oss_file_path,
                'name': os.path.basename(oss_file_path),
                'size': obj.content_length,
                'last_modified': obj.last_modified,
                'etag': obj.etag,
                'content_type': obj.content_type
            }

        except oss2.exceptions.NoSuchKey:
            logger.warning(f"文件不存在: {oss_file_path}")
            return None
        except Exception as e:
            logger.error(f"获取文件信息失败: {oss_file_path}, 错误: {e}")
            return None

    def delete_file(self, oss_file_path: str) -> bool:
        """
        Delete an OSS object.

        Returns:
            True on success.
        """
        try:
            self.bucket.delete_object(oss_file_path)
            logger.info(f"删除成功: {oss_file_path}")
            return True

        except Exception as e:
            logger.error(f"删除文件失败: {oss_file_path}, 错误: {e}")
            return False
410
+
411
+
412
+ # 兼容性别名 - 保持与原始代码的兼容性
413
+ class SimpleOSSManager(OSSFileManager):
414
+ """兼容性别名"""
415
+ pass
416
+
417
+
418
+ if __name__ == "__main__":
419
+ # 测试代码
420
+ try:
421
+ manager = OSSFileManager()
422
+ print("✅ OSS file manager initialized successfully")
423
+
424
+ # 测试列出文件
425
+ files = manager.list_files("atlas_eval/submissions/", file_extension=".json")
426
+ print(f"📁 Found {len(files)} submission files")
427
+
428
+ for file_info in files[:3]: # 只显示前3个
429
+ print(f" - {file_info['name']} ({file_info['size']} bytes)")
430
+
431
+ except Exception as e:
432
+ print(f"❌ Test failed: {e}")
src/oss/oss_leaderboard_manager.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ OSS排行榜管理器 - 从OSS读取和更新排行榜数据
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import tempfile
9
+ from datetime import datetime
10
+ from pathlib import Path
11
+ from typing import Dict, List, Any, Optional
12
+ from .oss_file_manager import OSSFileManager
13
+
14
+
15
class OSSLeaderboardManager:
    """OSS leaderboard manager - reads and updates leaderboard data in OSS."""

    def __init__(self):
        """Initialize the manager and its fixed OSS path layout."""
        self.oss_manager = OSSFileManager()

        # OSS path configuration.
        self.leaderboard_path = "atlas_eval/leaderboard/"
        self.backup_path = "atlas_eval/leaderboard/backup/"
        self.leaderboard_file = "leaderboard.json"

        # Full object key of the live leaderboard file.
        self.oss_leaderboard_file = f"{self.leaderboard_path}{self.leaderboard_file}"

        print(f"📊 OSS leaderboard path: oss://opencompass/{self.oss_leaderboard_file}")
        print(f"📦 OSS backup path: oss://opencompass/{self.backup_path}")

    def load_leaderboard_from_oss(self) -> List[Dict[str, Any]]:
        """
        Load leaderboard data from OSS.

        Returns:
            List of leaderboard entries; empty on missing file or error.
        """
        try:
            print(f"📥 Loading leaderboard data from OSS: {self.oss_leaderboard_file}")

            content = self.oss_manager.download_file_content(self.oss_leaderboard_file)

            if content:
                leaderboard_data = json.loads(content.decode('utf-8'))
                print(f"✅ Successfully loaded {len(leaderboard_data)} leaderboard entries")
                return leaderboard_data
            else:
                print("⚠️ No leaderboard file found in OSS, returning empty list")
                return []

        except Exception as e:
            print(f"❌ Failed to load leaderboard from OSS: {e}")
            return []

    def save_leaderboard_to_oss(self, leaderboard_data: List[Dict[str, Any]],
                                create_backup: bool = True) -> bool:
        """
        Save leaderboard data to OSS.

        Args:
            leaderboard_data: leaderboard entries to persist
            create_backup: back up the current file first

        Returns:
            True on success.
        """
        try:
            print(f"📤 Saving leaderboard data to OSS: {self.oss_leaderboard_file}")

            # Back up the existing file first (no-op if it doesn't exist).
            if create_backup:
                self._create_backup()

            # Serialise to a temporary file, then upload it.
            with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
                json.dump(leaderboard_data, temp_file, indent=2, ensure_ascii=False)
                temp_file_path = temp_file.name

            try:
                success = self.oss_manager.upload_file(
                    local_file_path=temp_file_path,
                    oss_file_path=self.oss_leaderboard_file
                )

                if success:
                    print(f"✅ Successfully saved {len(leaderboard_data)} leaderboard entries to OSS")
                    return True
                else:
                    print("❌ Failed to upload leaderboard file to OSS")
                    return False

            finally:
                # Best-effort temp-file cleanup.
                # BUGFIX: was a bare `except:` which also swallowed
                # KeyboardInterrupt/SystemExit; only ignore filesystem errors.
                try:
                    os.unlink(temp_file_path)
                except OSError:
                    pass

        except Exception as e:
            print(f"❌ Failed to save leaderboard to OSS: {e}")
            return False

    def _create_backup(self) -> bool:
        """
        Create a timestamped backup of the current leaderboard file.

        Returns:
            True on success (also when there is nothing to back up).
        """
        try:
            if not self.oss_manager.file_exists(self.oss_leaderboard_file):
                print("📋 Original leaderboard file does not exist, skipping backup")
                return True

            # Timestamped backup name, e.g. leaderboard.json.backup_20240101_120000.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            backup_filename = f"leaderboard.json.backup_{timestamp}"
            backup_path = f"{self.backup_path}{backup_filename}"

            success = self.oss_manager.copy_file(
                source_path=self.oss_leaderboard_file,
                target_path=backup_path
            )

            if success:
                print(f"📦 Backup created successfully: {backup_path}")
                return True
            else:
                print(f"❌ Failed to create backup: {backup_path}")
                return False

        except Exception as e:
            print(f"❌ Error creating backup: {e}")
            return False

    def add_evaluation_result(self, result_data: Dict[str, Any]) -> bool:
        """
        Add (or update) an evaluation result on the leaderboard.

        An existing entry is replaced when both organization and
        submitted_time match; otherwise the result is appended.  The board
        is kept sorted by accuracy, best first.

        Args:
            result_data: evaluation result entry

        Returns:
            True on success.
        """
        try:
            leaderboard_data = self.load_leaderboard_from_oss()

            # Look for an existing entry for the same submission.
            existing_entry = None
            for i, entry in enumerate(leaderboard_data):
                if (entry.get("organization") == result_data.get("organization") and
                    entry.get("submitted_time") == result_data.get("submitted_time")):
                    existing_entry = i
                    break

            if existing_entry is not None:
                print(f"🔄 Updating existing leaderboard entry: {result_data.get('organization')}")
                leaderboard_data[existing_entry] = result_data
            else:
                print(f"➕ Adding new leaderboard entry: {result_data.get('organization')}")
                leaderboard_data.append(result_data)

            # Sort by accuracy, descending.
            leaderboard_data.sort(
                key=lambda x: x.get("accuracy", 0),
                reverse=True
            )

            return self.save_leaderboard_to_oss(leaderboard_data)

        except Exception as e:
            print(f"❌ Failed to add evaluation result: {e}")
            return False

    def get_leaderboard_summary(self) -> Dict[str, Any]:
        """
        Build a small summary of the leaderboard.

        Returns:
            Dict with total_entries, last_updated, top_scores and oss_path,
            or {"error": ...} on failure.
        """
        try:
            leaderboard_data = self.load_leaderboard_from_oss()

            if not leaderboard_data:
                return {"total_entries": 0, "last_updated": None}

            total_entries = len(leaderboard_data)

            # Latest evaluation timestamp across all entries.
            latest_time = None
            for entry in leaderboard_data:
                eval_time = entry.get("evaluation_timestamp")
                if eval_time and (latest_time is None or eval_time > latest_time):
                    latest_time = eval_time

            # Best scores come from the first entry (board sorted by accuracy).
            top_scores = {}
            if leaderboard_data:
                top_entry = leaderboard_data[0]
                top_scores = {
                    "accuracy": top_entry.get("accuracy", 0),
                    "mg_pass_2": top_entry.get("mg_pass_2", 0),
                    "mg_pass_4": top_entry.get("mg_pass_4", 0)
                }

            return {
                "total_entries": total_entries,
                "last_updated": latest_time,
                "top_scores": top_scores,
                "oss_path": self.oss_leaderboard_file
            }

        except Exception as e:
            print(f"❌ Failed to get leaderboard summary: {e}")
            return {"error": str(e)}

    def migrate_local_to_oss(self, local_file_path: str) -> bool:
        """
        Migrate a local leaderboard JSON file to OSS (no backup taken).

        Args:
            local_file_path: path of the local leaderboard file

        Returns:
            True on success.
        """
        try:
            if not os.path.exists(local_file_path):
                print(f"❌ Local file does not exist: {local_file_path}")
                return False

            with open(local_file_path, 'r', encoding='utf-8') as f:
                leaderboard_data = json.load(f)

            print(f"📤 Migrating {len(leaderboard_data)} entries to OSS")

            return self.save_leaderboard_to_oss(leaderboard_data, create_backup=False)

        except Exception as e:
            print(f"❌ Failed to migrate file to OSS: {e}")
            return False
255
+
256
+
257
+ if __name__ == "__main__":
258
+ # 测试OSS排行榜管理器
259
+ manager = OSSLeaderboardManager()
260
+
261
+ # 打印摘要信息
262
+ summary = manager.get_leaderboard_summary()
263
+ print(f"📊 Leaderboard summary: {summary}")
264
+
265
+ # 测试加载排行榜
266
+ leaderboard = manager.load_leaderboard_from_oss()
267
+ print(f"📋 Number of leaderboard entries: {len(leaderboard)}")
src/oss/oss_submission_handler.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ OSS提交处理器 - 替换原有的git/http提交方式
4
+ 在HuggingFace Spaces中直接将提交文件上传到OSS
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ import json
10
+ from datetime import datetime
11
+ from pathlib import Path
12
+ from typing import Dict, Any, Tuple
13
+
14
+ # 导入同目录下的oss_file_manager
15
+ from .oss_file_manager import OSSFileManager
16
+
17
class OSSSubmissionHandler:
    """OSS submission handler - uploads user submissions directly to OSS.

    Replaces the previous git/http submission flow used in the
    HuggingFace Space.
    """

    def __init__(self, oss_submission_path: str = "atlas_eval/submissions/"):
        """
        Initialize the OSS submission handler.

        Args:
            oss_submission_path: OSS prefix under which submissions are stored
        """
        self.oss_path = oss_submission_path
        self.oss_manager = OSSFileManager()

        print(f"📁 OSS submission path: oss://opencompass/{oss_submission_path}")

    def format_error(self, msg: str) -> str:
        """Format an error message as red HTML."""
        return f"<p style='color: red; font-size: 16px;'>{msg}</p>"

    def format_success(self, msg: str) -> str:
        """Format a success message as green HTML."""
        return f"<p style='color: green; font-size: 16px;'>{msg}</p>"

    def format_warning(self, msg: str) -> str:
        """Format a warning message as orange HTML."""
        return f"<p style='color: orange; font-size: 16px;'>{msg}</p>"

    def validate_sage_submission(self, submission_data: Dict[str, Any]) -> Tuple[bool, str]:
        """Validate the ATLAS benchmark submission format.

        Returns:
            (is_valid, message) tuple.
        """
        # Required top-level fields.
        required_fields = ["submission_org", "submission_email", "predictions"]
        for field in required_fields:
            if field not in submission_data:
                return False, f"Missing required field: {field}"

        # Very light email sanity check.
        email = submission_data["submission_email"]
        if "@" not in email or "." not in email:
            return False, "Invalid email format"

        # predictions must be a non-empty list of well-formed entries.
        predictions = submission_data["predictions"]
        if not isinstance(predictions, list) or len(predictions) == 0:
            return False, "predictions must be a non-empty list"

        for i, prediction in enumerate(predictions):
            # Required per-prediction fields.
            pred_required_fields = ["original_question_id", "content", "reasoning_content"]
            for field in pred_required_fields:
                if field not in prediction:
                    return False, f"Missing field in prediction {i}: {field}"

            content = prediction["content"]
            reasoning_content = prediction["reasoning_content"]

            # content must hold exactly 4 samples per question.
            if not isinstance(content, list) or len(content) != 4:
                return False, f"content in prediction {i} must be a list with 4 items"

            if not isinstance(reasoning_content, list):
                return False, f"reasoning_content in prediction {i} must be a list"

            # NOTE: reasoning_content length is intentionally NOT enforced
            # (a previous check requiring 0 or 4 items was disabled).

            if not isinstance(prediction["original_question_id"], int):
                return False, f"question ID in prediction {i} must be an integer"

        return True, "Submission format is valid"

    def generate_submission_filename(self, submission_data: Dict[str, Any]) -> str:
        """Build a unique submission filename from the org name and a timestamp."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # Sanitise the org name so it is filesystem/OSS-key safe.
        org_name = submission_data["submission_org"].replace(" ", "_").replace("/", "_").replace("\\", "_")
        return f"submission_{org_name}_{timestamp}.json"

    def upload_to_oss(self, submission_data: Dict[str, Any], filename: str) -> Tuple[bool, str]:
        """Upload the submission JSON to OSS.

        Returns:
            (success, oss_uri_or_error_message)
        """
        try:
            # Serialise to a temporary local file first.
            temp_file = f"/tmp/{filename}"
            with open(temp_file, 'w', encoding='utf-8') as f:
                json.dump(submission_data, f, indent=2, ensure_ascii=False)

            oss_file_path = f"{self.oss_path}{filename}"

            print(f"⬆️ Uploading to OSS: {oss_file_path}")
            # BUGFIX: the boolean result of upload_file_to_object was
            # previously ignored, so a failed upload was reported as success.
            uploaded = self.oss_manager.upload_file_to_object(
                local_file_path=temp_file,
                oss_file_path=oss_file_path,
                replace=True
            )

            # Clean up the temporary file.
            os.remove(temp_file)

            if not uploaded:
                print(f"❌ OSS upload failed: {oss_file_path}")
                return False, f"upload to {oss_file_path} failed"

            print(f"✅ OSS upload successful: {oss_file_path}")
            return True, f"oss://opencompass/{oss_file_path}"

        except Exception as e:
            print(f"❌ OSS upload failed: {e}")
            return False, str(e)

    def process_sage_submission(self, submission_file_or_data, org_name=None, email=None) -> str:
        """
        Process an ATLAS benchmark submission - OSS mode.

        Accepts either a path to a JSON file or an already-parsed dict;
        validates it and uploads it to OSS.

        Returns:
            An HTML status message (success or error).
        """
        try:
            if submission_file_or_data is None:
                return self.format_error("❌ No submission data provided.")

            # A string argument is treated as a path to a JSON file.
            if isinstance(submission_file_or_data, str):
                try:
                    with open(submission_file_or_data, 'r', encoding='utf-8') as f:
                        content = f.read()
                    # Parse the JSON payload.
                    submission_data = json.loads(content)
                except Exception as e:
                    return self.format_error(f"❌ Error reading file: {str(e)}")
            # A dict is used as-is.
            elif isinstance(submission_file_or_data, dict):
                submission_data = submission_file_or_data
            else:
                return self.format_error("❌ Invalid submission data format.")

            # Form-provided org/email override whatever is in the file.
            if org_name and email:
                submission_data["submission_org"] = org_name.strip()
                submission_data["submission_email"] = email.strip()

            # Validate the submission format.
            is_valid, message = self.validate_sage_submission(submission_data)
            if not is_valid:
                return self.format_error(f"❌ Submission validation failed: {message}")

            # Build a unique filename and upload.
            filename = self.generate_submission_filename(submission_data)

            success, result = self.upload_to_oss(submission_data, filename)

            if not success:
                return self.format_error(f"❌ Failed to upload to OSS: {result}")

            # Compose the success message.
            org = submission_data["submission_org"]
            email_addr = submission_data["submission_email"]
            num_predictions = len(submission_data["predictions"])

            # BUGFIX: the Filename line previously lost its f-string
            # placeholder; it now interpolates the generated filename.
            success_msg = self.format_success(f"""
            🎉 <strong>Submission successful!</strong><br><br>
            📋 <strong>Submission Information:</strong><br>
            • Organization: {org}<br>
            • Email: {email_addr}<br>
            • Number of predictions: {num_predictions} questions<br>
            • Filename: {filename}<br><br>
            🚀 <strong>Storage Location:</strong><br>
            {result}<br><br>
            ⚡ <strong>Evaluation Status:</strong><br>
            Your submission has been successfully uploaded to cloud storage. The automatic evaluation system will begin processing within 5-15 minutes.<br><br>
            ⏳ <strong>Evaluation Process:</strong><br>
            1. 🔍 System automatically detects new submission<br>
            2. ⬇️ Downloads and validates submission format<br>
            3. 🔬 Performs comprehensive evaluation using LLM-as-Judge<br>
            4. 📊 Calculates accuracy for each subject and overall<br>
            5. 🏆 Automatically updates to leaderboard<br><br>
            🕐 <strong>Estimated Time:</strong><br>
            Evaluation completion time is approximately 5-15 minutes, depending on current queue length.<br>
            Please refresh the leaderboard later to view results.<br><br>
            🧪 Thank you for participating in the ATLAS scientific reasoning benchmark!
            """)

            return success_msg

        except Exception as e:
            return self.format_error(f"❌ Submission processing failed: {str(e)}")
201
+
202
+ # 兼容性函数 - 保持与原有代码的接口一致
203
def process_sage_submission_simple(submission_file, org_name=None, email=None) -> str:
    """
    Process an ATLAS benchmark submission file in OSS mode.

    Compatibility wrapper keeping the interface of the original submit.py
    entry point; simply delegates to a fresh OSSSubmissionHandler.
    """
    return OSSSubmissionHandler().process_sage_submission(submission_file, org_name, email)
210
+
211
def format_error(msg):
    """Wrap *msg* in red error-styled HTML (module-level helper)."""
    template = "<p style='color: red; font-size: 16px;'>{}</p>"
    return template.format(msg)
213
+
214
def format_success(msg):
    """Wrap *msg* in green success-styled HTML (module-level helper)."""
    template = "<p style='color: green; font-size: 16px;'>{}</p>"
    return template.format(msg)
216
+
217
def format_warning(msg):
    """Wrap *msg* in orange warning-styled HTML (module-level helper)."""
    template = "<p style='color: orange; font-size: 16px;'>{}</p>"
    return template.format(msg)
219
+
220
+ if __name__ == "__main__":
221
+ # 测试代码
222
+ print("🧪 测试OSS提交处理器")
223
+
224
+ # 检查环境变量
225
+ required_env_vars = ["OSS_ACCESS_KEY_ID", "OSS_ACCESS_KEY_SECRET"]
226
+ missing_vars = [var for var in required_env_vars if not os.getenv(var)]
227
+
228
+ if missing_vars:
229
+ print(f"❌ 缺少必需的环境变量: {missing_vars}")
230
+ exit(1)
231
+
232
+ handler = OSSSubmissionHandler()
233
+ print("✅ OSS提交处理器初始化成功")
src/populate.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
+ import pandas as pd
5
+ from typing import List
6
+
7
+ from src.display.formatting import has_no_nan_values, make_clickable_model
8
+ from src.display.utils import AutoEvalColumn
9
+
10
# Import ATLAS-specific modules - avoid transformers dependency
# The whole setup is wrapped in try/except so the leaderboard degrades
# gracefully (loader stays None -> empty dataframe) if optional
# dependencies are missing.
process_sage_results_for_leaderboard = None
try:
    # Import ATLAS modules without triggering transformers dependency
    import sys
    import os
    import json
    from dataclasses import dataclass
    from typing import Dict, List, Any
    import numpy as np

    # Copy ATLASResult class locally to avoid import issues (keeping SAGEResult name for compatibility)
    @dataclass
    class SAGEResult:
        # One evaluated leaderboard entry loaded from OSS.
        submission_id: str          # e.g. "oss_03_Some_Model" (built by the loader below)
        organization: str
        email: str
        tokens: str                 # token count info; "N/A" when absent
        accuracy: float             # overall accuracy, in percent
        mg_pass_2: float            # mG-Pass@2 metric, in percent
        mg_pass_4: float            # mG-Pass@4 metric, in percent
        submitted_time: str         # ISO timestamp or "YYYY-MM-DD ..." string
        status: str = "EVALUATED"

        def to_dict(self):
            """Converts the ATLAS Result to a dict compatible with our dataframe display"""
            # Extract model name from submission_id or use model_name directly
            # (the loader attaches `model_name` as an extra attribute).
            if hasattr(self, 'model_name'):
                model_name = self.model_name
            elif self.submission_id.startswith("oss_"):
                # Extract model name from submission_id:
                # "oss_<idx>_<name>" -> "<name>" with underscores as spaces.
                model_name = self.submission_id.split("_", 2)[-1].replace("_", " ")
            else:
                model_name = self.submission_id

            # Create display name (bold markdown for the table cell)
            display_name = f"**{model_name}**"
            model_symbol = "🤖"  # NOTE(review): currently unused in the dict below

            # Format date to YYYY-MM-DD only
            formatted_date = self.submitted_time
            if isinstance(self.submitted_time, str):
                # Try to parse and reformat date
                try:
                    if 'T' in self.submitted_time:
                        # ISO format like "2025-09-09T14:37:23.616340"
                        formatted_date = self.submitted_time.split('T')[0]
                    else:
                        # Already in simple format
                        formatted_date = self.submitted_time.split(' ')[0]
                except:
                    formatted_date = self.submitted_time

            data_dict = {
                "Model": display_name,
                "Organization": self.organization,
                "Accuracy (%)": round(self.accuracy, 2),
                "mG-Pass@2 (%)": round(self.mg_pass_2, 2),
                "mG-Pass@4 (%)": round(self.mg_pass_4, 2),
                "Submission Date": formatted_date,
            }

            return data_dict

    def load_initial_sage_results_from_oss() -> List[SAGEResult]:
        """Load initial ATLAS results from OSS"""
        sage_results = []

        try:
            # Import the OSS leaderboard manager (deferred so a missing OSS
            # setup only disables this loader).
            from src.oss.oss_leaderboard_manager import OSSLeaderboardManager

            # Load leaderboard data from OSS
            leaderboard_manager = OSSLeaderboardManager()
            initial_data = leaderboard_manager.load_leaderboard_from_oss()

            if initial_data:
                print(f"✅ Loaded {len(initial_data)} leaderboard entries from OSS")

                for i, entry in enumerate(initial_data):
                    # Build one SAGEResult per OSS entry; missing optional
                    # fields fall back to sensible defaults.
                    sage_result = SAGEResult(
                        submission_id=f"oss_{i:02d}_{entry['model_name'].replace(' ', '_').replace('-', '_')}",
                        organization=entry['organization'],
                        email=entry.get('contact_email', f"contact@{entry['organization'].lower().replace(' ', '')}.com"),
                        tokens=entry.get('tokens', 'N/A'),
                        accuracy=entry.get('accuracy', 0.0),
                        mg_pass_2=entry.get('mg_pass_2', 0.0),
                        mg_pass_4=entry.get('mg_pass_4', 0.0),
                        submitted_time=entry["submitted_time"],
                        status="EVALUATED"
                    )
                    # Add model_name as additional attribute for display
                    sage_result.model_name = entry['model_name']
                    sage_results.append(sage_result)
            else:
                print("⚠️ No leaderboard data found in OSS")

        except Exception as e:
            # Best-effort: an OSS failure yields an empty leaderboard.
            print(f"❌ Failed to load leaderboard from OSS: {e}")

        return sage_results

    def process_sage_results_for_leaderboard_oss() -> List[SAGEResult]:
        """Process all ATLAS results from OSS"""
        return load_initial_sage_results_from_oss()

    # Set the function (module-level entry point used by get_sage_leaderboard_df)
    process_sage_results_for_leaderboard = process_sage_results_for_leaderboard_oss

except ImportError as e:
    # NOTE(review): only ImportError is caught here; any other failure during
    # this setup would propagate at import time - confirm that is intended.
    print(f"Could not set up ATLAS results processing: {e}")
    process_sage_results_for_leaderboard = None
122
+
123
+
124
def get_sage_leaderboard_df() -> pd.DataFrame:
    """Build the ATLAS leaderboard dataframe from evaluation results.

    Returns an empty dataframe when the results loader is unavailable or
    produces no entries.
    """
    if process_sage_results_for_leaderboard is None:
        return pd.DataFrame()

    # Collect per-result display dicts.
    records = [entry.to_dict() for entry in process_sage_results_for_leaderboard()]
    if not records:
        return pd.DataFrame()

    df = pd.DataFrame.from_records(records)
    cols = set(df.columns)

    # Keep only the latest submission for each (Model, Organization) pair:
    # sort newest-first, then drop the older duplicates.
    if {"Model", "Organization", "Submission Date"} <= cols:
        df = df.sort_values(by=["Submission Date"], ascending=False)
        df = df.drop_duplicates(subset=["Model", "Organization"], keep="first")

    # Rank the leaderboard by accuracy, best first.
    if "Accuracy (%)" in cols:
        df = df.sort_values(by=["Accuracy (%)"], ascending=False)

    # Normalize metric precision for display.
    for metric in ("Accuracy (%)", "mG-Pass@2 (%)", "mG-Pass@4 (%)"):
        if metric in cols:
            df[metric] = df[metric].round(2)

    return df
src/submission/submit.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ ATLAS提交处理 - OSS模式
4
+ 使用阿里云OSS替代git/http提交方式
5
+ """
6
+
7
+ import json
8
+ import os
9
+ import sys
10
+ from datetime import datetime
11
+ from typing import Dict, Any, Tuple
12
+ from pathlib import Path
13
+
14
+ # 导入OSS提交处理器
15
+ try:
16
+ from src.oss.oss_submission_handler import OSSSubmissionHandler
17
+ OSS_AVAILABLE = True
18
+ except ImportError as e:
19
+ print(f"⚠️ OSS module not available, using fallback mode: {e}")
20
+ OSS_AVAILABLE = False
21
+
22
def format_error(msg):
    """Render *msg* as a red, 16px HTML paragraph for error display."""
    return "<p style='color: red; font-size: 16px;'>{}</p>".format(msg)
24
+
25
def format_success(msg):
    """Render *msg* as a green, 16px HTML paragraph for success display."""
    return "<p style='color: green; font-size: 16px;'>{}</p>".format(msg)
27
+
28
def format_warning(msg):
    """Render *msg* as an orange, 16px HTML paragraph for warning display."""
    return "<p style='color: orange; font-size: 16px;'>{}</p>".format(msg)
30
+
31
def validate_sage_submission(submission_data: Dict[str, Any]) -> Tuple[bool, str]:
    """Validate the structure of an ATLAS benchmark submission.

    Checks the required top-level fields, runs a minimal email sanity test,
    and verifies the shape of every prediction entry.

    Returns:
        ``(is_valid, message)`` - message explains the first failure found,
        or confirms validity.
    """
    # Required top-level fields.
    for field in ("submission_org", "submission_email", "predictions"):
        if field not in submission_data:
            return False, f"Missing required field: {field}"

    # Minimal sanity check on the contact address (not a full RFC check).
    email = submission_data["submission_email"]
    if "@" not in email or "." not in email:
        return False, "Invalid email format"

    predictions = submission_data["predictions"]
    if not isinstance(predictions, list) or not predictions:
        return False, "predictions must be a non-empty list"

    for i, prediction in enumerate(predictions):
        # Required per-prediction fields.
        for field in ("original_question_id", "content", "reasoning_content"):
            if field not in prediction:
                return False, f"Missing field in prediction {i}: {field}"

        # Each prediction carries exactly 4 sampled answers.
        content = prediction["content"]
        if not isinstance(content, list) or len(content) != 4:
            return False, f"content in prediction {i} must be a list with 4 items"

        # reasoning_content may be empty; only its type is enforced.
        if not isinstance(prediction["reasoning_content"], list):
            return False, f"reasoning_content in prediction {i} must be a list"

        if not isinstance(prediction["original_question_id"], int):
            return False, f"question ID in prediction {i} must be an integer"

    return True, "Submission format is valid"
76
+
77
def save_submission_file(submission_data: Dict[str, Any], submissions_dir: str = "./submissions") -> str:
    """Persist a submission as pretty-printed UTF-8 JSON.

    The file name encodes the submitting organization (with path-hostile
    characters replaced by underscores) and a second-resolution timestamp.

    Returns:
        The path of the written file.
    """
    # Make sure the target directory exists.
    os.makedirs(submissions_dir, exist_ok=True)

    # Sanitize the organization name so it is safe inside a file name.
    safe_org = submission_data["submission_org"]
    for ch in (" ", "/", "\\"):
        safe_org = safe_org.replace(ch, "_")

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    destination = os.path.join(submissions_dir, f"submission_{safe_org}_{stamp}.json")

    # Write human-readable JSON, keeping non-ASCII characters intact.
    with open(destination, "w", encoding="utf-8") as handle:
        json.dump(submission_data, handle, indent=2, ensure_ascii=False)

    return destination
96
+
97
def process_sage_submission_simple(submission_file, model_name=None, org_name=None, email=None) -> str:
    """Handle an ATLAS benchmark submission (file-collection mode).

    Validates and stores the uploaded prediction file; evaluation happens
    asynchronously elsewhere. Tries the OSS pipeline first and falls back
    to plain local storage when OSS is unavailable or fails.

    Args:
        submission_file: Path to the uploaded JSON file, or None when
            nothing was uploaded.
        model_name: Optional model name from the submission form.
        org_name: Optional organization name from the submission form.
        email: Optional contact email from the submission form.

    Returns:
        An HTML fragment describing the outcome (success or error).
    """
    try:
        if submission_file is None:
            return format_error("❌ No file uploaded. Please select a JSON file.")

        # submission_file is a plain path string; read it as UTF-8 text.
        try:
            with open(submission_file, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            return format_error(f"❌ Error reading file: {str(e)}")

        # Parse the JSON payload.
        try:
            submission_data = json.loads(content)
        except json.JSONDecodeError as e:
            return format_error(f"❌ Invalid JSON format: {str(e)}")

        # Form fields, when provided, override whatever is inside the file.
        if model_name:
            submission_data["model_name"] = model_name.strip()
        if org_name and email:
            submission_data["submission_org"] = org_name.strip()
            submission_data["submission_email"] = email.strip()

        # Validate the submission structure before persisting anything.
        is_valid, message = validate_sage_submission(submission_data)
        if not is_valid:
            return format_error(f"❌ Submission validation failed: {message}")

        try:
            saved_path = save_submission_file(submission_data)
            print(f"✅ Submission file saved to: {saved_path}")

            # Preferred path: hand the submission to the OSS pipeline.
            if OSS_AVAILABLE:
                try:
                    oss_handler = OSSSubmissionHandler()
                    # NOTE(review): the OSS compat wrapper receives a file
                    # path, while here we pass the parsed dict - confirm the
                    # handler accepts both input shapes.
                    result = oss_handler.process_sage_submission(submission_data, org_name, email)

                    if "Submission successful" in result or "successful" in result.lower():
                        return result
                    # OSS rejected the submission; fall through to local mode.
                    print(f"⚠️ OSS submission failed, using fallback mode: {result}")
                except Exception as e:
                    print(f"⚠️ OSS submission exception, using fallback mode: {e}")

            # Fallback mode: the file stays in local storage only.
            # BUG FIX: the saved file name was computed but never shown
            # (the message previously hard-coded "(unknown)").
            filename = os.path.basename(saved_path)

            org = submission_data["submission_org"]
            email_addr = submission_data["submission_email"]
            num_predictions = len(submission_data["predictions"])

            success_msg = format_success(f"""
🎉 <strong>Submission successful!</strong><br><br>
📋 <strong>Submission Information:</strong><br>
• Organization: {org}<br>
• Email: {email_addr}<br>
• Number of predictions: {num_predictions} questions<br>
• Filename: {filename}<br><br>
🚀 <strong>Storage Status:</strong><br>
File saved to local storage, awaiting system sync to evaluation environment.<br><br>
⏳ <strong>Evaluation Process:</strong><br>
Your submission will be automatically evaluated using LLM-as-Judge, including comprehensive testing of scientific reasoning capabilities.<br>
Results will appear automatically on the leaderboard after evaluation is complete.<br><br>
🕐 <strong>Estimated Time:</strong><br>
• Normal case: 5-15 minutes<br>
• Sync delay: 15-60 minutes<br><br>
🧪 Thank you for participating in the ATLAS scientific reasoning benchmark!
""")

            return success_msg

        except Exception as e:
            return format_error(f"❌ Error saving submission file: {str(e)}")

    except Exception as e:
        return format_error(f"❌ Submission processing failed: {str(e)}")
187
+
188
def get_submission_stats(submissions_dir: str = "./submissions") -> Dict[str, Any]:
    """Summarize saved submissions.

    Scans *submissions_dir* for files written by save_submission_file and
    returns ``{"total": <count>, "recent": <up to 10 newest entries>}``.
    Unreadable or malformed files are skipped.
    """
    if not os.path.exists(submissions_dir):
        return {"total": 0, "recent": []}

    submissions = []

    for filename in os.listdir(submissions_dir):
        if not (filename.startswith("submission_") and filename.endswith(".json")):
            continue

        file_path = os.path.join(submissions_dir, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # File names look like submission_<org>_<YYYYmmdd>_<HHMMSS>.json,
            # so the timestamp is the last TWO underscore-separated pieces.
            # BUG FIX: taking only the last piece dropped the date part and
            # made strptime fail on every file.
            stem = filename[:-len(".json")]
            timestamp_str = "_".join(stem.split("_")[-2:])
            try:
                timestamp = datetime.strptime(timestamp_str, "%Y%m%d_%H%M%S")
                formatted_time = timestamp.strftime("%Y-%m-%d %H:%M")
            except ValueError:
                # Unexpected name layout: fall back to the raw fragment.
                formatted_time = timestamp_str

            submissions.append({
                "org": data.get("submission_org", "Unknown"),
                "email": data.get("submission_email", ""),
                "time": formatted_time,
                "predictions": len(data.get("predictions", []))
            })

        except Exception:
            # Skip files that cannot be read or parsed.
            continue

    # Sort by time string, newest first (ISO-like format sorts correctly).
    submissions.sort(key=lambda x: x["time"], reverse=True)

    return {
        "total": len(submissions),
        "recent": submissions[:10]  # the 10 most recent
    }
228
+
229
+ # 移除了原有的HTTP推送函数,现在使用OSS模式
230
+