“pangjh3” committed on
Commit
4edba36
·
1 Parent(s): ee81350

modified: .gitattributes

Browse files

new file: .gitignore
new file: .pre-commit-config.yaml
new file: Makefile
new file: app.py
new file: pyproject.toml
new file: requirements.txt
new file: src/about.py
new file: src/display/css_html_js.py
new file: src/display/formatting.py
new file: src/display/utils.py
new file: src/envs.py
new file: src/oss/__init__.py
new file: src/oss/oss_file_manager.py
new file: src/oss/oss_leaderboard_manager.py
new file: src/oss/oss_submission_handler.py
new file: src/populate.py
new file: src/submission/submit.py

.gitattributes CHANGED
@@ -25,7 +25,6 @@
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
  *.tflite filter=lfs diff=lfs merge=lfs -text
30
  *.tgz filter=lfs diff=lfs merge=lfs -text
31
  *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
 
28
  *.tflite filter=lfs diff=lfs merge=lfs -text
29
  *.tgz filter=lfs diff=lfs merge=lfs -text
30
  *.wasm filter=lfs diff=lfs merge=lfs -text
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ scale-hf-logo.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ auto_evals/
2
+ venv/
3
+ __pycache__/
4
+ .env
5
+ .ipynb_checkpoints
6
+ *ipynb
7
+ .vscode/
8
+
9
+ eval-queue/
10
+ eval-results/
11
+ eval-queue-bk/
12
+ eval-results-bk/
13
+ logs/
14
+
.pre-commit-config.yaml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ default_language_version:
16
+ python: python3
17
+
18
+ ci:
19
+ autofix_prs: true
20
+ autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
21
+ autoupdate_schedule: quarterly
22
+
23
+ repos:
24
+ - repo: https://github.com/pre-commit/pre-commit-hooks
25
+ rev: v4.3.0
26
+ hooks:
27
+ - id: check-yaml
28
+ - id: check-case-conflict
29
+ - id: detect-private-key
30
+ - id: check-added-large-files
31
+ args: ['--maxkb=1000']
32
+ - id: requirements-txt-fixer
33
+ - id: end-of-file-fixer
34
+ - id: trailing-whitespace
35
+
36
+ - repo: https://github.com/PyCQA/isort
37
+ rev: 5.12.0
38
+ hooks:
39
+ - id: isort
40
+ name: Format imports
41
+
42
+ - repo: https://github.com/psf/black
43
+ rev: 22.12.0
44
+ hooks:
45
+ - id: black
46
+ name: Format code
47
+ additional_dependencies: ['click==8.0.2']
48
+
49
+ - repo: https://github.com/charliermarsh/ruff-pre-commit
50
+ # Ruff version.
51
+ rev: 'v0.0.267'
52
+ hooks:
53
+ - id: ruff
54
+
Makefile ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Declare both targets phony so make always runs them (they produce no files).
# Fix: .PHONY previously listed a non-existent `format` target and omitted `quality`.
.PHONY: style quality


# Auto-format the codebase in place (black + isort + ruff autofix).
style:
	python -m black --line-length 119 .
	python -m isort .
	ruff check --fix .


# Check formatting/lint without modifying files (CI-friendly).
quality:
	python -m black --check --line-length 119 .
	python -m isort --check-only .
	ruff check .
app.py ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import datetime
4
+ import requests
5
+ from email.utils import parseaddr
6
+ import gradio as gr
7
+ import pandas as pd
8
+ import numpy as np
9
+
10
+ from src.about import (
11
+ CITATION_BUTTON_LABEL,
12
+ CITATION_BUTTON_TEXT,
13
+ EVALUATION_QUEUE_TEXT,
14
+ INTRODUCTION_TEXT,
15
+ LLM_BENCHMARKS_TEXT,
16
+ TITLE,
17
+ )
18
+ from src.display.css_html_js import custom_css
19
+ from src.display.utils import (
20
+ BENCHMARK_COLS,
21
+ COLS,
22
+ EVAL_COLS,
23
+ EVAL_TYPES,
24
+ AutoEvalColumn,
25
+ ModelType,
26
+ fields,
27
+ WeightType,
28
+ Precision
29
+ )
30
+
31
# ATLAS specific imports - use populate module to avoid transformers dependency.
# The populate module may itself fail to resolve its dependencies, in which case
# it exports None for these names; both the ImportError and the None case are
# folded into the single SAGE_MODULES_AVAILABLE flag checked elsewhere.
try:
    from src.populate import process_sage_results_for_leaderboard, get_sage_leaderboard_df
    SAGE_MODULES_AVAILABLE = process_sage_results_for_leaderboard is not None
    if SAGE_MODULES_AVAILABLE:
        print("✅ ATLAS modules loaded successfully")
    else:
        print("❌ ATLAS modules not available")
except ImportError as e:
    print(f"Warning: ATLAS modules not available: {e}")
    SAGE_MODULES_AVAILABLE = False
42
+
43
+
44
# Configuration
TOKEN = os.environ.get("HF_TOKEN", None)  # HF API token; None when unset (read-only mode)
OWNER = "opencompass"  # NOTE(review): not referenced in this file — confirm it is used by the src/ modules

# OSS submission tracking paths (object-key prefix + file name inside the OSS bucket)
SUBMISSION_TRACKING_PATH = "atlas_eval/submissions/user_tracking/"
SUBMISSION_HISTORY_FILE = "submission_history.json"
51
+
52
def format_error(msg):
    """Render *msg* as a red, centered HTML paragraph (error styling)."""
    style = "color: red; font-size: 20px; text-align: center;"
    return f"<p style='{style}'>{msg}</p>"
54
+
55
def format_warning(msg):
    """Render *msg* as an orange, centered HTML paragraph (warning styling)."""
    style = "color: orange; font-size: 20px; text-align: center;"
    return f"<p style='{style}'>{msg}</p>"
57
+
58
def format_log(msg):
    """Render *msg* as a green, centered HTML paragraph (success/info styling)."""
    style = "color: green; font-size: 20px; text-align: center;"
    return f"<p style='{style}'>{msg}</p>"
60
+
61
def model_hyperlink(link, model_name):
    """Return *model_name* wrapped in an anchor to *link*, or plain text.

    Only http(s) URLs become links; a missing or non-http *link* falls back
    to the bare model name.
    """
    if not (link and link.startswith("http")):
        return model_name
    return (
        f'<a target="_blank" href="{link}" '
        f'style="color: var(--link-text-color); text-decoration: underline;'
        f'text-decoration-style: dotted;">{model_name}</a>'
    )
65
+
66
def load_submission_history():
    """Fetch the per-user submission history dict from OSS.

    Returns {} when the tracking file does not exist yet or on any error
    (network, credentials, bad JSON) — callers always get a usable dict.
    """
    try:
        from src.oss.oss_file_manager import OSSFileManager

        # Pull the tracking JSON from OSS (falsy when the object is absent).
        raw = OSSFileManager().download_file_content(
            SUBMISSION_TRACKING_PATH + SUBMISSION_HISTORY_FILE
        )
        if not raw:
            print("📝 Creating new submission history")
            return {}
        return json.loads(raw)

    except Exception as e:
        print(f"⚠️ Failed to load submission history: {e}")
        return {}
86
+
87
def save_submission_history(history):
    """Serialize *history* to JSON and upload it to OSS.

    Returns the uploader's success flag, or False on any exception.
    """
    try:
        from src.oss.oss_file_manager import OSSFileManager

        payload = json.dumps(history, indent=2, ensure_ascii=False)
        return OSSFileManager().upload_file_content(
            content=payload,
            object_key=SUBMISSION_TRACKING_PATH + SUBMISSION_HISTORY_FILE,
        )

    except Exception as e:
        print(f"❌ Failed to save submission history: {e}")
        return False
105
+
106
def check_user_submission_eligibility(profile: gr.OAuthProfile, org_name: str):
    """Check whether *profile* is allowed to submit right now.

    Two gates: the HF account must be older than 60 days, and a user may
    submit at most twice per calendar day (per the OSS-stored history).

    Returns:
        (True, ok_message) when eligible, else (False, reason).

    Note: *org_name* is currently unused; kept for interface stability
    (duplicate-per-organization checks could use it later).
    """
    try:
        # 1. Account age limit (60 days) via the public HF user API.
        #    Fix: added a timeout — without one a stalled request would hang
        #    the Gradio worker indefinitely.
        user_data = requests.get(
            f"https://huggingface.co/api/users/{profile.username}/overview",
            timeout=10,
        )
        if user_data.status_code == 200:
            creation_date = json.loads(user_data.content)["createdAt"]
            # assumes createdAt is ISO-8601 with microseconds and a literal Z — TODO confirm API format
            account_age = datetime.datetime.now() - datetime.datetime.strptime(
                creation_date, '%Y-%m-%dT%H:%M:%S.%fZ'
            )
            if account_age < datetime.timedelta(days=60):
                return False, "This account does not meet the submission requirement. Account age must exceed 60 days."
        else:
            return False, "Unable to verify account information. Please try again later."

        # 2. Daily submission limit (2 per user per calendar day).
        submission_history = load_submission_history()
        user_submissions = submission_history.get(profile.username, [])

        today = datetime.datetime.today().strftime('%Y-%m-%d')
        today_submissions = [s for s in user_submissions if s.get("date", "") == today]

        if len(today_submissions) >= 2:
            return False, "You have already submitted twice today. Please try again tomorrow."

        return True, "Eligibility check passed"

    except Exception as e:
        print(f"❌ User eligibility check failed: {e}")
        return False, f"System check error, please try again later: {str(e)}"
135
+
136
def record_user_submission(profile: gr.OAuthProfile, model_name: str, org_name: str, email: str):
    """Append one submission entry for *profile* and persist the history to OSS.

    Returns the save result (bool); False on any exception.
    """
    try:
        history = load_submission_history()
        # setdefault replaces the explicit "key missing" branch.
        entries = history.setdefault(profile.username, [])

        entries.append(
            {
                "date": datetime.datetime.today().strftime('%Y-%m-%d'),
                "time": datetime.datetime.now().strftime('%H:%M:%S'),
                "model": model_name,
                "organization": org_name,
                "email": email,
                "username": profile.username,
            }
        )

        return save_submission_history(history)

    except Exception as e:
        print(f"❌ Failed to record submission history: {e}")
        return False
162
+
163
def get_leaderboard_dataframe():
    """Build the leaderboard DataFrame from ATLAS results.

    Returns an empty DataFrame when the ATLAS modules are unavailable,
    no results exist, or loading raises.
    """
    print("🔄 Loading ATLAS leaderboard data...")

    if not SAGE_MODULES_AVAILABLE:
        print("❌ ATLAS modules not available")
        return pd.DataFrame()

    try:
        frame = get_sage_leaderboard_df()
        if frame.empty:
            print("❌ No ATLAS results found")
            return pd.DataFrame()

        print(f"✅ Generated dataframe with {len(frame)} rows")
        return frame

    except Exception as e:
        print(f"❌ Error generating leaderboard dataframe: {e}")
        import traceback
        traceback.print_exc()
        return pd.DataFrame()
187
+
188
def refresh_leaderboard():
    """Re-fetch the leaderboard data (thin wrapper used by the refresh button)."""
    print("🔄 Refreshing leaderboard data...")
    refreshed = get_leaderboard_dataframe()
    return refreshed
192
+
193
# Initialize data — load the leaderboard once at import time so the first
# page render already has rows.
print("🚀 Initializing ATLAS leaderboard...")
leaderboard_df = get_leaderboard_dataframe()
print(f"📈 Leaderboard initialized with {len(leaderboard_df)} rows")

# Define column types for the dataframe (Model, Organization, Accuracy, mG-Pass@2, mG-Pass@4, Submission Date)
COLUMN_TYPES = ["markdown", "str", "number", "number", "number", "str"]


# Create Gradio interface
demo = gr.Blocks(css="""
.markdown-text {
    font-size: 16px !important;
}
#citation-button {
    font-family: monospace;
}
""")

with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                lines=10,
                max_lines=10,
                interactive=False
            )

    # Main leaderboard table
    gr.Markdown("## 🏆 ATLAS Benchmark Results", elem_classes="markdown-text")

    # Debug information - dynamic component (updated together with the table)
    results_count = gr.Markdown(f"📊 **Showing {len(leaderboard_df)} results**")

    leaderboard_table = gr.Dataframe(
        value=leaderboard_df,
        datatype=COLUMN_TYPES,
        interactive=False,
        wrap=True,
        column_widths=["30%", "20%", "12%", "12%", "12%", "14%"]
    )

    # Refresh button
    refresh_button = gr.Button("🔄 Refresh Leaderboard")

    def refresh_leaderboard_with_count():
        """Refresh leaderboard and update count display"""
        df = refresh_leaderboard()
        count_text = f"📊 **Showing {len(df)} results**"
        return df, count_text

    refresh_button.click(
        refresh_leaderboard_with_count,
        inputs=[],
        outputs=[leaderboard_table, results_count]
    )

    # Submission section
    with gr.Accordion("📊 Submit Your ATLAS Results", open=False):
        gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

        gr.Markdown("""
### 📋 Submission Requirements
<!--
- Login required: You must log in with a Hugging Face account
- Account age: Account must be older than 60 days
- Submission frequency: Each user can submit up to 2 times per day
-->
- File format: Upload a JSON file in the ATLAS format
- Organization: Provide the exact organization name (shown on the leaderboard)
- Contact email: Provide a valid email for notifications
- Auto evaluation: After submission, the system will run LLM-based evaluation and update the leaderboard

<!--
### 🔐 Security Policy
To prevent spam and ensure evaluation quality, we enforce:
- New accounts must wait 60 days before submitting (prevents abuse)
- Daily submission limits to ensure leaderboard quality and system stability
- Duplicate checks to avoid multiple submissions for the same organization
-->
""", elem_classes="markdown-text")

        with gr.Row():
            with gr.Column():
                model_textbox = gr.Textbox(
                    label="Model Name - will be shown on the leaderboard",
                    placeholder="Your Model Name (e.g., GPT-4, Llama-2-70B)"
                )
                org_textbox = gr.Textbox(
                    label="Organization Name - will be shown on the leaderboard",
                    placeholder="Your Organization"
                )
                email_textbox = gr.Textbox(
                    label="Contact Email - used for contact, not publicly visible",
                    placeholder="contact@example.com"
                )
            with gr.Column():
                file_upload = gr.File(
                    label="Upload ATLAS Results (JSON)",
                    file_types=[".json"],
                    type="filepath"
                )

        # Submit buttons (login gating is partially commented out below)
        with gr.Row():
            login_button = gr.LoginButton("🔐 Login with HuggingFace", size="lg")
            submit_button = gr.Button("Submit Results", variant="primary", size="lg")
        # Login state and user info
        profile_state = gr.State()
        login_status = gr.Markdown(visible=True)

        # def on_login(profile: gr.OAuthProfile):
        #     try:
        #         if profile and getattr(profile, "name", None):
        #             name = profile.name
        #             text = f"✅ Logged in as: **{name}**"
        #         else:
        #             text = "❌ Login failed, please try again"
        #         return profile, text
        #     except Exception:
        #         return None, "❌ Login failed, please try again"
        # login_button.click(on_login, inputs=None, outputs=[profile_state, login_status])

        # Progress display and result display areas
        progress_info = gr.HTML()
        submission_result = gr.HTML()

        def show_progress(step, message, total_steps=4):
            """Show progress information as an HTML progress-bar snippet for *step* of *total_steps*."""
            progress_percentage = int((step / total_steps) * 100)
            progress_html = f"""
            <div style="background-color: #e7f3ff; border: 1px solid #4dabf7; border-radius: 5px; padding: 15px; margin: 10px 0;">
                <div style="display: flex; align-items: center; margin-bottom: 10px;">
                    <h4 style="color: #1971c2; margin: 0; flex-grow: 1;">⏳ Processing submission...</h4>
                    <span style="color: #1971c2; font-weight: bold;">{progress_percentage}%</span>
                </div>
                <p style="color: #1971c2; margin: 5px 0;"><strong>Step {step}/{total_steps}:</strong> {message}</p>
                <div style="background-color: #fff; border-radius: 10px; height: 20px; margin: 10px 0; border: 1px solid #dee2e6;">
                    <div style="background: linear-gradient(90deg, #4dabf7, #74c0fc); height: 100%; width: {progress_percentage}%; border-radius: 10px; transition: width 0.5s ease; display: flex; align-items: center; justify-content: center;">
                        {f'<span style="color: white; font-size: 12px; font-weight: bold;">{progress_percentage}%</span>' if progress_percentage > 20 else ''}
                    </div>
                </div>
                <p style="color: #495057; font-size: 14px; margin: 5px 0;">
                    {'✨ Almost done, please wait...' if step >= total_steps else '📤 Please wait, processing your submission...'}
                </p>
            </div>
            """
            return progress_html

        def handle_submission(file_upload, model_name, org_name, email, user_profile: gr.OAuthProfile):
            """Generator event handler yielding (progress_html, result_html) pairs.

            NOTE(review): user_profile is not listed in the click() inputs —
            presumably Gradio injects it via the gr.OAuthProfile annotation;
            confirm against the Gradio OAuth docs.
            """
            try:
                # Step 1: basic validation
                yield show_progress(1, "Validating submission info"), ""

                # Verify login
                if user_profile is None or getattr(user_profile, "name", None) is None:
                    yield "", format_error("Please log in with Hugging Face before submitting")
                    return
                print(f"user_profile: {user_profile}")
                print(f"user_profile.name: {user_profile.name}")

                if not file_upload:
                    yield "", format_error("Please select a file to upload")
                    return
                if not model_name or not model_name.strip():
                    yield "", format_error("Please enter model name")
                    return
                if not org_name or not org_name.strip():
                    yield "", format_error("Please enter organization name")
                    return
                if not email or not email.strip():
                    yield "", format_error("Please enter email address")
                    return

                # Validate email format
                _, parsed_email = parseaddr(email)
                if "@" not in parsed_email:
                    yield "", format_warning("Please provide a valid email address")
                    return

                # Step 2: file validation and reading
                yield show_progress(2, "Validating file format and content"), ""

                import time
                time.sleep(0.5)  # allow users to see progress update

                # User eligibility check (account age / rate limit / duplicate submissions)
                eligible, msg = check_user_submission_eligibility(user_profile, org_name)
                if not eligible:
                    yield "", format_error(msg)
                    return

                # Step 3: upload to OSS
                yield show_progress(3, "Uploading file to OSS storage"), ""

                # Process the file submission
                from src.submission.submit import process_sage_submission_simple
                result = process_sage_submission_simple(file_upload, model_name, org_name, email)

                # Step 4: done
                yield show_progress(4, "Submission completed, preparing evaluation"), ""

                time.sleep(0.5)  # allow users to see completion state

                # Record submission history (best-effort; a failure must not block the user)
                try:
                    record_user_submission(user_profile, model_name, org_name, email)
                except Exception:
                    pass

                # Build the success message
                success_info = f"""
                <div style="background-color: #d4edda; border: 1px solid #c3e6cb; border-radius: 5px; padding: 15px; margin: 10px 0;">
                    <h4 style="color: #155724; margin-top: 0;">🎉 Submission successful!</h4>
                    <p style="color: #155724; margin: 5px 0;"><strong>Model:</strong> {model_name}</p>
                    <p style="color: #155724; margin: 5px 0;"><strong>Organization:</strong> {org_name}</p>
                    <p style="color: #155724; margin: 5px 0;"><strong>Email:</strong> {email}</p>
                    <p style="color: #155724; margin: 5px 0;"><strong>Submitted at:</strong> {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
                    <p style="color: #155724; margin-bottom: 0;">Your results have been submitted via OSS. LLM evaluation will complete in 5-10 minutes and the leaderboard will be updated.</p>
                </div>
                """

                # Clear the progress bar and show the final result
                yield "", success_info + result

            except ImportError as e:
                yield "", format_error(f"Submission system modules unavailable: {e}")
            except Exception as e:
                import traceback
                traceback.print_exc()
                yield "", format_error(f"An error occurred during submission: {str(e)}")

        submit_button.click(
            handle_submission,
            inputs=[file_upload, model_textbox, org_textbox, email_textbox],  # profile_state
            outputs=[progress_info, submission_result]
        )

# Launch the app
if __name__ == "__main__":
    # Disable SSR mode for better OAuth compatibility
    # Note: OAuth is handled internally via gr.LoginButton, not at launch level
    demo.launch(ssr_mode=False)
442
+
pyproject.toml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.ruff]
2
+ # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
3
+ select = ["E", "F"]
4
+ ignore = ["E501"] # line too long (black is taking care of this)
5
+ line-length = 119
6
+ fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
7
+
8
+ [tool.isort]
9
+ profile = "black"
10
+ line_length = 119
11
+
12
+ [tool.black]
13
+ line-length = 119
14
+
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets
2
+ gradio
3
+ huggingface-hub>=0.18.0
4
+ numpy
5
+ pandas
6
+ python-dateutil
7
+ openai>=1.0.0
8
+ aiohttp
9
+ oss2
10
+ loguru
11
+ tqdm
12
+
src/about.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+
4
@dataclass
class Task:
    """One benchmark task: where to read its score and how to label it."""

    benchmark: str  # task key in the results json file
    metric: str  # metric key in the results json file
    col_name: str  # column name displayed on the leaderboard
9
+
10
+
11
+ # Select your tasks here
12
+ # ---------------------------------------------------
13
class Tasks(Enum):
    """Benchmark tasks shown on the leaderboard (task_key, metric_key, display name)."""

    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    # NOTE(review): these members (incl. Astronomy) don't match the seven domains
    # listed in INTRODUCTION_TEXT (Computer Science / Materials Science missing) —
    # confirm the intended task set.
    sage_overall = Task("sage_overall", "accuracy", "ATLAS Overall")
    sage_math = Task("sage_math", "accuracy", "Mathematics")
    sage_physics = Task("sage_physics", "accuracy", "Physics")
    sage_chemistry = Task("sage_chemistry", "accuracy", "Chemistry")
    sage_biology = Task("sage_biology", "accuracy", "Biology")
    sage_earth_science = Task("sage_earth_science", "accuracy", "Earth Science")
    sage_astronomy = Task("sage_astronomy", "accuracy", "Astronomy")
22
+
23
+ NUM_FEWSHOT = 0 # Change with your few shot
24
+ # ---------------------------------------------------
25
+
26
+
27
+
28
+ # Your leaderboard name
29
+ TITLE = """<h1 align="center" id="space-title">ATLAS: A High-Difficulty, Multidisciplinary Benchmark for Frontier Scientific Reasoning</h1>"""
30
+
31
+ # What does your leaderboard evaluate?
32
+ INTRODUCTION_TEXT = """
33
+ **ATLAS (AGI-Oriented Testbed for Logical Application in Science)** is a large-scale, high-difficulty, cross-disciplinary evaluation suite for assessing the frontier scientific reasoning capabilities of LLMs. Designed to address the challenges of benchmark saturation, narrow disciplinary focus, oversimplified answer formats, and data contamination in existing evaluations, ATLAS serves as a reliable **ruler** for measuring progress toward AGI in the **AI for Science** domain.
34
+
35
+ ## Benchmark Overview
36
+ **ATLAS** evaluates models across seven core scientific fields that are central to AI for Science, encompassing 57 corresponding sub-fields to ensure comprehensive coverage of scientific reasoning requirements:
37
+ - **Mathematics** - Abstract algebra, analysis, differential equations, and computational mathematics
38
+ - **Physics** - Classical mechanics, electrodynamics, quantum mechanics, thermodynamics, and astrophysics
39
+ - **Chemistry** - Physical chemistry, inorganic chemistry, organic chemistry, and analytical chemistry
40
+ - **Biology** - Genetics, immunology, molecular biology, biophysics, and ecology
41
+ - **Computer Science** - Computer architecture, artificial intelligence, and software fundamentals
42
+ - **Earth Science** - Geography, geodesy, atmospheric chemistry, marine science, and geology
43
+ - **Materials Science** - Composite materials, metal materials, organic polymer materials, and material synthesis
44
+
45
+ ## Evaluation Metrics
46
+ - **Accuracy (%)**: Overall correctness of predictions across all domains, judged by LLM-as-Judge (OpenAI o4-mini / Qwen3-235B-A22B)
47
+ - **mG-Pass@2**: Multi-generation Pass rate for 2 predictions (measures consistency of model outputs)
48
+ - **mG-Pass@4**: Multi-generation Pass rate for 4 predictions (measures stability of reasoning capabilities)
49
+ The leaderboard displays model performance sorted by average accuracy, with domain-specific scores reflecting strengths in different scientific fields. All metrics are derived from the ATLAS validation/test set (≈800 expert-created original problems).
50
+ """
51
+
52
+ # Which evaluations are you running? how can people reproduce what you have?
53
+ LLM_BENCHMARKS_TEXT = f"""
54
+ ## How ATLAS Works
55
+
56
+ ATLAS evaluates language models across seven scientific domains through a comprehensive assessment of both content generation and reasoning capabilities.
57
+
58
+ ### Evaluation Process:
59
+ 1. **Multi-domain Assessment**: Models are tested on questions spanning Mathematics, Physics, Chemistry, Biology, Computer Science, Earth Science, and Materials Science
60
+ 2. **Content + Reasoning**: Each submission requires both predicted answers and reasoning explanations
61
+ 3. **Accuracy Scoring**: Performance is measured using accuracy metrics across all domains
62
+ 4. **Comprehensive Reporting**: Results are aggregated to provide both overall and domain-specific scores
63
+
64
+ ### Submission Format:
65
+ Submissions should follow this JSON structure:
66
+ ```json
67
+ {{
68
+ "submission_org": "Your Organization",
69
+ "submission_email": "contact@example.com",
70
+ "predictions": [
71
+ {{
72
+ "original_question_id": 0,
73
+ "content": ["answer1", "answer2", "answer3", "answer4"],
74
+ "reasoning_content": ["reasoning1", "reasoning2", "reasoning3", "reasoning4"]
75
+ }}
76
+ ]
77
+ }}
78
+ ```
79
+
80
+ ## Reproducibility
81
+ To reproduce our evaluation results:
82
+ 1. Download the ATLAS dataset from our repository
83
+ 2. Use the evaluation scripts provided in the benchmark toolkit
84
+ 3. Follow the submission format specifications exactly
85
+ 4. Submit your results through this leaderboard interface
86
+ """
87
+
88
+ EVALUATION_QUEUE_TEXT = """
89
+ ## Submit Your ATLAS Results
90
+
91
+ Results can be submitted as evaluation outputs in JSON format. Each submission should include predictions and reasoning content for all test questions.
92
+
93
+ ### Required JSON Format:
94
+ ```json
95
+ {
96
+ "submission_org": "Your Organization",
97
+ "submission_email": "contact@example.com",
98
+ "predictions": [
99
+ {
100
+ "original_question_id": 0,
101
+ "content": ["answer1", "answer2", "answer3", "answer4"],
102
+ "reasoning_content": ["reasoning1", "reasoning2", "reasoning3", "reasoning4"]
103
+ }
104
+ ]
105
+ }
106
+ ```
107
+
108
+ ### Submission Guidelines:
109
+ - Each prediction must include exactly 4 content items and 4 reasoning items
110
+ - Question IDs should match the official ATLAS test set
111
+ - Provide clear scientific reasoning for each prediction
112
+ - Ensure JSON format is valid and complete
113
+
114
+ Your submission will be automatically evaluated across all scientific domains and added to the leaderboard.
115
+ """
116
+
117
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
118
+ CITATION_BUTTON_TEXT = r"""@article{liu2025atlas,
119
+ title={ATLAS: A High-Difficulty, Multidisciplinary Benchmark for Frontier Scientific Reasoning},
120
+ author={Liu, Hongwei and Liu, Junnan and Liu, Shudong and Duan, Haodong and Li, Yuqiang and Su, Mao and Liu, Xiaohong and Zhai, Guangtao and Fang, Xinyu and Ma, Qianhong and Zhang, Taolin and Ma, Zihan and Zhao, Yufeng and Zhou, Peiheng and Xiao, Linchen and Zhang, Wenlong and Zhou, Shijie and Ma, Xingjian and Sun, Siqi and Ge, Jiaye and Li, Meng and Liu, Yuhong and Dong, Jianxin and Li, Jiaying and Wu, Hui and Liang, Hanwen and Lin, Jintai and Wang, Yanting and Dong, Jie and Zhu, Tong and Fu, Tianfan and He, Conghui and Zhang, Qi and Zhang, Songyang and Bai, Lei and Chen, Kai},
121
+ journal={arXiv preprint arXiv:2511.14366},
122
+ year={2025}
123
+ }"""
src/display/css_html_js.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# CSS injected into the Gradio app; selectors target elem_id/elem_classes set
# by the UI code (leaderboard table, search bar, citation button, filters).
custom_css = """

.markdown-text {
    font-size: 16px !important;
}

#models-to-add-text {
    font-size: 18px !important;
}

#citation-button span {
    font-size: 16px !important;
}

#citation-button textarea {
    font-size: 16px !important;
}

#citation-button > label > button {
    margin: 6px;
    transform: scale(1.3);
}

#leaderboard-table {
    margin-top: 15px
}

#leaderboard-table-lite {
    margin-top: 15px
}

#search-bar-table-box > div:first-child {
    background: none;
    border: none;
}

#search-bar {
    padding: 0px;
}

/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
#leaderboard-table td:nth-child(2),
#leaderboard-table th:nth-child(2) {
    max-width: 400px;
    overflow: auto;
    white-space: nowrap;
}

.tab-buttons button {
    font-size: 20px;
}

#scale-logo {
    border-style: none !important;
    box-shadow: none;
    display: block;
    margin-left: auto;
    margin-right: auto;
    max-width: 600px;
}

#scale-logo .download {
    display: none;
}
#filter_type{
    border: 0;
    padding-left: 0;
    padding-top: 0;
}
#filter_type label {
    display: flex;
}
#filter_type label > span{
    margin-top: var(--spacing-lg);
    margin-right: 0.5em;
}
#filter_type label > .wrap{
    width: 103px;
}
#filter_type label > .wrap .wrap-inner{
    padding: 2px;
}
#filter_type label > .wrap .wrap-inner input{
    width: 1px
}
#filter-columns-type{
    border:0;
    padding:0.5;
}
#filter-columns-size{
    border:0;
    padding:0.5;
}
#box-filter > .form{
    border: 0
}
"""

# JS helper passed to Gradio: returns the page's query-string parameters
# as a plain object (used to restore UI state from the URL).
get_window_url_params = """
function(url_params) {
    const params = new URLSearchParams(window.location.search);
    url_params = Object.fromEntries(params);
    return url_params;
}
"""
src/display/formatting.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def model_hyperlink(link, model_name):
    """Render *model_name* as an HTML anchor opening *link* in a new tab."""
    anchor_style = "color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"
    return f'<a target="_blank" href="{link}" style="{anchor_style}">{model_name}</a>'
3
+
4
+
5
def make_clickable_model(model_name):
    """Return a dotted-underline hyperlink to the model's Hugging Face page."""
    return model_hyperlink(f"https://huggingface.co/{model_name}", model_name)
8
+
9
+
10
def styled_error(error):
    """Wrap *error* in a centered red HTML paragraph."""
    template = "<p style='color: red; font-size: 20px; text-align: center;'>{}</p>"
    return template.format(error)
12
+
13
+
14
def styled_warning(warn):
    """Wrap *warn* in a centered orange HTML paragraph."""
    template = "<p style='color: orange; font-size: 20px; text-align: center;'>{}</p>"
    return template.format(warn)
16
+
17
+
18
def styled_message(message):
    """Wrap *message* in a centered green HTML paragraph."""
    template = "<p style='color: green; font-size: 20px; text-align: center;'>{}</p>"
    return template.format(message)
20
+
21
+
22
def has_no_nan_values(df, columns):
    """Boolean Series: True for rows of *df* with no NaN in *columns*."""
    return ~df[columns].isna().any(axis=1)
24
+
25
+
26
def has_nan_values(df, columns):
    """Boolean Series: True for rows of *df* with at least one NaN in *columns*."""
    return ~df[columns].notna().all(axis=1)
src/display/utils.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, make_dataclass
2
+ from enum import Enum
3
+
4
+ import pandas as pd
5
+
6
+ from src.about import Tasks
7
+
8
def fields(raw_class):
    """Collect the values of every attribute defined on *raw_class* whose
    name neither starts nor ends with a double underscore."""
    collected = []
    for attr_name, attr_value in raw_class.__dict__.items():
        if not (attr_name.startswith("__") or attr_name.endswith("__")):
            collected.append(attr_value)
    return collected
10
+
11
+
12
+ # These classes are for user facing column names,
13
+ # to avoid having to change them all around the code
14
+ # when a modif is needed
15
@dataclass
class ColumnContent:
    """Metadata describing one user-facing leaderboard column."""
    name: str  # column header shown to the user
    type: str  # display type tag, e.g. "str", "number", "markdown", "bool"
    displayed_by_default: bool  # whether the column starts visible
    hidden: bool = False  # excluded from the COLS selection list
    never_hidden: bool = False  # presumably pins the column as always shown — confirm in UI code
22
+
23
## Leaderboard columns
# Each entry is a (field_name, field_type, default_value) triple consumed
# by make_dataclass below.
auto_eval_column_dict = []
# Init: fixed identity columns (type emoji + clickable model name).
auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
# Scores: overall average plus one numeric column per task in src.about.Tasks.
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
for task in Tasks:
    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
# Model information (hidden from the default view).
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])

# We use make_dataclass to dynamically fill the scores from Tasks.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
45
+
46
## For the queue columns in the submission tab
@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    """Columns shown for pending/running submissions in the queue tab."""
    model = ColumnContent("model", "markdown", True)
    revision = ColumnContent("revision", "str", True)
    private = ColumnContent("private", "bool", True)
    precision = ColumnContent("precision", "str", True)
    # NOTE(review): the third argument is displayed_by_default (a bool) but
    # the string "Original" is passed here; it is truthy so it behaves as
    # True — confirm whether a bool was intended.
    weight_type = ColumnContent("weight_type", "str", "Original")
    status = ColumnContent("status", "str", True)
55
+
56
## All the model information that we might need
@dataclass
class ModelDetails:
    """Display metadata for a model category / weight type / precision."""
    name: str  # canonical name, e.g. "pretrained"
    display_name: str = ""  # optional prettier name (not read in this module)
    symbol: str = ""  # emoji
62
+
63
+
64
class ModelType(Enum):
    """Category of a submitted model, each carrying a display emoji."""

    PT = ModelDetails(name="pretrained", symbol="🟢")
    FT = ModelDetails(name="fine-tuned", symbol="🔶")
    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
    RL = ModelDetails(name="RL-tuned", symbol="🟦")
    Unknown = ModelDetails(name="", symbol="?")

    def to_str(self, separator=" "):
        """Return ``"<symbol><separator><name>"`` for display."""
        details = self.value
        return f"{details.symbol}{separator}{details.name}"

    @staticmethod
    def from_str(type):
        """Map a free-form type string (name or emoji) to a ModelType.

        Matches are checked in the same order as the original chain
        (fine-tuned, pretrained, RL-tuned, instruction-tuned); anything
        unrecognised maps to ``ModelType.Unknown``.
        """
        for markers, member in (
            (("fine-tuned", "🔶"), ModelType.FT),
            (("pretrained", "🟢"), ModelType.PT),
            (("RL-tuned", "🟦"), ModelType.RL),
            (("instruction-tuned", "⭕"), ModelType.IFT),
        ):
            if any(marker in type for marker in markers):
                return member
        return ModelType.Unknown
85
+
86
class WeightType(Enum):
    """How the submitted weights relate to the base model."""
    Adapter = ModelDetails("Adapter")
    Original = ModelDetails("Original")
    Delta = ModelDetails("Delta")
90
+
91
class Precision(Enum):
    """Numeric precision a model was evaluated in."""

    float16 = ModelDetails("float16")
    bfloat16 = ModelDetails("bfloat16")
    Unknown = ModelDetails("?")

    # BUGFIX: from_str had no @staticmethod decorator (unlike
    # ModelType.from_str); calling it on an *instance* would have bound the
    # instance as `precision`. The class-level call Precision.from_str(x)
    # is unchanged.
    @staticmethod
    def from_str(precision):
        """Map a precision string (plain or ``torch.``-prefixed) to a member.

        Unrecognised values map to ``Precision.Unknown``.
        """
        if precision in ["torch.float16", "float16"]:
            return Precision.float16
        if precision in ["torch.bfloat16", "bfloat16"]:
            return Precision.bfloat16
        return Precision.Unknown
102
+
103
# Column selection
# Leaderboard columns actually shown (hidden ones filtered out).
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]

# Names and display types for the submission-queue table.
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

# One user-facing column name per benchmark task.
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
110
+
src/envs.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

from huggingface_hub import HfApi

# Info to change for your repository
# ----------------------------------
TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org

OWNER = "demo-leaderboard-backend"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
# ----------------------------------

# Hub repos backing the leaderboard: the Space itself, the request-queue
# dataset and the results dataset.
REPO_ID = f"{OWNER}/leaderboard"
QUEUE_REPO = f"{OWNER}/requests"
RESULTS_REPO = f"{OWNER}/results"

# If you setup a cache later, just change HF_HOME
CACHE_PATH=os.getenv("HF_HOME", ".")

# Local caches (queue/results plus their backend copies).
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

# Hub API client authenticated with TOKEN.
API = HfApi(token=TOKEN)
src/oss/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ OSS模块 - 处理阿里云OSS相关功能
3
+ """
4
+
5
+ from .oss_file_manager import OSSFileManager
6
+ from .oss_leaderboard_manager import OSSLeaderboardManager
7
+ from .oss_submission_handler import OSSSubmissionHandler
8
+
9
+ __all__ = ["OSSFileManager", "OSSLeaderboardManager", "OSSSubmissionHandler"]
src/oss/oss_file_manager.py ADDED
@@ -0,0 +1,432 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import oss2
4
+ import json
5
+ from datetime import datetime
6
+ from pathlib import Path
7
+ from typing import List, Dict, Optional
8
+ from loguru import logger
9
+
10
+
11
class OSSFileManager:
    """Simplified Aliyun OSS file manager.

    Thin convenience wrapper around the ``oss2`` SDK for a single bucket:
    list, upload, download, copy and delete objects.  Credentials and
    endpoint fall back to the ``OSS_*`` environment variables.
    """

    def __init__(
        self,
        oss_access_key_id: str = None,
        oss_access_key_secret: str = None,
        oss_region: str = None,
        oss_bucket_name: str = None
    ):
        """
        Initialize the OSS file manager.

        Args:
            oss_access_key_id: OSS access key ID (default: env OSS_ACCESS_KEY_ID)
            oss_access_key_secret: OSS access key secret (default: env OSS_ACCESS_KEY_SECRET)
            oss_region: OSS region endpoint (default: env OSS_REGION)
            oss_bucket_name: OSS bucket name (default: env OSS_BUCKET_NAME)

        Raises:
            ValueError: if no access key ID/secret can be resolved.
        """
        # Resolve configuration: explicit arguments win over environment.
        self.access_key_id = oss_access_key_id or os.getenv('OSS_ACCESS_KEY_ID')
        self.access_key_secret = oss_access_key_secret or os.getenv('OSS_ACCESS_KEY_SECRET')
        self.region = oss_region or os.getenv('OSS_REGION', 'http://oss-cn-shanghai.aliyuncs.com')
        self.bucket_name = oss_bucket_name or os.getenv('OSS_BUCKET_NAME', 'opencompass')

        if not self.access_key_id or not self.access_key_secret:
            raise ValueError("OSS访问密钥未设置。请设置 OSS_ACCESS_KEY_ID 和 OSS_ACCESS_KEY_SECRET 环境变量。")

        # Build the bucket client shared by every helper below.
        auth = oss2.Auth(self.access_key_id, self.access_key_secret)
        self.bucket = oss2.Bucket(auth, self.region, self.bucket_name)

        logger.info(f"OSS初始化成功: {self.bucket_name} @ {self.region}")

    def list_files(
        self,
        oss_dir: str = "",
        after_date: datetime = None,
        file_extension: str = None
    ) -> List[Dict]:
        """
        List files under an OSS directory.

        Args:
            oss_dir: OSS directory prefix
            after_date: only return files modified after this date
            file_extension: extension filter (e.g. ".json")

        Returns:
            List of file-info dicts (key, name, size, last_modified, etag).

        Raises:
            Re-raises any oss2 error after logging it.
        """
        try:
            files = []

            # Normalise the prefix so it addresses a "directory".
            if oss_dir and not oss_dir.endswith('/'):
                oss_dir += '/'

            for obj in oss2.ObjectIterator(self.bucket, prefix=oss_dir):
                # Skip the directory placeholder object itself.
                if obj.key.endswith('/'):
                    continue

                # Extension filter.
                if file_extension and not obj.key.endswith(file_extension):
                    continue

                # Date filter.
                if after_date and obj.last_modified < after_date:
                    continue

                file_info = {
                    'key': obj.key,
                    'name': os.path.basename(obj.key),
                    'size': obj.size,
                    'last_modified': obj.last_modified,
                    'etag': obj.etag
                }
                files.append(file_info)

            logger.info(f"找到 {len(files)} 个文件在 {oss_dir}")
            return files

        except Exception as e:
            logger.error(f"列出文件失败: {e}")
            raise

    def download_file(self, oss_file_path: str, local_file_path: str) -> bool:
        """
        Download an OSS object to a local file.

        Args:
            oss_file_path: OSS object key
            local_file_path: destination path (parent dirs are created)

        Returns:
            True on success, False on any error (logged, not raised).
        """
        try:
            # Make sure the destination directory exists.
            local_dir = os.path.dirname(local_file_path)
            if local_dir:
                os.makedirs(local_dir, exist_ok=True)

            self.bucket.get_object_to_file(oss_file_path, local_file_path)

            logger.info(f"下载成功: {oss_file_path} -> {local_file_path}")
            return True

        except Exception as e:
            logger.error(f"下载文件失败: {oss_file_path} -> {local_file_path}, 错误: {e}")
            return False

    def upload_file_to_object(
        self,
        local_file_path: str,
        oss_file_path: str,
        replace: bool = False
    ) -> bool:
        """
        Upload a local file to OSS.

        Args:
            local_file_path: local source path
            oss_file_path: OSS object key
            replace: overwrite an existing object

        Returns:
            True on success; False if the source is missing, the target
            exists and replace is False, or the upload fails.
        """
        try:
            if not os.path.exists(local_file_path):
                logger.error(f"本地文件不存在: {local_file_path}")
                return False

            # Refuse to clobber an existing object unless asked to.
            if not replace and self.bucket.object_exists(oss_file_path):
                logger.warning(f"OSS文件已存在: {oss_file_path}")
                return False

            self.bucket.put_object_from_file(oss_file_path, local_file_path)

            logger.info(f"上传成功: {local_file_path} -> {oss_file_path}")
            return True

        except Exception as e:
            logger.error(f"上传文件失败: {local_file_path} -> {oss_file_path}, 错误: {e}")
            return False

    def file_exists(self, oss_file_path: str) -> bool:
        """
        Check whether an OSS object exists.

        Returns:
            True if it exists; False if it does not or the check fails.
        """
        try:
            return self.bucket.object_exists(oss_file_path)
        except Exception as e:
            logger.error(f"检查文件存在性失败: {oss_file_path}, 错误: {e}")
            return False

    def download_file_content(self, oss_file_path: str) -> Optional[bytes]:
        """
        Download an OSS object's content into memory.

        Returns:
            The raw bytes, or None on any error.
        """
        try:
            result = self.bucket.get_object(oss_file_path)
            content = result.read()
            logger.info(f"下载文件内容成功: {oss_file_path} ({len(content)} bytes)")
            return content
        except Exception as e:
            logger.error(f"下载文件内容失败: {oss_file_path}, 错误: {e}")
            return None

    def upload_file_content(self, content: str, object_key: str) -> bool:
        """
        Upload string (or bytes) content directly to OSS.

        Args:
            content: text (UTF-8 encoded on upload) or raw bytes
            object_key: OSS object key

        Returns:
            True on success.
        """
        try:
            # Accept either str or bytes.
            if isinstance(content, str):
                content_bytes = content.encode('utf-8')
            else:
                content_bytes = content

            self.bucket.put_object(object_key, content_bytes)

            logger.info(f"上传内容成功: {object_key} ({len(content_bytes)} bytes)")
            return True

        except Exception as e:
            logger.error(f"上传内容失败: {object_key}, 错误: {e}")
            return False

    def upload_file(self, local_file_path: str, oss_file_path: str) -> bool:
        """
        Upload a local file to OSS, overwriting any existing object.

        Alias for upload_file_to_object(..., replace=True).
        """
        return self.upload_file_to_object(local_file_path, oss_file_path, replace=True)

    def copy_file(self, source_path: str, target_path: str) -> bool:
        """
        Copy an object inside the bucket (server-side copy).

        Returns:
            True on success.
        """
        try:
            self.bucket.copy_object(
                self.bucket_name,  # source bucket
                source_path,       # source object key
                target_path        # target object key
            )
            logger.info(f"文件复制成功: {source_path} -> {target_path}")
            return True
        except Exception as e:
            logger.error(f"文件复制失败: {source_path} -> {target_path}, 错误: {e}")
            return False

    def list_latest_files_by_date(
        self,
        object_dir: str = "",
        max_num_files: int = 100,
        suffix: str = ".json",
        date_pattern: str = r".*",
        file_date_format: str = "%Y-%m-%d"
    ) -> List[str]:
        """
        List OSS files sorted newest-first by filename.

        NOTE: date_pattern and file_date_format are accepted for API
        compatibility but are currently unused — sorting relies on the
        lexicographic order of the filenames (assumed to embed a
        timestamp).

        Args:
            object_dir: OSS directory prefix
            max_num_files: maximum number of files to return
            suffix: file suffix filter
            date_pattern: (unused) date-matching regex
            file_date_format: (unused) date format

        Returns:
            Full OSS object keys, newest first; empty list on error.
        """
        try:
            files = self.list_files(
                oss_dir=object_dir,
                file_extension=suffix
            )

            # Extract the bare filenames.
            filenames = []
            for file_info in files:
                filename = file_info['name']
                # Simple suffix check (no complex regex matching).
                if suffix in filename:
                    filenames.append(filename)

            # Sort by filename (assumes filenames contain a timestamp).
            filenames.sort(reverse=True)

            # Cap the number of results.
            max_num_files = max_num_files or len(filenames)
            filenames = filenames[:max_num_files]

            logger.info(f"找到 {len(filenames)} 个文件,按日期排序")

            # Rebuild full OSS keys.
            # BUGFIX: the joined path previously dropped the filename.
            result = []
            for filename in filenames:
                if object_dir:
                    full_path = f"{object_dir.rstrip('/')}/{filename}"
                else:
                    full_path = filename
                result.append(full_path)

            return result

        except Exception as e:
            logger.error(f"列出最新文件失败: {e}")
            return []

    def download_object_to_file(
        self,
        oss_file_path: str,
        local_file_path: str,
        replace: bool = True,
        make_dir: bool = True
    ) -> bool:
        """
        Download an OSS object to a local file (compatibility method).

        Args:
            oss_file_path: OSS object key
            local_file_path: destination path
            replace: overwrite an existing local file
            make_dir: create the destination directory

        Returns:
            True on success.
        """
        try:
            # Honour the no-overwrite flag.
            if not replace and os.path.exists(local_file_path):
                logger.warning(f"本地文件已存在: {local_file_path}")
                return False

            if make_dir:
                local_dir = os.path.dirname(local_file_path)
                if local_dir:
                    os.makedirs(local_dir, exist_ok=True)

            # Delegate to the core download helper.
            return self.download_file(oss_file_path, local_file_path)

        except Exception as e:
            logger.error(f"下载对象失败: {oss_file_path} -> {local_file_path}, 错误: {e}")
            return False

    def get_file_info(self, oss_file_path: str) -> Optional[Dict]:
        """
        Fetch metadata for an OSS object.

        Returns:
            Dict with key/name/size/last_modified/etag/content_type, or
            None if the object is missing or the request fails.
        """
        try:
            obj = self.bucket.get_object_meta(oss_file_path)

            return {
                'key': oss_file_path,
                'name': os.path.basename(oss_file_path),
                'size': obj.content_length,
                'last_modified': obj.last_modified,
                'etag': obj.etag,
                'content_type': obj.content_type
            }

        except oss2.exceptions.NoSuchKey:
            logger.warning(f"文件不存在: {oss_file_path}")
            return None
        except Exception as e:
            logger.error(f"获取文件信息失败: {oss_file_path}, 错误: {e}")
            return None

    def delete_file(self, oss_file_path: str) -> bool:
        """
        Delete an OSS object.

        Returns:
            True on success.
        """
        try:
            self.bucket.delete_object(oss_file_path)
            logger.info(f"删除成功: {oss_file_path}")
            return True

        except Exception as e:
            logger.error(f"删除文件失败: {oss_file_path}, 错误: {e}")
            return False
410
+
411
+
412
+ # 兼容性别名 - 保持与原始代码的兼容性
413
+ class SimpleOSSManager(OSSFileManager):
414
+ """兼容性别名"""
415
+ pass
416
+
417
+
418
+ if __name__ == "__main__":
419
+ # 测试代码
420
+ try:
421
+ manager = OSSFileManager()
422
+ print("✅ OSS file manager initialized successfully")
423
+
424
+ # 测试列出文件
425
+ files = manager.list_files("atlas_eval/submissions/", file_extension=".json")
426
+ print(f"📁 Found {len(files)} submission files")
427
+
428
+ for file_info in files[:3]: # 只显示前3个
429
+ print(f" - {file_info['name']} ({file_info['size']} bytes)")
430
+
431
+ except Exception as e:
432
+ print(f"❌ Test failed: {e}")
src/oss/oss_leaderboard_manager.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ OSS排行榜管理器 - 从OSS读取和更新排行榜数据
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import tempfile
9
+ from datetime import datetime
10
+ from pathlib import Path
11
+ from typing import Dict, List, Any, Optional
12
+ from .oss_file_manager import OSSFileManager
13
+
14
+
15
class OSSLeaderboardManager:
    """OSS leaderboard manager - reads and updates leaderboard data in OSS."""

    def __init__(self):
        """Initialize the manager and its fixed OSS path layout."""
        self.oss_manager = OSSFileManager()

        # OSS path configuration.
        self.leaderboard_path = "atlas_eval/leaderboard/"
        self.backup_path = "atlas_eval/leaderboard/backup/"
        self.leaderboard_file = "leaderboard.json"

        # Full object key of the live leaderboard file.
        self.oss_leaderboard_file = f"{self.leaderboard_path}{self.leaderboard_file}"

        print(f"📊 OSS leaderboard path: oss://opencompass/{self.oss_leaderboard_file}")
        print(f"📦 OSS backup path: oss://opencompass/{self.backup_path}")

    def load_leaderboard_from_oss(self) -> List[Dict[str, Any]]:
        """
        Load leaderboard data from OSS.

        Returns:
            List of leaderboard entries; empty on missing file or error.
        """
        try:
            print(f"📥 Loading leaderboard data from OSS: {self.oss_leaderboard_file}")

            content = self.oss_manager.download_file_content(self.oss_leaderboard_file)

            if content:
                leaderboard_data = json.loads(content.decode('utf-8'))
                print(f"✅ Successfully loaded {len(leaderboard_data)} leaderboard entries")
                return leaderboard_data
            else:
                print("⚠️ No leaderboard file found in OSS, returning empty list")
                return []

        except Exception as e:
            print(f"❌ Failed to load leaderboard from OSS: {e}")
            return []

    def save_leaderboard_to_oss(self, leaderboard_data: List[Dict[str, Any]],
                                create_backup: bool = True) -> bool:
        """
        Save leaderboard data to OSS.

        Args:
            leaderboard_data: leaderboard entries to persist
            create_backup: back up the current file first

        Returns:
            True on success.
        """
        try:
            print(f"📤 Saving leaderboard data to OSS: {self.oss_leaderboard_file}")

            # Back up the existing file first (no-op if it doesn't exist).
            if create_backup:
                self._create_backup()

            # Serialise to a temporary file, then upload it.
            with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
                json.dump(leaderboard_data, temp_file, indent=2, ensure_ascii=False)
                temp_file_path = temp_file.name

            try:
                success = self.oss_manager.upload_file(
                    local_file_path=temp_file_path,
                    oss_file_path=self.oss_leaderboard_file
                )

                if success:
                    print(f"✅ Successfully saved {len(leaderboard_data)} leaderboard entries to OSS")
                    return True
                else:
                    print("❌ Failed to upload leaderboard file to OSS")
                    return False

            finally:
                # Best-effort temp-file cleanup.
                # BUGFIX: was a bare `except:` which also swallowed
                # KeyboardInterrupt/SystemExit; only ignore filesystem errors.
                try:
                    os.unlink(temp_file_path)
                except OSError:
                    pass

        except Exception as e:
            print(f"❌ Failed to save leaderboard to OSS: {e}")
            return False

    def _create_backup(self) -> bool:
        """
        Create a timestamped backup of the current leaderboard file.

        Returns:
            True on success (also when there is nothing to back up).
        """
        try:
            if not self.oss_manager.file_exists(self.oss_leaderboard_file):
                print("📋 Original leaderboard file does not exist, skipping backup")
                return True

            # Timestamped backup name, e.g. leaderboard.json.backup_20240101_120000.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            backup_filename = f"leaderboard.json.backup_{timestamp}"
            backup_path = f"{self.backup_path}{backup_filename}"

            success = self.oss_manager.copy_file(
                source_path=self.oss_leaderboard_file,
                target_path=backup_path
            )

            if success:
                print(f"📦 Backup created successfully: {backup_path}")
                return True
            else:
                print(f"❌ Failed to create backup: {backup_path}")
                return False

        except Exception as e:
            print(f"❌ Error creating backup: {e}")
            return False

    def add_evaluation_result(self, result_data: Dict[str, Any]) -> bool:
        """
        Add (or update) an evaluation result on the leaderboard.

        An existing entry is replaced when both organization and
        submitted_time match; otherwise the result is appended.  The board
        is kept sorted by accuracy, best first.

        Args:
            result_data: evaluation result entry

        Returns:
            True on success.
        """
        try:
            leaderboard_data = self.load_leaderboard_from_oss()

            # Look for an existing entry for the same submission.
            existing_entry = None
            for i, entry in enumerate(leaderboard_data):
                if (entry.get("organization") == result_data.get("organization") and
                    entry.get("submitted_time") == result_data.get("submitted_time")):
                    existing_entry = i
                    break

            if existing_entry is not None:
                print(f"🔄 Updating existing leaderboard entry: {result_data.get('organization')}")
                leaderboard_data[existing_entry] = result_data
            else:
                print(f"➕ Adding new leaderboard entry: {result_data.get('organization')}")
                leaderboard_data.append(result_data)

            # Sort by accuracy, descending.
            leaderboard_data.sort(
                key=lambda x: x.get("accuracy", 0),
                reverse=True
            )

            return self.save_leaderboard_to_oss(leaderboard_data)

        except Exception as e:
            print(f"❌ Failed to add evaluation result: {e}")
            return False

    def get_leaderboard_summary(self) -> Dict[str, Any]:
        """
        Build a small summary of the leaderboard.

        Returns:
            Dict with total_entries, last_updated, top_scores and oss_path,
            or {"error": ...} on failure.
        """
        try:
            leaderboard_data = self.load_leaderboard_from_oss()

            if not leaderboard_data:
                return {"total_entries": 0, "last_updated": None}

            total_entries = len(leaderboard_data)

            # Latest evaluation timestamp across all entries.
            latest_time = None
            for entry in leaderboard_data:
                eval_time = entry.get("evaluation_timestamp")
                if eval_time and (latest_time is None or eval_time > latest_time):
                    latest_time = eval_time

            # Best scores come from the first entry (board sorted by accuracy).
            top_scores = {}
            if leaderboard_data:
                top_entry = leaderboard_data[0]
                top_scores = {
                    "accuracy": top_entry.get("accuracy", 0),
                    "mg_pass_2": top_entry.get("mg_pass_2", 0),
                    "mg_pass_4": top_entry.get("mg_pass_4", 0)
                }

            return {
                "total_entries": total_entries,
                "last_updated": latest_time,
                "top_scores": top_scores,
                "oss_path": self.oss_leaderboard_file
            }

        except Exception as e:
            print(f"❌ Failed to get leaderboard summary: {e}")
            return {"error": str(e)}

    def migrate_local_to_oss(self, local_file_path: str) -> bool:
        """
        Migrate a local leaderboard JSON file to OSS (no backup taken).

        Args:
            local_file_path: path of the local leaderboard file

        Returns:
            True on success.
        """
        try:
            if not os.path.exists(local_file_path):
                print(f"❌ Local file does not exist: {local_file_path}")
                return False

            with open(local_file_path, 'r', encoding='utf-8') as f:
                leaderboard_data = json.load(f)

            print(f"📤 Migrating {len(leaderboard_data)} entries to OSS")

            return self.save_leaderboard_to_oss(leaderboard_data, create_backup=False)

        except Exception as e:
            print(f"❌ Failed to migrate file to OSS: {e}")
            return False
255
+
256
+
257
+ if __name__ == "__main__":
258
+ # 测试OSS排行榜管理器
259
+ manager = OSSLeaderboardManager()
260
+
261
+ # 打印摘要信息
262
+ summary = manager.get_leaderboard_summary()
263
+ print(f"📊 Leaderboard summary: {summary}")
264
+
265
+ # 测试加载排行榜
266
+ leaderboard = manager.load_leaderboard_from_oss()
267
+ print(f"📋 Number of leaderboard entries: {len(leaderboard)}")
src/oss/oss_submission_handler.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ OSS提交处理器 - 替换原有的git/http提交方式
4
+ 在HuggingFace Spaces中直接将提交文件上传到OSS
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ import json
10
+ from datetime import datetime
11
+ from pathlib import Path
12
+ from typing import Dict, Any, Tuple
13
+
14
+ # 导入同目录下的oss_file_manager
15
+ from .oss_file_manager import OSSFileManager
16
+
17
class OSSSubmissionHandler:
    """OSS submission handler - uploads user submissions directly to OSS.

    Replaces the previous git/http submission flow used in the
    HuggingFace Space.
    """

    def __init__(self, oss_submission_path: str = "atlas_eval/submissions/"):
        """
        Initialize the OSS submission handler.

        Args:
            oss_submission_path: OSS prefix under which submissions are stored
        """
        self.oss_path = oss_submission_path
        self.oss_manager = OSSFileManager()

        print(f"📁 OSS submission path: oss://opencompass/{oss_submission_path}")

    def format_error(self, msg: str) -> str:
        """Format an error message as red HTML."""
        return f"<p style='color: red; font-size: 16px;'>{msg}</p>"

    def format_success(self, msg: str) -> str:
        """Format a success message as green HTML."""
        return f"<p style='color: green; font-size: 16px;'>{msg}</p>"

    def format_warning(self, msg: str) -> str:
        """Format a warning message as orange HTML."""
        return f"<p style='color: orange; font-size: 16px;'>{msg}</p>"

    def validate_sage_submission(self, submission_data: Dict[str, Any]) -> Tuple[bool, str]:
        """Validate the ATLAS benchmark submission format.

        Returns:
            (is_valid, message) tuple.
        """
        # Required top-level fields.
        required_fields = ["submission_org", "submission_email", "predictions"]
        for field in required_fields:
            if field not in submission_data:
                return False, f"Missing required field: {field}"

        # Very light email sanity check.
        email = submission_data["submission_email"]
        if "@" not in email or "." not in email:
            return False, "Invalid email format"

        # predictions must be a non-empty list of well-formed entries.
        predictions = submission_data["predictions"]
        if not isinstance(predictions, list) or len(predictions) == 0:
            return False, "predictions must be a non-empty list"

        for i, prediction in enumerate(predictions):
            # Required per-prediction fields.
            pred_required_fields = ["original_question_id", "content", "reasoning_content"]
            for field in pred_required_fields:
                if field not in prediction:
                    return False, f"Missing field in prediction {i}: {field}"

            content = prediction["content"]
            reasoning_content = prediction["reasoning_content"]

            # content must hold exactly 4 samples per question.
            if not isinstance(content, list) or len(content) != 4:
                return False, f"content in prediction {i} must be a list with 4 items"

            if not isinstance(reasoning_content, list):
                return False, f"reasoning_content in prediction {i} must be a list"

            # NOTE: reasoning_content length is intentionally NOT enforced
            # (a previous check requiring 0 or 4 items was disabled).

            if not isinstance(prediction["original_question_id"], int):
                return False, f"question ID in prediction {i} must be an integer"

        return True, "Submission format is valid"

    def generate_submission_filename(self, submission_data: Dict[str, Any]) -> str:
        """Build a unique submission filename from the org name and a timestamp."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # Sanitise the org name so it is filesystem/OSS-key safe.
        org_name = submission_data["submission_org"].replace(" ", "_").replace("/", "_").replace("\\", "_")
        return f"submission_{org_name}_{timestamp}.json"

    def upload_to_oss(self, submission_data: Dict[str, Any], filename: str) -> Tuple[bool, str]:
        """Upload the submission JSON to OSS.

        Returns:
            (success, oss_uri_or_error_message)
        """
        try:
            # Serialise to a temporary local file first.
            temp_file = f"/tmp/{filename}"
            with open(temp_file, 'w', encoding='utf-8') as f:
                json.dump(submission_data, f, indent=2, ensure_ascii=False)

            oss_file_path = f"{self.oss_path}{filename}"

            print(f"⬆️ Uploading to OSS: {oss_file_path}")
            # BUGFIX: the boolean result of upload_file_to_object was
            # previously ignored, so a failed upload was reported as success.
            uploaded = self.oss_manager.upload_file_to_object(
                local_file_path=temp_file,
                oss_file_path=oss_file_path,
                replace=True
            )

            # Clean up the temporary file.
            os.remove(temp_file)

            if not uploaded:
                print(f"❌ OSS upload failed: {oss_file_path}")
                return False, f"upload to {oss_file_path} failed"

            print(f"✅ OSS upload successful: {oss_file_path}")
            return True, f"oss://opencompass/{oss_file_path}"

        except Exception as e:
            print(f"❌ OSS upload failed: {e}")
            return False, str(e)

    def process_sage_submission(self, submission_file_or_data, org_name=None, email=None) -> str:
        """
        Process an ATLAS benchmark submission - OSS mode.

        Accepts either a path to a JSON file or an already-parsed dict;
        validates it and uploads it to OSS.

        Returns:
            An HTML status message (success or error).
        """
        try:
            if submission_file_or_data is None:
                return self.format_error("❌ No submission data provided.")

            # A string argument is treated as a path to a JSON file.
            if isinstance(submission_file_or_data, str):
                try:
                    with open(submission_file_or_data, 'r', encoding='utf-8') as f:
                        content = f.read()
                    # Parse the JSON payload.
                    submission_data = json.loads(content)
                except Exception as e:
                    return self.format_error(f"❌ Error reading file: {str(e)}")
            # A dict is used as-is.
            elif isinstance(submission_file_or_data, dict):
                submission_data = submission_file_or_data
            else:
                return self.format_error("❌ Invalid submission data format.")

            # Form-provided org/email override whatever is in the file.
            if org_name and email:
                submission_data["submission_org"] = org_name.strip()
                submission_data["submission_email"] = email.strip()

            # Validate the submission format.
            is_valid, message = self.validate_sage_submission(submission_data)
            if not is_valid:
                return self.format_error(f"❌ Submission validation failed: {message}")

            # Build a unique filename and upload.
            filename = self.generate_submission_filename(submission_data)

            success, result = self.upload_to_oss(submission_data, filename)

            if not success:
                return self.format_error(f"❌ Failed to upload to OSS: {result}")

            # Compose the success message.
            org = submission_data["submission_org"]
            email_addr = submission_data["submission_email"]
            num_predictions = len(submission_data["predictions"])

            # BUGFIX: the Filename line previously lost its f-string
            # placeholder; it now interpolates the generated filename.
            success_msg = self.format_success(f"""
            🎉 <strong>Submission successful!</strong><br><br>
            📋 <strong>Submission Information:</strong><br>
            • Organization: {org}<br>
            • Email: {email_addr}<br>
            • Number of predictions: {num_predictions} questions<br>
            • Filename: {filename}<br><br>
            🚀 <strong>Storage Location:</strong><br>
            {result}<br><br>
            ⚡ <strong>Evaluation Status:</strong><br>
            Your submission has been successfully uploaded to cloud storage. The automatic evaluation system will begin processing within 5-15 minutes.<br><br>
            ⏳ <strong>Evaluation Process:</strong><br>
            1. 🔍 System automatically detects new submission<br>
            2. ⬇️ Downloads and validates submission format<br>
            3. 🔬 Performs comprehensive evaluation using LLM-as-Judge<br>
            4. 📊 Calculates accuracy for each subject and overall<br>
            5. 🏆 Automatically updates to leaderboard<br><br>
            🕐 <strong>Estimated Time:</strong><br>
            Evaluation completion time is approximately 5-15 minutes, depending on current queue length.<br>
            Please refresh the leaderboard later to view results.<br><br>
            🧪 Thank you for participating in the ATLAS scientific reasoning benchmark!
            """)

            return success_msg

        except Exception as e:
            return self.format_error(f"❌ Submission processing failed: {str(e)}")
201
+
202
+ # 兼容性函数 - 保持与原有代码的接口一致
203
def process_sage_submission_simple(submission_file, org_name=None, email=None) -> str:
    """
    Process an ATLAS benchmark submission file in OSS mode.

    Compatibility wrapper keeping the interface of the original submit.py
    entry point; simply delegates to a fresh OSSSubmissionHandler.
    """
    return OSSSubmissionHandler().process_sage_submission(submission_file, org_name, email)
210
+
211
def format_error(msg):
    """Wrap *msg* in red error-styled HTML (module-level helper)."""
    template = "<p style='color: red; font-size: 16px;'>{}</p>"
    return template.format(msg)
213
+
214
def format_success(msg):
    """Wrap *msg* in green success-styled HTML (module-level helper)."""
    template = "<p style='color: green; font-size: 16px;'>{}</p>"
    return template.format(msg)
216
+
217
def format_warning(msg):
    """Wrap *msg* in orange warning-styled HTML (module-level helper)."""
    template = "<p style='color: orange; font-size: 16px;'>{}</p>"
    return template.format(msg)
219
+
220
+ if __name__ == "__main__":
221
+ # 测试代码
222
+ print("🧪 测试OSS提交处理器")
223
+
224
+ # 检查环境变量
225
+ required_env_vars = ["OSS_ACCESS_KEY_ID", "OSS_ACCESS_KEY_SECRET"]
226
+ missing_vars = [var for var in required_env_vars if not os.getenv(var)]
227
+
228
+ if missing_vars:
229
+ print(f"❌ 缺少必需的环境变量: {missing_vars}")
230
+ exit(1)
231
+
232
+ handler = OSSSubmissionHandler()
233
+ print("✅ OSS提交处理器初始化成功")
src/populate.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
+ import pandas as pd
5
+ from typing import List
6
+
7
+ from src.display.formatting import has_no_nan_values, make_clickable_model
8
+ from src.display.utils import AutoEvalColumn
9
+
10
# Import ATLAS-specific modules - avoid transformers dependency
# The whole setup is wrapped in try/except so the leaderboard degrades
# gracefully (loader stays None -> empty dataframe) if optional
# dependencies are missing.
process_sage_results_for_leaderboard = None
try:
    # Import ATLAS modules without triggering transformers dependency
    import sys
    import os
    import json
    from dataclasses import dataclass
    from typing import Dict, List, Any
    import numpy as np

    # Copy ATLASResult class locally to avoid import issues (keeping SAGEResult name for compatibility)
    @dataclass
    class SAGEResult:
        # One evaluated leaderboard entry loaded from OSS.
        submission_id: str          # e.g. "oss_03_Some_Model" (built by the loader below)
        organization: str
        email: str
        tokens: str                 # token count info; "N/A" when absent
        accuracy: float             # overall accuracy, in percent
        mg_pass_2: float            # mG-Pass@2 metric, in percent
        mg_pass_4: float            # mG-Pass@4 metric, in percent
        submitted_time: str         # ISO timestamp or "YYYY-MM-DD ..." string
        status: str = "EVALUATED"

        def to_dict(self):
            """Converts the ATLAS Result to a dict compatible with our dataframe display"""
            # Extract model name from submission_id or use model_name directly
            # (the loader attaches `model_name` as an extra attribute).
            if hasattr(self, 'model_name'):
                model_name = self.model_name
            elif self.submission_id.startswith("oss_"):
                # Extract model name from submission_id:
                # "oss_<idx>_<name>" -> "<name>" with underscores as spaces.
                model_name = self.submission_id.split("_", 2)[-1].replace("_", " ")
            else:
                model_name = self.submission_id

            # Create display name (bold markdown for the table cell)
            display_name = f"**{model_name}**"
            model_symbol = "🤖"  # NOTE(review): currently unused in the dict below

            # Format date to YYYY-MM-DD only
            formatted_date = self.submitted_time
            if isinstance(self.submitted_time, str):
                # Try to parse and reformat date
                try:
                    if 'T' in self.submitted_time:
                        # ISO format like "2025-09-09T14:37:23.616340"
                        formatted_date = self.submitted_time.split('T')[0]
                    else:
                        # Already in simple format
                        formatted_date = self.submitted_time.split(' ')[0]
                except:
                    formatted_date = self.submitted_time

            data_dict = {
                "Model": display_name,
                "Organization": self.organization,
                "Accuracy (%)": round(self.accuracy, 2),
                "mG-Pass@2 (%)": round(self.mg_pass_2, 2),
                "mG-Pass@4 (%)": round(self.mg_pass_4, 2),
                "Submission Date": formatted_date,
            }

            return data_dict

    def load_initial_sage_results_from_oss() -> List[SAGEResult]:
        """Load initial ATLAS results from OSS"""
        sage_results = []

        try:
            # Import the OSS leaderboard manager (deferred so a missing OSS
            # setup only disables this loader).
            from src.oss.oss_leaderboard_manager import OSSLeaderboardManager

            # Load leaderboard data from OSS
            leaderboard_manager = OSSLeaderboardManager()
            initial_data = leaderboard_manager.load_leaderboard_from_oss()

            if initial_data:
                print(f"✅ Loaded {len(initial_data)} leaderboard entries from OSS")

                for i, entry in enumerate(initial_data):
                    # Build one SAGEResult per OSS entry; missing optional
                    # fields fall back to sensible defaults.
                    sage_result = SAGEResult(
                        submission_id=f"oss_{i:02d}_{entry['model_name'].replace(' ', '_').replace('-', '_')}",
                        organization=entry['organization'],
                        email=entry.get('contact_email', f"contact@{entry['organization'].lower().replace(' ', '')}.com"),
                        tokens=entry.get('tokens', 'N/A'),
                        accuracy=entry.get('accuracy', 0.0),
                        mg_pass_2=entry.get('mg_pass_2', 0.0),
                        mg_pass_4=entry.get('mg_pass_4', 0.0),
                        submitted_time=entry["submitted_time"],
                        status="EVALUATED"
                    )
                    # Add model_name as additional attribute for display
                    sage_result.model_name = entry['model_name']
                    sage_results.append(sage_result)
            else:
                print("⚠️ No leaderboard data found in OSS")

        except Exception as e:
            # Best-effort: an OSS failure yields an empty leaderboard.
            print(f"❌ Failed to load leaderboard from OSS: {e}")

        return sage_results

    def process_sage_results_for_leaderboard_oss() -> List[SAGEResult]:
        """Process all ATLAS results from OSS"""
        return load_initial_sage_results_from_oss()

    # Set the function (module-level entry point used by get_sage_leaderboard_df)
    process_sage_results_for_leaderboard = process_sage_results_for_leaderboard_oss

except ImportError as e:
    # NOTE(review): only ImportError is caught here; any other failure during
    # this setup would propagate at import time - confirm that is intended.
    print(f"Could not set up ATLAS results processing: {e}")
    process_sage_results_for_leaderboard = None
122
+
123
+
124
def get_sage_leaderboard_df() -> pd.DataFrame:
    """Build the ATLAS leaderboard dataframe from evaluation results.

    Returns an empty dataframe when the results loader is unavailable or
    produces no entries.
    """
    if process_sage_results_for_leaderboard is None:
        return pd.DataFrame()

    # Collect per-result display dicts.
    records = [entry.to_dict() for entry in process_sage_results_for_leaderboard()]
    if not records:
        return pd.DataFrame()

    df = pd.DataFrame.from_records(records)
    cols = set(df.columns)

    # Keep only the latest submission for each (Model, Organization) pair:
    # sort newest-first, then drop the older duplicates.
    if {"Model", "Organization", "Submission Date"} <= cols:
        df = df.sort_values(by=["Submission Date"], ascending=False)
        df = df.drop_duplicates(subset=["Model", "Organization"], keep="first")

    # Rank the leaderboard by accuracy, best first.
    if "Accuracy (%)" in cols:
        df = df.sort_values(by=["Accuracy (%)"], ascending=False)

    # Normalize metric precision for display.
    for metric in ("Accuracy (%)", "mG-Pass@2 (%)", "mG-Pass@4 (%)"):
        if metric in cols:
            df[metric] = df[metric].round(2)

    return df
src/submission/submit.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ ATLAS提交处理 - OSS模式
4
+ 使用阿里云OSS替代git/http提交方式
5
+ """
6
+
7
+ import json
8
+ import os
9
+ import sys
10
+ from datetime import datetime
11
+ from typing import Dict, Any, Tuple
12
+ from pathlib import Path
13
+
14
+ # 导入OSS提交处理器
15
+ try:
16
+ from src.oss.oss_submission_handler import OSSSubmissionHandler
17
+ OSS_AVAILABLE = True
18
+ except ImportError as e:
19
+ print(f"⚠️ OSS module not available, using fallback mode: {e}")
20
+ OSS_AVAILABLE = False
21
+
22
def format_error(msg):
    """Render *msg* as a red, 16px HTML paragraph for error display."""
    return "<p style='color: red; font-size: 16px;'>{}</p>".format(msg)
24
+
25
def format_success(msg):
    """Render *msg* as a green, 16px HTML paragraph for success display."""
    return "<p style='color: green; font-size: 16px;'>{}</p>".format(msg)
27
+
28
def format_warning(msg):
    """Render *msg* as an orange, 16px HTML paragraph for warning display."""
    return "<p style='color: orange; font-size: 16px;'>{}</p>".format(msg)
30
+
31
def validate_sage_submission(submission_data: Dict[str, Any]) -> Tuple[bool, str]:
    """Validate the structure of an ATLAS benchmark submission.

    Checks the required top-level fields, runs a minimal email sanity test,
    and verifies the shape of every prediction entry.

    Returns:
        ``(is_valid, message)`` - message explains the first failure found,
        or confirms validity.
    """
    # Required top-level fields.
    for field in ("submission_org", "submission_email", "predictions"):
        if field not in submission_data:
            return False, f"Missing required field: {field}"

    # Minimal sanity check on the contact address (not a full RFC check).
    email = submission_data["submission_email"]
    if "@" not in email or "." not in email:
        return False, "Invalid email format"

    predictions = submission_data["predictions"]
    if not isinstance(predictions, list) or not predictions:
        return False, "predictions must be a non-empty list"

    for i, prediction in enumerate(predictions):
        # Required per-prediction fields.
        for field in ("original_question_id", "content", "reasoning_content"):
            if field not in prediction:
                return False, f"Missing field in prediction {i}: {field}"

        # Each prediction carries exactly 4 sampled answers.
        content = prediction["content"]
        if not isinstance(content, list) or len(content) != 4:
            return False, f"content in prediction {i} must be a list with 4 items"

        # reasoning_content may be empty; only its type is enforced.
        if not isinstance(prediction["reasoning_content"], list):
            return False, f"reasoning_content in prediction {i} must be a list"

        if not isinstance(prediction["original_question_id"], int):
            return False, f"question ID in prediction {i} must be an integer"

    return True, "Submission format is valid"
76
+
77
def save_submission_file(submission_data: Dict[str, Any], submissions_dir: str = "./submissions") -> str:
    """Persist a submission as pretty-printed UTF-8 JSON.

    The file name encodes the submitting organization (with path-hostile
    characters replaced by underscores) and a second-resolution timestamp.

    Returns:
        The path of the written file.
    """
    # Make sure the target directory exists.
    os.makedirs(submissions_dir, exist_ok=True)

    # Sanitize the organization name so it is safe inside a file name.
    safe_org = submission_data["submission_org"]
    for ch in (" ", "/", "\\"):
        safe_org = safe_org.replace(ch, "_")

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    destination = os.path.join(submissions_dir, f"submission_{safe_org}_{stamp}.json")

    # Write human-readable JSON, keeping non-ASCII characters intact.
    with open(destination, "w", encoding="utf-8") as handle:
        json.dump(submission_data, handle, indent=2, ensure_ascii=False)

    return destination
96
+
97
def process_sage_submission_simple(submission_file, model_name=None, org_name=None, email=None) -> str:
    """Handle an ATLAS benchmark submission (file-collection mode).

    Validates and stores the uploaded prediction file; evaluation happens
    asynchronously elsewhere. Tries the OSS pipeline first and falls back
    to plain local storage when OSS is unavailable or fails.

    Args:
        submission_file: Path to the uploaded JSON file, or None when
            nothing was uploaded.
        model_name: Optional model name from the submission form.
        org_name: Optional organization name from the submission form.
        email: Optional contact email from the submission form.

    Returns:
        An HTML fragment describing the outcome (success or error).
    """
    try:
        if submission_file is None:
            return format_error("❌ No file uploaded. Please select a JSON file.")

        # submission_file is a plain path string; read it as UTF-8 text.
        try:
            with open(submission_file, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            return format_error(f"❌ Error reading file: {str(e)}")

        # Parse the JSON payload.
        try:
            submission_data = json.loads(content)
        except json.JSONDecodeError as e:
            return format_error(f"❌ Invalid JSON format: {str(e)}")

        # Form fields, when provided, override whatever is inside the file.
        if model_name:
            submission_data["model_name"] = model_name.strip()
        if org_name and email:
            submission_data["submission_org"] = org_name.strip()
            submission_data["submission_email"] = email.strip()

        # Validate the submission structure before persisting anything.
        is_valid, message = validate_sage_submission(submission_data)
        if not is_valid:
            return format_error(f"❌ Submission validation failed: {message}")

        try:
            saved_path = save_submission_file(submission_data)
            print(f"✅ Submission file saved to: {saved_path}")

            # Preferred path: hand the submission to the OSS pipeline.
            if OSS_AVAILABLE:
                try:
                    oss_handler = OSSSubmissionHandler()
                    # NOTE(review): the OSS compat wrapper receives a file
                    # path, while here we pass the parsed dict - confirm the
                    # handler accepts both input shapes.
                    result = oss_handler.process_sage_submission(submission_data, org_name, email)

                    if "Submission successful" in result or "successful" in result.lower():
                        return result
                    # OSS rejected the submission; fall through to local mode.
                    print(f"⚠️ OSS submission failed, using fallback mode: {result}")
                except Exception as e:
                    print(f"⚠️ OSS submission exception, using fallback mode: {e}")

            # Fallback mode: the file stays in local storage only.
            # BUG FIX: the saved file name was computed but never shown
            # (the message previously hard-coded "(unknown)").
            filename = os.path.basename(saved_path)

            org = submission_data["submission_org"]
            email_addr = submission_data["submission_email"]
            num_predictions = len(submission_data["predictions"])

            success_msg = format_success(f"""
🎉 <strong>Submission successful!</strong><br><br>
📋 <strong>Submission Information:</strong><br>
• Organization: {org}<br>
• Email: {email_addr}<br>
• Number of predictions: {num_predictions} questions<br>
• Filename: {filename}<br><br>
🚀 <strong>Storage Status:</strong><br>
File saved to local storage, awaiting system sync to evaluation environment.<br><br>
⏳ <strong>Evaluation Process:</strong><br>
Your submission will be automatically evaluated using LLM-as-Judge, including comprehensive testing of scientific reasoning capabilities.<br>
Results will appear automatically on the leaderboard after evaluation is complete.<br><br>
🕐 <strong>Estimated Time:</strong><br>
• Normal case: 5-15 minutes<br>
• Sync delay: 15-60 minutes<br><br>
🧪 Thank you for participating in the ATLAS scientific reasoning benchmark!
""")

            return success_msg

        except Exception as e:
            return format_error(f"❌ Error saving submission file: {str(e)}")

    except Exception as e:
        return format_error(f"❌ Submission processing failed: {str(e)}")
187
+
188
def get_submission_stats(submissions_dir: str = "./submissions") -> Dict[str, Any]:
    """Summarize saved submissions.

    Scans *submissions_dir* for files written by save_submission_file and
    returns ``{"total": <count>, "recent": <up to 10 newest entries>}``.
    Unreadable or malformed files are skipped.
    """
    if not os.path.exists(submissions_dir):
        return {"total": 0, "recent": []}

    submissions = []

    for filename in os.listdir(submissions_dir):
        if not (filename.startswith("submission_") and filename.endswith(".json")):
            continue

        file_path = os.path.join(submissions_dir, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # File names look like submission_<org>_<YYYYmmdd>_<HHMMSS>.json,
            # so the timestamp is the last TWO underscore-separated pieces.
            # BUG FIX: taking only the last piece dropped the date part and
            # made strptime fail on every file.
            stem = filename[:-len(".json")]
            timestamp_str = "_".join(stem.split("_")[-2:])
            try:
                timestamp = datetime.strptime(timestamp_str, "%Y%m%d_%H%M%S")
                formatted_time = timestamp.strftime("%Y-%m-%d %H:%M")
            except ValueError:
                # Unexpected name layout: fall back to the raw fragment.
                formatted_time = timestamp_str

            submissions.append({
                "org": data.get("submission_org", "Unknown"),
                "email": data.get("submission_email", ""),
                "time": formatted_time,
                "predictions": len(data.get("predictions", []))
            })

        except Exception:
            # Skip files that cannot be read or parsed.
            continue

    # Sort by time string, newest first (ISO-like format sorts correctly).
    submissions.sort(key=lambda x: x["time"], reverse=True)

    return {
        "total": len(submissions),
        "recent": submissions[:10]  # the 10 most recent
    }
228
+
229
+ # 移除了原有的HTTP推送函数,现在使用OSS模式
230
+