Spaces:
Sleeping
Sleeping
| """ | |
| Utility classes and functions for the CodeReview Bench Leaderboard display. | |
| """ | |
| from dataclasses import dataclass, field, fields | |
| from enum import Enum, auto | |
| from typing import List, Optional | |
class Mode(Enum):
    """Inference modes a review model can be run in."""

    CoT = auto()  # Chain of Thought
    Strict = auto()

    def __str__(self) -> str:
        """Return the bare member name (e.g. ``"CoT"``) instead of ``Mode.CoT``."""
        member_name = self.name
        return member_name
class ModelType(Enum):
    """Access/licensing categories a leaderboard model can fall into."""

    Unknown = auto()
    OpenSource = auto()
    ClosedSource = auto()
    API = auto()

    def to_str(self, separator: str = "-") -> str:
        """
        Render this member as a human-readable label.

        Args:
            separator: Text placed between the two words of the
                ``OpenSource`` / ``ClosedSource`` labels.

        Returns:
            ``"Open<sep>Source"``, ``"Closed<sep>Source"``, ``"API"`` or
            ``"Unknown"``.
        """
        # Built per call because two of the labels depend on ``separator``.
        labels = {
            ModelType.OpenSource: f"Open{separator}Source",
            ModelType.ClosedSource: f"Closed{separator}Source",
            ModelType.API: "API",
        }
        # ``Unknown`` (and anything unmapped) falls back to "Unknown".
        return labels.get(self, "Unknown")
class ReviewModelType(str, Enum):
    """Families of review models; each member is also a plain ``str``."""

    GPT_4 = "gpt-4"
    GPT_3_5 = "gpt-3.5-turbo"
    CLAUDE = "claude"
    LLAMA = "llama"
    GEMINI = "gemini"
    CUSTOM = "custom"

    def __str__(self) -> str:
        """Return the member's string value (e.g. ``"gpt-4"``)."""
        raw_value = self.value
        return raw_value
class Precision(Enum):
    """Numeric precision a model's weights may be stored in."""

    Unknown = auto()
    float16 = auto()
    bfloat16 = auto()
    float32 = auto()
    int8 = auto()
    int4 = auto()
    NA = auto()

    def __str__(self) -> str:
        """Return the bare member name (e.g. ``"float16"``)."""
        member_name = self.name
        return member_name
class WeightType(Enum):
    """Kinds of weights a model submission can provide."""

    Original = auto()
    Delta = auto()
    Adapter = auto()

    def __str__(self) -> str:
        """Return the bare member name (e.g. ``"Adapter"``)."""
        member_name = self.name
        return member_name
# NOTE: both classes must be dataclasses. Without the decorator, ColumnInfo has
# no keyword-accepting __init__, the ``field(...)`` defaults stay as raw Field
# objects, and ``dataclasses.fields()`` raises TypeError on the column holder.
@dataclass
class ColumnInfo:
    """Information about a column in the leaderboard."""

    name: str  # internal column key
    display_name: str  # label associated with the column
    type: str = "text"  # "text" by default; metric columns use "number"
    hidden: bool = False
    never_hidden: bool = False
    displayed_by_default: bool = True


@dataclass
class CodeReviewBenchColumn:
    """
    Columns for the CodeReview Bench leaderboard.

    Every attribute is a ColumnInfo built by a ``default_factory``, so each
    instance of this class gets its own ColumnInfo objects. Field declaration
    order is the order reported by ``dataclasses.fields()``.
    """

    # Core metadata
    model_name: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="model_name",
        display_name="Model",
        never_hidden=True,
        displayed_by_default=True
    ))
    mode: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="mode",
        display_name="Mode",
        displayed_by_default=True
    ))
    model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="model_type",
        display_name="Access_Type",
        displayed_by_default=True
    ))
    submission_date: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="submission_date",
        display_name="Submission_Date",
        displayed_by_default=False
    ))
    version: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="version",
        display_name="Version",
        displayed_by_default=False
    ))
    review_model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="review_model_type",
        display_name="Type",
        displayed_by_default=False
    ))
    base_model: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="base_model",
        display_name="Base Model",
        displayed_by_default=False
    ))
    revision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="revision",
        display_name="Revision",
        displayed_by_default=False
    ))
    precision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="precision",
        display_name="Precision",
        displayed_by_default=False
    ))
    weight_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="weight_type",
        display_name="Weight Type",
        displayed_by_default=False
    ))
    topic: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="topic",
        display_name="Topic",
        displayed_by_default=True
    ))

    # LLM-based multimetric scores
    readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="readability",
        display_name="Readability",
        type="number",
        displayed_by_default=True
    ))
    relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="relevance",
        display_name="Relevance",
        type="number",
        displayed_by_default=True
    ))
    explanation_clarity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="explanation_clarity",
        display_name="Explanation_Clarity",
        type="number",
        displayed_by_default=True
    ))
    problem_identification: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="problem_identification",
        display_name="Problem_Identification",
        type="number",
        displayed_by_default=True
    ))
    actionability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="actionability",
        display_name="Actionability",
        type="number",
        displayed_by_default=True
    ))
    completeness: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="completeness",
        display_name="Completeness",
        type="number",
        displayed_by_default=True
    ))
    specificity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="specificity",
        display_name="Specificity",
        type="number",
        displayed_by_default=True
    ))
    contextual_adequacy: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="contextual_adequacy",
        display_name="Contextual_Adequacy",
        type="number",
        displayed_by_default=True
    ))
    consistency: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="consistency",
        display_name="Consistency",
        type="number",
        displayed_by_default=True
    ))
    brevity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="brevity",
        display_name="Brevity",
        type="number",
        displayed_by_default=True
    ))

    # LLM-based-exact-match metrics
    pass_at_1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="pass_at_1",
        display_name="Pass@1",
        type="number",
        displayed_by_default=True
    ))
    pass_at_5: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="pass_at_5",
        display_name="Pass@5",
        type="number",
        displayed_by_default=True
    ))
    pass_at_10: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="pass_at_10",
        display_name="Pass@10",
        type="number",
        displayed_by_default=True
    ))
    bleu_at_10: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="bleu_at_10",
        display_name="BLEU@10",
        type="number",
        displayed_by_default=True
    ))

    # Overall aggregated metrics
    overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="overall_score",
        display_name="Overall_Score",
        type="number",
        displayed_by_default=True
    ))
    multimetric_average: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="multimetric_average",
        display_name="Multimetric_Average",
        type="number",
        displayed_by_default=True
    ))
    exact_match_average: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="exact_match_average",
        display_name="Exact_Match_Average",
        type="number",
        displayed_by_default=True
    ))
    total_evaluations: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="total_evaluations",
        display_name="Total_Evaluations",
        type="number",
        displayed_by_default=True
    ))

    # Language-specific metrics (Russian)
    ru_readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="ru_readability",
        display_name="RU_Readability",
        type="number",
        displayed_by_default=False
    ))
    ru_relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="ru_relevance",
        display_name="RU_Relevance",
        type="number",
        displayed_by_default=False
    ))
    ru_overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="ru_overall_score",
        display_name="RU_Overall_Score",
        type="number",
        displayed_by_default=False
    ))

    # Language-specific metrics (English)
    en_readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="en_readability",
        display_name="EN_Readability",
        type="number",
        displayed_by_default=False
    ))
    en_relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="en_relevance",
        display_name="EN_Relevance",
        type="number",
        displayed_by_default=False
    ))
    en_overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="en_overall_score",
        display_name="EN_Overall_Score",
        type="number",
        displayed_by_default=False
    ))
# Single shared instance holding every column definition.
CODEREVIEW_COLUMN = CodeReviewBenchColumn()

# Attribute names of all declared columns, in declaration order.
COLS = [column_field.name for column_field in fields(CODEREVIEW_COLUMN)]

# Internal names of the columns flagged as displayed by default.
DISPLAY_COLS = [
    info.name
    for info in (
        getattr(CODEREVIEW_COLUMN, column_field.name)
        for column_field in fields(CODEREVIEW_COLUMN)
    )
    if info.displayed_by_default
]
# Manually reorder DISPLAY_COLS to put 'mode' after 'model_name'
def reorder_display_cols(cols: Optional[List[str]] = None) -> List[str]:
    """
    Return a copy of a column list with 'mode' placed right after 'model_name'.

    Args:
        cols: Column names to reorder. When omitted, the module-level
            DISPLAY_COLS list is used.

    Returns:
        A new list. If either 'model_name' or 'mode' is missing, the copy is
        returned in its original order.
    """
    # Work on a copy so neither the caller's list nor the module-level
    # DISPLAY_COLS is mutated as a hidden side effect.
    ordered = list(DISPLAY_COLS if cols is None else cols)
    if 'model_name' in ordered and 'mode' in ordered:
        ordered.remove('mode')
        # Look up the index only after the removal: taking 'mode' out can
        # shift 'model_name' one position to the left.
        ordered.insert(ordered.index('model_name') + 1, 'mode')
    return ordered
# Apply the 'mode'-after-'model_name' ordering to the default display list.
DISPLAY_COLS = reorder_display_cols()

# Internal names of columns whose type is "number".
METRIC_COLS = [
    info.name
    for info in (
        getattr(CODEREVIEW_COLUMN, column_field.name)
        for column_field in fields(CODEREVIEW_COLUMN)
    )
    if info.type == "number"
]

# Internal names of columns flagged as hidden.
HIDDEN_COLS = [
    info.name
    for info in (
        getattr(CODEREVIEW_COLUMN, column_field.name)
        for column_field in fields(CODEREVIEW_COLUMN)
    )
    if info.hidden
]

# Internal names of columns flagged as never hidden.
NEVER_HIDDEN_COLS = [
    info.name
    for info in (
        getattr(CODEREVIEW_COLUMN, column_field.name)
        for column_field in fields(CODEREVIEW_COLUMN)
    )
    if info.never_hidden
]
# Programming languages used as CodeReview Bench categories.
CATEGORIES = ["Python", "Java", "Scala", "Go"]

# Natural languages of review comments: Russian ("ru") and English ("en").
COMMENT_LANGUAGES = ["ru", "en"]

# Review topics covered by CodeReview Bench.
TOPICS = [
    "Code Reliability",
    "Coding Standards",
    "Code Organization",
    "Performance Issues",
    "Validation",
    "Variables",
]

# Example categories.
EXAMPLE_CATEGORIES = [
    "Bug_Fix",
    "Code_Style",
    "Performance",
    "Security",
    "Refactoring",
    "Documentation",
    "Testing",
    "Architecture",
    "Other",
]
# Names of the multimetric scores for CodeReview Bench.
MULTIMETRIC_METRICS = [
    'readability',
    'relevance',
    'explanation_clarity',
    'problem_identification',
    'actionability',
    'completeness',
    'specificity',
    'contextual_adequacy',
    'consistency',
    'brevity',
]

# Names of the exact-match metrics for CodeReview Bench.
EXACT_MATCH_METRICS = [
    'pass_at_1',
    'pass_at_5',
    'pass_at_10',
    'bleu_at_10',
]
def get_all_column_choices():
    """
    Get the selectable column choices for the multiselect dropdown.

    Note: columns already visible by default are left out; only the
    remaining columns are offered as choices.

    Returns:
        List of tuples with (column_name, display_name), in field declaration
        order, for every column whose name is not returned by
        get_default_visible_columns().
    """
    already_visible = get_default_visible_columns()
    every_column = (
        getattr(CODEREVIEW_COLUMN, column_field.name)
        for column_field in fields(CODEREVIEW_COLUMN)
    )
    # Pair the internal name with the display name for each remaining column.
    return [
        (info.name, info.display_name)
        for info in every_column
        if info.name not in already_visible
    ]
def get_default_visible_columns():
    """
    Collect the names of the columns that are shown by default.

    Returns:
        List of internal column names, in field declaration order, whose
        ColumnInfo has ``displayed_by_default`` set.
    """
    visible_names = []
    for column_field in fields(CODEREVIEW_COLUMN):
        info = getattr(CODEREVIEW_COLUMN, column_field.name)
        if info.displayed_by_default:
            visible_names.append(info.name)
    return visible_names