# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
| """Accuracy metric for the Test of Time benchmark by Bahar et al. (2025).""" | |
| import ast | |
| import json | |
| from typing import Literal | |
| import datasets | |
| import evaluate | |
| _CITATION = """\ | |
| @InProceedings{huggingface:module, | |
| title = {Test of Time Accuracy}, | |
| authors={Auss Abbood}, | |
| year={2025} | |
| } | |
| """ | |
| _DESCRIPTION = """\ | |
| The Test of Time (ToT) benchmarks expects models format their answers as a JSON with an explanation field and an answer field that follows a predefined format. The metrics extracts JSONs objects from the model's output, retains only the first JSON, drops the explanation field and compares it with the reference answer. | |
| """ | |
| _KWARGS_DESCRIPTION = """ | |
| Compares the extracted answer from the model's output with the reference answer. | |
| Args: | |
| predictions: list of predictions to score. Each prediction should be a string that contains a JSON object (e.g., generated by an LLM). | |
| references: list of reference answers. | |
| subset: The subset of the benchmark being evaluated. Must be one of "arithmetic" or "semantic". | |
| return_average: If True, returns the average accuracy. If False, returns a list of boolean scores (correct/incorrect) for each sample. Defaults to True. | |
| Returns: | |
| accuracy: The accuracy score (0.0 to 1.0) if return_average=True, or a list of booleans indicating correctness per sample if return_average=False. | |
| Examples: | |
| >>> import evaluate | |
| >>> metric = evaluate.load("aauss/test_of_time_accuracy") | |
| >>> predictions = [ | |
| ... '{"explanation": "Some explanation...", "unordered_list": ["London"]}', | |
| ... ' "Response without opening curly brackets...", "answer": "2005-04-07"}', | |
| ... ] | |
| >>> references = [ | |
| ... '{"unordered_list": ["London"]}', | |
| ... "{'answer': '2005-04-07'}", | |
| ... ] | |
| >>> results = metric.compute(predictions=predictions, references=references, subset="arithmetic") | |
| >>> print(results) | |
| {'accuracy': 0.5} | |
| """ | |


class TestOfTimeAccuracy(evaluate.Metric):
    """Accuracy metric for the Test of Time benchmark by Bahar et al. (2025)."""

    __test__ = False

    def _info(self):
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
            # Homepage of the module for documentation
            # homepage="http://module.homepage",
            # Additional links to the codebase or references
            # codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            # reference_urls=["http://path.to.reference.url/new_module"],
        )

    @staticmethod
    def _extract_first_json_object(s: str) -> dict | None:
        """Scans the string and returns the first decodable JSON object, or None."""
        decoder = json.JSONDecoder()
        idx, end = 0, len(s)
        while idx < end:
            try:
                obj, next_idx = decoder.raw_decode(s, idx)
                idx = next_idx
                if isinstance(obj, dict):
                    return obj
            except ValueError:
                idx += 1
        return None
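
    # Illustration of the scan-and-decode behaviour above (hypothetical inputs,
    # not part of the benchmark data):
    #   _extract_first_json_object('noise {"answer": 3} {"other": 1}') -> {"answer": 3}
    #   _extract_first_json_object('["not", "a", "dict"] arrays are skipped') -> None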

    @staticmethod
    def _pop_explanation(d):
        """Removes the explanation field so only the answer fields are compared."""
        if isinstance(d, dict):
            d.pop("explanation", None)
        return d

    @staticmethod
    def _get_answer(d):
        """Returns the answer field of a parsed prediction, if present."""
        if isinstance(d, dict):
            return d.get("answer", None)
        return d

    @staticmethod
    def _parse_label(s):
        """Parses a string that could be a Python dict."""
        try:
            # Safe: only parses literals, does not execute code
            return ast.literal_eval(s)
        except (ValueError, SyntaxError):
            return None
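
    # Note: reference labels can be Python-literal strings with single quotes
    # (e.g. "{'answer': '2005-04-07'}" in the docstring example above), which
    # strict JSON parsing rejects; hence ast.literal_eval rather than json.loads.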

    def _compute(
        self,
        predictions,
        references,
        subset: Literal["arithmetic", "semantic"],
        return_average: bool = True,
    ):
        """Returns the scores"""
        predictions = [self._extract_first_json_object(p) for p in predictions]
        if subset == "semantic":
            predictions = [self._get_answer(p) for p in predictions]
        elif subset == "arithmetic":
            predictions = [self._pop_explanation(p) for p in predictions]
            references = [self._parse_label(r) for r in references]
        else:
            raise ValueError(f"Invalid subset: {subset}")
        accuracy = []
        for i, j in zip(predictions, references):
            # Order does not matter for list answers; guard against a failed
            # extraction or parse so it is scored as incorrect instead of raising.
            if subset == "arithmetic" and isinstance(j, dict) and "unordered_list" in j:
                if isinstance(i, dict) and "unordered_list" in i:
                    i = sorted(i["unordered_list"])
                j = sorted(j["unordered_list"])
            accuracy.append(
                str(i) == str(j)
            )  # Semantic subset answer JSON sometimes has int as value. Label is string.
        if return_average:
            return {"accuracy": sum(accuracy) / len(accuracy)}
        return {"accuracy": accuracy}