aauss committed on
Commit
d84d51c
·
1 Parent(s): b807233

Catch unordered list case and cast preds and ref to str for better reliability.

Browse files
Files changed (2) hide show
  1. test_of_time_accuracy.py +13 -10
  2. tests.py +4 -4
test_of_time_accuracy.py CHANGED
@@ -113,15 +113,12 @@ class TestOfTimeAccuracy(evaluate.Metric):
113
 
114
  @staticmethod
115
  def _parse_label(s):
116
- """Parses a string that could be a JSON object or a Python dict."""
117
  try:
118
- return json.loads(s)
119
- except json.JSONDecodeError:
120
- try:
121
- # Safe: only parses literals, does not execute code
122
- return ast.literal_eval(s)
123
- except (ValueError, SyntaxError):
124
- return None
125
 
126
  def _compute(
127
  self,
@@ -139,8 +136,14 @@ class TestOfTimeAccuracy(evaluate.Metric):
139
  references = [self._parse_label(r) for r in references]
140
  else:
141
  raise ValueError(f"Invalid subset: {subset}")
142
- accuracy = [i == j for i, j in zip(predictions, references)]
 
 
 
 
 
 
 
143
  if return_average:
144
  return {"accuracy": sum(accuracy) / len(accuracy)}
145
  return {"accuracy": accuracy}
146
-
 
113
 
114
  @staticmethod
115
  def _parse_label(s):
116
+ """Parses a string that could be a Python dict."""
117
  try:
118
+ # Safe: only parses literals, does not execute code
119
+ return ast.literal_eval(s)
120
+ except (ValueError, SyntaxError):
121
+ return None
 
 
 
122
 
123
  def _compute(
124
  self,
 
136
  references = [self._parse_label(r) for r in references]
137
  else:
138
  raise ValueError(f"Invalid subset: {subset}")
139
+ accuracy = []
140
+ for i, j in zip(predictions, references):
141
+ if subset == "arithmetic" and "unordered_list" in j:
142
+ i = sorted(i["unordered_list"])
143
+ j = sorted(j["unordered_list"])
144
+ accuracy.append(
145
+ str(i) == str(j)
146
) # Semantic subset answer JSON sometimes has an int as value, while the label is a string.
147
  if return_average:
148
  return {"accuracy": sum(accuracy) / len(accuracy)}
149
  return {"accuracy": accuracy}
 
tests.py CHANGED
@@ -5,12 +5,12 @@ from test_of_time_accuracy import TestOfTimeAccuracy
5
  arithmetic_test_cases = {
6
  "predictions": [
7
  'JSON = {"explanation": "The war began in 360 BC. Since BC years count backwards, adding 8 years to 360 BC means subtracting 8 from 360, resulting in 352 BC.", "answer": "352 BC"}',
8
- '```json\n{\n "explanation": "The dates provided are March 2012, September 2011, June 2017, September 2019, and June 2015. These correspond to visits to Miami, Sydney, Tokyo, London, and Nairobi respectively. The latest date among these is September 2019, which is associated with London. Therefore, London is the last city visited.",\n "unordered_list": ["London"]\n}\n```',
9
  ' "To find the date of the second most important game, we need to subtract 7 days from the date of the most important game. We can do this by counting back 7 days from April 14, 2005. April 14 - 7 days = April 7, 2005", "answer": "2005-04-07"}',
10
  ],
11
  "references": [
12
  '{"answer": "352 BC"}',
13
- '{"unordered_list": ["London"]}',
14
  "{'answer': '2005-04-07'}",
15
  ],
16
  "result": {"accuracy": 2 / 3},
@@ -19,9 +19,9 @@ arithmetic_test_cases = {
19
 
20
  semantic_test_cases = {
21
  "predictions": [
22
- ' "First, we need to find the third occurrence of E33 being the R53 of E22. We can see that it happened from 1959 to 1962, then from 1967 to 1968, and then from 1982 to 1984. The third occurrence happened from 1982 to 1984. We can then compute the duration by subtracting the start time from the end time.", "answer": 2}',
23
  ' "To find the duration, we need to find the start and end time when E97 was the R71 of E67. From the given facts, we can see that E97 was the R71 of E67 from 1961 to 1961, and also from 1964 to 1964. We need to find the first occurrence, which is from 1961 to 1961.", "answer": 1}',
24
- '{"explanation": "To find when E92 stopped being the R88 of E11, we need to look at the temporal facts where E92 was the R88 of E11 and find the end time. We see that E92 was the R88 of E11 from 1982 to 1985, and there is no other fact that indicates E92 stopped being the R88 of E11 before 1985. However, we also see that E92 was the R17 of E42 from 1986 to 1992, and E92 was the R88 of E42 from 1977 to 1979, but this is irrelevant to the question. Therefore, E92 stopped being the R88 of E11 in 1985.", "answer": "1985"}',
25
  ],
26
  "references": ["2", "0", "1985"],
27
  "result": {"accuracy": 1 / 3},
 
5
  arithmetic_test_cases = {
6
  "predictions": [
7
  'JSON = {"explanation": "The war began in 360 BC. Since BC years count backwards, adding 8 years to 360 BC means subtracting 8 from 360, resulting in 352 BC.", "answer": "352 BC"}',
8
+ '```json\n{\n "explanation": "The dates provided are March 2012, September 2011, June 2017, September 2019, and June 2015. These correspond to visits to Miami, Sydney, Tokyo, London, and Nairobi respectively. The latest date among these is September 2019, which is associated with London. Therefore, London is the last city visited.",\n "unordered_list": ["Berlin","London"]\n}\n```',
9
  ' "To find the date of the second most important game, we need to subtract 7 days from the date of the most important game. We can do this by counting back 7 days from April 14, 2005. April 14 - 7 days = April 7, 2005", "answer": "2005-04-07"}',
10
  ],
11
  "references": [
12
  '{"answer": "352 BC"}',
13
+ '{"unordered_list": ["London", "Berlin"]}',
14
  "{'answer': '2005-04-07'}",
15
  ],
16
  "result": {"accuracy": 2 / 3},
 
19
 
20
  semantic_test_cases = {
21
  "predictions": [
22
+ '{"explanation": First, we need to find the third occurrence of E33 being the R53 of E22. We can see that it happened from 1959 to 1962, then from 1967 to 1968, and then from 1982 to 1984. The third occurrence happened from 1982 to 1984. We can then compute the duration by subtracting the start time from the end time.", "answer": 2}',
23
  ' "To find the duration, we need to find the start and end time when E97 was the R71 of E67. From the given facts, we can see that E97 was the R71 of E67 from 1961 to 1961, and also from 1964 to 1964. We need to find the first occurrence, which is from 1961 to 1961.", "answer": 1}',
24
+ '{"explanation": "To find when E92 stopped being the R88 of E11, we need to look at the temporal facts where E92 was the R88 of E11 and find the end time. We see that E92 was the R88 of E11 from 1982 to 1985, and there is no other fact that indicates E92 stopped being the R88 of E11 before 1985. However, we also see that E92 was the R17 of E42 from 1986 to 1992, and E92 was the R88 of E42 from 1977 to 1979, but this is irrelevant to the question. Therefore, E92 stopped being the R88 of E11 in 1985.", "answer": 1985}',
25
  ],
26
  "references": ["2", "0", "1985"],
27
  "result": {"accuracy": 1 / 3},