Gabe Mancino-Ball committed on
Commit
9b9ead9
·
1 Parent(s): f48dc65
Files changed (2) hide show
  1. app.py +39 -8
  2. utils.py +14 -0
app.py CHANGED
@@ -33,6 +33,16 @@ def get_max_score(group: pd.DataFrame, metric: str, use_selection: bool = True)
33
  return group.loc[max_idx]
34
 
35
 
 
 
 
 
 
 
 
 
 
 
36
  @st.cache_data
37
  def load_results(task_key, best_only, metric="balanced_accuracy"):
38
  to_return = {}
@@ -41,16 +51,37 @@ def load_results(task_key, best_only, metric="balanced_accuracy"):
41
  file_path = f"{results_path}/{task_key}_{score}_{split}_score.csv"
42
  if os.path.exists(file_path):
43
  df = pd.read_csv(file_path)
 
44
  if not best_only:
45
  to_return[f"{split}_{score}_score"] = df
46
  else:
47
- df = df.sort_values(["team", metric], ascending=False).reset_index(drop=True)
48
- df = (
49
- df.groupby("team", group_keys=False)
50
- .apply(get_max_score, metric=metric, use_selection=False if split == "public" else True)
51
- .sort_values([metric], ascending=False)
52
- .set_index("team")
53
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  to_return[f"{split}_{score}_score"] = df
55
  return to_return
56
 
@@ -305,7 +336,7 @@ def show_dataframe_w_format(df, format="compact", top_n=None):
305
  # Calculate the mean of top n values for each column
306
  top_n_means = {}
307
  for col in df.columns:
308
- sorted_values = df[col].sort_values(ascending=False)
309
  # Ensure we don't try to take more values than available
310
  actual_n = min(top_n, len(sorted_values))
311
  if actual_n > 0:
 
33
  return group.loc[max_idx]
34
 
35
 
36
def select_rows(df, metric: str = "balanced_accuracy"):
    """Pick one row per team from ``df``.

    For each ``"team"`` group:

    * if any row has a truthy ``"selected"`` value, return the selected row
      with the largest ``metric`` value;
    * otherwise return the row with the largest ``f"{metric}_public"`` value.

    Args:
        df: Frame containing at least the ``"team"``, ``"selected"``,
            ``metric`` and ``f"{metric}_public"`` columns.
        metric: Name of the score column used to rank the selected rows.

    Returns:
        The result of ``groupby("team").apply(...)`` with one chosen row per
        team.
    """
    public_column = f"{metric}_public"

    def _pick(team_rows):
        flagged = team_rows["selected"]
        if not flagged.any():
            # Nothing was selected for this team: fall back to the best public score.
            return team_rows.loc[team_rows[public_column].idxmax()]
        candidates = team_rows[flagged]
        return candidates.loc[candidates[metric].idxmax()]

    per_team = df.groupby("team", group_keys=False)
    return per_team.apply(_pick)
44
+
45
+
46
  @st.cache_data
47
  def load_results(task_key, best_only, metric="balanced_accuracy"):
48
  to_return = {}
 
51
  file_path = f"{results_path}/{task_key}_{score}_{split}_score.csv"
52
  if os.path.exists(file_path):
53
  df = pd.read_csv(file_path)
54
+ public_df = pd.read_csv(f"{results_path}/{task_key}_{score}_public_score.csv")
55
  if not best_only:
56
  to_return[f"{split}_{score}_score"] = df
57
  else:
58
+ if split == "public":
59
+ df = df.sort_values(["team", metric], ascending=False).reset_index(drop=True)
60
+ selected_max = (
61
+ df.copy()
62
+ .groupby("team", group_keys=False)
63
+ .apply(get_max_score, metric=metric, use_selection=True)
64
+ .sort_values([metric], ascending=False)
65
+ .set_index("team")
66
+ )
67
+ df = (
68
+ df.copy()
69
+ .groupby("team", group_keys=False)
70
+ .apply(get_max_score, metric=metric, use_selection=False)
71
+ .sort_values([metric], ascending=False)
72
+ .set_index("team")
73
+ )
74
+ print((df["balanced_accuracy"] - selected_max["balanced_accuracy"]))
75
+ else:
76
+ public_df = (
77
+ public_df.sort_values(["team", metric], ascending=False)
78
+ .reset_index(drop=True)
79
+ .set_index("submission_id")["balanced_accuracy"]
80
+ )
81
+ tmp = df.set_index("submission_id")
82
+ tmp = tmp.join(public_df, on=["submission_id"], rsuffix="_public")
83
+ df = select_rows(tmp)
84
+ df = df.sort_values([metric], ascending=False).set_index("team")
85
  to_return[f"{split}_{score}_score"] = df
86
  return to_return
87
 
 
336
  # Calculate the mean of top n values for each column
337
  top_n_means = {}
338
  for col in df.columns:
339
+ sorted_values = df[col] # .sort_values(ascending=False)
340
  # Ensure we don't try to take more values than available
341
  actual_n = min(top_n, len(sorted_values))
342
  if actual_n > 0:
utils.py CHANGED
@@ -30,6 +30,7 @@ STATUS_MAP = {0: "PENDING", 1: "QUEUED", 2: "PROCESSING", 3: "SUCCESS", 4: "FAIL
30
 
31
  ## Make a directory to store computed results
32
  os.makedirs(Path("competition_cache") / "cached_results", exist_ok=True)
 
33
 
34
 
35
  def load_teams(competition_space_path: Path) -> pd.DataFrame:
@@ -218,6 +219,17 @@ def create_custom_subs():
218
  )
219
 
220
 
 
 
 
 
 
 
 
 
 
 
 
221
  if __name__ == "__main__":
222
 
223
  ## Download data
@@ -513,6 +525,8 @@ if __name__ == "__main__":
513
  / f"{str(local_dir).split('/')[-1]}_{score_name}_private_only_score.csv",
514
  index=False,
515
  )
 
 
516
 
517
  rocs.to_csv(
518
  Path("competition_cache") / "cached_results" / f"{str(local_dir).split('/')[-1]}_{score_name}_rocs.csv",
 
30
 
31
  ## Make a directory to store computed results
32
  os.makedirs(Path("competition_cache") / "cached_results", exist_ok=True)
33
+ os.makedirs(Path("competition_cache") / "cached_results" / "by_team", exist_ok=True)
34
 
35
 
36
  def load_teams(competition_space_path: Path) -> pd.DataFrame:
 
219
  )
220
 
221
 
222
def save_by_team(df: pd.DataFrame, save_path_base: str) -> None:
    """Write one CSV per team under ``competition_cache/cached_results/by_team``.

    For every distinct value in ``df["team"]`` a directory named after that
    value is created (if missing) and the rows belonging to the team are
    written to ``<team dir>/<save_path_base>`` without the index column.

    Args:
        df: Frame with a ``"team"`` column; it is copied, not modified.
        save_path_base: File name used inside each team's directory.
    """
    snapshot = df.copy()
    team_names = snapshot["team"].unique()
    for team in team_names:
        team_dir = f"competition_cache/cached_results/by_team/{team}"
        os.makedirs(team_dir, exist_ok=True)
        belongs_to_team = snapshot["team"] == team
        team_rows = snapshot[belongs_to_team].copy()
        team_rows.to_csv(f"{team_dir}/{save_path_base}", index=False)
231
+
232
+
233
  if __name__ == "__main__":
234
 
235
  ## Download data
 
525
  / f"{str(local_dir).split('/')[-1]}_{score_name}_private_only_score.csv",
526
  index=False,
527
  )
528
+ save_by_team(df=public, save_path_base=f"{str(local_dir).split('/')[-1]}_{score_name}_public.csv")
529
+ save_by_team(df=private, save_path_base=f"{str(local_dir).split('/')[-1]}_{score_name}_private.csv")
530
 
531
  rocs.to_csv(
532
  Path("competition_cache") / "cached_results" / f"{str(local_dir).split('/')[-1]}_{score_name}_rocs.csv",