import json
import os

# huggingface_hub reads these variables at import time, so set them before the
# import below for them to take effect.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "20"

import datetime
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Tuple

import pandas as pd
import pytz
from huggingface_hub import snapshot_download
from tqdm.auto import tqdm

from metric import _metric

COMP_CACHE = os.environ.get("COMP_CACHE", "./competition_cache")
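# NOTE: COMP_CACHE is only honored by download_competition_data; everything below
# hardcodes "competition_cache", so overriding the variable changes where data is
# downloaded but not where cached results are read from or written to.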
def download_competition_data(competition_names: List[str]) -> None:
    """Download a local copy of each competition dataset repo into COMP_CACHE."""
    for repo_id in tqdm(competition_names):
snapshot_download(
repo_id=repo_id,
local_dir=os.path.join(COMP_CACHE, repo_id),
repo_type="dataset",
token=os.environ.get("HF_TOKEN"),
ignore_patterns="submission_logs/*",
)
STATUS_MAP = {0: "PENDING", 1: "QUEUED", 2: "PROCESSING", 3: "SUCCESS", 4: "FAILED"}
## Make directories to store computed results (parents are created as needed)
os.makedirs(Path("competition_cache") / "cached_results" / "by_team", exist_ok=True)
def load_teams(competition_space_path: Path) -> pd.DataFrame:
    """Load teams.json (keyed by team id) and transpose so each row is one team."""
    return pd.read_json(Path(competition_space_path) / "teams.json").T
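# json_to_dataframe flattens raw submission entries: every key containing "score"
# (e.g. "public_score") is replaced by a "*_time" column holding that score
# dict's "total_time" (-1 if absent). Illustrative example (values hypothetical):
#   {"submission_id": "abc", "status": 3, "public_score": {"total_time": 12.0}}
#   becomes the row {"submission_id": "abc", "status": 3, "public_time": 12.0}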
def json_to_dataframe(data, extra_column_name=None, extra_column_value=None):
flat_data = []
for entry in data:
original_flat_entry = {**entry}
        flat_entry = {k: v for k, v in original_flat_entry.items() if "score" not in k}
times = {
k.replace("score", "time"): v.get("total_time", -1) for k, v in original_flat_entry.items() if "score" in k
}
flat_entry.update(times)
if extra_column_name:
flat_entry[extra_column_name] = extra_column_value
flat_data.append(flat_entry)
df = pd.DataFrame(flat_data)
return df
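# load_submission_map returns (1) a dict mapping each submission_id to the member
# who submitted it and (2) one DataFrame of all submissions across teams, with the
# numeric status decoded into a readable "status_reason" column via STATUS_MAP.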
def load_submission_map(competition_space_path: Path) -> Tuple[Dict[str, str], pd.DataFrame]:
submission_info_dir = "submission_info"
submission_info_files = list((Path(competition_space_path) / submission_info_dir).glob("*.json"))
# Loop and collect submission IDs by team
team_submissions: Dict[str, str] = {}
submission_summaries: List[pd.DataFrame] = []
    for file in submission_info_files:
        with open(file, "r") as fn:
            json_data = json.load(fn)
        submission_summaries.append(
            json_to_dataframe(
                data=json_data["submissions"], extra_column_name="team_id", extra_column_value=json_data["id"]
            )
        )
        # Reuse the parsed json instead of re-reading the file through pandas.
        submission_list = json_data["submissions"]
        for submission in submission_list:
            team_submissions[submission["submission_id"]] = submission["submitted_by"]
submission_summary = pd.concat(submission_summaries, axis=0)
submission_summary["status_reason"] = submission_summary["status"].apply(lambda x: STATUS_MAP[x])
return team_submissions, submission_summary
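# Assumes each submitting member appears in exactly one team's "members" list;
# a member with no team makes the .values[0] lookup below raise an IndexError.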
def get_member_to_team_map(teams: pd.DataFrame, team_submissions: Dict[str, str]) -> Dict[str, str]:
member_map: Dict[str, str] = {}
for member_id in team_submissions.values():
member_map[member_id] = teams[teams.members.apply(lambda x: member_id in x)].id.values[0]
return member_map
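# Submission csvs are assumed to be named "{team_id}-{submission_id}.csv", where
# the team id is the first five dash-separated tokens (a UUID) and the remainder
# is the submission id.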
def load_submissions(competition_space_path: Path) -> Dict[str, Dict[str, pd.DataFrame]]:
submission_dir = "submissions"
submissions: Dict[str, Dict[str, pd.DataFrame]] = defaultdict(dict)
    for file in (Path(competition_space_path) / submission_dir).glob("*.csv"):
        file_name = file.stem
        team_id = "-".join(file_name.split("-")[:5])
        sub_id = "-".join(file_name.split("-")[5:])
submissions[team_id][sub_id] = pd.read_csv(file).set_index("id")
return submissions
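# Scores every submission of a single team against the solution. Each _metric
# result is assumed to be a dict keyed by "public_score" / "private_score" /
# "private_only_score" (see the consumers in __main__); every score dict is also
# tagged with the submission's "selected" flag from the summary file.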
def compute_metric_per_team(
solution_df: pd.DataFrame,
team_submissions: Dict[str, pd.DataFrame],
submission_summaries: pd.DataFrame,
score_split: str = "source",
) -> Dict[str, Any]:
results: Dict[str, Any] = {}
for submission_id, submission in team_submissions.items():
selected = (
submission_summaries.query(f'submission_id=="{submission_id}"')
.filter(["selected"])
.reset_index(drop=True)
.to_dict(orient="index")
.get(0, {"selected": "False"})
.get("selected", "False")
)
try:
results[submission_id] = _metric(
solution_df=solution_df,
submission_df=submission,
score_name=score_split,
                use_all=(score_split == "source"),
)
            for key in results[submission_id]:
                results[submission_id][key]["selected"] = selected
        except Exception as e:
            print("SKIPPING: ", submission_id, e)
return results
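# The prep_* helpers trim a score dict down to leaderboard columns: both drop the
# roc curve and class proportions; the public view additionally drops the
# "original_source" breakdown and the private view the "anon_source" breakdown.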
def prep_public(public_results: Dict[str, Any]) -> Dict[str, Any]:
new: Dict[str, Any] = {}
for key, value in public_results.items():
if key in ["proportion", "roc", "original_source"]:
continue
new[key] = value
return new
def prep_private(private_results: Dict[str, Any]) -> Dict[str, Any]:
new: Dict[str, Any] = {}
for key, value in private_results.items():
if key in ["proportion", "roc", "anon_source"]:
continue
new[key] = value
return new
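# Keeps only what is needed to plot a ROC curve: the scalar "auc" plus the arrays
# nested under "roc" (tpr/fpr/threshold, per the explode(...) call in __main__),
# hoisted to the top level.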
def extract_roc(results: Dict[str, Any]) -> Dict[str, Any]:
new: Dict[str, Any] = {}
    for key, value in results.items():
        if key == "roc":
            new.update(value)
        elif key == "auc":
            new[key] = value
return new
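# Injects a hand-made submission under a synthetic "insiders" team so external
# score files can flow through the same pipeline. The input file is parsed as csv
# (even with a .txt extension) and is assumed to have "ID" and "Score" columns;
# scores at or above the threshold are labelled "generated", the rest "real".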
def add_custom_submission(path_to_cache, path_to_subfile, threshold=0):
data = pd.read_csv(path_to_subfile)
data["id"] = data["ID"]
data["score"] = data["Score"]
data["pred"] = data["score"].apply(lambda a: "generated" if a >= threshold else "real")
team_id = "insiders-id-1-2-3"
team_name = "insiders"
submission_id = f"sub{threshold}".replace(".", "")
## update teams
    with open(path_to_cache + "/teams.json") as f:
        teams = json.load(f)
teams[team_id] = {"id": team_id, "name": team_name, "members": ["na"], "leader": "na"}
with open(path_to_cache + "/teams.json", "w") as f:
json.dump(teams, f, indent=4)
## create submission
submission_info_file = path_to_cache + f"/submission_info/{team_id}.json"
if os.path.exists(submission_info_file):
        with open(submission_info_file) as f:
            temp = json.load(f)
else:
temp = {"id": team_id, "submissions": []}
temp["submissions"].append(
{
"datetime": "2025-09-22 14:42:14",
"submission_id": submission_id,
"submission_comment": "",
"submission_repo": "",
"space_id": "",
"submitted_by": "na",
"status": 3,
"selected": True,
"public_score": {},
"private_score": {},
}
)
with open(submission_info_file, "w") as f:
json.dump(temp, f)
data.loc[:, ["id", "pred", "score"]].to_csv(
path_to_cache + f"/submissions/{team_id}-{submission_id}.csv", index=False
)
def create_custom_subs():
import numpy as np
for threshold in np.linspace(-6, 0, 10):
add_custom_submission(
path_to_cache="competition_cache/safe-challenge/video-challenge-task-1-config",
path_to_subfile="competition_cache/custom/Scores-DSRI-brian.txt",
threshold=threshold,
)
def save_by_team(df: pd.DataFrame, save_path_base: str) -> None:
df = df.copy()
for team in df["team"].unique():
os.makedirs(f"competition_cache/cached_results/by_team/{team}", exist_ok=True)
df_ = df[df["team"] == team].copy()
df_.to_csv(
f"competition_cache/cached_results/by_team/{team}/{save_path_base}",
index=False,
)
if __name__ == "__main__":
## Download data
spaces: List[str] = [
"safe-challenge/video-challenge-pilot-config",
"safe-challenge/video-challenge-task-1-config",
"safe-challenge/video-challenge-task-2-config",
]
download_competition_data(competition_names=spaces)
if os.environ.get("MAKE_CUSTOM"):
print("adding custom subs")
create_custom_subs()
    ## Loop over competition spaces
for space in spaces:
local_dir = Path("competition_cache") / space
## Load relevant data
teams = load_teams(competition_space_path=local_dir)
team_submissions, submission_summaries = load_submission_map(competition_space_path=local_dir)
member_map = get_member_to_team_map(teams=teams, team_submissions=team_submissions)
submissions = load_submissions(competition_space_path=local_dir)
## Load solutions
solutions_df = pd.read_csv(local_dir / "solution.csv").set_index("id")
## Map if applicable
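        ## map.json, when present, is expected to look like
        ## {"column_name": {"old_value": "new_value", ...}, ...} and is applied
        ## column by column to the solution before scoring.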
try:
with open(local_dir / "map.json", "r") as fn:
space_map = json.load(fn)
for df_col, df_map in space_map.items():
solutions_df[df_col] = solutions_df[df_col].map(df_map)
        except FileNotFoundError:
            print("NO MAP FOUND.")
## Update categories
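        ## The goal here is to anonymize categories: strip the "real_"/"generated_"
        ## prefixes, merge the two camera categories, and map each remaining base
        ## category to a stable code ("c_00", "c_01", ...), keeping the original
        ## values in "category_og".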
prep_categories = False
try:
            categories = {}
            for category in solutions_df["category"].unique():
                base = category.replace("real_", "").replace("generated_", "")
                if base not in categories:
                    categories[base] = f"c_{len(categories):02d}"
solutions_df.loc[solutions_df["category"] == "real_camera", "category"] = "camera"
solutions_df.loc[solutions_df["category"] == "generated_camera", "category"] = "camera"
solutions_df["category_og"] = solutions_df["category"].copy()
solutions_df["category"] = solutions_df["category_og"].map(categories)
prep_categories = True
        except Exception:
            print("CATEGORIES NOT UPDATED.")
solutions_df.to_csv(local_dir / "solution-processed.csv", index=False)
## Loop over sources and categories
if prep_categories:
scores = ["source", "category"]
else:
scores = ["source"]
for score_name in scores:
## Loop and save by team
public, private, private_only, rocs = [], [], [], []
for team_id, submission_set_ids in submission_summaries.query("status_reason=='SUCCESS'").groupby(
"team_id"
)["submission_id"]:
                ### check that the submission csvs on disk match the summary file
                submission_set = submissions[team_id]
                submission_set_ids_from_csvs = set(submission_set.keys())
                submission_set_ids = set(submission_set_ids)
                missing = submission_set_ids - submission_set_ids_from_csvs
                if missing:
                    print(f"not all submission csv files found for {team_id}, missing {len(missing)}")
                extra = submission_set_ids_from_csvs - submission_set_ids
                if extra:
                    print(f"{len(extra)} more submissions in csvs than in the summary file for team {team_id}")
                    print(f"dropping {extra}")
                    for submission_id in extra:
                        submission_set.pop(submission_id)
results = compute_metric_per_team(
solution_df=solutions_df,
team_submissions=submission_set,
submission_summaries=submission_summaries.query(f'team_id=="{team_id}"'),
score_split=score_name,
)
public_results = {
key: prep_public(value["public_score"]) for key, value in results.items() if key in team_submissions
}
private_results = {
key: prep_private(value["private_score"])
for key, value in results.items()
if key in team_submissions
}
private_only_results = {
key: prep_private(value["private_only_score"])
for key, value in results.items()
if key in team_submissions
}
## Add timing
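                ## Times come from the summary file; private_time is treated as
                ## cumulative, so the private-only time is private minus public.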
public_times = {
x["submission_id"]: x["public_time"]
for x in submission_summaries[submission_summaries["submission_id"].isin(results.keys())][
["submission_id", "public_time"]
].to_dict(orient="records")
}
private_times = {
x["submission_id"]: x["private_time"]
for x in submission_summaries[submission_summaries["submission_id"].isin(results.keys())][
["submission_id", "private_time"]
].to_dict(orient="records")
}
private_only_times = {
x["submission_id"]: x["private_time"] - x["public_time"]
for x in submission_summaries[submission_summaries["submission_id"].isin(results.keys())][
["submission_id", "private_time", "public_time"]
].to_dict(orient="records")
}
for key in public_results.keys():
public_results[key]["total_time"] = public_times[key]
for key in private_results.keys():
private_results[key]["total_time"] = private_times[key]
for key in private_only_results.keys():
private_only_results[key]["total_time"] = private_only_times[key]
## Roc computations
roc_results = {
key: extract_roc(value["private_score"])
for key, value in results.items()
if key in team_submissions
}
roc_df = pd.json_normalize(roc_results.values())
if len(roc_df) != 0:
                    roc_df.insert(loc=0, column="submission_id", value=list(roc_results.keys()))
roc_df.insert(
loc=0,
column="team",
value=[
teams[teams.id == member_map[team_submissions[submission_id]]].name.values[0]
for submission_id in roc_results.keys()
],
)
                    # Look these up by submission_id (matching the public/private
                    # blocks below) so each row carries its own repo and datetime
                    # rather than the team's first submission.
                    roc_df.insert(
                        loc=0,
                        column="submission_repo",
                        value=[
                            submission_summaries[
                                submission_summaries.submission_id == submission_id
                            ].submission_repo.values[0]
                            for submission_id in roc_results.keys()
                        ],
                    )
                    roc_df.insert(
                        loc=0,
                        column="datetime",
                        value=[
                            submission_summaries[
                                submission_summaries.submission_id == submission_id
                            ].datetime.values[0]
                            for submission_id in roc_results.keys()
                        ],
                    )
roc_df["label"] = roc_df.apply(
lambda x: f"AUC: {round(x['auc'], 2)} - {x['team']} - {x['submission_repo']}", axis=1
)
rocs.append(roc_df)
## Append results to save in cache
public_df = pd.json_normalize(public_results.values())
public_df.insert(
loc=0,
column="submission_id",
value=list(public_results.keys()),
)
public_df.insert(
loc=0,
column="team",
value=[
teams[teams.id == member_map[team_submissions[submission_id]]].name.values[0]
for submission_id in public_results.keys()
],
)
public_df.insert(
loc=0,
column="team_id",
value=[
teams[teams.id == member_map[team_submissions[submission_id]]].id.values[0]
for submission_id in public_results.keys()
],
)
public_df.insert(
loc=0,
column="datetime",
value=[
submission_summaries[submission_summaries.submission_id == submission_id].datetime.values[0]
for submission_id in public_results.keys()
],
)
public.append(public_df)
## Private results
private_df = pd.json_normalize(private_results.values())
private_df.insert(
loc=0,
column="submission_id",
value=list(private_results.keys()),
)
private_df.insert(
loc=0,
column="team",
value=[
teams[teams.id == member_map[team_submissions[submission_id]]].name.values[0]
for submission_id in private_results.keys()
],
)
private_df.insert(
loc=0,
column="team_id",
value=[
teams[teams.id == member_map[team_submissions[submission_id]]].id.values[0]
for submission_id in private_results.keys()
],
)
private_df.insert(
loc=0,
column="datetime",
value=[
submission_summaries[submission_summaries.submission_id == submission_id].datetime.values[0]
for submission_id in private_results.keys()
],
)
private.append(private_df)
## Private ONLY results
private_only_df = pd.json_normalize(private_only_results.values())
private_only_df.insert(
loc=0,
column="submission_id",
value=list(private_only_results.keys()),
)
private_only_df.insert(
loc=0,
column="team",
value=[
teams[teams.id == member_map[team_submissions[submission_id]]].name.values[0]
for submission_id in private_only_results.keys()
],
)
private_only_df.insert(
loc=0,
column="team_id",
value=[
teams[teams.id == member_map[team_submissions[submission_id]]].id.values[0]
for submission_id in private_only_results.keys()
],
)
private_only_df.insert(
loc=0,
column="datetime",
value=[
submission_summaries[submission_summaries.submission_id == submission_id].datetime.values[0]
for submission_id in private_only_results.keys()
],
)
private_only.append(private_only_df)
## Save as csvs
public = pd.concat(public, axis=0, ignore_index=True).sort_values(by="balanced_accuracy", ascending=False)
private = pd.concat(private, axis=0, ignore_index=True).sort_values(by="balanced_accuracy", ascending=False)
private_only = pd.concat(private_only, axis=0, ignore_index=True).sort_values(
by="balanced_accuracy", ascending=False
)
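            ## Each roc row stores tpr/fpr/threshold as arrays; explode unrolls them
            ## into one row per threshold point so the csv can be plotted directly.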
rocs = pd.concat(rocs, axis=0, ignore_index=True).explode(["tpr", "fpr", "threshold"], ignore_index=True)
            public.to_csv(
                Path("competition_cache") / "cached_results" / f"{local_dir.name}_{score_name}_public_score.csv",
                index=False,
            )
            private.to_csv(
                Path("competition_cache") / "cached_results" / f"{local_dir.name}_{score_name}_private_score.csv",
                index=False,
            )
            private_only.to_csv(
                Path("competition_cache")
                / "cached_results"
                / f"{local_dir.name}_{score_name}_private_only_score.csv",
                index=False,
            )
            save_by_team(df=public, save_path_base=f"{local_dir.name}_{score_name}_public.csv")
            save_by_team(df=private, save_path_base=f"{local_dir.name}_{score_name}_private.csv")
            rocs.to_csv(
                Path("competition_cache") / "cached_results" / f"{local_dir.name}_{score_name}_rocs.csv",
                index=False,
            )
            submission_summaries["team"] = submission_summaries["team_id"].apply(lambda a: teams.loc[a, "name"])
            submission_summaries.to_csv(
                Path("competition_cache") / "cached_results" / f"{local_dir.name}_{score_name}_submissions.csv",
                index=False,
            )
## Update time
# Get the current time in EST
est_timezone = pytz.timezone("US/Eastern")
current_time_est = datetime.datetime.now(est_timezone)
# Format the time as desired
formatted_time = current_time_est.strftime("%Y-%m-%d %H:%M:%S %Z")
formatted = f"Updated on {formatted_time}"
with open("competition_cache/updated.txt", "w") as file:
file.write(formatted)