"""Rebuild competition leaderboards from Hugging Face competition spaces.

Downloads each competition space dataset into a local cache, recomputes the
metric for every successful submission, and writes leaderboard / ROC / timing
CSVs under ``competition_cache/cached_results``.
"""

import json
import os
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Tuple

import pandas as pd
import tqdm.auto as tqdm
from huggingface_hub import snapshot_download

from metric import _metric

# Hub transfer/timeout knobs must be set before any download call is made.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "20"

# NOTE(review): COMP_CACHE is honored for downloads only; the rest of the
# script hard-codes "competition_cache" — confirm before overriding the env var.
COMP_CACHE = os.environ.get("COMP_CACHE", "./competition_cache")

# Numeric submission status codes as stored in the submission_info JSON files.
STATUS_MAP = {0: "PENDING", 1: "QUEUED", 2: "PROCESSING", 3: "SUCCESS", 4: "FAILED"}

## Make a directory to store computed results
os.makedirs(Path("competition_cache") / "cached_results", exist_ok=True)
os.makedirs(Path("competition_cache") / "cached_results" / "by_team", exist_ok=True)


def download_competition_data(competition_names: List[str]) -> None:
    """Download copies to local environment.

    Snapshots each competition dataset repo into ``COMP_CACHE/<repo_id>``,
    skipping the (large) submission_logs directory.
    """
    for repo_id in tqdm.tqdm(competition_names):
        snapshot_download(
            repo_id=repo_id,
            local_dir=os.path.join(COMP_CACHE, repo_id),
            repo_type="dataset",
            token=os.environ.get("HF_TOKEN"),
            ignore_patterns="submission_logs/*",
        )


def load_teams(competition_space_path: Path) -> pd.DataFrame:
    """Load ``teams.json`` as a DataFrame with one row per team."""
    team_file_name = "teams.json"
    return pd.read_json(Path(competition_space_path) / team_file_name).T


def json_to_dataframe(data, extra_column_name=None, extra_column_value=None):
    """Flatten submission-info entries into a DataFrame.

    Keys containing "score" are assumed to hold dicts with a "total_time"
    entry; each such key is replaced by a ``*_time`` column (-1 when the
    timing is missing).  Optionally tags every row with
    ``extra_column_name=extra_column_value``.
    """
    flat_data = []
    for entry in data:
        original_flat_entry = {**entry}
        flat_entry = {k: v for k, v in original_flat_entry.items() if "score" not in k}
        times = {
            k.replace("score", "time"): v.get("total_time", -1)
            for k, v in original_flat_entry.items()
            if "score" in k
        }
        flat_entry.update(times)
        if extra_column_name:
            flat_entry[extra_column_name] = extra_column_value
        flat_data.append(flat_entry)
    return pd.DataFrame(flat_data)


def load_submission_map(competition_space_path: Path) -> Tuple[Dict[str, str], pd.DataFrame]:
    """Collect submission ownership and a flat per-submission summary.

    Returns:
        team_submissions: submission_id -> member id that submitted it.
        submission_summary: one row per submission across all teams, with a
            human-readable ``status_reason`` column derived from STATUS_MAP.
    """
    submission_info_dir = "submission_info"
    submission_info_files = list((Path(competition_space_path) / submission_info_dir).glob("*.json"))
    # Loop and collect submission IDs by team
    team_submissions: Dict[str, str] = {}
    submission_summaries: List[pd.DataFrame] = []
    for file in submission_info_files:
        with open(file, "r") as fn:
            json_data = json.load(fn)
        submission_summaries.append(
            json_to_dataframe(
                data=json_data["submissions"],
                extra_column_name="team_id",
                extra_column_value=json_data["id"],
            )
        )
        # Reuse the already-parsed JSON instead of re-reading the file.
        for submission in json_data["submissions"]:
            team_submissions[submission["submission_id"]] = submission["submitted_by"]
    submission_summary = pd.concat(submission_summaries, axis=0)
    submission_summary["status_reason"] = submission_summary["status"].apply(lambda x: STATUS_MAP[x])
    return team_submissions, submission_summary


def get_member_to_team_map(teams: pd.DataFrame, team_submissions: Dict[str, str]) -> Dict[str, str]:
    """Map each submitting member id to the id of the team they belong to."""
    member_map: Dict[str, str] = {}
    for member_id in team_submissions.values():
        member_map[member_id] = teams[teams.members.apply(lambda x: member_id in x)].id.values[0]
    return member_map


def load_submissions(competition_space_path: Path) -> Dict[str, Dict[str, pd.DataFrame]]:
    """Load submission CSVs, keyed by team id then submission id.

    File names look like ``{team_id}-{submission_id}.csv`` where the team id
    itself is made of exactly five dash-separated tokens.
    """
    submission_dir = "submissions"
    submissions: Dict[str, Dict[str, pd.DataFrame]] = defaultdict(dict)
    for file in (Path(competition_space_path) / submission_dir).glob("*.csv"):
        tokens = file.name.split(".")[0].split("-")
        team_id = "-".join(tokens[:5])
        sub_id = "-".join(tokens[5:])
        submissions[team_id][sub_id] = pd.read_csv(file).set_index("id")
    return submissions


def compute_metric_per_team(
    solution_df: pd.DataFrame,
    team_submissions: Dict[str, pd.DataFrame],
    submission_summaries: pd.DataFrame,
    score_split: str = "source",
) -> Dict[str, Any]:
    """Score every submission of one team against the solution.

    Each per-split result dict is annotated with the submission's "selected"
    flag from the summary table (defaults to "False" when the row is missing).
    Submissions that fail to score are skipped with a console note rather
    than aborting the whole team.
    """
    results: Dict[str, Any] = {}
    for submission_id, submission in team_submissions.items():
        selected = (
            submission_summaries.query(f'submission_id=="{submission_id}"')
            .filter(["selected"])
            .reset_index(drop=True)
            .to_dict(orient="index")
            .get(0, {"selected": "False"})
            .get("selected", "False")
        )
        try:
            results[submission_id] = _metric(
                solution_df=solution_df,
                submission_df=submission,
                score_name=score_split,
                use_all=score_split == "source",
            )
            for key in (current_results := results[submission_id]):
                current_results[key]["selected"] = selected
        except Exception as e:
            # Best-effort: log and continue; a malformed submission must not
            # take down the rest of the leaderboard computation.
            print("SKIPPING: ", submission_id, e)
    return results


def prep_public(public_results: Dict[str, Any]) -> Dict[str, Any]:
    """Drop keys not wanted in the public leaderboard table."""
    return {k: v for k, v in public_results.items() if k not in ["proportion", "roc", "original_source"]}


def prep_private(private_results: Dict[str, Any]) -> Dict[str, Any]:
    """Drop keys not wanted in the private leaderboard table."""
    return {k: v for k, v in private_results.items() if k not in ["proportion", "roc", "anon_source"]}


def extract_roc(results: Dict[str, Any]) -> Dict[str, Any]:
    """Pull the flattened ROC curve arrays and the AUC out of a result dict."""
    new: Dict[str, Any] = {}
    for key, value in results.items():
        if key == "roc":
            # Lift the curve components (e.g. tpr/fpr/threshold) to top level.
            for sub_key, sub_value in value.items():
                new[sub_key] = sub_value
        elif key == "auc":
            new[key] = value
    return new


def add_custom_submission(path_to_cache, path_to_subfile, threshold=0):
    """Inject a hand-made 'insiders' submission into a cached competition space.

    Reads a score file with ``ID``/``Score`` columns, thresholds it into
    real/generated predictions, registers the synthetic team and submission
    in teams.json / submission_info, and writes the submission CSV.
    """
    data = pd.read_csv(path_to_subfile)
    data["id"] = data["ID"]
    data["score"] = data["Score"]
    data["pred"] = data["score"].apply(lambda a: "generated" if a >= threshold else "real")
    team_id = "insiders-id-1-2-3"
    team_name = "insiders"
    submission_id = f"sub{threshold}".replace(".", "")
    ## update teams
    with open(path_to_cache + "/teams.json") as f:
        teams = json.load(f)
    teams[team_id] = {"id": team_id, "name": team_name, "members": ["na"], "leader": "na"}
    with open(path_to_cache + "/teams.json", "w") as f:
        json.dump(teams, f, indent=4)
    ## create submission
    submission_info_file = path_to_cache + f"/submission_info/{team_id}.json"
    if os.path.exists(submission_info_file):
        with open(submission_info_file) as f:
            temp = json.load(f)
    else:
        temp = {"id": team_id, "submissions": []}
    temp["submissions"].append(
        {
            "datetime": "2025-09-22 14:42:14",
            "submission_id": submission_id,
            "submission_comment": "",
            "submission_repo": "",
            "space_id": "",
            "submitted_by": "na",
            "status": 3,
            "selected": True,
            "public_score": {},
            "private_score": {},
        }
    )
    with open(submission_info_file, "w") as f:
        json.dump(temp, f)
    data.loc[:, ["id", "pred", "score"]].to_csv(
        path_to_cache + f"/submissions/{team_id}-{submission_id}.csv", index=False
    )


def create_custom_subs():
    """Add a sweep of thresholded custom submissions to the task-1 space."""
    import numpy as np

    for threshold in np.linspace(-6, 0, 10):
        add_custom_submission(
            path_to_cache="competition_cache/safe-challenge/video-challenge-task-1-config",
            path_to_subfile="competition_cache/custom/Scores-DSRI-brian.txt",
            threshold=threshold,
        )


def save_by_team(df: pd.DataFrame, save_path_base: str) -> None:
    """Write each team's rows to cached_results/by_team/{team}/{save_path_base}."""
    df = df.copy()
    for team in df["team"].unique():
        os.makedirs(f"competition_cache/cached_results/by_team/{team}", exist_ok=True)
        df_ = df[df["team"] == team].copy()
        df_.to_csv(
            f"competition_cache/cached_results/by_team/{team}/{save_path_base}",
            index=False,
        )


if __name__ == "__main__":
    ## Download data
    spaces: List[str] = [
        "safe-challenge/video-challenge-pilot-config",
        "safe-challenge/video-challenge-task-1-config",
        "safe-challenge/video-challenge-task-2-config",
    ]
    download_competition_data(competition_names=spaces)

    if os.environ.get("MAKE_CUSTOM"):
        print("adding custom subs")
        create_custom_subs()

    ## Loop
    for space in spaces:
        local_dir = Path("competition_cache") / space

        ## Load relevant data
        teams = load_teams(competition_space_path=local_dir)
        team_submissions, submission_summaries = load_submission_map(competition_space_path=local_dir)
        member_map = get_member_to_team_map(teams=teams, team_submissions=team_submissions)
        submissions = load_submissions(competition_space_path=local_dir)

        ## Load solutions
        solutions_df = pd.read_csv(local_dir / "solution.csv").set_index("id")

        ## Map if applicable (map.json remaps solution column values)
        try:
            with open(local_dir / "map.json", "r") as fn:
                space_map = json.load(fn)
            for df_col, df_map in space_map.items():
                solutions_df[df_col] = solutions_df[df_col].map(df_map)
        except Exception:
            print("NO MAP FOUND.")

        ## Update categories: anonymize category names to c_00, c_01, ...
        ## after merging real_/generated_ camera into a single bucket.
        prep_categories = False
        try:
            categories = {}
            for category in solutions_df["category"].unique():
                if category.replace("real_", "").replace("generated_", "") not in categories:
                    categories[category.replace("real_", "").replace("generated_", "")] = f"c_{len(categories):02d}"
            solutions_df.loc[solutions_df["category"] == "real_camera", "category"] = "camera"
            solutions_df.loc[solutions_df["category"] == "generated_camera", "category"] = "camera"
            solutions_df["category_og"] = solutions_df["category"].copy()
            solutions_df["category"] = solutions_df["category_og"].map(categories)
            prep_categories = True
        except Exception:
            print("CATEGORIES NOT UPDATED.")
        solutions_df.to_csv(local_dir / "solution-processed.csv", index=False)

        ## Loop over sources and categories
        if prep_categories:
            scores = ["source", "category"]
        else:
            scores = ["source"]

        for score_name in scores:
            ## Loop and save by team
            public, private, private_only, rocs = [], [], [], []
            for team_id, submission_set_ids in submission_summaries.query("status_reason=='SUCCESS'").groupby(
                "team_id"
            )["submission_id"]:
                ### lets check if we have the solution csvs
                submission_set = submissions[team_id]
                submission_set_ids_from_csvs = set(submission_set.keys())
                submission_set_ids = set(submission_set_ids)
                union = submission_set_ids | submission_set_ids_from_csvs
                if not submission_set_ids.issubset(submission_set_ids_from_csvs):
                    missing = union - submission_set_ids_from_csvs
                    print(f"not all submission csv files found for {team_id}, missing {len(missing)}")
                if submission_set_ids != submission_set_ids_from_csvs:
                    extra = union - submission_set_ids
                    print(f"extra {len(extra)} submissions in csvs than in summary file for team {team_id}")
                    print(f"dropping {extra}")
                    for submission_id in extra:
                        submission_set.pop(submission_id)

                results = compute_metric_per_team(
                    solution_df=solutions_df,
                    team_submissions=submission_set,
                    submission_summaries=submission_summaries.query(f'team_id=="{team_id}"'),
                    score_split=score_name,
                )
                public_results = {
                    key: prep_public(value["public_score"])
                    for key, value in results.items()
                    if key in team_submissions
                }
                private_results = {
                    key: prep_private(value["private_score"])
                    for key, value in results.items()
                    if key in team_submissions
                }
                private_only_results = {
                    key: prep_private(value["private_only_score"])
                    for key, value in results.items()
                    if key in team_submissions
                }

                ## Add timing
                public_times = {
                    x["submission_id"]: x["public_time"]
                    for x in submission_summaries[submission_summaries["submission_id"].isin(results.keys())][
                        ["submission_id", "public_time"]
                    ].to_dict(orient="records")
                }
                private_times = {
                    x["submission_id"]: x["private_time"]
                    for x in submission_summaries[submission_summaries["submission_id"].isin(results.keys())][
                        ["submission_id", "private_time"]
                    ].to_dict(orient="records")
                }
                private_only_times = {
                    x["submission_id"]: x["private_time"] - x["public_time"]
                    for x in submission_summaries[submission_summaries["submission_id"].isin(results.keys())][
                        ["submission_id", "private_time", "public_time"]
                    ].to_dict(orient="records")
                }
                for key in public_results:
                    public_results[key]["total_time"] = public_times[key]
                for key in private_results:
                    private_results[key]["total_time"] = private_times[key]
                for key in private_only_results:
                    private_only_results[key]["total_time"] = private_only_times[key]

                ## Roc computations
                roc_results = {
                    key: extract_roc(value["private_score"])
                    for key, value in results.items()
                    if key in team_submissions
                }
                roc_df = pd.json_normalize(roc_results.values())
                if len(roc_df) != 0:
                    roc_df.insert(loc=0, column="submission_id", value=roc_results.keys())
                    roc_df.insert(
                        loc=0,
                        column="team",
                        value=[
                            teams[teams.id == member_map[team_submissions[submission_id]]].name.values[0]
                            for submission_id in roc_results.keys()
                        ],
                    )
                    # NOTE(review): these two lookups filter by team_id, so every
                    # submission gets the team's FIRST repo/datetime — unlike the
                    # per-submission lookups used for the leaderboard frames below.
                    # Confirm this is intentional.
                    roc_df.insert(
                        loc=0,
                        column="submission_repo",
                        value=[
                            submission_summaries[
                                submission_summaries.team_id == member_map[team_submissions[submission_id]]
                            ].submission_repo.values[0]
                            for submission_id in roc_results.keys()
                        ],
                    )
                    roc_df.insert(
                        loc=0,
                        column="datetime",
                        value=[
                            submission_summaries[
                                submission_summaries.team_id == member_map[team_submissions[submission_id]]
                            ].datetime.values[0]
                            for submission_id in roc_results.keys()
                        ],
                    )
                    roc_df["label"] = roc_df.apply(
                        lambda x: f"AUC: {round(x['auc'], 2)} - {x['team']} - {x['submission_repo']}", axis=1
                    )
                    rocs.append(roc_df)

                ## Append results to save in cache
                public_df = pd.json_normalize(public_results.values())
                public_df.insert(
                    loc=0,
                    column="submission_id",
                    value=list(public_results.keys()),
                )
                public_df.insert(
                    loc=0,
                    column="team",
                    value=[
                        teams[teams.id == member_map[team_submissions[submission_id]]].name.values[0]
                        for submission_id in public_results.keys()
                    ],
                )
                public_df.insert(
                    loc=0,
                    column="team_id",
                    value=[
                        teams[teams.id == member_map[team_submissions[submission_id]]].id.values[0]
                        for submission_id in public_results.keys()
                    ],
                )
                public_df.insert(
                    loc=0,
                    column="datetime",
                    value=[
                        submission_summaries[submission_summaries.submission_id == submission_id].datetime.values[0]
                        for submission_id in public_results.keys()
                    ],
                )
                public.append(public_df)

                ## Private results
                private_df = pd.json_normalize(private_results.values())
                private_df.insert(
                    loc=0,
                    column="submission_id",
                    value=list(private_results.keys()),
                )
                private_df.insert(
                    loc=0,
                    column="team",
                    value=[
                        teams[teams.id == member_map[team_submissions[submission_id]]].name.values[0]
                        for submission_id in private_results.keys()
                    ],
                )
                private_df.insert(
                    loc=0,
                    column="team_id",
                    value=[
                        teams[teams.id == member_map[team_submissions[submission_id]]].id.values[0]
                        for submission_id in private_results.keys()
                    ],
                )
                private_df.insert(
                    loc=0,
                    column="datetime",
                    value=[
                        submission_summaries[submission_summaries.submission_id == submission_id].datetime.values[0]
                        for submission_id in private_results.keys()
                    ],
                )
                private.append(private_df)

                ## Private ONLY results
                private_only_df = pd.json_normalize(private_only_results.values())
                private_only_df.insert(
                    loc=0,
                    column="submission_id",
                    value=list(private_only_results.keys()),
                )
                private_only_df.insert(
                    loc=0,
                    column="team",
                    value=[
                        teams[teams.id == member_map[team_submissions[submission_id]]].name.values[0]
                        for submission_id in private_only_results.keys()
                    ],
                )
                private_only_df.insert(
                    loc=0,
                    column="team_id",
                    value=[
                        teams[teams.id == member_map[team_submissions[submission_id]]].id.values[0]
                        for submission_id in private_only_results.keys()
                    ],
                )
                private_only_df.insert(
                    loc=0,
                    column="datetime",
                    value=[
                        submission_summaries[submission_summaries.submission_id == submission_id].datetime.values[0]
                        for submission_id in private_only_results.keys()
                    ],
                )
                private_only.append(private_only_df)

            ## Save as csvs
            public = pd.concat(public, axis=0, ignore_index=True).sort_values(by="balanced_accuracy", ascending=False)
            private = pd.concat(private, axis=0, ignore_index=True).sort_values(by="balanced_accuracy", ascending=False)
            private_only = pd.concat(private_only, axis=0, ignore_index=True).sort_values(
                by="balanced_accuracy", ascending=False
            )
            rocs = pd.concat(rocs, axis=0, ignore_index=True).explode(["tpr", "fpr", "threshold"], ignore_index=True)
            public.to_csv(
                Path("competition_cache")
                / "cached_results"
                / f"{str(local_dir).split('/')[-1]}_{score_name}_public_score.csv",
                index=False,
            )
            private.to_csv(
                Path("competition_cache")
                / "cached_results"
                / f"{str(local_dir).split('/')[-1]}_{score_name}_private_score.csv",
                index=False,
            )
            private_only.to_csv(
                Path("competition_cache")
                / "cached_results"
                / f"{str(local_dir).split('/')[-1]}_{score_name}_private_only_score.csv",
                index=False,
            )
            save_by_team(df=public, save_path_base=f"{str(local_dir).split('/')[-1]}_{score_name}_public.csv")
            save_by_team(df=private, save_path_base=f"{str(local_dir).split('/')[-1]}_{score_name}_private.csv")
            rocs.to_csv(
                Path("competition_cache") / "cached_results" / f"{str(local_dir).split('/')[-1]}_{score_name}_rocs.csv",
                index=False,
            )
            submission_summaries["team"] = submission_summaries["team_id"].apply(lambda a: teams.loc[a, "name"])
            submission_summaries.to_csv(
                Path("competition_cache")
                / "cached_results"
                / f"{str(local_dir).split('/')[-1]}_{score_name}_submissions.csv",
                index=False,
            )

    ## Update time stamp file (uses the top-level `datetime` import; the
    ## original re-imported the datetime module here, shadowing it).
    import pytz

    # Get the current time in EST
    est_timezone = pytz.timezone("US/Eastern")
    current_time_est = datetime.now(est_timezone)
    # Format the time as desired
    formatted_time = current_time_est.strftime("%Y-%m-%d %H:%M:%S %Z")
    formatted = f"Updated on {formatted_time}"
    with open("competition_cache/updated.txt", "w") as file:
        file.write(formatted)