import streamlit as st import pandas as pd import json import os import posixpath from huggingface_hub import hf_hub_download from huggingface_hub import list_repo_files import io import zipfile import shutil import tempfile, uuid # Replace this with your actual Hugging Face repo ID REPO_ID = "PortPy-Project/PortPy_Dataset" # Load from private repo using token token = os.getenv("HF_TOKEN") @st.cache_data def get_patient_ids(): # Extract disease site from patient ID prefix (e.g., Lung_Patient_1) file = hf_hub_download(REPO_ID, repo_type="dataset", filename="data_info.jsonl", token=token) with open(file) as f: # data_info = json.load(f) data_info = [json.loads(line) for line in f] patient_ids = [pat['patient_id'] for pat in data_info] df = pd.DataFrame(patient_ids, columns=["patient_id"]) df["disease_site"] = df["patient_id"].str.extract(r"^(.*?)_") return df @st.cache_data def _list_all_repo_files(): return list_repo_files(repo_id=REPO_ID, repo_type="dataset") @st.cache_data def load_all_metadata(disease_site): # Get the list of patient IDs for the selected disease site patient_df = get_patient_ids() filtered_patients = patient_df[patient_df["disease_site"] == disease_site] metadata = {} for patient_id in filtered_patients["patient_id"]: # TODO: limit for testing # Load structure metadata for the patient structs = load_structure_metadata(patient_id) # Load beam metadata for the patient beams = load_beam_metadata(patient_id) planner_file = hf_hub_download(REPO_ID, repo_type="dataset", filename=f"data/{patient_id}/PlannerBeams.json", token=token) with open(planner_file) as f: planner_data = json.load(f) planner_beam_ids = planner_data.get("IDs", []) metadata[patient_id] = { "structures": structs, "beams": beams, "planner_beam_ids": planner_beam_ids } return metadata @st.cache_data def load_structure_metadata(patient_id): file = hf_hub_download(REPO_ID, repo_type="dataset", filename=f"data/{patient_id}/StructureSet_MetaData.json", token=token) with open(file) as f: return json.load(f) @st.cache_data def load_beam_metadata(patient_id): files = _list_all_repo_files() beam_meta_paths = [ f for f in files if f.startswith(f"data/{patient_id}/Beams/Beam_") and f.endswith("_MetaData.json") ] beam_meta = [] for path in beam_meta_paths: file = hf_hub_download(REPO_ID, repo_type="dataset", filename=path, token=token) # no local_dir with open(file) as f: beam_meta.append(json.load(f)) return beam_meta def get_patient_summary_from_cached_data(patient_id, all_metadata): structs = all_metadata[patient_id]["structures"] beams = all_metadata[patient_id]["beams"] ptv_vol = None for s in structs: if "PTV" in s["name"].upper(): ptv_vol = s.get("volume_cc") break return { "ptv_volume": ptv_vol, "num_beams": len(beams), "beams": beams } def filter_matched_data(filtered_patients, query_ptv_vol, beam_gantry_filter, beam_collimator_filter, beam_energy_filter, beam_couch_filter, only_planner, all_metadata): matched = [] gantry_angles = set(map(int, beam_gantry_filter.split(","))) if beam_gantry_filter else None collimator_angles = set(map(int, beam_collimator_filter.split(","))) if beam_collimator_filter else None couch_angles = set(map(int, beam_couch_filter.split(","))) if beam_couch_filter else None energies = set(beam_energy_filter.replace(" ", "").split(",")) if beam_energy_filter else None for pid in filtered_patients["patient_id"]: # Retrieve metadata for the patient from the pre-cached all_metadata summary = get_patient_summary_from_cached_data(pid, all_metadata) if summary["ptv_volume"] is None or summary["ptv_volume"] < query_ptv_vol: continue # Filter beams by all conditions selected_beams = summary["beams"] if gantry_angles: selected_beams = [b for b in selected_beams if b["gantry_angle"] in gantry_angles] if collimator_angles: selected_beams = [b for b in selected_beams if b["collimator_angle"] in collimator_angles] if couch_angles: selected_beams = [b for b in selected_beams if b["couch_angle"] in couch_angles] if energies: selected_beams = [b for b in selected_beams if b['energy_MV'] in energies] selected_beam_ids = [b["ID"] for b in selected_beams] if not selected_beam_ids: continue if only_planner: planner_beam_ids = set(all_metadata[pid]["planner_beam_ids"]) selected_beam_ids = list(planner_beam_ids.intersection(selected_beam_ids)) if not selected_beam_ids: continue matched.append({ "patient_id": pid, "num_beams": len(selected_beam_ids), "ptv_volume": summary["ptv_volume"], "selected_beam_ids": selected_beam_ids }) return pd.DataFrame(matched) def download_data(repo_id, patient_ids, beam_ids=None, planner_beam_ids=True, max_retries=2, local_dir='./', download_dicom=True): from huggingface_hub import hf_hub_download downloaded_files = [] for patient_id in patient_ids: static_files = [ "CT_Data.h5", "CT_MetaData.json", "StructureSet_Data.h5", "StructureSet_MetaData.json", "OptimizationVoxels_Data.h5", "OptimizationVoxels_MetaData.json", "PlannerBeams.json" ] for filename in static_files: hf_path = posixpath.join("data", patient_id, filename) for attempt in range(max_retries): try: local_path = hf_hub_download( repo_id=repo_id, repo_type="dataset", filename=hf_path, local_dir=local_dir, token=token ) downloaded_files.append(local_path) break except Exception as e: if attempt == max_retries - 1: st.error(f"Failed to download {hf_path}: {e}") # --------------------------------------------------------------- # 2. Download all DICOM files under data//DicomFiles/ # --------------------------------------------------------------- if download_dicom: try: all_files = list_repo_files(repo_id, repo_type="dataset") dicom_prefix = f"data/{patient_id}/DicomFiles/" dicom_files = [f for f in all_files if f.startswith(dicom_prefix)] for hf_path in dicom_files: for attempt in range(max_retries): try: local_path = hf_hub_download( repo_id=repo_id, repo_type="dataset", filename=hf_path, local_dir=local_dir, token=token ) downloaded_files.append(local_path) break except Exception as e: if attempt == max_retries - 1: st.error(f"Failed to download {hf_path}: {e}") except Exception as e: st.error(f"Error listing DICOM files for {patient_id}: {e}") if planner_beam_ids: planner_file = os.path.join(local_dir, 'data', patient_id, "PlannerBeams.json") try: with open(planner_file, "r") as f: planner_data = json.load(f) beam_ids = planner_data.get("IDs", []) except Exception as e: st.error(f"Error reading PlannerBeams.json: {e}") beam_ids = [] if beam_ids is not None: for bid in beam_ids: beam_data_file = f"Beams/Beam_{bid}_Data.h5" beam_meta_file = f"Beams/Beam_{bid}_MetaData.json" for beam_file in [beam_data_file, beam_meta_file]: hf_path = posixpath.join("data", patient_id, beam_file) for attempt in range(max_retries): try: local_path = hf_hub_download( repo_id=repo_id, repo_type="dataset", filename=hf_path, local_dir=local_dir, token=token ) downloaded_files.append(local_path) break except Exception as e: if attempt == max_retries - 1: st.error(f"Failed to download {hf_path}: {e}") return downloaded_files from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode def show_aggrid_table(df): gb = GridOptionsBuilder.from_dataframe(df) gb.configure_default_column(groupable=True, value=True, enableRowGroup=True, aggFunc='sum', editable=False) gb.configure_grid_options(domLayout='normal') # Enable multiple row selection with checkboxes gb.configure_selection('multiple', use_checkbox=True) gb.configure_column("patient_id", checkboxSelection=True) grid_options = gb.build() grid_response = AgGrid( df, gridOptions=grid_options, enable_enterprise_modules=False, allow_unsafe_jscode=True, fit_columns_on_grid_load=True, theme='balham', update_mode=GridUpdateMode.SELECTION_CHANGED ) return grid_response def main(): st.set_page_config(page_title="PortPy Metadata Explorer", layout="wide") st.title("📊 PortPy Metadata Explorer & Downloader") patient_df = get_patient_ids() disease_site = st.sidebar.selectbox("Select Disease Site", patient_df["disease_site"].unique()) all_metadata = load_all_metadata(disease_site) # Load and cache all metadata for selected disease site filtered_patients = pd.DataFrame(all_metadata.keys(), columns=["patient_id"]) beam_gantry_filter = st.sidebar.text_input("Gantry Angles (comma-separated)", "") beam_collimator_filter = st.sidebar.text_input("Collimator Angles (comma-separated)", "") beam_energy_filter = st.sidebar.text_input("Beam Energies (comma-separated)", "") beam_couch_filter = st.sidebar.text_input("Couch Angles (comma-separated)", "") query_ptv_vol = st.sidebar.number_input("Minimum PTV volume (cc):", value=0) # Checkbox: Only planner beams only_planner = st.sidebar.checkbox( "Show only planner beams (if selected it will download only planner beams)", value=True, ) results_df = filter_matched_data( filtered_patients, query_ptv_vol, beam_gantry_filter, beam_collimator_filter, beam_energy_filter, beam_couch_filter, only_planner, all_metadata ) # Summary Table # st.dataframe(results_df) grid_response = show_aggrid_table(results_df) selected_rows = grid_response.get("selected_rows", pd.DataFrame()) if isinstance(selected_rows, pd.DataFrame): print(selected_rows) if not selected_rows.empty: for _, row in selected_rows.iterrows(): pid = row["patient_id"] st.markdown(f"### Patient: {pid}") st.markdown("#### Structures") st.dataframe(pd.DataFrame(all_metadata[pid]["structures"])) st.markdown("#### Beams") st.dataframe(pd.DataFrame(all_metadata[pid]["beams"])) # selected_patient = st.selectbox("Select patient for detailed view", results_df["patient_id"] if not results_df.empty else []) # if selected_patient: # structs = all_metadata[selected_patient]["structures"] # beams = all_metadata[selected_patient]["beams"] # st.subheader(f"🏗️ Structures for {selected_patient}") # st.dataframe(pd.DataFrame(structs), use_container_width=True) # st.subheader(f"📡 Beams for {selected_patient}") # st.dataframe(pd.DataFrame(beams), use_container_width=True) if "open_download_expander" not in st.session_state: st.session_state["open_download_expander"] = False with st.expander("Download matched patients", expanded=st.session_state["open_download_expander"]): # Multi-select and download to_download = st.sidebar.multiselect("Select Patients to Download", results_df["patient_id"].tolist()) # local_dir = st.sidebar.text_input("Enter local directory to download data:", value="./downloaded") # if st.sidebar.button("Download Selected Patients"): # if to_download: # patient_to_beams = { # row["patient_id"]: row["beam_ids"] for ind, row in results_df.iterrows() if ind in to_download # } # for pid, beam_ids in patient_to_beams.items(): # download_data(REPO_ID, [pid], beam_ids=beam_ids, planner_beam_ids=False, local_dir=local_dir) # st.success("Download complete!") # else: # st.warning("No patients selected.") if st.sidebar.button("Download Selected Patients"): st.session_state["open_download_expander"] = True # Force open expander if not to_download: st.warning("No patients selected.") else: progress = st.progress(0) status = st.empty() local_dir = "./downloaded" if os.path.exists(local_dir): shutil.rmtree(local_dir) os.makedirs(local_dir, exist_ok=True) patient_to_beams = { row["patient_id"]: row["selected_beam_ids"] for _, row in results_df.iterrows() if row["patient_id"] in to_download } total = len(patient_to_beams) for i, (pid, beam_ids) in enumerate(patient_to_beams.items(), start=1): status.write(f"Downloading {pid} ({i}/{total})…") download_data(REPO_ID, [pid], beam_ids=beam_ids, planner_beam_ids=only_planner, local_dir=local_dir, download_dicom=True) progress.progress(i / total) status.success("All downloads complete. Preparing zip…") zip_path = os.path.join(tempfile.gettempdir(), f"portpy_patients_{uuid.uuid4().hex}.zip") # optional: guard size to avoid crashes total_bytes = 0 for root, _, files in os.walk(local_dir): for f in files: total_bytes += os.path.getsize(os.path.join(root, f)) total_gb = total_bytes / (1024 ** 3) status.write(f"Preparing zip (~{total_gb:.2f} GB)…") if total_gb > 40.0: st.error("Selection too large for a single zip. Please download fewer patients.") st.stop() if os.path.exists(zip_path): os.remove(zip_path) with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_STORED, allowZip64=True) as zf: for root, _, files in os.walk(local_dir): for f in files: full_path = os.path.join(root, f) rel_path = os.path.relpath(full_path, local_dir) zf.write(full_path, rel_path) with open(zip_path, "rb") as fp: st.download_button( label="Your download is ready! Click to save.", data=fp, file_name="portpy_patients.zip", mime="application/zip", ) # # Create zip in memory # buf = io.BytesIO() # with zipfile.ZipFile(buf, "w", zipfile.ZIP_STORED) as zf: # for root, _, files in os.walk(local_dir): # for f in files: # full_path = os.path.join(root, f) # rel_path = os.path.relpath(full_path, local_dir) # zf.write(full_path, rel_path) # buf.seek(0) # # # Trigger file download automatically from the SAME BUTTON CLICK # st.download_button( # label="Your download is ready! Click to save.", # data=buf, # file_name="portpy_patients.zip", # mime="application/zip", # ) # if st.button("Download Data"): # patients_to_download = results_df["patient_id"].tolist() # download_data(REPO_ID, patients_to_download, planner_beam_ids=True, local_dir=local_dir) # st.success("Download complete!") if __name__ == "__main__": main() #to run: streamlit run app.py