remove predictor from state

app.py CHANGED
@@ -87,21 +87,45 @@ def get_video_fps(video_path):
     return fps
 
 
+def reset_state(inference_state):
+    for v in inference_state["point_inputs_per_obj"].values():
+        v.clear()
+    for v in inference_state["mask_inputs_per_obj"].values():
+        v.clear()
+    for v in inference_state["output_dict_per_obj"].values():
+        v["cond_frame_outputs"].clear()
+        v["non_cond_frame_outputs"].clear()
+    for v in inference_state["temp_output_dict_per_obj"].values():
+        v["cond_frame_outputs"].clear()
+        v["non_cond_frame_outputs"].clear()
+    inference_state["output_dict"]["cond_frame_outputs"].clear()
+    inference_state["output_dict"]["non_cond_frame_outputs"].clear()
+    inference_state["consolidated_frame_inds"]["cond_frame_outputs"].clear()
+    inference_state["consolidated_frame_inds"]["non_cond_frame_outputs"].clear()
+    inference_state["tracking_has_started"] = False
+    inference_state["frames_already_tracked"].clear()
+    inference_state["obj_id_to_idx"].clear()
+    inference_state["obj_idx_to_id"].clear()
+    inference_state["obj_ids"].clear()
+    inference_state["point_inputs_per_obj"].clear()
+    inference_state["mask_inputs_per_obj"].clear()
+    inference_state["output_dict_per_obj"].clear()
+    inference_state["temp_output_dict_per_obj"].clear()
+    return inference_state
+
+
 def reset(
     first_frame,
     all_frames,
     input_points,
     input_labels,
     inference_state,
-    predictor,
 ):
     first_frame = None
     all_frames = None
     input_points = []
     input_labels = []
-    if inference_state and predictor:
-        predictor.reset_state(inference_state)
     inference_state = None
     return (
         None,
@@ -114,7 +138,6 @@ def reset(
         input_points,
         input_labels,
         inference_state,
-        predictor,
     )
 
 
@@ -124,12 +147,11 @@ def clear_points(
     input_points,
     input_labels,
     inference_state,
-    predictor,
 ):
     input_points = []
     input_labels = []
-    if inference_state and predictor:
-        predictor.reset_state(inference_state)
+    if inference_state and inference_state["tracking_has_started"]:
+        inference_state = reset_state(inference_state)
     return (
         first_frame,
         None,
@@ -139,7 +161,6 @@ def clear_points(
         input_points,
         input_labels,
         inference_state,
-        predictor,
     )
 
 
@@ -150,7 +171,6 @@ def preprocess_video_in(
     input_points,
     input_labels,
     inference_state,
-    predictor,
 ):
     if video_path is None:
         return (
@@ -163,7 +183,6 @@ def preprocess_video_in(
             input_points,
             input_labels,
             inference_state,
-            predictor,
         )
 
     # Read the first frame
@@ -180,12 +199,8 @@ def preprocess_video_in(
             input_points,
             input_labels,
             inference_state,
-            predictor,
         )
 
-    if predictor is None:
-        predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="cpu")
-
     frame_number = 0
     _first_frame = None
     all_frames = []
@@ -207,10 +222,19 @@ def preprocess_video_in(
 
     cap.release()
     first_frame = copy.deepcopy(_first_frame)
-    inference_state = predictor.init_state(video_path=video_path)
     input_points = []
     input_labels = []
 
+    predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="cpu")
+    if torch.cuda.is_available():
+        predictor.to("cuda")
+        inference_state["device"] = "cuda"
+        if torch.cuda.get_device_properties(0).major >= 8:
+            torch.backends.cuda.matmul.allow_tf32 = True
+            torch.backends.cudnn.allow_tf32 = True
+        torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
+    inference_state = predictor.init_state(video_path=video_path)
+
     return [
         gr.update(open=False),  # video_in_drawer
         first_frame,  # points_map
@@ -221,7 +245,6 @@ def preprocess_video_in(
        input_points,
        input_labels,
        inference_state,
-        predictor,
    ]
 
 
@@ -232,9 +255,9 @@ def segment_with_points(
     input_points,
     input_labels,
     inference_state,
-    predictor,
     evt: gr.SelectData,
 ):
+    predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="cpu")
     if torch.cuda.is_available():
         predictor.to("cuda")
         inference_state["device"] = "cuda"
@@ -299,7 +322,6 @@ def segment_with_points(
         input_points,
         input_labels,
         inference_state,
-        predictor,
     )
 
 
@@ -325,8 +347,8 @@ def propagate_to_all(
     input_points,
     input_labels,
     inference_state,
-    predictor,
 ):
+    predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="cpu")
     if torch.cuda.is_available():
         predictor.to("cuda")
         inference_state["device"] = "cuda"
@@ -383,15 +405,15 @@ def propagate_to_all(
         input_points,
         input_labels,
         inference_state,
-        predictor,
     )
 
 
 try:
     from spaces import GPU
 
-
-
+    preprocess_video_in = GPU(preprocess_video_in, duration=10)
+    segment_with_points = GPU(segment_with_points, duration=5)
+    propagate_to_all = GPU(propagate_to_all, duration=30)
 except:
     print("spaces unavailable")
 
@@ -406,7 +428,6 @@ with gr.Blocks() as demo:
     input_points = gr.State([])
     input_labels = gr.State([])
     inference_state = gr.State()
-    predictor = gr.State()
 
     with gr.Column():
         # Title
@@ -461,7 +482,6 @@ with gr.Blocks() as demo:
             input_points,
            input_labels,
            inference_state,
-            predictor,
        ],
        outputs=[
            video_in_drawer,  # Accordion to hide uploaded video player
@@ -473,7 +493,6 @@ with gr.Blocks() as demo:
            input_points,
            input_labels,
            inference_state,
-            predictor,
        ],
        queue=False,
    )
@@ -487,7 +506,6 @@ with gr.Blocks() as demo:
            input_points,
            input_labels,
            inference_state,
-            predictor,
        ],
        outputs=[
            video_in_drawer,  # Accordion to hide uploaded video player
@@ -499,7 +517,6 @@ with gr.Blocks() as demo:
            input_points,
            input_labels,
            inference_state,
-            predictor,
        ],
        queue=False,
    )
@@ -514,7 +531,6 @@ with gr.Blocks() as demo:
            input_points,
            input_labels,
            inference_state,
-            predictor,
        ],
        outputs=[
            points_map,  # updated image with points
@@ -524,7 +540,6 @@ with gr.Blocks() as demo:
            input_points,
            input_labels,
            inference_state,
-            predictor,
        ],
        queue=False,
    )
@@ -538,7 +553,6 @@ with gr.Blocks() as demo:
            input_points,
            input_labels,
            inference_state,
-            predictor,
        ],
        outputs=[
            points_map,
@@ -549,7 +563,6 @@ with gr.Blocks() as demo:
            input_points,
            input_labels,
            inference_state,
-            predictor,
        ],
        queue=False,
    )
@@ -562,7 +575,6 @@ with gr.Blocks() as demo:
            input_points,
            input_labels,
            inference_state,
-            predictor,
        ],
        outputs=[
            video_in,
@@ -575,7 +587,6 @@ with gr.Blocks() as demo:
            input_points,
            input_labels,
            inference_state,
-            predictor,
        ],
        queue=False,
    )
@@ -594,7 +605,6 @@ with gr.Blocks() as demo:
            input_points,
            input_labels,
            inference_state,
-            predictor,
        ],
        outputs=[
            output_video,
@@ -603,7 +613,6 @@ with gr.Blocks() as demo:
            input_points,
            input_labels,
            inference_state,
-            predictor,
        ],
        concurrency_limit=10,
        queue=False,