carefully move between CPU and GPU
- app.py +6 -11
- sam2/modeling/sam2_base.py +2 -2
- sam2/sam2_video_predictor.py +20 -12
app.py
CHANGED
@@ -165,7 +165,6 @@ def clear_points(
     )
 
 
-@spaces.GPU(duration=10)
 def preprocess_video_in(
     video_path,
     first_frame,
@@ -227,16 +226,12 @@ def preprocess_video_in(
     input_points = []
     input_labels = []
 
-    predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="…
-    …
-        offload_video_to_cpu=True,
-        offload_state_to_cpu=True,
-        video_path=video_path,
-    )
+    predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="cpu")
+    inference_state = predictor.init_state(
+        offload_video_to_cpu=True,
+        offload_state_to_cpu=True,
+        video_path=video_path,
+    )
 
     return [
         gr.update(open=False), # video_in_drawer
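
The net effect of this change is that `preprocess_video_in` no longer claims a GPU: the `@spaces.GPU(duration=10)` decorator is dropped, the predictor is built on CPU, and both the decoded frames and the tracking state stay in CPU memory. A minimal sketch of that pattern, assuming the stock `build_sam2_video_predictor` entry point and hypothetical config, checkpoint, and video paths:

from sam2.build_sam import build_sam2_video_predictor

# Hypothetical paths; the Space defines its own model_cfg / sam2_checkpoint.
model_cfg = "configs/sam2.1/sam2.1_hiera_l.yaml"
sam2_checkpoint = "checkpoints/sam2.1_hiera_large.pt"

# Build the predictor on CPU so preprocessing can run outside a @spaces.GPU context.
predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="cpu")

# Keep the decoded frames and the per-frame tracking state in CPU memory;
# tensors are moved to the GPU only inside the functions that run inference.
inference_state = predictor.init_state(
    video_path="example.mp4",  # hypothetical input video
    offload_video_to_cpu=True,
    offload_state_to_cpu=True,
)
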
sam2/modeling/sam2_base.py
CHANGED
@@ -617,7 +617,7 @@ class SAM2Base(torch.nn.Module):
                             if self.use_signed_tpos_enc_to_obj_ptrs
                             else abs(frame_idx - t)
                         ),
-                        out["obj_ptr"],
+                        out["obj_ptr"].to(device),
                     )
                     for t, out in ptr_cond_outputs.items()
                 ]
@@ -630,7 +630,7 @@ class SAM2Base(torch.nn.Module):
                         t, unselected_cond_outputs.get(t, None)
                     )
                     if out is not None:
-                        pos_and_ptrs.append((t_diff, out["obj_ptr"]))
+                        pos_and_ptrs.append((t_diff, out["obj_ptr"].to(device)))
                 # If we have at least one object pointer, add them to the across attention
                 if len(pos_and_ptrs) > 0:
                     pos_list, ptrs_list = zip(*pos_and_ptrs)
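
With the state offloaded, the per-frame `obj_ptr` tensors live on the CPU storage device while memory attention runs on the GPU, so each pointer is moved with `.to(device)` as it is collected. A small self-contained sketch of that gather-then-move step (shapes and frame indices are made up for illustration):

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretend these are per-frame outputs whose pointers were stored on CPU.
ptr_cond_outputs = {t: {"obj_ptr": torch.randn(1, 256)} for t in range(3)}

frame_idx = 5
# Move each stored pointer onto the compute device while collecting it,
# mirroring the out["obj_ptr"].to(device) calls in the hunks above.
pos_and_ptrs = [
    (abs(frame_idx - t), out["obj_ptr"].to(device))
    for t, out in ptr_cond_outputs.items()
]

pos_list, ptrs_list = zip(*pos_and_ptrs)
obj_ptrs = torch.stack(ptrs_list, dim=0)  # safe: every pointer is on one device
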
sam2/sam2_video_predictor.py
CHANGED
@@ -107,7 +107,7 @@ class SAM2VideoPredictor(SAM2Base):
         inference_state["tracking_has_started"] = False
         inference_state["frames_already_tracked"] = {}
         # Warm up the visual backbone and cache the image feature on frame 0
-        self._get_image_feature(inference_state, frame_idx=0, batch_size=1)
+        # self._get_image_feature(inference_state, frame_idx=0, batch_size=1)
         return inference_state
 
     @classmethod
@@ -470,7 +470,7 @@ class SAM2VideoPredictor(SAM2Base):
                 size=(batch_size, self.hidden_dim),
                 fill_value=NO_OBJ_SCORE,
                 dtype=torch.float32,
-                device=inference_state["device"],
+                device=inference_state["storage_device"],
             ),
             "object_score_logits": torch.full(
                 size=(batch_size, 1),
@@ -478,7 +478,7 @@ class SAM2VideoPredictor(SAM2Base):
                 # present as sigmoid(10)=1, same as in `predict_masks` of `MaskDecoder`
                 fill_value=10.0,
                 dtype=torch.float32,
-                device=inference_state["device"],
+                device=inference_state["storage_device"],
             ),
         }
         empty_mask_ptr = None
@@ -545,7 +545,9 @@ class SAM2VideoPredictor(SAM2Base):
                 frame_idx=frame_idx,
                 batch_size=batch_size,
                 high_res_masks=high_res_masks,
-                object_score_logits=consolidated_out["object_score_logits"],
+                object_score_logits=consolidated_out["object_score_logits"].to(
+                    device, non_blocking=True
+                ),
                 is_mask_from_pts=True, # these frames are what the user interacted with
             )
             consolidated_out["maskmem_features"] = maskmem_features
@@ -879,9 +881,10 @@ class SAM2VideoPredictor(SAM2Base):
     def _get_image_feature(self, inference_state, frame_idx, batch_size):
         """Compute the image features on a given frame."""
         # Look up in the cache first
-        image, backbone_out = inference_state["cached_features"].get(
-            frame_idx, (None, None)
-        )
+        # image, backbone_out = inference_state["cached_features"].get(
+        #     frame_idx, (None, None)
+        # )
+        image, backbone_out = None, None
         if backbone_out is None:
             # Cache miss -- we will run inference on a single image
             device = inference_state["device"]
@@ -889,7 +892,7 @@ class SAM2VideoPredictor(SAM2Base):
             backbone_out = self.forward_image(image)
             # Cache the most recent frame's feature (for repeated interactions with
             # a frame; we can use an LRU cache for more frames in the future).
-            inference_state["cached_features"] = {frame_idx: (image, backbone_out)}
+            # inference_state["cached_features"] = {frame_idx: (image, backbone_out)}
 
         # expand the features to have the same dimension as the number of objects
         expanded_image = image.expand(batch_size, -1, -1, -1)
@@ -964,9 +967,11 @@ class SAM2VideoPredictor(SAM2Base):
         pred_masks = pred_masks_gpu.to(storage_device, non_blocking=True)
         # "maskmem_pos_enc" is the same across frames, so we only need to store one copy of it
         maskmem_pos_enc = self._get_maskmem_pos_enc(inference_state, current_out)
-        # object pointer is a small tensor, so we always keep it on GPU memory for fast access
-        obj_ptr = current_out["obj_ptr"]
-        object_score_logits = current_out["object_score_logits"]
+        # object pointer is a small tensor, so we always keep it on GPU memory for fast access (modified for ZeroGPU)
+        obj_ptr = current_out["obj_ptr"].to(storage_device, non_blocking=True)
+        object_score_logits = current_out["object_score_logits"].to(
+            storage_device, non_blocking=True
+        )
         # make a compact version of this frame's output to reduce the state size
         compact_current_out = {
             "maskmem_features": maskmem_features,
@@ -1018,6 +1023,7 @@ class SAM2VideoPredictor(SAM2Base):
         `maskmem_pos_enc` is the same across frames and objects, so we cache it as
         a constant in the inference session to reduce session storage size.
         """
+        storage_device = inference_state["storage_device"]
         model_constants = inference_state["constants"]
         # "out_maskmem_pos_enc" should be either a list of tensors or None
         out_maskmem_pos_enc = current_out["maskmem_pos_enc"]
@@ -1026,7 +1032,9 @@ class SAM2VideoPredictor(SAM2Base):
             assert isinstance(out_maskmem_pos_enc, list)
             # only take the slice for one object, since it's same across objects
             maskmem_pos_enc = [x[0:1].clone() for x in out_maskmem_pos_enc]
-            model_constants["maskmem_pos_enc"] = maskmem_pos_enc
+            model_constants["maskmem_pos_enc"] = maskmem_pos_enc.to(
+                storage_device, non_blocking=True
+            )
         else:
             maskmem_pos_enc = model_constants["maskmem_pos_enc"]
             # expand the cached maskmem_pos_enc to the actual batch size
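
Taken together, the predictor changes follow one rule: large or per-frame outputs are parked on `inference_state["storage_device"]` (CPU when `offload_state_to_cpu=True`) and are moved back to the compute device only at the point of use. A rough sketch of that storage-device pattern, with hypothetical helper names and dummy tensors standing in for real model outputs:

import torch

def park_on_storage(inference_state, current_out):
    """Sketch: keep a frame's outputs on the storage device (CPU when offloaded)."""
    storage_device = inference_state["storage_device"]
    return {
        # non_blocking lets the copy overlap with other work when possible
        key: value.to(storage_device, non_blocking=True)
        for key, value in current_out.items()
    }

def fetch_for_compute(inference_state, stored_out):
    """Sketch: bring stored tensors back to the compute device right before use."""
    device = inference_state["device"]
    return {key: value.to(device, non_blocking=True) for key, value in stored_out.items()}

# Example usage (both devices set to CPU so the sketch runs anywhere).
state = {"device": torch.device("cpu"), "storage_device": torch.device("cpu")}
frame_out = {
    "pred_masks": torch.zeros(1, 1, 256, 256),
    "obj_ptr": torch.zeros(1, 256),
    "object_score_logits": torch.zeros(1, 1),
}
stored = park_on_storage(state, frame_out)
ready = fetch_for_compute(state, stored)
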