Spaces:

PolyU-ChenLab
/

UniPixel

Running on Zero

App Files Files Community

UniPixel / sam2 /sam2_video_predictor.py

yeliudev

Add files

f880dff about 1 month ago

raw

history blame contribute delete

64 kB

	# Copyright (c) Meta Platforms, Inc. and affiliates.
	# All rights reserved.

	# This source code is licensed under the license found in the
	# LICENSE file in the root directory of this source tree.

	from collections import OrderedDict

	import torch
	import torch.nn.functional as F
	from tqdm import tqdm

	from sam2.modeling.sam2_base import NO_OBJ_SCORE, SAM2Base
	from sam2.utils.misc import concat_points, fill_holes_in_mask_scores, load_video_frames


	class SAM2VideoPredictor(SAM2Base):
	"""The predictor class to handle user interactions and manage inference states."""

	def __init__(
	self,
	fill_hole_area=0,
	# whether to apply non-overlapping constraints on the output object masks
	non_overlap_masks=False,
	# whether to clear non-conditioning memory of the surrounding frames (which may contain outdated information) after adding correction clicks;
	# note that this would only apply to single-object tracking unless `clear_non_cond_mem_for_multi_obj` is also set to True)
	clear_non_cond_mem_around_input=False,
	# if `add_all_frames_to_correct_as_cond` is True, we also append to the conditioning frame list any frame that receives a later correction click
	# if `add_all_frames_to_correct_as_cond` is False, we conditioning frame list to only use those initial conditioning frames
	add_all_frames_to_correct_as_cond=False,
	inference_mode=True,
	**kwargs,
	):
	super().__init__(**kwargs)
	self.fill_hole_area = fill_hole_area
	self.non_overlap_masks = non_overlap_masks
	self.clear_non_cond_mem_around_input = clear_non_cond_mem_around_input
	self.add_all_frames_to_correct_as_cond = add_all_frames_to_correct_as_cond
	self.inference_mode = inference_mode

	@property
	def dtype(self):
	return self.image_encoder.trunk.patch_embed.proj.weight.dtype

	def init_state(
	self,
	frame,
	frame_size=None,
	offload_video_to_cpu=False,
	offload_state_to_cpu=False,
	async_loading_frames=False,
	):
	"""Initialize an inference state."""
	compute_device = self.device # device of the model
	if isinstance(frame, str):
	images, video_height, video_width = load_video_frames(
	video_path=frame,
	image_size=self.image_size,
	offload_video_to_cpu=offload_video_to_cpu,
	async_loading_frames=async_loading_frames,
	compute_device=compute_device,
	)
	else:
	if frame_size is None:
	frame_size = (self.image_size, self.image_size)
	images, video_height, video_width = (frame, *frame_size)
	inference_state = {}
	inference_state["images"] = images
	inference_state["num_frames"] = len(images)
	# whether to offload the video frames to CPU memory
	# turning on this option saves the GPU memory with only a very small overhead
	inference_state["offload_video_to_cpu"] = offload_video_to_cpu
	# whether to offload the inference state to CPU memory
	# turning on this option saves the GPU memory at the cost of a lower tracking fps
	# (e.g. in a test case of 768x768 model, fps dropped from 27 to 24 when tracking one object
	# and from 24 to 21 when tracking two objects)
	inference_state["offload_state_to_cpu"] = offload_state_to_cpu
	# the original video height and width, used for resizing final output scores
	inference_state["video_height"] = video_height
	inference_state["video_width"] = video_width
	inference_state["device"] = compute_device
	if offload_state_to_cpu:
	inference_state["storage_device"] = torch.device("cpu")
	else:
	inference_state["storage_device"] = compute_device
	# inputs on each frame
	inference_state["point_inputs_per_obj"] = {}
	inference_state["mask_inputs_per_obj"] = {}
	# visual features on a small number of recently visited frames for quick interactions
	inference_state["cached_features"] = {}
	# values that don't change across frames (so we only need to hold one copy of them)
	inference_state["constants"] = {}
	# mapping between client-side object id and model-side object index
	inference_state["obj_id_to_idx"] = OrderedDict()
	inference_state["obj_idx_to_id"] = OrderedDict()
	inference_state["obj_ids"] = []
	# Slice (view) of each object tracking results, sharing the same memory with "output_dict"
	inference_state["output_dict_per_obj"] = {}
	# A temporary storage to hold new outputs when user interact with a frame
	# to add clicks or mask (it's merged into "output_dict" before propagation starts)
	inference_state["temp_output_dict_per_obj"] = {}
	# Frames that already holds consolidated outputs from click or mask inputs
	# (we directly use their consolidated outputs during tracking)
	# metadata for each tracking frame (e.g. which direction it's tracked)
	inference_state["frames_tracked_per_obj"] = {}
	# Warm up the visual backbone and cache the image feature on all frames
	self._get_image_feature(inference_state, frame_idx=0, batch_size=1)
	return inference_state

	@classmethod
	def from_pretrained(cls, model_id: str, **kwargs) -> "SAM2VideoPredictor":
	"""
	Load a pretrained model from the Hugging Face hub.

	Arguments:
	model_id (str): The Hugging Face repository ID.
	**kwargs: Additional arguments to pass to the model constructor.

	Returns:
	(SAM2VideoPredictor): The loaded model.
	"""
	from sam2.build_sam import build_sam2_video_predictor_hf

	sam_model = build_sam2_video_predictor_hf(model_id, **kwargs)
	return sam_model

	def _obj_id_to_idx(self, inference_state, obj_id):
	"""Map client-side object id to model-side object index."""
	obj_idx = inference_state["obj_id_to_idx"].get(obj_id, None)
	if obj_idx is not None:
	return obj_idx

	# We always allow adding new objects (including after tracking starts)
	# get the next object slot
	obj_idx = len(inference_state["obj_id_to_idx"])
	inference_state["obj_id_to_idx"][obj_id] = obj_idx
	inference_state["obj_idx_to_id"][obj_idx] = obj_id
	inference_state["obj_ids"] = list(inference_state["obj_id_to_idx"])
	# set up input and output structures for this object
	inference_state["point_inputs_per_obj"][obj_idx] = {}
	inference_state["mask_inputs_per_obj"][obj_idx] = {}
	inference_state["output_dict_per_obj"][obj_idx] = {
	"cond_frame_outputs": {}, # dict containing {frame_idx: <out>}
	"non_cond_frame_outputs": {}, # dict containing {frame_idx: <out>}
	}
	inference_state["temp_output_dict_per_obj"][obj_idx] = {
	"cond_frame_outputs": {}, # dict containing {frame_idx: <out>}
	"non_cond_frame_outputs": {}, # dict containing {frame_idx: <out>}
	}
	inference_state["frames_tracked_per_obj"][obj_idx] = {}
	return obj_idx

	def _obj_idx_to_id(self, inference_state, obj_idx):
	"""Map model-side object index to client-side object id."""
	return inference_state["obj_idx_to_id"][obj_idx]

	def _get_obj_num(self, inference_state):
	"""Get the total number of unique object ids received so far in this session."""
	return len(inference_state["obj_idx_to_id"])

	@torch.inference_mode()
	def add_new_hidden_state(
	self,
	inference_state,
	frame_idx,
	obj_id,
	hidden,
	):
	obj_idx = self._obj_id_to_idx(inference_state, obj_id)
	# If this frame hasn't been tracked before, we treat it as an initial conditioning
	# frame, meaning that the inputs points are to generate segments on this frame without
	# using any memory from other frames, like in SAM. Otherwise (if it has been tracked),
	# the input points will be used to correct the already tracked masks.
	obj_frames_tracked = inference_state["frames_tracked_per_obj"][obj_idx]
	is_init_cond_frame = frame_idx not in obj_frames_tracked
	# whether to track in reverse time order
	if is_init_cond_frame:
	reverse = False
	else:
	reverse = obj_frames_tracked[frame_idx]["reverse"]
	obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
	obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx]
	# Add a frame to conditioning output if it's an initial conditioning frame or
	# if the model sees all frames receiving clicks/mask as conditioning frames.
	is_cond = is_init_cond_frame or self.add_all_frames_to_correct_as_cond
	storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"

	# Get any previously predicted mask logits on this object and feed it along with
	# the new clicks into the SAM mask decoder.
	prev_sam_mask_logits = None
	# lookup temporary output dict first, which contains the most recent output
	# (if not found, then lookup conditioning and non-conditioning frame output)
	prev_out = obj_temp_output_dict[storage_key].get(frame_idx)
	if prev_out is None:
	prev_out = obj_output_dict["cond_frame_outputs"].get(frame_idx)
	if prev_out is None:
	prev_out = obj_output_dict["non_cond_frame_outputs"].get(frame_idx)

	if prev_out is not None and prev_out["pred_masks"] is not None:
	device = inference_state["device"]
	prev_sam_mask_logits = prev_out["pred_masks"].to(device, non_blocking=True)
	# Clamp the scale of prev_sam_mask_logits to avoid rare numerical issues.
	prev_sam_mask_logits = torch.clamp(prev_sam_mask_logits, -32.0, 32.0)
	current_out, _ = self._run_single_frame_inference(
	inference_state=inference_state,
	output_dict=obj_output_dict, # run on the slice of a single object
	frame_idx=frame_idx,
	batch_size=1, # run on the slice of a single object
	is_init_cond_frame=is_init_cond_frame,
	point_inputs=None,
	mask_inputs=None,
	hidden_inputs=hidden,
	reverse=reverse,
	# Skip the memory encoder when adding clicks or mask. We execute the memory encoder
	# at the beginning of `propagate_in_video` (after user finalize their clicks). This
	# allows us to enforce non-overlapping constraints on all objects before encoding
	# them into memory.
	run_mem_encoder=False,
	prev_sam_mask_logits=prev_sam_mask_logits,
	)
	# Add the output to the output dict (to be used as future memory)
	obj_temp_output_dict[storage_key][frame_idx] = current_out

	# Resize the output mask to the original video resolution
	obj_ids = inference_state["obj_ids"]
	consolidated_out = self._consolidate_temp_output_across_obj(
	inference_state,
	frame_idx,
	is_cond=is_cond,
	consolidate_at_video_res=True,
	)
	_, video_res_masks = self._get_orig_video_res_output(inference_state, consolidated_out["pred_masks_video_res"])
	return frame_idx, obj_ids, video_res_masks

	@torch.inference_mode()
	def add_new_points_or_box(
	self,
	inference_state,
	frame_idx,
	obj_id,
	points=None,
	labels=None,
	clear_old_points=True,
	normalize_coords=True,
	box=None,
	):
	"""Add new points to a frame."""
	obj_idx = self._obj_id_to_idx(inference_state, obj_id)
	point_inputs_per_frame = inference_state["point_inputs_per_obj"][obj_idx]
	mask_inputs_per_frame = inference_state["mask_inputs_per_obj"][obj_idx]

	if (points is not None) != (labels is not None):
	raise ValueError("points and labels must be provided together")
	if points is None and box is None:
	raise ValueError("at least one of points or box must be provided as input")

	if points is None:
	points = torch.zeros(0, 2, dtype=torch.float32)
	elif not isinstance(points, torch.Tensor):
	points = torch.tensor(points, dtype=torch.float32)
	if labels is None:
	labels = torch.zeros(0, dtype=torch.int32)
	elif not isinstance(labels, torch.Tensor):
	labels = torch.tensor(labels, dtype=torch.int32)
	if points.dim() == 2:
	points = points.unsqueeze(0) # add batch dimension
	if labels.dim() == 1:
	labels = labels.unsqueeze(0) # add batch dimension

	# If `box` is provided, we add it as the first two points with labels 2 and 3
	# along with the user-provided points (consistent with how SAM 2 is trained).
	if box is not None:
	if not clear_old_points:
	raise ValueError("cannot add box without clearing old points, since "
	"box prompt must be provided before any point prompt "
	"(please use clear_old_points=True instead)")
	if not isinstance(box, torch.Tensor):
	box = torch.tensor(box, dtype=torch.float32, device=points.device)
	box_coords = box.reshape(1, 2, 2)
	box_labels = torch.tensor([2, 3], dtype=torch.int32, device=labels.device)
	box_labels = box_labels.reshape(1, 2)
	points = torch.cat([box_coords, points], dim=1)
	labels = torch.cat([box_labels, labels], dim=1)

	if normalize_coords:
	video_H = inference_state["video_height"]
	video_W = inference_state["video_width"]
	points = points / torch.tensor([video_W, video_H]).to(points.device)
	# scale the (normalized) coordinates by the model's internal image size
	points = points * self.image_size
	points = points.to(inference_state["device"])
	labels = labels.to(inference_state["device"])

	if not clear_old_points:
	point_inputs = point_inputs_per_frame.get(frame_idx, None)
	else:
	point_inputs = None
	point_inputs = concat_points(point_inputs, points, labels)

	point_inputs_per_frame[frame_idx] = point_inputs
	mask_inputs_per_frame.pop(frame_idx, None)
	# If this frame hasn't been tracked before, we treat it as an initial conditioning
	# frame, meaning that the inputs points are to generate segments on this frame without
	# using any memory from other frames, like in SAM. Otherwise (if it has been tracked),
	# the input points will be used to correct the already tracked masks.
	obj_frames_tracked = inference_state["frames_tracked_per_obj"][obj_idx]
	is_init_cond_frame = frame_idx not in obj_frames_tracked
	# whether to track in reverse time order
	if is_init_cond_frame:
	reverse = False
	else:
	reverse = obj_frames_tracked[frame_idx]["reverse"]
	obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
	obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx]
	# Add a frame to conditioning output if it's an initial conditioning frame or
	# if the model sees all frames receiving clicks/mask as conditioning frames.
	is_cond = is_init_cond_frame or self.add_all_frames_to_correct_as_cond
	storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"

	# Get any previously predicted mask logits on this object and feed it along with
	# the new clicks into the SAM mask decoder.
	prev_sam_mask_logits = None
	# lookup temporary output dict first, which contains the most recent output
	# (if not found, then lookup conditioning and non-conditioning frame output)
	prev_out = obj_temp_output_dict[storage_key].get(frame_idx)
	if prev_out is None:
	prev_out = obj_output_dict["cond_frame_outputs"].get(frame_idx)
	if prev_out is None:
	prev_out = obj_output_dict["non_cond_frame_outputs"].get(frame_idx)

	if prev_out is not None and prev_out["pred_masks"] is not None:
	device = inference_state["device"]
	prev_sam_mask_logits = prev_out["pred_masks"].to(device, non_blocking=True)
	# Clamp the scale of prev_sam_mask_logits to avoid rare numerical issues.
	prev_sam_mask_logits = torch.clamp(prev_sam_mask_logits, -32.0, 32.0)
	current_out, _ = self._run_single_frame_inference(
	inference_state=inference_state,
	output_dict=obj_output_dict, # run on the slice of a single object
	frame_idx=frame_idx,
	batch_size=1, # run on the slice of a single object
	is_init_cond_frame=is_init_cond_frame,
	point_inputs=point_inputs,
	mask_inputs=None,
	hidden_inputs=None,
	reverse=reverse,
	# Skip the memory encoder when adding clicks or mask. We execute the memory encoder
	# at the beginning of `propagate_in_video` (after user finalize their clicks). This
	# allows us to enforce non-overlapping constraints on all objects before encoding
	# them into memory.
	run_mem_encoder=False,
	prev_sam_mask_logits=prev_sam_mask_logits,
	)
	# Add the output to the output dict (to be used as future memory)
	obj_temp_output_dict[storage_key][frame_idx] = current_out

	# Resize the output mask to the original video resolution
	obj_ids = inference_state["obj_ids"]
	consolidated_out = self._consolidate_temp_output_across_obj(
	inference_state,
	frame_idx,
	is_cond=is_cond,
	consolidate_at_video_res=True,
	)
	_, video_res_masks = self._get_orig_video_res_output(inference_state, consolidated_out["pred_masks_video_res"])
	return frame_idx, obj_ids, video_res_masks

	def add_new_points(self, args, *kwargs):
	"""Deprecated method. Please use `add_new_points_or_box` instead."""
	return self.add_new_points_or_box(args, *kwargs)

	@torch.inference_mode()
	def add_new_mask(
	self,
	inference_state,
	frame_idx,
	obj_id,
	mask,
	):
	"""Add new mask to a frame."""
	obj_idx = self._obj_id_to_idx(inference_state, obj_id)
	point_inputs_per_frame = inference_state["point_inputs_per_obj"][obj_idx]
	mask_inputs_per_frame = inference_state["mask_inputs_per_obj"][obj_idx]

	if not isinstance(mask, torch.Tensor):
	mask = torch.tensor(mask, dtype=torch.bool)
	assert mask.dim() == 2
	mask_H, mask_W = mask.shape
	mask_inputs_orig = mask[None, None] # add batch and channel dimension
	mask_inputs_orig = mask_inputs_orig.float().to(inference_state["device"])

	# resize the mask if it doesn't match the model's image size
	if mask_H != self.image_size or mask_W != self.image_size:
	mask_inputs = torch.nn.functional.interpolate(
	mask_inputs_orig,
	size=(self.image_size, self.image_size),
	align_corners=False,
	mode="bilinear",
	antialias=True, # use antialias for downsampling
	)
	mask_inputs = (mask_inputs >= 0.5).float()
	else:
	mask_inputs = mask_inputs_orig

	mask_inputs_per_frame[frame_idx] = mask_inputs
	point_inputs_per_frame.pop(frame_idx, None)
	# If this frame hasn't been tracked before, we treat it as an initial conditioning
	# frame, meaning that the inputs points are to generate segments on this frame without
	# using any memory from other frames, like in SAM. Otherwise (if it has been tracked),
	# the input points will be used to correct the already tracked masks.
	obj_frames_tracked = inference_state["frames_tracked_per_obj"][obj_idx]
	is_init_cond_frame = frame_idx not in obj_frames_tracked
	# whether to track in reverse time order
	if is_init_cond_frame:
	reverse = False
	else:
	reverse = obj_frames_tracked[frame_idx]["reverse"]
	obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
	obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx]
	# Add a frame to conditioning output if it's an initial conditioning frame or
	# if the model sees all frames receiving clicks/mask as conditioning frames.
	is_cond = is_init_cond_frame or self.add_all_frames_to_correct_as_cond
	storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"

	current_out, _ = self._run_single_frame_inference(
	inference_state=inference_state,
	output_dict=obj_output_dict, # run on the slice of a single object
	frame_idx=frame_idx,
	batch_size=1, # run on the slice of a single object
	is_init_cond_frame=is_init_cond_frame,
	point_inputs=None,
	mask_inputs=mask_inputs,
	hidden_inputs=None,
	reverse=reverse,
	# Skip the memory encoder when adding clicks or mask. We execute the memory encoder
	# at the beginning of `propagate_in_video` (after user finalize their clicks). This
	# allows us to enforce non-overlapping constraints on all objects before encoding
	# them into memory.
	run_mem_encoder=False,
	)
	# Add the output to the output dict (to be used as future memory)
	obj_temp_output_dict[storage_key][frame_idx] = current_out

	# Resize the output mask to the original video resolution
	obj_ids = inference_state["obj_ids"]
	consolidated_out = self._consolidate_temp_output_across_obj(
	inference_state,
	frame_idx,
	is_cond=is_cond,
	consolidate_at_video_res=True,
	)
	_, video_res_masks = self._get_orig_video_res_output(inference_state, consolidated_out["pred_masks_video_res"])
	return frame_idx, obj_ids, video_res_masks

	def _get_orig_video_res_output(self, inference_state, any_res_masks):
	"""
	Resize the object scores to the original video resolution (video_res_masks)
	and apply non-overlapping constraints for final output.
	"""
	device = inference_state["device"]
	video_H = inference_state["video_height"]
	video_W = inference_state["video_width"]
	any_res_masks = any_res_masks.to(device, non_blocking=True)
	if any_res_masks.shape[-2:] == (video_H, video_W):
	video_res_masks = any_res_masks
	else:
	video_res_masks = torch.nn.functional.interpolate(
	any_res_masks,
	size=(video_H, video_W),
	mode="bilinear",
	align_corners=False,
	)
	if self.non_overlap_masks:
	video_res_masks = self._apply_non_overlapping_constraints(video_res_masks)
	return any_res_masks, video_res_masks

	def _consolidate_temp_output_across_obj(
	self,
	inference_state,
	frame_idx,
	is_cond,
	consolidate_at_video_res=False,
	):
	"""
	Consolidate the per-object temporary outputs in `temp_output_dict_per_obj` on
	a frame into a single output for all objects, including
	1) fill any missing objects either from `output_dict_per_obj` (if they exist in
	`output_dict_per_obj` for this frame) or leave them as placeholder values
	(if they don't exist in `output_dict_per_obj` for this frame);
	2) if specified, rerun memory encoder after apply non-overlapping constraints
	on the object scores.
	"""
	batch_size = self._get_obj_num(inference_state)
	storage_key = "cond_frame_outputs" if is_cond else "non_cond_frame_outputs"
	# Optionally, we allow consolidating the temporary outputs at the original
	# video resolution (to provide a better editing experience for mask prompts).
	if consolidate_at_video_res:
	consolidated_H = inference_state["video_height"]
	consolidated_W = inference_state["video_width"]
	consolidated_mask_key = "pred_masks_video_res"
	else:
	consolidated_H = consolidated_W = self.image_size // 4
	consolidated_mask_key = "pred_masks"

	# Initialize `consolidated_out`. Its "maskmem_features" and "maskmem_pos_enc"
	# will be added when rerunning the memory encoder after applying non-overlapping
	# constraints to object scores. Its "pred_masks" are prefilled with a large
	# negative value (NO_OBJ_SCORE) to represent missing objects.
	consolidated_out = {
	consolidated_mask_key:
	torch.full(
	size=(batch_size, 1, consolidated_H, consolidated_W),
	fill_value=NO_OBJ_SCORE,
	dtype=inference_state["cached_features"][frame_idx][0].dtype,
	device=inference_state["storage_device"],
	),
	}
	for obj_idx in range(batch_size):
	obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx]
	obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
	out = obj_temp_output_dict[storage_key].get(frame_idx, None)
	# If the object doesn't appear in "temp_output_dict_per_obj" on this frame,
	# we fall back and look up its previous output in "output_dict_per_obj".
	# We look up both "cond_frame_outputs" and "non_cond_frame_outputs" in
	# "output_dict_per_obj" to find a previous output for this object.
	if out is None:
	out = obj_output_dict["cond_frame_outputs"].get(frame_idx, None)
	if out is None:
	out = obj_output_dict["non_cond_frame_outputs"].get(frame_idx, None)
	# If the object doesn't appear in "output_dict_per_obj" either, we skip it
	# and leave its mask scores to the default scores (i.e. the NO_OBJ_SCORE
	# placeholder above) and set its object pointer to be a dummy pointer.
	if out is None:
	continue
	# Add the temporary object output mask to consolidated output mask
	obj_mask = out["pred_masks"]
	consolidated_pred_masks = consolidated_out[consolidated_mask_key]
	if obj_mask.shape[-2:] == consolidated_pred_masks.shape[-2:]:
	consolidated_pred_masks[obj_idx:obj_idx + 1] = obj_mask
	else:
	# Resize first if temporary object mask has a different resolution
	resized_obj_mask = torch.nn.functional.interpolate(
	obj_mask,
	size=consolidated_pred_masks.shape[-2:],
	mode="bilinear",
	align_corners=False,
	)
	consolidated_pred_masks[obj_idx:obj_idx + 1] = resized_obj_mask

	return consolidated_out

	@torch.inference_mode()
	def propagate_in_video_preflight(self, inference_state):
	"""Prepare inference_state and consolidate temporary outputs before tracking."""
	# Check and make sure that every object has received input points or masks.
	batch_size = self._get_obj_num(inference_state)
	if batch_size == 0:
	raise RuntimeError("No input points or masks are provided for any object; please add inputs first.")

	# Consolidate per-object temporary outputs in "temp_output_dict_per_obj" and
	# add them into "output_dict".
	for obj_idx in range(batch_size):
	obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
	obj_temp_output_dict = inference_state["temp_output_dict_per_obj"][obj_idx]
	for is_cond in [False, True]:
	# Separately consolidate conditioning and non-conditioning temp outputs
	storage_key = ("cond_frame_outputs" if is_cond else "non_cond_frame_outputs")
	# Find all the frames that contain temporary outputs for any objects
	# (these should be the frames that have just received clicks for mask inputs
	# via `add_new_points_or_box` or `add_new_mask`)
	for frame_idx, out in obj_temp_output_dict[storage_key].items():
	# Run memory encoder on the temporary outputs (if the memory feature is missing)
	if out["maskmem_features"] is None:
	high_res_masks = torch.nn.functional.interpolate(
	out["pred_masks"].to(inference_state["device"]),
	size=(self.image_size, self.image_size),
	mode="bilinear",
	align_corners=False,
	)
	maskmem_features, maskmem_pos_enc = self._run_memory_encoder(
	inference_state=inference_state,
	frame_idx=frame_idx,
	batch_size=1, # run on the slice of a single object
	high_res_masks=high_res_masks,
	object_score_logits=out["object_score_logits"],
	# these frames are what the user interacted with
	is_mask_from_pts=True,
	)
	out["maskmem_features"] = maskmem_features
	out["maskmem_pos_enc"] = maskmem_pos_enc

	obj_output_dict[storage_key][frame_idx] = out
	if self.clear_non_cond_mem_around_input:
	# clear non-conditioning memory of the surrounding frames
	self._clear_obj_non_cond_mem_around_input(inference_state, frame_idx, obj_idx)

	# clear temporary outputs in `temp_output_dict_per_obj`
	obj_temp_output_dict[storage_key].clear()

	# check and make sure that every object has received input points or masks
	obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
	if len(obj_output_dict["cond_frame_outputs"]) == 0:
	obj_id = self._obj_idx_to_id(inference_state, obj_idx)
	raise RuntimeError(
	f"No input points or masks are provided for object id {obj_id}; please add inputs first.")
	# edge case: if an output is added to "cond_frame_outputs", we remove any prior
	# output on the same frame in "non_cond_frame_outputs"
	for frame_idx in obj_output_dict["cond_frame_outputs"]:
	obj_output_dict["non_cond_frame_outputs"].pop(frame_idx, None)

	@torch.inference_mode()
	def propagate_in_video(
	self,
	inference_state,
	start_frame_idx=None,
	max_frame_num_to_track=None,
	reverse=False,
	verbose=True,
	):
	"""Propagate the input points across frames to track in the entire video."""
	self.propagate_in_video_preflight(inference_state)

	obj_ids = inference_state["obj_ids"]
	num_frames = inference_state["num_frames"]
	batch_size = self._get_obj_num(inference_state)

	# set start index, end index, and processing order
	if start_frame_idx is None:
	# default: start from the earliest frame with input points
	start_frame_idx = min(t for obj_output_dict in inference_state["output_dict_per_obj"].values()
	for t in obj_output_dict["cond_frame_outputs"])
	if max_frame_num_to_track is None:
	# default: track all the frames in the video
	max_frame_num_to_track = num_frames
	if reverse:
	end_frame_idx = max(start_frame_idx - max_frame_num_to_track, 0)
	if start_frame_idx > 0:
	processing_order = range(start_frame_idx, end_frame_idx - 1, -1)
	else:
	processing_order = [] # skip reverse tracking if starting from frame 0
	else:
	end_frame_idx = min(start_frame_idx + max_frame_num_to_track, num_frames - 1)
	processing_order = range(start_frame_idx, end_frame_idx + 1)

	for frame_idx in tqdm(processing_order, desc="propagate in video", disable=not verbose):
	pred_masks_per_obj = [None] * batch_size
	for obj_idx in range(batch_size):
	obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
	# We skip those frames already in consolidated outputs (these are frames
	# that received input clicks or mask). Note that we cannot directly run
	# batched forward on them via `_run_single_frame_inference` because the
	# number of clicks on each object might be different.
	if frame_idx in obj_output_dict["cond_frame_outputs"]:
	storage_key = "cond_frame_outputs"
	current_out = obj_output_dict[storage_key][frame_idx]
	device = inference_state["device"]
	pred_masks = current_out["pred_masks"].to(device, non_blocking=True)
	if self.clear_non_cond_mem_around_input:
	# clear non-conditioning memory of the surrounding frames
	self._clear_obj_non_cond_mem_around_input(inference_state, frame_idx, obj_idx)
	else:
	storage_key = "non_cond_frame_outputs"
	current_out, pred_masks = self._run_single_frame_inference(
	inference_state=inference_state,
	output_dict=obj_output_dict,
	frame_idx=frame_idx,
	batch_size=1, # run on the slice of a single object
	is_init_cond_frame=False,
	point_inputs=None,
	mask_inputs=None,
	hidden_inputs=None,
	reverse=reverse,
	run_mem_encoder=True,
	)
	obj_output_dict[storage_key][frame_idx] = current_out

	inference_state["frames_tracked_per_obj"][obj_idx][frame_idx] = {"reverse": reverse}
	pred_masks_per_obj[obj_idx] = pred_masks

	# Resize the output mask to the original video resolution (we directly use
	# the mask scores on GPU for output to avoid any CPU conversion in between)
	if len(pred_masks_per_obj) > 1:
	all_pred_masks = torch.cat(pred_masks_per_obj, dim=0)
	else:
	all_pred_masks = pred_masks_per_obj[0]
	_, video_res_masks = self._get_orig_video_res_output(inference_state, all_pred_masks)
	yield frame_idx, obj_ids, video_res_masks

	@torch.inference_mode()
	def clear_all_prompts_in_frame(self, inference_state, frame_idx, obj_id, need_output=True):
	"""Remove all input points or mask in a specific frame for a given object."""
	obj_idx = self._obj_id_to_idx(inference_state, obj_id)

	# Clear the conditioning information on the given frame
	inference_state["point_inputs_per_obj"][obj_idx].pop(frame_idx, None)
	inference_state["mask_inputs_per_obj"][obj_idx].pop(frame_idx, None)

	temp_output_dict_per_obj = inference_state["temp_output_dict_per_obj"]
	temp_output_dict_per_obj[obj_idx]["cond_frame_outputs"].pop(frame_idx, None)
	temp_output_dict_per_obj[obj_idx]["non_cond_frame_outputs"].pop(frame_idx, None)

	# Remove the frame's conditioning output (possibly downgrading it to non-conditioning)
	obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
	out = obj_output_dict["cond_frame_outputs"].pop(frame_idx, None)
	if out is not None:
	# The frame is not a conditioning frame anymore since it's not receiving inputs,
	# so we "downgrade" its output (if exists) to a non-conditioning frame output.
	obj_output_dict["non_cond_frame_outputs"][frame_idx] = out
	inference_state["frames_tracked_per_obj"][obj_idx].pop(frame_idx, None)

	if not need_output:
	return
	# Finally, output updated masks per object (after removing the inputs above)
	obj_ids = inference_state["obj_ids"]
	is_cond = any(frame_idx in obj_temp_output_dict["cond_frame_outputs"]
	for obj_temp_output_dict in temp_output_dict_per_obj.values())
	consolidated_out = self._consolidate_temp_output_across_obj(
	inference_state,
	frame_idx,
	is_cond=is_cond,
	consolidate_at_video_res=True,
	)
	_, video_res_masks = self._get_orig_video_res_output(inference_state, consolidated_out["pred_masks_video_res"])
	return frame_idx, obj_ids, video_res_masks

	@torch.inference_mode()
	def reset_state(self, inference_state):
	"""Remove all input points or mask in all frames throughout the video."""
	self._reset_tracking_results(inference_state)
	# Remove all object ids
	inference_state["obj_id_to_idx"].clear()
	inference_state["obj_idx_to_id"].clear()
	inference_state["obj_ids"].clear()
	inference_state["point_inputs_per_obj"].clear()
	inference_state["mask_inputs_per_obj"].clear()
	inference_state["output_dict_per_obj"].clear()
	inference_state["temp_output_dict_per_obj"].clear()
	inference_state["frames_tracked_per_obj"].clear()

	def _reset_tracking_results(self, inference_state):
	"""Reset all tracking inputs and results across the videos."""
	for v in inference_state["point_inputs_per_obj"].values():
	v.clear()
	for v in inference_state["mask_inputs_per_obj"].values():
	v.clear()
	for v in inference_state["output_dict_per_obj"].values():
	v["cond_frame_outputs"].clear()
	v["non_cond_frame_outputs"].clear()
	for v in inference_state["temp_output_dict_per_obj"].values():
	v["cond_frame_outputs"].clear()
	v["non_cond_frame_outputs"].clear()
	for v in inference_state["frames_tracked_per_obj"].values():
	v.clear()

	def _get_image_feature(self, inference_state, frame_idx, batch_size):
	"""Compute the image features on a given frame."""
	# NOTE: check me ======================================================================
	# # Look up in the cache first
	# image, backbone_out = inference_state["cached_features"].get(frame_idx, (None, None))
	# if backbone_out is None:
	# # Cache miss -- we will run inference on a single image
	# device = inference_state["device"]
	# image = inference_state["images"][frame_idx].to(device).unsqueeze(0)
	# backbone_out = self.forward_image(image)
	# # Cache the most recent frame's feature (for repeated interactions with
	# # a frame; we can use an LRU cache for more frames in the future).
	# inference_state["cached_features"] = {frame_idx: (image, backbone_out)}
	# =====================================================================================

	# build cache for image features
	if not inference_state["cached_features"]:
	image = inference_state["images"].to(inference_state["device"])
	backbone_out = self.forward_image(image)
	inference_state["cached_features"] = {
	i: (image[i, None], {
	k: v[i, None] if torch.is_tensor(v) else [t[i, None] for t in v]
	for k, v in backbone_out.items()
	})
	for i in range(image.size(0))
	}

	# retrieve from cache
	image, backbone_out = inference_state["cached_features"][frame_idx]

	# expand the features to have the same dimension as the number of objects
	expanded_image = image.expand(batch_size, -1, -1, -1)
	expanded_backbone_out = {
	"backbone_fpn": backbone_out["backbone_fpn"].copy(),
	"vision_pos_enc": backbone_out["vision_pos_enc"].copy(),
	}
	for i, feat in enumerate(expanded_backbone_out["backbone_fpn"]):
	expanded_backbone_out["backbone_fpn"][i] = feat.expand(batch_size, -1, -1, -1)
	for i, pos in enumerate(expanded_backbone_out["vision_pos_enc"]):
	pos = pos.expand(batch_size, -1, -1, -1)
	expanded_backbone_out["vision_pos_enc"][i] = pos

	features = self._prepare_backbone_features(expanded_backbone_out)
	features = (expanded_image, ) + features
	return features

	def _run_single_frame_inference(
	self,
	inference_state,
	output_dict,
	frame_idx,
	batch_size,
	is_init_cond_frame,
	point_inputs,
	mask_inputs,
	hidden_inputs,
	reverse,
	run_mem_encoder,
	prev_sam_mask_logits=None,
	):
	"""Run tracking on a single frame based on current inputs and previous memory."""
	# Retrieve correct image features
	(
	_,
	_,
	current_vision_feats,
	current_vision_pos_embeds,
	feat_sizes,
	) = self._get_image_feature(inference_state, frame_idx, batch_size)

	# point and mask should not appear as input simultaneously on the same frame
	assert point_inputs is None or mask_inputs is None
	current_out = self.track_step(
	frame_idx=frame_idx,
	is_init_cond_frame=is_init_cond_frame,
	current_vision_feats=current_vision_feats,
	current_vision_pos_embeds=current_vision_pos_embeds,
	feat_sizes=feat_sizes,
	point_inputs=point_inputs,
	mask_inputs=mask_inputs,
	hidden_inputs=hidden_inputs,
	output_dict=output_dict,
	num_frames=inference_state["num_frames"],
	track_in_reverse=reverse,
	run_mem_encoder=run_mem_encoder,
	prev_sam_mask_logits=prev_sam_mask_logits,
	)

	# optionally offload the output to CPU memory to save GPU space
	storage_device = inference_state["storage_device"]
	maskmem_features = current_out["maskmem_features"]
	if maskmem_features is not None:
	maskmem_features = maskmem_features.to(inference_state["cached_features"][frame_idx][0].dtype)
	maskmem_features = maskmem_features.to(storage_device, non_blocking=True)
	pred_masks_gpu = current_out["pred_masks"]
	# potentially fill holes in the predicted masks
	if self.fill_hole_area > 0:
	pred_masks_gpu = fill_holes_in_mask_scores(pred_masks_gpu, self.fill_hole_area)
	pred_masks = pred_masks_gpu.to(storage_device, non_blocking=True)
	# "maskmem_pos_enc" is the same across frames, so we only need to store one copy of it
	maskmem_pos_enc = self._get_maskmem_pos_enc(inference_state, current_out)
	# object pointer is a small tensor, so we always keep it on GPU memory for fast access
	obj_ptr = current_out["obj_ptr"]
	object_score_logits = current_out["object_score_logits"]
	# make a compact version of this frame's output to reduce the state size
	compact_current_out = {
	"maskmem_features": maskmem_features,
	"maskmem_pos_enc": maskmem_pos_enc,
	"pred_masks": pred_masks,
	"obj_ptr": obj_ptr,
	"object_score_logits": object_score_logits,
	}
	# NOTE: reduce memory during inference ----------------------------------------
	# https://github.com/facebookresearch/sam2/issues/196
	# step = self.num_maskmem * self.memory_temporal_stride_for_eval * 2
	# drop_frame_inds = [
	# i for i in output_dict["non_cond_frame_outputs"].keys()
	# if (i > frame_idx + step if reverse else i < frame_idx - step)
	# ]
	# for idx in drop_frame_inds:
	# output_dict["non_cond_frame_outputs"].pop(idx)
	# for obj_output_dict in inference_state["output_dict_per_obj"].values():
	# obj_output_dict["non_cond_frame_outputs"].pop(idx, None)
	# -----------------------------------------------------------------------------
	return compact_current_out, pred_masks_gpu

	def _run_memory_encoder(
	self,
	inference_state,
	frame_idx,
	batch_size,
	high_res_masks,
	object_score_logits,
	is_mask_from_pts,
	):
	"""
	Run the memory encoder on `high_res_masks`. This is usually after applying
	non-overlapping constraints to object scores. Since their scores changed, their
	memory also need to be computed again with the memory encoder.
	"""
	# Retrieve correct image features
	_, _, current_vision_feats, _, feat_sizes = self._get_image_feature(inference_state, frame_idx, batch_size)
	maskmem_features, maskmem_pos_enc = self._encode_new_memory(
	current_vision_feats=current_vision_feats,
	feat_sizes=feat_sizes,
	pred_masks_high_res=high_res_masks,
	object_score_logits=object_score_logits,
	is_mask_from_pts=is_mask_from_pts,
	)

	# optionally offload the output to CPU memory to save GPU space
	storage_device = inference_state["storage_device"]
	maskmem_features = maskmem_features.to(inference_state["cached_features"][frame_idx][0].dtype)
	maskmem_features = maskmem_features.to(storage_device, non_blocking=True)
	# "maskmem_pos_enc" is the same across frames, so we only need to store one copy of it
	maskmem_pos_enc = self._get_maskmem_pos_enc(inference_state, {"maskmem_pos_enc": maskmem_pos_enc})
	return maskmem_features, maskmem_pos_enc

	def _get_maskmem_pos_enc(self, inference_state, current_out):
	"""
	`maskmem_pos_enc` is the same across frames and objects, so we cache it as
	a constant in the inference session to reduce session storage size.
	"""
	model_constants = inference_state["constants"]
	# "out_maskmem_pos_enc" should be either a list of tensors or None
	out_maskmem_pos_enc = current_out["maskmem_pos_enc"]
	if out_maskmem_pos_enc is not None:
	if "maskmem_pos_enc" not in model_constants:
	assert isinstance(out_maskmem_pos_enc, list)
	# only take the slice for one object, since it's same across objects
	maskmem_pos_enc = [x[0:1].clone() for x in out_maskmem_pos_enc]
	model_constants["maskmem_pos_enc"] = maskmem_pos_enc
	else:
	maskmem_pos_enc = model_constants["maskmem_pos_enc"]
	# expand the cached maskmem_pos_enc to the actual batch size
	batch_size = out_maskmem_pos_enc[0].size(0)
	expanded_maskmem_pos_enc = [x.expand(batch_size, -1, -1, -1) for x in maskmem_pos_enc]
	else:
	expanded_maskmem_pos_enc = None
	return expanded_maskmem_pos_enc

	@torch.inference_mode()
	def remove_object(self, inference_state, obj_id, strict=False, need_output=True):
	"""
	Remove an object id from the tracking state. If strict is True, we check whether
	the object id actually exists and raise an error if it doesn't exist.
	"""
	old_obj_idx_to_rm = inference_state["obj_id_to_idx"].get(obj_id, None)
	updated_frames = []
	# Check whether this object_id to remove actually exists and possibly raise an error.
	if old_obj_idx_to_rm is None:
	if not strict:
	return inference_state["obj_ids"], updated_frames
	raise RuntimeError(f"Cannot remove object id {obj_id} as it doesn't exist. "
	f"All existing object ids: {inference_state['obj_ids']}.")

	# If this is the only remaining object id, we simply reset the state.
	if len(inference_state["obj_id_to_idx"]) == 1:
	self.reset_state(inference_state)
	return inference_state["obj_ids"], updated_frames

	# There are still remaining objects after removing this object id. In this case,
	# we need to delete the object storage from inference state tensors.
	# Step 0: clear the input on those frames where this object id has point or mask input
	# (note that this step is required as it might downgrade conditioning frames to
	# non-conditioning ones)
	obj_input_frames_inds = set()
	obj_input_frames_inds.update(inference_state["point_inputs_per_obj"][old_obj_idx_to_rm])
	obj_input_frames_inds.update(inference_state["mask_inputs_per_obj"][old_obj_idx_to_rm])
	for frame_idx in obj_input_frames_inds:
	self.clear_all_prompts_in_frame(inference_state, frame_idx, obj_id, need_output=False)

	# Step 1: Update the object id mapping (note that it must be done after Step 0,
	# since Step 0 still requires the old object id mappings in inference_state)
	old_obj_ids = inference_state["obj_ids"]
	old_obj_inds = list(range(len(old_obj_ids)))
	remain_old_obj_inds = old_obj_inds.copy()
	remain_old_obj_inds.remove(old_obj_idx_to_rm)
	new_obj_ids = [old_obj_ids[old_idx] for old_idx in remain_old_obj_inds]
	new_obj_inds = list(range(len(new_obj_ids)))
	# build new mappings
	old_idx_to_new_idx = dict(zip(remain_old_obj_inds, new_obj_inds))
	inference_state["obj_id_to_idx"] = dict(zip(new_obj_ids, new_obj_inds))
	inference_state["obj_idx_to_id"] = dict(zip(new_obj_inds, new_obj_ids))
	inference_state["obj_ids"] = new_obj_ids

	# Step 2: For per-object tensor storage, we shift their obj_idx in the dict keys.
	def _map_keys(container):
	new_kvs = []
	for k in old_obj_inds:
	v = container.pop(k)
	if k in old_idx_to_new_idx:
	new_kvs.append((old_idx_to_new_idx[k], v))
	container.update(new_kvs)

	_map_keys(inference_state["point_inputs_per_obj"])
	_map_keys(inference_state["mask_inputs_per_obj"])
	_map_keys(inference_state["output_dict_per_obj"])
	_map_keys(inference_state["temp_output_dict_per_obj"])
	_map_keys(inference_state["frames_tracked_per_obj"])

	# Step 3: Further collect the outputs on those frames in `obj_input_frames_inds`, which
	# could show an updated mask for objects previously occluded by the object being removed
	if need_output:
	temp_output_dict_per_obj = inference_state["temp_output_dict_per_obj"]
	for frame_idx in obj_input_frames_inds:
	is_cond = any(frame_idx in obj_temp_output_dict["cond_frame_outputs"]
	for obj_temp_output_dict in temp_output_dict_per_obj.values())
	consolidated_out = self._consolidate_temp_output_across_obj(
	inference_state,
	frame_idx,
	is_cond=is_cond,
	consolidate_at_video_res=True,
	)
	_, video_res_masks = self._get_orig_video_res_output(inference_state,
	consolidated_out["pred_masks_video_res"])
	updated_frames.append((frame_idx, video_res_masks))

	return inference_state["obj_ids"], updated_frames

	def _clear_non_cond_mem_around_input(self, inference_state, frame_idx):
	"""
	Remove the non-conditioning memory around the input frame. When users provide
	correction clicks, the surrounding frames' non-conditioning memories can still
	contain outdated object appearance information and could confuse the model.

	This method clears those non-conditioning memories surrounding the interacted
	frame to avoid giving the model both old and new information about the object.
	"""
	r = self.memory_temporal_stride_for_eval
	frame_idx_begin = frame_idx - r * self.num_maskmem
	frame_idx_end = frame_idx + r * self.num_maskmem
	batch_size = self._get_obj_num(inference_state)
	for obj_idx in range(batch_size):
	obj_output_dict = inference_state["output_dict_per_obj"][obj_idx]
	non_cond_frame_outputs = obj_output_dict["non_cond_frame_outputs"]
	for t in range(frame_idx_begin, frame_idx_end + 1):
	non_cond_frame_outputs.pop(t, None)


	class SAM2VideoPredictorVOS(SAM2VideoPredictor):
	"""Optimized for the VOS setting"""

	def __init__(self, args, *kwargs):
	raise NotImplementedError("SAM2VideoPredictorVOS has not been modified for LLMs")
	super().__init__(args, *kwargs)
	self._compile_all_components()

	def _compile_all_components(self):
	print("Compiling all components for VOS setting. First time may be very slow.")
	self.memory_encoder.forward = torch.compile(
	self.memory_encoder.forward,
	mode="max-autotune",
	fullgraph=True,
	dynamic=False,
	)

	self.memory_attention.forward = torch.compile(
	self.memory_attention.forward,
	mode="max-autotune",
	fullgraph=True,
	dynamic=True, # Num. of memories varies
	)

	self.sam_prompt_encoder.forward = torch.compile(
	self.sam_prompt_encoder.forward,
	mode="max-autotune",
	fullgraph=True,
	dynamic=False, # Accuracy regression on True
	)

	self.sam_mask_decoder.forward = torch.compile(
	self.sam_mask_decoder.forward,
	mode="max-autotune",
	fullgraph=True,
	dynamic=False, # Accuracy regression on True
	)

	def forward_image(self, img_batch: torch.Tensor):
	"""
	Identical to the corresponding method in the parent (SAM2VideoPredictor), but
	cloning the backbone features and pos encoding to enable compilation.
	"""
	backbone_out = self.image_encoder(img_batch)
	if self.use_high_res_features_in_sam:
	# precompute projected level 0 and level 1 features in SAM decoder
	# to avoid running it again on every SAM click
	backbone_out["backbone_fpn"][0] = self.sam_mask_decoder.conv_s0(backbone_out["backbone_fpn"][0])
	backbone_out["backbone_fpn"][1] = self.sam_mask_decoder.conv_s1(backbone_out["backbone_fpn"][1])
	# Clone to help torch.compile
	for i in range(len(backbone_out["backbone_fpn"])):
	backbone_out["backbone_fpn"][i] = backbone_out["backbone_fpn"][i].clone()
	backbone_out["vision_pos_enc"][i] = backbone_out["vision_pos_enc"][i].clone()
	return backbone_out

	def _forward_sam_heads(
	self,
	backbone_features,
	point_inputs=None,
	mask_inputs=None,
	high_res_features=None,
	multimask_output=False,
	):
	"""
	Identical to the corresponding method in the parent (SAM2VideoPredictor), but
	cloning the outputs of prompt_encoder and mask_decoder to enable compilation.
	"""
	B = backbone_features.size(0)
	device = backbone_features.device
	assert backbone_features.size(1) == self.sam_prompt_embed_dim
	assert backbone_features.size(2) == self.sam_image_embedding_size
	assert backbone_features.size(3) == self.sam_image_embedding_size

	# a) Handle point prompts
	if point_inputs is not None:
	sam_point_coords = point_inputs["point_coords"]
	sam_point_labels = point_inputs["point_labels"]
	assert sam_point_coords.size(0) == B and sam_point_labels.size(0) == B
	else:
	# If no points are provide, pad with an empty point (with label -1)
	sam_point_coords = torch.zeros(B, 1, 2, device=device)
	sam_point_labels = -torch.ones(B, 1, dtype=torch.int32, device=device)

	# b) Handle mask prompts
	if mask_inputs is not None:
	# If mask_inputs is provided, downsize it into low-res mask input if needed
	# and feed it as a dense mask prompt into the SAM mask encoder
	assert len(mask_inputs.shape) == 4 and mask_inputs.shape[:2] == (B, 1)
	if mask_inputs.shape[-2:] != self.sam_prompt_encoder.mask_input_size:
	sam_mask_prompt = F.interpolate(
	mask_inputs.float(),
	size=self.sam_prompt_encoder.mask_input_size,
	align_corners=False,
	mode="bilinear",
	antialias=True, # use antialias for downsampling
	)
	else:
	sam_mask_prompt = mask_inputs
	else:
	# Otherwise, simply feed None (and SAM's prompt encoder will add
	# a learned `no_mask_embed` to indicate no mask input in this case).
	sam_mask_prompt = None

	sparse_embeddings, dense_embeddings = self.sam_prompt_encoder(
	points=(sam_point_coords, sam_point_labels),
	boxes=None,
	masks=sam_mask_prompt,
	)
	# Clone image_pe and the outputs of sam_prompt_encoder
	# to enable compilation
	sparse_embeddings = sparse_embeddings.clone()
	dense_embeddings = dense_embeddings.clone()
	image_pe = self.sam_prompt_encoder.get_dense_pe().clone()
	(
	low_res_multimasks,
	ious,
	sam_output_tokens,
	object_score_logits,
	) = self.sam_mask_decoder(
	image_embeddings=backbone_features,
	image_pe=image_pe,
	sparse_prompt_embeddings=sparse_embeddings,
	dense_prompt_embeddings=dense_embeddings,
	multimask_output=multimask_output,
	repeat_image=False, # the image is already batched
	high_res_features=high_res_features,
	)
	# Clone the output of sam_mask_decoder
	# to enable compilation
	low_res_multimasks = low_res_multimasks.clone()
	ious = ious.clone()
	sam_output_tokens = sam_output_tokens.clone()
	object_score_logits = object_score_logits.clone()

	if self.pred_obj_scores:
	is_obj_appearing = object_score_logits > 0

	# Mask used for spatial memories is always a hard choice between obj and no obj,
	# consistent with the actual mask prediction
	low_res_multimasks = torch.where(
	is_obj_appearing[:, None, None],
	low_res_multimasks,
	NO_OBJ_SCORE,
	)

	# convert masks from possibly bfloat16 (or float16) to float32
	low_res_multimasks = low_res_multimasks.float()
	high_res_multimasks = F.interpolate(
	low_res_multimasks,
	size=(self.image_size, self.image_size),
	mode="bilinear",
	align_corners=False,
	)

	sam_output_token = sam_output_tokens[:, 0]
	if multimask_output:
	# take the best mask prediction (with the highest IoU estimation)
	best_iou_inds = torch.argmax(ious, dim=-1)
	batch_inds = torch.arange(B, device=device)
	low_res_masks = low_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
	high_res_masks = high_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
	if sam_output_tokens.size(1) > 1:
	sam_output_token = sam_output_tokens[batch_inds, best_iou_inds]
	else:
	low_res_masks, high_res_masks = low_res_multimasks, high_res_multimasks

	# Extract object pointer from the SAM output token (with occlusion handling)
	obj_ptr = self.obj_ptr_proj(sam_output_token)
	if self.pred_obj_scores:
	# Allow soft no obj ptr, unlike for masks
	if self.soft_no_obj_ptr:
	lambda_is_obj_appearing = object_score_logits.sigmoid()
	else:
	lambda_is_obj_appearing = is_obj_appearing.float()

	if self.fixed_no_obj_ptr:
	obj_ptr = lambda_is_obj_appearing * obj_ptr
	obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr

	return (
	low_res_multimasks,
	high_res_multimasks,
	ious,
	low_res_masks,
	high_res_masks,
	obj_ptr,
	object_score_logits,
	)

	def _encode_new_memory(
	self,
	current_vision_feats,
	feat_sizes,
	pred_masks_high_res,
	object_score_logits,
	is_mask_from_pts,
	):
	"""
	Identical to the corresponding method in the parent (SAM2VideoPredictor), but
	cloning the memories and their pos enc to enable compilation.
	"""
	B = current_vision_feats[-1].size(1) # batch size on this frame
	C = self.hidden_dim
	H, W = feat_sizes[-1] # top-level (lowest-resolution) feature size
	# top-level feature, (HW)BC => BCHW
	pix_feat = current_vision_feats[-1].permute(1, 2, 0).view(B, C, H, W)
	if self.non_overlap_masks_for_mem_enc and not self.training:
	# optionally, apply non-overlapping constraints to the masks (it's applied
	# in the batch dimension and should only be used during eval, where all
	# the objects come from the same video under batch size 1).
	pred_masks_high_res = self._apply_non_overlapping_constraints(pred_masks_high_res)
	# scale the raw mask logits with a temperature before applying sigmoid
	binarize = self.binarize_mask_from_pts_for_mem_enc and is_mask_from_pts
	if binarize and not self.training:
	mask_for_mem = (pred_masks_high_res > 0).float()
	else:
	# apply sigmoid on the raw mask logits to turn them into range (0, 1)
	mask_for_mem = torch.sigmoid(pred_masks_high_res)
	# apply scale and bias terms to the sigmoid probabilities
	if self.sigmoid_scale_for_mem_enc != 1.0:
	mask_for_mem = mask_for_mem * self.sigmoid_scale_for_mem_enc
	if self.sigmoid_bias_for_mem_enc != 0.0:
	mask_for_mem = mask_for_mem + self.sigmoid_bias_for_mem_enc
	maskmem_out = self.memory_encoder(
	pix_feat,
	mask_for_mem,
	skip_mask_sigmoid=True # sigmoid already applied
	)
	# Clone the feats and pos_enc to enable compilation
	maskmem_features = maskmem_out["vision_features"].clone()
	maskmem_pos_enc = [m.clone() for m in maskmem_out["vision_pos_enc"]]
	# add a no-object embedding to the spatial memory to indicate that the frame
	# is predicted to be occluded (i.e. no object is appearing in the frame)
	if self.no_obj_embed_spatial is not None:
	is_obj_appearing = (object_score_logits > 0).float()
	maskmem_features += (1 - is_obj_appearing[..., None, None]
	) * self.no_obj_embed_spatial[..., None, None].expand(*maskmem_features.shape)

	return maskmem_features, maskmem_pos_enc