VLA-Adapter committed
Commit e915ad2 · verified · 1 Parent(s): c7a120c

Update modeling_prismatic.py

Files changed (1):
  1. modeling_prismatic.py +31 -31
modeling_prismatic.py CHANGED

@@ -338,7 +338,7 @@ class PrismaticForConditionalGeneration(PrismaticPreTrainedModel):
 f"there might be inference-time regressions due to dependency changes. If in doubt, please"
 f"use the above versions."
 )
-# import pdb; pdb.set_trace()
+
 # Instantiate PrismaticVisionBackbone (w/ Potential Fused Backbone)
 self.vision_backbone = PrismaticVisionBackbone(
 config.use_fused_vision_backbone, config.image_sizes, config.timm_model_ids, config.timm_override_act_layers
@@ -432,7 +432,7 @@ class PrismaticForConditionalGeneration(PrismaticPreTrainedModel):

 # Move the noisy action features into their correct positions
 # print(noisy_action_features.size())
-# import pdb; pdb.set_trace()
+
 repositioned_noisy_action_features[batch_indices, masked_indices] = noisy_action_features

 # Combine original input embeddings and noisy action embeddings using the mask
@@ -475,7 +475,7 @@ class PrismaticForConditionalGeneration(PrismaticPreTrainedModel):
 def _build_multimodal_attention(self, input_embeddings, projected_patch_embeddings, attention_mask):
 """Build multimodal embeddings and attention mask"""
 # Update attention mask
-# import pdb; pdb.set_trace()
+
 projected_patch_attention_mask = None
 if attention_mask is not None:
 projected_patch_attention_mask = torch.full(
@@ -589,12 +589,12 @@ class PrismaticForConditionalGeneration(PrismaticPreTrainedModel):
 # Get input embeddings (from language model embeddings)
 input_embeddings = self.get_input_embeddings()(input_ids) # (B, seq_len, D)

-# import pdb; pdb.set_trace()
+
 # Extract action masks
 all_actions_mask = self._process_action_masks(labels)

 # Extract the language portion of the input embeddings (i.e. remove the action tokens portion)
-# import pdb; pdb.set_trace()
+
 # print(input_embeddings[~all_actions_mask].size())
 language_embeddings = input_embeddings[~all_actions_mask].reshape(
 input_embeddings.shape[0], -1, input_embeddings.shape[2]
@@ -624,7 +624,7 @@ class PrismaticForConditionalGeneration(PrismaticPreTrainedModel):

 # Process action embeddings
 if noisy_actions is not None:
-# import pdb; pdb.set_trace()
+
 if self.version == 'v1':
 # action_queries = self.action_queries.weight # (1, h)
 # action_queries = action_queries.view(1, 1, action_queries.shape[1]).repeat(input_embeddings.shape[0], 1, 1) # (b, chunk_size, h)
@@ -642,7 +642,7 @@ class PrismaticForConditionalGeneration(PrismaticPreTrainedModel):
 all_actions_mask = self._process_action_masks(labels)
 input_embeddings = self._replace_input_embeddings(
 input_embeddings, all_actions_mask, action_queries)
-# import pdb; pdb.set_trace()
+

 else:
 # Get mask corresponding to all action tokens
@@ -665,7 +665,7 @@ class PrismaticForConditionalGeneration(PrismaticPreTrainedModel):
 all_actions_mask = self._process_action_masks(labels)
 input_embeddings = self._replace_input_embeddings(
 input_embeddings, all_actions_mask, action_queries)
-# import pdb; pdb.set_trace()
+
 else:
 # Replace the embeddings of the action tokens with zeros
 # (Later on, the positional embeddings will be added to them)
@@ -677,14 +677,14 @@ class PrismaticForConditionalGeneration(PrismaticPreTrainedModel):
 multimodal_embeddings, multimodal_attention_mask = self._build_multimodal_attention(
 input_embeddings, projected_patch_embeddings, attention_mask
 )
-# import pdb; pdb.set_trace()
+
 # Build labels for multimodal sequence if needed
 multimodal_labels = self._build_multimodal_labels(labels, projected_patch_embeddings)

-# import pdb; pdb.set_trace()
+
 # Dispatch to language model
 if self.version == 'v1':
-# import pdb; pdb.set_trace()
+
 language_model_output = self.language_model(
 input_ids=None,
 attention_mask=multimodal_attention_mask,
@@ -697,7 +697,7 @@ class PrismaticForConditionalGeneration(PrismaticPreTrainedModel):
 output_hidden_states=output_hidden_states,
 return_dict=return_dict,
 )
-# import pdb; pdb.set_trace()
+
 else:
 language_model_output = self.language_model(
 input_ids=None,
@@ -802,7 +802,7 @@ class OpenVLAForActionPrediction(PrismaticForConditionalGeneration):
 def __init__(self, config: OpenVLAConfig) -> None:
 super().__init__(config)
 self.norm_stats = config.norm_stats
-# import pdb; pdb.set_trace()
+

 # Compute action bins
 self.bins = np.linspace(-1, 1, config.n_action_bins)
@@ -1048,7 +1048,7 @@ class OpenVLAForActionPrediction(PrismaticForConditionalGeneration):
 # Clone embedding for reuse in each timestep
 curr_noisy_actions = noise

-# import pdb; pdb.set_trace()
+

 action_queries = self.action_queries.weight # (1, h)
 action_queries = action_queries.view(1, action_queries.shape[0], action_queries.shape[1]).repeat(input_embeddings.shape[0], 1, 1) # (b, chunk_size, h)
@@ -1068,7 +1068,7 @@ class OpenVLAForActionPrediction(PrismaticForConditionalGeneration):
 input_embeddings, projected_patch_embeddings, attention_mask
 )

-# import pdb; pdb.set_trace()
+
 # Forward pass through language model
 language_model_output = self.language_model(
 input_ids=None,
@@ -1083,21 +1083,21 @@ class OpenVLAForActionPrediction(PrismaticForConditionalGeneration):
 return_dict=True,
 )
 multi_layer_hidden_states = []
-# import pdb; pdb.set_trace()
+
 for item in language_model_output.hidden_states[0:]:
 # last_hidden_states = output.hidden_states[-1] # (B, seq_len, D)
 # Get hidden states for text portion of prompt+response (after the vision patches)
 text_hidden_states = item
 # Get hidden states for action portion of response
 actions_hidden_states = text_hidden_states[:, NUM_PATCHES+ NUM_PROMPT_TOKENS : NUM_PATCHES + NUM_PROMPT_TOKENS + NUM_TOKENS, :,].reshape(1, 1, NUM_TOKENS, -1).to(torch.bfloat16)
-# import pdb; pdb.set_trace()
+
 batch_size = item.shape[0]
 task_latten_states = item[:, :NUM_PATCHES].reshape(batch_size, 1, NUM_PATCHES , -1)
 all_hidden_states = torch.cat((task_latten_states, actions_hidden_states),2)
 multi_layer_hidden_states.append(all_hidden_states)
-# import pdb; pdb.set_trace()
+
 multi_layer_hidden_states = torch.cat(multi_layer_hidden_states, dim = 1)
-# import pdb; pdb.set_trace()
+



@@ -1176,21 +1176,21 @@ class OpenVLAForActionPrediction(PrismaticForConditionalGeneration):

 # Extract hidden states for action tokens
 multi_layer_hidden_states = []
-# import pdb; pdb.set_trace()
+
 for item in language_model_output.hidden_states[0:]:
 # last_hidden_states = output.hidden_states[-1] # (B, seq_len, D)
 # Get hidden states for text portion of prompt+response (after the vision patches)
 text_hidden_states = item
 # Get hidden states for action portion of response
 actions_hidden_states = text_hidden_states[:, NUM_PATCHES+ NUM_PROMPT_TOKENS : NUM_PATCHES + NUM_PROMPT_TOKENS + NUM_TOKENS, :,].reshape(1, 1, NUM_TOKENS, -1).to(torch.bfloat16)
-# import pdb; pdb.set_trace()
+
 batch_size = item.shape[0]
 task_latten_states = item[:, :NUM_PATCHES].reshape(batch_size, 1, NUM_PATCHES , -1)
 all_hidden_states = torch.cat((task_latten_states, actions_hidden_states),2)
 multi_layer_hidden_states.append(all_hidden_states)
-# import pdb; pdb.set_trace()
+
 multi_layer_hidden_states = torch.cat(multi_layer_hidden_states, dim = 1)
-# import pdb; pdb.set_trace()
+

 # Handle different prediction methods
 if action_head is not None:
@@ -1311,11 +1311,11 @@ class OpenVLAForActionPrediction(PrismaticForConditionalGeneration):
 Returns:
 Tuple of (unnormalized_actions, action_hidden_states)
 """
-# import pdb; pdb.set_trace()
+
 # If the special empty token ('') does not already appear after the colon (':') token in the prompt
 # (after "OUT:" or "ASSISTANT:"), insert it to match the inputs seen at training time

-# 如果是 minivla, 不用加这个判断!!!!!
+
 # if not torch.all(input_ids[:, -1] == 29871):
 # input_ids = torch.cat(
 # (input_ids, torch.unsqueeze(torch.Tensor([29871]).long(), dim=0).to(input_ids.device)), dim=1
@@ -1332,7 +1332,7 @@ class OpenVLAForActionPrediction(PrismaticForConditionalGeneration):
 # Get number of tokens in prompt (excluding the start token)
 NUM_PROMPT_TOKENS = input_ids.shape[-1] - 1 # Subtract action tokens and stop token

-# import pdb; pdb.set_trace()
+

 # Prepare inputs by adding necessary tokens
 input_ids, attention_mask = self._prepare_input_for_action_prediction(input_ids, attention_mask)
@@ -1362,7 +1362,7 @@ class OpenVLAForActionPrediction(PrismaticForConditionalGeneration):
 projected_patch_embeddings = self._process_proprio_features(
 projected_patch_embeddings, proprio, proprio_projector
 )
-# import pdb; pdb.set_trace()
+
 # Use diffusion if provided, otherwise use regression or discrete prediction
 use_diffusion = noisy_action_projector is not None and hasattr(action_head, "noise_scheduler")
 use_flow_matching = noisy_action_projector is not None and hasattr(action_head, "sample_actions")
@@ -1380,7 +1380,7 @@ class OpenVLAForActionPrediction(PrismaticForConditionalGeneration):
 if use_diffusion:
 NUM_PATCHES += 1

-# import pdb; pdb.set_trace()
+
 if use_flow_matching:
 # Sample random noise with shape equal to output action, used as the starting state for flow matching
 noise = action_head.sample_noise((1, NUM_ACTIONS_CHUNK, ACTION_DIM),device=input_embeddings.device, dtype=input_embeddings.dtype)
@@ -1403,10 +1403,10 @@ class OpenVLAForActionPrediction(PrismaticForConditionalGeneration):
 noise = torch.randn(
 size=(1, NUM_ACTIONS_CHUNK, ACTION_DIM), device=input_embeddings.device, dtype=input_embeddings.dtype
 )
-# import pdb; pdb.set_trace()
+
 if self.version == 'v1':

-# import pdb; pdb.set_trace()
+
 # Run diffusion-based prediction
 normalized_actions, actions_hidden_states = self._run_diffusion_prediction_V1(
 input_embeddings, # [1, 86, 4096]
@@ -1465,7 +1465,7 @@ class OpenVLAForActionPrediction(PrismaticForConditionalGeneration):
 action_head,
 )

-# import pdb; pdb.set_trace()
+
 # Unnormalize predicted actions
 actions = self._unnormalize_actions(normalized_actions, unnorm_key)

 