Remove hardcoded .cuda() calls to support a single forward pass on CPU and ensure DeepSeekOCR model compatibility with transformers==4.52.4
#54 by kamalrajkannanmcw (opened)

Files changed:

- modeling_deepseekocr.py +1 -1
- modeling_deepseekv2.py +8 -6
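
For reviewers, a minimal CPU-only smoke test of the intended outcome could look like the sketch below. The checkpoint id, the `attn_implementation` choice, and the text-only forward call are illustrative assumptions rather than part of this PR; the only point is that a forward pass should no longer require CUDA.

```python
import torch
from transformers import AutoModel, AutoTokenizer

# Assumed checkpoint id for the patched remote code; adjust as needed.
model_id = "deepseek-ai/DeepSeek-OCR"

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_id,
    trust_remote_code=True,
    attn_implementation="eager",  # avoid flash-attn, which is CUDA-only
    torch_dtype=torch.float32,
)
model.to("cpu").eval()

# A text-only forward pass is enough to check that no hardcoded .cuda()
# call fires on a machine without a GPU.
inputs = tokenizer("hello", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
print(type(outputs).__name__)
```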
    	
modeling_deepseekocr.py (CHANGED)

```diff
@@ -502,7 +502,7 @@ class DeepseekOCRModel(DeepseekV2Model):
                         images_in_this_batch = torch.cat(images_in_this_batch, dim=0)
                         # exit()
 
-                        inputs_embeds[idx].masked_scatter_(images_seq_mask[idx].unsqueeze(-1).cuda(), images_in_this_batch)
+                        inputs_embeds[idx].masked_scatter_(images_seq_mask[idx].unsqueeze(-1), images_in_this_batch)
 
                     idx += 1
 
```
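
What changed here is only the hardcoded `.cuda()` on the mask: `masked_scatter_` now runs on whatever device `inputs_embeds` already lives on, so a CPU-only forward pass no longer fails on a machine without CUDA. A minimal sketch of the pattern with made-up shapes (names and sizes are illustrative, not taken from the model):

```python
import torch

seq_len, hidden = 6, 4
inputs_embeds = torch.zeros(seq_len, hidden)                       # text embeddings (CPU here)
images_seq_mask = torch.tensor([0, 1, 1, 0, 1, 0], dtype=torch.bool)
image_features = torch.randn(int(images_seq_mask.sum()), hidden)   # stand-in vision features

# The boolean mask broadcasts over the hidden dimension; since it is no longer
# forced onto CUDA, the in-place scatter follows inputs_embeds' device.
inputs_embeds.masked_scatter_(images_seq_mask.unsqueeze(-1), image_features)
```

The same line works unchanged when the tensors live on a GPU, since every operand then shares one device.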
    	
modeling_deepseekv2.py (CHANGED)
```diff
@@ -36,7 +36,6 @@ from transformers.cache_utils import Cache, DynamicCache
 from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
 from transformers.models.llama.modeling_llama import (
     LlamaAttention,
-    LlamaFlashAttention2
 )
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
@@ -60,6 +59,8 @@ from transformers.utils.import_utils import is_torch_fx_available
 
 from .configuration_deepseek_v2 import DeepseekV2Config
 
+from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding
+
 if is_flash_attn_2_available():
     from flash_attn import flash_attn_func, flash_attn_varlen_func
     from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
@@ -1235,7 +1236,6 @@ ATTENTION_CLASSES = {
     "mla_flash_attention_2": DeepseekV2FlashAttention2,
 
     "mha_eager": LlamaAttention,
-    "mha_flash_attention_2": LlamaFlashAttention2
 }
```
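
Context for the two removals above: transformers 4.52.4 no longer defines `LlamaFlashAttention2`; the attention backend is selected through `config._attn_implementation` inside the single `LlamaAttention` class instead. If compatibility with older transformers releases were also desired, a guarded import along these lines would be one option (a sketch only, not what this PR does):

```python
# Hypothetical fallback, not part of this PR: tolerate both old and new
# transformers layouts for the Llama attention classes.
try:
    from transformers.models.llama.modeling_llama import LlamaFlashAttention2
except ImportError:  # removed in recent transformers releases
    LlamaFlashAttention2 = None
```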
```diff
@@ -1269,6 +1269,8 @@ class DeepseekV2DecoderLayer(nn.Module):
         self.post_attention_layernorm = DeepseekV2RMSNorm(
             config.hidden_size, eps=config.rms_norm_eps
         )
+        # Compute position_embeddings
+        self.rotary_emb = LlamaRotaryEmbedding(config=config)
 
     def forward(
         self,
@@ -1303,15 +1305,18 @@ class DeepseekV2DecoderLayer(nn.Module):
         residual = hidden_states
 
         hidden_states = self.input_layernorm(hidden_states)
+
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
 
         # Self Attention
-        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+        hidden_states, self_attn_weights = self.self_attn(
             hidden_states=hidden_states,
             attention_mask=attention_mask,
             position_ids=position_ids,
             past_key_value=past_key_value,
             output_attentions=output_attentions,
             use_cache=use_cache,
+            position_embeddings=position_embeddings,
             **kwargs,
         )
         hidden_states = residual + hidden_states
@@ -1327,9 +1332,6 @@ class DeepseekV2DecoderLayer(nn.Module):
         if output_attentions:
             outputs += (self_attn_weights,)
 
-        if use_cache:
-            outputs += (present_key_value,)
-
         return outputs
```
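
The decoder-layer changes above are needed because `LlamaAttention` in transformers 4.52.4 expects the rotary `(cos, sin)` tensors to be precomputed and passed in as `position_embeddings` rather than derived internally from `position_ids`. A self-contained sketch of that call pattern, using a toy Llama config in place of the model's own `DeepseekV2Config`:

```python
import torch
from transformers import LlamaConfig
from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding

# Toy config standing in for the DeepseekV2Config that the PR actually passes.
config = LlamaConfig(hidden_size=64, num_attention_heads=4, max_position_embeddings=128)

rotary_emb = LlamaRotaryEmbedding(config=config)
hidden_states = torch.randn(1, 10, config.hidden_size)
position_ids = torch.arange(10).unsqueeze(0)

# These (cos, sin) tensors are what the patched decoder layer forwards to
# self.self_attn via the new position_embeddings argument.
cos, sin = rotary_emb(hidden_states, position_ids)
print(cos.shape, sin.shape)  # each is (batch, seq_len, head_dim) -> (1, 10, 16)
```

Note that upstream Llama builds one `LlamaRotaryEmbedding` on the model and passes the tuple down to every layer; instantiating it per decoder layer, as this PR does, is functionally equivalent but duplicates the module in each layer.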
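The dropped `present_key_value` handling follows from the same refactor: in transformers 4.52.4 the attention layer returns only `(hidden_states, attn_weights)` and mutates the `Cache` object passed as `past_key_value` in place, so the decoder layer has nothing separate to append to `outputs`. A small sketch of that in-place cache behaviour (shapes are arbitrary):

```python
import torch
from transformers.cache_utils import DynamicCache

cache = DynamicCache()
key = torch.randn(1, 4, 10, 16)    # (batch, num_kv_heads, seq_len, head_dim)
value = torch.randn(1, 4, 10, 16)

# Attention layers call cache.update(...) themselves; the growing KV state
# lives inside the cache object instead of being returned per layer.
cache.update(key, value, layer_idx=0)
print(cache.get_seq_length(0))     # 10
```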
