enable intel XPU platform (#7)
- add intel xpu platform support (feaa3d5aef9b856f0f68ce753342014b21ea6266)
Co-authored-by: Liu,Kaixuan <Kaixuanliu@users.noreply.huggingface.co>
- modeling_cogvlm.py +9 -3
- util.py +7 -1
- visual.py +2 -0
    	
modeling_cogvlm.py CHANGED

@@ -8,6 +8,7 @@ from torch import nn
 from torch.nn import CrossEntropyLoss
 from torchvision import transforms
 from einops import rearrange
+import transformers
 from transformers import PreTrainedModel, PreTrainedTokenizer
 from transformers.utils.logging import get_logger
 from transformers.activations import ACT2FN
@@ -723,9 +724,14 @@ class CogVLMVideoForCausalLM(CogVLMPreTrainedModel):
         standardize_cache_format: bool = False,
     ) -> Dict[str, Any]:
         # update past_key_values
-        cache_name, cache = self._extract_past_from_model_output(
-            outputs, standardize_cache_format=standardize_cache_format
-        )
+        if transformers.__version__ >= "4.44.0":
+            cache_name, cache = self._extract_past_from_model_output(
+                outputs
+            )
+        else:
+            cache_name, cache = self._extract_past_from_model_output(
+                outputs, standardize_cache_format=standardize_cache_format
+            )
        model_kwargs[cache_name] = cache

        if getattr(outputs, "state", None) is not None:
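Note on the version gate: the new branch compares transformers.__version__ as a plain string, which can misorder versions (lexicographically, "4.5.0" >= "4.44.0" is True). A minimal sketch, not part of this change, of the same gate expressed as a semantic version comparison with the packaging library; the helper name is hypothetical:

# Hypothetical helper (not in the PR): choose the _extract_past_from_model_output
# call style based on a parsed version instead of a raw string comparison.
from packaging import version
import transformers

def use_new_extract_past_signature() -> bool:
    # The diff above calls _extract_past_from_model_output without the
    # standardize_cache_format argument on transformers >= 4.44.0.
    return version.parse(transformers.__version__) >= version.parse("4.44.0")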
    	
util.py CHANGED

@@ -7,6 +7,10 @@ import torch.nn.functional as F
 import triton
 import triton.language as tl

+device_contexts = {
+    'cuda': torch.cuda.device,
+    'xpu': torch.xpu.device
+}

 @triton.jit
 def rotary_kernel(
@@ -197,7 +201,9 @@ def apply_rotary(

     # Need this, otherwise Triton tries to launch from cuda:0 and we get
     # ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?)
-    with torch.cuda.device(x.device.index):
+    device_type = x.device.type
+    assert device_type in device_contexts
+    with device_contexts[device_type](x.device.index):
        rotary_kernel[grid](
            output,  # data ptrs
            x,
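For context, the device_contexts mapping added above lets apply_rotary activate whichever device owns the input tensor before launching the Triton kernel, instead of hard-coding torch.cuda.device. A minimal sketch of that dispatch pattern, assuming a PyTorch build with XPU support (torch.xpu); launch_on_tensor_device and the kernel argument are hypothetical stand-ins:

import torch

# Same mapping as in util.py: device type -> context manager that
# activates a device index for kernel launches.
device_contexts = {
    'cuda': torch.cuda.device,
    'xpu': torch.xpu.device,   # requires a PyTorch build with XPU support
}

def launch_on_tensor_device(kernel, grid, x: torch.Tensor, *args):
    # 'kernel' is a hypothetical @triton.jit function standing in for rotary_kernel.
    device_type = x.device.type
    assert device_type in device_contexts, f"unsupported device type: {device_type}"
    # Enter the matching device context so Triton launches on x's device
    # rather than defaulting to cuda:0.
    with device_contexts[device_type](x.device.index):
        kernel[grid](x, *args)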
    	
visual.py CHANGED

@@ -75,6 +75,8 @@ class Attention(nn.Module):
         out = out.transpose(2, 1)
         # breakpoint()
         # output = self.dense(out.reshape(B, L, -1))
+        if not out.is_contiguous():
+            out = out.contiguous()
         output = self.dense(out.view(B, L, -1))
         output = self.output_dropout(output)
         return output
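The contiguity guard in visual.py matters because transpose returns a non-contiguous view and Tensor.view raises on such tensors, so the reshape into (B, L, -1) needs a contiguous buffer. A small illustration with made-up shapes:

import torch

B, L, H, D = 2, 4, 3, 5
out = torch.randn(B, H, L, D).transpose(2, 1)  # (B, L, H, D), non-contiguous view

assert not out.is_contiguous()
try:
    out.view(B, L, -1)  # view requires a compatible (here: contiguous) memory layout
except RuntimeError as err:
    print("view failed:", err)

# The guard added in the diff: copy into contiguous memory only when needed.
if not out.is_contiguous():
    out = out.contiguous()
flat = out.view(B, L, -1)  # (B, L, H * D)
print(flat.shape)  # torch.Size([2, 4, 15])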
