warrenwjk committed · Commit fc8b0b1 · verified · 1 Parent(s): 325ed02

Update VisionSdpaAttention to support the memory-efficient backend.


Because of a bug (https://github.com/pytorch/pytorch/issues/127523), the memory-efficient backend for scaled_dot_product_attention currently only supports 4D data.

Typically, users switch to VisionSdpaAttention only when their hardware does not support FlashAttention2 (e.g., Turing-architecture GPUs such as the 2080 Ti, and earlier). On such hardware, however, memory usage increases dramatically with input size. This implementation helps reduce memory consumption, which is the bottleneck in 99% of cases.
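As a self-contained sketch of the workaround (shapes, sizes, and the CUDA device below are illustrative assumptions, not values from the repository):

    import torch
    import torch.nn.functional as F
    from torch.nn.attention import SDPBackend, sdpa_kernel

    # Illustrative sizes; VisionSdpaAttention itself works with 3D
    # (seq_length, num_heads, head_dim) tensors.
    seq_length, num_heads, head_dim = 128, 8, 64
    q = torch.randn(seq_length, num_heads, head_dim, device="cuda", dtype=torch.float16)
    k = torch.randn_like(q)
    v = torch.randn_like(q)

    # Block-diagonal boolean mask over two packed sequences, analogous to the
    # cu_seqlens-based mask in the model.
    attention_mask = torch.zeros(1, seq_length, seq_length, dtype=torch.bool, device="cuda")
    attention_mask[..., :64, :64] = True
    attention_mask[..., 64:, 64:] = True

    # The memory-efficient kernel only accepts 4D q/k/v, so add a batch dim:
    # (1, num_heads, seq_length, head_dim).
    q4 = q.transpose(0, 1).unsqueeze(0)
    k4 = k.transpose(0, 1).unsqueeze(0)
    v4 = v.transpose(0, 1).unsqueeze(0)

    with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
        out = F.scaled_dot_product_attention(q4, k4, v4, attention_mask, dropout_p=0.0)

    # Back to (seq_length, num_heads * head_dim).
    out = out.squeeze(0).transpose(0, 1).reshape(seq_length, -1)

Note that sdpa_kernel restricts dispatch to the listed backends, so this raises at call time if the memory-efficient kernel cannot handle the inputs, rather than silently falling back to the math backend.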

Files changed (1)
  1. modeling_dots_vision.py +14 -5
modeling_dots_vision.py CHANGED
@@ -274,12 +274,21 @@ class VisionSdpaAttention(nn.Module):
         for i in range(1, len(cu_seqlens)):
             attention_mask[..., cu_seqlens[i - 1]: cu_seqlens[i], cu_seqlens[i - 1]: cu_seqlens[i]] = True
 
-        q = q.transpose(0, 1)
-        k = k.transpose(0, 1)
-        v = v.transpose(0, 1)
+        # Convert q, k, v to 4D so the memory-efficient backend can be used: (1, num_heads, seq_length, head_dim)
+        q = q.transpose(0, 1).unsqueeze(0)  # (1, num_heads, seq_length, head_dim)
+        k = k.transpose(0, 1).unsqueeze(0)
+        v = v.transpose(0, 1).unsqueeze(0)
 
-        attn_output = F.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0)
-        attn_output = attn_output.transpose(0, 1)
+        # The kernel requires the mask's last dim to be contiguous; see: https://github.com/pytorch/pytorch/issues/127523
+        if attention_mask.stride(-1) != 1:
+            attention_mask = torch.empty_like(attention_mask, memory_format=torch.contiguous_format).copy_(attention_mask)
+
+        # Use the memory-efficient backend.
+        from torch.nn.attention import SDPBackend, sdpa_kernel
+        with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
+            attn_output = F.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0)
+
+        attn_output = attn_output.squeeze(0).transpose(0, 1)  # (seq_length, num_heads, head_dim)
         attn_output = attn_output.reshape(seq_length, -1)
 
         attn_output = self.proj(attn_output)
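For reference, the stride check above guards against masks whose last dimension is not contiguous, which the memory-efficient kernel rejects. A minimal sketch of the failure mode and the normalization (the mask shape here is made up for illustration):

    import torch

    # A boolean mask can end up with a non-contiguous last dimension,
    # e.g. after a transpose.
    mask = torch.zeros(1, 8, 16, dtype=torch.bool).transpose(-1, -2)
    print(mask.stride(-1))  # 16 -- the last dim is strided, not contiguous

    # Same normalization as in the commit: materialize a contiguous copy
    # before handing the mask to scaled_dot_product_attention.
    if mask.stride(-1) != 1:
        mask = torch.empty_like(mask, memory_format=torch.contiguous_format).copy_(mask)
    print(mask.stride(-1))  # 1 -- safe to pass to the memory-efficient kernel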