Optimize MPS to use bfloat16 precision
After testing, MPS can handle bfloat16 precision without autocast.
This reduces memory usage while maintaining stable output.
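A quick way to sanity-check that claim (a minimal sketch; assumes an Apple Silicon machine and a recent PyTorch build):

```python
import torch

# Minimal check: a bfloat16 matmul runs natively on MPS,
# with no autocast context needed (falls back to CPU elsewhere).
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
a = torch.randn(64, 64, device=device, dtype=torch.bfloat16)
b = torch.randn(64, 64, device=device, dtype=torch.bfloat16)
out = a @ b
print(out.dtype, out.device)  # torch.bfloat16 mps:0 on Apple Silicon
```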
Changes from previous commit:
- Use bfloat16 for images on both MPS and CUDA (unified dtype)
- Keep nullcontext() for MPS (no autocast; autocast causes issues there)
- CUDA path unchanged (still uses bfloat16 autocast); a sketch of the combined pattern follows this list
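The sketch below mirrors the diff further down; `image_dtype` and `autocast_ctx` are the names used in the patch, while the surrounding scaffolding is a stand-in:

```python
import torch
from contextlib import nullcontext

# Unified dtype: images are cast to bfloat16 on both MPS and CUDA.
image_dtype = torch.bfloat16

# Autocast only on CUDA; MPS runs in plain bfloat16 via nullcontext().
device_type = "mps" if torch.backends.mps.is_available() else "cuda"
autocast_ctx = (
    nullcontext()
    if device_type == "mps"
    else torch.autocast("cuda", dtype=torch.bfloat16)
)

with autocast_ctx:
    with torch.no_grad():
        pass  # generation would run here, e.g. model.generate(...)
```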
Key insight: The row-wise embedding assignment fix was the critical
change. With that in place, bfloat16 works stably on MPS without
needing fp32 precision.
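The embedding fix itself lives in the previous commit; purely to illustrate what "row-wise assignment" means here (all names and shapes below are hypothetical, not the repo's actual code):

```python
import torch

# Hypothetical shapes: (batch, seq, hidden) text embeddings and a flat
# stack of image-token features, one row per image-token position.
inputs_embeds = torch.zeros(2, 8, 16)
image_mask = torch.zeros(2, 8, dtype=torch.bool)
image_mask[0, 2:5] = True
image_mask[1, 1:4] = True
image_features = torch.randn(6, 16)

# Row-wise assignment: write each batch row's image features separately
# instead of one flat boolean-masked write across the whole batch,
# sidestepping masked-scatter problems seen on MPS.
offset = 0
for row in range(inputs_embeds.size(0)):
    n = int(image_mask[row].sum())
    inputs_embeds[row, image_mask[row]] = image_features[offset:offset + n]
    offset += n
```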
Tested on: macOS 26.0.1, Apple M4 Max, PyTorch 2.9.0
modeling_deepseekocr.py (+6, -6):
```diff
@@ -816,8 +816,8 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
 
 
 
-                # MPS
-                image_dtype = torch.
+                # MPS and CUDA both use bfloat16
+                image_dtype = torch.bfloat16
                 images_list.append(image_transform(global_view).to(image_dtype))
 
                 # global_view_tensor = image_transform(global_view).to(torch.bfloat16)
@@ -865,8 +865,8 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
                 # else:
                 global_view = ImageOps.pad(image, (image_size, image_size),
                                            color=tuple(int(x * 255) for x in image_transform.mean))
-                # MPS
-                image_dtype = torch.
+                # MPS and CUDA both use bfloat16
+                image_dtype = torch.bfloat16
                 images_list.append(image_transform(global_view).to(image_dtype))
 
                 if base_size == 1024:
@@ -932,7 +932,7 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
 
         if not eval_mode:
             streamer = NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
-            # MPS: no autocast (pure
+            # MPS: no autocast (pure bfloat16); CUDA: bfloat16 autocast
             autocast_ctx = nullcontext() if self.device.type == "mps" else torch.autocast("cuda", dtype=torch.bfloat16)
             with autocast_ctx:
                 with torch.no_grad():
@@ -952,7 +952,7 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
                     )
 
         else:
-            # MPS: no autocast (pure
+            # MPS: no autocast (pure bfloat16); CUDA: bfloat16 autocast
            autocast_ctx = nullcontext() if self.device.type == "mps" else torch.autocast("cuda", dtype=torch.bfloat16)
             with autocast_ctx:
                 with torch.no_grad():
```
