#!/usr/bin/env bash
#
# Debug script 3: PyTorch C++ extension compilation test.
#
# Provenance: megablocks-hip, _dev/debug-build-3-torch-ext.sh
#             revision 2d8a802 "Add ROCm build debugging utilities" (leonardlin)
#
# The script writes two small extension sources into a scratch directory and
# builds each through torch.utils.cpp_extension.load under a timeout, to show
# whether the extension build hangs or fails.

# Stop on command failures, unset variables and failing pipeline stages.
set -euo pipefail

echo "=== PyTorch C++ Extension Debug Script 3 ==="
echo "Testing PyTorch C++ extension compilation with HIP"
echo
# ROCm environment. Each variable keeps whatever the caller already exported;
# the values below are only fallbacks.
export ROCM_PATH="${ROCM_PATH:-/opt/rocm-7.0.1}"
export ROCM_HOME="${ROCM_HOME:-$ROCM_PATH}"
export HIP_PATH="${HIP_PATH:-$ROCM_PATH}"
export HIP_HOME="${HIP_HOME:-$ROCM_PATH}"
# ROCm's bin directory goes first so its tools win over other copies on PATH.
export PATH="$ROCM_HOME/bin:$PATH"
export TORCH_HIP_ARCH_LIST="${TORCH_HIP_ARCH_LIST:-gfx942}"
# HSA_OVERRIDE_GFX_VERSION takes a dotted MAJOR.MINOR.STEPPING version, not a
# gfx target name; 9.4.2 is the dotted spelling of the gfx942 default above.
export HSA_OVERRIDE_GFX_VERSION="${HSA_OVERRIDE_GFX_VERSION:-9.4.2}"
# $PWD is read here, before the script changes directory, so the default
# extension build directory sits under the directory the script started in.
export TORCH_EXTENSIONS_DIR="${TORCH_EXTENSIONS_DIR:-$PWD/.torch_extensions_debug}"
# Work in a private scratch directory. mktemp picks an unused name, so stale
# files from an earlier run or a concurrent run cannot interfere, and the EXIT
# trap removes the directory on every exit path, including early aborts.
scratch_dir="$(mktemp -d "${TMPDIR:-/tmp}/torch_ext_test.XXXXXX")" || {
  echo "Could not create a scratch directory" >&2
  exit 1
}
# Step out of the directory before deleting it; :? refuses an empty path.
trap 'cd / && rm -rf -- "${scratch_dir:?}"' EXIT
cd "$scratch_dir"
echo "=== Creating Simple PyTorch Extension ==="
# Write a minimal CUDA/HIP kernel similar to megablocks into the current
# directory. The heredoc delimiter is quoted, so the C++ below is written
# verbatim with no shell expansion.
cat > simple_kernel.cu << 'EOF'
#include <torch/extension.h>
#include <vector>

// One launch macro for both toolchains: hipLaunchKernelGGL under HIP,
// the <<<...>>> syntax under CUDA.
#ifdef __HIP_PLATFORM_AMD__
#include <hip/hip_runtime.h>
#define CUDA_LAUNCH_KERNEL(kernel, grid, block, smem, stream, ...) \
    hipLaunchKernelGGL(kernel, grid, block, smem, stream, __VA_ARGS__)
#else
#include <cuda_runtime.h>
#define CUDA_LAUNCH_KERNEL(kernel, grid, block, smem, stream, ...) \
    kernel<<<grid, block, smem, stream>>>(__VA_ARGS__)
#endif

// c[i] = a[i] + b[i] for every i below n; one thread per element.
__global__ void add_kernel(const float* a, const float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

// Element-wise sum of two float32 GPU tensors of the same shape.
torch::Tensor add_tensors_cuda(torch::Tensor a, torch::Tensor b) {
    // add_kernel walks raw float buffers by linear index, so reject inputs
    // it cannot handle instead of reading the wrong memory.
    TORCH_CHECK(a.is_cuda() && b.is_cuda(),
                "add_tensors: both inputs must be GPU tensors");
    TORCH_CHECK(a.scalar_type() == torch::kFloat32 &&
                    b.scalar_type() == torch::kFloat32,
                "add_tensors: both inputs must be float32");
    TORCH_CHECK(a.sizes() == b.sizes(),
                "add_tensors: inputs must have the same shape");
    TORCH_CHECK(a.is_contiguous() && b.is_contiguous(),
                "add_tensors: inputs must be contiguous");

    auto c = torch::zeros_like(a);
    const int n = static_cast<int>(a.numel());
    if (n == 0) {
        // A grid of zero blocks is not a valid launch configuration.
        return c;
    }

    const int block_size = 256;
    const int grid_size = (n + block_size - 1) / block_size;  // ceil(n / block_size)

    CUDA_LAUNCH_KERNEL(
        add_kernel,
        dim3(grid_size),
        dim3(block_size),
        0,
        0,
        a.data_ptr<float>(),
        b.data_ptr<float>(),
        c.data_ptr<float>(),
        n
    );

    return c;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("add_tensors", &add_tensors_cuda, "Add two tensors (CUDA/HIP)");
}
EOF
# Create Python test script
cat > test_extension.py << 'EOF'
import os
import sys
import torch
from torch.utils.cpp_extension import load
print("=== PyTorch Extension Load Test ===")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Device count: {torch.cuda.device_count()}")
if hasattr(torch.version, 'hip') and torch.version.hip:
print(f"HIP version: {torch.version.hip}")
print("\n=== Loading Extension ===")
print("This may take a while and will show compilation output...")
print("If this hangs, it indicates the same issue as build.py")
try:
# Mimic the same load call as build.py
simple_ext = load(
name="simple_test_ext",
sources=["simple_kernel.cu"],
extra_cflags=["-O3", "-std=c++17"],
extra_cuda_cflags=["-O3"], # torch switches this to hipcc on ROCm
verbose=True,
is_python_module=False
)
print("βœ“ Extension compilation successful!")
# Test the extension
print("\n=== Testing Extension ===")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
a = torch.randn(1000, device=device)
b = torch.randn(1000, device=device)
if device == 'cuda':
result = simple_ext.add_tensors(a, b)
expected = a + b
if torch.allclose(result, expected):
print("βœ“ Extension execution successful!")
else:
print("βœ— Extension execution failed - results don't match")
else:
print("⚠ No CUDA device, skipping execution test")
except Exception as e:
print(f"βœ— Extension compilation/loading failed: {e}")
import traceback
traceback.print_exc()
EOF
echo "=== Running PyTorch Extension Test ==="
echo "This test mimics the same compilation process as build.py"
echo "If this hangs, it shows the same issue as the main build"
echo

# Cap the run at 300 seconds so a hung build cannot stall the script.
# The status is captured through `||`, which also keeps `set -e` from aborting
# here; timeout reports an expired limit as exit status 124.
exit_code=0
timeout 300 python3 test_extension.py || exit_code=$?
if (( exit_code == 124 )); then
  echo "✗ Extension compilation timed out after 5 minutes (same as build.py hang)"
elif (( exit_code != 0 )); then
  echo "✗ Extension compilation failed with exit code $exit_code"
fi
echo
echo "=== Testing with Minimal Sources ==="
# Write an even simpler extension with no device code at all. It is named .cpp
# because the driver below builds with with_cuda=False, which sends the source
# to the host C++ compiler, and that compiler does not build .cu inputs.
cat > minimal_kernel.cpp << 'EOF'
#include <torch/extension.h>

// Returns a copy of the input; exists only to give the module one binding.
torch::Tensor dummy_function(torch::Tensor input) {
    return input.clone();
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("dummy", &dummy_function, "Dummy function");
}
EOF

# Write the Python driver for the CPU-only build. It exits non-zero when the
# build fails so the caller can see the failure.
cat > test_minimal.py << 'EOF'
import sys

import torch
from torch.utils.cpp_extension import load

print("=== Minimal Extension Test ===")
try:
    minimal_ext = load(
        name="minimal_test_ext",
        sources=["minimal_kernel.cpp"],
        extra_cflags=["-O3"],
        verbose=True,
        with_cuda=False,  # Skip CUDA/HIP compilation
    )
    print("✓ Minimal extension (CPU only) successful!")
except Exception as e:
    print(f"✗ Even minimal extension failed: {e}")
    sys.exit(1)
EOF
# Build the CPU-only extension with a 120-second cap. A failure or timeout is
# reported and the script carries on to the cleanup below.
printf '%s\n' "Testing minimal CPU-only extension..."
if ! timeout 120 python3 test_minimal.py; then
  printf '%s\n' "Minimal extension also failed/timed out"
fi
printf '\n'
printf '%s\n' "=== Debug Script 3 Complete ==="

# Cleanup: step out of the test directory first, then delete it.
cd /
rm -rf /tmp/torch_ext_test