File size: 5,294 Bytes
2d8a802 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 |
#!/usr/bin/env bash
# Debug script 3: PyTorch C++ extension compilation test
set -euo pipefail

echo "=== PyTorch C++ Extension Debug Script 3 ==="
echo "Testing PyTorch C++ extension compilation with HIP"
echo

# Set ROCm environment variables (every value is overridable from the
# caller's environment via the ${VAR:-default} pattern).
export ROCM_PATH="${ROCM_PATH:-/opt/rocm-7.0.1}"
export ROCM_HOME="${ROCM_HOME:-$ROCM_PATH}"
export HIP_PATH="${HIP_PATH:-$ROCM_PATH}"
export HIP_HOME="${HIP_HOME:-$ROCM_PATH}"
export PATH="$ROCM_HOME/bin:$PATH"
export TORCH_HIP_ARCH_LIST="${TORCH_HIP_ARCH_LIST:-gfx942}"
# BUG FIX: HSA_OVERRIDE_GFX_VERSION expects a dotted "major.minor.stepping"
# version string (e.g. "9.4.2" for gfx942), NOT a gfx* arch name -- the HSA
# runtime cannot parse "gfx942", so the previous value was a no-op at best.
export HSA_OVERRIDE_GFX_VERSION="${HSA_OVERRIDE_GFX_VERSION:-9.4.2}"
# Keep debug builds out of the user's default extension cache.
# NOTE: $PWD is captured here, before the cd below, on purpose.
export TORCH_EXTENSIONS_DIR="${TORCH_EXTENSIONS_DIR:-$PWD/.torch_extensions_debug}"

# Work in a scratch directory (removed again at the end of the script).
mkdir -p /tmp/torch_ext_test
cd /tmp/torch_ext_test
echo "=== Creating Simple PyTorch Extension ==="
# Create a minimal CUDA/HIP kernel similar to megablocks.
# The heredoc delimiter is quoted ('EOF'), so nothing inside is expanded
# by the shell -- the C++ is written to disk verbatim.
cat > simple_kernel.cu << 'EOF'
#include <torch/extension.h>
#include <vector>

// Portable launch macro: hipLaunchKernelGGL on ROCm, <<<...>>> on CUDA.
#ifdef __HIP_PLATFORM_AMD__
#include <hip/hip_runtime.h>
#define CUDA_LAUNCH_KERNEL(kernel, grid, block, smem, stream, ...) \
    hipLaunchKernelGGL(kernel, grid, block, smem, stream, __VA_ARGS__)
#else
#include <cuda_runtime.h>
#define CUDA_LAUNCH_KERNEL(kernel, grid, block, smem, stream, ...) \
    kernel<<<grid, block, smem, stream>>>(__VA_ARGS__)
#endif

// Elementwise c[i] = a[i] + b[i] over n floats; one thread per element.
__global__ void add_kernel(const float* a, const float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

// Add two same-shaped float32 device tensors and return a new tensor.
torch::Tensor add_tensors_cuda(torch::Tensor a, torch::Tensor b) {
    // FIX: validate inputs up front -- the raw kernel reads dense float
    // device memory, so mismatched shapes, wrong dtypes, or strided
    // (non-contiguous) inputs would previously compute garbage silently.
    TORCH_CHECK(a.is_cuda() && b.is_cuda(), "inputs must be CUDA/HIP tensors");
    TORCH_CHECK(a.scalar_type() == torch::kFloat32 &&
                b.scalar_type() == torch::kFloat32,
                "inputs must be float32");
    TORCH_CHECK(a.sizes() == b.sizes(), "inputs must have the same shape");
    auto a_c = a.contiguous();
    auto b_c = b.contiguous();
    auto c = torch::zeros_like(a_c);
    int n = a_c.numel();
    const int block_size = 256;
    const int grid_size = (n + block_size - 1) / block_size;  // ceil(n/block)
    CUDA_LAUNCH_KERNEL(
        add_kernel,
        dim3(grid_size),
        dim3(block_size),
        0,  // no dynamic shared memory
        0,  // default stream
        a_c.data_ptr<float>(),
        b_c.data_ptr<float>(),
        c.data_ptr<float>(),
        n
    );
    return c;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("add_tensors", &add_tensors_cuda, "Add two tensors (CUDA/HIP)");
}
EOF
# Create the Python driver that compiles and exercises the extension.
# Quoted heredoc: written to disk verbatim, no shell expansion.
cat > test_extension.py << 'EOF'
import traceback

import torch
from torch.utils.cpp_extension import load

print("=== PyTorch Extension Load Test ===")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Device count: {torch.cuda.device_count()}")
if hasattr(torch.version, 'hip') and torch.version.hip:
    print(f"HIP version: {torch.version.hip}")

print("\n=== Loading Extension ===")
print("This may take a while and will show compilation output...")
print("If this hangs, it indicates the same issue as build.py")

try:
    # Mimic the same load call as build.py.
    # BUG FIX: is_python_module must be True (the default).  With False,
    # load() merely dlopens the shared library and returns None, so the
    # simple_ext.add_tensors(...) call below raised AttributeError.
    simple_ext = load(
        name="simple_test_ext",
        sources=["simple_kernel.cu"],
        extra_cflags=["-O3", "-std=c++17"],
        extra_cuda_cflags=["-O3"],  # torch switches this to hipcc on ROCm
        verbose=True,
    )
    print("[OK] Extension compilation successful!")

    # Test the extension end-to-end on the GPU if one is visible.
    print("\n=== Testing Extension ===")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    a = torch.randn(1000, device=device)
    b = torch.randn(1000, device=device)
    if device == 'cuda':
        result = simple_ext.add_tensors(a, b)
        expected = a + b
        if torch.allclose(result, expected):
            print("[OK] Extension execution successful!")
        else:
            print("[FAIL] Extension execution failed - results don't match")
    else:
        print("[SKIP] No CUDA device, skipping execution test")
except Exception as e:
    print(f"[FAIL] Extension compilation/loading failed: {e}")
    traceback.print_exc()
EOF
echo "=== Running PyTorch Extension Test ==="
echo "This test mimics the same compilation process as build.py"
echo "If this hangs, it shows the same issue as the main build"
echo

# Run under timeout(1) so a compiler hang cannot stall the whole script.
# Capture the status via `|| exit_code=$?` so `set -e` does not abort here.
exit_code=0
timeout 300 python3 test_extension.py || exit_code=$?
if [ "$exit_code" -eq 124 ]; then
    # 124 is timeout(1)'s status when it had to kill the command.
    echo "[FAIL] Extension compilation timed out after 5 minutes (same as build.py hang)"
elif [ "$exit_code" -ne 0 ]; then
    echo "[FAIL] Extension compilation failed with exit code $exit_code"
fi
echo
echo "=== Testing with Minimal Sources ==="
# Create an even simpler version: a CPU-only no-op extension that just
# clones its input.  If even this fails, the problem lies with the
# toolchain / torch headers rather than HIP kernel compilation.
# NOTE(review): the file keeps a .cu extension even though test_minimal.py
# builds it with with_cuda=False -- confirm the host compiler accepts a .cu
# source in that mode.
cat > minimal_kernel.cu << 'EOF'
#include <torch/extension.h>
torch::Tensor dummy_function(torch::Tensor input) {
return input.clone();
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("dummy", &dummy_function, "Dummy function");
}
EOF
# Python driver for the minimal extension; built CPU-only so neither
# nvcc nor hipcc is involved.  Quoted heredoc: written verbatim.
cat > test_minimal.py << 'EOF'
import torch
from torch.utils.cpp_extension import load

print("=== Minimal Extension Test ===")
try:
    minimal_ext = load(
        name="minimal_test_ext",
        sources=["minimal_kernel.cu"],
        extra_cflags=["-O3"],
        verbose=True,
        with_cuda=False,  # Skip CUDA/HIP compilation
    )
    print("[OK] Minimal extension (CPU only) successful!")
except Exception as e:
    print(f"[FAIL] Even minimal extension failed: {e}")
EOF

echo "Testing minimal CPU-only extension..."
# Shorter timeout: a pure host-compiler build should be quick.
timeout 120 python3 test_minimal.py || echo "Minimal extension also failed/timed out"
echo
echo "=== Debug Script 3 Complete ==="

# Cleanup: step out of the scratch directory before removing it, so the
# shell is not left with a deleted cwd.  (A stray trailing "|" after the
# rm command -- extraction residue -- has been dropped; it would have been
# a bash syntax error.)
cd /
rm -rf /tmp/torch_ext_test