|
|
#!/usr/bin/env bash
#
# PyTorch C++ extension debug script 3.
#
# Reproduces the torch.utils.cpp_extension JIT-compile path (as used by
# build.py) against a ROCm/HIP toolchain, to isolate compile hangs.
#
# Tunables (all overridable from the environment):
#   ROCM_PATH / ROCM_HOME / HIP_PATH / HIP_HOME  - ROCm install root
#   TORCH_HIP_ARCH_LIST / PYTORCH_ROCM_ARCH      - target GPU arch list
#   HSA_OVERRIDE_GFX_VERSION                     - dotted numeric gfx override
#   TORCH_EXTENSIONS_DIR                         - JIT build cache dir

set -euo pipefail

echo "=== PyTorch C++ Extension Debug Script 3 ==="
echo "Testing PyTorch C++ extension compilation with HIP"
echo

# ROCm toolchain locations (all default to the same install root).
export ROCM_PATH="${ROCM_PATH:-/opt/rocm-7.0.1}"
export ROCM_HOME="${ROCM_HOME:-$ROCM_PATH}"
export HIP_PATH="${HIP_PATH:-$ROCM_PATH}"
export HIP_HOME="${HIP_HOME:-$ROCM_PATH}"
export PATH="$ROCM_HOME/bin:$PATH"

# Target architecture. PyTorch's cpp_extension reads PYTORCH_ROCM_ARCH on
# ROCm builds, so export it alongside TORCH_HIP_ARCH_LIST (the old name is
# kept for any tooling that still reads it).
export TORCH_HIP_ARCH_LIST="${TORCH_HIP_ARCH_LIST:-gfx942}"
export PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH:-$TORCH_HIP_ARCH_LIST}"

# HSA_OVERRIDE_GFX_VERSION expects a dotted numeric version (9.4.2 for
# gfx942), not the gfx target name -- "gfx942" here is silently invalid.
export HSA_OVERRIDE_GFX_VERSION="${HSA_OVERRIDE_GFX_VERSION:-9.4.2}"

# Keep JIT build artifacts local to this run instead of ~/.cache.
export TORCH_EXTENSIONS_DIR="${TORCH_EXTENSIONS_DIR:-$PWD/.torch_extensions_debug}"
|
|
|
|
|
|
|
|
# Work out of a scratch directory under /tmp (removed again at the end of
# the script). NOTE(review): the path is predictable; mktemp -d would be
# safer, but the cleanup step at the bottom relies on this fixed name.
work_dir=/tmp/torch_ext_test
mkdir -p -- "$work_dir"
cd -- "$work_dir"

printf '%s\n' "=== Creating Simple PyTorch Extension ==="
|
|
|
|
|
|
|
|
# Emit a small CUDA/HIP source file. The CUDA_LAUNCH_KERNEL macro papers
# over the launch-syntax difference between HIP (hipLaunchKernelGGL) and
# CUDA (<<<...>>>). The here-doc delimiter is quoted so the shell expands
# nothing inside (backslashes and $ are literal).
#
# FIX: the previous version of this here-doc was missing the
# '#include <torch/extension.h>' and both '#define CUDA_LAUNCH_KERNEL'
# lines (only the continuation bodies and '#endif' remained), so the
# generated .cu file could not compile at all.
cat > simple_kernel.cu << 'EOF'
#include <torch/extension.h>

#ifdef __HIP_PLATFORM_AMD__
// HIP: use the explicit launch helper.
#define CUDA_LAUNCH_KERNEL(kernel, grid, block, smem, stream, ...) \
    hipLaunchKernelGGL(kernel, grid, block, smem, stream, __VA_ARGS__)
#else
// CUDA: use the triple-chevron launch syntax.
#define CUDA_LAUNCH_KERNEL(kernel, grid, block, smem, stream, ...) \
    kernel<<<grid, block, smem, stream>>>(__VA_ARGS__)
#endif

__global__ void add_kernel(const float* a, const float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

torch::Tensor add_tensors_cuda(torch::Tensor a, torch::Tensor b) {
    auto c = torch::zeros_like(a);
    int n = a.numel();

    const int block_size = 256;
    const int grid_size = (n + block_size - 1) / block_size;

    CUDA_LAUNCH_KERNEL(
        add_kernel,
        dim3(grid_size),
        dim3(block_size),
        0,
        0,
        a.data_ptr<float>(),
        b.data_ptr<float>(),
        c.data_ptr<float>(),
        n
    );

    return c;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("add_tensors", &add_tensors_cuda, "Add two tensors (CUDA/HIP)");
}
EOF
|
|
|
|
|
# Create the Python driver that JIT-compiles and exercises the extension.
# Quoted here-doc delimiter: the shell expands nothing inside.
cat > test_extension.py << 'EOF'
import os
import sys
import torch
from torch.utils.cpp_extension import load

print("=== PyTorch Extension Load Test ===")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Device count: {torch.cuda.device_count()}")

if hasattr(torch.version, 'hip') and torch.version.hip:
    print(f"HIP version: {torch.version.hip}")

print("\n=== Loading Extension ===")
print("This may take a while and will show compilation output...")
print("If this hangs, it indicates the same issue as build.py")

try:
    # Mimic the same load call as build.py.
    # FIX: is_python_module must be True (the default). With
    # is_python_module=False, load() does not return an importable Python
    # module, so simple_ext.add_tensors(...) below would raise
    # AttributeError and the execution test could never run.
    simple_ext = load(
        name="simple_test_ext",
        sources=["simple_kernel.cu"],
        extra_cflags=["-O3", "-std=c++17"],
        extra_cuda_cflags=["-O3"],  # torch switches this to hipcc on ROCm
        verbose=True,
        is_python_module=True
    )
    print("β Extension compilation successful!")

    # Test the extension
    print("\n=== Testing Extension ===")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    a = torch.randn(1000, device=device)
    b = torch.randn(1000, device=device)

    if device == 'cuda':
        result = simple_ext.add_tensors(a, b)
        expected = a + b
        if torch.allclose(result, expected):
            print("β Extension execution successful!")
        else:
            print("β Extension execution failed - results don't match")
    else:
        print("β No CUDA device, skipping execution test")

except Exception as e:
    print(f"β Extension compilation/loading failed: {e}")
    import traceback
    traceback.print_exc()
EOF
|
|
|
|
|
printf '%s\n' \
  "=== Running PyTorch Extension Test ===" \
  "This test mimics the same compilation process as build.py" \
  "If this hangs, it shows the same issue as the main build" \
  ""

# Bound the run so a compiler hang cannot stall this script forever;
# timeout(1) exits with status 124 when the limit is hit.
timeout 300 python3 test_extension.py || {
  rc=$?
  case "$rc" in
    124) echo "β Extension compilation timed out after 5 minutes (same as build.py hang)" ;;
    *)   echo "β Extension compilation failed with exit code $rc" ;;
  esac
}
|
|
|
|
|
printf '\n%s\n' "=== Testing with Minimal Sources ==="

# Fallback probe: a CPU-only extension with no kernel code at all, to
# separate "any extension build hangs" from "only HIP kernels hang".
# Quoted delimiter keeps the shell from expanding the C++ below.
cat > minimal_kernel.cu << 'EOF'
#include <torch/extension.h>

torch::Tensor dummy_function(torch::Tensor input) {
    return input.clone();
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("dummy", &dummy_function, "Dummy function");
}
EOF
|
|
|
|
|
# Driver for the minimal, CPU-only extension build.
# NOTE(review): the source keeps a .cu suffix even though with_cuda=False;
# if torch still routes .cu files to the device compiler, renaming the
# source to .cpp may be needed -- confirm against cpp_extension behavior.
cat > test_minimal.py << 'EOF'
import torch
from torch.utils.cpp_extension import load

print("=== Minimal Extension Test ===")

try:
    minimal_ext = load(
        name="minimal_test_ext",
        sources=["minimal_kernel.cu"],
        extra_cflags=["-O3"],
        verbose=True,
        with_cuda=False  # Skip CUDA/HIP compilation
    )
    print("β Minimal extension (CPU only) successful!")
except Exception as e:
    print(f"β Even minimal extension failed: {e}")
EOF

printf '%s\n' "Testing minimal CPU-only extension..."
timeout 120 python3 test_minimal.py \
  || printf '%s\n' "Minimal extension also failed/timed out"
|
|
|
|
|
printf '\n%s\n' "=== Debug Script 3 Complete ==="

# Cleanup: leave the scratch directory, then remove it.
# NOTE(review): this only runs on a fully successful pass; a
# 'trap cleanup EXIT' near the top would also clean up on failure paths.
cd /
rm -rf -- /tmp/torch_ext_test