#!/usr/bin/env bash
#
# Debug script 3: PyTorch C++ extension compilation test.
#
# Builds two throw-away extensions with torch.utils.cpp_extension.load:
#   1. a small CUDA/HIP element-wise add kernel (simple_kernel.cu), and
#   2. a CPU-only pass-through function (minimal_kernel.cpp),
# each under a `timeout`, so that a hanging JIT build is reported instead of
# blocking forever.
#
# Environment (all optional, defaults shown in the exports below):
#   ROCM_PATH, ROCM_HOME, HIP_PATH, HIP_HOME, TORCH_HIP_ARCH_LIST,
#   HSA_OVERRIDE_GFX_VERSION, TORCH_EXTENSIONS_DIR
#
# The script reports failures on stdout and still exits 0 after both tests
# have run; only an unexpected setup error aborts it (set -e).
set -euo pipefail

echo "=== PyTorch C++ Extension Debug Script 3 ==="
echo "Testing PyTorch C++ extension compilation with HIP"
echo

# Set ROCm environment variables
export ROCM_PATH="${ROCM_PATH:-/opt/rocm-7.0.1}"
export ROCM_HOME="${ROCM_HOME:-$ROCM_PATH}"
export HIP_PATH="${HIP_PATH:-$ROCM_PATH}"
export HIP_HOME="${HIP_HOME:-$ROCM_PATH}"
export PATH="$ROCM_HOME/bin:$PATH"
# NOTE(review): PyTorch's own extension builder appears to read
# PYTORCH_ROCM_ARCH rather than TORCH_HIP_ARCH_LIST; confirm which variable
# build.py relies on before depending on this to restrict the target arch.
export TORCH_HIP_ARCH_LIST="${TORCH_HIP_ARCH_LIST:-gfx942}"
# The ROCm runtime expects a dotted "major.minor.step" version here
# (gfx942 -> 9.4.2), not the gfx target name.
export HSA_OVERRIDE_GFX_VERSION="${HSA_OVERRIDE_GFX_VERSION:-9.4.2}"
# Resolved against the caller's working directory, i.e. before the cd below.
export TORCH_EXTENSIONS_DIR="${TORCH_EXTENSIONS_DIR:-$PWD/.torch_extensions_debug}"

# Create a private scratch directory. mktemp avoids the predictable
# /tmp/<fixed-name> path, and the EXIT trap removes the directory on every
# exit path, including failures under `set -e`.
work_dir="$(mktemp -d "${TMPDIR:-/tmp}/torch_ext_test.XXXXXX")"

cleanup() {
  cd / || true
  rm -rf -- "${work_dir:?}"
}
trap cleanup EXIT

cd "$work_dir"

echo "=== Creating Simple PyTorch Extension ==="

# Create a minimal CUDA/HIP kernel similar to megablocks.
# NOTE(review): the header names in the #include lines were lost from the
# copy of this script that was reviewed; the ones below are the conventional
# choices for a torch extension - confirm against the intended originals.
cat > simple_kernel.cu << 'EOF'
#include <torch/extension.h>
#include <cstdint>

#ifdef __HIP_PLATFORM_AMD__
#include <hip/hip_runtime.h>
#define CUDA_LAUNCH_KERNEL(kernel, grid, block, smem, stream, ...) \
    hipLaunchKernelGGL(kernel, grid, block, smem, stream, __VA_ARGS__)
#else
#include <cuda_runtime.h>
#define CUDA_LAUNCH_KERNEL(kernel, grid, block, smem, stream, ...) \
    kernel<<<grid, block, smem, stream>>>(__VA_ARGS__)
#endif

__global__ void add_kernel(const float* a, const float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

torch::Tensor add_tensors_cuda(torch::Tensor a, torch::Tensor b) {
    auto c = torch::zeros_like(a);
    int n = a.numel();

    const int block_size = 256;
    const int grid_size = (n + block_size - 1) / block_size;

    CUDA_LAUNCH_KERNEL(
        add_kernel,
        dim3(grid_size), dim3(block_size), 0, 0,
        a.data_ptr<float>(), b.data_ptr<float>(), c.data_ptr<float>(), n
    );

    return c;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("add_tensors", &add_tensors_cuda, "Add two tensors (CUDA/HIP)");
}
EOF

# Create Python test script
cat > test_extension.py << 'EOF'
import sys
import traceback

import torch
from torch.utils.cpp_extension import load

print("=== PyTorch Extension Load Test ===")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Device count: {torch.cuda.device_count()}")

if hasattr(torch.version, 'hip') and torch.version.hip:
    print(f"HIP version: {torch.version.hip}")

print("\n=== Loading Extension ===")
print("This may take a while and will show compilation output...")
print("If this hangs, it indicates the same issue as build.py")

try:
    # Mimic the load call used by build.py. is_python_module is left at its
    # default (True): this source registers add_tensors via PYBIND11_MODULE,
    # and with is_python_module=False load() returns nothing, so the call to
    # simple_ext.add_tensors below could never work.
    simple_ext = load(
        name="simple_test_ext",
        sources=["simple_kernel.cu"],
        extra_cflags=["-O3", "-std=c++17"],
        extra_cuda_cflags=["-O3"],  # torch switches this to hipcc on ROCm
        verbose=True,
    )
    print("✓ Extension compilation successful!")

    # Test the extension
    print("\n=== Testing Extension ===")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    a = torch.randn(1000, device=device)
    b = torch.randn(1000, device=device)

    if device == 'cuda':
        result = simple_ext.add_tensors(a, b)
        expected = a + b
        if torch.allclose(result, expected):
            print("✓ Extension execution successful!")
        else:
            print("✗ Extension execution failed - results don't match")
    else:
        print("⚠ No CUDA device, skipping execution test")

except Exception as e:
    print(f"✗ Extension compilation/loading failed: {e}")
    traceback.print_exc()
    # Propagate the failure so the calling shell script can report it.
    sys.exit(1)
EOF

echo "=== Running PyTorch Extension Test ==="
echo "This test mimics the same compilation process as build.py"
echo "If this hangs, it shows the same issue as the main build"
echo

# Set a timeout to prevent infinite hang. `timeout` exits with 124 when the
# limit is hit; any other non-zero status comes from python3 itself.
exit_code=0
timeout 300 python3 test_extension.py || exit_code=$?
if (( exit_code == 124 )); then
  echo "✗ Extension compilation timed out after 5 minutes (same as build.py hang)"
elif (( exit_code != 0 )); then
  echo "✗ Extension compilation failed with exit code $exit_code"
fi

echo
echo "=== Testing with Minimal Sources ==="

# Create an even simpler version. It is a .cpp file on purpose: with
# with_cuda=False the source is handed to the host C++ compiler, which does
# not treat a .cu file as C++ source.
cat > minimal_kernel.cpp << 'EOF'
#include <torch/extension.h>

torch::Tensor dummy_function(torch::Tensor input) {
    return input.clone();
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("dummy", &dummy_function, "Dummy function");
}
EOF

cat > test_minimal.py << 'EOF'
import sys

import torch
from torch.utils.cpp_extension import load

print("=== Minimal Extension Test ===")

try:
    minimal_ext = load(
        name="minimal_test_ext",
        sources=["minimal_kernel.cpp"],
        extra_cflags=["-O3"],
        verbose=True,
        with_cuda=False  # Skip CUDA/HIP compilation
    )
    print("✓ Minimal extension (CPU only) successful!")
except Exception as e:
    print(f"✗ Even minimal extension failed: {e}")
    # Propagate the failure so the calling shell script can report it.
    sys.exit(1)
EOF

echo "Testing minimal CPU-only extension..."
timeout 120 python3 test_minimal.py || echo "Minimal extension also failed/timed out"

echo
echo "=== Debug Script 3 Complete ==="

# Cleanup of the scratch directory is handled by the EXIT trap installed above.