#!/usr/bin/env bash

# Debug script 3: PyTorch C++ extension compilation test

set -euo pipefail

echo "=== PyTorch C++ Extension Debug Script 3 ==="
echo "Testing PyTorch C++ extension compilation with HIP"
echo

# Set ROCm environment variables
export ROCM_PATH="${ROCM_PATH:-/opt/rocm-7.0.1}"
export ROCM_HOME="${ROCM_HOME:-$ROCM_PATH}"
export HIP_PATH="${HIP_PATH:-$ROCM_PATH}"
export HIP_HOME="${HIP_HOME:-$ROCM_PATH}"
export PATH="$ROCM_HOME/bin:$PATH"
export TORCH_HIP_ARCH_LIST="${TORCH_HIP_ARCH_LIST:-gfx942}"
# torch.utils.cpp_extension reads PYTORCH_ROCM_ARCH for the ROCm offload arch list
export PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH:-$TORCH_HIP_ARCH_LIST}"
# HSA_OVERRIDE_GFX_VERSION expects the dotted form (9.4.2 for gfx942), not the gfx name
export HSA_OVERRIDE_GFX_VERSION="${HSA_OVERRIDE_GFX_VERSION:-9.4.2}"
export TORCH_EXTENSIONS_DIR="${TORCH_EXTENSIONS_DIR:-$PWD/.torch_extensions_debug}"
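
# Optional sanity check: print the toolchain that torch.utils.cpp_extension
# should pick up after the exports above. Assumes hipcc and hipconfig are
# provided by $ROCM_HOME/bin.
echo "=== Toolchain Sanity Check ==="
if command -v hipcc >/dev/null 2>&1; then
    hipcc --version | head -n 1 || true
else
    echo "hipcc not found on PATH"
fi
if command -v hipconfig >/dev/null 2>&1; then
    echo "HIP platform: $(hipconfig --platform)"
fi
python3 -c "import torch; print('torch', torch.__version__, 'hip', getattr(torch.version, 'hip', None))" || true
echo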

# Create a test directory
mkdir -p /tmp/torch_ext_test
cd /tmp/torch_ext_test

echo "=== Creating Simple PyTorch Extension ==="

# Create a minimal CUDA/HIP kernel similar to megablocks
cat > simple_kernel.cu << 'EOF'
#include <torch/extension.h>
#include <vector>

#ifdef __HIP_PLATFORM_AMD__
#include <hip/hip_runtime.h>
#define CUDA_LAUNCH_KERNEL(kernel, grid, block, smem, stream, ...) \
    hipLaunchKernelGGL(kernel, grid, block, smem, stream, __VA_ARGS__)
#else
#include <cuda_runtime.h>
#define CUDA_LAUNCH_KERNEL(kernel, grid, block, smem, stream, ...) \
    kernel<<<grid, block, smem, stream>>>(__VA_ARGS__)
#endif

__global__ void add_kernel(const float* a, const float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

torch::Tensor add_tensors_cuda(torch::Tensor a, torch::Tensor b) {
    auto c = torch::zeros_like(a);
    int n = a.numel();

    const int block_size = 256;
    const int grid_size = (n + block_size - 1) / block_size;

    CUDA_LAUNCH_KERNEL(
        add_kernel,
        dim3(grid_size),
        dim3(block_size),
        0,
        0,
        a.data_ptr<float>(),
        b.data_ptr<float>(),
        c.data_ptr<float>(),
        n
    );

    return c;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("add_tensors", &add_tensors_cuda, "Add two tensors (CUDA/HIP)");
}
EOF

# Create Python test script
cat > test_extension.py << 'EOF'
import os
import sys
import torch
from torch.utils.cpp_extension import load

print("=== PyTorch Extension Load Test ===")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Device count: {torch.cuda.device_count()}")

if hasattr(torch.version, 'hip') and torch.version.hip:
    print(f"HIP version: {torch.version.hip}")

print("\n=== Loading Extension ===")
print("This may take a while and will show compilation output...")
print("If this hangs, it indicates the same issue as build.py")

try:
    # Mimic the load call from build.py, but with is_python_module=True so the
    # compiled pybind11 module can actually be called below (with False, load()
    # only loads the shared library into the process and returns its path, not
    # a module object).
    simple_ext = load(
        name="simple_test_ext",
        sources=["simple_kernel.cu"],
        extra_cflags=["-O3", "-std=c++17"],
        extra_cuda_cflags=["-O3"],  # torch passes these to hipcc on ROCm
        verbose=True,
        is_python_module=True
    )
    print("βœ“ Extension compilation successful!")

    # Test the extension
    print("\n=== Testing Extension ===")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    a = torch.randn(1000, device=device)
    b = torch.randn(1000, device=device)

    if device == 'cuda':
        result = simple_ext.add_tensors(a, b)
        expected = a + b
        if torch.allclose(result, expected):
            print("βœ“ Extension execution successful!")
        else:
            print("βœ— Extension execution failed - results don't match")
    else:
        print("⚠ No CUDA device, skipping execution test")

except Exception as e:
    print(f"βœ— Extension compilation/loading failed: {e}")
    import traceback
    traceback.print_exc()
EOF
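
# Pre-flight check: torch.utils.cpp_extension JIT builds go through ninja, so it
# is worth confirming ninja is reachable before blaming the compile step itself.
# is_ninja_available() is exposed by torch.utils.cpp_extension.
echo "=== Ninja Availability Check ==="
python3 -c "from torch.utils.cpp_extension import is_ninja_available; print('ninja available:', is_ninja_available())" || true
ninja --version 2>/dev/null || echo "ninja binary not directly on PATH (torch may still locate it via the python package)"
echo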

echo "=== Running PyTorch Extension Test ==="
echo "This test mimics the same compilation process as build.py"
echo "If this hangs, it shows the same issue as the main build"
echo

# Set a timeout to prevent infinite hang
timeout 300 python3 test_extension.py || {
    exit_code=$?
    if [ $exit_code -eq 124 ]; then
        echo "βœ— Extension compilation timed out after 5 minutes (same as build.py hang)"
    else
        echo "βœ— Extension compilation failed with exit code $exit_code"
    fi
}
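
# If the compile above hung or failed, the JIT build directory usually holds the
# generated build.ninja and the exact compiler command lines, which show how far
# the build got. Assumes load() used the TORCH_EXTENSIONS_DIR exported at the top
# of this script.
echo
echo "=== JIT Build Artifacts under TORCH_EXTENSIONS_DIR ==="
find "$TORCH_EXTENSIONS_DIR" -name 'build.ninja' 2>/dev/null || true
find "$TORCH_EXTENSIONS_DIR" -name '*.so' 2>/dev/null || true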

echo
echo "=== Testing with Minimal Sources ==="

# Create an even simpler, CPU-only version (a .cpp source, so no HIP/CUDA compiler is involved)
cat > minimal_kernel.cpp << 'EOF'
#include <torch/extension.h>

torch::Tensor dummy_function(torch::Tensor input) {
    return input.clone();
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("dummy", &dummy_function, "Dummy function");
}
EOF

cat > test_minimal.py << 'EOF'
import torch
from torch.utils.cpp_extension import load

print("=== Minimal Extension Test ===")

try:
    minimal_ext = load(
        name="minimal_test_ext",
        sources=["minimal_kernel.cu"],
        extra_cflags=["-O3"],
        verbose=True,
        with_cuda=False  # Skip CUDA/HIP compilation
    )
    print("βœ“ Minimal extension (CPU only) successful!")
except Exception as e:
    print(f"βœ— Even minimal extension failed: {e}")
EOF

echo "Testing minimal CPU-only extension..."
timeout 120 python3 test_minimal.py || echo "Minimal extension also failed/timed out"
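
# Optional cross-check: compile a bare HIP program with hipcc directly, with no
# torch headers involved, to separate a compiler/toolchain hang from a
# torch.utils.cpp_extension issue. Assumes hipcc is on PATH via the exports above
# and reuses TORCH_HIP_ARCH_LIST as the single offload arch.
echo
echo "=== Bare hipcc Smoke Test ==="
cat > hip_smoke.cpp << 'EOF'
#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void noop_kernel(int) {}

int main() {
    hipLaunchKernelGGL(noop_kernel, dim3(1), dim3(1), 0, 0, 0);
    printf("hipDeviceSynchronize returned %d\n", static_cast<int>(hipDeviceSynchronize()));
    return 0;
}
EOF

if command -v hipcc >/dev/null 2>&1; then
    timeout 120 hipcc --offload-arch="$TORCH_HIP_ARCH_LIST" hip_smoke.cpp -o hip_smoke \
        && echo "βœ“ Bare hipcc compile succeeded" \
        || echo "βœ— Bare hipcc compile failed or timed out"
else
    echo "⚠ hipcc not found on PATH, skipping bare compile check"
fi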

echo
echo "=== Debug Script 3 Complete ==="

# Cleanup
cd /
rm -rf /tmp/torch_ext_test