#!/usr/bin/env bash

# Debug script 3: PyTorch C++ extension compilation test

set -euo pipefail

echo "=== PyTorch C++ Extension Debug Script 3 ==="
echo "Testing PyTorch C++ extension compilation with HIP"
echo

# Set ROCm environment variables
export ROCM_PATH="${ROCM_PATH:-/opt/rocm-7.0.1}"
export ROCM_HOME="${ROCM_HOME:-$ROCM_PATH}"
export HIP_PATH="${HIP_PATH:-$ROCM_PATH}"
export HIP_HOME="${HIP_HOME:-$ROCM_PATH}"
export PATH="$ROCM_HOME/bin:$PATH"
export TORCH_HIP_ARCH_LIST="${TORCH_HIP_ARCH_LIST:-gfx942}"
# torch.utils.cpp_extension reads PYTORCH_ROCM_ARCH for the ROCm offload arch list
export PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH:-$TORCH_HIP_ARCH_LIST}"
# HSA_OVERRIDE_GFX_VERSION expects the dotted form (9.4.2 for gfx942), not the gfx name
export HSA_OVERRIDE_GFX_VERSION="${HSA_OVERRIDE_GFX_VERSION:-9.4.2}"
export TORCH_EXTENSIONS_DIR="${TORCH_EXTENSIONS_DIR:-$PWD/.torch_extensions_debug}"
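
# Optional sanity check: print the toolchain that torch.utils.cpp_extension
# should pick up after the exports above. Assumes hipcc and hipconfig are
# provided by $ROCM_HOME/bin.
echo "=== Toolchain Sanity Check ==="
if command -v hipcc >/dev/null 2>&1; then
    hipcc --version | head -n 1 || true
else
    echo "hipcc not found on PATH"
fi
if command -v hipconfig >/dev/null 2>&1; then
    echo "HIP platform: $(hipconfig --platform)"
fi
python3 -c "import torch; print('torch', torch.__version__, 'hip', getattr(torch.version, 'hip', None))" || true
echo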

# Create a test directory
mkdir -p /tmp/torch_ext_test
cd /tmp/torch_ext_test

echo "=== Creating Simple PyTorch Extension ==="

# Create a minimal CUDA/HIP kernel similar to megablocks
cat > simple_kernel.cu << 'EOF'
#include <torch/extension.h>
#include <vector>

#ifdef __HIP_PLATFORM_AMD__
#include <hip/hip_runtime.h>
#define CUDA_LAUNCH_KERNEL(kernel, grid, block, smem, stream, ...) \
    hipLaunchKernelGGL(kernel, grid, block, smem, stream, __VA_ARGS__)
#else
#include <cuda_runtime.h>
#define CUDA_LAUNCH_KERNEL(kernel, grid, block, smem, stream, ...) \
    kernel<<<grid, block, smem, stream>>>(__VA_ARGS__)
#endif

__global__ void add_kernel(const float* a, const float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

torch::Tensor add_tensors_cuda(torch::Tensor a, torch::Tensor b) {
    auto c = torch::zeros_like(a);
    int n = a.numel();

    const int block_size = 256;
    const int grid_size = (n + block_size - 1) / block_size;

    CUDA_LAUNCH_KERNEL(
        add_kernel,
        dim3(grid_size),
        dim3(block_size),
        0,
        0,
        a.data_ptr<float>(),
        b.data_ptr<float>(),
        c.data_ptr<float>(),
        n
    );

    return c;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("add_tensors", &add_tensors_cuda, "Add two tensors (CUDA/HIP)");
}
EOF

# Create Python test script
cat > test_extension.py << 'EOF'
import os
import sys
import torch
from torch.utils.cpp_extension import load

print("=== PyTorch Extension Load Test ===")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Device count: {torch.cuda.device_count()}")

if hasattr(torch.version, 'hip') and torch.version.hip:
    print(f"HIP version: {torch.version.hip}")

print("\n=== Loading Extension ===")
print("This may take a while and will show compilation output...")
print("If this hangs, it indicates the same issue as build.py")

try:
    # Mimic the load call from build.py, but with is_python_module=True so the
    # compiled pybind11 module can actually be called below (with False, load()
    # only loads the shared library into the process and returns its path, not
    # a module object).
    simple_ext = load(
        name="simple_test_ext",
        sources=["simple_kernel.cu"],
        extra_cflags=["-O3", "-std=c++17"],
        extra_cuda_cflags=["-O3"],  # torch passes these to hipcc on ROCm
        verbose=True,
        is_python_module=True
    )
    print("βœ“ Extension compilation successful!")

    # Test the extension
    print("\n=== Testing Extension ===")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    a = torch.randn(1000, device=device)
    b = torch.randn(1000, device=device)

    if device == 'cuda':
        result = simple_ext.add_tensors(a, b)
        expected = a + b
        if torch.allclose(result, expected):
            print("βœ“ Extension execution successful!")
        else:
            print("βœ— Extension execution failed - results don't match")
    else:
        print("⚠ No CUDA device, skipping execution test")

except Exception as e:
    print(f"βœ— Extension compilation/loading failed: {e}")
    import traceback
    traceback.print_exc()
EOF
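
# Pre-flight check: torch.utils.cpp_extension JIT builds go through ninja, so it
# is worth confirming ninja is reachable before blaming the compile step itself.
# is_ninja_available() is exposed by torch.utils.cpp_extension.
echo "=== Ninja Availability Check ==="
python3 -c "from torch.utils.cpp_extension import is_ninja_available; print('ninja available:', is_ninja_available())" || true
ninja --version 2>/dev/null || echo "ninja binary not directly on PATH (torch may still locate it via the python package)"
echo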

echo "=== Running PyTorch Extension Test ==="
echo "This test mimics the same compilation process as build.py"
echo "If this hangs, it shows the same issue as the main build"
echo

# Set a timeout to prevent infinite hang
timeout 300 python3 test_extension.py || {
    exit_code=$?
    if [ $exit_code -eq 124 ]; then
        echo "βœ— Extension compilation timed out after 5 minutes (same as build.py hang)"
    else
        echo "βœ— Extension compilation failed with exit code $exit_code"
    fi
}
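
# If the compile above hung or failed, the JIT build directory usually holds the
# generated build.ninja and the exact compiler command lines, which show how far
# the build got. Assumes load() used the TORCH_EXTENSIONS_DIR exported at the top
# of this script.
echo
echo "=== JIT Build Artifacts under TORCH_EXTENSIONS_DIR ==="
find "$TORCH_EXTENSIONS_DIR" -name 'build.ninja' 2>/dev/null || true
find "$TORCH_EXTENSIONS_DIR" -name '*.so' 2>/dev/null || true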

echo
echo "=== Testing with Minimal Sources ==="

# Create an even simpler, CPU-only version (a .cpp source, so no HIP/CUDA compiler is involved)
cat > minimal_kernel.cpp << 'EOF'
#include <torch/extension.h>

torch::Tensor dummy_function(torch::Tensor input) {
    return input.clone();
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("dummy", &dummy_function, "Dummy function");
}
EOF

cat > test_minimal.py << 'EOF'
import torch
from torch.utils.cpp_extension import load

print("=== Minimal Extension Test ===")

try:
    minimal_ext = load(
        name="minimal_test_ext",
        sources=["minimal_kernel.cu"],
        extra_cflags=["-O3"],
        verbose=True,
        with_cuda=False  # Skip CUDA/HIP compilation
    )
    print("βœ“ Minimal extension (CPU only) successful!")
except Exception as e:
    print(f"βœ— Even minimal extension failed: {e}")
EOF

echo "Testing minimal CPU-only extension..."
timeout 120 python3 test_minimal.py || echo "Minimal extension also failed/timed out"
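
# Optional cross-check: compile a bare HIP program with hipcc directly, with no
# torch headers involved, to separate a compiler/toolchain hang from a
# torch.utils.cpp_extension issue. Assumes hipcc is on PATH via the exports above
# and reuses TORCH_HIP_ARCH_LIST as the single offload arch.
echo
echo "=== Bare hipcc Smoke Test ==="
cat > hip_smoke.cpp << 'EOF'
#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void noop_kernel(int) {}

int main() {
    hipLaunchKernelGGL(noop_kernel, dim3(1), dim3(1), 0, 0, 0);
    printf("hipDeviceSynchronize returned %d\n", static_cast<int>(hipDeviceSynchronize()));
    return 0;
}
EOF

if command -v hipcc >/dev/null 2>&1; then
    timeout 120 hipcc --offload-arch="$TORCH_HIP_ARCH_LIST" hip_smoke.cpp -o hip_smoke \
        && echo "βœ“ Bare hipcc compile succeeded" \
        || echo "βœ— Bare hipcc compile failed or timed out"
else
    echo "⚠ hipcc not found on PATH, skipping bare compile check"
fi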

echo
echo "=== Debug Script 3 Complete ==="

# Cleanup
cd /
rm -rf /tmp/torch_ext_test