megablocks-hip / _dev /debug-build-3-torch-ext.sh

Add ROCm build debugging utilities

2d8a802 about 2 months ago

5.29 kB

	#!/usr/bin/env bash

	# Debug script 3: PyTorch C++ extension compilation test.

	# Strict mode: abort on a failing command, on use of an unset variable,
	# and when any stage of a pipeline fails.
	set -o errexit -o nounset -o pipefail

	# Opening banner: two title lines followed by one blank line.
	printf '%s\n' \
	  "=== PyTorch C++ Extension Debug Script 3 ===" \
	  "Testing PyTorch C++ extension compilation with HIP" \
	  ""

	# ROCm toolchain environment. Each variable below (except PATH) keeps a
	# value already present in the caller's environment and only falls back to
	# the default shown here.
	export ROCM_PATH="${ROCM_PATH:-/opt/rocm-7.0.1}"
	export ROCM_HOME="${ROCM_HOME:-$ROCM_PATH}"
	export HIP_PATH="${HIP_PATH:-$ROCM_PATH}"
	export HIP_HOME="${HIP_HOME:-$ROCM_PATH}"
	# PATH is always extended: the ROCm bin directory is searched first.
	export PATH="$ROCM_HOME/bin:$PATH"
	export TORCH_HIP_ARCH_LIST="${TORCH_HIP_ARCH_LIST:-gfx942}"
	# HSA_OVERRIDE_GFX_VERSION takes a dotted major.minor.stepping triple, not
	# an architecture name; 9.4.2 is the dotted spelling of gfx942.
	export HSA_OVERRIDE_GFX_VERSION="${HSA_OVERRIDE_GFX_VERSION:-9.4.2}"
	# Default build cache lives under the directory that is current at this
	# point of the script.
	export TORCH_EXTENSIONS_DIR="${TORCH_EXTENSIONS_DIR:-$PWD/.torch_extensions_debug}"

	# Create a private scratch directory for the generated sources and move
	# into it. mktemp picks an unpredictable name, so concurrent runs and
	# leftovers from earlier runs cannot collide; the EXIT trap removes the
	# directory on every exit path, including early aborts.
	work_dir="$(mktemp -d "${TMPDIR:-/tmp}/torch_ext_test.XXXXXX")" || {
	  echo "failed to create a temporary test directory" >&2
	  exit 1
	}
	# ${work_dir:?} makes the trap refuse to run rm with an empty path.
	trap 'cd / && rm -rf -- "${work_dir:?}"' EXIT
	cd "$work_dir" || exit 1

	echo "=== Creating Simple PyTorch Extension ==="

	# Write a minimal CUDA/HIP kernel similar to megablocks.
	# `<<-` strips leading tabs from the here-document body and from the closing
	# EOF line, so the delimiter is recognised even though this script is
	# tab-indented. The quoted 'EOF' disables shell expansion inside the C++
	# text. Indentation inside the body uses spaces because tabs are stripped.
	cat > simple_kernel.cu <<-'EOF'
	#include <torch/extension.h>
	#include <vector>

	// One launch macro for both back ends: hipLaunchKernelGGL on AMD,
	// the <<<...>>> launch syntax otherwise.
	#ifdef __HIP_PLATFORM_AMD__
	#include <hip/hip_runtime.h>
	#define CUDA_LAUNCH_KERNEL(kernel, grid, block, smem, stream, ...) \
	    hipLaunchKernelGGL(kernel, grid, block, smem, stream, __VA_ARGS__)
	#else
	#include <cuda_runtime.h>
	#define CUDA_LAUNCH_KERNEL(kernel, grid, block, smem, stream, ...) \
	    kernel<<<grid, block, smem, stream>>>(__VA_ARGS__)
	#endif

	// Element-wise c = a + b over the first n elements; one thread per element.
	__global__ void add_kernel(const float* a, const float* b, float* c, int n) {
	    int idx = blockIdx.x * blockDim.x + threadIdx.x;
	    if (idx < n) {
	        c[idx] = a[idx] + b[idx];
	    }
	}

	// Host wrapper: allocates the output like `a` and launches add_kernel.
	// NOTE(review): inputs are read through data_ptr<float>() without checking
	// device, dtype or layout - callers are expected to pass float GPU tensors.
	torch::Tensor add_tensors_cuda(torch::Tensor a, torch::Tensor b) {
	    auto c = torch::zeros_like(a);
	    int n = a.numel();

	    const int block_size = 256;
	    // Round up so the final, partially filled block is still launched.
	    const int grid_size = (n + block_size - 1) / block_size;

	    CUDA_LAUNCH_KERNEL(
	        add_kernel,
	        dim3(grid_size),
	        dim3(block_size),
	        0,
	        0,
	        a.data_ptr<float>(),
	        b.data_ptr<float>(),
	        c.data_ptr<float>(),
	        n
	    );

	    return c;
	}

	PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
	    m.def("add_tensors", &add_tensors_cuda, "Add two tensors (CUDA/HIP)");
	}
	EOF

	# Write the Python driver that JIT-builds simple_kernel.cu and runs it.
	# `<<-` strips leading tabs from the body and from the closing EOF line, so
	# the delimiter is recognised even though this script is tab-indented; the
	# Python indentation therefore has to be made of spaces. The quoted 'EOF'
	# keeps the shell from expanding anything inside the Python text.
	# The driver exits with status 1 on any failure so the caller can tell a
	# failed build from a successful one.
	cat > test_extension.py <<-'EOF'
	import os
	import sys
	import torch
	from torch.utils.cpp_extension import load

	print("=== PyTorch Extension Load Test ===")
	print(f"PyTorch version: {torch.__version__}")
	print(f"CUDA available: {torch.cuda.is_available()}")
	print(f"Device count: {torch.cuda.device_count()}")

	if hasattr(torch.version, 'hip') and torch.version.hip:
	    print(f"HIP version: {torch.version.hip}")

	print("\n=== Loading Extension ===")
	print("This may take a while and will show compilation output...")
	print("If this hangs, it indicates the same issue as build.py")

	try:
	    # Same sources and compiler flags as before. The is_python_module
	    # argument is left at its default (True): when it is disabled, load()
	    # returns no module object and the add_tensors call below cannot work.
	    simple_ext = load(
	        name="simple_test_ext",
	        sources=["simple_kernel.cu"],
	        extra_cflags=["-O3", "-std=c++17"],
	        extra_cuda_cflags=["-O3"],  # torch switches this to hipcc on ROCm
	        verbose=True,
	    )
	    print("✓ Extension compilation successful!")

	    # Test the extension
	    print("\n=== Testing Extension ===")
	    device = 'cuda' if torch.cuda.is_available() else 'cpu'
	    a = torch.randn(1000, device=device)
	    b = torch.randn(1000, device=device)

	    if device == 'cuda':
	        result = simple_ext.add_tensors(a, b)
	        expected = a + b
	        if torch.allclose(result, expected):
	            print("✓ Extension execution successful!")
	        else:
	            print("✗ Extension execution failed - results don't match")
	            # SystemExit is not an Exception subclass, so the handler
	            # below does not swallow it.
	            sys.exit(1)
	    else:
	        print("⚠ No CUDA device, skipping execution test")

	except Exception as e:
	    print(f"✗ Extension compilation/loading failed: {e}")
	    import traceback
	    traceback.print_exc()
	    sys.exit(1)
	EOF

	echo "=== Running PyTorch Extension Test ==="
	echo "This test mimics the same compilation process as build.py"
	echo "If this hangs, it shows the same issue as the main build"
	echo

	# Run the driver under a 5-minute limit so a hung compile cannot block the
	# script. The status is captured with `|| exit_code=$?`, which also keeps
	# `set -e` from aborting here; timeout(1) reports 124 when the limit was hit.
	exit_code=0
	timeout 300 python3 test_extension.py || exit_code=$?
	if (( exit_code == 124 )); then
	  echo "✗ Extension compilation timed out after 5 minutes (same as build.py hang)"
	elif (( exit_code != 0 )); then
	  echo "✗ Extension compilation failed with exit code $exit_code"
	fi

	echo
	echo "=== Testing with Minimal Sources ==="

	# An even simpler, host-only extension. The source is written as .cpp:
	# with with_cuda=False the file goes to the host C++ compiler, which does
	# not compile a .cu suffix as C++.
	# `<<-` strips leading tabs from each here-document body and its closing
	# EOF line, so the delimiters are recognised even though this script is
	# tab-indented; indentation inside the bodies uses spaces. The quoted 'EOF'
	# disables shell expansion inside the generated files.
	cat > minimal_kernel.cpp <<-'EOF'
	#include <torch/extension.h>

	// Returns a copy of the input; no device code involved.
	torch::Tensor dummy_function(torch::Tensor input) {
	    return input.clone();
	}

	PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
	    m.def("dummy", &dummy_function, "Dummy function");
	}
	EOF

	# Driver for the minimal build; exits with status 1 when the build or
	# import raises, so the caller can react to the failure.
	cat > test_minimal.py <<-'EOF'
	import sys
	import torch
	from torch.utils.cpp_extension import load

	print("=== Minimal Extension Test ===")

	try:
	    minimal_ext = load(
	        name="minimal_test_ext",
	        sources=["minimal_kernel.cpp"],
	        extra_cflags=["-O3"],
	        verbose=True,
	        with_cuda=False,  # Skip CUDA/HIP compilation
	    )
	    print("✓ Minimal extension (CPU only) successful!")
	except Exception as e:
	    print(f"✗ Even minimal extension failed: {e}")
	    sys.exit(1)
	EOF

	echo "Testing minimal CPU-only extension..."
	# 2-minute limit. Any non-zero status (driver failure or timeout) prints the
	# message below; the `||` branch also keeps `set -e` from aborting the script.
	timeout 120 python3 test_minimal.py || echo "Minimal extension also failed/timed out"

	# Closing banner: one blank line, then the completion message.
	printf '\n%s\n' "=== Debug Script 3 Complete ==="

	# Step out to / first, then delete /tmp/torch_ext_test and its contents.
	cd /
	rm -r -f /tmp/torch_ext_test