megablocks-hip / _dev /debug-build-1-env.sh
leonardlin's picture
Add ROCm build debugging utilities
2d8a802
#!/usr/bin/env bash
# Debug script 1: Basic ROCm environment and tool availability check
#
# Prints a banner, then exports the ROCm/HIP variables that the rest of the
# script reports on. Every variable keeps a value already present in the
# caller's environment; the defaults below apply only when it is unset or
# empty.
set -euo pipefail

echo "=== ROCm Environment Debug Script 1 ==="
echo "Testing basic ROCm/HIP environment setup"
echo

# Set ROCm environment variables.
# ROCM_PATH is the installation prefix; ROCM_HOME, HIP_PATH and HIP_HOME all
# default to it.
export ROCM_PATH="${ROCM_PATH:-/opt/rocm-7.0.1}"
export ROCM_HOME="${ROCM_HOME:-$ROCM_PATH}"
export HIP_PATH="${HIP_PATH:-$ROCM_PATH}"
export HIP_HOME="${HIP_HOME:-$ROCM_PATH}"

# Prepend the ROCm bin directory so its tools are found before any others.
export PATH="$ROCM_HOME/bin:$PATH"

export TORCH_HIP_ARCH_LIST="${TORCH_HIP_ARCH_LIST:-gfx942}"

# HSA_OVERRIDE_GFX_VERSION takes a dotted major.minor.stepping version, not a
# gfx target name: the gfx942 target is written 9.4.2. A value such as
# "gfx942" is not a valid setting for this variable.
export HSA_OVERRIDE_GFX_VERSION="${HSA_OVERRIDE_GFX_VERSION:-9.4.2}"
# Show the effective ROCm/HIP settings for this run.
echo "Environment Variables:"
echo "ROCM_PATH=$ROCM_PATH"
echo "ROCM_HOME=$ROCM_HOME"
echo "HIP_PATH=$HIP_PATH"
echo "HIP_HOME=$HIP_HOME"
echo "TORCH_HIP_ARCH_LIST=$TORCH_HIP_ARCH_LIST"
echo "HSA_OVERRIDE_GFX_VERSION=$HSA_OVERRIDE_GFX_VERSION"

# Prints, one per line, the PATH entries that contain "rocm".
# Returns non-zero when no entry matches (grep's exit status).
# PATH is expanded inside quotes so entries containing whitespace or glob
# characters are reported verbatim instead of being split or expanded.
rocm_path_entries() {
  printf '%s\n' "$PATH" | tr ':' '\n' | grep rocm
}

echo "PATH (ROCm portion): $(rocm_path_entries || echo 'No ROCm in PATH')"
echo
# Report whether the ROCm prefix and its standard sub-directories exist.
# Each spec is "<label>:<path suffix appended to ROCM_PATH>".
echo "=== Directory Checks ==="
for rocm_dir_spec in "installation:" "bin:/bin" "include:/include" "lib:/lib"; do
  rocm_dir_label=${rocm_dir_spec%%:*}
  rocm_dir_path="${ROCM_PATH}${rocm_dir_spec#*:}"
  if [[ -d "$rocm_dir_path" ]]; then
    rocm_dir_state='YES'
  else
    rocm_dir_state='NO'
  fi
  echo "ROCm ${rocm_dir_label} directory exists: ${rocm_dir_state}"
done
echo
# Report YES/NO for each ROCm command-line tool this script looks for.
echo "=== Tool Availability ==="

# Prints YES when $1 resolves to a command, NO otherwise.
# Uses the `command -v` builtin instead of the external `which`: `which` is
# not installed on every system, and when it is missing every lookup would
# fail and each tool would be reported as NO.
tool_status() {
  if command -v "$1" >/dev/null 2>&1; then
    echo 'YES'
  else
    echo 'NO'
  fi
}

for rocm_tool in hipcc hip-clang rocm-smi hipconfig; do
  echo "${rocm_tool} available: $(tool_status "$rocm_tool")"
done
echo
# Print version/configuration output for each ROCm tool that is installed.
echo "=== Tool Versions ==="

# Runs a tool and prints its output under a heading.
# Arguments:
#   $1   heading line printed before the tool output
#   $2   message printed if the tool exits non-zero
#   $3.. the command to run (tool name followed by its arguments)
# Prints nothing at all when the tool is not installed. Lookup uses the
# `command -v` builtin rather than the external `which`, which may be absent.
print_tool_info() {
  local heading=$1 failure_msg=$2
  shift 2
  command -v "$1" >/dev/null 2>&1 || return 0
  echo "$heading"
  # A failing tool must not abort the script under `set -e`; report instead.
  "$@" || echo "$failure_msg"
  echo
}

print_tool_info "hipcc version:" "Failed to get hipcc version" hipcc --version
print_tool_info "HIP config:" "Failed to get hipconfig" hipconfig --full
print_tool_info "ROCm SMI:" "Failed to get ROCm SMI info" rocm-smi --showproductname
# Report the Python interpreter and what the installed PyTorch says about
# itself.
echo "=== Python Environment ==="

# Runs the Python snippet in $1; prints the message in $2 if it exits non-zero.
run_python_probe() {
  python3 -c "$1" || echo "$2"
}

if ! python3 --version; then
  echo "Python3 not available"
fi
run_python_probe \
  "import torch; print(f'PyTorch version: {torch.__version__}')" \
  "PyTorch not available"
run_python_probe \
  "import torch; print(f'CUDA available: {torch.cuda.is_available()}')" \
  "Failed to check CUDA availability"
run_python_probe \
  "import torch; print(f'HIP available: {hasattr(torch.version, \"hip\") and torch.version.hip is not None}')" \
  "Failed to check HIP availability"
echo
# List the devices visible to the HIP/ROCm runtime, then print the footer.
echo "=== Basic HIP Device Check ==="

# Prints device information from the first available enumeration tool.
# Prefers hipinfo; when it is absent, falls back to rocminfo, the agent
# enumeration tool shipped with ROCm, so the section still reports devices
# on installs that do not provide hipinfo. Lookup uses the `command -v`
# builtin rather than the external `which`, which may not be installed.
check_hip_devices() {
  if command -v hipinfo >/dev/null 2>&1; then
    echo "HIP devices:"
    hipinfo || echo "hipinfo failed"
  elif command -v rocminfo >/dev/null 2>&1; then
    echo "hipinfo not available"
    echo "ROCm agents (rocminfo):"
    rocminfo || echo "rocminfo failed"
  else
    echo "hipinfo not available"
  fi
}

check_hip_devices
echo
echo "=== Debug Script 1 Complete ==="