megablocks-hip / build-strace.sh
leonardlin's picture
Add ROCm build debugging utilities
2d8a802
raw
history blame
1.67 kB
#!/usr/bin/env bash
set -euo pipefail

# Debug helper: runs the extension build under strace to find where it hangs.
# This part prepares the environment; it mirrors build-fixed.sh except that
# the MAX_JOBS limit is removed.
printf '%s\n' \
  "=== Build with strace debugging ===" \
  "This will trace system calls to identify where the build hangs"

# ROCm / HIP install locations. A value already present in the caller's
# environment wins; otherwise everything falls back to ROCM_PATH.
: "${ROCM_PATH:=/opt/rocm-7.0.1}"
: "${ROCM_HOME:=$ROCM_PATH}"
: "${HIP_PATH:=$ROCM_PATH}"
: "${HIP_HOME:=$ROCM_PATH}"
export ROCM_PATH ROCM_HOME HIP_PATH HIP_HOME

# Put the ROCm tools ahead of anything else on PATH.
PATH="$ROCM_HOME/bin:$PATH"
export PATH

# Pin the GPU architecture for both variables set by build-fixed.sh.
export TORCH_HIP_ARCH_LIST="gfx942" PYTORCH_ROCM_ARCH="gfx942"

# Drop the GFX version override, and drop the job limit so the parallel
# compilation hang can be observed.
unset HSA_OVERRIDE_GFX_VERSION MAX_JOBS

# PyTorch JIT logging plus the extension build directory (defaults to a
# directory under the current working directory).
export PYTORCH_JIT_LOG_LEVEL=1
: "${TORCH_EXTENSIONS_DIR:=$PWD/.torch_extensions}"
export TORCH_EXTENSIONS_DIR
# Show the effective settings and announce what is about to run, one
# argument per output line. MAX_JOBS prints as "unset" when it has no value.
printf '%s\n' \
  "Environment configured for strace:" \
  "ROCM_PATH=$ROCM_PATH" \
  "TORCH_HIP_ARCH_LIST=$TORCH_HIP_ARCH_LIST" \
  "PYTORCH_ROCM_ARCH=$PYTORCH_ROCM_ARCH" \
  "MAX_JOBS=${MAX_JOBS:-unset}" \
  "PYTORCH_JIT_LOG_LEVEL=$PYTORCH_JIT_LOG_LEVEL" \
  "" \
  "Starting build with strace..." \
  "Tracing process creation, signals, and file operations..." \
  "Output will be saved to strace.log"
# Run the build under strace so a hang can be located in strace.log.
#   -f                            follow child processes
#   -e trace=process,signal,file  trace process-management, signal and
#                                 file-name syscalls
#   -o strace.log                 save the trace to strace.log
#   -T                            show time spent in each syscall

# Fail early with a clear message instead of reporting a trace that was
# never written.
if ! command -v strace >/dev/null 2>&1; then
  echo "error: strace not found in PATH" >&2
  exit 127
fi

# A hung build is normally ended with Ctrl-C. Trapping SIGINT keeps this
# script running after the traced command stops, so the closing hints below
# are still printed.
trap 'echo "Interrupted; waiting for strace to finish..." >&2' INT

# Capture the status instead of letting `set -e` abort the script: a failing
# or interrupted build is the expected case here, and the user still needs to
# be pointed at strace.log.
build_status=0
strace -f -e trace=process,signal,file -o strace.log -T python -u build.py || build_status=$?

trap - INT

echo "Build completed or interrupted"
echo "Check strace.log for detailed system call trace"

# Propagate a failure to the caller, as the original `set -e` exit did.
if ((build_status != 0)); then
  echo "Build exited with status $build_status" >&2
  exit "$build_status"
fi