megablocks-hip / _dev /debug-build-all.sh
leonardlin's picture
Add ROCm build debugging utilities
2d8a802
#!/usr/bin/env bash
# Debug script: Run all debug tests in sequence
#
# Runs each stage script listed in `scripts` and collects one result line per
# stage in `results`; `start_time` marks the start of the whole suite.
set -euo pipefail

# The stage scripts are referenced by relative path (chmod below, ./<name>
# when they are executed), so work from the directory that contains this
# file instead of relying on the caller's working directory.
# CDPATH is cleared so a relative directory cannot be resolved elsewhere.
CDPATH= cd -- "$(dirname -- "${BASH_SOURCE[0]}")"

echo "=== MegaBlocks Build Debugging Suite ==="
echo "Running progressive debug tests to identify build hang issue"
echo "ROCm installation: /opt/rocm-7.0.1"
echo

# Stage scripts, in execution order. This is the single source of truth for
# both the chmod pass below and the run loop.
scripts=(
  "debug-build-1-env.sh"
  "debug-build-2-hipcc.sh"
  "debug-build-3-torch-ext.sh"
  "debug-build-4-megablocks.sh"
)

# Make all scripts executable. Files that are already executable are left
# alone, and a chmod failure only warns: under `set -e` an unguarded failure
# would abort the suite before any stage had a chance to run. A stage that
# is missing or not executable fails when it is launched instead.
for stage in "${scripts[@]}"; do
  if [[ ! -x "$stage" ]]; then
    chmod +x -- "$stage" \
      || echo "warning: could not make $stage executable" >&2
  fi
done

results=()
start_time=$(date +%s)
#######################################
# Run one debug stage script, time it, and record the outcome.
# Globals:   results (one summary line is appended)
# Arguments: $1 - script file name; executed as ./<name> from the current
#                 working directory
# Outputs:   stage banner and a pass/fail line on stdout
# Returns:   0 always, so a failing stage does not abort the remaining
#            stages when the caller runs under `set -e`
#######################################
run_debug_script() {
  local name=$1
  local started finished duration
  local status=0

  echo
  echo "========================================"
  echo "Running $name"
  echo "========================================"

  started=$(date +%s)
  # `|| status=$?` captures the stage's exit code without tripping errexit.
  ./"$name" || status=$?
  finished=$(date +%s)
  duration=$((finished - started))

  if ((status == 0)); then
    echo "✓ $name completed successfully in ${duration}s"
    results+=("✓ $name: SUCCESS (${duration}s)")
  else
    # The exit code is reported so the cause of the failure is not lost.
    echo "✗ $name failed in ${duration}s (exit code ${status})"
    results+=("✗ $name: FAILED (${duration}s, exit code ${status})")
  fi
  echo "----------------------------------------"
}

# Run every stage in order; a failed stage is recorded and the next one
# still runs.
for script in "${scripts[@]}"; do
  run_debug_script "$script"
done
# Wall-clock time for the whole suite, in whole seconds.
end_time=$(date +%s)
total_duration=$((end_time - start_time))

# Summary banner: a blank line above and below, as one printf call.
printf '%s\n' \
  "" \
  "========================================" \
  "SUMMARY REPORT" \
  "========================================" \
  "Total runtime: ${total_duration}s" \
  ""

# One line per recorded result, in the order the results were appended.
# The guard keeps printf from emitting a stray blank line when nothing
# was recorded.
if ((${#results[@]} > 0)); then
  printf '%s\n' "${results[@]}"
fi
#######################################
# Print the static guidance that follows the summary: how to interpret a
# failure or hang of each stage, suggested next steps, and extra commands.
# Globals:   none
# Arguments: none
# Outputs:   guidance text on stdout
#######################################
print_debug_guidance() {
  # Quoted delimiter: the text is emitted literally, so the pipes, globs
  # and `&&` in the suggested commands are printed rather than interpreted.
  # The stage names match the files that the suite actually runs.
  cat <<'EOF'

=== Analysis ===
1. If debug-build-1-env.sh fails: ROCm installation/environment issue
2. If debug-build-2-hipcc.sh fails: HIP compiler issue
3. If debug-build-3-torch-ext.sh hangs: PyTorch extension compilation issue
4. If debug-build-4-megablocks.sh hangs: MegaBlocks-specific compilation issue

=== Next Steps Based on Results ===
- If all pass: The issue may be intermittent or environment-specific
- If script 3 or 4 hangs: Run with strace to see where it hangs:
 strace -f -e trace=process,signal python3 build.py
- Check compilation log files in .torch_extensions for more details
- Consider using PYTORCH_JIT_LOG_LEVEL=1 for more verbose output

=== Additional Debugging Commands ===
# Check for stuck processes:
ps aux | grep -E '(hipcc|hip-clang|python)'

# Monitor system resources during build:
htop

# Check for device issues:
dmesg | tail -20

# Force clean rebuild:
rm -rf .torch_extensions* && ./build.sh

Debug suite complete.
EOF
}

print_debug_guidance