#!/usr/bin/env bash
#
# MegaBlocks build debugging suite.
#
# Runs the debug-build-*.sh scripts one after another, times each of them and
# prints a summary followed by troubleshooting hints. The debug scripts are
# referenced by relative path, so run this from the directory containing them.

# Abort on command failure, on use of an unset variable, and on a failure in
# any stage of a pipeline.
set -euo pipefail

echo "=== MegaBlocks Build Debugging Suite ==="
echo "Running progressive debug tests to identify build hang issue"
echo "ROCm installation: /opt/rocm-7.0.1"
echo

# The scripts are later executed as ./name, which needs the execute bit.
# Under `set -e` a missing script makes chmod fail and stops the suite here.
chmod +x debug-build-1-env.sh debug-build-2-hipcc.sh debug-build-3-torch-ext.sh debug-build-4-megablocks.sh

# Debug scripts to run, listed in execution order.
scripts=(
  "debug-build-1-env.sh"
  "debug-build-2-hipcc.sh"
  "debug-build-3-torch-ext.sh"
  "debug-build-4-megablocks.sh"
)

# One status line per executed script; printed again in the summary report.
results=()
# Suite start time in epoch seconds, used to compute the total runtime.
start_time=$(date +%s)

#######################################
# Run one debug script, time it, and record the outcome.
# Globals:   results (one status line is appended)
# Arguments: $1 - script file name; it is executed as ./$1
# Outputs:   a banner and a status line on stdout (the script's own output
#            goes wherever the script writes it)
# Returns:   0 always; a failing script is recorded, not propagated
#######################################
run_debug_script() {
  local name=$1
  local began finished elapsed

  echo
  echo "========================================"
  echo "Running $name"
  echo "========================================"

  began=$(date +%s)

  # Using the script as the `if` condition keeps `set -e` from aborting the
  # suite on a non-zero exit, so the remaining scripts still get their turn.
  if ./"$name"; then
    finished=$(date +%s)
    elapsed=$((finished - began))
    echo "✓ $name completed successfully in ${elapsed}s"
    results+=("✓ $name: SUCCESS (${elapsed}s)")
  else
    finished=$(date +%s)
    elapsed=$((finished - began))
    echo "✗ $name failed in ${elapsed}s"
    results+=("✗ $name: FAILED (${elapsed}s)")
  fi

  echo "----------------------------------------"
}

for script in "${scripts[@]}"; do
  run_debug_script "$script"
done

#######################################
# Print the summary report: total runtime followed by every status line.
# Arguments: $1  - suite start time in epoch seconds
#            $2… - status lines, printed one per line in the given order
# Outputs:   the report on stdout
#######################################
print_summary() {
  local started=$1
  shift
  local now elapsed_total line

  now=$(date +%s)
  elapsed_total=$((now - started))

  echo
  echo "========================================"
  echo "SUMMARY REPORT"
  echo "========================================"
  echo "Total runtime: ${elapsed_total}s"
  echo

  for line in "$@"; do
    echo "$line"
  done
}

print_summary "$start_time" "${results[@]}"

#######################################
# Print the static troubleshooting guide: how to read the results, suggested
# next steps, and a few commands to copy and paste.
# Outputs: the guide on stdout
#######################################
print_guidance() {
  # Quoted delimiter: the text is emitted literally, so the pipes, globs and
  # `&&` in the sample commands are shown to the user rather than executed.
  cat <<'EOF'

=== Analysis ===
1. If debug-build-1-env.sh fails: ROCm installation/environment issue
2. If debug-build-2-hipcc.sh fails: HIP compiler issue
3. If debug-build-3-torch-ext.sh hangs: PyTorch extension compilation issue
4. If debug-build-4-megablocks.sh hangs: MegaBlocks-specific compilation issue

=== Next Steps Based on Results ===
- If all pass: The issue may be intermittent or environment-specific
- If script 3 or 4 hangs: Run with strace to see where it hangs:
 strace -f -e trace=process,signal python3 build.py
- Check compilation log files in .torch_extensions for more details
- Consider using PYTORCH_JIT_LOG_LEVEL=1 for more verbose output

=== Additional Debugging Commands ===
# Check for stuck processes:
ps aux | grep -E '(hipcc|hip-clang|python)'

# Monitor system resources during build:
htop

# Check for device issues:
dmesg | tail -20

# Force clean rebuild:
rm -rf .torch_extensions* && ./build.sh

Debug suite complete.
EOF
}

print_guidance