#!/usr/bin/env bash
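#
# Build (if necessary) and test the MegaBlocks ROCm/HIP kernels: stage the
# compiled extension on PYTHONPATH, then run the pytest suites, scaling from
# single-GPU up to 2- and 8-GPU distributed cases when enough devices are visible.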
set -euo pipefail
KERNEL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
cd "$KERNEL_DIR"
export KERNEL_DIR
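
# detect_variant prints the name of the staged build directory under build/
# (a torch*-rocm64-* or torch*-cu* folder), preferring kernels.utils.build_variant
# and falling back to globbing the build tree.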
detect_variant() {
    python - <<'PY'
import os
import pathlib

root = pathlib.Path(os.environ["KERNEL_DIR"])
build_dir = root / "build"
variant = None

try:
    from kernels.utils import build_variant as _build_variant
except Exception:
    _build_variant = None

if _build_variant is not None:
    try:
        variant = _build_variant()
    except Exception:
        variant = None

if variant is None:
    # Path.glob() returns a (truthy) generator, so materialize each pattern
    # before falling back from the ROCm layout to the CUDA one.
    candidates = sorted(build_dir.glob("torch*-rocm64-*")) or sorted(build_dir.glob("torch*-cu*"))
    if candidates:
        variant = candidates[0].name

if variant is None:
    raise SystemExit("Could not determine MegaBlocks build variant. Run build.py first.")

print(variant)
PY
}
VARIANT=$(detect_variant)
STAGED_DIR="$KERNEL_DIR/build/$VARIANT"
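
# find_staged_lib echoes the path of the staged ROCm extension (.so) under the
# given build directory, checking both the flat and package-style layouts.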
find_staged_lib() {
    local base="$1"
    local candidates=(
        "$base/_megablocks_rocm.so"
        "$base/megablocks/_megablocks_rocm.so"
    )
    for path in "${candidates[@]}"; do
        if [[ -f "$path" ]]; then
            echo "$path"
            return 0
        fi
    done
    return 1
}
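
# Locate the staged extension; if it is missing, rebuild once via build.py and retry.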
STAGED_LIB=$(find_staged_lib "$STAGED_DIR") || true
if [[ -z "${STAGED_LIB:-}" ]]; then
    echo "Staged ROCm extension not found under $STAGED_DIR; rebuilding kernels..."
    python build.py
    VARIANT=$(detect_variant)
    STAGED_DIR="$KERNEL_DIR/build/$VARIANT"
    STAGED_LIB=$(find_staged_lib "$STAGED_DIR") || true
    if [[ -z "${STAGED_LIB:-}" ]]; then
        echo "ERROR: build.py completed but no extension was found under $STAGED_DIR" >&2
        exit 1
    fi
fi
export PYTHONPATH="$STAGED_DIR:${PYTHONPATH:-}"
echo "Using MegaBlocks build variant: $VARIANT"
declare -i GPU_COUNT
GPU_COUNT=$(python - <<'PY'
import torch
print(torch.cuda.device_count() if torch.cuda.is_available() else 0)
PY
)
if (( GPU_COUNT == 0 )); then
    echo "ERROR: No HIP/CUDA GPUs detected. Tests require at least one visible accelerator." >&2
    exit 1
fi
echo "Detected $GPU_COUNT visible GPU(s)."
log() {
    echo
    echo "==> $1"
}

run_pytest() {
    local label="$1"
    shift
    log "$label"
    set -x
    "$@"
    { set +x; } 2>/dev/null || true
}
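
# Device/environment presets for single-, two-, and eight-GPU runs; seq builds
# the comma-separated device list (0-7) for the eight-GPU case.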
SINGLE_GPU_ENV=(HIP_VISIBLE_DEVICES=0 CUDA_VISIBLE_DEVICES=0 WORLD_SIZE=1)
MULTI2_GPU_ENV=(HIP_VISIBLE_DEVICES=0,1 CUDA_VISIBLE_DEVICES=0,1 WORLD_SIZE=2)
MULTI8_GPU_ENV=(HIP_VISIBLE_DEVICES=$(seq -s, 0 7) CUDA_VISIBLE_DEVICES=$(seq -s, 0 7) WORLD_SIZE=8)
SINGLE_TESTS=(
    "test_mb_moe.py"
    "test_mb_moe_shared_expert.py"
    "layer_test.py"
    "test_gg.py"
    "ops_test.py"
)
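
# Run each single-GPU test file on device 0.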
for test in "${SINGLE_TESTS[@]}"; do
    run_pytest "Single-GPU pytest ${test}" env "${SINGLE_GPU_ENV[@]}" python -m pytest "tests/${test}" -q
done
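
# Two-GPU distributed smoke test; skipped when fewer than two GPUs are visible.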
if (( GPU_COUNT >= 2 )); then
    run_pytest "Distributed layer smoke (2 GPUs)" env "${MULTI2_GPU_ENV[@]}" python -m pytest "tests/parallel_layer_test.py::test_megablocks_moe_mlp_functionality" -q
else
    log "Skipping 2-GPU distributed layer test (requires >=2 GPUs, detected ${GPU_COUNT})."
fi
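
# Shared-expert distributed tests: the world_size=1 cases always run; the
# world_size=8 cases run only when eight GPUs are visible.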
run_pytest "Shared expert functionality (world_size=1)" env "${SINGLE_GPU_ENV[@]}" python -m pytest 'tests/test_mb_moe_shared_expert_multi.py::test_shared_expert_distributed_functionality[1]' -q
run_pytest "Shared expert weighted sum (world_size=1)" env "${SINGLE_GPU_ENV[@]}" python -m pytest 'tests/test_mb_moe_shared_expert_multi.py::test_shared_expert_distributed_weighted_sum[1]' -q
if (( GPU_COUNT >= 8 )); then
    run_pytest "Shared expert functionality (world_size=8)" env "${MULTI8_GPU_ENV[@]}" python -m pytest 'tests/test_mb_moe_shared_expert_multi.py::test_shared_expert_distributed_functionality[8]' -q
    run_pytest "Shared expert weighted sum (world_size=8)" env "${MULTI8_GPU_ENV[@]}" python -m pytest 'tests/test_mb_moe_shared_expert_multi.py::test_shared_expert_distributed_weighted_sum[8]' -q
else
    log "Skipping 8-GPU shared expert tests (requires >=8 GPUs, detected ${GPU_COUNT})."
fi
echo
echo "All requested tests completed."