#!/usr/bin/env bash
set -euo pipefail

echo "================= RUNTIME CAPABILITIES ================="
date
echo

if command -v nvidia-smi >/dev/null 2>&1; then
  nvidia-smi
else
  echo "nvidia-smi: not available"
fi
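# Optional, hedged extra: GPU interconnect topology. `nvidia-smi topo -m` is a
# standard nvidia-smi subcommand, but it may be restricted inside some containers.
if command -v nvidia-smi >/dev/null 2>&1; then
  nvidia-smi topo -m || true
fi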
echo

echo "CUDA_HOME: ${CUDA_HOME:-/usr/local/cuda}"
if command -v nvcc >/dev/null 2>&1; then
  nvcc --version || true
else
  echo "nvcc: not available"
fi
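# Optional, hedged extra: look for cuDNN/NCCL shared libraries in the linker cache.
# Best effort only; ldconfig may not see libraries shipped inside the Python env.
if command -v ldconfig >/dev/null 2>&1; then
  ldconfig -p | grep -E 'libcudnn|libnccl' || echo "libcudnn/libnccl: not in ldconfig cache"
fi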
echo

echo "[PyTorch / CUDA backend]"
python3 - <<'PY'
import json, os, torch, inspect

# Coerce a backend attribute to a bool: call zero-arg getters, report setters
# (e.g. enable_flash_sdp) as present via True, and cast plain flags directly.
def to_bool(x):
    try:
        if callable(x):
            try:
                sig = inspect.signature(x)
                if len(sig.parameters) == 0:
                    return bool(x())
            except Exception:
                pass
            return True
        return bool(x)
    except Exception:
        return None

info = {
    "torch": getattr(torch, "__version__", None),
    "cuda_available": torch.cuda.is_available(),
    "cuda_device_count": torch.cuda.device_count(),
    "cuda_runtime_version": getattr(torch.version, "cuda", None),
    "cudnn_version": torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else None,
    "tf32": (torch.backends.cuda.matmul.allow_tf32 if torch.cuda.is_available() else None),
    "flash_sdp": (to_bool(getattr(torch.backends.cuda, "enable_flash_sdp", None)) if torch.cuda.is_available() else None),
    "mem_efficient_sdp": (to_bool(getattr(torch.backends.cuda, "enable_mem_efficient_sdp", None)) if torch.cuda.is_available() else None),
    "math_sdp": (to_bool(getattr(torch.backends.cuda, "enable_math_sdp", None)) if torch.cuda.is_available() else None),
}
print(json.dumps(info, indent=2))

for i in range(min(torch.cuda.device_count(), 16)):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
PY
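# Optional, hedged sketch: PyTorch >= 2.0 exposes query functions for the SDP
# backends; this reports which backends are currently enabled rather than merely
# whether the setter attributes exist.
python3 - <<'PY'
import torch
if torch.cuda.is_available() and hasattr(torch.backends.cuda, "flash_sdp_enabled"):
    print("flash_sdp_enabled:", torch.backends.cuda.flash_sdp_enabled())
    print("mem_efficient_sdp_enabled:", torch.backends.cuda.mem_efficient_sdp_enabled())
    print("math_sdp_enabled:", torch.backends.cuda.math_sdp_enabled())
else:
    print("SDP backend query functions: not available (CPU-only or older torch)")
PY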
echo

echo "[Apex (FusedLayerNorm/RMSNorm)]"
python3 - <<'PY'
try:
    from apex.normalization import FusedLayerNorm, FusedRMSNorm
    import importlib; importlib.import_module("fused_layer_norm_cuda")
    print("apex.normalization: OK")
except Exception as e:
    print("apex.normalization: FAIL ->", e)
PY
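# Optional, hedged smoke test: run FusedRMSNorm on a tiny tensor when a GPU is
# present, to confirm the CUDA extension actually executes (not just imports).
python3 - <<'PY'
try:
    import torch
    from apex.normalization import FusedRMSNorm
    if torch.cuda.is_available():
        m = FusedRMSNorm(16).cuda()
        y = m(torch.randn(2, 16, device="cuda"))
        print("FusedRMSNorm forward: OK", tuple(y.shape))
    else:
        print("FusedRMSNorm forward: skipped (no CUDA device)")
except Exception as e:
    print("FusedRMSNorm forward: FAIL ->", e)
PY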
echo

echo "[FlashAttention (CUDA/Triton/RMSNorm)]"
python3 - <<'PY'
import importlib

mods = [
    'flash_attn', 'flash_attn_2_cuda',
    'flash_attn.ops.rms_norm', 'flash_attn.ops.layer_norm',
    'flash_attn.layers.layer_norm',
]
for m in mods:
    try:
        importlib.import_module(m)
        print(f"{m}: OK")
    except Exception as e:
        print(f"{m}: FAIL -> {e}")
PY
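# Optional, hedged smoke test: one flash_attn_func forward pass on a tiny fp16
# batch. Only meaningful on Ampere-or-newer GPUs (compute capability >= 8.0);
# shapes follow the flash-attn 2.x API: (batch, seqlen, nheads, headdim).
python3 - <<'PY'
try:
    import torch
    from flash_attn import flash_attn_func
    if torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8:
        q = torch.randn(1, 8, 4, 64, device="cuda", dtype=torch.float16)
        out = flash_attn_func(q, q, q)
        print("flash_attn_func forward: OK", tuple(out.shape))
    else:
        print("flash_attn_func forward: skipped (needs a CUDA device with sm80+)")
except Exception as e:
    print("flash_attn_func forward: FAIL ->", e)
PY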
echo

echo "[FlashAttention version/details]"
python3 - <<'PY'
try:
    import flash_attn
    fa_ver = getattr(flash_attn, "__version__", None)
    print(f"flash_attn: {fa_ver}")
except Exception:
    print("flash_attn: not importable.")

try:
    import torch
    print(f"torch: {torch.__version__} | cuda: {getattr(torch.version, 'cuda', None)}")
except Exception:
    pass
PY
echo

echo "[Triton]"
python3 - <<'PY'
try:
    import triton
    print("triton:", triton.__version__)
    try:
        import triton.ops as _; print("triton.ops: OK")
    except Exception:
        print("triton.ops: not present (ok on Triton >= 3.x)")
except Exception as e:
    print("triton: FAIL ->", e)
PY
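# Optional, hedged smoke test: JIT-compile and launch a trivial Triton kernel to
# confirm the compiler toolchain works end to end. CUDA-only; skipped otherwise.
python3 - <<'PY'
try:
    import torch, triton
    import triton.language as tl

    @triton.jit
    def copy_kernel(src_ptr, dst_ptr, n, BLOCK: tl.constexpr):
        offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
        mask = offs < n
        tl.store(dst_ptr + offs, tl.load(src_ptr + offs, mask=mask), mask=mask)

    if torch.cuda.is_available():
        x = torch.arange(256, device="cuda", dtype=torch.float32)
        y = torch.empty_like(x)
        copy_kernel[(1,)](x, y, x.numel(), BLOCK=256)
        print("triton JIT launch: OK" if torch.equal(x, y) else "triton JIT launch: MISMATCH")
    else:
        print("triton JIT launch: skipped (no CUDA device)")
except Exception as e:
    print("triton JIT launch: FAIL ->", e)
PY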
echo

echo "[BitsAndBytes (Q8/Q4)]"
python3 - <<'PY'
try:
    import bitsandbytes as bnb
    print("bitsandbytes:", bnb.__version__)
    try:
        from bitsandbytes.triton import _custom_ops as _; print("bnb.triton._custom_ops: OK")
    except Exception as e:
        print("bnb.triton: partial ->", e)
except Exception as e:
    print("bitsandbytes: FAIL ->", e)
PY
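# Optional, hedged smoke test: build an int8 linear layer and run one forward pass.
# Assumes a CUDA device and a bitsandbytes build with GPU kernels; any failure is
# printed instead of stopping the script.
python3 - <<'PY'
try:
    import torch
    import bitsandbytes as bnb
    if torch.cuda.is_available():
        layer = bnb.nn.Linear8bitLt(16, 16, has_fp16_weights=False).cuda()
        out = layer(torch.randn(2, 16, device="cuda", dtype=torch.float16))
        print("Linear8bitLt forward: OK", tuple(out.shape))
    else:
        print("Linear8bitLt forward: skipped (no CUDA device)")
except Exception as e:
    print("Linear8bitLt forward: FAIL ->", e)
PY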
echo

echo "[Transformers / Diffusers / XFormers / EcoML]"
python3 - <<'PY'
def _v(m):
    try:
        mod = __import__(m)
        print(f"{m}: {getattr(mod, '__version__', 'unknown')}")
    except Exception as e:
        print(f"{m}: FAIL -> {e}")

for m in ("transformers", "diffusers", "xformers", "ecuml", "mlx", "ecobase"):
    _v(m)
PY
echo

echo "[Distributed / NCCL Env]"
# "|| true" keeps an empty match from aborting the script under "set -euo pipefail".
env | grep -E '^(CUDA_VISIBLE_DEVICES|NCCL_|TORCH_|ENABLE_.*SDP|HF_HUB_.*|CUDA_|NV_.*NCCL.*|PYTORCH_CUDA_ALLOC_CONF)=' | sort || true
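# Optional, hedged extra: report the NCCL version bundled with PyTorch, if any.
# torch.cuda.nccl.version() is a public helper but may be unusable on CPU-only builds.
python3 - <<'PY'
try:
    import torch
    print("torch NCCL:", torch.cuda.nccl.version())
except Exception as e:
    print("torch NCCL: unavailable ->", e)
PY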
echo

echo "[Output dir/perms]"
OUT="/app/outputs"
echo "OUT dir: $OUT"
mkdir -p "$OUT"
ls -la "$OUT" || true
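# Optional: verify the outputs directory is actually writable, not just present.
if touch "$OUT/.write_test" 2>/dev/null; then
  rm -f "$OUT/.write_test"
  echo "OUT writable: yes"
else
  echo "OUT writable: NO"
fi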

echo "================= END CAPABILITIES ================="