File size: 6,957 Bytes
7b27e12 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 |
#!/usr/bin/env bash
# Container entrypoint, part 1: environment defaults, working directories,
# packaging toolchain bootstrap and detection of the Python/CUDA wheel tags.
set -euo pipefail
echo "🚀 Start (wheels + app)"

# ===== Config and directories =====
# Each setting keeps a value already present in the environment and only
# falls back to the default when it is unset or empty.
: "${SELF_HF_REPO_ID:=XCarleX/Apex-l40s}"
: "${HF_HOME:=/app/model_cache}"
: "${HF_HUB_CACHE:=$HF_HOME/hub}"
: "${TORCH_HOME:=$HF_HOME/torch}"
: "${TORCH_CUDA_ARCH_LIST:=8.9}"
: "${HF_HUB_ENABLE_HF_TRANSFER:=1}"
export SELF_HF_REPO_ID HF_HOME HF_HUB_CACHE TORCH_HOME
export TORCH_CUDA_ARCH_LIST HF_HUB_ENABLE_HF_TRANSFER
export PATH="$HOME/.local/bin:$PATH"

_startup_dirs=(
  /app/wheels
  /app/cuda_cache
  "$HF_HOME"
  "$TORCH_HOME"
  /app/wheels/src
  /app/whells
)
mkdir -p "${_startup_dirs[@]}"
# Permission changes are best-effort; a failure here must not stop startup.
chmod -R 777 /app/wheels /app/whells || true
export CUDA_CACHE_PATH="/app/cuda_cache"

# Copy the NGC license if present (it is not sent to the app logs).
if [[ -f /NGC-DL-CONTAINER-LICENSE ]]; then
  cp -f /NGC-DL-CONTAINER-LICENSE /app/wheels/NGC-DL-CONTAINER-LICENSE || true
fi

# ===== Minimal dependencies (pip and Hub) =====
_bootstrap_pkgs=(
  pip build setuptools wheel hatchling hatch-vcs
  scikit-build-core cmake ninja packaging
)
python -m pip install -q -U "${_bootstrap_pkgs[@]}"
# Prefer the hf_transfer extra; fall back to the plain client if that fails.
if ! python -m pip install -q -U "huggingface_hub[hf_transfer]"; then
  python -m pip install -q -U huggingface_hub
fi

# ===== Useful tags (Python/CUDA) =====
# A failing probe is tolerated on purpose: both tags are then left empty.
_tag_line="$(python - <<'PY'
import sys
import torch

major, minor = sys.version_info.major, sys.version_info.minor
cuda_digits = (torch.version.cuda or "0").replace(".", "")
print(f"cp{major}{minor}", "cu" + cuda_digits)
PY
)" || true
read -r PY_TAG CU_TAG <<<"${_tag_line}"
echo "[env] Python tag=${PY_TAG} | CUDA tag=${CU_TAG}"
# ===== Checks (return 0/1 without aborting) =====
# Probe whether Apex can be imported together with its compiled extension.
# Arguments: none
# Outputs:   one status line on stdout; the probe's own output is discarded.
# Returns:   0 when the import succeeds, 1 otherwise.
check_apex() {
  # The probe runs as the `if` condition so that a failing import can never
  # trip `set -e`, even when this function is called outside a conditional.
  if python - <<'PY' >/dev/null 2>&1
from importlib import import_module
from apex.normalization import FusedLayerNorm  # requires the CUDA extension
import_module("fused_layer_norm_cuda")
PY
  then
    echo "[apex] import OK"
    return 0
  fi
  echo "[apex] import falhou (vai tentar Hub/Build)"
  return 1
}
# Probe whether the flash_attn module can be imported.
# Arguments: none
# Outputs:   one status line on stdout; the probe's own output is discarded.
# Returns:   0 when the import succeeds, 1 otherwise.
check_flashattn() {
  # The probe runs as the `if` condition so that a failing import can never
  # trip `set -e`, even when this function is called outside a conditional.
  if python - <<'PY' >/dev/null 2>&1
import flash_attn
PY
  then
    echo "[flash_attn] import OK"
    return 0
  fi
  echo "[flash_attn] import falhou (vai tentar Hub/Build)"
  return 1
}
# ===== Hub download (without using an internal cache across restarts) =====
# Look for a prebuilt wheel of a package in the Hub repo and download it
# into /app/wheels.
# Globals:   SELF_HF_REPO_ID, PY_TAG, CU_TAG (read)
# Arguments: $1 - distribution name the wheel file name must start with
# Outputs:   stdout - only when a wheel was downloaded: the repo file name,
#            then the local path on the last line. Nothing otherwise.
#            stderr - the progress message.
# Returns:   0 when the lookup ran (with or without a match),
#            1 when the Python helper failed (e.g. the download failed).
install_from_hf () {
  local hub_out
  # Progress goes to stderr: callers capture stdout to obtain the wheel path,
  # so a log line there would be mistaken for a download result.
  echo "[hub] Verificando wheel de $1 no repositório ${SELF_HF_REPO_ID}" >&2
  # Buffer the helper's output so a failure half-way (file name printed, then
  # download error) does not leak a partial result. `return` rather than
  # `exit` keeps a direct call from terminating the whole script.
  hub_out="$(python - "$1" "$PY_TAG" "$CU_TAG" 2>/dev/null <<'PY'
import os
import sys

from huggingface_hub import HfApi, hf_hub_download

pkg, py_tag, cu_tag = sys.argv[1], sys.argv[2], sys.argv[3]
repo = os.environ.get("SELF_HF_REPO_ID", "XCarleX/Apex-l40s")
# token=None lets HfApi fall back to the locally saved token, which avoids
# importing the deprecated HfFolder helper.
api = HfApi(token=os.getenv("HF_TOKEN") or None)
try:
    files = api.list_repo_files(repo_id=repo, repo_type="model")
except Exception:
    # Repo unreachable or unreadable: report "nothing found", not an error.
    raise SystemExit(0)
cands = [
    f for f in files
    if f.endswith(".whl")
    and f.rsplit("/", 1)[-1].startswith(pkg + "-")
    and py_tag in f
]
# Prefer wheels that mention the CUDA tag; otherwise accept any candidate.
pref = [f for f in cands if cu_tag and cu_tag in f] or cands
if not pref:
    raise SystemExit(0)
# Plain lexicographic ordering; the last name alphabetically wins.
target = sorted(pref, reverse=True)[0]
print(target)
path = hf_hub_download(repo_id=repo, filename=target, repo_type="model", local_dir="/app/wheels")
print(path)
PY
)" || return 1
  if [ -n "${hub_out}" ]; then
    printf '%s\n' "${hub_out}"
  fi
  return 0
}
# ===== Builders (keep the flow of the original) =====
# Build Apex from its git source into a wheel under /app/wheels and install it.
# Globals:   exports APEX_CPP_EXT, APEX_CUDA_EXT, APEX_ALL_CONTRIB_EXT
#            (presumably read by Apex's build; confirm against its setup).
# Arguments: none
# Outputs:   stdout - exactly one line: the newest /app/wheels/apex-*.whl,
#            or an empty line when no such wheel exists.
#            stderr - every progress message plus git/pip output.
# Returns:   status of the final echo; build/install steps are best-effort.
build_apex () {
  local SRC="/app/wheels/src/apex"
  local W=""
  local whl
  # Callers capture stdout to learn the wheel path, so the whole build phase
  # is redirected to stderr and only the final path is written to stdout.
  {
    echo "[build] Preparando fonte Apex em ${SRC}"
    if [ -d "$SRC/.git" ]; then
      git -C "$SRC" fetch --all -p || true
      # NOTE(review): this resets to the current HEAD, not to what was just
      # fetched, so an existing checkout is cleaned but not advanced.
      git -C "$SRC" reset --hard HEAD || true
      git -C "$SRC" clean -fdx || true
    else
      rm -rf "$SRC"
      git clone --depth 1 https://github.com/NVIDIA/apex "$SRC"
    fi
    echo "[build] Compilando Apex -> wheel"
    export APEX_CPP_EXT=1 APEX_CUDA_EXT=1 APEX_ALL_CONTRIB_EXT=0
    python -m pip wheel --no-build-isolation --no-deps "$SRC" -w /app/wheels -q || true
    # Pick the most recently modified apex wheel without parsing `ls`.
    # NOTE(review): after a failed build this can be an older wheel that was
    # already in /app/wheels.
    for whl in /app/wheels/apex-*.whl; do
      [[ -e "${whl}" ]] || continue  # an unmatched glob stays literal
      if [[ -z "${W}" || "${whl}" -nt "${W}" ]]; then
        W="${whl}"
      fi
    done
    if [ -n "${W}" ]; then
      python -m pip install -U --no-deps "${W}" -q || true
      echo "[build] Apex instalado da wheel recém-compilada: ${W}"
    else
      echo "[build] Nenhuma wheel Apex gerada; instalando do source (pode falhar)"
      python -m pip install -q --no-build-isolation "$SRC" || true
    fi
  } >&2
  echo "${W:-}"
}
FLASH_ATTN_VERSION="${FLASH_ATTN_VERSION:-2.7.4.post1}"
build_flashattn () {
echo "[build] Compilando flash-attn==${FLASH_ATTN_VERSION} -> wheel"
python -m pip wheel --no-build-isolation --no-deps --no-binary :all: "flash-attn==${FLASH_ATTN_VERSION}" -w /app/wheels -q || true
local W="$(ls -t /app/wheels/flash_attn-${FLASH_ATTN_VERSION}-*.whl 2>/dev/null | head -n1 || true)"
if [ -n "${W}" ]; then
python -m pip install -U --no-deps "${W}" -q || true
echo "[build] flash-attn instalado da wheel recém-compilada: ${W}"
else
echo "[build] Nenhuma wheel flash-attn gerada; instalação não aplicada"
fi
echo "${W:-}"
}
# ===== Pipeline with decision logs; does not stop on failure (retries the build in background) =====
# Make a package importable: import check -> Hub wheel -> local build ->
# background rebuild as a last resort.
# Globals:   SELF_HF_REPO_ID (read)
# Arguments: $1 - package name passed to install_from_hf and used in logs
#            $2 - name of the check function (returns 0 when importable)
#            $3 - name of the build function (prints the wheel path on stdout)
# Outputs:   decision log on stdout.
# Returns:   0 once the check passes, 1 if it still fails after the build.
ensure_pkg () {
  local PKG="$1"
  local CHECK_FN="$2"
  local BUILD_FN="$3"
  local WHEEL_PATH=""
  local HF_OUT=""
  echo "[flow] === ${PKG} ==="
  if "${CHECK_FN}"; then
    echo "[flow] ${PKG}: já instalado (import OK)"
    return 0
  fi
  echo "[flow] ${PKG}: procurando wheel no Hub (${SELF_HF_REPO_ID})"
  HF_OUT="$(install_from_hf "$PKG" || true)"
  WHEEL_PATH="$(printf '%s\n' "${HF_OUT}" | tail -n1)"
  # Accept the Hub result only if its last line names an existing wheel file;
  # anything else (empty output, a stray log line) means "nothing downloaded".
  if [[ "${WHEEL_PATH}" == *.whl && -f "${WHEEL_PATH}" ]]; then
    echo "[hub] Baixado: ${WHEEL_PATH}"
    python -m pip install -U --no-deps "${WHEEL_PATH}" -q || true
    if "${CHECK_FN}"; then
      echo "[flow] ${PKG}: sucesso via Hub (${WHEEL_PATH})"
      return 0
    else
      echo "[flow] ${PKG}: import falhou após instalar wheel do Hub; seguirá para build"
    fi
  else
    echo "[hub] Nenhuma wheel compatível encontrada para ${PKG}"
  fi
  echo "[flow] ${PKG}: compilando (fallback)"
  # The wheel path is the builder's last stdout line; earlier lines, if any,
  # are build chatter and must not end up in the path.
  WHEEL_PATH="$("${BUILD_FN}" | tail -n1)" || true
  if "${CHECK_FN}"; then
    echo "[flow] ${PKG}: sucesso após compilação ${WHEEL_PATH:-'(instalação direta)'}"
    return 0
  fi
  echo "[flow] ${PKG}: falha após tentativa de build; agendando recompilação em segundo plano e seguindo adiante"
  # Run the rebuild in a background subshell of this script: a fresh
  # `bash -lc` would know neither the builder function nor the unexported
  # variables it reads. Ignoring HUP stands in for `nohup`.
  (
    trap '' HUP
    "${BUILD_FN}"
  ) </dev/null >"/app/wheels/build_${PKG}.log" 2>&1 &
  disown || true
  return 1
}
# ===== Apex and flash-attn =====
# Neither package is allowed to block startup; failures are logged by
# ensure_pkg and ignored here.
ensure_pkg "apex" check_apex build_apex || true
ensure_pkg "flash_attn" check_flashattn build_flashattn || true
# ===== Final upload (wheels and license only, optional) =====
if [ "${HF_UPLOAD_WHEELS:-0}" = "1" ]; then
  # The upload is optional, so a failure (missing token, network error) is
  # reported and skipped. Without this guard `set -e` would abort the script
  # here and the app would never be launched.
  if ! python - <<'PY'
import os

from huggingface_hub import HfApi

try:
    from huggingface_hub import get_token
except ImportError:  # older huggingface_hub releases
    from huggingface_hub import HfFolder
    get_token = HfFolder.get_token

repo = os.environ.get("SELF_HF_REPO_ID", "XCarleX/Apex-l40s")
token = os.getenv("HF_TOKEN") or get_token()
if not token:
    # SystemExit with a message prints it to stderr and exits with status 1.
    raise SystemExit("HF_TOKEN ausente; upload desabilitado")
api = HfApi(token=token)
api.upload_folder(
    folder_path="/app/wheels",
    repo_id=repo,
    repo_type="model",
    allow_patterns=["*.whl", "NGC-DL-CONTAINER-LICENSE"],
    ignore_patterns=["**/src/**", "**/*.log", "**/logs/**", ".git/**"],
)
print("Upload concluído (wheels + licença).")
PY
  then
    echo "⚠️ Upload falhou; seguindo para o app" >&2
  fi
else
  echo "ℹ️ Upload desabilitado (defina HF_UPLOAD_WHEELS=1)"
fi
# Final permissions
chmod -R 777 /app/wheels /app/whells || true
chmod +x ./run.sh
# Hand over to the application; its exit status becomes the script's status.
./run.sh
|