File size: 6,957 Bytes
7b27e12 |
|
#!/usr/bin/env bash
# Entrypoint: prepares cache/wheel directories and the Python build tooling
# used by the rest of this script.
set -euo pipefail
echo "🚀 Start (wheels + app)"

# ===== Configuration and directories =====
# Every value below can be overridden from the environment.
export SELF_HF_REPO_ID="${SELF_HF_REPO_ID:-XCarleX/Apex-l40s}"
export HF_HOME="${HF_HOME:-/app/model_cache}"
export HF_HUB_CACHE="${HF_HUB_CACHE:-$HF_HOME/hub}"
export TORCH_HOME="${TORCH_HOME:-$HF_HOME/torch}"
# NOTE(review): 8.9 presumably targets the L40S-class GPU named in the default
# repo id — confirm before reusing this image on other hardware.
export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-8.9}"
export HF_HUB_ENABLE_HF_TRANSFER="${HF_HUB_ENABLE_HF_TRANSFER:-1}"
export PATH="$HOME/.local/bin:$PATH"

# NOTE(review): "/app/whells" looks like a typo for "/app/wheels"; it is kept
# because other parts of the image may already reference it — confirm.
mkdir -p /app/wheels /app/cuda_cache "$HF_HOME" "$TORCH_HOME" /app/wheels/src /app/whells
# Best effort: permission changes may fail on read-only or foreign-owned mounts.
chmod -R 777 /app/wheels /app/whells || true
export CUDA_CACHE_PATH="/app/cuda_cache"

# Copy the NGC license if present (it is not sent to the app logs).
if [ -f "/NGC-DL-CONTAINER-LICENSE" ]; then
  cp -f /NGC-DL-CONTAINER-LICENSE /app/wheels/NGC-DL-CONTAINER-LICENSE || true
fi

# ===== Minimal dependencies (pip and Hub) =====
python -m pip install -q -U pip build setuptools wheel hatchling hatch-vcs scikit-build-core cmake ninja packaging
python -m pip install -q -U "huggingface_hub[hf_transfer]" || python -m pip install -q -U huggingface_hub

# huggingface_hub raises on download when HF_HUB_ENABLE_HF_TRANSFER is on but
# the hf_transfer module is missing (e.g. after the fallback install above),
# so switch the flag off whenever the module cannot be imported.
if ! python -c 'import hf_transfer' >/dev/null 2>&1; then
  echo "[env] hf_transfer indisponível; usando HF_HUB_ENABLE_HF_TRANSFER=0" >&2
  export HF_HUB_ENABLE_HF_TRANSFER=0
fi
# ===== Useful tags (Python/CUDA) =====
# PY_TAG: CPython wheel tag built from the interpreter version (e.g. cp310).
# CU_TAG: "cu" + torch's CUDA version without dots (e.g. cu124); "cu0" when
#         torch reports no CUDA version or cannot be imported.
# A torch import failure is handled inside Python so PY_TAG is still produced:
# an empty PY_TAG would match every wheel in the Hub lookup further below.
if ! TAGS_OUT="$(
  python - <<'PY'
import sys

try:
    import torch
    cuda = torch.version.cuda or "0"
except Exception:
    cuda = "0"

py_tag = f"cp{sys.version_info.major}{sys.version_info.minor}"
cu_tag = "cu" + cuda.replace(".", "")
print(py_tag, cu_tag)
PY
)"; then
  echo "[env] Falha ao detectar tags Python/CUDA" >&2
  TAGS_OUT=""
fi
read -r PY_TAG CU_TAG <<<"${TAGS_OUT}" || true
PY_TAG="${PY_TAG:-}"
CU_TAG="${CU_TAG:-cu0}"
echo "[env] Python tag=${PY_TAG} | CUDA tag=${CU_TAG}"
# ===== Checks (return 0/1 without aborting) =====
#######################################
# Reports whether Apex and its compiled fused_layer_norm_cuda extension can
# be imported.
# Outputs: one status line on stdout.
# Returns: 0 when the import succeeds, 1 otherwise.
#######################################
check_apex() {
  # Testing python directly in the `if` keeps a failed import from tripping
  # `set -e` when this function is called outside a conditional.
  if python - <<'PY' >/dev/null 2>&1
from importlib import import_module
from apex.normalization import FusedLayerNorm  # requires the CUDA extension
import_module("fused_layer_norm_cuda")
PY
  then
    echo "[apex] import OK"
    return 0
  fi
  echo "[apex] import falhou (vai tentar Hub/Build)"
  return 1
}
#######################################
# Reports whether the flash_attn module can be imported.
# Outputs: one status line on stdout.
# Returns: 0 when the import succeeds, 1 otherwise.
#######################################
check_flashattn() {
  # Testing python directly in the `if` keeps a failed import from tripping
  # `set -e` when this function is called outside a conditional.
  if python - <<'PY' >/dev/null 2>&1
import flash_attn
PY
  then
    echo "[flash_attn] import OK"
    return 0
  fi
  echo "[flash_attn] import falhou (vai tentar Hub/Build)"
  return 1
}
# ===== Hub download (does not rely on an internal cache between restarts) =====
#######################################
# Looks for a wheel of the given package in the SELF_HF_REPO_ID model repo and
# downloads the best match into /app/wheels.
# Globals:   SELF_HF_REPO_ID, PY_TAG, CU_TAG, HF_TOKEN (all read)
# Arguments: $1 - package name as it appears at the start of the wheel file name
# Outputs:   stdout: repo file name, then local path (nothing when no wheel
#            matches or the repo cannot be listed); progress goes to stderr.
# Returns:   0 on success or "nothing found", 1 when the Python helper fails.
#######################################
install_from_hf() {
  # Callers capture stdout to obtain the wheel path, so the progress line must
  # go to stderr; otherwise the captured output is never empty.
  echo "[hub] Verificando wheel de $1 no repositório ${SELF_HF_REPO_ID}" >&2
  # `return` instead of `exit`: only this function fails, not the calling shell.
  python - "$1" "$PY_TAG" "$CU_TAG" 2>/dev/null <<'PY' || return 1
import os
import sys

from huggingface_hub import HfApi, hf_hub_download

pkg, py_tag, cu_tag = sys.argv[1], sys.argv[2], sys.argv[3]
repo = os.environ.get("SELF_HF_REPO_ID", "XCarleX/Apex-l40s")
# token=None lets HfApi fall back to the locally saved token, so the
# HfFolder helper is not needed here.
api = HfApi(token=os.getenv("HF_TOKEN") or None)
try:
    files = api.list_repo_files(repo_id=repo, repo_type="model")
except Exception:
    # Repo unreachable or missing: treated as "no wheel available".
    raise SystemExit(0)

# Wheels of this package built for the running Python version.
cands = [
    f for f in files
    if f.endswith(".whl")
    and f.rsplit("/", 1)[-1].startswith(pkg + "-")
    and py_tag in f
]
# Prefer wheels whose name carries the CUDA tag; otherwise accept any candidate.
pref = [f for f in cands if cu_tag and cu_tag in f] or cands
if not pref:
    raise SystemExit(0)

# Reverse lexicographic order: the highest-sorting file name wins.
target = sorted(pref, reverse=True)[0]
print(target)
path = hf_hub_download(repo_id=repo, filename=target, repo_type="model", local_dir="/app/wheels")
print(path)
PY
}
# ===== Builders (keep the original flow) =====
#######################################
# Prepares the NVIDIA/apex checkout under /app/wheels/src/apex, builds a wheel
# into /app/wheels and installs it (falling back to a source install when no
# apex wheel is present afterwards).
# Globals:   exports APEX_CPP_EXT, APEX_CUDA_EXT, APEX_ALL_CONTRIB_EXT
# Outputs:   stdout: ONLY the path of the newest /app/wheels/apex-*.whl (an
#            empty line when there is none); all progress goes to stderr so
#            callers can capture the path with $(build_apex).
#######################################
build_apex() {
  local src="/app/wheels/src/apex"
  local wheel=""
  {
    echo "[build] Preparando fonte Apex em ${src}"
    if [ -d "${src}/.git" ]; then
      # Existing checkout: refresh remotes and restore a pristine tree at the
      # currently checked-out commit (fetched commits are not merged).
      git -C "${src}" fetch --all -p || true
      git -C "${src}" reset --hard HEAD || true
      git -C "${src}" clean -fdx || true
    else
      rm -rf "${src:?}"
      git clone --depth 1 https://github.com/NVIDIA/apex "${src}"
    fi
    echo "[build] Compilando Apex -> wheel"
    export APEX_CPP_EXT=1 APEX_CUDA_EXT=1 APEX_ALL_CONTRIB_EXT=0
    # A failed build is reported but not fatal: an apex wheel already present
    # in /app/wheels is still picked up below, as before.
    if ! python -m pip wheel --no-build-isolation --no-deps "${src}" -w /app/wheels -q; then
      echo "[build] pip wheel falhou para Apex"
    fi
    # Newest wheel by mtime; wheel file names contain no whitespace, so
    # reading the first line of `ls -t` is safe here.
    wheel="$(ls -t /app/wheels/apex-*.whl 2>/dev/null | head -n1 || true)"
    if [ -n "${wheel}" ]; then
      python -m pip install -U --no-deps "${wheel}" -q || true
      echo "[build] Apex instalado da wheel recém-compilada: ${wheel}"
    else
      echo "[build] Nenhuma wheel Apex gerada; instalando do source (pode falhar)"
      python -m pip install -q --no-build-isolation "${src}" || true
    fi
  } >&2
  printf '%s\n' "${wheel}"
}
# flash-attn release to build; override through the environment.
FLASH_ATTN_VERSION="${FLASH_ATTN_VERSION:-2.7.4.post1}"

#######################################
# Builds a flash-attn wheel for FLASH_ATTN_VERSION into /app/wheels and
# installs it.
# Globals:   FLASH_ATTN_VERSION (read)
# Outputs:   stdout: ONLY the path of the newest matching flash_attn wheel (an
#            empty line when there is none); all progress goes to stderr so
#            callers can capture the path with $(build_flashattn).
#######################################
build_flashattn() {
  local wheel=""
  {
    echo "[build] Compilando flash-attn==${FLASH_ATTN_VERSION} -> wheel"
    # A failed build is reported but not fatal: a matching wheel already
    # present in /app/wheels is still picked up below, as before.
    if ! python -m pip wheel --no-build-isolation --no-deps --no-binary :all: \
      "flash-attn==${FLASH_ATTN_VERSION}" -w /app/wheels -q; then
      echo "[build] pip wheel falhou para flash-attn"
    fi
    # Newest wheel by mtime; wheel file names contain no whitespace, so
    # reading the first line of `ls -t` is safe here.
    wheel="$(ls -t /app/wheels/flash_attn-"${FLASH_ATTN_VERSION}"-*.whl 2>/dev/null | head -n1 || true)"
    if [ -n "${wheel}" ]; then
      python -m pip install -U --no-deps "${wheel}" -q || true
      echo "[build] flash-attn instalado da wheel recém-compilada: ${wheel}"
    else
      echo "[build] Nenhuma wheel flash-attn gerada; instalação não aplicada"
    fi
  } >&2
  printf '%s\n' "${wheel}"
}
# ===== Pipeline with decision logs; does not stop on failure (retries the build in the background) =====
#######################################
# Makes a package importable, trying in order: already installed, wheel from
# the Hub, local build. When everything fails a background rebuild is started.
# Globals:   SELF_HF_REPO_ID (read)
# Arguments: $1 - package name (used for the Hub lookup and in log/file names)
#            $2 - name of the check function (returns 0 when importable)
#            $3 - name of the build function (prints the wheel path on stdout)
# Outputs:   decision log on stdout
# Returns:   0 when the package is importable, 1 otherwise
#######################################
ensure_pkg() {
  local PKG="$1"
  local CHECK_FN="$2"
  local BUILD_FN="$3"
  local WHEEL_PATH=""
  local HF_OUT=""
  local BUILD_OUT=""

  echo "[flow] === ${PKG} ==="
  if "${CHECK_FN}"; then
    echo "[flow] ${PKG}: já instalado (import OK)"
    return 0
  fi

  echo "[flow] ${PKG}: procurando wheel no Hub (${SELF_HF_REPO_ID})"
  HF_OUT="$(install_from_hf "${PKG}" || true)"
  # The path is the last line of the helper's output. Accept it only when it
  # names an existing .whl file, so stray log lines are never passed to pip.
  WHEEL_PATH="$(printf '%s\n' "${HF_OUT}" | tail -n1)"
  if [[ "${WHEEL_PATH}" == *.whl && -f "${WHEEL_PATH}" ]]; then
    echo "[hub] Baixado: ${WHEEL_PATH}"
    python -m pip install -U --no-deps "${WHEEL_PATH}" -q || true
    if "${CHECK_FN}"; then
      echo "[flow] ${PKG}: sucesso via Hub (${WHEEL_PATH})"
      return 0
    fi
    echo "[flow] ${PKG}: import falhou após instalar wheel do Hub; seguirá para build"
  else
    echo "[hub] Nenhuma wheel compatível encontrada para ${PKG}"
  fi

  echo "[flow] ${PKG}: compilando (fallback)"
  BUILD_OUT="$("${BUILD_FN}" || true)"
  # Builders may log on stdout as well; only the final line is the wheel path.
  WHEEL_PATH="$(printf '%s\n' "${BUILD_OUT}" | tail -n1)"
  if "${CHECK_FN}"; then
    echo "[flow] ${PKG}: sucesso após compilação ${WHEEL_PATH:-'(instalação direta)'}"
    return 0
  fi

  echo "[flow] ${PKG}: falha após tentativa de build; agendando recompilação em segundo plano e seguindo adiante"
  # Run the retry in a background subshell: unlike a fresh `bash -lc`, a
  # subshell still has this script's functions and variables. SIGHUP is
  # ignored so the build survives a hang-up, as it did under nohup.
  ( trap '' HUP; "${BUILD_FN}"; exit 0 ) >"/app/wheels/build_${PKG}.log" 2>&1 </dev/null &
  disown || true
  return 1
}
# ===== Apex and flash-attn =====
# Failures are tolerated here so the app is still launched below.
ensure_pkg "apex" check_apex build_apex || true
ensure_pkg "flash_attn" check_flashattn build_flashattn || true

# ===== Final upload (wheels and license only, optional) =====
if [ "${HF_UPLOAD_WHEELS:-0}" = "1" ]; then
  # The upload is optional: testing it in an `if` keeps a failure (missing
  # token, network error) from aborting the script under `set -e` before the
  # app is started.
  if ! python - <<'PY'
import os

from huggingface_hub import HfApi

try:
    from huggingface_hub import get_token
except ImportError:
    # Releases without a top-level get_token still ship HfFolder.
    from huggingface_hub import HfFolder
    get_token = HfFolder.get_token

repo = os.environ.get("SELF_HF_REPO_ID", "XCarleX/Apex-l40s")
token = os.getenv("HF_TOKEN") or get_token()
if not token:
    raise SystemExit("HF_TOKEN ausente; upload desabilitado")
api = HfApi(token=token)
api.upload_folder(
    folder_path="/app/wheels",
    repo_id=repo,
    repo_type="model",
    allow_patterns=["*.whl", "NGC-DL-CONTAINER-LICENSE"],
    ignore_patterns=["**/src/**", "**/*.log", "**/logs/**", ".git/**"],
)
print("Upload concluído (wheels + licença).")
PY
  then
    echo "⚠️ Upload falhou; seguindo para o app" >&2
  fi
else
  echo "ℹ️ Upload desabilitado (defina HF_UPLOAD_WHEELS=1)"
fi

# Final permissions
# NOTE(review): "/app/whells" looks like a typo for "/app/wheels"; kept as-is
# because the directory is created earlier in this script — confirm.
chmod -R 777 /app/wheels /app/whells || true
chmod +x ./run.sh
./run.sh
|