# Environment configuration (dotenv format) for the model server.
# Server
PORT=3000
# Model from Hugging Face (Transformers)
MODEL_REPO_ID=Qwen/Qwen3-VL-2B-Thinking
# HF token for gated/private models (optional)
HF_TOKEN=
# Inference parameters
MAX_TOKENS=4096
TEMPERATURE=0.7
# Multimedia processing
MAX_VIDEO_FRAMES=16
# Transformers loading hints
DEVICE_MAP=auto
TORCH_DTYPE=auto
# Quantization config (BitsAndBytes 4-bit) - Disabled for CPU deployment
# Enable 4-bit quantization to reduce VRAM usage (~5GB -> ~1.5GB)
LOAD_IN_4BIT=0
BNB_4BIT_COMPUTE_DTYPE=float16
BNB_4BIT_USE_DOUBLE_QUANT=1
BNB_4BIT_QUANT_TYPE=nf4
# Concurrency config
MAX_WORKERS=4
OCR_TIMEOUT_SECONDS=120
# Persistent SSE session store (SQLite)
# Enable to persist streaming chunks per session_id and allow resume after server restarts.
# 1=true, 0=false
PERSIST_SESSIONS=1
SESSIONS_DB_PATH=sessions.db
# TTL for sessions (seconds). Finished sessions older than TTL are garbage collected.
SESSIONS_TTL_SECONDS=600
# Auto compression and context reporting
# Enable automatic prompt compression if context would overflow. Drops oldest non-system messages.
ENABLE_AUTO_COMPRESSION=1
# Force a max context window for budgeting; 0 = use model/tokenizer defaults
CONTEXT_MAX_TOKENS_AUTO=0
# Safety margin kept free for generation and special tokens
CONTEXT_SAFETY_MARGIN=256
# Compression strategy: truncate (default). summarize reserved for future use.
COMPRESSION_STRATEGY=truncate