# Environment configuration (dotenv format) for the model server.
# Server
PORT=3000
# Model from Hugging Face (Transformers)
MODEL_REPO_ID=Qwen/Qwen3-VL-2B-Thinking
# HF token for gated/private models (optional)
HF_TOKEN=
# Inference parameters
MAX_TOKENS=4096
TEMPERATURE=0.7
# Multimedia processing
MAX_VIDEO_FRAMES=16
# Transformers loading hints
DEVICE_MAP=auto
TORCH_DTYPE=auto
# Quantization config (BitsAndBytes 4-bit) - Disabled for CPU deployment
# Enable 4-bit quantization to reduce VRAM usage (~5GB -> ~1.5GB)
LOAD_IN_4BIT=0
BNB_4BIT_COMPUTE_DTYPE=float16
BNB_4BIT_USE_DOUBLE_QUANT=1
BNB_4BIT_QUANT_TYPE=nf4
# Concurrency config
MAX_WORKERS=4
OCR_TIMEOUT_SECONDS=120
# Persistent SSE session store (SQLite)
# Enable to persist streaming chunks per session_id and allow resume after server restarts.
# 1=true, 0=false
PERSIST_SESSIONS=1
SESSIONS_DB_PATH=sessions.db
# TTL for sessions (seconds). Finished sessions older than TTL are garbage collected.
SESSIONS_TTL_SECONDS=600
# Auto compression and context reporting
# Enable automatic prompt compression if context would overflow. Drops oldest non-system messages.
ENABLE_AUTO_COMPRESSION=1
# Force a max context window for budgeting; 0 = use model/tokenizer defaults
CONTEXT_MAX_TOKENS_AUTO=0
# Safety margin kept free for generation and special tokens
CONTEXT_SAFETY_MARGIN=256
# Compression strategy: truncate (default). summarize reserved for future use.
COMPRESSION_STRATEGY=truncate