# Server
PORT=3000

# Model from Hugging Face (Transformers)
MODEL_REPO_ID=Qwen/Qwen3-VL-2B-Thinking

# HF token for gated/private models (optional)
HF_TOKEN=

# Inference parameters
MAX_TOKENS=4096
TEMPERATURE=0.7

# Multimedia processing
MAX_VIDEO_FRAMES=16

# Transformers loading hints
DEVICE_MAP=auto
TORCH_DTYPE=auto

# Quantization config (BitsAndBytes 4-bit) - Disabled for CPU deployment
# Enable 4-bit quantization to reduce VRAM usage (~5GB -> ~1.5GB)
LOAD_IN_4BIT=0
BNB_4BIT_COMPUTE_DTYPE=float16
BNB_4BIT_USE_DOUBLE_QUANT=1
BNB_4BIT_QUANT_TYPE=nf4

# Concurrency config
MAX_WORKERS=4
OCR_TIMEOUT_SECONDS=120

# Persistent SSE session store (SQLite)
# Enable to persist streaming chunks per session_id and allow resume after server restarts.
# 1=true, 0=false
PERSIST_SESSIONS=1
SESSIONS_DB_PATH=sessions.db

# TTL for sessions (seconds). Finished sessions older than TTL are garbage collected.
SESSIONS_TTL_SECONDS=600

# Auto compression and context reporting
# Enable automatic prompt compression if context would overflow. Drops oldest non-system messages.
ENABLE_AUTO_COMPRESSION=1

# Force a max context window for budgeting; 0 = use model/tokenizer defaults
CONTEXT_MAX_TOKENS_AUTO=0

# Safety margin kept free for generation and special tokens
CONTEXT_SAFETY_MARGIN=256

# Compression strategy: truncate (default). summarize reserved for future use.
COMPRESSION_STRATEGY=truncate