# Server
PORT=3000
# Model from Hugging Face (Transformers)
MODEL_REPO_ID=Qwen/Qwen3-VL-2B-Thinking
# HF token for gated/private models (optional)
HF_TOKEN=
# Inference parameters
MAX_TOKENS=4096
TEMPERATURE=0.7
# Multimedia processing
MAX_VIDEO_FRAMES=16
# Transformers loading hints
DEVICE_MAP=auto
TORCH_DTYPE=auto
# Quantization config (BitsAndBytes 4-bit) - Disabled for CPU deployment
# Set LOAD_IN_4BIT=1 to enable 4-bit quantization and reduce VRAM usage (~5GB -> ~1.5GB)
LOAD_IN_4BIT=0
BNB_4BIT_COMPUTE_DTYPE=float16
BNB_4BIT_USE_DOUBLE_QUANT=1
BNB_4BIT_QUANT_TYPE=nf4
# Concurrency config
MAX_WORKERS=4
OCR_TIMEOUT_SECONDS=120
# Persistent SSE session store (SQLite)
# Enable to persist streaming chunks per session_id and allow resume after server restarts.
# 1=true, 0=false
PERSIST_SESSIONS=1
SESSIONS_DB_PATH=sessions.db
# TTL for sessions (seconds). Finished sessions older than the TTL are garbage collected.
SESSIONS_TTL_SECONDS=600
# Auto compression and context reporting
# Enable automatic prompt compression if the context would overflow. Drops the oldest non-system messages.
ENABLE_AUTO_COMPRESSION=1
# Force a max context window for budgeting; 0 = use model/tokenizer defaults
CONTEXT_MAX_TOKENS_AUTO=0
# Safety margin (tokens) kept free for generation and special tokens
CONTEXT_SAFETY_MARGIN=256
# Compression strategy: truncate (default). "summarize" is reserved for future use.
COMPRESSION_STRATEGY=truncate