# Server
PORT=3000
# Model from Hugging Face (Transformers)
MODEL_REPO_ID=Qwen/Qwen3-VL-2B-Thinking
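# Repo IDs use the org/name form shown on huggingface.co. Other Transformers-
# compatible vision-language models may also work here, but that is not verified.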
# HF token for gated/private models (optional)
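# Tokens start with "hf_" and can be created at
# https://huggingface.co/settings/tokens, e.g. HF_TOKEN=hf_xxxxxxxxxxxx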
HF_TOKEN=
# Inference parameters
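# MAX_TOKENS caps generated tokens per response; TEMPERATURE is the standard
# sampling temperature (lower, e.g. 0.2, is more deterministic; higher, e.g. 1.0,
# is more varied).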
MAX_TOKENS=4096
TEMPERATURE=0.7
# Multimedia processing
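# Higher frame counts capture more of a video but increase prompt size and memory use.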
MAX_VIDEO_FRAMES=16
# Transformers loading hints
DEVICE_MAP=auto
TORCH_DTYPE=auto
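# These correspond to Transformers' from_pretrained arguments device_map and
# torch_dtype. Example overrides (commented out):
# DEVICE_MAP=cpu
# TORCH_DTYPE=bfloat16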
# Quantization config (BitsAndBytes 4-bit); disabled here for CPU deployment.
# Enabling 4-bit quantization reduces VRAM usage (roughly 5 GB -> 1.5 GB).
LOAD_IN_4BIT=0
BNB_4BIT_COMPUTE_DTYPE=float16
BNB_4BIT_USE_DOUBLE_QUANT=1
BNB_4BIT_QUANT_TYPE=nf4
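# These mirror Transformers' BitsAndBytesConfig fields (bnb_4bit_compute_dtype,
# bnb_4bit_use_double_quant, bnb_4bit_quant_type); valid quant types are nf4 and
# fp4. On a CUDA machine, enable with LOAD_IN_4BIT=1.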
# Concurrency config
MAX_WORKERS=4
OCR_TIMEOUT_SECONDS=120
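# MAX_WORKERS presumably sizes the pool serving concurrent requests; OCR jobs
# that exceed OCR_TIMEOUT_SECONDS are assumed to be aborted.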
# Persistent SSE session store (SQLite)
# Enable to persist streaming chunks per session_id and allow resuming after server restarts.
# 1=true, 0=false
PERSIST_SESSIONS=1
SESSIONS_DB_PATH=sessions.db
# TTL for sessions (seconds). Finished sessions older than the TTL are garbage-collected.
SESSIONS_TTL_SECONDS=600
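# With PERSIST_SESSIONS=0, streamed chunks stay in memory only, so sessions
# cannot be resumed after a restart.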
# Auto compression and context reporting
# Enable automatic prompt compression when the context window would overflow. The oldest non-system messages are dropped first.
ENABLE_AUTO_COMPRESSION=1
# Force a max context window for budgeting; 0 = use model/tokenizer defaults
CONTEXT_MAX_TOKENS_AUTO=0
# Safety margin kept free for generation and special tokens
CONTEXT_SAFETY_MARGIN=256
# Compression strategy: truncate (default); "summarize" is reserved for future use.
COMPRESSION_STRATEGY=truncate
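# Rough budgeting sketch (assuming prompt budget = context window - CONTEXT_SAFETY_MARGIN):
# with a 32768-token window and the default margin of 256, about 32512 tokens
# remain for the prompt; the oldest non-system messages are dropped until it fits.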