Update run_vllm.sh (#20)
Browse files
- Update run_vllm.sh (8c6ccbc03601c74bfae5594c3d130a36c0a2b630)
Co-authored-by: Fred Reiss <frreiss@users.noreply.huggingface.co>
- run_vllm.sh +7 -6
run_vllm.sh
CHANGED
|
@@ -5,16 +5,17 @@
|
|
| 5 |
# available LoRA adapters in this repository.
|
| 6 |
#
|
| 7 |
# To run this script:
|
| 8 |
-
# 1. Install an appropriate build of vLLM for your machine
|
| 9 |
-
# 2. Install the Hugging Face CLI (`
|
| 10 |
# 3. Download the intrinsics library by running:
|
| 11 |
-
# hf download ibm-granite/intrinsics-lib --local-dir ./intrinsics-lib
|
| 12 |
-
# 4. Edit the constants BASE_MODEL_NAME and BASE_MODEL_ORG as needed
|
| 13 |
-
# 5. Run this script from the root of your local copy of intrinsics-lib.
|
| 14 |
################################################################################
|
| 15 |
|
| 16 |
BASE_MODEL_NAME=granite-3.3-8b-instruct
|
| 17 |
BASE_MODEL_ORG=ibm-granite
|
|
|
|
| 18 |
|
| 19 |
export VLLM_API_KEY=rag_intrinsics_1234
|
| 20 |
|
|
@@ -30,7 +31,7 @@ done
|
|
| 30 |
|
| 31 |
|
| 32 |
CMD="vllm serve ${BASE_MODEL_ORG}/${BASE_MODEL_NAME} \
|
| 33 |
-
--port 55555 \
|
| 34 |
--gpu-memory-utilization 0.45 \
|
| 35 |
--max-model-len 8192 \
|
| 36 |
--enable-lora \
|
|
|
|
| 5 |
# available LoRA adapters in this repository.
|
| 6 |
#
|
| 7 |
# To run this script:
|
| 8 |
+
# 1. Install an appropriate build of vLLM for your machine (`pip install vllm`)
|
| 9 |
+
# 2. Install the Hugging Face CLI (`pip install -U "huggingface_hub[cli]"`)
|
| 10 |
# 3. Download the intrinsics library by running:
|
| 11 |
+
# hf download ibm-granite/rag-intrinsics-lib --local-dir ./rag-intrinsics-lib
|
| 12 |
+
# 4. Edit the constants BASE_MODEL_NAME, BASE_MODEL_ORG, and PORT as needed
|
| 13 |
+
# 5. Run this script from the root of your local copy of rag-intrinsics-lib.
|
| 14 |
################################################################################
|
| 15 |
|
| 16 |
BASE_MODEL_NAME=granite-3.3-8b-instruct
|
| 17 |
BASE_MODEL_ORG=ibm-granite
|
| 18 |
+
PORT=55555
|
| 19 |
|
| 20 |
export VLLM_API_KEY=rag_intrinsics_1234
|
| 21 |
|
|
|
|
| 31 |
|
| 32 |
|
| 33 |
CMD="vllm serve ${BASE_MODEL_ORG}/${BASE_MODEL_NAME} \
|
| 34 |
+
--port ${PORT} \
|
| 35 |
--gpu-memory-utilization 0.45 \
|
| 36 |
--max-model-len 8192 \
|
| 37 |
--enable-lora \
|