added support for HF TGI
Browse files
- .env.example +1 -0
- .gitignore +2 -0
- app_modules/llm_loader.py +14 -0
- requirements.txt +2 -1
- tgi.sh +9 -0
.env.example
CHANGED
|
@@ -6,6 +6,7 @@ LLM_MODEL_TYPE=huggingface
|
|
| 6 |
# LLM_MODEL_TYPE=mosaicml
|
| 7 |
# LLM_MODEL_TYPE=stablelm
|
| 8 |
# LLM_MODEL_TYPE=openllm
|
|
|
|
| 9 |
|
| 10 |
OPENLLM_SERVER_URL=http://localhost:64300
|
| 11 |
|
|
|
|
| 6 |
# LLM_MODEL_TYPE=mosaicml
|
| 7 |
# LLM_MODEL_TYPE=stablelm
|
| 8 |
# LLM_MODEL_TYPE=openllm
|
| 9 |
+
# LLM_MODEL_TYPE=hftgi
|
| 10 |
|
| 11 |
OPENLLM_SERVER_URL=http://localhost:64300
|
| 12 |
|
.gitignore
CHANGED
|
@@ -1,5 +1,7 @@
|
|
| 1 |
pdfs
|
| 2 |
.vscode/
|
|
|
|
|
|
|
| 3 |
|
| 4 |
# Byte-compiled / optimized / DLL files
|
| 5 |
__pycache__/
|
|
|
|
| 1 |
pdfs
|
| 2 |
.vscode/
|
| 3 |
+
data/version.txt
|
| 4 |
+
data/models*
|
| 5 |
|
| 6 |
# Byte-compiled / optimized / DLL files
|
| 7 |
__pycache__/
|
app_modules/llm_loader.py
CHANGED
|
@@ -5,6 +5,7 @@ from queue import Queue
|
|
| 5 |
from typing import Any, Optional
|
| 6 |
|
| 7 |
import torch
|
|
|
|
| 8 |
from langchain.callbacks.base import BaseCallbackHandler
|
| 9 |
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
|
| 10 |
from langchain.chat_models import ChatOpenAI
|
|
@@ -188,6 +189,19 @@ class LLMLoader:
|
|
| 188 |
verbose=True,
|
| 189 |
use_mlock=True,
|
| 190 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
elif self.llm_model_type.startswith("huggingface"):
|
| 192 |
MODEL_NAME_OR_PATH = os.environ.get("HUGGINGFACE_MODEL_NAME_OR_PATH")
|
| 193 |
print(f" loading model: {MODEL_NAME_OR_PATH}")
|
|
|
|
| 5 |
from typing import Any, Optional
|
| 6 |
|
| 7 |
import torch
|
| 8 |
+
from langchain import HuggingFaceTextGenInference
|
| 9 |
from langchain.callbacks.base import BaseCallbackHandler
|
| 10 |
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
|
| 11 |
from langchain.chat_models import ChatOpenAI
|
|
|
|
| 189 |
verbose=True,
|
| 190 |
use_mlock=True,
|
| 191 |
)
|
| 192 |
+
elif self.llm_model_type == "hftgi":
|
| 193 |
+
HFTGI_SERVER_URL = os.environ.get("HFTGI_SERVER_URL")
|
| 194 |
+
self.llm = HuggingFaceTextGenInference(
|
| 195 |
+
inference_server_url=HFTGI_SERVER_URL,
|
| 196 |
+
# Floor-divide so an integer limit yields an integer token count
# ("/" always produces a float). Assumes self.max_tokens_limit is an int.
max_new_tokens=self.max_tokens_limit // 2,
|
| 197 |
+
top_k=10,
|
| 198 |
+
top_p=0.95,
|
| 199 |
+
typical_p=0.95,
|
| 200 |
+
temperature=0.01,
|
| 201 |
+
repetition_penalty=1.03,
|
| 202 |
+
callbacks=callbacks,
|
| 203 |
+
streaming=True,
|
| 204 |
+
)
|
| 205 |
elif self.llm_model_type.startswith("huggingface"):
|
| 206 |
MODEL_NAME_OR_PATH = os.environ.get("HUGGINGFACE_MODEL_NAME_OR_PATH")
|
| 207 |
print(f" loading model: {MODEL_NAME_OR_PATH}")
|
requirements.txt
CHANGED
|
@@ -34,4 +34,5 @@ pypdf
|
|
| 34 |
python-telegram-bot
|
| 35 |
transformers_stream_generator
|
| 36 |
openllm
|
| 37 |
-
openllm[llama]
|
|
|
|
|
|
| 34 |
python-telegram-bot
|
| 35 |
transformers_stream_generator
|
| 36 |
openllm
|
| 37 |
+
openllm[llama]
|
| 38 |
+
text_generation
|
tgi.sh
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/sh
|
| 2 |
+
|
| 3 |
+
export HUGGINGFACE_MODEL_NAME_OR_PATH="meta-llama/Llama-2-7b-chat-hf"
|
| 4 |
+
|
| 5 |
+
echo Running $HUGGINGFACE_MODEL_NAME_OR_PATH with TGI
|
| 6 |
+
|
| 7 |
+
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
|
| 8 |
+
|
| 9 |
+
# Quote every expansion so a token, path, or model id containing spaces or glob
# characters reaches docker as a single argument (e.g. when $PWD has spaces).
docker run -e HUGGING_FACE_HUB_TOKEN="$HUGGINGFACE_AUTH_TOKEN" --shm-size 1g -p 8081:80 -v "$volume:/data" ghcr.io/huggingface/text-generation-inference:1.0.0 --model-id "$HUGGINGFACE_MODEL_NAME_OR_PATH"
|