Luke Stanley committed
Commit 56e785c · 1 Parent(s): 434144a
Introduces worker mode env var

utils.py CHANGED
@@ -1,12 +1,14 @@
 import json
 from os import environ as env
 from typing import Any, Dict, Union
+# TODO: Make imports conditional on type of worker being used:
 import requests
 
 from huggingface_hub import hf_hub_download
 from llama_cpp import Llama, LlamaGrammar, json_schema_to_gbnf
 
-
+
+# There are currently 3 ways to use the LLM model:
 # 1. Use the HTTP server (USE_HTTP_SERVER=True), this is good for development
 # when you want to change the logic of the translator without restarting the server.
 # 2. Load the model into memory
@@ -15,18 +17,24 @@ from llama_cpp import Llama, LlamaGrammar, json_schema_to_gbnf
 # to the OpenAI API but adds a unique "grammar" parameter.
 # The real OpenAI API has other ways to set the output format.
 # It's possible to switch to another LLM API by changing the llm_streaming function.
+# 3. Use the RunPod API, which is a paid service with serverless GPU functions.
+# TODO: Update README with instructions on how to use the RunPod API and options.
 
 URL = "http://localhost:5834/v1/chat/completions"
 in_memory_llm = None
+worker_options = ["runpod", "http", "in_memory"]
 
-
-
+LLM_WORKER = env.get("LLM_WORKER", "runpod")
+if LLM_WORKER not in worker_options:
+    raise ValueError(f"Invalid worker: {LLM_WORKER}")
+N_GPU_LAYERS = int(env.get("N_GPU_LAYERS", -1))  # Default to -1, use all layers if available
+CONTEXT_SIZE = int(env.get("CONTEXT_SIZE", 2048))
 LLM_MODEL_PATH = env.get("LLM_MODEL_PATH", None)
-
+
 MAX_TOKENS = int(env.get("MAX_TOKENS", 1000))
 TEMPERATURE = float(env.get("TEMPERATURE", 0.3))
 
-if LLM_MODEL_PATH and len(LLM_MODEL_PATH) > 0:
+if LLM_MODEL_PATH and len(LLM_MODEL_PATH) > 0 and (LLM_WORKER == "in_memory" or LLM_WORKER == "http"):
     print(f"Using local model from {LLM_MODEL_PATH}")
 else:
     print("No local LLM_MODEL_PATH environment variable set. We need a model, downloading model from HuggingFace Hub")
@@ -36,7 +44,7 @@
     )
     print(f"Model downloaded to {LLM_MODEL_PATH}")
 
-if in_memory_llm is None and
+if in_memory_llm is None and LLM_WORKER == "in_memory":
     print("Loading model into memory. If you didn't want this, set the USE_HTTP_SERVER environment variable to 'true'.")
     in_memory_llm = Llama(model_path=LLM_MODEL_PATH, n_ctx=CONTEXT_SIZE, n_gpu_layers=N_GPU_LAYERS, verbose=True)
@@ -141,33 +149,37 @@ def llm_stream_sans_network(
     json_output = json.loads(output_text)
     return json_output
 
-def query_ai_prompt(prompt, replacements, model_class, in_memory=True):
-    prompt = replace_text(prompt, replacements)
-    if in_memory:
-        return llm_stream_sans_network(prompt, model_class)
-    else:
-        return llm_streaming(prompt, model_class)
-
 
-
-
-    )
-
+# Function to call the RunPod API with a Pydantic model schema and a prompt
+def llm_stream_serverless(prompt, model):
+    RUNPOD_ENDPOINT_ID = env.get("RUNPOD_ENDPOINT_ID")
+    RUNPOD_API_KEY = env.get("RUNPOD_API_KEY")
+    url = f"https://api.runpod.ai/v2/{RUNPOD_ENDPOINT_ID}/runsync"
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    headers = {
+        'Content-Type': 'application/json',
+        'Authorization': f'Bearer {RUNPOD_API_KEY}'
+    }
+
+    schema = model.schema()
+    data = {
+        'input': {
+            'schema': json.dumps(schema),
+            'prompt': prompt
+        }
+    }
+
+    response = requests.post(url, json=data, headers=headers)
+    result = response.json()
+    output = result.get('output', '').replace("model:mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf\n", "")
+    print(output)
+    return json.loads(output)
+
+def query_ai_prompt(prompt, replacements, model_class):
+    prompt = replace_text(prompt, replacements)
+    if LLM_WORKER == "runpod":
+        return llm_stream_serverless(prompt, model_class)
+    if LLM_WORKER == "http":
+        return llm_streaming(prompt, model_class)
+    if LLM_WORKER == "in_memory":
+        return llm_stream_sans_network(prompt, model_class)