Commit e01e28e
Parent(s): 0945e5b

Add env vars to set GPU layer count and context size, make verbose
utils.py CHANGED

@@ -19,9 +19,12 @@ from llama_cpp import Llama, LlamaGrammar, json_schema_to_gbnf
 URL = "http://localhost:5834/v1/chat/completions"
 in_memory_llm = None
 
-
+N_GPU_LAYERS = env.get("N_GPU_LAYERS", 10)
+CONTEXT_SIZE = int(env.get("CONTEXT_SIZE", 4096))
 LLM_MODEL_PATH = env.get("LLM_MODEL_PATH", None)
 USE_HTTP_SERVER = env.get("USE_HTTP_SERVER", "false").lower() == "true"
+MAX_TOKENS = int(env.get("MAX_TOKENS", 1000))
+TEMPERATURE = float(env.get("TEMPERATURE", 0.7))
 
 if LLM_MODEL_PATH and len(LLM_MODEL_PATH) > 0:
     print(f"Using local model from {LLM_MODEL_PATH}")
@@ -35,7 +38,7 @@ else:
 
 if in_memory_llm is None and USE_HTTP_SERVER is False:
     print("Loading model into memory. If you didn't want this, set the USE_HTTP_SERVER environment variable to 'true'.")
-    in_memory_llm = Llama(model_path=LLM_MODEL_PATH, n_ctx=
+    in_memory_llm = Llama(model_path=LLM_MODEL_PATH, n_ctx=CONTEXT_SIZE, n_gpu_layers=N_GPU_LAYERS, verbose=True)
 
 def llm_streaming(
     prompt: str, pydantic_model_class, return_pydantic_object=False
@@ -51,9 +54,9 @@ def llm_streaming(
 
     payload = {
         "stream": True,
-        "max_tokens":
+        "max_tokens": MAX_TOKENS,
         "grammar": grammar,
-        "temperature":
+        "temperature": TEMPERATURE,
         "messages": [{"role": "user", "content": prompt}],
     }
     headers = {
@@ -117,8 +120,8 @@ def llm_stream_sans_network(
 
     stream = in_memory_llm(
         prompt,
-        max_tokens=
-        temperature=
+        max_tokens=MAX_TOKENS,
+        temperature=TEMPERATURE,
         grammar=grammar,
         stream=True
     )
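For readers reproducing this change, here is a minimal usage sketch of the new environment variables; the model path and the specific values are illustrative assumptions, not part of the commit. One detail the diff makes visible: CONTEXT_SIZE, MAX_TOKENS and TEMPERATURE are cast with int()/float(), while N_GPU_LAYERS is read without a cast, so a value supplied through the environment reaches Llama() as a string rather than an int.

import os

# Illustrative configuration (all values assumed, not taken from the commit)
os.environ["LLM_MODEL_PATH"] = "/models/model.gguf"   # path to a local GGUF model (assumed)
os.environ["N_GPU_LAYERS"] = "20"        # layers to offload to the GPU; note: not int()-cast in utils.py
os.environ["CONTEXT_SIZE"] = "8192"      # becomes n_ctx for Llama()
os.environ["MAX_TOKENS"] = "512"         # completion cap used by both the HTTP and in-memory paths
os.environ["TEMPERATURE"] = "0.2"        # sampling temperature
os.environ["USE_HTTP_SERVER"] = "false"  # "false" loads the model in-process at import time

import utils  # assuming utils.py is importable as `utils`; importing it constructs Llama(...) with the values above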