Luke Stanley committed
Commit: 469f650
Parent(s): ce5ad5f
Avoid unneeded imports, make serverless output more sensible, removing some debugging and comments
Files changed:
- app.py +3 -2
- runpod_handler.py +1 -1
- utils.py +12 -10
app.py
CHANGED
```diff
@@ -36,8 +36,9 @@ if LLM_WORKER == "http" or LLM_WORKER == "in_memory":
     from chill import improvement_loop
 
 
-def
+def chill_out(text):
+    print("Got this input:", text)
     return str(improvement_loop(text))
 
-demo = gr.Interface(fn=
+demo = gr.Interface(fn=chill_out, inputs="text", outputs="text")
 demo.launch(max_threads=1, share=True)
```
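With this change the Gradio callback has a real name, `chill_out`, and logs its input before delegating to `improvement_loop`. As a rough sketch of how the launched interface could be exercised from another machine (the share URL is a placeholder, and `/predict` is Gradio's default endpoint name for a single `gr.Interface`; neither comes from this commit):

```python
# Hypothetical client-side call against the running demo; the URL below is a
# placeholder for whatever demo.launch(share=True) prints, not a real endpoint.
from gradio_client import Client

client = Client("https://xxxxxxxxxxxx.gradio.live/")        # placeholder share URL
result = client.predict(
    "Please rewrite this post so it reads more calmly.",    # becomes `text` in chill_out(text)
    api_name="/predict",                                    # default route for a single gr.Interface
)
print(result)  # the server returns str(improvement_loop(text))
```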
runpod_handler.py
CHANGED
```diff
@@ -33,7 +33,7 @@ def handler(job):
     print("schema", schema )
     output = llm_stream_sans_network_simple(prompt, schema)
     #print("got this output", str(output))
-    return
+    return output
 
 runpod.serverless.start({
     "handler": handler,
```
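The substantive fix here is the bare `return`, which made the worker always hand RunPod a `None` result; with `return output`, whatever `llm_stream_sans_network_simple` produced is passed back to the caller. A minimal local smoke test might look like the sketch below, assuming the `prompt` and `schema` keys that the rest of the handler (not shown in this hunk) reads from `job['input']`, and using the RunPod SDK's `--test_input` flag for a one-off run:

```python
# Hypothetical smoke test; the payload keys are assumptions, since this hunk
# only shows the lines around the return statement.
import json
import subprocess

test_job = {
    "input": {
        "prompt": "Rewrite this post so it reads calmly.",
        "schema": json.dumps({"type": "object", "properties": {"text": {"type": "string"}}}),
    }
}

# runpod.serverless.start() accepts a --test_input argument for single-shot local runs,
# so the handler's return value (now the model output rather than None) is printed directly.
subprocess.run(
    ["python", "runpod_handler.py", "--test_input", json.dumps(test_job)],
    check=True,
)
```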
utils.py
CHANGED
```diff
@@ -1,11 +1,9 @@
 import json
 from os import environ as env
 from typing import Any, Dict, Union
-# TODO: Make imports conditional on type of worker being used:
-import requests
 
+import requests
 from huggingface_hub import hf_hub_download
-from llama_cpp import Llama, LlamaGrammar, json_schema_to_gbnf
 
 
 # There are 3 ways to use the LLM model currently used:
@@ -18,7 +16,7 @@ from llama_cpp import Llama, LlamaGrammar, json_schema_to_gbnf
 # The real OpenAI API has other ways to set the output format.
 # It's possible to switch to another LLM API by changing the llm_streaming function.
 # 3. Use the RunPod API, which is a paid service with severless GPU functions.
-#
+# See serverless.md for more information.
 
 URL = "http://localhost:5834/v1/chat/completions"
 in_memory_llm = None
@@ -34,15 +32,19 @@ LLM_MODEL_PATH = env.get("LLM_MODEL_PATH", None)
 MAX_TOKENS = int(env.get("MAX_TOKENS", 1000))
 TEMPERATURE = float(env.get("TEMPERATURE", 0.3))
 
-
+performing_local_inference = (LLM_WORKER == "in_memory" or LLM_WORKER == "http")
+
+if LLM_MODEL_PATH and len(LLM_MODEL_PATH) > 0:
     print(f"Using local model from {LLM_MODEL_PATH}")
-
+if performing_local_inference and not LLM_MODEL_PATH:
     print("No local LLM_MODEL_PATH environment variable set. We need a model, downloading model from HuggingFace Hub")
     LLM_MODEL_PATH =hf_hub_download(
         repo_id=env.get("REPO_ID", "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"),
         filename=env.get("MODEL_FILE", "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"),
     )
     print(f"Model downloaded to {LLM_MODEL_PATH}")
+if LLM_WORKER == "http" or LLM_WORKER == "in_memory":
+    from llama_cpp import Llama, LlamaGrammar, json_schema_to_gbnf
 
 if in_memory_llm is None and LLM_WORKER == "in_memory":
     print("Loading model into memory. If you didn't want this, set the USE_HTTP_SERVER environment variable to 'true'.")
@@ -152,8 +154,8 @@ def llm_stream_sans_network(
 
 # Function to call the RunPod API with a Pydantic model and movie name
 def llm_stream_serverless(prompt,model):
-    RUNPOD_ENDPOINT_ID = env("RUNPOD_ENDPOINT_ID")
-    RUNPOD_API_KEY = env("RUNPOD_API_KEY")
+    RUNPOD_ENDPOINT_ID = env.get("RUNPOD_ENDPOINT_ID")
+    RUNPOD_API_KEY = env.get("RUNPOD_API_KEY")
     url = f"https://api.runpod.ai/v2/{RUNPOD_ENDPOINT_ID}/runsync"
 
     headers = {
@@ -171,8 +173,8 @@ def llm_stream_serverless(prompt,model):
 
     response = requests.post(url, json=data, headers=headers)
     result = response.json()
-
-
+    print(result)
+    output = result['output']
     return json.loads(output)
 
 def query_ai_prompt(prompt, replacements, model_class):
```
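Taken together, the utils.py changes make `llama_cpp` an optional dependency: it is imported, and the GGUF model downloaded, only when `LLM_WORKER` selects local inference. A sketch of the three configurations described in the comments at the top of the file (only the `"http"` and `"in_memory"` selector values appear in this diff; the credential values below are placeholders):

```python
# Illustrative configuration for each worker mode; values are placeholders.
import os

# 1. In-memory llama-cpp inference (downloads the GGUF file if LLM_MODEL_PATH is unset):
os.environ["LLM_WORKER"] = "in_memory"
os.environ["LLM_MODEL_PATH"] = "/models/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"

# 2. A local llama-cpp server behind the OpenAI-style endpoint at localhost:5834:
os.environ["LLM_WORKER"] = "http"

# 3. RunPod serverless, which needs neither llama_cpp nor a local model file:
os.environ["RUNPOD_ENDPOINT_ID"] = "your-endpoint-id"
os.environ["RUNPOD_API_KEY"] = "your-api-key"
```

The serverless path also changes how the `/runsync` response is unpacked: `llm_stream_serverless` now prints the raw response and reads the handler's return value from its `output` field before JSON-decoding it. A minimal sketch of that parsing, with a made-up response body:

```python
import json

# Hypothetical /runsync response; RunPod places the handler's return value under "output".
result = {
    "id": "sync-abc123",
    "status": "COMPLETED",
    "output": '{"improved_text": "A calmer version of the input."}',  # JSON string from handler()
}

print(result)                 # mirrors the new print in llm_stream_serverless
output = result["output"]     # extract the handler's return value
parsed = json.loads(output)   # what llm_stream_serverless ultimately returns
print(parsed["improved_text"])
```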