Update chat_demo.py
chat_demo.py +14 -5

chat_demo.py CHANGED
@@ -7,8 +7,15 @@ import tempfile
 import subprocess
 import threading
 
-
-
+MAIN_PORT = 5100
+BASE_URL = f"http://localhost:{MAIN_PORT}/v1"
+#MODEL_NAME = "placeholder-model-id"
+MODEL_LIST = [
+    ("Ernie-4.5-0.3B - Good generalist and small", "Ernie-4.5-0.3B"),
+    ("LFM2-VL-450M - Stronger RLHF? Weaker in STEM", "LFM2-VL-450M"),
+    ("gemma-3-270m-it - Deliberately Raw, need strong system prompt and steering if want assistant behavior", "gemma-3-270m-it"),
+    ("Qwen3-0.6B - hybrid thinking /no_think, can do very limited STEM?", "Qwen3-0.6B")
+]
 
 def read_output(process):
     """Reads the output from the subprocess and prints it to the console."""
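A note on the new MODEL_LIST: Gradio's gr.Dropdown accepts (label, value) tuples, so the descriptive text is shown in the UI while only the short model id is handed to the callback. A minimal standalone sketch of that behavior (names here are illustrative, not from this repo):

```python
import gradio as gr

MODEL_LIST = [
    ("Ernie-4.5-0.3B - Good generalist and small", "Ernie-4.5-0.3B"),
    ("Qwen3-0.6B - hybrid thinking /no_think", "Qwen3-0.6B"),
]

def show_choice(model_chosen):
    # Receives the tuple's second element, e.g. "Qwen3-0.6B",
    # not the descriptive label shown in the dropdown.
    return f"Selected model id: {model_chosen}"

with gr.Blocks() as demo:
    dd = gr.Dropdown(choices=MODEL_LIST, label="LLM Model")
    out = gr.Textbox(label="Result")
    dd.change(show_choice, inputs=dd, outputs=out)

demo.launch()
```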
@@ -33,12 +40,13 @@ def start_server(command):
 
     return process
 
-server_process = start_server(["./llama.cpp/build/bin/llama-server", "-m" ,"./llama.cpp/build/ERNIE-4.5-0.3B-PT-UD-Q8_K_XL.gguf", "-c", "32000", "--jinja", "--no-mmap", "--port", "5100", "--threads", "2"])
+#server_process = start_server(["./llama.cpp/build/bin/llama-server", "-m" ,"./llama.cpp/build/ERNIE-4.5-0.3B-PT-UD-Q8_K_XL.gguf", "-c", "32000", "--jinja", "--no-mmap", "--port", "5100", "--threads", "2"])
+server_process = start_server(["./llamaswap/llama-swap", "--listen", f"localhost:{MAIN_PORT}", "--config", "./config.yaml"])
 
 
 cli = OpenAI(api_key="sk-nokey", base_url=BASE_URL)
 
-def openai_call(message, history, system_prompt, max_new_tokens):
+def openai_call(message, history, model_chosen, system_prompt, max_new_tokens):
     #print(history) # DEBUG
     history.insert(0, {
         "role": "system",
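The switch from launching llama-server directly to launching llama-swap means the app now talks to a single OpenAI-compatible endpoint on MAIN_PORT, and llama-swap starts or swaps backend models on demand; the ids in MODEL_LIST must match entries in ./config.yaml. That file is not part of this commit, so here is only a hedged sketch of what it might contain, written as Python for illustration (the cmd/proxy layout and ${PORT} macro follow my reading of llama-swap's docs, and the .gguf paths are placeholders; verify against the llama-swap version in use):

```python
import yaml  # pip install pyyaml

# Hypothetical ./config.yaml contents for llama-swap: each key under
# "models" must match a value tuple in MODEL_LIST above.
config = {
    "models": {
        "Ernie-4.5-0.3B": {
            "cmd": "./llama.cpp/build/bin/llama-server --port ${PORT} "
                   "-m ./models/ERNIE-4.5-0.3B-PT-UD-Q8_K_XL.gguf --jinja",
            "proxy": "http://127.0.0.1:${PORT}",
        },
        "Qwen3-0.6B": {
            "cmd": "./llama.cpp/build/bin/llama-server --port ${PORT} "
                   "-m ./models/Qwen3-0.6B-Q8_0.gguf --jinja",
            "proxy": "http://127.0.0.1:${PORT}",
        },
    },
}

with open("config.yaml", "w") as f:
    yaml.safe_dump(config, f, sort_keys=False)
```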
@@ -49,7 +57,7 @@ def openai_call(message, history, system_prompt, max_new_tokens):
         "content": message
     })
     response = cli.chat.completions.create(
-        model=
+        model=model_chosen,
         messages=history,
         max_tokens=max_new_tokens,
         #stop=["<|im_end|>", "</s>"],
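With llama-swap in front, the model field is no longer a dead placeholder: llama-swap reads it from each /v1/chat/completions request and routes to (or spins up) the matching backend before proxying the call. A quick way to sanity-check the wiring outside Gradio, assuming the server from this diff is running on port 5100:

```python
from openai import OpenAI

cli = OpenAI(api_key="sk-nokey", base_url="http://localhost:5100/v1")

# llama-swap should load the backend registered under this id
# in config.yaml before answering.
resp = cli.chat.completions.create(
    model="Qwen3-0.6B",
    messages=[{"role": "user", "content": "Say hi in five words."}],
    max_tokens=32,
)
print(resp.choices[0].message.content)
```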
@@ -95,6 +103,7 @@ with gr.Blocks() as demo:
     orig_path = gr.State()
     chatbot = gr.Chatbot(placeholder="Have fun with the AI!", editable='all', show_copy_button=True)
     additional_inputs=[
+        gr.Dropdown(choices=MODEL_LIST, label="LLM Model"),
         gr.Textbox("You are a helpful AI assistant.", label="System Prompt"),
         gr.Slider(30, 8192, value=2048, label="Max new tokens"),
     ]
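The Dropdown is prepended to additional_inputs, which is why model_chosen lands right after history in the new openai_call signature: assuming this list is later passed to gr.ChatInterface (the wiring is outside these hunks), Gradio appends the additional inputs' values to the callback positionally, in list order. A sketch of that assumption:

```python
import gradio as gr

def openai_call(message, history, model_chosen, system_prompt, max_new_tokens):
    # Positional order after (message, history) must mirror additional_inputs:
    # Dropdown -> model_chosen, Textbox -> system_prompt, Slider -> max_new_tokens.
    return f"[{model_chosen}] system={system_prompt!r}, max={max_new_tokens}"

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(placeholder="Have fun with the AI!")
    gr.ChatInterface(
        fn=openai_call,
        chatbot=chatbot,
        additional_inputs=[
            gr.Dropdown(choices=[("Qwen3-0.6B - tiny hybrid", "Qwen3-0.6B")],
                        label="LLM Model"),
            gr.Textbox("You are a helpful AI assistant.", label="System Prompt"),
            gr.Slider(30, 8192, value=2048, label="Max new tokens"),
        ],
    )

demo.launch()
```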