llama-cpp-agent

Paused

App Files Files Community

pabloce committed on May 20, 2024

Commit

7418606

verified ·

1 Parent(s): 63c66b0

Update app.py

Browse files

Files changed (1) hide show

app.py +54 -35

app.py CHANGED Viewed

@@ -15,7 +15,8 @@ from llama_index.llms.llama_cpp.llama_utils import (
 )
 from llama_index.core.memory import ChatMemoryBuffer
-subprocess.run('pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124', env={'CMAKE_ARGS': "-DLLAMA_CUDA=on"}, shell=True)
 hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF", filename="mistral-7b-instruct-v0.2.Q6_K.gguf",  local_dir = "./models")
@@ -28,43 +29,61 @@ def respond(
     temperature,
     top_p,
 ):
-    stop_tokens = ["</s>", "[INST]", "[INST] ", "<s>", "[/INST]", "[/INST] "]
-    chat_template = '<s>[INST] ' + system_message
-    # for human, assistant in history:
-    #     chat_template += human + ' [/INST] ' + assistant + '</s>[INST]'
-    chat_template += ' ' + message + ' [/INST]'
-    print(chat_template)
-    llm = LlamaCPP(
-        model_path="models/mistral-7b-instruct-v0.2.Q6_K.gguf",
-        temperature=temperature,
-        max_new_tokens=max_tokens,
-        context_window=2048,
-        generate_kwargs={
-            "top_k": 50,
-            "top_p": top_p,
-            "repeat_penalty": 1.3
-        },
-        model_kwargs={
-            "n_threads": 0,
-            "n_gpu_layers": 33
-        },
-        messages_to_prompt=messages_to_prompt,
-        completion_to_prompt=completion_to_prompt,
-        verbose=True,
     )
-    # response = ""
     # for chunk in llm.stream_complete(message):
-    #     print(chunk.delta, end="", flush=True)
-    #     response += str(chunk.delta)
-    #     yield response
-    outputs = []
-    for chunk in llm.stream_complete(message):
-        outputs.append(chunk.delta)
-        if chunk.delta in stop_tokens:
-            break
-        yield "".join(outputs)
 demo = gr.ChatInterface(
     respond,

 )
 from llama_index.core.memory import ChatMemoryBuffer
+subprocess.run('pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124', shell=True)
+subprocess.run('pip install llama-cpp-agent', shell=True)
 hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF", filename="mistral-7b-instruct-v0.2.Q6_K.gguf",  local_dir = "./models")
     temperature,
     top_p,
 ):
+    llama_model = Llama(r"models/mistral-7b-instruct-v0.2.Q6_K.gguf", n_batch=1024, n_threads=0, n_gpu_layers=33, n_ctx=8192, verbose=False)
+    provider = LlamaCppPythonProvider(llama_model)
+    agent = LlamaCppAgent(
+      provider,
+      system_prompt=f"{system_message}",
+      predefined_messages_formatter_type=MessagesFormatterType.MISTRAL,
+      debug_output=True
     )
+    settings = provider.get_provider_default_settings()
+    settings.stream = True
+    settings.max_tokens = max_tokens
+    settings.temperature = temperature
+    settings.top_p = top_p
+    yield agent.get_chat_response(message, llm_sampling_settings=settings, returns_streaming_generator=True)
+    # stop_tokens = ["</s>", "[INST]", "[INST] ", "<s>", "[/INST]", "[/INST] "]
+    # chat_template = '<s>[INST] ' + system_message
+    # # for human, assistant in history:
+    # #     chat_template += human + ' [/INST] ' + assistant + '</s>[INST]'
+    # chat_template += ' ' + message + ' [/INST]'
+    # print(chat_template)
+    # llm = LlamaCPP(
+    #     model_path="models/mistral-7b-instruct-v0.2.Q6_K.gguf",
+    #     temperature=temperature,
+    #     max_new_tokens=max_tokens,
+    #     context_window=2048,
+    #     generate_kwargs={
+    #         "top_k": 50,
+    #         "top_p": top_p,
+    #         "repeat_penalty": 1.3
+    #     },
+    #     model_kwargs={
+    #         "n_threads": 0,
+    #         "n_gpu_layers": 33
+    #     },
+    #     messages_to_prompt=messages_to_prompt,
+    #     completion_to_prompt=completion_to_prompt,
+    #     verbose=True,
+    # )
+    # # response = ""
+    # # for chunk in llm.stream_complete(message):
+    # #     print(chunk.delta, end="", flush=True)
+    # #     response += str(chunk.delta)
+    # #     yield response
+    # outputs = []
     # for chunk in llm.stream_complete(message):
+    #     outputs.append(chunk.delta)
+    #     if chunk.delta in stop_tokens:
+    #         break
+    #     yield "".join(outputs)
 demo = gr.ChatInterface(
     respond,