rewrite
app.py CHANGED
@@ -1,21 +1,50 @@
 import gradio as gr
-import
-… (old app.py lines 3-21 were also deleted; their content is not shown in the diff view)
+import copy
+import time
+import ctypes  # to call into the C API directly
+import llama_cpp
+from llama_cpp import Llama
+from huggingface_hub import hf_hub_download  # to fetch the model from the Hugging Face Hub
+
+
+llm = Llama(model_path=hf_hub_download(
+    repo_id="TheBloke/Llama-2-7B-Chat-GGML",
+    filename="llama-2-7b-chat.ggmlv3.q4_1.bin"), n_ctx=2048)  # download the model from HF; n_ctx=2048 for a larger context window
+history = []
+
+pre_prompt = " The user and the AI are having a conversation : <|endoftext|> \n "
+
+
+def generate_text(input_text, history):
+    print("history ", history)
+    print("input ", input_text)
+    temp = ""
+    if history == []:
+        input_text_with_history = f"SYSTEM:{pre_prompt}" + \
+            "\n" + f"USER: {input_text} " + "\n" + " ASSISTANT:"
+    else:
+        input_text_with_history = f"{history[-1][1]}" + "\n"
+        input_text_with_history += f"USER: {input_text}" + "\n" + " ASSISTANT:"
+    print("new input", input_text_with_history)
+    output = llm(input_text_with_history, max_tokens=1024, stop=[
+        "<|prompter|>", "<|endoftext|>", "<|endoftext|> \n", "ASSISTANT:", "USER:", "SYSTEM:"], stream=True)
+    for out in output:
+        stream = copy.deepcopy(out)
+        print(stream["choices"][0]["text"])
+        temp += stream["choices"][0]["text"]
+        yield temp
+
+    history = ["init", input_text_with_history]  # rebinds the local name only; gr.ChatInterface keeps its own history
+
+
+demo = gr.ChatInterface(generate_text,
+                        title="LLM on CPU",
+                        description="Running an LLM with https://github.com/abetlen/llama-cpp-python. btw the text streaming thing was the hardest thing to implement",
+                        examples=["Hello", "Am I cool?",
+                                  "Are tomatoes vegetables?"],
+                        cache_examples=True,
+                        retry_btn=None,
+                        undo_btn="Delete Previous",
+                        clear_btn="Clear",)
+demo.queue(concurrency_count=1, max_size=5)
+demo.launch()
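
The rewritten app.py boils down to three pieces: fetching a quantized GGML build of Llama-2-7B-Chat from the Hub, driving it through a streaming generator, and wrapping that in gr.ChatInterface. A minimal load-and-prompt sketch of the first piece, assuming an older llama-cpp-python release (roughly 0.1.78 or earlier) that still reads GGML files; newer releases expect GGUF models instead:

# Minimal sketch: download the quantized model file once, load it, and run
# a one-shot (non-streaming) completion to confirm it answers.
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

model_path = hf_hub_download(
    repo_id="TheBloke/Llama-2-7B-Chat-GGML",
    filename="llama-2-7b-chat.ggmlv3.q4_1.bin",
)
llm = Llama(model_path=model_path, n_ctx=2048)  # 2048-token context window

result = llm("USER: Hello\nASSISTANT:", max_tokens=32, stop=["USER:"])
print(result["choices"][0]["text"])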
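
With stream=True, the llm(...) call returns an iterator of chunk dicts rather than one finished dict; each chunk carries a fresh piece of text in chunk["choices"][0]["text"]. The generator in the diff accumulates those pieces and yields the running string, which is exactly what gr.ChatInterface needs: each yielded value replaces the bot message, so the reply repaints token by token. A stripped-down sketch of that pattern (stop list shortened for brevity):

# Accumulate streamed chunks and yield the running reply after each one.
def stream_reply(llm, prompt):
    partial = ""
    for chunk in llm(prompt, max_tokens=1024,
                     stop=["USER:", "ASSISTANT:", "SYSTEM:"], stream=True):
        partial += chunk["choices"][0]["text"]  # one new slice of text per chunk
        yield partial  # Gradio redraws the message with each yielded value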
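
One quirk worth noting: the else branch seeds the next prompt from history[-1][1] alone, i.e. only the assistant's most recent reply, so earlier turns and the user's previous messages drop out of context. A hypothetical helper (not part of the Space) that folds the whole ChatInterface history back into the prompt instead:

# history arrives from gr.ChatInterface as a list of [user, bot] pairs.
def build_prompt(message, history, pre_prompt):
    prompt = f"SYSTEM:{pre_prompt}\n"
    for user_turn, bot_turn in history:
        prompt += f"USER: {user_turn}\nASSISTANT: {bot_turn}\n"
    prompt += f"USER: {message}\n ASSISTANT:"
    return prompt

# e.g. inside generate_text:
#   input_text_with_history = build_prompt(input_text, history, pre_prompt)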
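
Finally, demo.queue(concurrency_count=1, max_size=5) pins the Space to one generation at a time with up to five waiting requests; that signature is Gradio 3.x. A rough port for newer Gradio is sketched below (an assumption, not part of the diff): queue() dropped concurrency_count in favour of default_concurrency_limit, and later releases also removed the retry_btn/undo_btn/clear_btn arguments from ChatInterface.

import gradio as gr

# Hypothetical Gradio 4.x equivalent of the diff's UI setup.
demo = gr.ChatInterface(
    generate_text,  # the generator defined in app.py above
    title="LLM on CPU",
    examples=["Hello", "Am I cool?", "Are tomatoes vegetables?"],
)
demo.queue(max_size=5, default_concurrency_limit=1)  # one generation at a time
demo.launch()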