Update run.py
run.py
CHANGED
@@ -1,8 +1,8 @@
 #########################################################################################
-# Title: German AI-Interface
 # Author: Andreas Fischer
 # Date: January 31st, 2023
-# Last update: February
 ##########################################################################################
 
 #https://github.com/abetlen/llama-cpp-python/issues/306
@@ -26,10 +26,13 @@ if(os.path.exists(filename)==True): os.remove(filename)
 #-----------
 import os
 import chromadb
-dbPath="/home/af/Schreibtisch/gradio/Chroma/db"
-if(os.path.exists(dbPath)
 
 print(dbPath)
 #client = chromadb.Client()
 path=dbPath
 client = chromadb.PersistentClient(path=path)
@@ -40,7 +43,7 @@ from chromadb.utils import embedding_functions
 default_ef = embedding_functions.DefaultEmbeddingFunction()
 #sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="T-Systems-onsite/cross-en-de-roberta-sentence-transformer")
 #instructor_ef = embedding_functions.InstructorEmbeddingFunction(model_name="hkunlp/instructor-large", device="cuda")
-embeddingModel = embedding_functions.
 
 print(str(client.list_collections()))
 
@@ -143,37 +146,41 @@ x=[x["type2"] for x in rag0["metadatas"][0]]
 x.index("1c") if "1c" in x else len(x)+1
 
 
-# Get model
-#-----------
-
-import os
-import requests
-
-modelPath="/home/af/gguf/models/discolm_german_7b_v1.Q4_0.gguf"
-if(os.path.exists(modelPath)==False):
-    #url="https://huggingface.co/TheBloke/WizardLM-13B-V1.2-GGUF/resolve/main/wizardlm-13b-v1.2.Q4_0.gguf"
-    #url="https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/resolve/main/mixtral-8x7b-instruct-v0.1.Q4_0.gguf?download=true"
-    #url="https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_0.gguf?download=true"
-    url="https://huggingface.co/TheBloke/DiscoLM_German_7b_v1-GGUF/resolve/main/discolm_german_7b_v1.Q4_0.gguf?download=true"
-    response = requests.get(url)
-    with open("./model.gguf", mode="wb") as file:
-        file.write(response.content)
-    print("Model downloaded")
-    modelPath="./model.gguf"
-
-print(modelPath)
-
-
-# Llama-cpp-Server
-#------------------
 
-
-
-
 
-
-
-
 
 
 # Gradio-GUI
@@ -346,58 +353,104 @@ def response(message, history):
         historylimit=historylimit # number of past messages to consider for response to current message
     )
     print(prompt)
-
-
-
-
-
-
-
-
-
-if
-
-
-
-
-
-
-
-
-
-
-
-
-
-part
             print(part, end="", flush=True)
-response
-
-
-
                 pass
-
-
-
-
-
-
-
-
-
-
-
-
-]
-
-)
-
 
 gr.ChatInterface(
     response,
     chatbot=gr.Chatbot(value=[[None,"Herzlich willkommen! Ich bin ein KI-basiertes Assistenzsystem, das für jede Anfrage die am besten geeigneten KI-Tools empfiehlt.<br>Aktuell bin ich wenig mehr als eine Tech-Demo und kenne nur 7 KI-Modelle - also sei bitte nicht zu streng mit mir.<br>Was ist dein Anliegen?"]],render_markdown=True),
-    title="German AI-Interface
     #additional_inputs=[gr.Dropdown(["Permanent","Temporär"],value="Temporär",label="Dialog sichern?")]
 ).queue().launch(share=True) #False, server_name="0.0.0.0", server_port=7864)
 print("Interface up and running!")
 #########################################################################################
+# Title: German AI-Interface with advanced RAG
 # Author: Andreas Fischer
 # Date: January 31st, 2023
+# Last update: February 22nd, 2024
 ##########################################################################################
 
 #https://github.com/abetlen/llama-cpp-python/issues/306
 #-----------
 import os
 import chromadb
+dbPath = "/home/af/Schreibtisch/Code/gradio/Chroma/db"
+onPrem = True if(os.path.exists(dbPath)) else False
+if(onPrem==False): dbPath="/home/user/app/db"
 
+onPrem=False
 print(dbPath)
+
 #client = chromadb.Client()
 path=dbPath
 client = chromadb.PersistentClient(path=path)
 default_ef = embedding_functions.DefaultEmbeddingFunction()
 #sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="T-Systems-onsite/cross-en-de-roberta-sentence-transformer")
 #instructor_ef = embedding_functions.InstructorEmbeddingFunction(model_name="hkunlp/instructor-large", device="cuda")
+embeddingModel = embedding_functions.InstructorEmbeddingFunction(model_name="T-Systems-onsite/cross-en-de-roberta-sentence-transformer", device="cuda" if(onPrem) else "cpu")
 
 print(str(client.list_collections()))
 
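For orientation, this is how an embedding function like the one above is typically attached to a Chroma collection and queried. It is a minimal sketch; the collection name and the query text are illustrative assumptions and not part of this commit:

# Sketch only: collection name and query text are assumptions, not from this commit.
collection = client.get_or_create_collection(
    name="ragCollection",                 # hypothetical collection name
    embedding_function=embeddingModel     # the embedding function defined above
)
results = collection.query(query_texts=["Welche KI-Tools eignen sich für Textzusammenfassungen?"], n_results=3)
print(results["documents"][0])            # top-3 matching documents for the query

With device="cpu" chosen whenever onPrem is False, the embeddings are computed without a GPU, which matches how the Space is expected to run off-prem.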
 x.index("1c") if "1c" in x else len(x)+1
 
 
 
+# Model
+#-------
+#onPrem=False
 
+if(onPrem==False):
+    modelPath="mistralai/Mixtral-8x7B-Instruct-v0.1"
+    from huggingface_hub import InferenceClient
+    import gradio as gr
+    client = InferenceClient(
+        modelPath
+        #"mistralai/Mixtral-8x7B-Instruct-v0.1"
+        #"mistralai/Mistral-7B-Instruct-v0.1"
+    )
+else:
+    import os
+    import requests
+    import subprocess
+    modelPath="/home/af/gguf/models/discolm_german_7b_v1.Q4_0.gguf"
+    if(os.path.exists(modelPath)==False):
+        #url="https://huggingface.co/TheBloke/WizardLM-13B-V1.2-GGUF/resolve/main/wizardlm-13b-v1.2.Q4_0.gguf"
+        #url="https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/resolve/main/mixtral-8x7b-instruct-v0.1.Q4_0.gguf?download=true"
+        #url="https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_0.gguf?download=true"
+        url="https://huggingface.co/TheBloke/DiscoLM_German_7b_v1-GGUF/resolve/main/discolm_german_7b_v1.Q4_0.gguf?download=true"
+        response = requests.get(url)
+        with open("./model.gguf", mode="wb") as file:
+            file.write(response.content)
+        print("Model downloaded")
+        modelPath="./model.gguf"
+    print(modelPath)
+    n="20"
+    if("mixtral-8x7b-instruct" in modelPath): n="0" # mixtral seems to cause problems here...
+    command = ["python3", "-m", "llama_cpp.server", "--model", modelPath, "--host", "0.0.0.0", "--port", "2600", "--n_threads", "8", "--n_gpu_layers", n]
+    subprocess.Popen(command)
+    print("Server ready!")
 
 
 # Gradio-GUI
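One caveat in the on-prem branch: subprocess.Popen returns immediately, so "Server ready!" is printed before llama_cpp.server is actually listening on port 2600. Below is a minimal sketch of a readiness poll, assuming the server exposes the OpenAI-compatible GET /v1/models endpoint on the port configured above; the timeout values are arbitrary assumptions:

import time
import requests

def wait_for_server(base_url="http://0.0.0.0:2600", timeout=120):
    # Poll the (assumed) OpenAI-compatible /v1/models endpoint until the
    # llama_cpp.server process accepts connections, or give up after `timeout` seconds.
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            if requests.get(base_url + "/v1/models", timeout=2).status_code == 200:
                return True
        except requests.exceptions.RequestException:
            pass
        time.sleep(1)
    return False

Calling wait_for_server() right after subprocess.Popen(command) would make the "Server ready!" message reflect the actual server state.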
         historylimit=historylimit # number of past messages to consider for response to current message
     )
     print(prompt)
+
+    ## Request response from model
+    #------------------------------
+    print("AI running on prem!" if(onPrem) else "AI running HFHub!")
+    if(onPrem==False):
+        temperature=float(0.9)
+        max_new_tokens=500
+        top_p=0.95
+        repetition_penalty=1.0
+        if temperature < 1e-2: temperature = 1e-2
+        top_p = float(top_p)
+        generate_kwargs = dict(
+            temperature=temperature,
+            max_new_tokens=max_new_tokens,
+            top_p=top_p,
+            repetition_penalty=repetition_penalty,
+            do_sample=True,
+            seed=42,
+        )
+        stream = client.text_generation(prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
+        response = ""
+        print("User: "+message+"\nAI: ")
+        for text in stream:
+            part=text.token.text
             print(part, end="", flush=True)
+            response += part
+            yield response
+        if((myType=="1a")|(myType=="1b")): #add RAG-results to chat-output if appropriate
+            response=response+"\n\n<br><details><summary><strong>Sources</strong></summary><br><ul>"+ "".join(["<li>" + s + "</li>" for s in combination])+"</ul></details>"
+            yield response
+        history.append((message, response)) # add current dialog to history
+        # Store current state in DB if settings=="Permanent"
+        if (settings=="Permanent"):
+            x=collection.get(include=[])["ids"] # add current dialog to db
+            collection.add(
+                documents=[message,response],
+                metadatas=[
+                    { "source": "ICH", "dialog": f"ICH: {message.strip()}\n DU: {response.strip()}", "type":"episode"},
+                    { "source": "DU", "dialog": f"ICH: {message.strip()}\n DU: {response.strip()}", "type":"episode"}
+                ],
+                ids=[str(len(x)+1),str(len(x)+2)]
+            )
+            json.dump(history,open(filename,'w',encoding="utf-8"),ensure_ascii=False)
+
+    if(onPrem==True):
+        # url="https://afischer1985-wizardlm-13b-v1-2-q4-0-gguf.hf.space/v1/completions"
+        url="http://0.0.0.0:2600/v1/completions"
+        body={"prompt":prompt,"max_tokens":None, "echo":"False","stream":"True"} # e.g. Mixtral-Instruct
+        if("discolm_german_7b" in modelPath): body.update({"stop": ["<|im_end|>"]}) # fix stop-token of DiscoLM
+        response="" #+"("+myType+")\n"
+        buffer=""
+        #print("URL: "+url)
+        print("User: "+message+"\nAI: ")
+        for text in requests.post(url, json=body, stream=True): #-H 'accept: application/json' -H 'Content-Type: application/json'
+            if buffer is None: buffer=""
+            buffer=str("".join(buffer))
+            # print("*** Raw String: "+str(text)+"\n***\n")
+            text=text.decode('utf-8')
+            if((text.startswith(": ping -")==False) & (len(text.strip("\n\r"))>0)): buffer=buffer+str(text)
+            # print("\n*** Buffer: "+str(buffer)+"\n***\n")
+            buffer=buffer.split('"finish_reason": null}]}')
+            if(len(buffer)==1):
+                buffer="".join(buffer)
                 pass
+            if(len(buffer)==2):
+                part=buffer[0]+'"finish_reason": null}]}'
+                if(part.lstrip('\n\r').startswith("data: ")): part=part.lstrip('\n\r').replace("data: ", "")
+                try:
+                    part = str(json.loads(part)["choices"][0]["text"])
+                    print(part, end="", flush=True)
+                    response=response+part
+                    buffer="" # reset buffer
+                except Exception as e:
+                    print("Exception:"+str(e))
+                    pass
+            yield response
+        if((myType=="1a")|(myType=="1b")): #add RAG-results to chat-output if appropriate
+            response=response+"\n\n<br><details><summary><strong>Sources</strong></summary><br><ul>"+ "".join(["<li>" + s + "</li>" for s in combination])+"</ul></details>"
+            yield response
+        history.append((message, response)) # add current dialog to history
+        # Store current state in DB if settings=="Permanent"
+        if (settings=="Permanent"):
+            x=collection.get(include=[])["ids"] # add current dialog to db
+            collection.add(
+                documents=[message,response],
+                metadatas=[
+                    { "source": "ICH", "dialog": f"ICH: {message.strip()}\n DU: {response.strip()}", "type":"episode"},
+                    { "source": "DU", "dialog": f"ICH: {message.strip()}\n DU: {response.strip()}", "type":"episode"}
+                ],
+                ids=[str(len(x)+1),str(len(x)+2)]
+            )
+            json.dump(history,open(filename,'w',encoding="utf-8"),ensure_ascii=False)
+
 
 gr.ChatInterface(
     response,
     chatbot=gr.Chatbot(value=[[None,"Herzlich willkommen! Ich bin ein KI-basiertes Assistenzsystem, das für jede Anfrage die am besten geeigneten KI-Tools empfiehlt.<br>Aktuell bin ich wenig mehr als eine Tech-Demo und kenne nur 7 KI-Modelle - also sei bitte nicht zu streng mit mir.<br>Was ist dein Anliegen?"]],render_markdown=True),
+    title="German AI-Interface with advanced RAG",
     #additional_inputs=[gr.Dropdown(["Permanent","Temporär"],value="Temporär",label="Dialog sichern?")]
 ).queue().launch(share=True) #False, server_name="0.0.0.0", server_port=7864)
 print("Interface up and running!")
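A side note on the commented-out additional_inputs line: the response() generator already branches on a settings value ("Permanent" vs. "Temporär"), but that branch only becomes reachable once the dropdown is passed to the interface, since gr.ChatInterface forwards additional inputs as extra arguments to the chat function. Below is a minimal sketch of that wiring, assuming the rest of the file stays as committed; the default value simply mirrors the commented line.

# Sketch only: shows how the commented-out dropdown could feed the existing
# settings=="Permanent" branch; everything else is unchanged from the commit.
def response(message, history, settings="Temporär"):
    ...  # body as above, which already checks settings=="Permanent"

gr.ChatInterface(
    response,
    chatbot=gr.Chatbot(value=[[None,"Herzlich willkommen!"]], render_markdown=True),
    title="German AI-Interface with advanced RAG",
    additional_inputs=[gr.Dropdown(["Permanent","Temporär"], value="Temporär", label="Dialog sichern?")]
).queue().launch(share=True)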