Update run.py

run.py CHANGED
@@ -2,7 +2,7 @@
 # Title: German AI-Interface with advanced RAG
 # Author: Andreas Fischer
 # Date: January 31st, 2023
-# Last update: February
+# Last update: February 26th, 2024
 ##########################################################################################

 #https://github.com/abetlen/llama-cpp-python/issues/306
@@ -30,7 +30,7 @@ dbPath = "/home/af/Schreibtisch/Code/gradio/Chroma/db"
 onPrem = True if(os.path.exists(dbPath)) else False
 if(onPrem==False): dbPath="/home/user/app/db"

-onPrem=
+#onPrem=True # uncomment to override automatic detection
 print(dbPath)

 #client = chromadb.Client()
@@ -164,12 +164,11 @@ else:
 import os
 import requests
 import subprocess
-modelPath="/home/af/gguf/models/Discolm_german_7b_v1.Q4_0.gguf"
+#modelPath="/home/af/gguf/models/Discolm_german_7b_v1.Q4_0.gguf"
+modelPath="/home/af/gguf/models/Mixtral-8x7b-instruct-v0.1.Q4_0.gguf"
 if(os.path.exists(modelPath)==False):
-  #url="https://huggingface.co/TheBloke/WizardLM-13B-V1.2-GGUF/resolve/main/wizardlm-13b-v1.2.Q4_0.gguf"
-  url="https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/resolve/main/mixtral-8x7b-instruct-v0.1.Q4_0.gguf?download=true"
-  #url="https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_0.gguf?download=true"
   #url="https://huggingface.co/TheBloke/DiscoLM_German_7b_v1-GGUF/resolve/main/discolm_german_7b_v1.Q4_0.gguf?download=true"
+  url="https://huggingface.co/TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/resolve/main/mixtral-8x7b-instruct-v0.1.Q4_0.gguf?download=true"
   response = requests.get(url)
   with open("./Mixtral-8x7b-instruct.gguf", mode="wb") as file:
     file.write(response.content)
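A note on the download in this hunk: requests.get(url) buffers the entire multi-gigabyte GGUF file in memory before anything is written to disk. A chunked variant along these lines keeps memory usage flat (a sketch reusing the url and filename from the hunk, not code from the commit):

    import requests

    # Stream the GGUF file to disk in 16 MiB chunks instead of buffering it whole
    with requests.get(url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open("./Mixtral-8x7b-instruct.gguf", mode="wb") as file:
            for chunk in r.iter_content(chunk_size=16 * 1024 * 1024):
                file.write(chunk)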
@@ -183,10 +182,15 @@ else:
 print("Server ready!")


+#import llama_cpp
+#llama_cpp.llama_backend_init(numa=False)
+#params=llama_cpp.llama_context_default_params()
+#params.n_ctx
+
 # Gradio-GUI
 #------------
-
-def extend_prompt(message="", history=None, system=None, RAGAddon=None, system2=None, zeichenlimit=None,historylimit=4):
+import re
+def extend_prompt(message="", history=None, system=None, RAGAddon=None, system2=None, zeichenlimit=None,historylimit=4, removeHTML=True):
   startOfString=""
   if zeichenlimit is None: zeichenlimit=1000000000 # :-)
   template0=" [INST]{system}\n [/INST] </s>"
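The commented-out block added here keeps the low-level llama_cpp initialization calls around for reference. When not going through the server, the usual route to the same concern (loading the GGUF and setting the context window) is the high-level wrapper; a minimal sketch, where the n_ctx value is an assumption and not taken from run.py:

    import llama_cpp

    # High-level route: load the model directly and set the context window explicitly
    llm = llama_cpp.Llama(model_path=modelPath, n_ctx=4096)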
@@ -229,13 +233,18 @@ def extend_prompt(message="", history=None, system=None, RAGAddon=None, system2=
   prompt += template0.format(system=system) #"<s>"
   if history is not None:
     for user_message, bot_response in history[-historylimit:]:
-      if user_message is
-      if bot_response is
-
+      if user_message is None: user_message = ""
+      if bot_response is None: bot_response = ""
+      bot_response = re.sub("\n\n<details>((.|\n)*?)</details>","", bot_response) # remove RAG-components
+      if removeHTML==True: bot_response = re.sub("<(.*?)>","\n", bot_response) # remove HTML-components in general (may cause bugs with markdown-rendering)
+      if user_message is not None: prompt += template1.format(message=user_message[:zeichenlimit])
+      if bot_response is not None: prompt += template2.format(response=bot_response[:zeichenlimit])
+  if message is not None: prompt += template1.format(message=message[:zeichenlimit])
   if system2 is not None:
     prompt += system2
   return startOfString+prompt

+
 import gradio as gr
 import requests
 import json
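The history loop now guards against None entries and strips earlier RAG attachments before they are fed back into the prompt. The two regexes behave like this (a quick standalone check using only the patterns shown in the hunk):

    import re

    # Strip a previously appended <details>...</details> sources block
    s = "Antwort\n\n<details><summary><strong>Sources</strong></summary><ul><li>a</li></ul></details>"
    print(re.sub("\n\n<details>((.|\n)*?)</details>", "", s))   # -> "Antwort"

    # removeHTML: replace any remaining tag with a newline
    print(re.sub("<(.*?)>", "\n", "Hallo <b>Welt</b>"))         # -> "Hallo \nWelt\n"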
@@ -244,7 +253,8 @@ import os
 import re

 def response(message, history):
-  settings="
+  settings="Memory Off"
+  removeHTML=True

   # Preprocessing to prevent simple forms of prompt injection:
   #-----------------------------------------------------------
@@ -253,12 +263,12 @@ def response(message, history):
   message=message.replace("[/INST]","")
   message=re.sub("<[|](im_start|im_end|end_of_turn)[|]>", '', message)

-  # Load Memory if
+  # Load Memory if memory is turned on
   #-------------------------------------
-  if (settings=="Permanent"):
+  if (settings=="Memory On"):
     if((len(history)==0)&(os.path.isfile(filename))): history=json.load(open(filename,'r',encoding="utf-8")) # retrieve history (if available)

-  system="Du bist ein deutschsprachiges wortkarges KI-basiertes Assistenzsystem."
+  system="Du bist ein deutschsprachiges wortkarges KI-basiertes Assistenzsystem. Antworte kurz, in deutsche Sprache und verzichte auf HTML und Code jeder Art."

   #RAG-layer 0: Intention-RAG
   #---------------------------
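The replace/re.sub lines at the top of this hunk are the commit's prompt-injection guard: they strip the instruction-format control tokens so user input cannot close or reopen the [INST] blocks of the template. Collected into one helper, the rule set looks like this sketch; the [INST] rule is an assumption, since the line just above this hunk is not shown in the diff:

    import re

    def sanitize(message):
        # Remove instruction-template control tokens from user input
        message = message.replace("[INST]", "")   # assumed companion rule (not visible in this hunk)
        message = message.replace("[/INST]", "")
        message = re.sub("<[|](im_start|im_end|end_of_turn)[|]>", "", message)
        return message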
@@ -354,7 +364,13 @@ def response(message, history):
     rag, # RAG-component added to the system prompt
     system2, # fictive first words of the AI (neither displayed nor stored)
-    historylimit=historylimit # number of past messages to consider for response to current message
+    historylimit=historylimit, # number of past messages to consider for response to current message
+    removeHTML=removeHTML # remove HTML-components from History (to prevent bugs with Markdown)
   )
+  #print("\n\nMESSAGE:"+str(message))
+  #print("\n\nHISTORY:"+str(history))
+  #print("\n\nSYSTEM:"+str(system))
+  #print("\n\nRAG:"+str(rag))
+  #print("\n\nSYSTEM2:"+str(system2))
   print("\n\n*** Prompt:\n"+prompt+"\n***\n\n")

   ## Request response from model
@@ -383,13 +399,14 @@ def response(message, history):
       part=text.token.text
       #print(part, end="", flush=True)
       response += part
+      if removeHTML==True: response = re.sub("<(.*?)>","\n", response) # remove HTML-components in general (may cause bugs with markdown-rendering)
       yield response
     if((myType=="1a")): #add RAG-results to chat-output if appropriate
-
-      yield
+      response=response+"\n\n<details><summary><strong>Sources</strong></summary><br><ul>"+ "".join(["<li>" + s + "</li>" for s in combination])+"</ul></details>"
+      yield response
     history.append((message, response)) # add current dialog to history
-    # Store current state in DB if
-    if (settings=="Permanent"):
+    # Store current state in DB if memory is turned on
+    if (settings=="Memory On"):
       x=collection.get(include=[])["ids"] # add current dialog to db
       collection.add(
         documents=[message,response],
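For context, the collection.get/collection.add pair above appends both sides of the current turn to the ChromaDB memory. A self-contained sketch of that pattern, where the collection name and id scheme are assumptions (run.py's actual values are not visible in this diff):

    import chromadb

    client = chromadb.PersistentClient(path=dbPath)               # dbPath as defined earlier in run.py
    collection = client.get_or_create_collection(name="memory")   # assumed collection name

    ids = collection.get(include=[])["ids"]                       # fetch ids only, skip embeddings/documents
    collection.add(
        documents=[message, response],                            # user turn and model answer
        ids=[str(len(ids) + 1), str(len(ids) + 2)],               # assumed sequential id scheme
    )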
@@ -405,7 +422,8 @@ def response(message, history):
   # url="https://afischer1985-wizardlm-13b-v1-2-q4-0-gguf.hf.space/v1/completions"
   url="http://0.0.0.0:2600/v1/completions"
   body={"prompt":prompt,"max_tokens":None, "echo":"False","stream":"True"} # e.g. Mixtral-Instruct
-  if("
+  if("Discolm_german_7b" in modelPath): body.update({"stop": ["<|im_end|>"]}) # fix stop-token of DiscoLM
+  if("Gemma-" in modelPath): body.update({"stop": ["<|im_end|>","</end_of_turn>"]}) # fix stop-token of Gemma
   response="" #+"("+myType+")\n"
   buffer=""
   #print("URL: "+url)
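The body above requests a streamed completion from the local llama-cpp-python server, which emulates the OpenAI-style SSE protocol. run.py consumes that stream further down (outside the hunks shown); a minimal consumer looks roughly like this sketch:

    import json
    import requests

    # Stream tokens from the /v1/completions endpoint line by line
    with requests.post(url, json=body, stream=True) as r:
        for line in r.iter_lines():
            if line.startswith(b"data: "):
                payload = line[len(b"data: "):]
                if payload == b"[DONE]":
                    break
                part = json.loads(payload)["choices"][0]["text"]
                print(part, end="", flush=True)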
@@ -432,13 +450,13 @@ def response(message, history):
     except Exception as e:
       print("Exception:"+str(e))
       pass
+    if removeHTML==True: response = re.sub("<(.*?)>","\n", response) # remove HTML-components in general (may cause bugs with markdown-rendering)
     yield response
   if((myType=="1a")): #add RAG-results to chat-output if appropriate
-
-    yield
-
-
-    if (settings=="Permanent"):
+    response=response+"\n\n<details><summary><strong>Sources</strong></summary><br><ul>"+ "".join(["<li>" + s + "</li>" for s in combination])+"</ul></details>"
+    yield response
+    # Store current state in DB if memory is turned on
+    if (settings=="Memory On"):
       x=collection.get(include=[])["ids"] # add current dialog to db
       collection.add(
         documents=[message,response],
@@ -453,9 +471,11 @@ def response(message, history):

 gr.ChatInterface(
   response,
-  chatbot=gr.Chatbot(value=[[None,"Herzlich willkommen! Ich bin ein KI-basiertes Assistenzsystem, das für jede Anfrage die am besten geeigneten KI-Tools empfiehlt
-  title="German AI-Interface with advanced RAG",
-  #additional_inputs=[gr.Dropdown(["
+  chatbot=gr.Chatbot(value=[[None,"Herzlich willkommen! Ich bin ein KI-basiertes Assistenzsystem, das für jede Anfrage die am besten geeigneten KI-Tools empfiehlt.\nAktuell bin ich wenig mehr als eine Tech-Demo und kenne nur 7 KI-Modelle - also sei bitte nicht zu streng mit mir.<ul><li>Wenn du ein KI-Modell suchst, antworte ich auf Basis der Liste</li><li>Wenn du Fragen zur Benutzung eines KI-Modells hast, verweise ich an andere Stellen</li><li>Wenn du andre Fragen hast, antworte ich frei und berücksichtige dabei Relevantes aus dem gesamten bisherigen Dialog.</li></ul>\nWas ist dein Anliegen?"]],render_markdown=True),
+  title="German AI-Interface with advanced RAG (on prem)" if onPrem else "German AI-Interface with advanced RAG (HFHub)",
+  #additional_inputs=[gr.Dropdown(["Memory On","Memory Off"],value="Memory Off",label="Memory")]
 ).queue().launch(share=True) #False, server_name="0.0.0.0", server_port=7864)
 print("Interface up and running!")

+
+
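Stripped of the RAG and memory layers, the UI wiring in this last hunk reduces to the following skeleton (a sketch using Gradio's tuple-style chat history, as run.py does; the greeting is shortened here):

    import gradio as gr

    def response(message, history):
        yield "..."  # streamed answer, produced as in the hunks above

    gr.ChatInterface(
        response,
        chatbot=gr.Chatbot(value=[[None, "Herzlich willkommen!"]], render_markdown=True),
        title="German AI-Interface with advanced RAG",
    ).queue().launch(share=True)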