Donghao Huang
committed
Commit · 6b469d2 · 1 Parent(s): 571afe2
fixed bug on llama-2
- app_modules/llm_inference.py +9 -8
- app_modules/llm_loader.py +1 -1
- test.py +5 -1
app_modules/llm_inference.py
CHANGED
@@ -35,7 +35,12 @@ class LLMInference(metaclass=abc.ABCMeta):
         return self.chain

     def call_chain(
-        self, inputs, streaming_handler, q: Queue = None, tracing: bool = False
+        self,
+        inputs,
+        streaming_handler,
+        q: Queue = None,
+        tracing: bool = False,
+        testing: bool = False,
     ):
         print(inputs)
         if self.llm_loader.streamer.for_huggingface:
@@ -46,11 +51,7 @@ class LLMInference(metaclass=abc.ABCMeta):

         chain = self.get_chain(tracing)
         result = (
-            self._run_chain(
-                chain,
-                inputs,
-                streaming_handler,
-            )
+            self._run_chain(chain, inputs, streaming_handler, testing)
             if streaming_handler is not None
             else chain(inputs)
         )
@@ -74,7 +75,7 @@ class LLMInference(metaclass=abc.ABCMeta):
     def _execute_chain(self, chain, inputs, q, sh):
         q.put(chain(inputs, callbacks=[sh]))

-    def _run_chain(self, chain, inputs, streaming_handler):
+    def _run_chain(self, chain, inputs, streaming_handler, testing):
         que = Queue()

         t = Thread(
@@ -83,7 +84,7 @@ class LLMInference(metaclass=abc.ABCMeta):
         )
         t.start()

-        if self.llm_loader.streamer.for_huggingface:
+        if self.llm_loader.streamer.for_huggingface and not testing:
             count = (
                 2
                 if "chat_history" in inputs and len(inputs.get("chat_history")) > 0
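For context, this change threads a new testing flag from call_chain into _run_chain so the HuggingFace streamer bookkeeping can be skipped during tests. Below is a minimal, self-contained sketch of the thread-plus-queue pattern the code above relies on; run_chain and fake_chain are illustrative names, not part of the repository.

from queue import Queue
from threading import Thread


def run_chain(chain, inputs, streaming_handler, testing=False):
    # Run the chain on a worker thread and collect its result via a Queue,
    # mirroring _execute_chain/_run_chain above.
    que = Queue()
    t = Thread(target=lambda: que.put(chain(inputs, callbacks=[streaming_handler])))
    t.start()

    if not testing:
        # The real _run_chain does streamer-specific token handling here for
        # HuggingFace models; testing=True bypasses it.
        pass

    t.join()
    return que.get()


def fake_chain(inputs, callbacks=None):
    # Stand-in for a LangChain chain: any callable taking (inputs, callbacks=...).
    return {"answer": "echo: " + inputs["question"]}


print(run_chain(fake_chain, {"question": "hi"}, streaming_handler=None))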
app_modules/llm_loader.py
CHANGED
@@ -227,6 +227,7 @@ class LLMLoader:
             if "gpt4all-j" in MODEL_NAME_OR_PATH
             or "dolly" in MODEL_NAME_OR_PATH
             or "Qwen" in MODEL_NAME_OR_PATH
+            or "Llama-2" in MODEL_NAME_OR_PATH
             else 0
         )
         use_fast = (
@@ -452,7 +453,6 @@ class LLMLoader:
                 top_p=0.95,
                 top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
                 repetition_penalty=1.115,
-                use_auth_token=token,
                 token=token,
             )
         )
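Dropping use_auth_token=token matters because recent transformers releases deprecate use_auth_token in favour of token, and passing both in the same call can raise an error; only token=token is kept. A hedged sketch of loading a gated Llama-2 checkpoint with the newer argument (the model id and the HUGGINGFACE_AUTH_TOKEN variable are assumptions, not taken from this repo):

import os
from transformers import AutoTokenizer

# Assumed env var name; gated Llama-2 repos require an accepted license and an access token.
hf_token = os.environ.get("HUGGINGFACE_AUTH_TOKEN")

tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",  # assumed model id, for illustration only
    token=hf_token,  # newer transformers: use `token`, not `use_auth_token`
)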
test.py
CHANGED
@@ -69,7 +69,11 @@ while True:

     start = timer()
     result = qa_chain.call_chain(
-        {"question": query, "chat_history": chat_history}, custom_handler
+        {"question": query, "chat_history": chat_history},
+        custom_handler,
+        None,
+        False,
+        True,
     )
     end = timer()
     print(f"Completed in {end - start:.3f}s")
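The five positional arguments map onto the new call_chain signature (inputs, streaming_handler, q, tracing, testing). The same call written with keyword arguments, reusing the names already defined in test.py, reads more clearly; this is a readability sketch, not part of the commit:

result = qa_chain.call_chain(
    {"question": query, "chat_history": chat_history},
    custom_handler,
    q=None,
    tracing=False,
    testing=True,  # skip the HuggingFace streamer bookkeeping in _run_chain
)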