learn-ai

Sleeping

App Files Community

inflaton committed on Aug 3, 2023

Commit

c2cb992

1 Parent(s): 7f9d16c

tested app_modules/llm_loader.py

Browse files

Files changed (3) hide show

.env.example +3 -3
app_modules/llm_loader.py +15 -18
test.py +19 -10

.env.example CHANGED Viewed

@@ -54,13 +54,13 @@ MOSAICML_MODEL_NAME_OR_PATH="mosaicml/mpt-7b-instruct"
 FALCON_MODEL_NAME_OR_PATH="tiiuae/falcon-7b-instruct"
-GPT4ALL_J_MODEL_PATH="../models/llama-2-7b-chat.ggmlv3.q4_0.bin"
 GPT4ALL_J_DOWNLOAD_LINK=https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_0.bin
-GPT4ALL_MODEL_PATH="./models/ggml-nous-gpt4-vicuna-13b.bin"
 GPT4ALL_DOWNLOAD_LINK=https://gpt4all.io/models/ggml-nous-gpt4-vicuna-13b.bin
-LLAMACPP_MODEL_PATH="./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin"
 LLAMACPP_DOWNLOAD_LINK=https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_K_M.bin
 # Index for AI Books PDF files - chunk_size=1024 chunk_overlap=512

 FALCON_MODEL_NAME_OR_PATH="tiiuae/falcon-7b-instruct"
+GPT4ALL_J_MODEL_PATH="../models/llama-2-7b-chat.ggmlv3.q4_K_M.bin"
 GPT4ALL_J_DOWNLOAD_LINK=https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_0.bin
+GPT4ALL_MODEL_PATH="../models/ggml-nous-gpt4-vicuna-13b.bin"
 GPT4ALL_DOWNLOAD_LINK=https://gpt4all.io/models/ggml-nous-gpt4-vicuna-13b.bin
+LLAMACPP_MODEL_PATH="../models/llama-2-7b-chat.ggmlv3.q4_K_M.bin"
 LLAMACPP_DOWNLOAD_LINK=https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q4_K_M.bin
 # Index for AI Books PDF files - chunk_size=1024 chunk_overlap=512

app_modules/llm_loader.py CHANGED Viewed

@@ -30,7 +30,7 @@ from transformers import (
 )
 from app_modules.instruct_pipeline import InstructionTextGenerationPipeline
-from app_modules.utils import ensure_model_is_downloaded, remove_extra_spaces
 class TextIteratorStreamer(TextStreamer, StreamingStdOutCallbackHandler):
@@ -336,7 +336,6 @@ class LLMLoader:
                     )
                 else:
                     if os.environ.get("DISABLE_MODEL_PRELOADING") != "true":
-                        use_auth_token = None
                         model = (
                             AutoModelForSeq2SeqLM.from_pretrained(
                                 MODEL_NAME_OR_PATH,
@@ -354,25 +353,23 @@ class LLMLoader:
                         )
                         print(f"Model memory footprint: {model.get_memory_footprint()}")
                     else:
-                        use_auth_token = token
                         model = MODEL_NAME_OR_PATH
                     pipe = pipeline(
-                        task,
-                        model=model,
-                        tokenizer=tokenizer,
-                        streamer=self.streamer,
-                        return_full_text=return_full_text,  # langchain expects the full text
-                        device=hf_pipeline_device_type,
-                        torch_dtype=torch_dtype,
-                        max_new_tokens=2048,
-                        trust_remote_code=True,
-                        temperature=temperature,
-                        top_p=0.95,
-                        top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
-                        repetition_penalty=1.115,
-                        token=use_auth_token,
-                    )
                 self.llm = HuggingFacePipeline(pipeline=pipe, callbacks=callbacks)
             elif self.llm_model_type == "mosaicml":

 )
 from app_modules.instruct_pipeline import InstructionTextGenerationPipeline
+from app_modules.utils import ensure_model_is_downloaded
 class TextIteratorStreamer(TextStreamer, StreamingStdOutCallbackHandler):
                     )
                 else:
                     if os.environ.get("DISABLE_MODEL_PRELOADING") != "true":
                         model = (
                             AutoModelForSeq2SeqLM.from_pretrained(
                                 MODEL_NAME_OR_PATH,
                         )
                         print(f"Model memory footprint: {model.get_memory_footprint()}")
                     else:
                         model = MODEL_NAME_OR_PATH
                     pipe = pipeline(
+                            task,
+                            model=model,
+                            tokenizer=tokenizer,
+                            streamer=self.streamer,
+                            return_full_text=return_full_text,  # langchain expects the full text
+                            device=hf_pipeline_device_type,
+                            torch_dtype=torch_dtype,
+                            max_new_tokens=2048,
+                            trust_remote_code=True,
+                            temperature=temperature,
+                            top_p=0.95,
+                            top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
+                            repetition_penalty=1.115,
+                        )
                 self.llm = HuggingFacePipeline(pipeline=pipe, callbacks=callbacks)
             elif self.llm_model_type == "mosaicml":

test.py CHANGED Viewed

@@ -1,14 +1,21 @@
 # project/test.py
 import unittest
 from langchain.callbacks.base import BaseCallbackHandler
 from langchain.schema import HumanMessage
 from app_modules.llm_loader import LLMLoader
-from timeit import default_timer as timer
-USER_QUESTION = "What's the capital city of Malaysia?"
 class MyCustomHandler(BaseCallbackHandler):
@@ -32,7 +39,9 @@ class TestLLMLoader(unittest.TestCase):
     def run_test_case(self, llm_model_type, query):
         llm_loader = LLMLoader(llm_model_type)
         start = timer()
-        llm_loader.init(n_threds=8, hf_pipeline_device_type="cpu")
         end = timer()
         print(f"Model loaded in {end - start:.3f}s")
@@ -43,17 +52,17 @@ class TestLLMLoader(unittest.TestCase):
         print(f"Inference completed in {end2 - end:.3f}s")
         print(result)
-    def xtest_openai(self):
-        self.run_test_case("openai", USER_QUESTION)
-    def xtest_llamacpp(self):
-        self.run_test_case("llamacpp", USER_QUESTION)
-    def xtest_gpt4all_j(self):
-        self.run_test_case("gpt4all-j", USER_QUESTION)
     def test_huggingface(self):
-        self.run_test_case("huggingface", USER_QUESTION)
 if __name__ == "__main__":

 # project/test.py
+import os
 import unittest
+from timeit import default_timer as timer
 from langchain.callbacks.base import BaseCallbackHandler
 from langchain.schema import HumanMessage
 from app_modules.llm_loader import LLMLoader
+from app_modules.utils import *
+user_question = "What's the capital city of Malaysia?"
+n_threds = int(os.environ.get("NUMBER_OF_CPU_CORES") or "4")
+hf_embeddings_device_type, hf_pipeline_device_type = get_device_types()
+print(f"hf_embeddings_device_type: {hf_embeddings_device_type}")
+print(f"hf_pipeline_device_type: {hf_pipeline_device_type}")
 class MyCustomHandler(BaseCallbackHandler):
     def run_test_case(self, llm_model_type, query):
         llm_loader = LLMLoader(llm_model_type)
         start = timer()
+        llm_loader.init(
+            n_threds=n_threds, hf_pipeline_device_type=hf_pipeline_device_type
+        )
         end = timer()
         print(f"Model loaded in {end - start:.3f}s")
         print(f"Inference completed in {end2 - end:.3f}s")
         print(result)
+    def test_openai(self):
+        self.run_test_case("openai", user_question)
+    def test_llamacpp(self):
+        self.run_test_case("llamacpp", user_question)
+    def test_gpt4all_j(self):
+        self.run_test_case("gpt4all-j", user_question)
     def test_huggingface(self):
+        self.run_test_case("huggingface", user_question)
 if __name__ == "__main__":