added refine summary chain
- app_modules/init.py +34 -30
- app_modules/llm_summarize_chain.py +20 -0
- summarize.py +70 -0
app_modules/init.py
CHANGED

@@ -23,55 +23,59 @@ load_dotenv(found_dotenv, override=False)
 init_settings()
 
 
-def app_init():
+def app_init(initQAChain: bool = True):
     # https://github.com/huggingface/transformers/issues/17611
     os.environ["CURL_CA_BUNDLE"] = ""
 
+    llm_model_type = os.environ.get("LLM_MODEL_TYPE")
+    n_threds = int(os.environ.get("NUMBER_OF_CPU_CORES") or "4")
+
     hf_embeddings_device_type, hf_pipeline_device_type = get_device_types()
     print(f"hf_embeddings_device_type: {hf_embeddings_device_type}")
     print(f"hf_pipeline_device_type: {hf_pipeline_device_type}")
 
-    hf_embeddings_model_name = (
-        os.environ.get("HF_EMBEDDINGS_MODEL_NAME") or "hkunlp/instructor-xl"
-    )
-
-    n_threds = int(os.environ.get("NUMBER_OF_CPU_CORES") or "4")
-    index_path = os.environ.get("FAISS_INDEX_PATH") or os.environ.get(
-        "CHROMADB_INDEX_PATH"
-    )
-    using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None
-    llm_model_type = os.environ.get("LLM_MODEL_TYPE")
-
-    start = timer()
-    embeddings = HuggingFaceInstructEmbeddings(
-        model_name=hf_embeddings_model_name,
-        model_kwargs={"device": hf_embeddings_device_type},
-    )
-    end = timer()
-
-    print(f"Completed in {end - start:.3f}s")
-
-    start = timer()
-
-    print(f"Load index from {index_path} with {'FAISS' if using_faiss else 'Chroma'}")
-
-    if not os.path.isdir(index_path):
-        raise ValueError(f"{index_path} does not exist!")
-    elif using_faiss:
-        vectorstore = FAISS.load_local(index_path, embeddings)
-    else:
-        vectorstore = Chroma(
-            embedding_function=embeddings, persist_directory=index_path
-        )
-
-    end = timer()
-
-    print(f"Completed in {end - start:.3f}s")
+    if initQAChain:
+        hf_embeddings_model_name = (
+            os.environ.get("HF_EMBEDDINGS_MODEL_NAME") or "hkunlp/instructor-xl"
+        )
+
+        index_path = os.environ.get("FAISS_INDEX_PATH") or os.environ.get(
+            "CHROMADB_INDEX_PATH"
+        )
+        using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None
+
+        start = timer()
+        embeddings = HuggingFaceInstructEmbeddings(
+            model_name=hf_embeddings_model_name,
+            model_kwargs={"device": hf_embeddings_device_type},
+        )
+        end = timer()
+
+        print(f"Completed in {end - start:.3f}s")
+
+        start = timer()
+
+        print(
+            f"Load index from {index_path} with {'FAISS' if using_faiss else 'Chroma'}"
+        )
+
+        if not os.path.isdir(index_path):
+            raise ValueError(f"{index_path} does not exist!")
+        elif using_faiss:
+            vectorstore = FAISS.load_local(index_path, embeddings)
+        else:
+            vectorstore = Chroma(
+                embedding_function=embeddings, persist_directory=index_path
+            )
+
+        end = timer()
+
+        print(f"Completed in {end - start:.3f}s")
 
     start = timer()
     llm_loader = LLMLoader(llm_model_type)
     llm_loader.init(n_threds=n_threds, hf_pipeline_device_type=hf_pipeline_device_type)
-    qa_chain = QAChain(vectorstore, llm_loader)
+    qa_chain = QAChain(vectorstore, llm_loader) if initQAChain else None
     end = timer()
     print(f"Completed in {end - start:.3f}s")
 
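The new initQAChain flag lets callers skip the embeddings and vectorstore setup entirely when only the LLM is needed. The function's return statement sits below this hunk, but summarize.py (added below) calls app_init(False)[0], which suggests app_init returns a tuple whose first element is llm_loader (presumably (llm_loader, qa_chain), with qa_chain left as None when the flag is off). A minimal sketch of the two call patterns, under that assumption:

# Default: load embeddings, the FAISS/Chroma index, and the QA chain
llm_loader, qa_chain = app_init()

# LLM-only: no embeddings or index are loaded; qa_chain comes back as None
llm_loader = app_init(False)[0]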
app_modules/llm_summarize_chain.py
ADDED

@@ -0,0 +1,20 @@
+import os
+from typing import List, Optional
+
+from langchain.chains.base import Chain
+from langchain.chains.summarize import load_summarize_chain
+
+from app_modules.llm_inference import LLMInference
+
+
+class SummarizeChain(LLMInference):
+    def __init__(self, llm_loader):
+        super().__init__(llm_loader)
+
+    def create_chain(self) -> Chain:
+        chain = load_summarize_chain(self.llm_loader.llm, chain_type="refine")
+        return chain
+
+    def run_chain(self, chain, inputs, callbacks: Optional[List] = []):
+        result = chain(inputs, return_only_outputs=True)
+        return result
summarize.py
ADDED

@@ -0,0 +1,70 @@
+# setting device on GPU if available, else CPU
+import os
+import sys
+from timeit import default_timer as timer
+from typing import List
+
+from langchain.document_loaders import PyPDFDirectoryLoader
+from langchain.embeddings import HuggingFaceInstructEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores.base import VectorStore
+from langchain.vectorstores.chroma import Chroma
+from langchain.vectorstores.faiss import FAISS
+
+from app_modules.init import app_init, get_device_types
+from app_modules.llm_summarize_chain import SummarizeChain
+
+
+def load_documents(source_pdfs_path, urls) -> List:
+    loader = PyPDFDirectoryLoader(source_pdfs_path, silent_errors=True)
+    documents = loader.load()
+    if urls is not None and len(urls) > 0:
+        for doc in documents:
+            source = doc.metadata["source"]
+            filename = source.split("/")[-1]
+            for url in urls:
+                if url.endswith(filename):
+                    doc.metadata["url"] = url
+                    break
+    return documents
+
+
+def split_chunks(documents: List, chunk_size, chunk_overlap) -> List:
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size, chunk_overlap=chunk_overlap
+    )
+    return text_splitter.split_documents(documents)
+
+
+llm_loader = app_init(False)[0]
+
+source_pdfs_path = (
+    sys.argv[1] if len(sys.argv) > 1 else os.environ.get("SOURCE_PDFS_PATH")
+)
+chunk_size = os.environ.get("CHUNCK_SIZE")
+chunk_overlap = os.environ.get("CHUNK_OVERLAP")
+
+sources = load_documents(source_pdfs_path, None)
+
+print(f"Splitting {len(sources)} PDF pages in to chunks ...")
+
+chunks = split_chunks(
+    sources, chunk_size=int(chunk_size), chunk_overlap=int(chunk_overlap)
+)
+
+print(f"Summarizing {len(chunks)} chunks ...")
+start = timer()
+
+summarize_chain = SummarizeChain(llm_loader)
+result = summarize_chain.call_chain(
+    {"input_documents": chunks},
+    None,
+    None,
+    True,
+)
+
+end = timer()
+print(f"Completed in {end - start:.3f}s")
+
+print("\n\n***Summary:")
+print(result["output_text"])