chore: linting

app/app.py (+18 -28)
@@ -1,8 +1,9 @@
 # Chroma compatibility issue resolution
 # https://docs.trychroma.com/troubleshooting#sqlite
-__import__(
+__import__("pysqlite3")
 import sys
-
+
+sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
 
 from tempfile import NamedTemporaryFile
 from typing import List
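Context for this hunk: Chroma needs SQLite 3.35 or newer, and the troubleshooting page linked in the comment recommends installing pysqlite3-binary and aliasing it over the stdlib module before chromadb is imported. A minimal sketch of the shim in isolation (assumes pysqlite3-binary is installed):

    # Swap pysqlite3 in for the stdlib sqlite3. This must run before
    # `import chromadb`, which checks the SQLite version at import time.
    __import__("pysqlite3")
    import sys

    sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")

    import sqlite3  # now backed by pysqlite3

    print(sqlite3.sqlite_version)  # expect 3.35 or newer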
@@ -11,7 +12,7 @@ import chainlit as cl
 from chainlit.types import AskFileResponse
 import chromadb
 from chromadb.config import Settings
-from langchain.chains import
+from langchain.chains import RetrievalQAWithSourcesChain
 from langchain.chat_models import ChatOpenAI
 from langchain.document_loaders import PDFPlumberLoader
 from langchain.embeddings.openai import OpenAIEmbeddings
@@ -31,7 +32,7 @@ def process_file(*, file: AskFileResponse) -> List[Document]:
 
     Args:
         file (AskFileResponse): input file to be processed
-
+
     Raises:
         ValueError: when we fail to process PDF files. We consider PDF file
             processing failure when there's no text returned. For example, PDFs
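The Raises note above concerns scanned PDFs that contain no extractable text. A plausible shape for that check, consistent with the docstring but not copied from the file (the path is hypothetical):

    from langchain.document_loaders import PDFPlumberLoader

    loader = PDFPlumberLoader("input.pdf")  # hypothetical path
    documents = loader.load()

    # PDFs with no machine-readable text (e.g. pure scans) come back empty.
    if not any(doc.page_content.strip() for doc in documents):
        raise ValueError("PDF parsing failed: no text extracted")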
@@ -51,8 +52,7 @@ def process_file(*, file: AskFileResponse) -> List[Document]:
     documents = loader.load()
 
     text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=3000,
-        chunk_overlap=100
+        chunk_size=3000, chunk_overlap=100
     )
     docs = text_splitter.split_documents(documents)
 
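For reference, the splitter options joined onto one line here are measured in characters, not tokens: each chunk is at most 3000 characters, and neighbouring chunks share up to 100 characters of context. A self-contained sketch using the same legacy langchain imports as this file (the input document is made up):

    from langchain.schema import Document
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=100)
    chunks = splitter.split_documents([Document(page_content="word " * 2000)])

    # ~10,000 characters in -> a few chunks of <= 3000 characters each,
    # with source metadata copied onto every chunk.
    print(len(chunks), max(len(c.page_content) for c in chunks))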
@@ -66,7 +66,9 @@ def process_file(*, file: AskFileResponse) -> List[Document]:
     return docs
 
 
-def create_search_engine(*, docs: List[Document], embeddings: Embeddings) -> VectorStore:
+def create_search_engine(
+    *, docs: List[Document], embeddings: Embeddings
+) -> VectorStore:
     """Takes a list of Langchain Documents and an embedding model API wrapper
     and build a search index using a VectorStore.
 
@@ -80,27 +82,21 @@ def create_search_engine(*, docs: List[Document], embeddings: Embeddings) -> VectorStore:
     """
     # Initialize Chromadb client to enable resetting and disable telemtry
     client = chromadb.EphemeralClient()
-    client_settings=Settings(
-        allow_reset=True,
-        anonymized_telemetry=False
-    )
+    client_settings = Settings(allow_reset=True, anonymized_telemetry=False)
 
     # Reset the search engine to ensure we don't use old copies.
     # NOTE: we do not need this for production
-    search_engine = Chroma(
-        client=client,
-        client_settings=client_settings
-    )
+    search_engine = Chroma(client=client, client_settings=client_settings)
     search_engine._client.reset()
     search_engine = Chroma.from_documents(
         client=client,
         documents=docs,
         embedding=embeddings,
-        client_settings=client_settings
+        client_settings=client_settings,
     )
 
     return search_engine
-
+
 
 @cl.on_chat_start
 async def on_chat_start():
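Behaviour is unchanged by the reflow above: an in-memory EphemeralClient, settings that permit reset() and disable telemetry, a throwaway Chroma handle used only to clear leftover state, then the real index built by Chroma.from_documents. A hedged sketch of the same pattern, simplified to pass the settings when the client is created (sample documents are invented; an OPENAI_API_KEY is assumed):

    import chromadb
    from chromadb.config import Settings
    from langchain.embeddings.openai import OpenAIEmbeddings
    from langchain.schema import Document
    from langchain.vectorstores import Chroma

    # In-memory client; allow_reset=True is what makes reset() legal.
    client = chromadb.EphemeralClient(
        settings=Settings(allow_reset=True, anonymized_telemetry=False)
    )
    client.reset()  # drop any collections left over from a previous session

    search_engine = Chroma.from_documents(
        client=client,
        documents=[Document(page_content="hello"), Document(page_content="world")],
        embedding=OpenAIEmbeddings(model="text-embedding-ada-002"),
    )
    print(search_engine.similarity_search("hello", k=1))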
@@ -123,20 +119,17 @@ async def on_chat_start():
     # Process and save data in the user session
     msg = cl.Message(content=f"Processing `{file.name}`...")
     await msg.send()
-
+
     docs = process_file(file=file)
     cl.user_session.set("docs", docs)
     msg.content = f"`{file.name}` processed. Loading ..."
     await msg.update()
 
     # Indexing documents into our search engine
-    embeddings = OpenAIEmbeddings(
-        model="text-embedding-ada-002"
-    )
+    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
     try:
         search_engine = await cl.make_async(create_search_engine)(
-            docs=docs,
-            embeddings=embeddings
+            docs=docs, embeddings=embeddings
         )
     except Exception as e:
         await cl.Message(content=f"Error: {e}").send()
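One detail worth noting in this hunk: create_search_engine is a plain synchronous function, so cl.make_async runs it in a worker thread and keeps Chainlit's event loop responsive while embeddings are computed. A minimal sketch of that helper, with a made-up slow_build standing in for the real indexing:

    import time

    import chainlit as cl

    def slow_build(n_docs: int) -> str:
        time.sleep(2)  # stands in for embedding + indexing work
        return f"indexed {n_docs} docs"

    @cl.on_chat_start
    async def start():
        # Offload the blocking call to a thread and await its result.
        result = await cl.make_async(slow_build)(n_docs=42)
        await cl.Message(content=result).send()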
@@ -145,9 +138,7 @@ async def on_chat_start():
     await msg.update()
 
     model = ChatOpenAI(
-        model="gpt-3.5-turbo-16k-0613",
-        temperature=0,
-        streaming=True
+        model="gpt-3.5-turbo-16k-0613", temperature=0, streaming=True
     )
 
     chain = RetrievalQAWithSourcesChain.from_chain_type(
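The hunk ends at the opening of the from_chain_type call, so the chain's arguments are not part of this commit. For orientation only, a typical construction with this legacy API looks like the sketch below; chain_type and the retriever settings are assumptions, not taken from the file:

    chain = RetrievalQAWithSourcesChain.from_chain_type(
        llm=model,  # the streaming ChatOpenAI configured above
        chain_type="stuff",  # assumption: retrieved chunks pasted into one prompt
        retriever=search_engine.as_retriever(search_kwargs={"k": 4}),  # assumption
    )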
@@ -164,13 +155,12 @@
 
 @cl.on_message
 async def main(message: cl.Message):
-
     # Let's load the chain from user_session
     chain = cl.user_session.get("chain")  # type: RetrievalQAWithSourcesChain
 
     response = await chain.acall(
         message.content,
-        callbacks=[cl.AsyncLangchainCallbackHandler(stream_final_answer=True)]
+        callbacks=[cl.AsyncLangchainCallbackHandler(stream_final_answer=True)],
     )
     answer = response["answer"]
     sources = response["sources"].strip()
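For readers unfamiliar with RetrievalQAWithSourcesChain: acall returns a dict, and its "answer" and "sources" keys are what the last two lines unpack, while cl.AsyncLangchainCallbackHandler(stream_final_answer=True) streams the final answer into the UI as it is generated. A short sketch of the call site with a hypothetical question:

    response = await chain.acall(
        "What does the document conclude?",  # hypothetical user message
        callbacks=[cl.AsyncLangchainCallbackHandler(stream_final_answer=True)],
    )
    print(response["answer"])           # generated answer text
    print(response["sources"].strip())  # comma-separated source ids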