feat: heya
Browse files- README.md +3 -4
- app/app.py +66 -1
README.md
CHANGED
|
@@ -5,9 +5,8 @@ _See the readme file in the main branch for updated instructions and information
|
|
| 5 |
## Lab3: Enabling Load PDF to Chainlit App
|
| 6 |
Building on top of the current simplified version of ChatGPT using Chainlit, we are now going to add PDF loading capabilities to the application.
|
| 7 |
|
| 8 |
-
|
| 9 |
|
| 10 |
-
In this lab, we will be adding a Chat LLM to our Chainlit app using Langchain.
|
| 11 |
|
| 12 |
## Exercises
|
| 13 |
|
|
@@ -34,6 +33,6 @@ chainlit run app/app.py -w
|
|
| 34 |
|
| 35 |
## References
|
| 36 |
|
| 37 |
-
- [Langchain
|
| 38 |
-
- [Langchain
|
| 39 |
- [Chainlit's documentation](https://docs.chainlit.io/get-started/pure-python)
|
|
|
|
| 5 |
## Lab3: Enabling Load PDF to Chainlit App
|
| 6 |
Building on top of the current simplified version of ChatGPT using Chainlit, we are now going to add PDF loading capabilities to the application.
|
| 7 |
|
| 8 |
+
In this lab, we will utilize the built-in PDF loading and parsing connectors inside Langchain, load the PDF, and chunk the PDFs into individual pieces with their associated metadata.
|
| 9 |
|
|
|
|
| 10 |
|
| 11 |
## Exercises
|
| 12 |
|
|
|
|
| 33 |
|
| 34 |
## References
|
| 35 |
|
| 36 |
+
- [Langchain PDF Loaders](https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf)
|
| 37 |
+
- [Langchain Text Splitters](https://python.langchain.com/docs/modules/data_connection/document_transformers/#text-splitters)
|
| 38 |
- [Chainlit's documentation](https://docs.chainlit.io/get-started/pure-python)
|
app/app.py
CHANGED
|
@@ -1,9 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import chainlit as cl
|
|
|
|
| 2 |
from langchain.chat_models import ChatOpenAI
|
| 3 |
from langchain.prompts import ChatPromptTemplate
|
| 4 |
-
from langchain.schema import StrOutputParser
|
| 5 |
from langchain.chains import LLMChain
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
@cl.on_chat_start
|
| 9 |
async def on_chat_start():
|
|
|
|
| 1 |
+
from tempfile import NamedTemporaryFile
|
| 2 |
+
from typing import List
|
| 3 |
+
|
| 4 |
import chainlit as cl
|
| 5 |
+
from chainlit.types import AskFileResponse
|
| 6 |
from langchain.chat_models import ChatOpenAI
|
| 7 |
from langchain.prompts import ChatPromptTemplate
|
| 8 |
+
from langchain.schema import Document, StrOutputParser
|
| 9 |
from langchain.chains import LLMChain
|
| 10 |
|
| 11 |
+
from langchain.document_loaders import PDFPlumberLoader
|
| 12 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def process_file(*, file: AskFileResponse) -> List[Document]:
    """Processes one PDF file from a Chainlit AskFileResponse object by first
    loading the PDF document and then chunking it into sub-documents. Only
    supports PDF files.

    Args:
        file (AskFileResponse): input file to be processed

    Raises:
        TypeError: when the uploaded file is not a PDF.
        ValueError: when we fail to process PDF files. We consider PDF file
            processing failure when there's no text returned. For example, PDFs
            with only image contents, corrupted PDFs, etc.

    Returns:
        List[Document]: List of Document(s). Each individual document has two
            fields: page_content(string) and metadata(dict).
    """
    # We only support PDF as input.
    if file.type != "application/pdf":
        raise TypeError("Only PDF files are supported")

    with NamedTemporaryFile() as tempfile:
        tempfile.write(file.content)
        # Flush buffered bytes to disk before the loader re-opens the file
        # by name; without this the loader may read an empty/partial file.
        tempfile.flush()

        ######################################################################
        # Exercise 1a:
        # We have the input PDF file saved as a temporary file. The name of
        # the file is 'tempfile.name'. Please use one of the PDF loaders in
        # Langchain to load the file.
        ######################################################################
        loader = PDFPlumberLoader(tempfile.name)
        documents = loader.load()
        ######################################################################

        ######################################################################
        # Exercise 1b:
        # We can now chunk the documents now it is loaded. Langchain provides
        # a list of helpful text splitters. Please use one of the splitters
        # to chunk the file.
        ######################################################################
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=3000,
            chunk_overlap=100,
        )
        docs = text_splitter.split_documents(documents)
        ######################################################################

        # Fail early when parsing produced no chunks (image-only or corrupted
        # PDFs) before doing any per-chunk metadata work.
        if not docs:
            raise ValueError("PDF file parsing failed.")

        # We are adding source_id into the metadata here to denote which
        # source document it is.
        for i, doc in enumerate(docs):
            doc.metadata["source"] = f"source_{i}"

        return docs
|
| 71 |
+
|
| 72 |
|
| 73 |
@cl.on_chat_start
|
| 74 |
async def on_chat_start():
|