| import streamlit as st | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_chroma import Chroma | |
| from langchain_community.vectorstores import InMemoryVectorStore | |
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
def load_embedding_model(model):
    """Build a HuggingFace embedding model.

    Args:
        model: Model identifier, forwarded to ``HuggingFaceEmbeddings`` as
            its ``model_name`` argument.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model)
def load_vector_store(
    *,
    model_name="sentence-transformers/all-MiniLM-L12-v2",
    collection_name="main_store",
    persist_directory="./chroma",
):
    """Create the Chroma vector store used by the application.

    Called with no arguments, this builds the same store as before: the
    ``main_store`` collection persisted under ``./chroma`` and embedded with
    ``sentence-transformers/all-MiniLM-L12-v2``. The keyword-only parameters
    let callers point at a different model, collection, or directory.

    Args:
        model_name: Embedding model identifier passed to
            ``load_embedding_model``.
        collection_name: Name of the Chroma collection.
        persist_directory: Directory passed to ``Chroma`` as
            ``persist_directory``.

    Returns:
        The configured ``Chroma`` instance.
    """
    embeddings = load_embedding_model(model_name)
    return Chroma(
        collection_name=collection_name,
        embedding_function=embeddings,
        persist_directory=persist_directory,
    )
def process_pdf(pdf, vector_store, *, chunk_size=1000, chunk_overlap=200):
    """Load a PDF, split it into overlapping chunks, and index the chunks.

    The PDF is read with ``PyPDFLoader``, split with
    ``RecursiveCharacterTextSplitter``, and the resulting chunks are added to
    ``vector_store`` via ``add_documents``.

    Args:
        pdf: Location of the PDF, passed straight to ``PyPDFLoader``.
        vector_store: Object exposing ``add_documents``; it receives the
            chunks.
        chunk_size: Value passed to the splitter as ``chunk_size``.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.

    Returns:
        None. The only effect is the documents added to ``vector_store``.
    """
    pages = PyPDFLoader(pdf).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(pages)
    if not chunks:
        # A PDF with no extractable text (e.g. scanned images) yields no
        # chunks. Skip the write instead of sending an empty batch, which
        # Chroma-backed stores are expected to reject -- NOTE(review): confirm
        # against the store in use.
        return
    vector_store.add_documents(chunks)