Spaces:
Running
Running
| import PyPDF2 | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.embeddings import HuggingFaceBgeEmbeddings | |
| from langchain.vectorstores import FAISS | |
def read_pdf(uploaded_file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        uploaded_file: The PDF source handed directly to ``PyPDF2.PdfReader``
            (presumably an uploaded file-like object -- verify against caller).

    Returns:
        str: The text of all pages concatenated in page order. No separator
        is inserted between pages.
    """
    pdf_reader = PyPDF2.PdfReader(uploaded_file)
    # Join once instead of growing a string with ``+=`` for each page.
    # ``or ""`` is defensive: a page whose extraction yields nothing (empty or
    # None) contributes an empty string rather than breaking the concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
def Chunks(docs, *, chunk_size=1000, chunk_overlap=100):
    """Split a long text into overlapping chunks.

    Args:
        docs: The full text to split (a single string, passed to
            ``RecursiveCharacterTextSplitter.split_text``).
        chunk_size: Maximum size of each chunk handed to the splitter.
            Defaults to 1000, the value this function has always used.
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter. Defaults to 100, the value this function has always used.

    Returns:
        The chunks produced by the splitter's ``split_text`` call.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(docs)
def PDF_4_QA(file):
    """Build a FAISS vector store from the text of a PDF.

    The PDF is read with ``read_pdf``, split with ``Chunks``, embedded on CPU
    with the ``sentence-transformers/all-MiniLM-L6-v2`` model, and indexed
    with ``FAISS.from_texts``.

    Args:
        file: The PDF source, forwarded unchanged to ``read_pdf``.

    Returns:
        The FAISS vector store built from the PDF's text chunks.

    Raises:
        ValueError: If the PDF yields no text chunks (for example, a PDF with
            no extractable text), since an index cannot be built from nothing.
    """
    content = read_pdf(file)
    pdf_chunks = Chunks(docs=content)
    if not pdf_chunks:
        # Fail early with a clear message instead of passing an empty list
        # into index construction.
        raise ValueError(
            "No extractable text found in the PDF; cannot build a vector store."
        )

    embeddings = HuggingFaceBgeEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': 'cpu'},
        # The BGE wrapper prepends a BGE-specific retrieval instruction to
        # every query by default. The model configured here is not a BGE
        # model, so disable the prefix to embed queries the same way as the
        # stored chunks.
        query_instruction="",
    )
    return FAISS.from_texts(pdf_chunks, embeddings)