Spaces:
Sleeping
Sleeping
| import re | |
| from io import BytesIO | |
| from typing import Tuple, List | |
| import pickle | |
| from langchain.docstore.document import Document | |
| from langchain.embeddings.huggingface import HuggingFaceEmbeddings | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.vectorstores.faiss import FAISS | |
| from pypdf import PdfReader | |
| import faiss | |
def parse_pdf(file: BytesIO, filename: str) -> Tuple[List[str], str]:
    """Extract and normalize the text of every page in a PDF.

    Args:
        file: In-memory PDF contents.
        filename: Name associated with the PDF; returned unchanged so callers
            can keep the pair together.

    Returns:
        A tuple of (per-page cleaned text, filename).
    """
    reader = PdfReader(file)
    pages: List[str] = []
    for page in reader.pages:
        raw = page.extract_text()
        # Re-join words that were hyphenated across a line break.
        cleaned = re.sub(r"(\w+)-\n(\w+)", r"\1\2", raw)
        # Collapse soft line wraps into spaces while keeping paragraph breaks.
        cleaned = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", cleaned.strip())
        # Normalize runs of blank lines down to a single blank line.
        cleaned = re.sub(r"\n\s*\n", "\n\n", cleaned)
        pages.append(cleaned)
    return pages, filename
def text_to_docs(text: List[str], filename: str) -> List[Document]:
    """Split per-page texts into chunk Documents with traceable metadata.

    Each chunk's metadata records its 1-based page number, its chunk index
    within that page, a "page-chunk" source id, and the originating filename.

    Args:
        text: Per-page text; a bare string is treated as a single page.
        filename: Name of the source PDF, stored in each chunk's metadata.

    Returns:
        A flat list of chunk Documents across all pages.
    """
    if isinstance(text, str):
        text = [text]

    page_docs = [Document(page_content=page) for page in text]
    for i, page_doc in enumerate(page_docs):
        page_doc.metadata["page"] = i + 1  # pages are 1-based

    # The splitter is pure configuration, so build it once instead of
    # re-instantiating it for every page (the original rebuilt it per page).
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=4000,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=0,
    )

    doc_chunks = []
    for page_doc in page_docs:
        # Capture the page number before creating chunk Documents; the
        # original shadowed `doc` inside the loop, which worked only by
        # accident because the page value was copied on the first chunk.
        page_num = page_doc.metadata["page"]
        for i, chunk in enumerate(text_splitter.split_text(page_doc.page_content)):
            doc_chunks.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "page": page_num,
                        "chunk": i,
                        "source": f"{page_num}-{i}",
                        "filename": filename,
                    },
                )
            )
    return doc_chunks
def docs_to_index(docs, huggingface_model_name):
    """Build a FAISS index over the documents using HuggingFace embeddings.

    Args:
        docs: Documents to embed and index.
        huggingface_model_name: Name of the sentence-embedding model to load.

    Returns:
        A FAISS vector store built from ``docs``.
    """
    # Bug fix: the model-name parameter was previously ignored in favor of a
    # hard-coded "all-MiniLM-L6-v2"; honor the caller's choice instead.
    embedding_model = HuggingFaceEmbeddings(model_name=huggingface_model_name)
    index = FAISS.from_documents(docs, embedding_model)
    return index
def get_index_for_pdf(pdf_files, pdf_names, huggingface_model_name):
    """Parse, chunk, and index a batch of PDFs into a single FAISS store.

    Args:
        pdf_files: Raw PDF contents as byte strings.
        pdf_names: Filenames, parallel to ``pdf_files``.
        huggingface_model_name: Embedding model name forwarded to indexing.

    Returns:
        One FAISS index covering the chunks of every PDF in the batch.
    """
    documents = []
    for raw_bytes, name in zip(pdf_files, pdf_names):
        pages, source_name = parse_pdf(BytesIO(raw_bytes), name)
        documents.extend(text_to_docs(pages, source_name))
    return docs_to_index(documents, huggingface_model_name)