| """ | |
| To preprocess the data and create a vector database using docling and langchain, | |
| openai embeddings. | |
| """ | |
| import getpass | |
| import os | |
| from dotenv import load_dotenv | |
| import itertools | |
| from uuid import uuid4 | |
| import faiss | |
| from langchain_community.docstore.in_memory import InMemoryDocstore | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_openai import OpenAIEmbeddings | |
| from docling.document_converter import DocumentConverter | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from transformers import AutoTokenizer | |
| from docling_core.transforms.chunker.hybrid_chunker import HybridChunker | |
| from docling_core.types.doc.document import TableItem,PictureItem | |
| from docling_core.types.doc.labels import DocItemLabel | |
| from langchain_core.documents import Document | |
| import logging | |
| load_dotenv() | |
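
# Illustrative sketch (not part of the original script): OpenAIEmbeddings used in
# create_vector_database expects OPENAI_API_KEY to be set. load_dotenv() above
# normally supplies it from a .env file; this common fallback prompts for the key
# interactively if it is still missing.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")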


def adding_metadata_chunks(chunks, file_name: str, speciality: str) -> list[Document]:
    """Add metadata to the chunks.

    This function processes the chunks produced by HybridChunker.chunk() and adds
    metadata to each one.

    Args:
        chunks: The iterator of chunks to be processed.
        file_name (str): The name of the file from which the chunks were created.
        speciality (str): Specialization of the book.

    Returns:
        list[Document]: A list of Document objects with added metadata.
    """
    documents = []
    for idx, chunk in enumerate(chunks):
        items = chunk.meta.doc_items
        if len(items) == 1 and isinstance(items[0], TableItem):
            # Skip chunks that consist only of a table; tables are handled
            # separately in modifying_tables.
            continue
        main_ref = " ".join([item.get_ref().cref for item in items])
        parent_ref = " ".join([item.parent.get_ref().cref for item in items])
        child_ref = " ".join([str(child) for sublist in [item.children for item in items] for child in sublist])
        text = chunk.text  # The text of the chunk
        metadata = {
            "source": file_name,
            "specialization": speciality,
            "chunk_index": idx,
            "self_ref": main_ref,
            "parent_ref": parent_ref,
            "child_ref": child_ref,
            "chunk_type": "text",
        }
        document = Document(page_content=text, metadata=metadata)
        documents.append(document)
    return documents


def modifying_tables(docling_document, file_name: str, speciality: str) -> list[Document]:
    """Extract the tables from the converted document and add metadata.

    Args:
        docling_document: The converted docling document.
        file_name (str): The file name.
        speciality (str): Specialization of the book.

    Returns:
        list[Document]: A list of documents containing table data with
        reference IDs in the metadata.
    """
    tables: list[Document] = []
    for table in docling_document.tables:
        if table.label in [DocItemLabel.TABLE]:
            main_ref = table.get_ref().cref
            parent_ref = table.parent.get_ref().cref
            child_ref = " ".join([str(child) for child in table.children])
            text = table.export_to_markdown()
            metadata = {
                "source": file_name,
                "specialization": speciality,
                "chunk_index": None,
                "self_ref": main_ref,
                "parent_ref": parent_ref,
                "child_ref": child_ref,
                "chunk_type": "table",
            }
            document = Document(page_content=text, metadata=metadata)
            tables.append(document)
    return tables


def dataloader(file_path: str, embeddings_model: str, speciality: str = "") -> list[Document]:
    """Convert a file with docling, chunk it, and return text and table documents.

    Args:
        file_path (str): The path to the document file.
        embeddings_model (str): The model name used for the chunking tokenizer.
        speciality (str): Specialization of the book, stored in the metadata.

    Returns:
        list[Document]: The combined list of text-chunk and table documents.
    """
    logging.info("Converting the document to docling format...")
    docling_document = DocumentConverter().convert(source=file_path).document
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    # Create a hybrid chunker to chunk the document
    embeddings_tokenizer = AutoTokenizer.from_pretrained(embeddings_model)
    logging.info("Chunking the document...")
    chunks = HybridChunker(tokenizer=embeddings_tokenizer).chunk(docling_document)
    # Add metadata to the chunks
    logging.info("Adding metadata to the chunks...")
    texts = adding_metadata_chunks(chunks, file_name, speciality)
    logging.info("Modifying tables...")
    tables = modifying_tables(docling_document, file_name, speciality)
    # Combine the text and table documents into a single list
    documents = list(itertools.chain(texts, tables))
    logging.info(f"Loaded {len(documents)} documents from {file_name}.")
    return documents
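

# Illustrative sketch (not part of the original script): a small helper showing how
# the combined list returned by dataloader can be split again using the
# "chunk_type" metadata added above. The helper name is hypothetical.
def split_by_chunk_type(documents: list[Document]) -> tuple[list[Document], list[Document]]:
    """Separate text-chunk documents from table documents via their metadata."""
    texts = [doc for doc in documents if doc.metadata.get("chunk_type") == "text"]
    tables = [doc for doc in documents if doc.metadata.get("chunk_type") == "table"]
    return texts, tables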


def create_vector_database(documents: list[Document]) -> FAISS:
    """Create a FAISS vector database from the documents.

    Args:
        documents (list[Document]): The documents to embed and index.

    Returns:
        FAISS: The populated vector store.
    """
    logging.info("Creating the vector database...")
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    # Probe the embedding size once to build an L2 index of the right dimension
    index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))
    vector_store = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )
    uuids = [str(uuid4()) for _ in range(len(documents))]
    vector_store.add_documents(documents=documents, ids=uuids)
    logging.info("Vector database created successfully.")
    return vector_store


def main(file_path: str, embeddings_model: str, speciality: str = "") -> FAISS:
    """Run the full pipeline: convert the file, build documents, and index them."""
    logging.basicConfig(level=logging.INFO)
    documents = dataloader(file_path, embeddings_model, speciality)
    return create_vector_database(documents)


if __name__ == "__main__":
    file_path = r"converted\ROBBINS-&-COTRAN-PATHOLOGIC-BASIS-OF-DISEASE-10TH-ED-with-image-refs.md"
    embeddings_model = "ibm-granite/granite-embedding-125m-english"
    main(file_path, embeddings_model)