from pathlib import Path

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredMarkdownLoader

from src.utils import logger, convert_document_to_markdown, save_to_markdown


class DocumentProcessor:
    """Convert documents to markdown and split the markdown into chunks."""

    def __init__(self, chunk_size=500, chunk_overlap=100):
        """
        Build the text splitter used by this processor.

        Args:
            chunk_size: Forwarded to RecursiveCharacterTextSplitter as
                ``chunk_size``.
            chunk_overlap: Forwarded to RecursiveCharacterTextSplitter as
                ``chunk_overlap``.
        """
        splitter_options = {
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
        }
        self.text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

    def process_document(self, file_path: str) -> str:
        """
        Convert a document to markdown and persist the result.

        The path is wrapped in a ``Path`` and handed to
        ``convert_document_to_markdown``; the converted content is then
        passed, together with the same ``Path``, to ``save_to_markdown``.

        Args:
            file_path (str): The path to the document file.

        Returns:
            str: Whatever ``save_to_markdown`` returns -- presumably the
            path of the saved markdown file (confirm against
            ``src.utils``).
        """
        logger.info(f"Processing document: {file_path}")
        source_path = Path(file_path)

        markdown_text = convert_document_to_markdown(source_path)
        logger.info("Document converted to markdown.")

        saved_path = save_to_markdown(markdown_text, source_path)
        logger.info(f"Markdown file saved at: {saved_path}")
        return saved_path

    def load_and_split_pdf(self, file_path: str):
        """
        Convert a document to markdown, load it, and split it into chunks.

        Despite the method name, no file-type check is performed here: the
        path is passed straight to ``process_document``, and the markdown
        file it reports is loaded with ``UnstructuredMarkdownLoader``.

        Args:
            file_path (str): The path to the source document.

        Returns:
            The chunks produced by the splitter's ``split_documents`` call
            on the loaded markdown documents.
        """
        logger.info(f"Loading and splitting Document: {file_path}")

        # Conversion happens first; the loader reads the generated markdown,
        # not the original input file.
        markdown_path = self.process_document(file_path)
        loaded_docs = UnstructuredMarkdownLoader(markdown_path).load()

        pieces = self.text_splitter.split_documents(loaded_docs)
        logger.info(f"Loaded and split Document into {len(pieces)} chunks")
        return pieces