# Import required libraries
from pathlib import Path
from typing import List

import pandas as pd  # NOTE(review): not referenced in the code visible here; kept in case other parts rely on it
from langchain.schema import Document
from langchain_community.document_loaders.parsers import TesseractBlobParser
from langchain_pymupdf4llm import PyMuPDF4LLMLoader

from .config import logger

# Disease label attached to every loaded document (single-disease system).
_DEFAULT_DISEASE = "HBV"
# Provider label used when the directory layout does not identify one.
_UNKNOWN_PROVIDER = "unknown"
# Name of the data root folder; a file sitting directly in it has no provider.
_DATA_ROOT_DIR_NAME = "new_data"


def _extract_provider(file_path: Path) -> str:
    """
    Derive the guideline provider from the file's parent directory name.

    Directory structure expected: data/new_data/PROVIDER/file.ext
    Example: data/new_data/SASLT/SASLT_2021.pdf -> "SASLT"

    Args:
        file_path: Path to the guideline file

    Returns:
        The parent directory name, or "unknown" when the file has no named
        parent directory (bare filename, filesystem root / drive anchor) or
        sits directly inside the ``new_data`` folder.
    """
    parent_name = file_path.parent.name
    # An empty name means there is no real parent folder to use as provider
    # (e.g. "file.pdf" or "/file.pdf").
    if not parent_name or parent_name.lower() == _DATA_ROOT_DIR_NAME:
        return _UNKNOWN_PROVIDER
    return parent_name


def _resolve_page_number(raw_page, position: int):
    """
    Turn the loader-reported page value into a human-readable page number.

    Args:
        raw_page: Value of the "page" metadata key, or None when absent
        position: 0-based position of the document in the loader output

    Returns:
        ``position + 1`` when no page metadata exists; ``raw_page + 1`` when
        the reported page equals the 0-based position (treated as 0-indexed);
        otherwise ``raw_page`` unchanged.
    """
    if raw_page is None:
        # Fallback to 1-indexed sequential numbering
        return position + 1
    # NOTE(review): assumes the loader reports 0-indexed pages when the value
    # matches the sequence position -- confirm against the loader's metadata.
    if raw_page == position:
        return raw_page + 1
    return raw_page


def load_pdf_documents(pdf_path: Path) -> List[Document]:
    """
    Load and process PDF documents from medical guidelines using PyMuPDF4LLMLoader.

    Loads the PDF page by page with image extraction through
    TesseractBlobParser and the "lines" table strategy. Pages whose content
    is empty or whitespace-only are dropped. The provider is taken from the
    directory structure; the disease is the fixed system default.

    Directory structure expected: data/new_data/PROVIDER/file.pdf
    Example: data/new_data/SASLT/SASLT_2021.pdf

    Args:
        pdf_path: Path to the PDF file

    Returns:
        List of Document objects with metadata (source, disease, provider, page_number)

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist
        Exception: Any error raised while loading is logged and re-raised
    """
    try:
        # Validate file exists
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found at {pdf_path}")

        disease = _DEFAULT_DISEASE
        provider = _extract_provider(pdf_path)

        # Initialize PyMuPDF4LLMLoader
        loader = PyMuPDF4LLMLoader(
            str(pdf_path),
            mode="page",
            extract_images=True,
            images_parser=TesseractBlobParser(),
            table_strategy="lines",
        )
        raw_documents = loader.load()

        documents = []
        for idx, doc in enumerate(raw_documents):
            # Skip pages that carry no text at all
            if not doc.page_content.strip():
                continue

            page_num = _resolve_page_number(doc.metadata.get("page"), idx)
            documents.append(
                Document(
                    page_content=doc.page_content,
                    metadata={
                        "source": pdf_path.name,
                        "disease": disease,
                        "provider": provider,
                        "page_number": page_num,
                    },
                )
            )

        logger.info(f"Loaded {len(documents)} pages from PDF: {pdf_path.name} (Disease: {disease}, Provider: {provider})")
        return documents

    except Exception as e:
        # Boundary handler: record the failure, then let the caller decide
        logger.error(f"Error loading PDF documents from {pdf_path}: {str(e)}")
        raise


def load_markdown_documents(md_path: Path) -> List[Document]:
    """
    Load and process Markdown medical guidelines.

    Reads the whole file as UTF-8 text into a single Document. The provider
    is taken from the directory structure; the disease is the fixed system
    default.

    Directory structure expected: data/new_data/PROVIDER/file.md
    Example: data/new_data/SASLT/guidelines.md

    Args:
        md_path: Path to the Markdown file

    Returns:
        List containing one Document with metadata (source, disease, provider,
        page_number); page_number is always 1

    Raises:
        FileNotFoundError: If ``md_path`` does not exist
        Exception: Any error raised while loading is logged and re-raised
    """
    try:
        # Validate file exists
        if not md_path.exists():
            raise FileNotFoundError(f"Markdown file not found at {md_path}")

        disease = _DEFAULT_DISEASE
        provider = _extract_provider(md_path)

        # Read markdown content
        content = md_path.read_text(encoding="utf-8")

        # Create document with minimal metadata for RAG
        doc = Document(
            page_content=content,
            metadata={
                "source": md_path.name,
                "disease": disease,
                "provider": provider,
                "page_number": 1,
            },
        )

        logger.info(f"Loaded Markdown document: {md_path.name} (Disease: {disease}, Provider: {provider})")
        return [doc]

    except Exception as e:
        # Boundary handler: record the failure, then let the caller decide
        logger.error(f"Error loading Markdown document from {md_path}: {str(e)}")
        raise