"""PDF loaders for medical-guideline documents, built on Unstructured.io."""

# Import required libraries
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List

import pandas as pd
from langchain.schema import Document
from unstructured.chunking.title import chunk_by_title
from unstructured.partition.pdf import partition_pdf

from core.config import logger

# Default disease for this system.
_DEFAULT_DISEASE = "HBV"
# Provider recorded when it cannot be derived from the directory structure.
_UNKNOWN_PROVIDER = "unknown"
# Folder that holds provider sub-folders; it is never a provider itself.
_DATA_ROOT_NAME = "new_data"


def _infer_provider(pdf_path: Path) -> str:
    """Return the provider name, i.e. the PDF's parent directory name.

    Falls back to ``"unknown"`` when the path has no parent directory name
    (bare filename, or a file directly under the filesystem root) or when the
    file sits directly inside the ``new_data`` folder.
    """
    provider = pdf_path.parent.name
    if not provider or provider.lower() == _DATA_ROOT_NAME:
        return _UNKNOWN_PROVIDER
    return provider


def _page_number_of(element: Any) -> int:
    """Return the element's page number, defaulting to 1 when it is unset.

    The attribute may exist on the metadata object and still be ``None``, so a
    plain ``hasattr`` check is not sufficient; ``None`` must be handled
    explicitly to keep page numbers sortable.
    """
    page_number = getattr(element.metadata, "page_number", None)
    return 1 if page_number is None else page_number


def _partition_hi_res(pdf_path: Path, **extra_options: Any) -> list:
    """Partition the PDF with the settings shared by both loaders.

    Args:
        pdf_path: Path to the PDF file.
        **extra_options: Additional keyword arguments forwarded unchanged to
            ``partition_pdf`` (e.g. ``include_page_breaks=True``).

    Returns:
        The elements returned by ``partition_pdf``.
    """
    return partition_pdf(
        filename=str(pdf_path),
        strategy="hi_res",  # Use ML-based detection for borderless tables
        infer_table_structure=True,  # Detect table structure without borders
        extract_images_in_pdf=True,  # Extract images with OCR
        languages=["eng"],  # OCR language
        **extra_options,
    )


def load_pdf_documents(pdf_path: Path) -> List[Document]:
    """
    Load a medical-guideline PDF as one Document per page using Unstructured.io.

    Uses the high-resolution strategy with table-structure inference.
    The provider is taken from the directory structure; the disease is fixed.

    Directory structure expected: data/new_data/PROVIDER/file.pdf
    Example: data/new_data/SASLT/SASLT_2021.pdf

    Args:
        pdf_path: Path to the PDF file

    Returns:
        List of Document objects, ordered by page number, with metadata
        (source, disease, provider, page_number). Pages without any
        non-blank text are omitted.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        Exception: Any error raised while partitioning is logged and re-raised.
    """
    try:
        # Validate file exists
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found at {pdf_path}")

        disease = _DEFAULT_DISEASE
        provider = _infer_provider(pdf_path)

        elements = _partition_hi_res(
            pdf_path,
            include_page_breaks=True,  # Maintain page boundaries
        )

        # Group element texts by page number (1-indexed).
        pages_content: Dict[int, List[str]] = defaultdict(list)
        for element in elements:
            text = element.text or ""
            # Skip blank elements (e.g. page breaks) so they neither add stray
            # separators to a page nor create a page entry of their own.
            if not text.strip():
                continue
            pages_content[_page_number_of(element)].append(text)

        # Create one Document per page, in page order.
        documents = [
            Document(
                page_content="\n\n".join(page_texts),
                metadata={
                    "source": pdf_path.name,
                    "disease": disease,
                    "provider": provider,
                    "page_number": page_num,
                },
            )
            for page_num, page_texts in sorted(pages_content.items())
        ]

        logger.info(f"Loaded {len(documents)} pages from PDF: {pdf_path.name} (Disease: {disease}, Provider: {provider})")
        return documents
    except Exception as e:
        logger.error(f"Error loading PDF documents from {pdf_path}: {str(e)}")
        raise


# Alternative version: Preserve element types (useful for RAG)
def load_pdf_documents_with_elements(pdf_path: Path) -> List[Document]:
    """
    Load a PDF as one Document per element, preserving element types.

    Each non-blank element (text, table, title, etc.) becomes its own
    Document, which keeps the document structure available for retrieval.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        List of Document objects with metadata (source, disease, provider,
        page_number, element_type, element_id). ``element_id`` is the
        element's index in the full partition output, so ids may have gaps
        where blank elements were skipped.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        Exception: Any error raised while partitioning is logged and re-raised.
    """
    try:
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found at {pdf_path}")

        disease = _DEFAULT_DISEASE
        provider = _infer_provider(pdf_path)

        elements = _partition_hi_res(pdf_path)

        documents = []
        for idx, element in enumerate(elements):
            text = element.text or ""
            if not text.strip():
                continue
            documents.append(
                Document(
                    page_content=text,
                    metadata={
                        "source": pdf_path.name,
                        "disease": disease,
                        "provider": provider,
                        "page_number": _page_number_of(element),
                        # e.g., "Table", "Title", "NarrativeText"
                        "element_type": element.category,
                        "element_id": idx,
                    },
                )
            )

        logger.info(f"Loaded {len(documents)} elements from PDF: {pdf_path.name} (Disease: {disease}, Provider: {provider})")
        return documents
    except Exception as e:
        logger.error(f"Error loading PDF documents from {pdf_path}: {str(e)}")
        raise


# Usage
# NOTE(review): this runs a full hi_res partition every time the module is
# imported, and the backslash-separated path only splits into directories on
# Windows. Consider moving it under an `if __name__ == "__main__":` guard.
doc = load_pdf_documents(Path(r"data\processed_data\SASLT\SASLT 2021_20251026_171017.pdf"))