"""PDF ingestion utilities: load medical-guideline PDFs into LangChain Documents via Unstructured.io."""
# Import required libraries
from collections import defaultdict
from pathlib import Path
from typing import List

import pandas as pd
from langchain.schema import Document
from unstructured.chunking.title import chunk_by_title
from unstructured.partition.pdf import partition_pdf

from core.config import logger
def load_pdf_documents(pdf_path: Path) -> List[Document]:
    """
    Load and process a PDF of medical guidelines using Unstructured.io.

    Uses the high-resolution strategy with ML-based table detection so that
    borderless tables are captured. The provider is extracted from the
    directory structure; the disease is fixed for this system.

    Expected directory structure: data/new_data/PROVIDER/file.pdf
    Example: data/new_data/SASLT/SASLT_2021.pdf

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        List of Document objects, one per non-empty page, each carrying
        metadata (source, disease, provider, page_number).

    Raises:
        FileNotFoundError: If pdf_path does not exist.
    """
    try:
        # Validate file exists
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found at {pdf_path}")

        # Extract provider from directory structure.
        # Structure: data/new_data/PROVIDER/file.pdf
        path_parts = pdf_path.parts
        disease = "HBV"  # Default disease for this system
        # Provider is the parent directory of the PDF file (e.g., SASLT).
        provider = path_parts[-2] if len(path_parts) >= 2 else "unknown"
        # If provider is 'new_data', the file sits directly in the new_data
        # folder, i.e. there is no provider directory.
        if provider.lower() == "new_data":
            provider = "unknown"

        # Use Unstructured.io to partition the PDF.
        # hi_res strategy uses ML models for better table detection.
        elements = partition_pdf(
            filename=str(pdf_path),
            strategy="hi_res",            # ML-based detection for borderless tables
            infer_table_structure=True,   # Detect table structure without borders
            extract_images_in_pdf=True,   # Extract images with OCR
            languages=["eng"],            # OCR language
            include_page_breaks=True,     # Maintain page boundaries
        )

        # Group element texts by page number (1-indexed).
        # NOTE: metadata.page_number may be absent OR explicitly None on some
        # elements — `or 1` covers both, and keeps the keys sortable (a None
        # key would make sorted() raise TypeError against int keys).
        pages_content = defaultdict(list)
        for element in elements:
            page_num = getattr(element.metadata, "page_number", None) or 1
            # element.text can be None for some element types (e.g. images);
            # joining None would raise TypeError, so skip empty text.
            if element.text:
                pages_content[page_num].append(element.text)

        # Create one Document per non-empty page.
        documents = []
        for page_num in sorted(pages_content):
            # Combine all elements on the page
            page_content = "\n\n".join(pages_content[page_num])
            if page_content.strip():
                documents.append(
                    Document(
                        page_content=page_content,
                        metadata={
                            "source": pdf_path.name,
                            "disease": disease,
                            "provider": provider,
                            "page_number": page_num,
                        },
                    )
                )

        logger.info(f"Loaded {len(documents)} pages from PDF: {pdf_path.name} (Disease: {disease}, Provider: {provider})")
        return documents
    except Exception as e:
        logger.error(f"Error loading PDF documents from {pdf_path}: {str(e)}")
        raise
# Alternative version: Preserve element types (useful for RAG)
def load_pdf_documents_with_elements(pdf_path: Path) -> List[Document]:
    """
    Load a PDF while preserving element types (text, table, title, etc.).

    Useful for better RAG retrieval: each Unstructured element becomes its
    own Document, so structural information (tables vs. narrative text)
    survives into the metadata.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        List of Document objects, one per non-empty element, each carrying
        metadata (source, disease, provider, page_number, element_type,
        element_id).

    Raises:
        FileNotFoundError: If pdf_path does not exist.
    """
    try:
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found at {pdf_path}")

        # Provider is the PDF's parent directory; 'new_data' means the file
        # is not inside a provider directory.
        path_parts = pdf_path.parts
        disease = "HBV"
        provider = path_parts[-2] if len(path_parts) >= 2 else "unknown"
        if provider.lower() == "new_data":
            provider = "unknown"

        elements = partition_pdf(
            filename=str(pdf_path),
            strategy="hi_res",
            infer_table_structure=True,
            extract_images_in_pdf=True,
            languages=["eng"],
        )

        documents = []
        for idx, element in enumerate(elements):
            # element.text can be None for some element types; calling
            # .strip() on it directly would raise AttributeError.
            if element.text and element.text.strip():
                # page_number may be absent or None — default to page 1 so the
                # metadata stays an int (consistent with load_pdf_documents).
                page_num = getattr(element.metadata, "page_number", None) or 1
                element_type = element.category  # e.g., "Table", "Title", "NarrativeText"
                documents.append(
                    Document(
                        page_content=element.text,
                        metadata={
                            "source": pdf_path.name,
                            "disease": disease,
                            "provider": provider,
                            "page_number": page_num,
                            "element_type": element_type,
                            "element_id": idx,
                        },
                    )
                )

        logger.info(f"Loaded {len(documents)} elements from PDF: {pdf_path.name} (Disease: {disease}, Provider: {provider})")
        return documents
    except Exception as e:
        logger.error(f"Error loading PDF documents from {pdf_path}: {str(e)}")
        raise
# Usage: only run the (expensive, ML-backed) PDF parse when executed as a
# script — importing this module must not trigger it as a side effect.
if __name__ == "__main__":
    doc = load_pdf_documents(Path(r"data\processed_data\SASLT\SASLT 2021_20251026_171017.pdf"))