Spaces:
Running
Running
File size: 5,670 Bytes
73c6377 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
# Import required libraries
import pandas as pd
from pathlib import Path
from typing import List
from langchain.schema import Document
from core.config import logger
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title
def load_pdf_documents(pdf_path: Path) -> List[Document]:
    """
    Load a PDF with Unstructured.io and return one Document per page.

    The PDF is partitioned with the ``hi_res`` strategy and table-structure
    inference enabled. The text of all elements that share a page number is
    joined with blank lines into a single page-level Document.

    The provider is taken from the name of the PDF's parent directory.
    Directory structure expected: data/new_data/PROVIDER/file.pdf
    Example: data/new_data/SASLT/SASLT_2021.pdf

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        List of Document objects ordered by page number, each with metadata
        keys ``source``, ``disease``, ``provider`` and ``page_number``.
        Pages that contain no non-blank text are omitted.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        Exception: Any other error is logged and re-raised unchanged.
    """
    try:
        # Validate file exists
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found at {pdf_path}")

        disease = "HBV"  # Default disease for this system

        # Provider is the parent directory of the PDF (e.g., SASLT).
        # ``parent.name`` is empty for a bare filename or a file sitting at
        # the filesystem root, so those cases fall back to "unknown" instead
        # of yielding a root marker such as "/".
        provider = pdf_path.parent.name or "unknown"
        # A parent of 'new_data' means the file is directly in the data
        # folder, i.e. there is no provider directory.
        if provider.lower() == "new_data":
            provider = "unknown"

        # Use Unstructured.io to partition the PDF
        elements = partition_pdf(
            filename=str(pdf_path),
            strategy="hi_res",
            infer_table_structure=True,
            extract_images_in_pdf=True,
            languages=["eng"],
            include_page_breaks=True,
        )

        # Group element text by page number
        pages_content = {}
        for element in elements:
            text = getattr(element, "text", None) or ""
            # Skip elements without text so they neither create empty pages
            # nor add stray blank separators to a page.
            # NOTE(review): presumably this covers the page-break markers
            # requested above - confirm against the unstructured version used.
            if not text.strip():
                continue

            # The attribute can exist and still be None, which a hasattr()
            # check does not catch; a None key would make sorted() below
            # fail when compared with integer page numbers.
            page_num = getattr(element.metadata, "page_number", None)
            if page_num is None:
                page_num = 1

            pages_content.setdefault(page_num, []).append(text)

        # Create one Document per page, in page order
        documents = [
            Document(
                page_content="\n\n".join(pages_content[page_num]),
                metadata={
                    "source": pdf_path.name,
                    "disease": disease,
                    "provider": provider,
                    "page_number": page_num,
                },
            )
            for page_num in sorted(pages_content)
        ]

        logger.info(f"Loaded {len(documents)} pages from PDF: {pdf_path.name} (Disease: {disease}, Provider: {provider})")
        return documents
    except Exception as e:
        logger.error(f"Error loading PDF documents from {pdf_path}: {str(e)}")
        raise
# Alternative version: Preserve element types (useful for RAG)
def load_pdf_documents_with_elements(pdf_path: Path) -> List[Document]:
    """
    Load a PDF with Unstructured.io and return one Document per element.

    Unlike a page-level load, every non-blank element (text, table, title,
    etc.) becomes its own Document and keeps its element category, which is
    useful for RAG retrieval that wants to preserve document structure.

    The provider is taken from the name of the PDF's parent directory.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        List of Document objects in partition order, each with metadata keys
        ``source``, ``disease``, ``provider``, ``page_number``,
        ``element_type`` and ``element_id``. ``element_id`` is the element's
        index in the full partition output, so ids may have gaps where
        blank elements were skipped.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        Exception: Any other error is logged and re-raised unchanged.
    """
    try:
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found at {pdf_path}")

        disease = "HBV"

        # ``parent.name`` is empty for a bare filename or a file at the
        # filesystem root; treat both as having no provider directory.
        provider = pdf_path.parent.name or "unknown"
        if provider.lower() == "new_data":
            provider = "unknown"

        elements = partition_pdf(
            filename=str(pdf_path),
            strategy="hi_res",
            infer_table_structure=True,
            extract_images_in_pdf=True,
            languages=["eng"],
        )

        documents = []
        for idx, element in enumerate(elements):
            # Tolerate a missing/None text attribute instead of raising.
            text = getattr(element, "text", None) or ""
            if not text.strip():
                continue

            # The attribute can exist and still be None, which a hasattr()
            # check does not catch; fall back to page 1 in that case too.
            page_num = getattr(element.metadata, "page_number", None)
            if page_num is None:
                page_num = 1

            documents.append(
                Document(
                    page_content=text,
                    metadata={
                        "source": pdf_path.name,
                        "disease": disease,
                        "provider": provider,
                        "page_number": page_num,
                        # e.g., "Table", "Title", "NarrativeText"
                        "element_type": element.category,
                        "element_id": idx,
                    },
                )
            )

        logger.info(f"Loaded {len(documents)} elements from PDF: {pdf_path.name} (Disease: {disease}, Provider: {provider})")
        return documents
    except Exception as e:
        logger.error(f"Error loading PDF documents from {pdf_path}: {str(e)}")
        raise
# Usage
# The path is built from components rather than a backslash-separated literal:
# on POSIX a backslash is not a separator, so the literal would be treated as
# a single file name and the provider directory could not be derived from it.
# NOTE(review): this call runs at import time; consider moving it under an
# `if __name__ == "__main__":` guard if nothing relies on the module-level `doc`.
doc = load_pdf_documents(
    Path("data") / "processed_data" / "SASLT" / "SASLT 2021_20251026_171017.pdf"
)