HBV_AI_Assistant / tempCodeRunnerFile.python
moazx's picture
Initial commit with all files including LFS
73c6377
raw
history blame
5.67 kB
# Import required libraries
import pandas as pd
from pathlib import Path
from typing import List
from langchain.schema import Document
from core.config import logger
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title
def load_pdf_documents(pdf_path: Path) -> List[Document]:
    """
    Load a medical-guideline PDF with Unstructured.io, one Document per page.

    The PDF is partitioned with the high-resolution strategy (table structure
    inference enabled), the text of every element on a page is joined with
    blank lines, and one Document is produced per page that has any text.

    The provider is taken from the name of the PDF's parent directory.
    Directory structure expected: data/new_data/PROVIDER/file.pdf
    Example: data/new_data/SASLT/SASLT_2021.pdf
    A file sitting directly in ``new_data`` (or with no named parent
    directory) gets the provider "unknown".

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        List of Document objects in ascending page order, each with metadata
        (source, disease, provider, page_number).

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        Exception: Any other error is logged and re-raised unchanged.
    """
    try:
        # Validate file exists
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found at {pdf_path}")

        disease = "HBV"  # Default disease for this system

        # Provider is the parent directory of the PDF (e.g., SASLT).
        # ``parent.name`` is empty for a bare file name or a file located at
        # the filesystem root, so those cases fall back to "unknown" instead
        # of reporting the root/drive as a provider.
        provider = pdf_path.parent.name or "unknown"
        # If provider is 'new_data', the file is directly in the new_data folder
        if provider.lower() == "new_data":
            provider = "unknown"

        # Use Unstructured.io to partition the PDF
        # hi_res strategy uses ML models for better table detection
        elements = partition_pdf(
            filename=str(pdf_path),
            strategy="hi_res",  # Use ML-based detection for borderless tables
            infer_table_structure=True,  # Detect table structure without borders
            extract_images_in_pdf=True,  # Extract images with OCR
            languages=["eng"],  # OCR language
            include_page_breaks=True,  # Maintain page boundaries
        )

        # Group element text by page number
        pages_content = {}
        for element in elements:
            text = element.text
            # Skip elements that carry no text (page-break markers are
            # presumably among them) so they do not add blank separators.
            if not text or not text.strip():
                continue

            # The attribute may exist but be None; a None key mixed with
            # integer keys would make the sort below raise TypeError, so
            # treat a missing/None page number as page 1.
            page_num = getattr(element.metadata, "page_number", None)
            if page_num is None:
                page_num = 1

            pages_content.setdefault(page_num, []).append(text)

        # Create one Document per page; every stored page has non-blank text
        documents = []
        for page_num in sorted(pages_content):
            documents.append(
                Document(
                    page_content="\n\n".join(pages_content[page_num]),
                    metadata={
                        "source": pdf_path.name,
                        "disease": disease,
                        "provider": provider,
                        "page_number": page_num,
                    },
                )
            )

        logger.info(f"Loaded {len(documents)} pages from PDF: {pdf_path.name} (Disease: {disease}, Provider: {provider})")
        return documents
    except Exception as e:
        logger.error(f"Error loading PDF documents from {pdf_path}: {str(e)}")
        raise
# Alternative version: Preserve element types (useful for RAG)
def load_pdf_documents_with_elements(pdf_path: Path) -> List[Document]:
    """
    Load a PDF as one Document per element, preserving element types.

    Unlike a page-level load, every non-blank element returned by
    Unstructured.io (e.g. "Table", "Title", "NarrativeText") becomes its own
    Document, which keeps the document structure available for retrieval.

    The provider is taken from the name of the PDF's parent directory; a file
    directly inside ``new_data`` (or with no named parent directory) gets the
    provider "unknown".

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        List of Document objects in element order, each with metadata
        (source, disease, provider, page_number, element_type, element_id).
        ``element_id`` is the element's index in the partition output, so
        ids may have gaps where blank elements were skipped.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        Exception: Any other error is logged and re-raised unchanged.
    """
    try:
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found at {pdf_path}")

        disease = "HBV"

        # ``parent.name`` is empty for a bare file name or a file at the
        # filesystem root; fall back to "unknown" rather than the root/drive.
        provider = pdf_path.parent.name or "unknown"
        if provider.lower() == "new_data":
            provider = "unknown"

        elements = partition_pdf(
            filename=str(pdf_path),
            strategy="hi_res",
            infer_table_structure=True,
            extract_images_in_pdf=True,
            languages=["eng"],
        )

        documents = []
        for idx, element in enumerate(elements):
            if not element.text.strip():
                continue

            # The attribute may exist but be None; default to page 1 so the
            # metadata always carries an integer page number.
            page_num = getattr(element.metadata, "page_number", None)
            if page_num is None:
                page_num = 1

            documents.append(
                Document(
                    page_content=element.text,
                    metadata={
                        "source": pdf_path.name,
                        "disease": disease,
                        "provider": provider,
                        "page_number": page_num,
                        "element_type": element.category,  # e.g., "Table", "Title", "NarrativeText"
                        "element_id": idx,
                    },
                )
            )

        logger.info(f"Loaded {len(documents)} elements from PDF: {pdf_path.name} (Disease: {disease}, Provider: {provider})")
        return documents
    except Exception as e:
        logger.error(f"Error loading PDF documents from {pdf_path}: {str(e)}")
        raise
# Usage: load a sample guideline PDF when this file is executed as a script.
# The guard keeps a plain import of this module from triggering a PDF parse
# (or failing when the sample file is absent). The path is built from
# segments so it resolves on POSIX as well as on Windows.
if __name__ == "__main__":
    doc = load_pdf_documents(
        Path("data") / "processed_data" / "SASLT" / "SASLT 2021_20251026_171017.pdf"
    )