File size: 5,670 Bytes
73c6377
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# Import required libraries
import pandas as pd
from pathlib import Path
from typing import List
from langchain.schema import Document
from core.config import logger
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title


def load_pdf_documents(pdf_path: Path) -> List[Document]:
    """
    Load a medical-guideline PDF with Unstructured.io, one Document per page.

    The PDF is partitioned with the high-resolution strategy (ML-based layout
    and table detection). The text of all elements that share a page number is
    joined with blank lines into a single Document.

    The provider is taken from the name of the PDF's parent directory; the
    disease is currently fixed to "HBV" and is not derived from the path.

    Directory structure expected: data/new_data/PROVIDER/file.pdf
    Example: data/new_data/SASLT/SASLT_2021.pdf

    Args:
        pdf_path: Path to the PDF file

    Returns:
        List of Document objects, ordered by page number, with metadata
        (source, disease, provider, page_number). Pages without any
        non-blank text are omitted.

    Raises:
        FileNotFoundError: If pdf_path does not exist.
        Exception: Any error raised while partitioning is logged and re-raised.
    """
    try:
        # Validate file exists
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found at {pdf_path}")

        disease = "HBV"  # Default disease for this system

        # Provider is the parent directory name (e.g., SASLT). Using
        # parent.name rather than parts[-2] avoids picking up a filesystem
        # root or drive when the file has no named parent directory.
        provider = pdf_path.parent.name
        # An empty name means there is no named parent; 'new_data' means the
        # file sits directly in the new_data folder without a provider level.
        if not provider or provider.lower() == "new_data":
            provider = "unknown"

        # Use Unstructured.io to partition the PDF
        # hi_res strategy uses ML models for better table detection
        # NOTE(review): extract_images_in_pdf appears to be deprecated in
        # newer unstructured releases - confirm against the pinned version.
        elements = partition_pdf(
            filename=str(pdf_path),
            strategy="hi_res",  # Use ML-based detection for borderless tables
            infer_table_structure=True,  # Detect table structure without borders
            extract_images_in_pdf=True,  # Extract images with OCR
            languages=["eng"],  # OCR language
            include_page_breaks=True,  # Maintain page boundaries
        )

        # Group element text by page number
        pages_content = {}
        for element in elements:
            text = element.text
            # Skip elements that carry no text (e.g. the page-break markers
            # requested above) so they do not add empty separators.
            if not text or not text.strip():
                continue

            # The attribute can be present but None, which hasattr() does not
            # detect; a None key would also make sorted() below fail when
            # mixed with integer page numbers. Fall back to page 1.
            page_num = getattr(element.metadata, "page_number", None)
            if page_num is None:
                page_num = 1

            pages_content.setdefault(page_num, []).append(text)

        # Create one Document per page; every grouped page has at least one
        # non-blank text, so no further emptiness check is needed.
        documents = []
        for page_num in sorted(pages_content):
            documents.append(
                Document(
                    page_content="\n\n".join(pages_content[page_num]),
                    metadata={
                        "source": pdf_path.name,
                        "disease": disease,
                        "provider": provider,
                        "page_number": page_num,
                    },
                )
            )

        logger.info(f"Loaded {len(documents)} pages from PDF: {pdf_path.name} (Disease: {disease}, Provider: {provider})")
        return documents

    except Exception as e:
        # Log with the offending path for context, then let the caller decide.
        logger.error(f"Error loading PDF documents from {pdf_path}: {str(e)}")
        raise


# Alternative version: Preserve element types (useful for RAG)
def load_pdf_documents_with_elements(pdf_path: Path) -> List[Document]:
    """
    Load a PDF with Unstructured.io, one Document per non-empty element.

    Unlike a per-page load, each partitioned element (text, table, title,
    etc.) becomes its own Document, so the element category is preserved in
    the metadata. Useful for RAG retrieval that benefits from document
    structure.

    The provider is taken from the name of the PDF's parent directory; the
    disease is currently fixed to "HBV".

    Args:
        pdf_path: Path to the PDF file

    Returns:
        List of Document objects in partition order with metadata (source,
        disease, provider, page_number, element_type, element_id).
        element_id is the element's index in the full partition output, so
        ids of skipped blank elements leave gaps.

    Raises:
        FileNotFoundError: If pdf_path does not exist.
        Exception: Any error raised while partitioning is logged and re-raised.
    """
    try:
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found at {pdf_path}")

        disease = "HBV"

        # Parent directory name is the provider; parent.name (rather than
        # parts[-2]) avoids returning a filesystem root or drive.
        provider = pdf_path.parent.name
        if not provider or provider.lower() == "new_data":
            provider = "unknown"

        # NOTE(review): extract_images_in_pdf appears to be deprecated in
        # newer unstructured releases - confirm against the pinned version.
        elements = partition_pdf(
            filename=str(pdf_path),
            strategy="hi_res",
            infer_table_structure=True,
            extract_images_in_pdf=True,
            languages=["eng"],
        )

        documents = []
        for idx, element in enumerate(elements):
            text = element.text
            # Guard against missing as well as blank text.
            if not text or not text.strip():
                continue

            # The attribute can be present but None, which hasattr() does not
            # detect; fall back to page 1 in that case as well.
            page_num = getattr(element.metadata, "page_number", None)
            if page_num is None:
                page_num = 1

            documents.append(
                Document(
                    page_content=text,
                    metadata={
                        "source": pdf_path.name,
                        "disease": disease,
                        "provider": provider,
                        "page_number": page_num,
                        # e.g., "Table", "Title", "NarrativeText"
                        "element_type": element.category,
                        "element_id": idx,
                    },
                )
            )

        logger.info(f"Loaded {len(documents)} elements from PDF: {pdf_path.name} (Disease: {disease}, Provider: {provider})")
        return documents

    except Exception as e:
        # Log with the offending path for context, then let the caller decide.
        logger.error(f"Error loading PDF documents from {pdf_path}: {str(e)}")
        raise


# Usage: this statement runs whenever the module is executed or imported and
# loads the sample SASLT guideline PDF page by page.
# NOTE(review): the path is written with Windows-style backslashes - confirm
# the intended platform before relying on it elsewhere.
doc = load_pdf_documents(
    Path(r"data\processed_data\SASLT\SASLT 2021_20251026_171017.pdf")
)