File size: 5,203 Bytes
73c6377
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# Import required libraries
import pandas as pd
from pathlib import Path
from typing import List
from langchain.schema import Document
from .config import logger
from langchain_pymupdf4llm import PyMuPDF4LLMLoader
from langchain_community.document_loaders.parsers import TesseractBlobParser


def load_pdf_documents(pdf_path: Path) -> List[Document]:
    """
    Read a guideline PDF page by page and wrap every non-empty page in a Document.

    The file is parsed with PyMuPDF4LLMLoader in "page" mode, with image
    extraction delegated to TesseractBlobParser and the "lines" table strategy.
    The provider is taken from the name of the folder that directly contains
    the file; the disease is a fixed constant ("HBV") and is not read from the path.

    Directory structure expected: data/new_data/PROVIDER/file.pdf
    Example: data/new_data/SASLT/SASLT_2021.pdf

    Args:
        pdf_path: Path to the PDF file

    Returns:
        List of Document objects, one per page whose text is not blank, each
        carrying the metadata keys source, disease, provider and page_number

    Raises:
        FileNotFoundError: If pdf_path does not exist
        Exception: Any other failure is logged and re-raised unchanged
    """
    try:
        # Fail early with an explicit message when the path is missing
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found at {pdf_path}")

        disease = "HBV"  # Fixed for this system; not derived from the path

        # Provider = second-to-last path component (the containing folder),
        # available only when the path has at least two components
        segments = pdf_path.parts
        provider = segments[-2] if len(segments) >= 2 else "unknown"
        if provider.lower() == "new_data":
            # The file sits directly in new_data, so there is no provider folder
            provider = "unknown"

        raw_pages = PyMuPDF4LLMLoader(
            str(pdf_path),
            mode="page",
            extract_images=True,
            images_parser=TesseractBlobParser(),
            table_strategy="lines",
        ).load()

        # Metadata fields that are identical for every page of this file
        shared_metadata = {
            "source": pdf_path.name,
            "disease": disease,
            "provider": provider,
        }

        documents = []
        for position, page in enumerate(raw_pages):
            # Pages whose text is empty or whitespace-only are dropped
            if not page.page_content.strip():
                continue

            reported = page.metadata.get("page")
            if reported is None:
                # No page metadata: fall back to 1-based sequential numbering
                page_number = position + 1
            elif reported == position:
                # Reported value matches the 0-based position, so it is treated
                # as 0-based and shifted to a human-readable 1-based number
                page_number = reported + 1
            else:
                # Any other reported value is kept as-is
                page_number = reported

            documents.append(
                Document(
                    page_content=page.page_content,
                    metadata={**shared_metadata, "page_number": page_number},
                )
            )

        logger.info(f"Loaded {len(documents)} pages from PDF: {pdf_path.name} (Disease: {disease}, Provider: {provider})")
        return documents

    except Exception as e:
        # Log for diagnostics, then let the caller see the original exception
        logger.error(f"Error loading PDF documents from {pdf_path}: {str(e)}")
        raise
    
    
def load_markdown_documents(md_path: Path) -> List[Document]:
    """
    Read a Markdown guideline file and return it as a single Document.

    The whole file becomes one Document. The provider is taken from the name
    of the folder that directly contains the file; the disease is a fixed
    constant ("HBV") and is not read from the path.

    Directory structure expected: data/new_data/PROVIDER/file.md
    Example: data/new_data/SASLT/guidelines.md

    Args:
        md_path: Path to the Markdown file

    Returns:
        One-element list holding a Document with the metadata keys source,
        disease, provider and page_number (always 1)

    Raises:
        FileNotFoundError: If md_path does not exist
        Exception: Any other failure is logged and re-raised unchanged
    """
    try:
        # Fail early with an explicit message when the path is missing
        if not md_path.exists():
            raise FileNotFoundError(f"Markdown file not found at {md_path}")

        disease = "HBV"  # Fixed for this system; not derived from the path

        # Provider = second-to-last path component (the containing folder),
        # available only when the path has at least two components
        segments = md_path.parts
        provider = segments[-2] if len(segments) >= 2 else "unknown"
        if provider.lower() == "new_data":
            # The file sits directly in new_data, so there is no provider folder
            provider = "unknown"

        # Entire file is read as UTF-8 text in one go
        text = md_path.read_text(encoding='utf-8')

        # The file is not split, so it is reported as a single "page"
        metadata = {
            "source": md_path.name,
            "disease": disease,
            "provider": provider,
            "page_number": 1,
        }
        markdown_doc = Document(page_content=text, metadata=metadata)

        logger.info(f"Loaded Markdown document: {md_path.name} (Disease: {disease}, Provider: {provider})")
        return [markdown_doc]

    except Exception as e:
        # Log for diagnostics, then let the caller see the original exception
        logger.error(f"Error loading Markdown document from {md_path}: {str(e)}")
        raise