Spaces:

moazx
/

HBV_AI_Assistant

Sleeping

App Files Files Community

HBV_AI_Assistant / core /data_loaders.py

moazx

Initial commit with all files including LFS

73c6377 about 1 month ago

raw

history blame

5.2 kB

	# Import required libraries
	import pandas as pd
	from pathlib import Path
	from typing import List
	from langchain.schema import Document
	from .config import logger
	from langchain_pymupdf4llm import PyMuPDF4LLMLoader
	from langchain_community.document_loaders.parsers import TesseractBlobParser


	def load_pdf_documents(pdf_path: Path) -> List[Document]:
	    """
	    Load a medical-guideline PDF into one Document per non-empty page.

	    The file is read with PyMuPDF4LLMLoader in "page" mode, with image
	    extraction enabled (images are handed to TesseractBlobParser) and the
	    "lines" table strategy. The provider is taken from the name of the
	    directory that directly contains the file.

	    Directory structure expected: data/new_data/PROVIDER/file.pdf
	    Example: data/new_data/SASLT/SASLT_2021.pdf

	    Args:
	        pdf_path: Path to the PDF file. A plain string is accepted as well
	            and converted to a Path.

	    Returns:
	        List of Document objects whose metadata holds ``source`` (file name),
	        ``disease`` (always "HBV"), ``provider`` and ``page_number``. Pages
	        whose content is empty or whitespace-only are skipped.

	    Raises:
	        FileNotFoundError: If ``pdf_path`` does not exist.
	        Exception: Any other error raised while loading is logged and
	            re-raised unchanged.
	    """
	    try:
	        # Tolerate callers that pass a string instead of a Path.
	        pdf_path = Path(pdf_path)

	        # Validate file exists
	        if not pdf_path.exists():
	            raise FileNotFoundError(f"PDF file not found at {pdf_path}")

	        disease = "HBV"  # Default disease for this system

	        # Provider is the parent directory of the PDF (e.g., SASLT).
	        # ``parent.name`` is empty for a bare file name or a file sitting at
	        # the filesystem root, so the root anchor ("/" or a drive) is never
	        # mistaken for a provider. A file placed directly in ``new_data``
	        # has no provider directory either.
	        provider = pdf_path.parent.name
	        if not provider or provider.lower() == "new_data":
	            provider = "unknown"

	        # Initialize PyMuPDF4LLMLoader
	        loader = PyMuPDF4LLMLoader(
	            str(pdf_path),
	            mode="page",
	            extract_images=True,
	            images_parser=TesseractBlobParser(),
	            table_strategy="lines",
	        )

	        documents = []
	        for idx, doc in enumerate(loader.load()):
	            # Skip pages that carry no text at all.
	            if not doc.page_content.strip():
	                continue

	            # Derive a 1-indexed page number. The loader's "page" metadata
	            # is treated as 0-indexed only when it equals the position of
	            # the page in the loaded list; any other value is used as-is.
	            # NOTE(review): confirm the indexing convention of the loader's
	            # "page" metadata against its documentation.
	            reported_page = doc.metadata.get("page")
	            if reported_page is None:
	                # Fallback to 1-indexed sequential numbering
	                page_num = idx + 1
	            elif reported_page == idx:
	                page_num = reported_page + 1
	            else:
	                page_num = reported_page

	            documents.append(
	                Document(
	                    page_content=doc.page_content,
	                    metadata={
	                        "source": pdf_path.name,
	                        "disease": disease,
	                        "provider": provider,
	                        "page_number": page_num,
	                    },
	                )
	            )

	        logger.info(f"Loaded {len(documents)} pages from PDF: {pdf_path.name} (Disease: {disease}, Provider: {provider})")
	        return documents

	    except Exception as e:
	        # Log for diagnostics, then let the caller decide how to react.
	        logger.error(f"Error loading PDF documents from {pdf_path}: {e}")
	        raise


	def load_markdown_documents(md_path: Path) -> List[Document]:
	    """
	    Load a Markdown medical guideline as a single Document.

	    The whole file is read as UTF-8 text and wrapped in one Document. The
	    provider is taken from the name of the directory that directly contains
	    the file.

	    Directory structure expected: data/new_data/PROVIDER/file.md
	    Example: data/new_data/SASLT/guidelines.md

	    Args:
	        md_path: Path to the Markdown file. A plain string is accepted as
	            well and converted to a Path.

	    Returns:
	        A one-element list holding a Document whose metadata contains
	        ``source`` (file name), ``disease`` (always "HBV"), ``provider`` and
	        ``page_number`` (always 1, since the file is not split).

	    Raises:
	        FileNotFoundError: If ``md_path`` does not exist.
	        Exception: Any other error raised while reading is logged and
	            re-raised unchanged.
	    """
	    try:
	        # Tolerate callers that pass a string instead of a Path.
	        md_path = Path(md_path)

	        # Validate file exists
	        if not md_path.exists():
	            raise FileNotFoundError(f"Markdown file not found at {md_path}")

	        disease = "HBV"  # Default disease for this system

	        # Provider is the parent directory of the file (e.g., SASLT).
	        # ``parent.name`` is empty for a bare file name or a file sitting at
	        # the filesystem root, so the root anchor ("/" or a drive) is never
	        # mistaken for a provider. A file placed directly in ``new_data``
	        # has no provider directory either.
	        provider = md_path.parent.name
	        if not provider or provider.lower() == "new_data":
	            provider = "unknown"

	        # Read markdown content
	        content = md_path.read_text(encoding="utf-8")

	        # Create document with minimal metadata for RAG
	        doc = Document(
	            page_content=content,
	            metadata={
	                "source": md_path.name,
	                "disease": disease,
	                "provider": provider,
	                "page_number": 1,
	            },
	        )

	        logger.info(f"Loaded Markdown document: {md_path.name} (Disease: {disease}, Provider: {provider})")
	        return [doc]

	    except Exception as e:
	        # Log for diagnostics, then let the caller decide how to react.
	        logger.error(f"Error loading Markdown document from {md_path}: {e}")
	        raise