"""PDF ingestion utilities: load medical-guideline PDFs into LangChain Documents via Unstructured.io."""
# Import required libraries
from collections import defaultdict
from pathlib import Path
from typing import List

import pandas as pd
from langchain.schema import Document
from unstructured.chunking.title import chunk_by_title
from unstructured.partition.pdf import partition_pdf

from core.config import logger
def load_pdf_documents(pdf_path: Path) -> List[Document]:
    """
    Load and process a PDF of medical guidelines using Unstructured.io.

    Uses the high-resolution strategy with ML-based table detection so that
    borderless tables are captured. The provider is extracted from the
    directory structure; the disease is fixed for this system.

    Expected directory structure: data/new_data/PROVIDER/file.pdf
    Example: data/new_data/SASLT/SASLT_2021.pdf

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        List of Document objects, one per non-empty page, each carrying
        metadata (source, disease, provider, page_number).

    Raises:
        FileNotFoundError: If pdf_path does not exist.
    """
    try:
        # Validate file exists
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found at {pdf_path}")

        # Extract provider from directory structure.
        # Structure: data/new_data/PROVIDER/file.pdf
        path_parts = pdf_path.parts
        disease = "HBV"  # Default disease for this system
        # Provider is the parent directory of the PDF file (e.g., SASLT).
        provider = path_parts[-2] if len(path_parts) >= 2 else "unknown"
        # If provider is 'new_data', the file sits directly in the new_data
        # folder, i.e. there is no provider directory.
        if provider.lower() == "new_data":
            provider = "unknown"

        # Use Unstructured.io to partition the PDF.
        # hi_res strategy uses ML models for better table detection.
        elements = partition_pdf(
            filename=str(pdf_path),
            strategy="hi_res",            # ML-based detection for borderless tables
            infer_table_structure=True,   # Detect table structure without borders
            extract_images_in_pdf=True,   # Extract images with OCR
            languages=["eng"],            # OCR language
            include_page_breaks=True,     # Maintain page boundaries
        )

        # Group element texts by page number (1-indexed).
        # NOTE: metadata.page_number may be absent OR explicitly None on some
        # elements — `or 1` covers both, and keeps the keys sortable (a None
        # key would make sorted() raise TypeError against int keys).
        pages_content = defaultdict(list)
        for element in elements:
            page_num = getattr(element.metadata, "page_number", None) or 1
            # element.text can be None for some element types (e.g. images);
            # joining None would raise TypeError, so skip empty text.
            if element.text:
                pages_content[page_num].append(element.text)

        # Create one Document per non-empty page.
        documents = []
        for page_num in sorted(pages_content):
            # Combine all elements on the page
            page_content = "\n\n".join(pages_content[page_num])
            if page_content.strip():
                documents.append(
                    Document(
                        page_content=page_content,
                        metadata={
                            "source": pdf_path.name,
                            "disease": disease,
                            "provider": provider,
                            "page_number": page_num,
                        },
                    )
                )

        logger.info(f"Loaded {len(documents)} pages from PDF: {pdf_path.name} (Disease: {disease}, Provider: {provider})")
        return documents
    except Exception as e:
        logger.error(f"Error loading PDF documents from {pdf_path}: {str(e)}")
        raise
# Alternative version: Preserve element types (useful for RAG)
def load_pdf_documents_with_elements(pdf_path: Path) -> List[Document]:
    """
    Load a PDF while preserving element types (text, table, title, etc.).

    Useful for better RAG retrieval: each Unstructured element becomes its
    own Document, so structural information (tables vs. narrative text)
    survives into the metadata.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        List of Document objects, one per non-empty element, each carrying
        metadata (source, disease, provider, page_number, element_type,
        element_id).

    Raises:
        FileNotFoundError: If pdf_path does not exist.
    """
    try:
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found at {pdf_path}")

        # Provider is the PDF's parent directory; 'new_data' means the file
        # is not inside a provider directory.
        path_parts = pdf_path.parts
        disease = "HBV"
        provider = path_parts[-2] if len(path_parts) >= 2 else "unknown"
        if provider.lower() == "new_data":
            provider = "unknown"

        elements = partition_pdf(
            filename=str(pdf_path),
            strategy="hi_res",
            infer_table_structure=True,
            extract_images_in_pdf=True,
            languages=["eng"],
        )

        documents = []
        for idx, element in enumerate(elements):
            # element.text can be None for some element types; calling
            # .strip() on it directly would raise AttributeError.
            if element.text and element.text.strip():
                # page_number may be absent or None — default to page 1 so the
                # metadata stays an int (consistent with load_pdf_documents).
                page_num = getattr(element.metadata, "page_number", None) or 1
                element_type = element.category  # e.g., "Table", "Title", "NarrativeText"
                documents.append(
                    Document(
                        page_content=element.text,
                        metadata={
                            "source": pdf_path.name,
                            "disease": disease,
                            "provider": provider,
                            "page_number": page_num,
                            "element_type": element_type,
                            "element_id": idx,
                        },
                    )
                )

        logger.info(f"Loaded {len(documents)} elements from PDF: {pdf_path.name} (Disease: {disease}, Provider: {provider})")
        return documents
    except Exception as e:
        logger.error(f"Error loading PDF documents from {pdf_path}: {str(e)}")
        raise
# Usage: only run the (expensive, ML-backed) PDF parse when executed as a
# script — importing this module must not trigger it as a side effect.
if __name__ == "__main__":
    doc = load_pdf_documents(Path(r"data\processed_data\SASLT\SASLT 2021_20251026_171017.pdf"))