Spaces:

pasupuletkarthiksai
/

medico-bot

Sleeping

medico-bot / src /data_preprocessing /dataloader.py

kap2403

"added files"

5e433de 7 months ago

5.34 kB

	# this mo

	import re
	import os
	import json
	import docling
	from langchain_core.documents import Document
	from typing import List, Dict, Any, Optional, Tuple
	import logging

	logging.basicConfig(level=logging.INFO)

	#============================
	# data loader from json and md files
	#============================

	def load_json_file(file_path: str)-> dict:
	"""
	Load a JSON file and return its content as a dictionary.

	Args:
	file_path (str): Path to the JSON file.

	Returns:
	dict: Dictionary containing the JSON data.
	"""
	with open(file_path, 'r') as file:
	data = json.load(file)
	return data

	def load_md_file(file_path: str) -> str:
	"""
	Load a Markdown file and return its content as a string.
	The function reads the file in UTF-8 encoding.

	Args:
	file_path (str): Path to the Markdown file.

	Returns:
	str: Content of the Markdown file as a string.
	"""
	with open(file_path, 'r', encoding='utf-8') as file:
	content = file.read()
	return content


	def data_preprocess(folder_path: str) -> dict:
	"""
	Load data from a folder containing JSON files and a Markdown file.
	The function reads the following files:
	- tables.json
	- images.json
	- text.json
	- chunks.json
	- {base_folder_name}-with-images.md

	Args:
	folder_path (str): Path to the folder containing the JSON and Markdown files.

	Returns:
	dict: A dictionary containing the loaded data from the JSON files and the
	Markdown file.
	"""
	tables_path = os.path.join(folder_path, "tables.json")
	images_path = os.path.join(folder_path, "images.json")
	text_path = os.path.join(folder_path, "text.json")
	chunks_path = os.path.join(folder_path, "chunks.json")

	# Extract base folder name for md and images folder
	base_folder_name = os.path.basename(folder_path)
	images_folder_path = os.path.join(folder_path, f"{base_folder_name}-with-images_artifacts")
	md_file_path = os.path.join(folder_path, f"{base_folder_name}-with-images.md")

	# Load JSON contents
	tables = load_json_file(tables_path)
	images = load_json_file(images_path)
	text = load_json_file(text_path)
	chunks = load_json_file(chunks_path)

	# Load Markdown content
	markdown = load_md_file(md_file_path)

	return {
	"tables": tables,
	"images": images,
	"text": text,
	"chunks": chunks,
	"images_folder": images_folder_path,
	"markdown": markdown
	}


	def load_json_data_documents(converted_document: dict, data_type: str)-> Document:
	"""
	Load JSON data documents from the converted document.
	This function takes a converted document and a data type (e.g., "tables", "images", "text", "chunks")
	and returns a list of Document objects.

	Args:
	converted_document (dict): The converted document containing data.
	data_type (str): The type of data to load (e.g., "tables", "images", "text", "chunks").
	Returns:
	Document: A list of Document objects containing the loaded data.
	"""
	documents = []
	for chunk in converted_document[data_type]:
	content = chunk["content"]
	metadata = chunk["metadata"]
	# Create Document object
	document = Document(
	page_content=content,
	metadata=metadata
	)
	documents.append(document)

	return documents



	#============================
	# dataloader for all the data
	# from the folder
	# containing json and md files
	# and images
	#============================


	def dataloader(folder_path: str)-> Tuple[list, list, list, list]:
	"""
	Load data from a folder containing JSON files and a Markdown file.
	The function reads the following files:

	Args:
	folder_path (str): Folder path containing all folders with JSON files and
	Markdown files.
	Returns:
	Tuple[list, list, list, list]: list of chunks, list of pictures, list of tables,
	and list of text of overall data.
	"""

	chunks_list = []
	pictures_list = []
	tables_list = []
	text_list = []

	logging.info(f"Loading data from folder: {folder_path}")
	for file_name in os.listdir(folder_path):
	logging.info(f"Processing file: {file_name}")
	file_path = os.path.join(folder_path, file_name)

	# load the data
	dict_data = data_preprocess(file_path)
	chunks_data = load_json_data_documents(dict_data, "chunks")
	pictures_data = load_json_data_documents(dict_data, "images")
	tables_data = load_json_data_documents(dict_data, "tables")
	text_data = load_json_data_documents(dict_data, "text")

	# adding the data to the list
	chunks_list.extend(chunks_data)
	pictures_list.extend(pictures_data)
	tables_list.extend(tables_data)
	text_list.extend(text_data)
	logging.info(f"Loaded {len(chunks_data)} chunks, {len(pictures_data)} pictures, "
	f"{len(tables_data)} tables, and {len(text_data)} text documents from {file_name}")

	return chunks_list, pictures_list, tables_list, text_list



	if __name__ == "__main__":
	# Example usage
	folder_path = "dataset/converted_json_docs"
	chunks, pictures, tables, text = dataloader(folder_path)