Spaces:
Sleeping
Sleeping
| # this mo | |
| import re | |
| import os | |
| import json | |
| import docling | |
| from langchain_core.documents import Document | |
| from typing import List, Dict, Any, Optional, Tuple | |
| import logging | |
| logging.basicConfig(level=logging.INFO) | |
| #============================ | |
| # data loader from json and md files | |
| #============================ | |
def load_json_file(file_path: str) -> dict:
    """
    Load a JSON file and return its parsed content.

    The file is decoded explicitly as UTF-8 (the same encoding load_md_file
    uses) so that non-ASCII text is read identically on every platform
    instead of depending on the locale's default encoding.

    Args:
        file_path (str): Path to the JSON file.

    Returns:
        dict: The parsed JSON data. The value is whatever ``json.load``
        produces for the file's top-level element.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)
def load_md_file(file_path: str) -> str:
    """
    Read a Markdown file and hand back its full text.

    The file is opened in text mode and decoded as UTF-8.

    Args:
        file_path (str): Location of the Markdown file on disk.

    Returns:
        str: The entire text of the file.
    """
    with open(file_path, mode='r', encoding='utf-8') as md_handle:
        return md_handle.read()
def data_preprocess(folder_path: str) -> dict:
    """
    Load data from a folder containing JSON files and a Markdown file.

    The function reads the following files from ``folder_path``:
        - tables.json
        - images.json
        - text.json
        - chunks.json
        - {base_folder_name}-with-images.md

    ``base_folder_name`` is the last component of ``folder_path``. The path
    is normalized first so that a trailing separator ("docs/report/") still
    yields "report" rather than an empty name.

    Args:
        folder_path (str): Path to the folder containing the JSON and
            Markdown files.

    Returns:
        dict: A dictionary with the keys "tables", "images", "text" and
        "chunks" (parsed JSON contents), "images_folder" (path of the
        ``{base_folder_name}-with-images_artifacts`` folder; the path is
        only computed, not checked for existence) and "markdown" (the
        Markdown file's text).

    Raises:
        OSError: If any of the expected files cannot be opened.
        json.JSONDecodeError: If one of the JSON files is malformed.
    """
    # normpath drops a trailing separator; without it basename() would be ''
    # and the Markdown/artifacts names would collapse to "-with-images...".
    base_folder_name = os.path.basename(os.path.normpath(folder_path))
    images_folder_path = os.path.join(
        folder_path, f"{base_folder_name}-with-images_artifacts"
    )
    md_file_path = os.path.join(folder_path, f"{base_folder_name}-with-images.md")

    # Load the JSON contents in a fixed order: tables, images, text, chunks.
    result = {
        name: load_json_file(os.path.join(folder_path, f"{name}.json"))
        for name in ("tables", "images", "text", "chunks")
    }

    result["images_folder"] = images_folder_path
    # Load the Markdown content last, after all JSON files were read.
    result["markdown"] = load_md_file(md_file_path)
    return result
def load_json_data_documents(converted_document: dict, data_type: str) -> List[Document]:
    """
    Build Document objects from one section of a converted document.

    Every entry found under ``converted_document[data_type]`` is turned into
    a Document whose ``page_content`` is the entry's "content" value and
    whose ``metadata`` is the entry's "metadata" value.

    Args:
        converted_document (dict): The converted document containing data.
        data_type (str): The section to load (e.g., "tables", "images",
            "text", "chunks").

    Returns:
        List[Document]: One Document per entry, in the entries' original
        order. Empty when the section has no entries.

    Raises:
        KeyError: If ``data_type`` is missing from ``converted_document`` or
            an entry lacks the "content" or "metadata" key.
    """
    return [
        Document(page_content=entry["content"], metadata=entry["metadata"])
        for entry in converted_document[data_type]
    ]
| #============================ | |
| # dataloader for all the data | |
| # from the folder | |
| # containing json and md files | |
| # and images | |
| #============================ | |
def dataloader(folder_path: str) -> Tuple[list, list, list, list]:
    """
    Load the data of every document folder found under ``folder_path``.

    Each sub-folder of ``folder_path`` is passed to ``data_preprocess`` and
    its "chunks", "images", "tables" and "text" sections are converted to
    Document objects and appended to the overall lists.

    Entries of ``folder_path`` that are not directories (stray files) are
    skipped, and sub-folders are visited in sorted name order so that the
    resulting lists are ordered deterministically.

    Args:
        folder_path (str): Folder path containing all folders with JSON files
            and Markdown files.

    Returns:
        Tuple[list, list, list, list]: list of chunks, list of pictures,
        list of tables, and list of text of overall data.

    Raises:
        OSError: If ``folder_path`` cannot be listed or a sub-folder lacks
            one of the files ``data_preprocess`` expects.
    """
    chunks_list = []
    pictures_list = []
    tables_list = []
    text_list = []

    logging.info("Loading data from folder: %s", folder_path)

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)

        # Only directories hold a converted document; a plain file here
        # would make data_preprocess fail when it looks for tables.json.
        if not os.path.isdir(file_path):
            logging.info("Skipping non-directory entry: %s", file_name)
            continue

        logging.info("Processing file: %s", file_name)

        # Load the raw data, then convert each section to Document objects.
        dict_data = data_preprocess(file_path)
        chunks_data = load_json_data_documents(dict_data, "chunks")
        pictures_data = load_json_data_documents(dict_data, "images")
        tables_data = load_json_data_documents(dict_data, "tables")
        text_data = load_json_data_documents(dict_data, "text")

        # Accumulate this folder's documents into the overall lists.
        chunks_list.extend(chunks_data)
        pictures_list.extend(pictures_data)
        tables_list.extend(tables_data)
        text_list.extend(text_data)

        logging.info(
            "Loaded %d chunks, %d pictures, %d tables, and %d text documents from %s",
            len(chunks_data),
            len(pictures_data),
            len(tables_data),
            len(text_data),
            file_name,
        )

    return chunks_list, pictures_list, tables_list, text_list
if __name__ == "__main__":
    # Example usage: run the loader over the dataset folder.
    chunks, pictures, tables, text = dataloader("dataset/converted_json_docs")