Spaces:
Build error
Build error
| import os | |
| import json | |
| from pathlib import Path | |
| from pypdf import PdfReader | |
| from langchain.docstore.document import Document | |
| from langchain_community.document_loaders import TextLoader, CSVLoader, PyPDFLoader | |
| from langchain_community.document_loaders.excel import UnstructuredExcelLoader | |
| def load_file(filepath): | |
| # try: | |
| print(f"Loading {filepath}") | |
| if filepath.suffix == '.txt': | |
| loader = TextLoader(str(filepath)) | |
| return loader.load() | |
| elif filepath.suffix == '.csv': | |
| loader = CSVLoader(file_path=str(filepath)) | |
| return loader.load() | |
| elif filepath.suffix == '.pdf': | |
| loader = PyPDFLoader(str(filepath)) | |
| return loader.load() | |
| elif filepath.suffix == '.md': | |
| # Load Markdown file as a Document using TextLoader | |
| loader = TextLoader(str(filepath)) | |
| return loader.load() | |
| elif filepath.suffix == '.xls' or filepath.suffix == '.xlsx': | |
| loader = UnstructuredExcelLoader(str(filepath)) | |
| return loader.load() | |
| elif filepath.suffix == '.json': | |
| with open(filepath) as f: | |
| json_data = json.load(f) | |
| if isinstance(json_data, list): # Handle list of dictionaries | |
| for item in json_data: | |
| content = "\n".join([f"{k}: {v}" for k, v in item.items()]) | |
| return [Document(page_content=content, metadata={'source': str(filepath)})] | |
| elif isinstance(json_data, dict): # Handle nested dictionaries | |
| content = "" | |
| for key, value in json_data.items(): | |
| content += f"**{key}**\n\n" | |
| if isinstance(value, list): | |
| for item in value: | |
| if isinstance(item, dict): | |
| content += "\n".join([f"{k}: {v}" for k, v in item.items()]) + "\n\n" | |
| else: | |
| content += str(item) + "\n\n" | |
| else: | |
| content += str(value) + "\n\n" | |
| return [Document(page_content=content, metadata={'source': str(filepath)})] | |
| else: | |
| print(f"Unsupported JSON structure in {filepath}") | |
| else: | |
| print(f"Unsupported file type: {filepath}") | |
| # except Exception as e: | |
| # print(f"Error loading {filepath}: {e}") | |
| def load_data_files(data_dir): | |
| """ | |
| Loads all data files from the specified directory, handling various file types. | |
| Args: | |
| data_dir: The directory containing the data files. | |
| Returns: | |
| A list of Document objects, each representing a loaded document. | |
| """ | |
| docs = [] | |
| for filepath in Path(data_dir).glob('**/*.*'): | |
| docs.extend(load_file(filepath)) | |
| return docs | |
| if __name__ == "__main__": | |
| # Test with files in the 'examples' directory | |
| docs = load_data_files("examples") | |
| for doc in docs: | |
| print(doc) |