Spaces:
Runtime error
Runtime error
import os
import time
import traceback

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.document_loaders import CSVLoader, PyPDFLoader, TextLoader
from langchain_huggingface import HuggingFaceEmbeddings

from config.settings import CHROMA_PATH, EMBEDDING_MODEL
# ============================================================
# Load and split documents
# ============================================================
def load_documents(path: str):
    """
    Load and split a single uploaded document (PDF, CSV, MD, or TXT).

    Args:
        path (str): Full path to the uploaded file.

    Returns:
        List[Document]: List of document chunks ready for embedding.
        Returns an empty list when the file is missing, its extension is
        unsupported, or loading/splitting fails.
    """
    print(f"\nReading uploaded file: {path}")
    if not os.path.exists(path):
        print(f"[ERROR] File not found: {path}")
        return []

    ext = os.path.splitext(path)[1].lower()
    try:
        # Select appropriate loader based on file type
        if ext == ".pdf":
            loader = PyPDFLoader(path)
        elif ext == ".csv":
            loader = CSVLoader(path, encoding="utf-8")
        elif ext in (".md", ".txt"):
            # Plain-text loader handles both Markdown and TXT uploads,
            # matching the docstring's promised MD/TXT support.
            loader = TextLoader(path, encoding="utf-8")
        else:
            print(
                f"[WARNING] Unsupported file type: {ext}. Only PDF, CSV, MD, or TXT allowed."
            )
            return []

        # Load the document
        docs = loader.load()
        print(f"Loaded {len(docs)} documents from {os.path.basename(path)}")

        # Split the text into smaller chunks for embeddings
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        split_docs = splitter.split_documents(docs)
        print(f"Split into {len(split_docs)} text chunks.")
        return split_docs
    except Exception as e:
        # Best-effort: report the failure and return [] so the caller
        # can skip the embedding step instead of crashing.
        print(f"[ERROR] Failed to load or split document: {path}")
        print(f"Reason: {e}")
        traceback.print_exc()
        return []
# ============================================================
# Select embedding model
# ============================================================
def get_embedding_model():
    """
    Return the embedding model used for ingestion.

    The project's custom OpenAI-compatible endpoint only serves chat
    completions (no embeddings), so HuggingFace embeddings are used
    unconditionally — there is no OpenAI-then-fallback flow here.

    Returns:
        HuggingFaceEmbeddings: Embedding model named by EMBEDDING_MODEL.
    """
    print("[INFO] Using HuggingFace embeddings (custom endpoint doesn't support embeddings)")
    print(f"Using HuggingFaceEmbeddings ({EMBEDDING_MODEL})...")
    return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
# ============================================================
# Main ingestion process
# ============================================================
def ingest_data(path: str):
    """
    Generate embeddings for an uploaded file and store them in a local ChromaDB.
    """
    started = time.time()
    print("\nStarting ingestion for uploaded file...")

    chunks = load_documents(path)
    if not chunks:
        print("No valid document to process. Skipping embedding step.")
        return

    embedding_model = get_embedding_model()
    try:
        store = Chroma.from_documents(chunks, embedding_model, persist_directory=CHROMA_PATH)
        elapsed = time.time() - started
        # NOTE(review): `_collection` is a private Chroma attribute — no
        # public count helper appears to be used here; verify against the
        # installed langchain_chroma version.
        vector_count = store._collection.count()
        print(f"\nIngestion complete in {elapsed:.2f} seconds.")
        print(f"Data stored in {CHROMA_PATH} ({vector_count} vectors).")
    except Exception as e:
        print(f"[ERROR] Failed to store vectors in ChromaDB: {e}")
        traceback.print_exc()