import os
import time
import traceback

from langchain_community.document_loaders import PyPDFLoader, CSVLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

from config.settings import CHROMA_PATH, EMBEDDING_MODEL


# ============================================================
# Load and split documents
# ============================================================
def load_documents(path: str):
    """
    Load and split a single uploaded document (PDF, CSV, or MD/TXT).

    Args:
        path (str): Full path to the uploaded file.

    Returns:
        List[Document]: List of document chunks ready for embedding.
    """
    print(f"\nReading uploaded file: {path}")

    if not os.path.exists(path):
        print(f"[ERROR] File not found: {path}")
        return []

    ext = os.path.splitext(path)[1].lower()
    all_docs = []

    try:
        # Select the appropriate loader based on file type
        if ext == ".pdf":
            loader = PyPDFLoader(path)
        elif ext == ".csv":
            loader = CSVLoader(path, encoding="utf-8")
        elif ext in [".md", ".txt"]:
            loader = TextLoader(path, encoding="utf-8")
        else:
            print(
                f"[WARNING] Unsupported file type: {ext}. Only PDF, CSV, MD, or TXT files are supported."
            )
            return []

        # Load the document
        docs = loader.load()
        all_docs.extend(docs)
        print(f"Loaded {len(docs)} documents from {os.path.basename(path)}")

        # Split the text into smaller chunks for embedding
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        split_docs = splitter.split_documents(all_docs)
        print(f"Split into {len(split_docs)} text chunks.")
        return split_docs

    except Exception as e:
        print(f"[ERROR] Failed to load or split document: {path}")
        print(f"Reason: {e}")
        traceback.print_exc()
        return []


# ============================================================
# Select embedding model (HuggingFace)
# ============================================================
def get_embedding_model():
    """
    Return the embedding model used for ingestion.

    The custom OpenAI endpoint only serves GPT-4o-mini and does not expose an
    embeddings API, so OpenAI embeddings are skipped and HuggingFaceEmbeddings
    is used directly.
    """
    print(
        f"[INFO] Using HuggingFaceEmbeddings ({EMBEDDING_MODEL}); "
        "the custom OpenAI endpoint does not support embeddings."
    )
    return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)


# ============================================================
# Main ingestion process
# ============================================================
def ingest_data(path: str):
    """
    Generate embeddings for an uploaded file and store them in a local ChromaDB.
    """
    start_time = time.time()
    print("\nStarting ingestion for uploaded file...")

    documents = load_documents(path)
    if not documents:
        print("No valid document to process. Skipping embedding step.")
        return

    embeddings = get_embedding_model()

    try:
        vectordb = Chroma.from_documents(
            documents, embeddings, persist_directory=CHROMA_PATH
        )
        elapsed = time.time() - start_time
        # Count vectors via the underlying Chroma collection
        count = vectordb._collection.count()
        print(f"\nIngestion complete in {elapsed:.2f} seconds.")
        print(f"Data stored in {CHROMA_PATH} ({count} vectors).")
    except Exception as e:
        print(f"[ERROR] Failed to store vectors in ChromaDB: {e}")
        traceback.print_exc()
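

# ------------------------------------------------------------
# Minimal manual-test entry point (sketch, not part of the app flow).
# Assumes the module is run from the project root so that
# `config.settings` resolves on the import path; the file path is
# supplied by the caller on the command line.
# ------------------------------------------------------------
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        print(f"Usage: python {sys.argv[0]} <path to PDF/CSV/MD/TXT file>")
        sys.exit(1)

    # Run the full ingestion pipeline on the given file
    ingest_data(sys.argv[1])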