# my-gradio-app/rag/ingest.py
import os
import time
import traceback
from langchain_community.document_loaders import PyPDFLoader, CSVLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from config.settings import CHROMA_PATH, EMBEDDING_MODEL
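
# config.settings is expected to provide (inferred from usage in this module):
#   CHROMA_PATH     - local directory where the Chroma index is persisted (see ingest_data)
#   EMBEDDING_MODEL - HuggingFace model name passed to HuggingFaceEmbeddings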

# ============================================================
# Load and split documents
# ============================================================
def load_documents(path: str):
    """
    Load and split a single uploaded document (PDF, CSV, MD, or TXT).

    Args:
        path (str): Full path to the uploaded file.

    Returns:
        List[Document]: List of document chunks ready for embedding.
    """
    print(f"\nReading uploaded file: {path}")

    if not os.path.exists(path):
        print(f"[ERROR] File not found: {path}")
        return []

    ext = os.path.splitext(path)[1].lower()
    all_docs = []

    try:
        # Select the appropriate loader based on file type
        if ext == ".pdf":
            loader = PyPDFLoader(path)
        elif ext == ".csv":
            loader = CSVLoader(path, encoding="utf-8")
        elif ext in (".md", ".txt"):
            loader = TextLoader(path, encoding="utf-8")
        else:
            print(
                f"[WARNING] Unsupported file type: {ext}. Only PDF, CSV, MD, or TXT is allowed."
            )
            return []

        # Load the document
        docs = loader.load()
        all_docs.extend(docs)
        print(f"Loaded {len(docs)} documents from {os.path.basename(path)}")

        # Split the text into smaller chunks for embedding
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        split_docs = splitter.split_documents(all_docs)
        print(f"Split into {len(split_docs)} text chunks.")
        return split_docs

    except Exception as e:
        print(f"[ERROR] Failed to load or split document: {path}")
        print(f"Reason: {e}")
        traceback.print_exc()
        return []

# ============================================================
# Select embedding model
# ============================================================
def get_embedding_model():
    """
    Return the embedding model used for ingestion.

    The custom OpenAI-compatible endpoint only serves GPT-4o-mini and does not
    expose an embeddings API, so OpenAI embeddings are skipped and
    HuggingFaceEmbeddings is used directly.
    """
    print("[INFO] Using HuggingFace embeddings (custom endpoint doesn't support embeddings)")
    print(f"Using HuggingFaceEmbeddings ({EMBEDDING_MODEL})...")
    return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
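
# ------------------------------------------------------------
# Optional sketch: the OpenAI -> HuggingFace fallback that the docstring above
# refers to. This is a minimal, hedged sketch and is NOT wired into ingest_data;
# it assumes the `langchain_openai` package is installed, that OPENAI_API_KEY may
# be set, and that the model name "text-embedding-3-small" is acceptable.
# ------------------------------------------------------------
def get_embedding_model_with_fallback():
    try:
        from langchain_openai import OpenAIEmbeddings

        embedder = OpenAIEmbeddings(model="text-embedding-3-small")
        embedder.embed_query("connectivity check")  # fail fast on 401 / missing key
        print("[INFO] Using OpenAIEmbeddings.")
        return embedder
    except Exception as e:
        print(f"[INFO] OpenAI embeddings unavailable ({e}); falling back to HuggingFace.")
        return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)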

# ============================================================
# Main ingestion process
# ============================================================
def ingest_data(path: str):
    """
    Generate embeddings for an uploaded file and store them in a local ChromaDB.
    """
    start_time = time.time()
    print("\nStarting ingestion for uploaded file...")

    documents = load_documents(path)
    if not documents:
        print("No valid document to process. Skipping embedding step.")
        return

    embeddings = get_embedding_model()
    try:
        vectordb = Chroma.from_documents(
            documents, embeddings, persist_directory=CHROMA_PATH
        )
        elapsed = time.time() - start_time
        count = vectordb._collection.count()
        print(f"\nIngestion complete in {elapsed:.2f} seconds.")
        print(f"Data stored in {CHROMA_PATH} ({count} vectors).")
    except Exception as e:
        print(f"[ERROR] Failed to store vectors in ChromaDB: {e}")
        traceback.print_exc()
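
# ------------------------------------------------------------
# Usage sketch (assumption): allows the module to be run directly for a one-off
# ingestion outside the Gradio app. The module path in the usage string is an
# assumption based on the repo layout (my-gradio-app/rag/ingest.py).
# ------------------------------------------------------------
if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1:
        ingest_data(sys.argv[1])
    else:
        print("Usage: python -m rag.ingest <path-to-pdf-csv-md-or-txt>")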