Spaces:
Runtime error
Runtime error
| """ | |
| Cloud Vector Database Setup | |
| Creates vector database collections on cloud infrastructure. | |
| Supports both vector search and graph-based retrieval systems. | |
| """ | |
| from my_config import MY_CONFIG | |
| import os | |
| import sys | |
| from llama_index.core import SimpleDirectoryReader | |
| from llama_index.core.node_parser import SentenceSplitter | |
| from llama_index.embeddings.huggingface import HuggingFaceEmbedding | |
| from llama_index.core import Settings | |
| from pymilvus import MilvusClient | |
| from llama_index.core import StorageContext | |
| from llama_index.vector_stores.milvus import MilvusVectorStore | |
| from llama_index.core import VectorStoreIndex | |
| import logging | |
# Script-wide logging: timestamped, level-tagged lines at INFO and above.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Fail fast when the cloud credentials are absent: both values are passed to
# the Milvus client and vector store in main(), so there is no point going
# any further without them. The endpoint is checked before the token.
if not MY_CONFIG.ZILLIZ_CLUSTER_ENDPOINT:
    raise ValueError("Cloud endpoint configuration missing")

if not MY_CONFIG.ZILLIZ_TOKEN:
    raise ValueError("Cloud authentication token missing")
def main():
    """Build the cloud vector collection from the processed Markdown files.

    Steps, in order:
      1. Load ``.md`` files from ``MY_CONFIG.PROCESSED_DATA_DIR``
         (non-recursive).
      2. Split them into chunks with ``SentenceSplitter`` using
         ``MY_CONFIG.CHUNK_SIZE`` and ``MY_CONFIG.CHUNK_OVERLAP``.
      3. Install a ``HuggingFaceEmbedding`` as ``Settings.embed_model``.
      4. Drop any existing collection named ``MY_CONFIG.COLLECTION_NAME``
         and index the chunks into a new one through ``MilvusVectorStore``.

    Raises:
        Exception: any failure while connecting, dropping the collection or
            indexing is logged together with its traceback and re-raised
            unchanged.
    """
    logger.info("Initializing cloud database connection")

    # Load source documents
    logger.info("Loading documents")
    reader = SimpleDirectoryReader(
        input_dir=MY_CONFIG.PROCESSED_DATA_DIR,
        recursive=False,
        required_exts=[".md"],
    )
    documents = reader.load_data()
    logger.info("Loaded %d documents", len(documents))

    # Process document chunks
    logger.info("Processing document chunks")
    parser = SentenceSplitter(
        chunk_size=MY_CONFIG.CHUNK_SIZE,
        chunk_overlap=MY_CONFIG.CHUNK_OVERLAP,
    )
    nodes = parser.get_nodes_from_documents(documents)
    logger.info("Created %d chunks", len(nodes))

    # Initialize embedding model
    logger.info("Configuring embedding model")
    # Only override HF_ENDPOINT when a value is actually configured:
    # assigning None to os.environ raises TypeError, and an empty string
    # would overwrite any endpoint already exported in the environment.
    hf_endpoint = MY_CONFIG.HF_ENDPOINT
    if hf_endpoint:
        os.environ['HF_ENDPOINT'] = hf_endpoint
    Settings.embed_model = HuggingFaceEmbedding(
        model_name=MY_CONFIG.EMBEDDING_MODEL
    )

    # Create cloud database collection
    logger.info("Creating database collection")
    collection_name = MY_CONFIG.COLLECTION_NAME
    milvus_client = None
    try:
        # Connect to cloud database
        milvus_client = MilvusClient(
            uri=MY_CONFIG.ZILLIZ_CLUSTER_ENDPOINT,
            token=MY_CONFIG.ZILLIZ_TOKEN,
        )

        # Remove existing collection if present
        if milvus_client.has_collection(collection_name=collection_name):
            milvus_client.drop_collection(collection_name=collection_name)

        # Initialize vector store
        vector_store = MilvusVectorStore(
            uri=MY_CONFIG.ZILLIZ_CLUSTER_ENDPOINT,
            token=MY_CONFIG.ZILLIZ_TOKEN,
            collection_name=collection_name,
            dim=MY_CONFIG.EMBEDDING_LENGTH,
            overwrite=True,
        )
        storage_context = StorageContext.from_defaults(vector_store=vector_store)

        # Store document vectors; constructing the index is what embeds the
        # nodes and writes them to the store, so the return value is unused.
        logger.info("Processing %d document chunks", len(nodes))
        VectorStoreIndex(
            nodes=nodes,
            storage_context=storage_context,
        )
        logger.info("Database collection '%s' created successfully", collection_name)
    except Exception as e:
        # logger.exception keeps the traceback, which a plain error() call
        # with only str(e) would discard.
        logger.exception("Failed to create collection: %s", e)
        raise
    finally:
        # The client is still None when the MilvusClient constructor itself failed.
        if milvus_client is not None:
            milvus_client.close()

    logger.info("Cloud database setup completed successfully")
# Script entry point: run the setup and translate the outcome into a process
# exit status (0 on success, 1 on user interrupt or any other failure).
if __name__ == "__main__":
    exit_status = 0
    try:
        main()
    except KeyboardInterrupt:
        logger.info("Operation cancelled by user")
        exit_status = 1
    except Exception as e:
        logger.error(f"Fatal error: {str(e)}")
        exit_status = 1
    sys.exit(exit_status)