Spaces:
Sleeping
Sleeping
| """ | |
| Script to set up the Pinecone database with medical data | |
| """ | |
| from enhanced_data_loader import load_comprehensive_medical_datasets, chunk_text | |
| from embedding_service import EmbeddingService | |
| import time | |
def setup_database():
    """Build the medical chatbot's vector database from scratch.

    Steps, in order:
      1. Load the medical documents via load_comprehensive_medical_datasets().
      2. Split every document's text with chunk_text(), producing one record
         per chunk that carries the parent document's source and metadata.
      3. Hand all chunk records to EmbeddingService.upsert_documents() for
         upload (to Pinecone, per the module description).

    Progress is reported on stdout. Returns None.
    """
    banner = "=" * 50

    print(banner)
    print("Setting up Medical Chatbot Database")
    print(banner)

    # Pull in the full medical corpus from every configured source.
    source_docs = load_comprehensive_medical_datasets()

    # Flatten documents into per-chunk records. Each record references the
    # parent's 'source' and 'metadata' values directly (no copy is made).
    chunk_records = [
        {
            'text': piece,
            'source': entry['source'],
            'metadata': entry['metadata'],
        }
        for entry in source_docs
        for piece in chunk_text(entry['text'])
    ]
    print(f"Total chunks: {len(chunk_records)}")

    # The service is created only after chunking, matching the original order
    # of side effects (any connection work happens after the chunk report).
    uploader = EmbeddingService()
    uploader.upsert_documents(chunk_records)

    print("\n" + banner)
    print("Database setup complete!")
    print(banner)
if __name__ == "__main__":
    # Only populate the database when run as a script, so importing this
    # module elsewhere does not trigger a load-and-upload.
    setup_database()