"""
ViMedical Disease Dataset - Download & Process

Downloads the Vietnamese medical disease dataset from HuggingFace and indexes
it into a persistent ChromaDB collection, one document per disease.

Dataset: PB3002/ViMedical_Disease (603 diseases, 12K+ examples)
"""

import os
import re

import chromadb
import pandas as pd
import requests
from sentence_transformers import SentenceTransformer

# Remote location of the raw CSV on HuggingFace.
DATASET_URL = (
    "https://huggingface.co/datasets/PB3002/ViMedical_Disease"
    "/resolve/main/ViMedical_Disease.csv"
)

# Local paths, kept as plain forward-slash strings because they are also
# echoed verbatim in progress messages.
DATASETS_DIR = "data_mining/datasets"
CSV_PATH = "data_mining/datasets/vimedical_disease.csv"
OUTPUT_DIR = "data_mining/output"
CHROMA_PATH = "data_mining/output/medical_chroma"

COLLECTION_NAME = "medical_diseases"
EMBEDDING_MODEL = "keepitreal/vietnamese-sbert"

# Columns the CSV must provide for processing to work.
REQUIRED_COLUMNS = ("Disease", "Question")

# Maximum number of symptom examples written into one disease document.
MAX_SYMPTOMS_PER_DOC = 10

# Leading phrases stripped from a question; only the first match is removed.
QUESTION_PREFIXES = (
    'Tôi đang có triệu chứng như ',
    'Tôi thường xuyên ',
    'Tôi cảm thấy ',
    'Tôi bị ',
    'Tôi hay ',
    'Tôi có ',
)

# Trailing questions stripped from a question; only the first match is removed.
QUESTION_SUFFIXES = (
    '. Tôi bị bệnh gì?',
    '. Tôi có thể bị gì?',
    '. Đó là bệnh gì?',
)


def download_vimedical():
    """Download the ViMedical CSV from HuggingFace to ``CSV_PATH``.

    Returns:
        True when the file was downloaded and written, False when the HTTP
        request or any filesystem operation failed (the error is printed).
    """
    print("📥 Downloading ViMedical Disease dataset...")

    try:
        # Directory creation is inside the try so a permissions problem is
        # reported as a failed download instead of an unhandled traceback.
        os.makedirs(DATASETS_DIR, exist_ok=True)

        response = requests.get(DATASET_URL, timeout=60)
        response.raise_for_status()

        # The file is only opened after a complete, successful response, so
        # an HTTP failure never leaves a truncated CSV behind.
        with open(CSV_PATH, 'wb') as f:
            f.write(response.content)

        file_size = os.path.getsize(CSV_PATH) / (1024 * 1024)  # MB
    except (requests.RequestException, OSError) as e:
        print(f"❌ Download failed: {e}")
        return False

    print(f"✅ Downloaded: {CSV_PATH}")
    print(f"📊 File size: {file_size:.2f} MB")
    return True


def extract_symptoms(question):
    """Extract the symptom description from a patient question.

    Strips at most one known leading phrase (e.g. ``'Tôi bị '``) and at most
    one known trailing question (e.g. ``'. Tôi bị bệnh gì?'``).

    Args:
        question: The raw question text. Non-string values, such as the
            float NaN pandas produces for empty cells, are treated as having
            no symptom.

    Returns:
        The symptom text with surrounding whitespace removed, or an empty
        string when ``question`` is not a string.
    """
    if not isinstance(question, str):
        return ""

    # Strip first so stray leading/trailing whitespace cannot prevent the
    # prefix/suffix comparisons below from matching.
    symptom = question.strip()

    for prefix in QUESTION_PREFIXES:
        if symptom.startswith(prefix):
            symptom = symptom[len(prefix):]
            break

    for suffix in QUESTION_SUFFIXES:
        if symptom.endswith(suffix):
            symptom = symptom[:-len(suffix)]
            break

    return symptom.strip()


def _build_disease_document(disease_name, symptoms):
    """Build the text document embedded and stored for one disease."""
    doc_text = f"Bệnh: {disease_name}\n\nTriệu chứng:\n"
    doc_text += "\n".join(f"- {s}" for s in symptoms[:MAX_SYMPTOMS_PER_DOC])
    return doc_text


def _directory_size_bytes(path):
    """Return the total size in bytes of all files under ``path``."""
    total_size = 0
    for dirpath, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            total_size += os.path.getsize(os.path.join(dirpath, filename))
    return total_size


def process_vimedical():
    """Process the downloaded ViMedical CSV and build the ChromaDB index.

    Groups the records by disease, builds one document per disease from the
    symptoms extracted out of its questions, embeds it, and stores it in the
    persistent collection at ``CHROMA_PATH``.

    Returns:
        True on success, False when the CSV is missing or lacks the required
        columns.
    """
    print("\n🔨 Processing ViMedical dataset...")

    if not os.path.exists(CSV_PATH):
        print(f"❌ Dataset not found: {CSV_PATH}")
        return False

    df = pd.read_csv(CSV_PATH)

    # Fail with a clear message rather than a KeyError further down.
    missing = [col for col in REQUIRED_COLUMNS if col not in df.columns]
    if missing:
        print(f"❌ Dataset is missing required columns: {', '.join(missing)}")
        return False

    print(f"📊 Loaded {len(df)} records")
    print(f"📊 Unique diseases: {df['Disease'].nunique()}")

    print("🤖 Loading embedding model...")
    embedder = SentenceTransformer(EMBEDDING_MODEL)

    print("💾 Initializing ChromaDB...")
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    client = chromadb.PersistentClient(path=CHROMA_PATH)

    collection = client.get_or_create_collection(
        name=COLLECTION_NAME,
        metadata={"hnsw:space": "cosine"}
    )

    print("📝 Processing diseases...")
    disease_groups = df.groupby('Disease')
    total = disease_groups.ngroups

    processed = 0
    for disease_name, group in disease_groups:
        # extract_symptoms returns "" for unusable rows; drop those.
        symptoms = [
            symptom
            for symptom in map(extract_symptoms, group['Question'])
            if symptom
        ]

        doc_text = _build_disease_document(disease_name, symptoms)
        embedding = embedder.encode(doc_text)

        # upsert, not add: the collection is reused across runs and the ids
        # are deterministic, so add() would not refresh records that already
        # exist. Records left over from a previously larger dataset are not
        # removed.
        collection.upsert(
            ids=[f"disease_{processed:04d}"],
            embeddings=[embedding.tolist()],
            documents=[doc_text],
            metadatas=[{
                # Cast so a non-string disease label is still valid metadata.
                'disease_name': str(disease_name),
                'num_examples': len(symptoms),
                'source': 'ViMedical_Disease'
            }]
        )

        processed += 1

        if processed % 50 == 0:
            print(f"  Processed {processed}/{total} diseases...")

    print(f"✅ Processed {processed} diseases")
    print(f"💾 Database saved to: {CHROMA_PATH}/")

    total_size = _directory_size_bytes(CHROMA_PATH)
    print(f"📊 Database size: {total_size / (1024 * 1024):.2f} MB")

    return True


def main():
    """Run the full pipeline: download the dataset, then process it.

    Returns:
        True when both steps succeed, False as soon as either step fails.
    """
    print("=" * 60)
    print("ViMedical Disease Dataset - Download & Process")
    print("=" * 60)

    # Step 1: Download
    if not download_vimedical():
        return False

    # Step 2: Process
    if not process_vimedical():
        return False

    print("\n" + "=" * 60)
    print("✅ ViMedical dataset ready!")
    print("=" * 60)
    return True


if __name__ == "__main__":
    # SystemExit instead of the site-provided exit(), which is intended for
    # interactive use and is not defined when Python runs with -S.
    raise SystemExit(0 if main() else 1)