#!/usr/bin/env python3
"""
Mining Script: Vietnamese Medical Q&A Dataset

Downloads and processes hungnm/vietnamese-medical-qa from HuggingFace.
Splits the data into 2 collections: symptom_qa and general_health_qa.
"""

import sys
from pathlib import Path

import pandas as pd


def download_medical_qa():
    """Download the Vietnamese Medical Q&A dataset from HuggingFace."""
    try:
        from datasets import load_dataset

        print("📥 Downloading Vietnamese Medical Q&A from HuggingFace...")
        print("   Source: hungnm/vietnamese-medical-qa")
        print("   Size: ~9,335 Q&A pairs")

        # Download dataset
        dataset = load_dataset("hungnm/vietnamese-medical-qa")
        df = dataset['train'].to_pandas()

        print(f"✅ Downloaded: {len(df)} Q&A pairs")

        # Save to CSV
        output_dir = Path("data_mining/datasets")
        output_dir.mkdir(parents=True, exist_ok=True)

        output_path = output_dir / "vietnamese_medical_qa.csv"
        df.to_csv(output_path, index=False, encoding='utf-8')
        print(f"💾 Saved to: {output_path}")

        return df

    except ImportError:
        print("❌ Error: 'datasets' library not installed")
        print("   Install with: pip install datasets")
        return None
    except Exception as e:
        print(f"❌ Error downloading dataset: {e}")
        return None


def is_symptom_question(question):
    """
    Classify whether a question describes SPECIFIC SYMPTOMS.

    Returns:
        bool: True if symptom question, False if general health question
    """
    if not question or not isinstance(question, str):
        return False

    question_lower = question.lower()

    # Symptom keywords (high priority - user describing active symptoms).
    # Note: matching is by substring, so very short keywords such as 'ho'
    # (cough) may also match inside longer words.
    symptom_keywords = [
        # Pain
        'bị đau', 'đau', 'nhức', 'tức', 'đau nhức',
        # Infection/Fever
        'bị sốt', 'sốt', 'viêm', 'nhiễm trùng', 'mủ', 'sưng',
        # Digestive
        'buồn nôn', 'nôn', 'tiêu chảy', 'táo bón', 'đầy hơi',
        'ợ hơi', 'ợ chua', 'khó tiêu',
        # Respiratory
        'ho', 'khó thở', 'nghẹt mũi', 'chảy nước mũi', 'đau họng', 'khàn giọng',
        # Neurological
        'chóng mặt', 'hoa mắt', 'mất thăng bằng', 'đau đầu',
        # Skin
        'ngứa', 'phát ban', 'nổi mẩn', 'đỏ',
        # General symptoms
        'mệt mỏi', 'yếu', 'không khỏe', 'bị ốm', 'khó chịu'
    ]

    # General health keywords (prevention, knowledge, advice)
    general_keywords = [
        # Prevention
        'làm sao để không', 'phòng ngừa', 'tránh', 'cách phòng',
        'làm thế nào để', 'cách nào để',
        # Knowledge questions
        'là gì', 'có phải', 'có nên', 'nên không', 'tại sao',
        'nguyên nhân', 'có thể',
        # Advice/Recommendations ('có nên' already counted above)
        'nên làm gì', 'nên ăn gì', 'có tốt không', 'có được không', 'khuyên'
    ]

    # Count keyword matches
    symptom_score = sum(1 for kw in symptom_keywords if kw in question_lower)
    general_score = sum(1 for kw in general_keywords if kw in question_lower)

    # Decision logic
    if symptom_score > general_score:
        return True   # Symptom question
    elif general_score > symptom_score:
        return False  # General health question
    else:
        # Tie-breaker: 'bị' ("to have/suffer from") suggests an active condition
        return 'bị' in question_lower
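
# Illustrative sketch, not part of the original pipeline: a quick smoke test
# for is_symptom_question() on two made-up questions. The expected labels
# follow directly from the keyword lists above.
def _demo_is_symptom_question():
    """Print the classifier's label for a few hypothetical questions."""
    examples = [
        # Matches 'bị đau', 'đau', 'đau đầu', 'sốt' -> symptom
        "Tôi bị đau đầu và sốt cao, phải làm sao?",
        # Matches 'làm thế nào để', 'phòng ngừa' -> general
        "Làm thế nào để phòng ngừa bệnh tiểu đường?",
    ]
    for q in examples:
        label = "symptom" if is_symptom_question(q) else "general"
        print(f"[{label}] {q}")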

def process_medical_qa():
    """Process the dataset and split it into 2 ChromaDB collections."""
    try:
        from sentence_transformers import SentenceTransformer
        import chromadb

        print("\n🔄 Processing Vietnamese Medical Q&A...")

        # Load CSV
        csv_path = Path("data_mining/datasets/vietnamese_medical_qa.csv")
        if not csv_path.exists():
            print(f"❌ Error: {csv_path} not found")
            return False

        df = pd.read_csv(csv_path, encoding='utf-8')
        print(f"📊 Loaded: {len(df)} Q&A pairs")

        # Initialize embedding model
        print("🤖 Loading embedding model: keepitreal/vietnamese-sbert...")
        embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

        # Initialize ChromaDB output directory
        output_dir = Path("data_mining/output")
        output_dir.mkdir(parents=True, exist_ok=True)

        # Split data
        symptom_data = []
        general_data = []

        print("🔍 Classifying questions...")
        for idx, row in df.iterrows():
            question = str(row['question'])
            answer = str(row['answer'])

            # Combine Q&A into a single document
            text = f"Câu hỏi: {question}\n\nTrả lời: {answer}"

            # Classify
            if is_symptom_question(question):
                symptom_data.append({
                    'id': f'symptom_qa_{idx}',
                    'text': text,
                    'question': question,
                    'answer': answer,
                    'type': 'symptom'
                })
            else:
                general_data.append({
                    'id': f'general_qa_{idx}',
                    'text': text,
                    'question': question,
                    'answer': answer,
                    'type': 'general'
                })

        print("✅ Classification complete:")
        print(f"   - Symptom Q&A: {len(symptom_data)} ({len(symptom_data)/len(df)*100:.1f}%)")
        print(f"   - General Health Q&A: {len(general_data)} ({len(general_data)/len(df)*100:.1f}%)")

        # Create ChromaDB collections

        # 1. Symptom Q&A collection
        print("\n📦 Creating Symptom Q&A ChromaDB...")
        symptom_client = chromadb.PersistentClient(path=str(output_dir / "symptom_qa_chroma"))
        symptom_collection = symptom_client.get_or_create_collection(
            name="symptom_qa",
            metadata={"description": "Vietnamese Medical Q&A - Symptom Questions"}
        )

        # Batch insert symptom data
        batch_size = 100
        for i in range(0, len(symptom_data), batch_size):
            batch = symptom_data[i:i+batch_size]
            ids = [item['id'] for item in batch]
            texts = [item['text'] for item in batch]
            metadatas = [{
                'type': item['type'],
                'domain': 'symptom',
                'agent': 'SymptomAgent',
                'source': 'vietnamese-medical-qa'
            } for item in batch]

            # Generate embeddings
            embeddings = embedder.encode(texts, show_progress_bar=False)

            symptom_collection.add(
                ids=ids,
                embeddings=embeddings.tolist(),
                documents=texts,
                metadatas=metadatas
            )

            if (i + batch_size) % 500 == 0:
                print(f"   Processed {min(i+batch_size, len(symptom_data))}/{len(symptom_data)} symptom Q&A...")

        print(f"✅ Symptom Q&A ChromaDB created: {len(symptom_data)} records")
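        # Optional sanity check -- an illustrative sketch, not required by the
        # pipeline: embed one hypothetical symptom question and confirm the
        # freshly built collection returns a nearest neighbour.
        if symptom_data:
            probe = embedder.encode(["Tôi bị đau bụng và buồn nôn"]).tolist()
            hits = symptom_collection.query(query_embeddings=probe, n_results=1)
            print(f"🔎 Sanity check, closest match: {hits['documents'][0][0][:80]}...")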
        # 2. General Health Q&A collection
        print("\n📦 Creating General Health Q&A ChromaDB...")
        general_client = chromadb.PersistentClient(path=str(output_dir / "general_health_qa_chroma"))
        general_collection = general_client.get_or_create_collection(
            name="general_health_qa",
            metadata={"description": "Vietnamese Medical Q&A - General Health Questions"}
        )

        # Batch insert general data
        for i in range(0, len(general_data), batch_size):
            batch = general_data[i:i+batch_size]
            ids = [item['id'] for item in batch]
            texts = [item['text'] for item in batch]
            metadatas = [{
                'type': item['type'],
                'domain': 'general_health',
                'agent': 'GeneralHealthAgent',
                'source': 'vietnamese-medical-qa'
            } for item in batch]

            # Generate embeddings
            embeddings = embedder.encode(texts, show_progress_bar=False)

            general_collection.add(
                ids=ids,
                embeddings=embeddings.tolist(),
                documents=texts,
                metadatas=metadatas
            )

            if (i + batch_size) % 500 == 0:
                print(f"   Processed {min(i+batch_size, len(general_data))}/{len(general_data)} general Q&A...")

        print(f"✅ General Health Q&A ChromaDB created: {len(general_data)} records")

        print("\n✅ Processing complete!")
        print(f"   Output: {output_dir}")
        print(f"   - symptom_qa_chroma/ ({len(symptom_data)} records)")
        print(f"   - general_health_qa_chroma/ ({len(general_data)} records)")

        return True

    except ImportError as e:
        print(f"❌ Error: Missing library - {e}")
        print("   Install with: pip install sentence-transformers chromadb")
        return False
    except Exception as e:
        print(f"❌ Error processing dataset: {e}")
        import traceback
        traceback.print_exc()
        return False


def main():
    """Main execution"""
    print("=" * 60)
    print("Vietnamese Medical Q&A Dataset Mining")
    print("Source: hungnm/vietnamese-medical-qa (HuggingFace)")
    print("=" * 60)

    # Step 1: Download
    df = download_medical_qa()
    if df is None:
        print("\n❌ Download failed!")
        return False

    # Step 2: Process
    success = process_medical_qa()
    if not success:
        print("\n❌ Processing failed!")
        return False

    print("\n" + "=" * 60)
    print("✅ SUCCESS! Vietnamese Medical Q&A ready for RAG system")
    print("=" * 60)
    return True


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)
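
# ---------------------------------------------------------------------------
# Downstream usage sketch (illustrative, assuming the directory layout this
# script produces): a RAG service would reopen a persisted collection and
# query it with an embedded user question, along the lines of
#
#   import chromadb
#   from sentence_transformers import SentenceTransformer
#
#   client = chromadb.PersistentClient(path="data_mining/output/symptom_qa_chroma")
#   collection = client.get_collection("symptom_qa")
#   embedder = SentenceTransformer('keepitreal/vietnamese-sbert')
#   query_vec = embedder.encode(["Tôi bị ho và sốt"]).tolist()
#   results = collection.query(query_embeddings=query_vec, n_results=5)
# ---------------------------------------------------------------------------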