Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python3 | |
| """ | |
| Mining Script: Vietnamese Medical Q&A Dataset | |
| Downloads and processes hungnm/vietnamese-medical-qa from HuggingFace | |
| Splits into 2 collections: symptom_qa and general_health_qa | |
| """ | |
import re
import sys
from pathlib import Path

import pandas as pd
def download_medical_qa():
    """Download the Vietnamese Medical Q&A dataset and persist it as CSV.

    Fetches hungnm/vietnamese-medical-qa from HuggingFace, converts the
    train split to a DataFrame, and writes it under data_mining/datasets/.

    Returns:
        pandas.DataFrame of Q&A pairs on success, or None when the
        'datasets' library is missing or the download fails.
    """
    try:
        from datasets import load_dataset

        print("📥 Downloading Vietnamese Medical Q&A from HuggingFace...")
        print(" Source: hungnm/vietnamese-medical-qa")
        print(" Size: ~9,335 Q&A pairs")

        # Pull the dataset and keep only the train split as a DataFrame.
        frame = load_dataset("hungnm/vietnamese-medical-qa")["train"].to_pandas()
        print(f"✅ Downloaded: {len(frame)} Q&A pairs")

        # Persist locally so the processing step can run independently.
        target_dir = Path("data_mining/datasets")
        target_dir.mkdir(parents=True, exist_ok=True)
        csv_file = target_dir / "vietnamese_medical_qa.csv"
        frame.to_csv(csv_file, index=False, encoding='utf-8')
        print(f"💾 Saved to: {csv_file}")
        return frame
    except ImportError:
        print("❌ Error: 'datasets' library not installed")
        print(" Install with: pip install datasets")
        return None
    except Exception as exc:
        print(f"❌ Error downloading dataset: {exc}")
        return None
def is_symptom_question(question):
    """
    Classify if question is about SPECIFIC SYMPTOMS.

    Keywords are matched on Unicode word boundaries rather than raw
    substrings: plain `kw in question` false-matched short keywords inside
    unrelated words (e.g. 'ho' inside 'hoặc'/'cho', 'bị' inside 'chuẩn bị'),
    inflating symptom scores and skewing the split.

    Args:
        question: Question text; any non-str or empty value returns False.

    Returns:
        bool: True if symptom question, False if general health question
    """
    if not question or not isinstance(question, str):
        return False
    question_lower = question.lower()

    # Symptom keywords (high priority - user describing active symptoms)
    symptom_keywords = [
        # Pain
        'bị đau', 'đau', 'nhức', 'tức', 'đau nhức',
        # Infection/Fever
        'bị sốt', 'sốt', 'viêm', 'nhiễm trùng', 'mủ', 'sưng',
        # Digestive
        'buồn nôn', 'nôn', 'tiêu chảy', 'táo bón', 'đầy hơi',
        'ợ hơi', 'ợ chua', 'khó tiêu',
        # Respiratory
        'ho', 'khó thở', 'nghẹt mũi', 'chảy nước mũi',
        'đau họng', 'khàn giọng',
        # Neurological
        'chóng mặt', 'hoa mắt', 'mất thăng bằng', 'đau đầu',
        # Skin
        'ngứa', 'phát ban', 'nổi mẩn', 'đỏ',
        # General symptoms
        'mệt mỏi', 'yếu', 'không khỏe', 'bị ốm', 'khó chịu'
    ]

    # General health keywords (prevention, knowledge, advice)
    general_keywords = [
        # Prevention
        'làm sao để không', 'phòng ngừa', 'tránh', 'cách phòng',
        'làm thế nào để', 'cách nào để',
        # Knowledge questions
        'là gì', 'có phải', 'có nên', 'nên không',
        'tại sao', 'nguyên nhân', 'có thể',
        # Advice/Recommendations
        'nên làm gì', 'nên ăn gì', 'có tốt không',
        'có được không', 'có nên', 'khuyên'
    ]

    def _score(keywords):
        # \b is Unicode-aware on str patterns, so it respects Vietnamese
        # letters (đ, ặ, ...) as word characters.
        return sum(
            1 for kw in keywords
            if re.search(r'\b' + re.escape(kw) + r'\b', question_lower)
        )

    symptom_score = _score(symptom_keywords)
    general_score = _score(general_keywords)

    # Decision logic
    if symptom_score > general_score:
        return True  # Symptom question
    if general_score > symptom_score:
        return False  # General health question
    # Tie-breaker: "bị" ("to suffer from") indicates having a condition.
    return re.search(r'\bbị\b', question_lower) is not None
def _classify_rows(df):
    """Split DataFrame rows into (symptom, general) lists of record dicts.

    Each record carries the combined Q&A text plus id/type fields used for
    ChromaDB insertion. Classification delegates to is_symptom_question().
    """
    symptom_data = []
    general_data = []
    for idx, row in df.iterrows():
        question = str(row['question'])
        answer = str(row['answer'])
        # Combine Q&A into one retrievable document.
        text = f"Câu hỏi: {question}\n\nTrả lời: {answer}"
        if is_symptom_question(question):
            symptom_data.append({
                'id': f'symptom_qa_{idx}',
                'text': text,
                'question': question,
                'answer': answer,
                'type': 'symptom'
            })
        else:
            general_data.append({
                'id': f'general_qa_{idx}',
                'text': text,
                'question': question,
                'answer': answer,
                'type': 'general'
            })
    return symptom_data, general_data


def _insert_records(collection, embedder, records, domain, agent, label,
                    batch_size=100):
    """Embed and insert records into a ChromaDB collection in batches.

    Args:
        collection: Target ChromaDB collection.
        embedder: SentenceTransformer used to encode documents.
        records: List of dicts with 'id', 'text', 'type' keys.
        domain: Metadata domain tag (e.g. 'symptom').
        agent: Metadata agent tag (e.g. 'SymptomAgent').
        label: Human-readable label for progress messages.
        batch_size: Records per insert; progress prints every 500 records.
    """
    for start in range(0, len(records), batch_size):
        batch = records[start:start + batch_size]
        texts = [item['text'] for item in batch]
        # Generate embeddings for the whole batch at once.
        embeddings = embedder.encode(texts, show_progress_bar=False)
        collection.add(
            ids=[item['id'] for item in batch],
            embeddings=embeddings.tolist(),
            documents=texts,
            metadatas=[{
                'type': item['type'],
                'domain': domain,
                'agent': agent,
                'source': 'vietnamese-medical-qa'
            } for item in batch],
        )
        if (start + batch_size) % 500 == 0:
            done = min(start + batch_size, len(records))
            print(f" Processed {done}/{len(records)} {label}...")


def process_medical_qa():
    """Process and split into 2 ChromaDB collections.

    Reads the previously downloaded CSV, classifies each Q&A pair as
    symptom vs. general health, and builds two persistent ChromaDB
    collections with vietnamese-sbert embeddings.

    Returns:
        bool: True on success, False on missing libraries/data or errors.
    """
    try:
        from sentence_transformers import SentenceTransformer
        import chromadb

        print("\n🔄 Processing Vietnamese Medical Q&A...")

        # Load CSV produced by download_medical_qa().
        csv_path = Path("data_mining/datasets/vietnamese_medical_qa.csv")
        if not csv_path.exists():
            print(f"❌ Error: {csv_path} not found")
            return False
        df = pd.read_csv(csv_path, encoding='utf-8')
        print(f"📊 Loaded: {len(df)} Q&A pairs")
        # Guard: an empty dataset would otherwise divide by zero below.
        if df.empty:
            print("❌ Error: dataset is empty")
            return False

        # Initialize embedding model.
        print("🤖 Loading embedding model: keepitreal/vietnamese-sbert...")
        embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

        output_dir = Path("data_mining/output")
        output_dir.mkdir(parents=True, exist_ok=True)

        print("🔍 Classifying questions...")
        symptom_data, general_data = _classify_rows(df)
        print(f"✅ Classification complete:")
        print(f" - Symptom Q&A: {len(symptom_data)} ({len(symptom_data)/len(df)*100:.1f}%)")
        print(f" - General Health Q&A: {len(general_data)} ({len(general_data)/len(df)*100:.1f}%)")

        # 1. Symptom Q&A Collection
        print("\n📦 Creating Symptom Q&A ChromaDB...")
        symptom_client = chromadb.PersistentClient(
            path=str(output_dir / "symptom_qa_chroma"))
        symptom_collection = symptom_client.get_or_create_collection(
            name="symptom_qa",
            metadata={"description": "Vietnamese Medical Q&A - Symptom Questions"}
        )
        _insert_records(symptom_collection, embedder, symptom_data,
                        'symptom', 'SymptomAgent', 'symptom Q&A')
        print(f"✅ Symptom Q&A ChromaDB created: {len(symptom_data)} records")

        # 2. General Health Q&A Collection
        print("\n📦 Creating General Health Q&A ChromaDB...")
        general_client = chromadb.PersistentClient(
            path=str(output_dir / "general_health_qa_chroma"))
        general_collection = general_client.get_or_create_collection(
            name="general_health_qa",
            metadata={"description": "Vietnamese Medical Q&A - General Health Questions"}
        )
        _insert_records(general_collection, embedder, general_data,
                        'general_health', 'GeneralHealthAgent', 'general Q&A')
        print(f"✅ General Health Q&A ChromaDB created: {len(general_data)} records")

        print("\n✅ Processing complete!")
        print(f" Output: {output_dir}")
        print(f" - symptom_qa_chroma/ ({len(symptom_data)} records)")
        print(f" - general_health_qa_chroma/ ({len(general_data)} records)")
        return True
    except ImportError as e:
        print(f"❌ Error: Missing library - {e}")
        print(" Install with: pip install sentence-transformers chromadb")
        return False
    except Exception as e:
        print(f"❌ Error processing dataset: {e}")
        import traceback
        traceback.print_exc()
        return False
def main():
    """Main execution"""
    banner = "=" * 60
    print(banner)
    print("Vietnamese Medical Q&A Dataset Mining")
    print("Source: hungnm/vietnamese-medical-qa (HuggingFace)")
    print(banner)

    # Step 1: Download — abort early if the dataset could not be fetched.
    if download_medical_qa() is None:
        print("\n❌ Download failed!")
        return False

    # Step 2: Process — build the two ChromaDB collections.
    if not process_medical_qa():
        print("\n❌ Processing failed!")
        return False

    print("\n" + banner)
    print("✅ SUCCESS! Vietnamese Medical Q&A ready for RAG system")
    print(banner)
    return True
# Script entry point: exit status 0 on success, 1 on any failure.
if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)