File size: 10,183 Bytes
eeb0f9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
#!/usr/bin/env python3
"""
Mining Script: Vietnamese Medical Q&A Dataset
Downloads and processes hungnm/vietnamese-medical-qa from HuggingFace
Splits into 2 collections: symptom_qa and general_health_qa
"""

import sys
import pandas as pd
from pathlib import Path

def download_medical_qa():
    """Fetch the hungnm/vietnamese-medical-qa dataset and cache it locally.

    Downloads the 'train' split from HuggingFace, writes it to
    data_mining/datasets/vietnamese_medical_qa.csv, and returns it.

    Returns:
        pandas.DataFrame: the downloaded Q&A pairs, or None if the
        'datasets' library is missing or the download fails.
    """
    try:
        from datasets import load_dataset

        print("📥 Downloading Vietnamese Medical Q&A from HuggingFace...")
        print("   Source: hungnm/vietnamese-medical-qa")
        print("   Size: ~9,335 Q&A pairs")

        # Fetch and convert the train split in one go.
        frame = load_dataset("hungnm/vietnamese-medical-qa")["train"].to_pandas()
        print(f"✅ Downloaded: {len(frame)} Q&A pairs")

        # Persist a CSV copy so processing can run without re-downloading.
        dest_dir = Path("data_mining/datasets")
        dest_dir.mkdir(parents=True, exist_ok=True)
        csv_path = dest_dir / "vietnamese_medical_qa.csv"
        frame.to_csv(csv_path, index=False, encoding='utf-8')

        print(f"💾 Saved to: {csv_path}")
        return frame

    except ImportError:
        print("❌ Error: 'datasets' library not installed")
        print("   Install with: pip install datasets")
        return None
    except Exception as e:
        print(f"❌ Error downloading dataset: {e}")
        return None


def is_symptom_question(question):
    """Classify whether a Vietnamese question describes SPECIFIC SYMPTOMS.

    Counts keyword hits from two curated lists (symptom vs. general-health)
    via substring matching on the lowercased question, and picks the side
    with more hits. Ties fall back to checking for 'bị' ("having/suffering"),
    which signals the asker has an active condition.

    Args:
        question: The question text; non-string or empty input returns False.

    Returns:
        bool: True if classified as a symptom question, False if it is a
        general health question (or the input is unusable).
    """
    if not question or not isinstance(question, str):
        return False

    question_lower = question.lower()

    # Symptom keywords (high priority - user describing active symptoms).
    # NOTE(review): matching is plain substring, so short keywords like 'ho'
    # can fire inside unrelated words (e.g. 'cho', 'hoa') — TODO confirm
    # whether word-boundary matching is wanted before tightening.
    symptom_keywords = [
        # Pain
        'bị đau', 'đau', 'nhức', 'tức', 'đau nhức',

        # Infection/Fever
        'bị sốt', 'sốt', 'viêm', 'nhiễm trùng', 'mủ', 'sưng',

        # Digestive
        'buồn nôn', 'nôn', 'tiêu chảy', 'táo bón', 'đầy hơi',
        'ợ hơi', 'ợ chua', 'khó tiêu',

        # Respiratory
        'ho', 'khó thở', 'nghẹt mũi', 'chảy nước mũi',
        'đau họng', 'khàn giọng',

        # Neurological
        'chóng mặt', 'hoa mắt', 'mất thăng bằng', 'đau đầu',

        # Skin
        'ngứa', 'phát ban', 'nổi mẩn', 'đỏ',

        # General symptoms
        'mệt mỏi', 'yếu', 'không khỏe', 'bị ốm', 'khó chịu'
    ]

    # General health keywords (prevention, knowledge, advice).
    # BUGFIX: 'có nên' was listed twice (Prevention-adjacent and Advice
    # groups), double-counting it and skewing general_score; deduplicated.
    general_keywords = [
        # Prevention
        'làm sao để không', 'phòng ngừa', 'tránh', 'cách phòng',
        'làm thế nào để', 'cách nào để',

        # Knowledge questions
        'là gì', 'có phải', 'có nên', 'nên không',
        'tại sao', 'nguyên nhân', 'có thể',

        # Advice/Recommendations
        'nên làm gì', 'nên ăn gì', 'có tốt không',
        'có được không', 'khuyên'
    ]

    # One point per distinct keyword found (bool sums as 0/1).
    symptom_score = sum(kw in question_lower for kw in symptom_keywords)
    general_score = sum(kw in question_lower for kw in general_keywords)

    if symptom_score > general_score:
        return True  # Symptom question
    if general_score > symptom_score:
        return False  # General health question
    # Tie-breaker: 'bị' indicates the asker already has a condition.
    return 'bị' in question_lower


def _index_collection(embedder, store_path, name, description, records,
                      domain, agent, label, batch_size=100):
    """Embed records and insert them into one persistent ChromaDB collection.

    Args:
        embedder: SentenceTransformer used to encode each record's 'text'.
        store_path: Directory (Path or str) for the PersistentClient store.
        name: Collection name inside the store.
        description: Human-readable description stored as collection metadata.
        records: List of dicts with 'id', 'text', 'type' keys.
        domain: Value for the per-record 'domain' metadata field.
        agent: Value for the per-record 'agent' metadata field.
        label: Short label used in progress messages (e.g. 'symptom Q&A').
        batch_size: Records per insert batch (keyword-tunable; default 100).
    """
    import chromadb  # local import mirrors the script's lazy-dependency style

    client = chromadb.PersistentClient(path=str(store_path))
    collection = client.get_or_create_collection(
        name=name,
        metadata={"description": description}
    )

    for i in range(0, len(records), batch_size):
        batch = records[i:i + batch_size]

        ids = [item['id'] for item in batch]
        texts = [item['text'] for item in batch]
        metadatas = [{
            'type': item['type'],
            'domain': domain,
            'agent': agent,
            'source': 'vietnamese-medical-qa'
        } for item in batch]

        # Generate embeddings for this batch only (bounds memory use).
        embeddings = embedder.encode(texts, show_progress_bar=False)

        collection.add(
            ids=ids,
            embeddings=embeddings.tolist(),
            documents=texts,
            metadatas=metadatas
        )

        # Progress line every 500 records (multiples of batch_size only).
        if (i + batch_size) % 500 == 0:
            print(f"   Processed {min(i+batch_size, len(records))}/{len(records)} {label}...")


def process_medical_qa():
    """Process the cached CSV and split it into 2 ChromaDB collections.

    Classifies each Q&A pair with is_symptom_question(), then builds two
    persistent ChromaDB stores under data_mining/output/:
    symptom_qa_chroma (SymptomAgent) and general_health_qa_chroma
    (GeneralHealthAgent), embedding with keepitreal/vietnamese-sbert.

    Returns:
        bool: True on success, False if the CSV is missing, a dependency is
        not installed, or any processing step raises.
    """
    try:
        from sentence_transformers import SentenceTransformer
        # Imported here (not only in the helper) so a missing chromadb is
        # reported up-front with the install hint below.
        import chromadb

        print("\n🔄 Processing Vietnamese Medical Q&A...")

        # Load CSV produced by download_medical_qa().
        csv_path = Path("data_mining/datasets/vietnamese_medical_qa.csv")
        if not csv_path.exists():
            print(f"❌ Error: {csv_path} not found")
            return False

        df = pd.read_csv(csv_path, encoding='utf-8')
        print(f"📊 Loaded: {len(df)} Q&A pairs")

        # Initialize embedding model (Vietnamese sentence-BERT).
        print("🤖 Loading embedding model: keepitreal/vietnamese-sbert...")
        embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

        output_dir = Path("data_mining/output")
        output_dir.mkdir(parents=True, exist_ok=True)

        # Split data into the two target collections.
        symptom_data = []
        general_data = []

        print("🔍 Classifying questions...")
        for idx, row in df.iterrows():
            question = str(row['question'])
            answer = str(row['answer'])

            # Combine Q&A into one retrievable document.
            text = f"Câu hỏi: {question}\n\nTrả lời: {answer}"

            record = {
                'text': text,
                'question': question,
                'answer': answer,
            }
            if is_symptom_question(question):
                record['id'] = f'symptom_qa_{idx}'
                record['type'] = 'symptom'
                symptom_data.append(record)
            else:
                record['id'] = f'general_qa_{idx}'
                record['type'] = 'general'
                general_data.append(record)

        print(f"✅ Classification complete:")
        print(f"   - Symptom Q&A: {len(symptom_data)} ({len(symptom_data)/len(df)*100:.1f}%)")
        print(f"   - General Health Q&A: {len(general_data)} ({len(general_data)/len(df)*100:.1f}%)")

        # Build both collections via the shared helper (was duplicated code).
        print("\n📦 Creating Symptom Q&A ChromaDB...")
        _index_collection(
            embedder, output_dir / "symptom_qa_chroma", "symptom_qa",
            "Vietnamese Medical Q&A - Symptom Questions",
            symptom_data, 'symptom', 'SymptomAgent', 'symptom Q&A'
        )
        print(f"✅ Symptom Q&A ChromaDB created: {len(symptom_data)} records")

        print("\n📦 Creating General Health Q&A ChromaDB...")
        _index_collection(
            embedder, output_dir / "general_health_qa_chroma", "general_health_qa",
            "Vietnamese Medical Q&A - General Health Questions",
            general_data, 'general_health', 'GeneralHealthAgent', 'general Q&A'
        )
        print(f"✅ General Health Q&A ChromaDB created: {len(general_data)} records")

        print("\n✅ Processing complete!")
        print(f"   Output: {output_dir}")
        print(f"   - symptom_qa_chroma/ ({len(symptom_data)} records)")
        print(f"   - general_health_qa_chroma/ ({len(general_data)} records)")

        return True

    except ImportError as e:
        print(f"❌ Error: Missing library - {e}")
        print("   Install with: pip install sentence-transformers chromadb")
        return False
    except Exception as e:
        print(f"❌ Error processing dataset: {e}")
        import traceback
        traceback.print_exc()
        return False


def main():
    """Run the pipeline: download the dataset, then build the ChromaDB stores.

    Returns:
        bool: True if both steps succeeded, False otherwise.
    """
    banner = "=" * 60
    print(banner)
    print("Vietnamese Medical Q&A Dataset Mining")
    print("Source: hungnm/vietnamese-medical-qa (HuggingFace)")
    print(banner)

    # Step 1: Download — bail out early if it failed.
    if download_medical_qa() is None:
        print("\n❌ Download failed!")
        return False

    # Step 2: Process into the two collections.
    if not process_medical_qa():
        print("\n❌ Processing failed!")
        return False

    print("\n" + banner)
    print("✅ SUCCESS! Vietnamese Medical Q&A ready for RAG system")
    print(banner)
    return True


if __name__ == "__main__":
    # Exit code 0 on success, 1 on any failure (usable from shell/CI).
    success = main()
    sys.exit(0 if success else 1)