#!/usr/bin/env python3
"""
Mining Script: Vietnamese Medical Q&A Dataset
Downloads and processes hungnm/vietnamese-medical-qa from HuggingFace
Splits into 2 collections: symptom_qa and general_health_qa
"""
import sys
import pandas as pd
from pathlib import Path


def download_medical_qa():
    """Download the Vietnamese Medical Q&A dataset from HuggingFace."""
    try:
        from datasets import load_dataset

        print("📥 Downloading Vietnamese Medical Q&A from HuggingFace...")
        print("   Source: hungnm/vietnamese-medical-qa")
        print("   Size: ~9,335 Q&A pairs")

        # Download dataset and convert the train split to a DataFrame
        dataset = load_dataset("hungnm/vietnamese-medical-qa")
        df = dataset['train'].to_pandas()
        print(f"✅ Downloaded: {len(df)} Q&A pairs")

        # Save to CSV for the processing step
        output_dir = Path("data_mining/datasets")
        output_dir.mkdir(parents=True, exist_ok=True)
        output_path = output_dir / "vietnamese_medical_qa.csv"
        df.to_csv(output_path, index=False, encoding='utf-8')
        print(f"💾 Saved to: {output_path}")
        return df
    except ImportError:
        print("❌ Error: 'datasets' library not installed")
        print("   Install with: pip install datasets")
        return None
    except Exception as e:
        print(f"❌ Error downloading dataset: {e}")
        return None
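

# Note: the rest of the pipeline assumes the downloaded dataset exposes
# 'question' and 'answer' columns, which is what process_medical_qa() reads.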


def is_symptom_question(question):
    """
    Classify whether a question describes SPECIFIC SYMPTOMS.

    Returns:
        bool: True for a symptom question, False for a general health question
    """
    if not question or not isinstance(question, str):
        return False
    question_lower = question.lower()

    # Symptom keywords (high priority - user describing active symptoms)
    symptom_keywords = [
        # Pain
        'bị đau', 'đau', 'nhức', 'tức', 'đau nhức',
        # Infection/Fever
        'bị sốt', 'sốt', 'viêm', 'nhiễm trùng', 'mủ', 'sưng',
        # Digestive
        'buồn nôn', 'nôn', 'tiêu chảy', 'táo bón', 'đầy hơi',
        'ợ hơi', 'ợ chua', 'khó tiêu',
        # Respiratory
        'ho', 'khó thở', 'nghẹt mũi', 'chảy nước mũi',
        'đau họng', 'khàn giọng',
        # Neurological
        'chóng mặt', 'hoa mắt', 'mất thăng bằng', 'đau đầu',
        # Skin
        'ngứa', 'phát ban', 'nổi mẩn', 'đỏ',
        # General symptoms
        'mệt mỏi', 'yếu', 'không khỏe', 'bị ốm', 'khó chịu'
    ]

    # General health keywords (prevention, knowledge, advice)
    general_keywords = [
        # Prevention
        'làm sao để không', 'phòng ngừa', 'tránh', 'cách phòng',
        'làm thế nào để', 'cách nào để',
        # Knowledge questions
        'là gì', 'có phải', 'có nên', 'nên không',
        'tại sao', 'nguyên nhân', 'có thể',
        # Advice/Recommendations
        'nên làm gì', 'nên ăn gì', 'có tốt không',
        'có được không', 'khuyên'
    ]

    # Count keyword matches. Matching is naive substring containment, so
    # overlapping phrases ('bị đau' and 'đau') both count, effectively
    # weighting stronger phrases, and short keywords like 'ho' can also
    # match inside unrelated words.
    symptom_score = sum(1 for kw in symptom_keywords if kw in question_lower)
    general_score = sum(1 for kw in general_keywords if kw in question_lower)

    # Decision logic
    if symptom_score > general_score:
        return True  # Symptom question
    elif general_score > symptom_score:
        return False  # General health question
    else:
        # Tie-breaker: "bị" ("suffering from") suggests an active condition
        return 'bị' in question_lower
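

# Illustrative sanity checks for the classifier (made-up questions, not
# drawn from the dataset):
#   is_symptom_question("Tôi bị đau đầu và sốt")   # -> True
#       matches 'bị đau', 'đau', 'đau đầu', 'sốt'; no general keywords
#   is_symptom_question("Ăn chay có tốt không?")   # -> False
#       matches only 'có tốt không' from the general keywords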


def process_medical_qa():
    """Process the CSV and split it into 2 ChromaDB collections."""
    try:
        from sentence_transformers import SentenceTransformer
        import chromadb

        print("\n🔄 Processing Vietnamese Medical Q&A...")

        # Load CSV
        csv_path = Path("data_mining/datasets/vietnamese_medical_qa.csv")
        if not csv_path.exists():
            print(f"❌ Error: {csv_path} not found")
            return False
        df = pd.read_csv(csv_path, encoding='utf-8')
        print(f"📊 Loaded: {len(df)} Q&A pairs")

        # Initialize embedding model
        print("🤖 Loading embedding model: keepitreal/vietnamese-sbert...")
        embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

        # Prepare output directory for the ChromaDB stores
        output_dir = Path("data_mining/output")
        output_dir.mkdir(parents=True, exist_ok=True)

        # Split data
        symptom_data = []
        general_data = []
        print("🔍 Classifying questions...")
        for idx, row in df.iterrows():
            question = str(row['question'])
            answer = str(row['answer'])
            # Combine Q&A into a single document for embedding
            text = f"Câu hỏi: {question}\n\nTrả lời: {answer}"
            # Classify and route to the matching collection
            if is_symptom_question(question):
                symptom_data.append({
                    'id': f'symptom_qa_{idx}',
                    'text': text,
                    'question': question,
                    'answer': answer,
                    'type': 'symptom'
                })
            else:
                general_data.append({
                    'id': f'general_qa_{idx}',
                    'text': text,
                    'question': question,
                    'answer': answer,
                    'type': 'general'
                })

        print("✅ Classification complete:")
        print(f"   - Symptom Q&A: {len(symptom_data)} ({len(symptom_data)/len(df)*100:.1f}%)")
        print(f"   - General Health Q&A: {len(general_data)} ({len(general_data)/len(df)*100:.1f}%)")

        # Create ChromaDB collections
        # 1. Symptom Q&A Collection
        print("\n📦 Creating Symptom Q&A ChromaDB...")
        symptom_client = chromadb.PersistentClient(path=str(output_dir / "symptom_qa_chroma"))
        symptom_collection = symptom_client.get_or_create_collection(
            name="symptom_qa",
            metadata={"description": "Vietnamese Medical Q&A - Symptom Questions"}
        )

        # Batch insert symptom data
        batch_size = 100
        for i in range(0, len(symptom_data), batch_size):
            batch = symptom_data[i:i+batch_size]
            ids = [item['id'] for item in batch]
            texts = [item['text'] for item in batch]
            metadatas = [{
                'type': item['type'],
                'domain': 'symptom',
                'agent': 'SymptomAgent',
                'source': 'vietnamese-medical-qa'
            } for item in batch]
            # Generate embeddings for the batch
            embeddings = embedder.encode(texts, show_progress_bar=False)
            symptom_collection.add(
                ids=ids,
                embeddings=embeddings.tolist(),
                documents=texts,
                metadatas=metadatas
            )
            # Progress update every 500 records
            if (i + batch_size) % 500 == 0:
                print(f"   Processed {min(i+batch_size, len(symptom_data))}/{len(symptom_data)} symptom Q&A...")
        print(f"✅ Symptom Q&A ChromaDB created: {len(symptom_data)} records")

        # 2. General Health Q&A Collection
        print("\n📦 Creating General Health Q&A ChromaDB...")
        general_client = chromadb.PersistentClient(path=str(output_dir / "general_health_qa_chroma"))
        general_collection = general_client.get_or_create_collection(
            name="general_health_qa",
            metadata={"description": "Vietnamese Medical Q&A - General Health Questions"}
        )

        # Batch insert general data
        for i in range(0, len(general_data), batch_size):
            batch = general_data[i:i+batch_size]
            ids = [item['id'] for item in batch]
            texts = [item['text'] for item in batch]
            metadatas = [{
                'type': item['type'],
                'domain': 'general_health',
                'agent': 'GeneralHealthAgent',
                'source': 'vietnamese-medical-qa'
            } for item in batch]
            # Generate embeddings for the batch
            embeddings = embedder.encode(texts, show_progress_bar=False)
            general_collection.add(
                ids=ids,
                embeddings=embeddings.tolist(),
                documents=texts,
                metadatas=metadatas
            )
            # Progress update every 500 records
            if (i + batch_size) % 500 == 0:
                print(f"   Processed {min(i+batch_size, len(general_data))}/{len(general_data)} general Q&A...")
        print(f"✅ General Health Q&A ChromaDB created: {len(general_data)} records")

        print("\n✅ Processing complete!")
        print(f"   Output: {output_dir}")
        print(f"   - symptom_qa_chroma/ ({len(symptom_data)} records)")
        print(f"   - general_health_qa_chroma/ ({len(general_data)} records)")
        return True
    except ImportError as e:
        print(f"❌ Error: Missing library - {e}")
        print("   Install with: pip install sentence-transformers chromadb")
        return False
    except Exception as e:
        print(f"❌ Error processing dataset: {e}")
        import traceback
        traceback.print_exc()
        return False


def main():
    """Main execution"""
    print("=" * 60)
    print("Vietnamese Medical Q&A Dataset Mining")
    print("Source: hungnm/vietnamese-medical-qa (HuggingFace)")
    print("=" * 60)

    # Step 1: Download
    df = download_medical_qa()
    if df is None:
        print("\n❌ Download failed!")
        return False

    # Step 2: Process
    success = process_medical_qa()
    if not success:
        print("\n❌ Processing failed!")
        return False

    print("\n" + "=" * 60)
    print("✅ SUCCESS! Vietnamese Medical Q&A ready for RAG system")
    print("=" * 60)
    return True
if __name__ == "__main__":
success = main()
sys.exit(0 if success else 1)
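

# A minimal sketch of querying the stores this script produces (for the RAG
# side; the query text is illustrative, while the path, collection name, and
# embedding model match what is created above):
#
#   import chromadb
#   from sentence_transformers import SentenceTransformer
#
#   client = chromadb.PersistentClient(path="data_mining/output/symptom_qa_chroma")
#   collection = client.get_collection("symptom_qa")
#   embedder = SentenceTransformer('keepitreal/vietnamese-sbert')
#   query_vec = embedder.encode(["Tôi bị đau bụng và buồn nôn"]).tolist()
#   results = collection.query(query_embeddings=query_vec, n_results=3)
#   # results['documents'][0] holds the 3 nearest Q&A documents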