File size: 10,183 Bytes
eeb0f9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
#!/usr/bin/env python3
"""
Mining Script: Vietnamese Medical Q&A Dataset
Downloads and processes hungnm/vietnamese-medical-qa from HuggingFace
Splits into 2 collections: symptom_qa and general_health_qa
"""

import sys
import pandas as pd
from pathlib import Path

def download_medical_qa():
    """Fetch the hungnm/vietnamese-medical-qa dataset and cache it locally.

    Downloads the 'train' split from HuggingFace, writes it to
    data_mining/datasets/vietnamese_medical_qa.csv, and returns it.

    Returns:
        pandas.DataFrame: the downloaded Q&A pairs, or None if the
        'datasets' library is missing or the download fails.
    """
    try:
        from datasets import load_dataset

        print("📥 Downloading Vietnamese Medical Q&A from HuggingFace...")
        print("   Source: hungnm/vietnamese-medical-qa")
        print("   Size: ~9,335 Q&A pairs")

        # Fetch and convert the train split in one go.
        frame = load_dataset("hungnm/vietnamese-medical-qa")["train"].to_pandas()
        print(f"✅ Downloaded: {len(frame)} Q&A pairs")

        # Persist a CSV copy so processing can run without re-downloading.
        dest_dir = Path("data_mining/datasets")
        dest_dir.mkdir(parents=True, exist_ok=True)
        csv_path = dest_dir / "vietnamese_medical_qa.csv"
        frame.to_csv(csv_path, index=False, encoding='utf-8')

        print(f"💾 Saved to: {csv_path}")
        return frame

    except ImportError:
        print("❌ Error: 'datasets' library not installed")
        print("   Install with: pip install datasets")
        return None
    except Exception as e:
        print(f"❌ Error downloading dataset: {e}")
        return None


def is_symptom_question(question):
    """Classify whether a Vietnamese question describes SPECIFIC SYMPTOMS.

    Counts keyword hits from two curated lists (symptom vs. general-health)
    via substring matching on the lowercased question, and picks the side
    with more hits. Ties fall back to checking for 'bị' ("having/suffering"),
    which signals the asker has an active condition.

    Args:
        question: The question text; non-string or empty input returns False.

    Returns:
        bool: True if classified as a symptom question, False if it is a
        general health question (or the input is unusable).
    """
    if not question or not isinstance(question, str):
        return False

    question_lower = question.lower()

    # Symptom keywords (high priority - user describing active symptoms).
    # NOTE(review): matching is plain substring, so short keywords like 'ho'
    # can fire inside unrelated words (e.g. 'cho', 'hoa') — TODO confirm
    # whether word-boundary matching is wanted before tightening.
    symptom_keywords = [
        # Pain
        'bị đau', 'đau', 'nhức', 'tức', 'đau nhức',

        # Infection/Fever
        'bị sốt', 'sốt', 'viêm', 'nhiễm trùng', 'mủ', 'sưng',

        # Digestive
        'buồn nôn', 'nôn', 'tiêu chảy', 'táo bón', 'đầy hơi',
        'ợ hơi', 'ợ chua', 'khó tiêu',

        # Respiratory
        'ho', 'khó thở', 'nghẹt mũi', 'chảy nước mũi',
        'đau họng', 'khàn giọng',

        # Neurological
        'chóng mặt', 'hoa mắt', 'mất thăng bằng', 'đau đầu',

        # Skin
        'ngứa', 'phát ban', 'nổi mẩn', 'đỏ',

        # General symptoms
        'mệt mỏi', 'yếu', 'không khỏe', 'bị ốm', 'khó chịu'
    ]

    # General health keywords (prevention, knowledge, advice).
    # BUGFIX: 'có nên' was listed twice (Prevention-adjacent and Advice
    # groups), double-counting it and skewing general_score; deduplicated.
    general_keywords = [
        # Prevention
        'làm sao để không', 'phòng ngừa', 'tránh', 'cách phòng',
        'làm thế nào để', 'cách nào để',

        # Knowledge questions
        'là gì', 'có phải', 'có nên', 'nên không',
        'tại sao', 'nguyên nhân', 'có thể',

        # Advice/Recommendations
        'nên làm gì', 'nên ăn gì', 'có tốt không',
        'có được không', 'khuyên'
    ]

    # One point per distinct keyword found (bool sums as 0/1).
    symptom_score = sum(kw in question_lower for kw in symptom_keywords)
    general_score = sum(kw in question_lower for kw in general_keywords)

    if symptom_score > general_score:
        return True  # Symptom question
    if general_score > symptom_score:
        return False  # General health question
    # Tie-breaker: 'bị' indicates the asker already has a condition.
    return 'bị' in question_lower


def _index_collection(embedder, store_path, name, description, records,
                      domain, agent, label, batch_size=100):
    """Embed records and insert them into one persistent ChromaDB collection.

    Args:
        embedder: SentenceTransformer used to encode each record's 'text'.
        store_path: Directory (Path or str) for the PersistentClient store.
        name: Collection name inside the store.
        description: Human-readable description stored as collection metadata.
        records: List of dicts with 'id', 'text', 'type' keys.
        domain: Value for the per-record 'domain' metadata field.
        agent: Value for the per-record 'agent' metadata field.
        label: Short label used in progress messages (e.g. 'symptom Q&A').
        batch_size: Records per insert batch (keyword-tunable; default 100).
    """
    import chromadb  # local import mirrors the script's lazy-dependency style

    client = chromadb.PersistentClient(path=str(store_path))
    collection = client.get_or_create_collection(
        name=name,
        metadata={"description": description}
    )

    for i in range(0, len(records), batch_size):
        batch = records[i:i + batch_size]

        ids = [item['id'] for item in batch]
        texts = [item['text'] for item in batch]
        metadatas = [{
            'type': item['type'],
            'domain': domain,
            'agent': agent,
            'source': 'vietnamese-medical-qa'
        } for item in batch]

        # Generate embeddings for this batch only (bounds memory use).
        embeddings = embedder.encode(texts, show_progress_bar=False)

        collection.add(
            ids=ids,
            embeddings=embeddings.tolist(),
            documents=texts,
            metadatas=metadatas
        )

        # Progress line every 500 records (multiples of batch_size only).
        if (i + batch_size) % 500 == 0:
            print(f"   Processed {min(i+batch_size, len(records))}/{len(records)} {label}...")


def process_medical_qa():
    """Process the cached CSV and split it into 2 ChromaDB collections.

    Classifies each Q&A pair with is_symptom_question(), then builds two
    persistent ChromaDB stores under data_mining/output/:
    symptom_qa_chroma (SymptomAgent) and general_health_qa_chroma
    (GeneralHealthAgent), embedding with keepitreal/vietnamese-sbert.

    Returns:
        bool: True on success, False if the CSV is missing, a dependency is
        not installed, or any processing step raises.
    """
    try:
        from sentence_transformers import SentenceTransformer
        # Imported here (not only in the helper) so a missing chromadb is
        # reported up-front with the install hint below.
        import chromadb

        print("\n🔄 Processing Vietnamese Medical Q&A...")

        # Load CSV produced by download_medical_qa().
        csv_path = Path("data_mining/datasets/vietnamese_medical_qa.csv")
        if not csv_path.exists():
            print(f"❌ Error: {csv_path} not found")
            return False

        df = pd.read_csv(csv_path, encoding='utf-8')
        print(f"📊 Loaded: {len(df)} Q&A pairs")

        # Initialize embedding model (Vietnamese sentence-BERT).
        print("🤖 Loading embedding model: keepitreal/vietnamese-sbert...")
        embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

        output_dir = Path("data_mining/output")
        output_dir.mkdir(parents=True, exist_ok=True)

        # Split data into the two target collections.
        symptom_data = []
        general_data = []

        print("🔍 Classifying questions...")
        for idx, row in df.iterrows():
            question = str(row['question'])
            answer = str(row['answer'])

            # Combine Q&A into one retrievable document.
            text = f"Câu hỏi: {question}\n\nTrả lời: {answer}"

            record = {
                'text': text,
                'question': question,
                'answer': answer,
            }
            if is_symptom_question(question):
                record['id'] = f'symptom_qa_{idx}'
                record['type'] = 'symptom'
                symptom_data.append(record)
            else:
                record['id'] = f'general_qa_{idx}'
                record['type'] = 'general'
                general_data.append(record)

        print(f"✅ Classification complete:")
        print(f"   - Symptom Q&A: {len(symptom_data)} ({len(symptom_data)/len(df)*100:.1f}%)")
        print(f"   - General Health Q&A: {len(general_data)} ({len(general_data)/len(df)*100:.1f}%)")

        # Build both collections via the shared helper (was duplicated code).
        print("\n📦 Creating Symptom Q&A ChromaDB...")
        _index_collection(
            embedder, output_dir / "symptom_qa_chroma", "symptom_qa",
            "Vietnamese Medical Q&A - Symptom Questions",
            symptom_data, 'symptom', 'SymptomAgent', 'symptom Q&A'
        )
        print(f"✅ Symptom Q&A ChromaDB created: {len(symptom_data)} records")

        print("\n📦 Creating General Health Q&A ChromaDB...")
        _index_collection(
            embedder, output_dir / "general_health_qa_chroma", "general_health_qa",
            "Vietnamese Medical Q&A - General Health Questions",
            general_data, 'general_health', 'GeneralHealthAgent', 'general Q&A'
        )
        print(f"✅ General Health Q&A ChromaDB created: {len(general_data)} records")

        print("\n✅ Processing complete!")
        print(f"   Output: {output_dir}")
        print(f"   - symptom_qa_chroma/ ({len(symptom_data)} records)")
        print(f"   - general_health_qa_chroma/ ({len(general_data)} records)")

        return True

    except ImportError as e:
        print(f"❌ Error: Missing library - {e}")
        print("   Install with: pip install sentence-transformers chromadb")
        return False
    except Exception as e:
        print(f"❌ Error processing dataset: {e}")
        import traceback
        traceback.print_exc()
        return False


def main():
    """Run the pipeline: download the dataset, then build the ChromaDB stores.

    Returns:
        bool: True if both steps succeeded, False otherwise.
    """
    banner = "=" * 60
    print(banner)
    print("Vietnamese Medical Q&A Dataset Mining")
    print("Source: hungnm/vietnamese-medical-qa (HuggingFace)")
    print(banner)

    # Step 1: Download — bail out early if it failed.
    if download_medical_qa() is None:
        print("\n❌ Download failed!")
        return False

    # Step 2: Process into the two collections.
    if not process_medical_qa():
        print("\n❌ Processing failed!")
        return False

    print("\n" + banner)
    print("✅ SUCCESS! Vietnamese Medical Q&A ready for RAG system")
    print(banner)
    return True


if __name__ == "__main__":
    # Exit code 0 on success, 1 on any failure (usable from shell/CI).
    success = main()
    sys.exit(0 if success else 1)