"""MentalChat16K Dataset - Download & Process.

Downloads the ShenLab/MentalChat16K mental health counseling conversations
(16K conversations, 33 topics) from HuggingFace, saves them as a CSV file,
and indexes them into a persistent ChromaDB collection.
"""

import os

from datasets import load_dataset
import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer

# Locations shared by the download and processing steps.
DATASET_NAME = "ShenLab/MentalChat16K"
DATASET_DIR = "data_mining/datasets"
CSV_PATH = "data_mining/datasets/mentalchat16k.csv"
OUTPUT_DIR = "data_mining/output"
DB_PATH = "data_mining/output/mental_health_chroma"

COLLECTION_NAME = "mental_health"
# NOTE(review): the model name suggests a Vietnamese-tuned SBERT; confirm it
# is the intended encoder for this dataset's language.
EMBEDDING_MODEL = "keepitreal/vietnamese-sbert"

BATCH_SIZE = 100        # rows embedded and written per ChromaDB call
PROGRESS_EVERY = 1000   # print a progress line every this many rows
MIN_TEXT_LENGTH = 10    # shorter texts (including "nan") are skipped

# Candidate single-column layouts, tried in order when the
# instruction/output layout is not present.
FALLBACK_TEXT_COLUMNS = ('conversation', 'text', 'Context', 'Question', 'Response')

BYTES_PER_MB = 1024 * 1024


def download_mentalchat() -> bool:
    """Download the MentalChat16K dataset from HuggingFace and save it as CSV.

    The 'train' split is converted to a pandas DataFrame and written to
    ``CSV_PATH`` (the directory is created if needed).

    Returns:
        True on success, False if any step raised (the error is printed).
    """
    print("šŸ“„ Downloading MentalChat16K dataset...")
    print(" Source: ShenLab/MentalChat16K")
    print(" Coverage: 33 mental health topics")

    try:
        dataset = load_dataset(DATASET_NAME)

        os.makedirs(DATASET_DIR, exist_ok=True)

        df = dataset['train'].to_pandas()
        df.to_csv(CSV_PATH, index=False)

        file_size = os.path.getsize(CSV_PATH) / BYTES_PER_MB

        print(f"āœ… Downloaded: {CSV_PATH}")
        print(f"šŸ“Š Records: {len(df)}")
        print(f"šŸ“Š File size: {file_size:.2f} MB")
        return True

    except Exception as e:
        # Script-level boundary: report the failure and let main() decide
        # the exit status instead of crashing with a traceback.
        print(f"āŒ Download failed: {e}")
        return False


def _text_or_empty(value) -> str:
    """Return ``str(value)``, or an empty string for missing (NaN/None) cells."""
    return "" if pd.isna(value) else str(value)


def _resolve_text_column(df):
    """Pick (or build) the DataFrame column holding the text to embed.

    If both 'instruction' and 'output' columns exist, a combined 'text'
    column is added to ``df`` in place (using 'input' too when present) and
    'text' is returned. Otherwise the first column of
    ``FALLBACK_TEXT_COLUMNS`` present in ``df`` is returned.

    Returns:
        The column name, or None if no usable column was found.
    """
    if 'instruction' in df.columns and 'output' in df.columns:
        print(" Detected instruction-based format")
        if 'input' in df.columns:
            inputs = df['input']
        else:
            inputs = [''] * len(df)
        # Built as a plain list so an empty frame works, and missing cells
        # become "" instead of the literal string "nan".
        df['text'] = [
            f"User: {_text_or_empty(instruction)}\n{_text_or_empty(user_input)}"
            f"\n\nAssistant: {_text_or_empty(output)}"
            for instruction, user_input, output in zip(
                df['instruction'], inputs, df['output']
            )
        ]
        return 'text'

    for candidate in FALLBACK_TEXT_COLUMNS:
        if candidate in df.columns:
            return candidate
    return None


def _directory_size(path: str) -> int:
    """Return the total size in bytes of all files under ``path``."""
    return sum(
        os.path.getsize(os.path.join(dirpath, filename))
        for dirpath, _dirnames, filenames in os.walk(path)
        for filename in filenames
    )


def process_mentalchat() -> bool:
    """Embed the downloaded conversations and store them in ChromaDB.

    Reads ``CSV_PATH``, determines the text column, embeds the texts in
    batches of ``BATCH_SIZE`` and writes them to the ``COLLECTION_NAME``
    collection of the persistent ChromaDB at ``DB_PATH``. Texts shorter than
    ``MIN_TEXT_LENGTH`` characters are skipped.

    Returns:
        True on success; False if the CSV is missing or no text column
        could be identified.
    """
    print("\nšŸ”Ø Processing MentalChat16K dataset...")

    if not os.path.exists(CSV_PATH):
        print(f"āŒ Dataset not found: {CSV_PATH}")
        return False

    df = pd.read_csv(CSV_PATH)
    print(f"šŸ“Š Loaded {len(df)} records")

    print("šŸ¤– Loading embedding model...")
    embedder = SentenceTransformer(EMBEDDING_MODEL)

    print("šŸ’¾ Initializing ChromaDB...")
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    client = chromadb.PersistentClient(path=DB_PATH)
    collection = client.get_or_create_collection(
        name=COLLECTION_NAME,
        metadata={"hnsw:space": "cosine"},
    )

    print("šŸ“ Processing conversations...")
    text_column = _resolve_text_column(df)
    if text_column is None:
        print(f"āŒ Could not find text column. Available: {df.columns.tolist()}")
        return False

    print(f" Using column: '{text_column}'")

    processed = 0
    total = len(df)

    for start in range(0, total, BATCH_SIZE):
        batch = df.iloc[start:start + BATCH_SIZE]

        documents = [
            text
            for text in (str(value) for value in batch[text_column])
            if len(text) >= MIN_TEXT_LENGTH
        ]

        if documents:
            # One encode call per batch instead of one per row.
            embeddings = embedder.encode(documents)
            indices = range(processed, processed + len(documents))

            # upsert rather than add: the IDs are deterministic, so re-running
            # against the existing persistent collection overwrites the same
            # records instead of colliding with them.
            collection.upsert(
                ids=[f"mental_{index:05d}" for index in indices],
                embeddings=embeddings.tolist(),
                documents=documents,
                metadatas=[
                    {
                        'domain': 'mental_health',
                        'agent': 'MentalHealthAgent',
                        'source': 'MentalChat16K',
                        'index': index,
                    }
                    for index in indices
                ],
            )
            processed += len(documents)

        end = start + BATCH_SIZE
        if end % PROGRESS_EVERY == 0:
            print(f" Processed {min(end, total)}/{total} records...")

    print(f"āœ… Processed {processed} conversations")
    print(f"šŸ’¾ Database saved to: {DB_PATH}/")

    total_size = _directory_size(DB_PATH)
    print(f"šŸ“Š Database size: {total_size / BYTES_PER_MB:.2f} MB")

    return True


def main() -> bool:
    """Run the download step, then the processing step.

    Returns:
        True if both steps succeeded, False as soon as one fails.
    """
    print("=" * 60)
    print("MentalChat16K Dataset - Download & Process")
    print("=" * 60)

    if not download_mentalchat():
        return False

    if not process_mentalchat():
        return False

    print("\n" + "=" * 60)
    print("āœ… MentalChat16K dataset ready!")
    print("=" * 60)

    return True


if __name__ == "__main__":
    # SystemExit instead of the site-provided exit() helper, which is not
    # guaranteed to exist (e.g. under python -S or in frozen builds).
    raise SystemExit(0 if main() else 1)