#!/usr/bin/env python3
"""
Mining Script: Vietnamese Food Nutrition Database
Processes Vietnamese food CSV into ChromaDB for NutritionAgent
"""

import sys
import pandas as pd
from pathlib import Path

# Input CSV and output locations, relative to the current working directory.
_CSV_PATH = Path("data_mining/datasets/vietnamese_food_nutrition.csv")
_OUTPUT_DIR = Path("data_mining/output")
_CHROMA_DIR_NAME = "vietnamese_nutrition_chroma"
_COLLECTION_NAME = "vietnamese_nutrition"
_EMBEDDING_MODEL = "keepitreal/vietnamese-sbert"
# Number of rows embedded and written to ChromaDB per round trip.
_BATCH_SIZE = 20


def _build_document(row):
    """Return the text document that is embedded and stored for one food row.

    ``row`` is any mapping exposing the CSV columns ``name_vi``, ``name_en``,
    ``calories``, ``protein_g``, ``carbs_g``, ``fat_g``, ``fiber_g`` and
    ``category`` (a pandas row or a plain dict both work).
    """
    return "\n".join([
        f"Món ăn: {row['name_vi']} ({row['name_en']})",
        f"Calories: {row['calories']} kcal",
        f"Protein: {row['protein_g']}g",
        f"Carbohydrates: {row['carbs_g']}g",
        f"Fat: {row['fat_g']}g",
        f"Fiber: {row['fiber_g']}g",
        f"Category: {row['category']}",
    ])


def _build_metadata(row):
    """Return the ChromaDB metadata dict stored alongside one food row."""
    return {
        'name_vi': row['name_vi'],
        'name_en': row['name_en'],
        # Cast so a plain Python int is stored regardless of the CSV dtype.
        'calories': int(row['calories']),
        'category': row['category'],
        'source': 'vietnamese_food_db',
    }


def process_vietnamese_nutrition():
    """Process the Vietnamese food nutrition CSV into a ChromaDB collection.

    Reads the CSV (asking the ``vn_food_db`` module to create it when it is
    missing), embeds one text document per food with a sentence-transformers
    model, and writes the documents, embeddings and metadata to a persistent
    ChromaDB collection under the output directory.

    Returns:
        True when every row was written, False when a required library is
        missing or any other error occurred. Errors are printed, not raised.
    """
    # Only the optional third-party imports live in this try block, so the
    # "pip install" hint is shown only when one of them is actually missing.
    try:
        from sentence_transformers import SentenceTransformer
        import chromadb
    except ImportError as e:
        print(f"❌ Missing library: {e}")
        print(" Install: pip install sentence-transformers chromadb pandas")
        return False

    try:
        print("🍜 Processing Vietnamese Food Nutrition Database...")

        # Load CSV, generating it first when it does not exist yet.
        if not _CSV_PATH.exists():
            print("❌ CSV not found. Creating it first...")
            import vn_food_db
            vn_food_db.vn_food_db()

        df = pd.read_csv(_CSV_PATH)
        total = len(df)
        print(f"📊 Loaded: {total} Vietnamese foods")

        # Initialize the embedding model and the persistent vector store.
        print("🤖 Loading embedding model...")
        embedder = SentenceTransformer(_EMBEDDING_MODEL)

        _OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
        chroma_path = _OUTPUT_DIR / _CHROMA_DIR_NAME

        client = chromadb.PersistentClient(path=str(chroma_path))
        collection = client.get_or_create_collection(
            name=_COLLECTION_NAME,
            metadata={"description": "Vietnamese Food Nutrition Database"},
        )

        # Process foods in fixed-size batches.
        print("📦 Creating ChromaDB...")
        for start in range(0, total, _BATCH_SIZE):
            batch = df.iloc[start:start + _BATCH_SIZE]

            ids = []
            texts = []
            metadatas = []
            for idx, row in batch.iterrows():
                ids.append(f"food_{idx}")
                texts.append(_build_document(row))
                metadatas.append(_build_metadata(row))

            # Generate embeddings for the whole batch at once.
            embeddings = embedder.encode(texts, show_progress_bar=False)

            # The ids are deterministic (food_<row index>) and the collection
            # may already exist from an earlier run. Unlike add(), upsert()
            # updates records whose ids are already present, so re-running
            # the script refreshes the stored data.
            collection.upsert(
                ids=ids,
                embeddings=embeddings.tolist(),
                documents=texts,
                metadatas=metadatas,
            )

            print(f" Processed {min(start + _BATCH_SIZE, total)}/{total} foods...")

        print("\n✅ Vietnamese Nutrition ChromaDB created!")
        print(f" Output: {chroma_path}")
        print(f" Records: {total} foods")

        return True

    except Exception as e:
        # Top-level boundary of the script: report the failure with its
        # traceback and signal it through the return value.
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return False


def main():
    """Run the mining step, print a banner and summary, and return success."""
    print("=" * 60)
    print("Vietnamese Food Nutrition Database Mining")
    print("=" * 60)

    success = process_vietnamese_nutrition()

    if success:
        print("\n" + "=" * 60)
        print("✅ SUCCESS! Vietnamese nutrition data ready for RAG")
        print("=" * 60)
    else:
        print("\n❌ FAILED!")

    return success


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)