my-gradio-app / data_mining /mining_vietnamese_nutrition.py
Nguyen Trong Lap
Recreate history without binary blobs
eeb0f9c
#!/usr/bin/env python3
"""
Mining Script: Vietnamese Food Nutrition Database
Processes Vietnamese food CSV into ChromaDB for NutritionAgent
"""
import sys
import pandas as pd
from pathlib import Path
def _format_food_document(row):
    """Render one CSV row as the text document that is embedded and stored."""
    return (
        f"Món ăn: {row['name_vi']} ({row['name_en']})\n"
        f"Calories: {row['calories']} kcal\n"
        f"Protein: {row['protein_g']}g\n"
        f"Carbohydrates: {row['carbs_g']}g\n"
        f"Fat: {row['fat_g']}g\n"
        f"Fiber: {row['fiber_g']}g\n"
        f"Category: {row['category']}"
    )


def _food_metadata(row):
    """Build the metadata dict stored alongside a food document."""
    return {
        'name_vi': row['name_vi'],
        'name_en': row['name_en'],
        # Stored as an int; a missing (NaN) value raises here and is
        # reported by the caller's generic error handler.
        'calories': int(row['calories']),
        'category': row['category'],
        'source': 'vietnamese_food_db',
    }


def process_vietnamese_nutrition():
    """Process the Vietnamese food nutrition CSV into a ChromaDB collection.

    Reads ``data_mining/datasets/vietnamese_food_nutrition.csv`` (asking the
    ``vn_food_db`` module to create it first when it does not exist), embeds
    one text document per row with the ``keepitreal/vietnamese-sbert`` model
    and stores the documents, embeddings and metadata in the persistent
    ``vietnamese_nutrition`` collection under ``data_mining/output``.

    Rows are written with ``upsert`` so running the script again against an
    existing collection refreshes the stored records instead of colliding
    with IDs that are already present.

    Returns:
        bool: True when every batch was written; False when a required
        library cannot be imported or any other error occurs (the error is
        printed, with a traceback for non-import errors).
    """
    try:
        # Imported lazily so a missing optional dependency is reported as a
        # friendly message instead of failing at module import time.
        from sentence_transformers import SentenceTransformer
        import chromadb

        print("🍜 Processing Vietnamese Food Nutrition Database...")

        # Load CSV
        csv_path = Path("data_mining/datasets/vietnamese_food_nutrition.csv")
        if not csv_path.exists():
            print("❌ CSV not found. Creating it first...")
            import vn_food_db
            vn_food_db.vn_food_db()

        df = pd.read_csv(csv_path)
        print(f"📊 Loaded: {len(df)} Vietnamese foods")

        # Initialize
        print("🤖 Loading embedding model...")
        embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

        output_dir = Path("data_mining/output")
        output_dir.mkdir(parents=True, exist_ok=True)
        chroma_dir = output_dir / "vietnamese_nutrition_chroma"

        client = chromadb.PersistentClient(path=str(chroma_dir))
        collection = client.get_or_create_collection(
            name="vietnamese_nutrition",
            metadata={"description": "Vietnamese Food Nutrition Database"}
        )

        # Process foods
        print("📦 Creating ChromaDB...")
        batch_size = 20
        total = len(df)

        for start in range(0, total, batch_size):
            batch = df.iloc[start:start + batch_size]

            ids = []
            texts = []
            metadatas = []
            # idx is the DataFrame index label, which keeps IDs stable
            # across batches (it is not the position inside the batch).
            for idx, row in batch.iterrows():
                ids.append(f"food_{idx}")
                texts.append(_format_food_document(row))
                metadatas.append(_food_metadata(row))

            # Generate embeddings
            embeddings = embedder.encode(texts, show_progress_bar=False)

            # upsert (not add): the collection may already hold these IDs
            # from a previous run of this script.
            collection.upsert(
                ids=ids,
                embeddings=embeddings.tolist(),
                documents=texts,
                metadatas=metadatas
            )

            print(f" Processed {min(start + batch_size, total)}/{total} foods...")

        print("\n✅ Vietnamese Nutrition ChromaDB created!")
        print(f" Output: {chroma_dir}")
        print(f" Records: {total} foods")
        return True

    except ImportError as e:
        print(f"❌ Missing library: {e}")
        print(" Install: pip install sentence-transformers chromadb pandas")
        return False
    except Exception as e:
        # Top-level boundary of the script: report and signal failure
        # through the return value rather than crashing.
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return False
def main():
    """Run the mining step and print a banner describing the outcome.

    Returns:
        bool: The value returned by ``process_vietnamese_nutrition()``;
        True on success, False on failure.
    """
    banner = "=" * 60

    print(banner)
    print("Vietnamese Food Nutrition Database Mining")
    print(banner)

    success = process_vietnamese_nutrition()

    if success:
        print("\n" + banner)
        print("✅ SUCCESS! Vietnamese nutrition data ready for RAG")
        print(banner)
    else:
        print("\n❌ FAILED!")

    return success
# Script entry point: translate main()'s boolean result into a process
# exit status (0 on success, 1 on failure).
if __name__ == "__main__":
    success = main()
    exit_code = 0 if success else 1
    sys.exit(exit_code)