"""Fitness Dataset - Download & Process

Downloads and processes gym exercise data into ChromaDB.
Dataset: onurSakar/GYM-Exercise (1.66K exercises)
"""
import os
import sys

import chromadb
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer


def download_fitness():
    """Download the GYM Exercise dataset from HuggingFace and save it as CSV.

    Writes data_mining/datasets/gym_exercise.csv from the dataset's
    'train' split.

    Returns:
        bool: True on success, False if the download or save failed.
    """
    print("šŸ“„ Downloading GYM Exercise dataset...")
    print(" Source: onurSakar/GYM-Exercise")
    try:
        dataset = load_dataset("onurSakar/GYM-Exercise")
        os.makedirs("data_mining/datasets", exist_ok=True)
        df = dataset['train'].to_pandas()
        output_path = "data_mining/datasets/gym_exercise.csv"
        df.to_csv(output_path, index=False)
        file_size = os.path.getsize(output_path) / (1024 * 1024)
        print(f"āœ… Downloaded: {output_path}")
        print(f"šŸ“Š Records: {len(df)}")
        print(f"šŸ“Š File size: {file_size:.2f} MB")
        return True
    except Exception as e:
        # Broad catch is deliberate: this is a top-level CLI boundary and
        # callers only need a success/failure flag, not the exception.
        print(f"āŒ Download failed: {e}")
        return False


def process_fitness():
    """Embed the fitness CSV rows and persist them into a ChromaDB collection.

    Reads data_mining/datasets/gym_exercise.csv, builds one text document
    per row ("column: value" lines), embeds each document, and stores it
    in the persistent 'fitness' collection under
    data_mining/output/fitness_chroma.

    Returns:
        bool: True on success, False if the CSV has not been downloaded yet.
    """
    print("\nšŸ”Ø Processing Fitness dataset...")
    csv_path = "data_mining/datasets/gym_exercise.csv"
    if not os.path.exists(csv_path):
        print(f"āŒ Dataset not found: {csv_path}")
        return False

    df = pd.read_csv(csv_path)
    print(f"šŸ“Š Loaded {len(df)} records")

    print("šŸ¤– Loading embedding model...")
    # NOTE(review): this is a Vietnamese SBERT model while the dataset name
    # suggests English exercise text — confirm the model choice is intentional.
    embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

    print("šŸ’¾ Initializing ChromaDB...")
    os.makedirs("data_mining/output", exist_ok=True)
    client = chromadb.PersistentClient(path="data_mining/output/fitness_chroma")
    collection = client.get_or_create_collection(
        name="fitness",
        metadata={"hnsw:space": "cosine"}
    )

    print("šŸ“ Processing fitness data...")
    processed = 0
    for _, row in df.iterrows():
        # Build the document text as "column: value" lines, skipping
        # missing and trivially short values.
        text_parts = []
        for col in df.columns:
            raw = row[col]
            if pd.isna(raw):
                # Guard against None/NaN leaking in as literal "None"/"nan"
                # text (str(None) == 'None' passed the old string check).
                continue
            value = str(raw)
            if value and value != 'nan' and len(value) > 2:
                text_parts.append(f"{col}: {value}")
        text = "\n".join(text_parts)
        if len(text) < 10:
            # Too little content to be a useful document; skip the row.
            continue

        embedding = embedder.encode(text)
        collection.add(
            ids=[f"fitness_{processed:05d}"],
            embeddings=[embedding.tolist()],
            documents=[text],
            metadatas=[{
                'domain': 'fitness',
                'agent': 'FitnessAgent',
                'source': 'GYM_Exercise',
                'index': processed
            }]
        )
        processed += 1
        if (processed % 100) == 0:
            print(f" Processed {processed}/{len(df)} records...")

    print(f"āœ… Processed {processed} fitness records")
    print(f"šŸ’¾ Database saved to: data_mining/output/fitness_chroma/")

    # Report the on-disk size of the persisted database directory.
    db_path = "data_mining/output/fitness_chroma"
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(db_path):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            total_size += os.path.getsize(filepath)
    print(f"šŸ“Š Database size: {total_size / (1024 * 1024):.2f} MB")
    return True


def main():
    """Run the full pipeline: download, then process.

    Returns:
        bool: True if both steps succeeded, False otherwise.
    """
    print("=" * 60)
    print("Fitness Dataset - Download & Process")
    print("=" * 60)
    if not download_fitness():
        return False
    if not process_fitness():
        return False
    print("\n" + "=" * 60)
    print("āœ… Fitness dataset ready!")
    print("=" * 60)
    return True


if __name__ == "__main__":
    success = main()
    # sys.exit instead of the site-injected exit(): guaranteed to exist in
    # all interpreter modes (e.g. when run with python -S or frozen).
    sys.exit(0 if success else 1)