"""
Fitness Dataset - Download & Process
Downloads and processes gym exercise data into ChromaDB
Dataset: onurSakar/GYM-Exercise (1.66K exercises)
"""
from datasets import load_dataset
import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer
import os
import sys


def download_fitness():
    """Download GYM Exercise dataset from HuggingFace"""
    print("📥 Downloading GYM Exercise dataset...")
    print("   Source: onurSakar/GYM-Exercise")
    try:
        dataset = load_dataset("onurSakar/GYM-Exercise")
        os.makedirs("data_mining/datasets", exist_ok=True)
        df = dataset['train'].to_pandas()
        output_path = "data_mining/datasets/gym_exercise.csv"
        df.to_csv(output_path, index=False)
        file_size = os.path.getsize(output_path) / (1024 * 1024)
        print(f"✅ Downloaded: {output_path}")
        print(f"📊 Records: {len(df)}")
        print(f"📁 File size: {file_size:.2f} MB")
        return True
    except Exception as e:
        print(f"❌ Download failed: {e}")
        return False


def process_fitness():
    """Process Fitness dataset and build ChromaDB"""
    print("\n🔨 Processing Fitness dataset...")
    csv_path = "data_mining/datasets/gym_exercise.csv"
    if not os.path.exists(csv_path):
        print(f"❌ Dataset not found: {csv_path}")
        return False
    df = pd.read_csv(csv_path)
    print(f"📊 Loaded {len(df)} records")
    print("🤖 Loading embedding model...")
    embedder = SentenceTransformer('keepitreal/vietnamese-sbert')
    print("💾 Initializing ChromaDB...")
    os.makedirs("data_mining/output", exist_ok=True)
    client = chromadb.PersistentClient(path="data_mining/output/fitness_chroma")
    collection = client.get_or_create_collection(
        name="fitness",
        metadata={"hnsw:space": "cosine"}  # use cosine distance for the HNSW index
    )
    print("🔄 Processing fitness data...")
    processed = 0
    for idx, row in df.iterrows():
        # Flatten each row into "column: value" lines, skipping NaN and very short values
        text_parts = []
        for col in df.columns:
            value = str(row[col])
            if value and value != 'nan' and len(value) > 2:
                text_parts.append(f"{col}: {value}")
        text = "\n".join(text_parts)
        if len(text) < 10:
            continue
        embedding = embedder.encode(text)
        collection.add(
            ids=[f"fitness_{processed:05d}"],
            embeddings=[embedding.tolist()],
            documents=[text],
            metadatas=[{
                'domain': 'fitness',
                'agent': 'FitnessAgent',
                'source': 'GYM_Exercise',
                'index': processed
            }]
        )
        processed += 1
        if (processed % 100) == 0:
            print(f"   Processed {processed}/{len(df)} records...")
    print(f"✅ Processed {processed} fitness records")
    print(f"💾 Database saved to: data_mining/output/fitness_chroma/")
    # Report the on-disk size of the persisted database
    db_path = "data_mining/output/fitness_chroma"
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(db_path):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            total_size += os.path.getsize(filepath)
    print(f"📁 Database size: {total_size / (1024 * 1024):.2f} MB")
    return True
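

# NOTE (illustrative addition): process_fitness() embeds and inserts one row
# at a time, which is simple but slow for larger datasets. The sketch below
# shows a batched alternative; `add_in_batches` is a hypothetical helper, not
# part of the original pipeline, and assumes the same "column: value" text
# layout produced above.
def add_in_batches(collection, embedder, texts, batch_size=256):
    """Embed and insert documents in batches (hedged sketch)."""
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # SentenceTransformer.encode accepts a list and returns one vector per text
        embeddings = embedder.encode(batch)
        collection.add(
            ids=[f"fitness_{start + i:05d}" for i in range(len(batch))],
            embeddings=[e.tolist() for e in embeddings],
            documents=batch,
            metadatas=[{'domain': 'fitness', 'agent': 'FitnessAgent',
                        'source': 'GYM_Exercise', 'index': start + i}
                       for i in range(len(batch))],
        )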


def main():
    """Main function - download and process"""
    print("=" * 60)
    print("Fitness Dataset - Download & Process")
    print("=" * 60)
    if not download_fitness():
        return False
    if not process_fitness():
        return False
    print("\n" + "=" * 60)
    print("✅ Fitness dataset ready!")
    print("=" * 60)
    return True
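

# Illustrative addition: a minimal sketch of how the persisted collection
# could be queried once this script has run. `query_fitness` is a
# hypothetical helper (not part of the original script); it must embed the
# query with the same model that built the index.
def query_fitness(question, n_results=3):
    """Return the documents closest to a free-text question."""
    embedder = SentenceTransformer('keepitreal/vietnamese-sbert')
    client = chromadb.PersistentClient(path="data_mining/output/fitness_chroma")
    collection = client.get_or_create_collection(name="fitness")
    results = collection.query(
        query_embeddings=[embedder.encode(question).tolist()],
        n_results=n_results,
    )
    # query() returns one result list per query embedding; take the first
    return results['documents'][0]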


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)