File size: 3,866 Bytes
eeb0f9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
"""
Fitness Dataset - Download & Process
Downloads and processes gym exercise data into ChromaDB
Dataset: onurSakar/GYM-Exercise (1.66K exercises)
"""

from datasets import load_dataset
import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer
import os

def download_fitness():
    """Fetch the GYM Exercise dataset from HuggingFace and cache it as CSV.

    Loads ``onurSakar/GYM-Exercise``, converts its ``train`` split to a
    DataFrame and writes it to ``data_mining/datasets/gym_exercise.csv``,
    creating the target directory if needed.

    Returns:
        bool: True once the CSV has been written and its stats printed;
        False if any step raised (the error is printed, not re-raised).
    """
    print("πŸ“₯ Downloading GYM Exercise dataset...")
    print("   Source: onurSakar/GYM-Exercise")

    target_dir = "data_mining/datasets"
    output_path = "data_mining/datasets/gym_exercise.csv"

    try:
        hf_dataset = load_dataset("onurSakar/GYM-Exercise")

        os.makedirs(target_dir, exist_ok=True)

        frame = hf_dataset['train'].to_pandas()
        frame.to_csv(output_path, index=False)

        # Size on disk, converted from bytes to mebibytes for the summary.
        size_mb = os.path.getsize(output_path) / (1024 * 1024)

        print(f"βœ… Downloaded: {output_path}")
        print(f"πŸ“Š Records: {len(frame)}")
        print(f"πŸ“Š File size: {size_mb:.2f} MB")
    except Exception as err:
        # Script-level boundary: report the failure and let the caller decide.
        print(f"❌ Download failed: {err}")
        return False

    return True

def process_fitness():
    """Embed the cached GYM Exercise CSV and store it in a ChromaDB collection.

    Each CSV row is flattened into a "column: value" document (one line per
    usable column), encoded with the ``keepitreal/vietnamese-sbert`` model and
    written to the persistent ``fitness`` collection (cosine space) under
    ``data_mining/output/fitness_chroma``.

    Rows are encoded and written in batches instead of one model call and one
    database write per row, and they are upserted so that re-running the
    script refreshes the existing ``fitness_NNNNN`` ids rather than trying to
    add them a second time.

    Returns:
        bool: False when the CSV produced by the download step is missing;
        True after all documents have been written and the size summary
        has been printed.
    """
    print("\nπŸ”¨ Processing Fitness dataset...")

    csv_path = "data_mining/datasets/gym_exercise.csv"
    if not os.path.exists(csv_path):
        print(f"❌ Dataset not found: {csv_path}")
        return False

    df = pd.read_csv(csv_path)
    print(f"πŸ“Š Loaded {len(df)} records")

    print("πŸ€– Loading embedding model...")
    embedder = SentenceTransformer('keepitreal/vietnamese-sbert')

    print("πŸ’Ύ Initializing ChromaDB...")
    db_path = "data_mining/output/fitness_chroma"
    os.makedirs("data_mining/output", exist_ok=True)
    client = chromadb.PersistentClient(path=db_path)

    collection = client.get_or_create_collection(
        name="fitness",
        metadata={"hnsw:space": "cosine"},
    )

    print("πŸ“ Processing fitness data...")

    # Build every document first so the model and the database can be fed
    # in batches below.
    documents = []
    for _, row in df.iterrows():
        text_parts = []
        for col in df.columns:
            cell = row[col]
            if pd.isna(cell):
                continue
            value = str(cell)
            # Values of two characters or fewer are skipped, as before.
            if len(value) > 2:
                text_parts.append(f"{col}: {value}")

        text = "\n".join(text_parts)

        # Rows that yield fewer than 10 characters of text are not indexed.
        if len(text) >= 10:
            documents.append(text)

    batch_size = 100
    processed = 0

    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]
        # Positions number the kept documents consecutively from zero; they
        # drive both the id and the 'index' metadata field.
        positions = range(start, start + len(batch))

        embeddings = embedder.encode(batch)

        collection.upsert(
            ids=[f"fitness_{pos:05d}" for pos in positions],
            embeddings=embeddings.tolist(),
            documents=batch,
            metadatas=[
                {
                    'domain': 'fitness',
                    'agent': 'FitnessAgent',
                    'source': 'GYM_Exercise',
                    'index': pos,
                }
                for pos in positions
            ],
        )

        processed += len(batch)

        # Only full batches land on a multiple of 100, which keeps the
        # progress lines at every 100 records.
        if (processed % batch_size) == 0:
            print(f"  Processed {processed}/{len(df)} records...")

    print(f"βœ… Processed {processed} fitness records")
    print(f"πŸ’Ύ Database saved to: {db_path}/")

    # Sum the size of every file under the database directory.
    total_size = sum(
        os.path.getsize(os.path.join(dirpath, filename))
        for dirpath, _dirnames, filenames in os.walk(db_path)
        for filename in filenames
    )

    print(f"πŸ“Š Database size: {total_size / (1024 * 1024):.2f} MB")

    return True

def main():
    """Run the whole pipeline: download the dataset, then index it.

    Returns:
        bool: True when both stages report success; False as soon as one
        of them fails (processing is not attempted after a failed download).
    """
    banner = "=" * 60

    print(banner)
    print("Fitness Dataset - Download & Process")
    print(banner)

    # `and` short-circuits, so processing only runs after a successful download.
    if not (download_fitness() and process_fitness()):
        return False

    print("\n" + banner)
    print("βœ… Fitness dataset ready!")
    print(banner)
    return True

if __name__ == "__main__":
    # Turn the pipeline result into the process exit status (0 = success).
    success = main()
    # Raise SystemExit directly: the bare `exit` helper is installed by the
    # `site` module and is unavailable when Python runs with -S or in
    # frozen/embedded interpreters.
    raise SystemExit(0 if success else 1)