"""
cleanup_space.py
----------------
Maintenance script for Hugging Face Space cleanup.
- Removes caches, temp files, and large unneeded assets.
- Keeps only FAISS index + metadata + glossary.
- Reuploads them to CT-Chat-Index dataset.
"""
import os
import shutil
import time
from datetime import datetime, timezone
from huggingface_hub import upload_file, HfFolder

# πŸ”§ Configuration
REPO_ID = "essprasad/CT-Chat-Index"   # Dataset repo
REPO_TYPE = "dataset"
PERSISTENT_DIR = "persistent"
DATA_DIR = "data"
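# These relative paths double as the path_in_repo targets in upload_to_hub(),
# so the dataset mirrors the Space's local file layout.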
KEEP_FILES = [
    "persistent/faiss.index",
    "persistent/faiss.index.meta.json",
    "data/glossary.json"
]

# Prefer the cached CLI login; fall back to the HF_TOKEN environment variable.
token = HfFolder.get_token() or os.getenv("HF_TOKEN")

def readable_size(path):
    """Return human-readable folder size."""
    total = 0
    for dirpath, _, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if os.path.exists(fp):
                total += os.path.getsize(fp)
    for unit in ["B", "KB", "MB", "GB"]:
        if total < 1024.0:
            return f"{total:.2f} {unit}"
        total /= 1024.0
    return f"{total:.2f} TB"
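
# Illustrative only: readable_size("persistent") might return "12.34 MB".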

# --------------------------------------------------------------------
# 1. Clean caches, logs, temp files
# --------------------------------------------------------------------
def clean_temp_and_cache():
    print("🧹 Cleaning temporary and cache directories...")
    # Wipe known cache locations, then recreate them empty so later writes
    # don't fail on a missing directory.
    for path in ["/root/.cache", "/home/user/.cache", "/tmp"]:
        shutil.rmtree(path, ignore_errors=True)
        os.makedirs(path, exist_ok=True)

    # Remove logs larger than 5 MB
    log_dir = "logs"
    if os.path.exists(log_dir):
        for f in os.listdir(log_dir):
            fp = os.path.join(log_dir, f)
            if os.path.isfile(fp) and os.path.getsize(fp) > 5 * 1024 * 1024:
                os.remove(fp)
                print(f"πŸ—‘οΈ Removed oversized log: {fp}")

# --------------------------------------------------------------------
# 2. Remove large documents & orphan files
# --------------------------------------------------------------------
def trim_data():
    print("πŸ“¦ Trimming large files from data/public_docs...")
    doc_dir = os.path.join(DATA_DIR, "public_docs")
    if not os.path.exists(doc_dir):
        return

    for root, _, files in os.walk(doc_dir):
        for f in files:
            fp = os.path.join(root, f)
            # isfile() guard: skip broken symlinks that would crash getsize().
            if os.path.isfile(fp) and os.path.getsize(fp) > 10 * 1024 * 1024:  # >10 MB
                print(f"πŸ—‘οΈ Removing large doc: {fp}")
                os.remove(fp)

# --------------------------------------------------------------------
# 3. Verify and keep only essential files
# --------------------------------------------------------------------
def preserve_key_files():
    print("πŸ”’ Preserving essential files (index + glossary)...")
    all_keep = []
    for f in KEEP_FILES:
        if os.path.exists(f):
            print(f"βœ… Keeping: {f}")
            all_keep.append(f)
        else:
            print(f"⚠️ Missing expected file: {f}")
    return all_keep

# --------------------------------------------------------------------
# 4. Upload cleaned files to dataset
# --------------------------------------------------------------------
def upload_to_hub(files):
    if not token:
        print("❌ No HF token found. Please add HF_TOKEN with write access.")
        return
    print(f"πŸš€ Uploading cleaned files to {REPO_ID} ...")
    for f in files:
        try:
            upload_file(
                path_or_fileobj=f,
                path_in_repo=f,
                repo_id=REPO_ID,
                repo_type=REPO_TYPE,
                token=token,
                commit_message=f"Auto-cleanup sync {datetime.now(timezone.utc).isoformat()}"
            )
            print(f"βœ… Uploaded: {f}")
        except Exception as e:
            print(f"⚠️ Failed to upload {f}: {e}")

# --------------------------------------------------------------------
# 5. Disk usage report
# --------------------------------------------------------------------
def report_usage():
    print("\nπŸ“Š Disk Usage Summary:")
    for path in [PERSISTENT_DIR, DATA_DIR, "/home/user"]:
        if os.path.exists(path):
            print(f"{path}: {readable_size(path)}")

# --------------------------------------------------------------------
# Run everything
# --------------------------------------------------------------------
if __name__ == "__main__":
    start = time.time()
    print("===== 🧹 Starting Space Cleanup =====")
    clean_temp_and_cache()
    trim_data()
    files = preserve_key_files()
    upload_to_hub(files)
    report_usage()
    print(f"\nβœ… Cleanup finished in {time.time() - start:.2f}s")