"""
cleanup_space.py
----------------
Maintenance script for Hugging Face Space cleanup.
- Removes caches, temp files, and large unneeded assets.
- Keeps only FAISS index + metadata + glossary.
- Reuploads them to CT-Chat-Index dataset.
"""
import subprocess  # kept so existing importers of this module still find it; currently unused

# NOTE(review): removed `subprocess.run(["python", "cleanup_space.py"], check=False)`
# that used to run here — this file IS cleanup_space.py, so executing it at import
# time re-launched the script recursively, spawning child processes without bound.
import os
import shutil
import time
from datetime import datetime, timezone

from huggingface_hub import HfApi, upload_file, HfFolder
# Configuration
REPO_ID = "essprasad/CT-Chat-Index"  # Dataset repo
REPO_TYPE = "dataset"
PERSISTENT_DIR = "persistent"  # local dir holding the FAISS index files
DATA_DIR = "data"  # local dir holding glossary / document assets
# The only files that survive cleanup and get re-uploaded to the dataset repo.
KEEP_FILES = [
    "persistent/faiss.index",
    "persistent/faiss.index.meta.json",
    "data/glossary.json"
]
api = HfApi()
# Prefer the cached CLI token; fall back to the HF_TOKEN env var (may be None,
# in which case upload_to_hub skips uploading).
token = HfFolder.get_token() or os.getenv("HF_TOKEN", None)
def readable_size(path):
    """Return the recursive on-disk size of *path* as a human-readable string.

    Walks the tree, sums the sizes of all regular files that still exist
    (guards against files vanishing mid-walk), and scales the total through
    B/KB/MB/GB, falling through to TB.
    """
    size = sum(
        os.path.getsize(os.path.join(base, name))
        for base, _, names in os.walk(path)
        for name in names
        if os.path.exists(os.path.join(base, name))
    )
    for suffix in ("B", "KB", "MB", "GB"):
        if size < 1024.0:
            return f"{size:.2f} {suffix}"
        size /= 1024.0
    return f"{size:.2f} TB"
# --------------------------------------------------------------------
# 1. Clean caches, logs, temp files
# --------------------------------------------------------------------
def clean_temp_and_cache():
    """Delete cache/temp directories and oversized log files.

    Each cache directory is removed wholesale (errors ignored — it may not
    exist in this container) and recreated empty. Any file in ./logs larger
    than 5 MB is deleted.

    NOTE: the original messages contained mojibake (UTF-8 emoji decoded as
    Latin-1); replaced with plain-ASCII text so logs render everywhere.
    """
    print("Cleaning temporary and cache directories...")
    for path in ["/root/.cache", "/home/user/.cache", "/tmp"]:
        shutil.rmtree(path, ignore_errors=True)
        os.makedirs(path, exist_ok=True)
    # Remove logs larger than 5 MB
    log_dir = "logs"
    if os.path.exists(log_dir):
        for f in os.listdir(log_dir):
            fp = os.path.join(log_dir, f)
            if os.path.isfile(fp) and os.path.getsize(fp) > 5 * 1024 * 1024:
                os.remove(fp)
                print(f"Removed oversized log: {fp}")
# --------------------------------------------------------------------
# 2. Remove large documents & orphan files
# --------------------------------------------------------------------
def trim_data(doc_dir=None):
    """Delete files larger than 10 MB under *doc_dir*.

    Args:
        doc_dir: Directory to trim. Defaults to DATA_DIR/public_docs
            (backward-compatible with the original zero-arg call).

    Returns silently if the directory does not exist. Mojibake emoji in the
    original messages replaced with plain ASCII.
    """
    if doc_dir is None:
        doc_dir = os.path.join(DATA_DIR, "public_docs")
    print(f"Trimming large files from {doc_dir}...")
    if not os.path.exists(doc_dir):
        return
    for root, _, files in os.walk(doc_dir):
        for f in files:
            fp = os.path.join(root, f)
            if os.path.getsize(fp) > 10 * 1024 * 1024:  # >10MB
                print(f"Removing large doc: {fp}")
                os.remove(fp)
# --------------------------------------------------------------------
# 3. Verify and keep only essential files
# --------------------------------------------------------------------
def preserve_key_files(keep_files=None):
    """Return the subset of *keep_files* that exist on disk, logging each one.

    Args:
        keep_files: Paths to check. Defaults to the module-level KEEP_FILES
            (backward-compatible with the original zero-arg call).

    Returns:
        list[str]: the paths that were found and should be uploaded.

    Fixes: the original "Keeping" print was a garbled multi-line string
    literal (a SyntaxError as scraped); rewritten as a single plain message.
    """
    if keep_files is None:
        keep_files = KEEP_FILES
    print("Preserving essential files (index + glossary)...")
    kept = []
    for f in keep_files:
        if os.path.exists(f):
            print(f"Keeping: {f}")
            kept.append(f)
        else:
            print(f"Missing expected file: {f}")
    return kept
# --------------------------------------------------------------------
# 4. Upload cleaned files to dataset
# --------------------------------------------------------------------
def upload_to_hub(files):
    """Upload each path in *files* to the dataset repo, preserving its relative path.

    Requires a write-capable token in the module-level `token`; prints a
    warning and returns without uploading when none is available. Each file
    is uploaded independently — a failure is logged and skipped so the
    remaining files still sync (best-effort by design).

    Fixes: the "Uploaded" print was a garbled multi-line string literal
    (a SyntaxError as scraped); `datetime.utcnow()` (deprecated since
    Python 3.12) replaced with timezone-aware `datetime.now(timezone.utc)`.
    """
    if not token:
        print("No HF token found. Please add HF_TOKEN with write access.")
        return
    print(f"Uploading cleaned files to {REPO_ID} ...")
    for f in files:
        try:
            upload_file(
                path_or_fileobj=f,
                path_in_repo=f,
                repo_id=REPO_ID,
                repo_type=REPO_TYPE,
                token=token,
                commit_message=f"Auto-cleanup sync {datetime.now(timezone.utc).isoformat()}"
            )
            print(f"Uploaded: {f}")
        except Exception as e:
            # Deliberate broad catch: one bad file must not abort the whole sync.
            print(f"Failed to upload {f}: {e}")
# --------------------------------------------------------------------
# 5. Disk usage report
# --------------------------------------------------------------------
def report_usage():
    """Print a human-readable disk-usage summary for the key directories.

    Consistency fix: uses the PERSISTENT_DIR / DATA_DIR configuration
    constants (same values as the original hard-coded "persistent"/"data")
    so a config change propagates here automatically. Mojibake header text
    replaced with plain ASCII.
    """
    print("\nDisk Usage Summary:")
    for path in [PERSISTENT_DIR, DATA_DIR, "/home/user"]:
        if os.path.exists(path):
            print(f"{path}: {readable_size(path)}")
# --------------------------------------------------------------------
# Run everything
# --------------------------------------------------------------------
# Run everything: clean caches, trim large docs, keep essentials, sync, report.
if __name__ == "__main__":
    start = time.time()
    print("===== Starting Space Cleanup =====")
    clean_temp_and_cache()
    trim_data()
    files = preserve_key_files()
    upload_to_hub(files)
    report_usage()
    # Fixes: the final print was a garbled multi-line string literal
    # (a SyntaxError as scraped); rewritten as one plain-ASCII message.
    print(f"\nCleanup finished in {time.time() - start:.2f}s")