"""
cleanup_space.py
----------------
Maintenance script for Hugging Face Space cleanup.
- Removes caches, temp files, and large unneeded assets.
- Keeps only FAISS index + metadata + glossary.
- Reuploads them to CT-Chat-Index dataset.
"""
import os
import shutil
import time
from datetime import datetime, timezone
from huggingface_hub import upload_file, HfFolder
# 🔧 Configuration
REPO_ID = "essprasad/CT-Chat-Index" # Dataset repo
REPO_TYPE = "dataset"
PERSISTENT_DIR = "persistent"
DATA_DIR = "data"
KEEP_FILES = [
    "persistent/faiss.index",
    "persistent/faiss.index.meta.json",
    "data/glossary.json",
]
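# Note: these paths are relative to the Space's working directory (the repo root).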
token = HfFolder.get_token() or os.getenv("HF_TOKEN")
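# HfFolder.get_token() reads the token cached by `huggingface-cli login`;
# the HF_TOKEN environment variable (e.g. a Space secret) acts as a fallback.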
def readable_size(path):
    """Return the total size of a directory tree as a human-readable string."""
    total = 0
    for dirpath, _, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if os.path.exists(fp):
                total += os.path.getsize(fp)
    for unit in ["B", "KB", "MB", "GB"]:
        if total < 1024.0:
            return f"{total:.2f} {unit}"
        total /= 1024.0
    return f"{total:.2f} TB"
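# Usage sketch (illustrative output):
#   print(readable_size("persistent"))  # -> e.g. "12.34 MB"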
# --------------------------------------------------------------------
# 1. Clean caches, logs, temp files
# --------------------------------------------------------------------
def clean_temp_and_cache():
    print("🧹 Cleaning temporary and cache directories...")
    for path in ["/root/.cache", "/home/user/.cache", "/tmp"]:
        shutil.rmtree(path, ignore_errors=True)
        os.makedirs(path, exist_ok=True)
    # Remove logs larger than 5 MB
    log_dir = "logs"
    if os.path.exists(log_dir):
        for f in os.listdir(log_dir):
            fp = os.path.join(log_dir, f)
            if os.path.isfile(fp) and os.path.getsize(fp) > 5 * 1024 * 1024:
                os.remove(fp)
                print(f"🗑️ Removed oversized log: {fp}")
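# The cache paths above match the usual user layouts on a Hugging Face Space;
# missing or permission-protected paths are skipped via ignore_errors=True.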
# --------------------------------------------------------------------
# 2. Remove large documents & orphan files
# --------------------------------------------------------------------
def trim_data():
    print("📦 Trimming large files from data/public_docs...")
    doc_dir = os.path.join(DATA_DIR, "public_docs")
    if not os.path.exists(doc_dir):
        return
    for root, _, files in os.walk(doc_dir):
        for f in files:
            fp = os.path.join(root, f)
            if os.path.getsize(fp) > 10 * 1024 * 1024:  # >10 MB
                print(f"🗑️ Removing large doc: {fp}")
                os.remove(fp)
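# Dry-run sketch (hypothetical, not wired into main): collect candidates
# without deleting, useful for previewing the destructive pass above.
#   candidates = [os.path.join(r, f) for r, _, fs in os.walk("data/public_docs")
#                 for f in fs
#                 if os.path.getsize(os.path.join(r, f)) > 10 * 1024 * 1024]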
# --------------------------------------------------------------------
# 3. Verify and keep only essential files
# --------------------------------------------------------------------
def preserve_key_files():
    print("🔒 Preserving essential files (index + glossary)...")
    all_keep = []
    for f in KEEP_FILES:
        if os.path.exists(f):
            print(f"✅ Keeping: {f}")
            all_keep.append(f)
        else:
            print(f"⚠️ Missing expected file: {f}")
    return all_keep
# --------------------------------------------------------------------
# 4. Upload cleaned files to dataset
# --------------------------------------------------------------------
def upload_to_hub(files):
    if not token:
        print("❌ No HF token found. Please add HF_TOKEN with write access.")
        return
    print(f"🚀 Uploading cleaned files to {REPO_ID} ...")
    for f in files:
        try:
            upload_file(
                path_or_fileobj=f,
                path_in_repo=f,
                repo_id=REPO_ID,
                repo_type=REPO_TYPE,
                token=token,
                commit_message=f"Auto-cleanup sync {datetime.now(timezone.utc).isoformat()}",
            )
            print(f"✅ Uploaded: {f}")
        except Exception as e:
            print(f"⚠️ Failed to upload {f}: {e}")
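# path_in_repo reuses each local relative path, so the dataset mirrors the
# Space layout (persistent/... and data/...).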
# --------------------------------------------------------------------
# 5. Disk usage report
# --------------------------------------------------------------------
def report_usage():
    print("\n📊 Disk Usage Summary:")
    for path in ["persistent", "data", "/home/user"]:
        if os.path.exists(path):
            print(f"{path}: {readable_size(path)}")
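# Example output (illustrative values only):
#   persistent: 45.21 MB
#   data: 3.10 MB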
# --------------------------------------------------------------------
# Run everything
# --------------------------------------------------------------------
if __name__ == "__main__":
    start = time.time()
    print("===== 🧹 Starting Space Cleanup =====")
    clean_temp_and_cache()
    trim_data()
    files = preserve_key_files()
    upload_to_hub(files)
    report_usage()
    print(f"\n✅ Cleanup finished in {time.time() - start:.2f}s")