"""
cleanup_space.py
----------------
Maintenance script for Hugging Face Space cleanup.
- Removes caches, temp files, and large unneeded assets.
- Keeps only FAISS index + metadata + glossary.
- Reuploads them to CT-Chat-Index dataset.
"""
import os
import shutil
import time
from datetime import datetime, timezone
from huggingface_hub import upload_file, HfFolder

# πŸ”§ Configuration
REPO_ID = "essprasad/CT-Chat-Index"   # Dataset repo
REPO_TYPE = "dataset"
PERSISTENT_DIR = "persistent"
DATA_DIR = "data"
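# These relative paths double as the path_in_repo targets in upload_to_hub(),
# so the dataset mirrors the Space's local file layout.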
KEEP_FILES = [
    "persistent/faiss.index",
    "persistent/faiss.index.meta.json",
    "data/glossary.json"
]

# Prefer the cached CLI login; fall back to the HF_TOKEN environment variable.
token = HfFolder.get_token() or os.getenv("HF_TOKEN")

def readable_size(path):
    """Return human-readable folder size."""
    total = 0
    for dirpath, _, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if os.path.exists(fp):
                total += os.path.getsize(fp)
    for unit in ["B", "KB", "MB", "GB"]:
        if total < 1024.0:
            return f"{total:.2f} {unit}"
        total /= 1024.0
    return f"{total:.2f} TB"
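
# Illustrative only: readable_size("persistent") might return "12.34 MB".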

# --------------------------------------------------------------------
# 1. Clean caches, logs, temp files
# --------------------------------------------------------------------
def clean_temp_and_cache():
    print("🧹 Cleaning temporary and cache directories...")
    # Wipe known cache locations, then recreate them empty so later writes
    # don't fail on a missing directory.
    for path in ["/root/.cache", "/home/user/.cache", "/tmp"]:
        shutil.rmtree(path, ignore_errors=True)
        os.makedirs(path, exist_ok=True)

    # Remove logs larger than 5 MB
    log_dir = "logs"
    if os.path.exists(log_dir):
        for f in os.listdir(log_dir):
            fp = os.path.join(log_dir, f)
            if os.path.isfile(fp) and os.path.getsize(fp) > 5 * 1024 * 1024:
                os.remove(fp)
                print(f"πŸ—‘οΈ Removed oversized log: {fp}")

# --------------------------------------------------------------------
# 2. Remove large documents & orphan files
# --------------------------------------------------------------------
def trim_data():
    print("πŸ“¦ Trimming large files from data/public_docs...")
    doc_dir = os.path.join(DATA_DIR, "public_docs")
    if not os.path.exists(doc_dir):
        return

    for root, _, files in os.walk(doc_dir):
        for f in files:
            fp = os.path.join(root, f)
            # isfile() guard: skip broken symlinks that would crash getsize().
            if os.path.isfile(fp) and os.path.getsize(fp) > 10 * 1024 * 1024:  # >10 MB
                print(f"πŸ—‘οΈ Removing large doc: {fp}")
                os.remove(fp)

# --------------------------------------------------------------------
# 3. Verify and keep only essential files
# --------------------------------------------------------------------
def preserve_key_files():
    print("πŸ”’ Preserving essential files (index + glossary)...")
    all_keep = []
    for f in KEEP_FILES:
        if os.path.exists(f):
            print(f"βœ… Keeping: {f}")
            all_keep.append(f)
        else:
            print(f"⚠️ Missing expected file: {f}")
    return all_keep

# --------------------------------------------------------------------
# 4. Upload cleaned files to dataset
# --------------------------------------------------------------------
def upload_to_hub(files):
    if not token:
        print("❌ No HF token found. Please add HF_TOKEN with write access.")
        return
    print(f"πŸš€ Uploading cleaned files to {REPO_ID} ...")
    for f in files:
        try:
            upload_file(
                path_or_fileobj=f,
                path_in_repo=f,
                repo_id=REPO_ID,
                repo_type=REPO_TYPE,
                token=token,
                commit_message=f"Auto-cleanup sync {datetime.now(timezone.utc).isoformat()}"
            )
            print(f"βœ… Uploaded: {f}")
        except Exception as e:
            print(f"⚠️ Failed to upload {f}: {e}")

# --------------------------------------------------------------------
# 5. Disk usage report
# --------------------------------------------------------------------
def report_usage():
    print("\nπŸ“Š Disk Usage Summary:")
    for path in [PERSISTENT_DIR, DATA_DIR, "/home/user"]:
        if os.path.exists(path):
            print(f"{path}: {readable_size(path)}")

# --------------------------------------------------------------------
# Run everything
# --------------------------------------------------------------------
if __name__ == "__main__":
    start = time.time()
    print("===== 🧹 Starting Space Cleanup =====")
    clean_temp_and_cache()
    trim_data()
    files = preserve_key_files()
    upload_to_hub(files)
    report_usage()
    print(f"\nβœ… Cleanup finished in {time.time() - start:.2f}s")