""" cleanup_space.py ---------------- Maintenance script for Hugging Face Space cleanup. - Removes caches, temp files, and large unneeded assets. - Keeps only FAISS index + metadata + glossary. - Reuploads them to CT-Chat-Index dataset. """ import subprocess subprocess.run(["python", "cleanup_space.py"], check=False) import os import shutil import time from datetime import datetime from huggingface_hub import HfApi, upload_file, HfFolder # ๐Ÿ”ง Configuration REPO_ID = "essprasad/CT-Chat-Index" # Dataset repo REPO_TYPE = "dataset" PERSISTENT_DIR = "persistent" DATA_DIR = "data" KEEP_FILES = [ "persistent/faiss.index", "persistent/faiss.index.meta.json", "data/glossary.json" ] api = HfApi() token = HfFolder.get_token() or os.getenv("HF_TOKEN", None) def readable_size(path): """Return human-readable folder size.""" total = 0 for dirpath, _, filenames in os.walk(path): for f in filenames: fp = os.path.join(dirpath, f) if os.path.exists(fp): total += os.path.getsize(fp) for unit in ["B", "KB", "MB", "GB"]: if total < 1024.0: return f"{total:.2f} {unit}" total /= 1024.0 return f"{total:.2f} TB" # -------------------------------------------------------------------- # 1. Clean caches, logs, temp files # -------------------------------------------------------------------- def clean_temp_and_cache(): print("๐Ÿงน Cleaning temporary and cache directories...") for path in ["/root/.cache", "/home/user/.cache", "/tmp"]: shutil.rmtree(path, ignore_errors=True) os.makedirs(path, exist_ok=True) # Remove logs larger than 5 MB log_dir = "logs" if os.path.exists(log_dir): for f in os.listdir(log_dir): fp = os.path.join(log_dir, f) if os.path.isfile(fp) and os.path.getsize(fp) > 5 * 1024 * 1024: os.remove(fp) print(f"๐Ÿ—‘๏ธ Removed oversized log: {fp}") # -------------------------------------------------------------------- # 2. 
# Remove large documents & orphan files
# --------------------------------------------------------------------
def trim_data():
    """Delete any file larger than 10 MB under data/public_docs.

    No-op when the directory does not exist.
    """
    print("๐Ÿ“ฆ Trimming large files from data/public_docs...")
    doc_dir = os.path.join(DATA_DIR, "public_docs")
    if not os.path.exists(doc_dir):
        return
    for root, _, files in os.walk(doc_dir):
        for f in files:
            fp = os.path.join(root, f)
            if os.path.getsize(fp) > 10 * 1024 * 1024:  # >10MB
                print(f"๐Ÿ—‘๏ธ Removing large doc: {fp}")
                os.remove(fp)


# --------------------------------------------------------------------
# 3. Verify and keep only essential files
# --------------------------------------------------------------------
def preserve_key_files():
    """Return the subset of KEEP_FILES present on disk, logging each check."""
    print("๐Ÿ”’ Preserving essential files (index + glossary)...")
    all_keep = []
    for f in KEEP_FILES:
        if os.path.exists(f):
            print(f"โœ… Keeping: {f}")
            all_keep.append(f)
        else:
            print(f"โš ๏ธ Missing expected file: {f}")
    return all_keep


# --------------------------------------------------------------------
# 4. Upload cleaned files to dataset
# --------------------------------------------------------------------
def upload_to_hub(files):
    """Upload *files* to the configured dataset repo.

    Skips everything (with a message) when no write token is available.
    Per-file failures are logged and do not abort the remaining uploads.
    """
    if not token:
        print("โŒ No HF token found. Please add HF_TOKEN with write access.")
        return
    # FIX(review): datetime.utcnow() is deprecated since Python 3.12 and
    # yields a naive timestamp; use a timezone-aware UTC timestamp instead.
    from datetime import timezone

    print(f"๐Ÿš€ Uploading cleaned files to {REPO_ID} ...")
    for f in files:
        try:
            upload_file(
                path_or_fileobj=f,
                path_in_repo=f,
                repo_id=REPO_ID,
                repo_type=REPO_TYPE,
                token=token,
                commit_message=f"Auto-cleanup sync {datetime.now(timezone.utc).isoformat()}",
            )
            print(f"โœ… Uploaded: {f}")
        except Exception as e:
            # Best-effort batch: report and continue with the next file.
            print(f"โš ๏ธ Failed to upload {f}: {e}")


# --------------------------------------------------------------------
# 5.
# Disk usage report
# --------------------------------------------------------------------
def report_usage():
    """Print a human-readable disk-usage summary for the key directories."""
    print("\n๐Ÿ“Š Disk Usage Summary:")
    for location in ["persistent", "data", "/home/user"]:
        if not os.path.exists(location):
            continue
        print(f"{location}: {readable_size(location)}")


# --------------------------------------------------------------------
# Run everything
# --------------------------------------------------------------------
if __name__ == "__main__":
    start = time.time()
    print("===== ๐Ÿงน Starting Space Cleanup =====")

    # Clean first, then decide what to keep, then sync and report.
    clean_temp_and_cache()
    trim_data()
    files = preserve_key_files()
    upload_to_hub(files)
    report_usage()

    print(f"\nโœ… Cleanup finished in {time.time() - start:.2f}s")