#!/usr/bin/env python3
"""
cleanup_space.py
----------------
Maintenance script for Hugging Face Space cleanup.
- Removes caches, temp files, and large unneeded assets.
- Keeps only FAISS index + metadata + glossary.
- Reuploads them to CT-Chat-Index dataset.
"""
# NOTE(review): removed `subprocess.run(["python", "cleanup_space.py"], check=False)`
# that previously executed here at import time. The script spawned a fresh copy
# of ITSELF on every run, and each child did the same -- unbounded recursive
# process creation. The `subprocess` import is kept so any external tooling that
# relies on it being present keeps working (file-level imports are not removed).
import subprocess  # retained; no longer used by this module itself
import os
import shutil
import time
from datetime import datetime
from huggingface_hub import HfApi, upload_file, HfFolder
# --- Configuration ---------------------------------------------------------
REPO_ID = "essprasad/CT-Chat-Index"  # Dataset repo
REPO_TYPE = "dataset"
PERSISTENT_DIR = "persistent"  # local dir holding the FAISS index artifacts
DATA_DIR = "data"              # local dir holding glossary + public docs
# Only these files survive cleanup and get re-uploaded to the dataset repo.
KEEP_FILES = [
    "persistent/faiss.index",
    "persistent/faiss.index.meta.json",
    "data/glossary.json"
]
api = HfApi()
# Prefer the locally cached Hub token; fall back to the HF_TOKEN env var.
# NOTE(review): HfFolder.get_token() is deprecated in recent huggingface_hub
# releases in favor of huggingface_hub.get_token() -- confirm against the
# version pinned for this Space.
token = HfFolder.get_token() or os.getenv("HF_TOKEN", None)
def readable_size(path):
    """Return the recursive size of *path* as a human-readable string."""
    # Total up every regular file under the tree; the existence check skips
    # broken symlinks that os.walk may still list.
    total = float(sum(
        os.path.getsize(fp)
        for dirpath, _, names in os.walk(path)
        for fp in (os.path.join(dirpath, n) for n in names)
        if os.path.exists(fp)
    ))
    # Scale down by 1024 until the value fits its unit; anything past GB
    # falls through to TB.
    for unit in ("B", "KB", "MB", "GB"):
        if total < 1024.0:
            return f"{total:.2f} {unit}"
        total /= 1024.0
    return f"{total:.2f} TB"
# --------------------------------------------------------------------
# 1. Clean caches, logs, temp files
# --------------------------------------------------------------------
def clean_temp_and_cache():
    """Recreate cache/temp directories empty and delete oversized logs."""
    print("π§Ή Cleaning temporary and cache directories...")
    for cache_dir in ("/root/.cache", "/home/user/.cache", "/tmp"):
        # Wipe the whole tree (ignoring errors on busy/readonly entries),
        # then put an empty directory back in its place.
        shutil.rmtree(cache_dir, ignore_errors=True)
        os.makedirs(cache_dir, exist_ok=True)
    # Drop any log file that has grown past 5 MB.
    log_dir = "logs"
    size_cap = 5 * 1024 * 1024
    if os.path.exists(log_dir):
        for entry in os.listdir(log_dir):
            entry_path = os.path.join(log_dir, entry)
            if os.path.isfile(entry_path) and os.path.getsize(entry_path) > size_cap:
                os.remove(entry_path)
                print(f"ποΈ Removed oversized log: {entry_path}")
# --------------------------------------------------------------------
# 2. Remove large documents & orphan files
# --------------------------------------------------------------------
def trim_data():
    """Delete every file larger than 10 MB under data/public_docs."""
    print("π¦ Trimming large files from data/public_docs...")
    doc_dir = os.path.join(DATA_DIR, "public_docs")
    if not os.path.exists(doc_dir):
        return
    size_cap = 10 * 1024 * 1024  # 10 MB threshold
    for parent, _, names in os.walk(doc_dir):
        for name in names:
            candidate = os.path.join(parent, name)
            if os.path.getsize(candidate) > size_cap:
                print(f"ποΈ Removing large doc: {candidate}")
                os.remove(candidate)
# --------------------------------------------------------------------
# 3. Verify and keep only essential files
# --------------------------------------------------------------------
def preserve_key_files():
    """Report which KEEP_FILES exist on disk and return that subset."""
    print("π Preserving essential files (index + glossary)...")
    found = []
    for candidate in KEEP_FILES:
        # Warn (but don't fail) when an expected artifact is absent.
        if not os.path.exists(candidate):
            print(f"β οΈ Missing expected file: {candidate}")
            continue
        print(f"β Keeping: {candidate}")
        found.append(candidate)
    return found
# --------------------------------------------------------------------
# 4. Upload cleaned files to dataset
# --------------------------------------------------------------------
def upload_to_hub(files):
    """Push each path in *files* to the REPO_ID dataset repo.

    Does nothing (with a warning) when no write token is available.
    Individual upload failures are logged and do not abort the batch.
    """
    if not token:
        print("β No HF token found. Please add HF_TOKEN with write access.")
        return
    print(f"π Uploading cleaned files to {REPO_ID} ...")
    # Compute one timezone-aware timestamp for the whole batch so every file
    # uploaded in this run shares the same commit message. The original called
    # datetime.utcnow() per file, which is deprecated (naive datetime) and
    # produced a different message for each upload.
    from datetime import timezone  # local import: leaves the module import block untouched
    stamp = datetime.now(timezone.utc).isoformat()
    for f in files:
        try:
            upload_file(
                path_or_fileobj=f,
                path_in_repo=f,
                repo_id=REPO_ID,
                repo_type=REPO_TYPE,
                token=token,
                commit_message=f"Auto-cleanup sync {stamp}"
            )
            print(f"β Uploaded: {f}")
        except Exception as e:
            # Best-effort: report the failure and continue with the rest.
            print(f"β οΈ Failed to upload {f}: {e}")
# --------------------------------------------------------------------
# 5. Disk usage report
# --------------------------------------------------------------------
def report_usage():
    """Print a human-readable size summary of the Space's key directories."""
    print("\nπ Disk Usage Summary:")
    for target in ("persistent", "data", "/home/user"):
        if not os.path.exists(target):
            continue
        print(f"{target}: {readable_size(target)}")
# --------------------------------------------------------------------
# Run everything
# --------------------------------------------------------------------
if __name__ == "__main__":
    started_at = time.time()
    print("===== π§Ή Starting Space Cleanup =====")
    # Order matters: clean first, trim next, then figure out what survived
    # and push that survivor set back to the Hub before reporting usage.
    clean_temp_and_cache()
    trim_data()
    kept = preserve_key_files()
    upload_to_hub(kept)
    report_usage()
    print(f"\nβ Cleanup finished in {time.time() - started_at:.2f}s")