# ==========================================================
# SAFE-MODE PRELAUNCH CLEANUP
# ==========================================================
import os
import shutil
import glob

# Prevent the Svelte/Gradio SSR locale warning early.
os.environ["GRADIO_LOCALE"] = "en"

def _prelaunch_cleanup(threshold_gb: float = 45.0):
    """Pre-clean to avoid HF Spaces eviction while being conservative about persistent data."""
    def _used_gb(path="/home/user/app"):
        try:
            total, used, free = shutil.disk_usage(path)
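            # Cap the reported figure at 49.9 GB, presumably to keep logs under the
            # 50 GB Spaces quota; actual usage may be higher.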
            return round(min(used / (1024**3), 49.9), 2)
        except Exception:
            return 0.0

    used = _used_gb()
    print(f"\n💾 Startup disk usage: {used:.2f} GB")
    # Only perform aggressive cleanup when over the threshold.
    if used > threshold_gb:
        print(f"⚠️ Usage {used:.2f} GB > {threshold_gb} GB — performing aggressive cleanup.")
        # Preserve persistent / important artifacts by default.
        preserve = {"faiss.index", "faiss.index.meta.json", "glossary.json"}
        for folder in ["/home/user/app/data/docs_cache", "/home/user/app/tmp_docs"]:
            if os.path.exists(folder):
                for f in glob.glob(os.path.join(folder, "*")):
                    name = os.path.basename(f)
                    if name in preserve:
                        continue
                    try:
                        if os.path.isdir(f):
                            shutil.rmtree(f, ignore_errors=True)
                        else:
                            os.remove(f)
                    except Exception:
                        pass
        print("🧹 Aggressive cleanup complete.")
    print(f"✨ Disk after cleanup: {_used_gb():.2f} GB\n")

_prelaunch_cleanup()

# ==========================================================
# MAIN APP — Clinical Trial Chatbot
# ==========================================================
import gradio as gr
from core.hybrid_retriever import summarize_combined

APP_TITLE = "🧠 Clinical Research Chatbot"
APP_DESC = (
    "Ask any clinical research or GCP-related question. "
    "Retrieves and summarizes from ICH, GCDMP, EMA, FDA, Excel, and Web datasets."
)

# ----------------------------------------------------------
# MODE & CREDENTIALS
# ----------------------------------------------------------
PUBLIC_MODE = os.environ.get("PUBLIC_MODE", "true").lower() == "true"
ADMIN_USER = os.environ.get("ADMIN_USER", "admin")
ADMIN_PASS = os.environ.get("ADMIN_PASS", "changeme")
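# NOTE: the defaults above are placeholders; in a deployed Space, set ADMIN_USER,
# ADMIN_PASS (and PUBLIC_MODE) as Space secrets/variables instead.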
| print(f"🔐 Running in {'PUBLIC' if PUBLIC_MODE else 'ADMIN'} mode.") | |
| print(f"🌍 Locale set to: {os.environ.get('GRADIO_LOCALE','en')}") | |
| print(f"🧩 Env vars loaded: PUBLIC_MODE={PUBLIC_MODE}, ADMIN_USER={ADMIN_USER}") | |
# ----------------------------------------------------------
# AUTH HELPER
# ----------------------------------------------------------
def check_admin_login(username, password):
    return username == ADMIN_USER and password == ADMIN_PASS
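
# Gradio passes the submitted username/password to this callable and expects a boolean;
# it is wired in via launch(auth=...) at the bottom of the file.
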
# ----------------------------------------------------------
# MAINTENANCE FUNCTIONS
# ----------------------------------------------------------
import json
import faiss
import pandas as pd
import shutil as _shutil  # alias so local names cannot shadow the module
from sentence_transformers import SentenceTransformer
from core.vector_sync import rebuild_faiss_from_glossary, _upload_to_dataset
from huggingface_hub import hf_hub_download, list_repo_files

DATA_PATHS = [
    "/home/user/app/persistent/faiss.index",
    "/home/user/app/persistent/faiss.index.meta.json",
    "/home/user/app/data/docs_cache",
]
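# Note: glossary.json is not listed above, so clear_index() leaves the glossary
# intact; reset_faiss_cache() below handles the full wipe.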

def clear_index():
    removed = []
    for p in DATA_PATHS:
        if os.path.isdir(p):
            _shutil.rmtree(p, ignore_errors=True)
            removed.append(f"🗑️ Deleted folder: {p}")
        elif os.path.exists(p):
            os.remove(p)
            removed.append(f"🗑️ Deleted file: {p}")
    msg = "\n".join(removed) if removed else "ℹ️ No cache files found."
    print(msg)
    return msg

def rebuild_index():
    """Rebuild the FAISS index from glossary + Excel + web sources."""
    try:
        repo_id_index = "essprasad/CT-Chat-Index"
        repo_id_docs = "essprasad/CT-Chat-Docs"
        local_dir = "/home/user/app/persistent"
        os.makedirs(local_dir, exist_ok=True)
        print("🧠 Rebuilding FAISS index (Glossary + Excel + Web)…")

        # --- Ensure glossary.json exists (download it if missing).
        glossary_path = os.path.join(local_dir, "glossary.json")
        if not os.path.exists(glossary_path):
            try:
                print("📥 glossary.json missing locally — downloading from HF index dataset...")
                downloaded = hf_hub_download(repo_id=repo_id_index, filename="persistent/glossary.json", repo_type="dataset")
                # Copy to the local persistent path.
                _shutil.copy2(downloaded, glossary_path)
                print("✅ Downloaded glossary.json.")
            except Exception as e:
                print(f"⚠️ Could not download glossary.json: {e}. Proceeding; terms may still come from other sources.")

        # --- Rebuild FAISS from the glossary (returns an index object plus a metadata list).
        index, metas = rebuild_faiss_from_glossary(glossary_path=glossary_path)
        print(f"📘 Loaded {len(metas)} glossary entries.")

        # --- Index Excel (MRCT Glossary).
        print("📑 Scanning Excel files in dataset…")
        repo_files = list_repo_files(repo_id_docs, repo_type="dataset")
        excel_files = [f for f in repo_files if f.lower().endswith((".xlsx", ".xls"))]
        model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        excel_entries = []
        for file_name in excel_files:
            print(f"📄 Reading {file_name}…")
            try:
                path = hf_hub_download(repo_id_docs, filename=file_name, repo_type="dataset")
                xls = pd.read_excel(path, sheet_name=None)
                for sheet, df in xls.items():
                    if "Glossary Term" not in df.columns:
                        continue
                    # Drop fully empty rows first, then blank out the remaining NaNs.
                    df = df.dropna(how="all").fillna("")
                    for _, row in df.iterrows():
                        term = str(row.get("Glossary Term", "")).strip()
                        if not term:
                            continue
                        # Combine all the relevant MRCT fields into one passage.
                        combined_text = (
                            f"Glossary Term: {term}\n"
                            f"Glossary Definition: {row.get('Glossary Definition', '')}\n"
                            f"Use in Context: {row.get('Use in Context', '')}\n"
                            f"More Info: {row.get('More Info', '')}\n"
                            f"Other Info to Think About When Joining a Study: {row.get('Other Info to Think About When Joining a Study', '')}\n"
                            f"Related Terms: {row.get('Related Terms', '')}\n"
                            f"Other Resources: {row.get('Other Resources', '')}\n"
                            f"Term URL: {row.get('Term URL', '')}\n"
                            f"CDISC/NCI URL: {row.get('CDISC/NCI URL', '')}\n"
                            f"Version: {row.get('Version', '')}"
                        ).strip()
                        excel_entries.append({
                            "source": file_name,
                            "sheet": sheet,
                            "term": term,
                            "type": "Excel",
                            "file": file_name,
                            "text": combined_text,
                        })
            except Exception as e:
                print(f"⚠️ Error reading {file_name}: {e}")

        if excel_entries:
            texts = [e["text"] for e in excel_entries]
            embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True).astype("float32")
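            # L2-normalizing before add() makes inner-product search behave like cosine
            # similarity; this assumes the glossary index uses an inner-product metric.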
            faiss.normalize_L2(embeddings)
            index.add(embeddings)
            metas.extend(excel_entries)
            print(f"✅ Added {len(excel_entries)} Excel entries to FAISS.")

        # --- Optional: load web content (may be slow).
        try:
            # Imported lazily so a missing or broken web loader only skips this step.
            from core.web_loader import web_crawler_loader

            print("🌐 Loading and embedding web sources…")
            web_entries = web_crawler_loader(
                urls_file="/home/user/app/data/urls.txt",
                cache_path="/home/user/app/persistent/web_cache.json",
                max_pages=3,
                timeout=20,
                force_refresh=False,
            )
            if web_entries:
                # Keep only substantive pages (more than 200 characters of text).
                web_entries = [e for e in web_entries if len(e.get("text", "")) > 200]
                print(f"✅ Retrieved {len(web_entries)} web entries.")
                web_texts = [e["text"] for e in web_entries]
                web_emb = model.encode(web_texts, show_progress_bar=True, convert_to_numpy=True).astype("float32")
                faiss.normalize_L2(web_emb)
                index.add(web_emb)
                metas.extend(web_entries)
                print("✅ Web content added to FAISS.")
        except Exception as e:
            print(f"⚠️ Web content embedding failed: {e}")

        # --- Save the index + metadata locally.
        faiss_path = os.path.join(local_dir, "faiss.index")
        meta_path = os.path.join(local_dir, "faiss.index.meta.json")
        faiss.write_index(index, faiss_path)
        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump(metas, f, indent=2)
        print(f"💾 Local FAISS saved ({len(metas)} entries).")

        # --- Upload artifacts back to the HF dataset (best effort).
        try:
            _upload_to_dataset(faiss_path, meta_path, repo_id_index)
            print(f"☁️ Uploaded FAISS ({len(metas)} entries) to {repo_id_index}.")
        except Exception as e:
            print(f"⚠️ Upload failed: {e}")

        return f"✅ Rebuild complete: {len(metas)} entries (Glossary + Excel + Web)."
    except Exception as e:
        return f"⚠️ Rebuild failed: {e}"

def rebuild_glossary():
    try:
        from core.glossary_builder import rebuild_and_upload
        rebuild_and_upload()
        return "✅ Glossary rebuilt and uploaded successfully."
    except Exception as e:
        return f"⚠️ Glossary rebuild failed: {e}"

def reset_faiss_cache():
    """
    Completely clear the local FAISS and glossary caches, reload the vector_store
    module (to wipe in-memory runtime caches), then rebuild the glossary + index.
    """
    try:
        from importlib import reload
        from core import vector_store

        # If vector_store exposes clear_local_faiss, use it (safe and logged).
        if hasattr(vector_store, "clear_local_faiss"):
            vector_store.clear_local_faiss()
        else:
            # Fallback: manually delete the persistent/runtime files.
            paths = [
                "/home/user/app/persistent/faiss.index",
                "/home/user/app/persistent/faiss.index.meta.json",
                "/home/user/app/persistent/glossary.json",
                "/home/user/app/runtime_faiss",
            ]
            for p in paths:
                if os.path.exists(p):
                    try:
                        if os.path.isdir(p):
                            _shutil.rmtree(p, ignore_errors=True)
                        else:
                            os.remove(p)
                        print(f"🗑️ Deleted: {p}")
                    except Exception:
                        pass

        # Reload the module to clear any in-memory caches.
        reload(vector_store)
        print("♻️ FAISS runtime module reloaded to ensure a fresh index rebuild.")
| msg = "🧹 Local FAISS + glossary cache cleared. Starting full rebuild...\n\n" | |
| msg += rebuild_glossary() + "\n" | |
| msg += rebuild_index() | |
| return msg | |
| except Exception as e: | |
| return f"⚠️ Reset failed: {e}" | |
# ----------------------------------------------------------
# CHATBOT CORE
# ----------------------------------------------------------
def chat_answer(query, mode="short"):
    try:
        if not query or not str(query).strip():
            return "<i>⚠️ Please enter a valid query.</i>"
        return summarize_combined(str(query).strip(), mode=mode)
    except Exception as e:
        print("❌ Chatbot error:", e)
        return f"<i>⚠️ Error: {e}</i>"

# ----------------------------------------------------------
# GRADIO UI
# ----------------------------------------------------------
with gr.Blocks(theme="gradio/soft") as demo:
    gr.Markdown(f"# {APP_TITLE}")
    gr.Markdown(APP_DESC)

    query_box = gr.Textbox(
        label="Ask your clinical trial question",
        placeholder="e.g. What is an eCRF?",
        lines=2,
    )
    output_box = gr.HTML(label="Answer")

    with gr.Row():
        submit_btn = gr.Button("🚀 Submit", variant="primary")
        if not PUBLIC_MODE:
            rebuild_btn = gr.Button("🔁 Rebuild Index")
            rebuild_glossary_btn = gr.Button("📘 Rebuild Glossary")
            reset_btn = gr.Button("🧹 Reset FAISS Cache (Full Rebuild)")
            clear_btn = gr.Button("🗑️ Clear Index Only")

    submit_btn.click(fn=chat_answer, inputs=[query_box], outputs=output_box)
    query_box.submit(fn=chat_answer, inputs=[query_box], outputs=output_box)
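    # Only the query is wired in, so chat_answer() always runs with its default mode="short".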

    if not PUBLIC_MODE:
        rebuild_btn.click(fn=rebuild_index, outputs=output_box)
        rebuild_glossary_btn.click(fn=rebuild_glossary, outputs=output_box)
        reset_btn.click(fn=reset_faiss_cache, outputs=output_box)
        clear_btn.click(fn=clear_index, outputs=output_box)

# ----------------------------------------------------------
# LAUNCH APP
# ----------------------------------------------------------
if __name__ == "__main__":
    print("🚀 Starting Clinical Trial Chatbot…")
    print("🧠 Initializing retriever warm-up…")
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        auth=check_admin_login if not PUBLIC_MODE else None,
        ssr_mode=False,
    )