# ==========================================================
# SAFE-MODE PRELAUNCH CLEANUP
# ==========================================================
import os
import shutil
import time
import glob

# Prevent Svelte/Gradio SSR locale warning early
os.environ["GRADIO_LOCALE"] = "en"


def _prelaunch_cleanup(threshold_gb: float = 45.0):
    """Pre-clean to avoid HF Spaces eviction while being conservative about persistent data."""

    def _used_gb(path="/home/user/app"):
        try:
            total, used, free = shutil.disk_usage(path)
            return round(min(used / (1024**3), 49.9), 2)
        except Exception:
            return 0.0

    used = _used_gb()
    print(f"\n💾 Startup disk usage: {used:.2f} GB")

    # Only perform aggressive cleanup when over the threshold.
    if used > threshold_gb:
        print(f"⚠️ Usage {used:.2f} GB > {threshold_gb} GB — performing aggressive cleanup.")
        # Preserve persistent / important artifacts by default.
        preserve = {"faiss.index", "faiss.index.meta.json", "glossary.json"}
        for folder in ["/home/user/app/data/docs_cache", "/home/user/app/tmp_docs"]:
            if os.path.exists(folder):
                for f in glob.glob(os.path.join(folder, "*")):
                    name = os.path.basename(f)
                    if name in preserve:
                        continue
                    try:
                        if os.path.isdir(f):
                            shutil.rmtree(f, ignore_errors=True)
                        else:
                            os.remove(f)
                    except Exception:
                        pass
        print("🧹 Aggressive cleanup complete.")
    print(f"✨ Disk after cleanup: {_used_gb():.2f} GB\n")


_prelaunch_cleanup()

# ==========================================================
# MAIN APP — Clinical Trial Chatbot
# ==========================================================
import gradio as gr
from core.hybrid_retriever import summarize_combined

APP_TITLE = "🧠 Clinical Research Chatbot"
APP_DESC = (
    "Ask any clinical research or GCP-related question. "
    "Retrieves and summarizes from ICH, GCDMP, EMA, FDA, Excel, and Web datasets."
)

# ----------------------------------------------------------
# MODE & CREDENTIALS
# ----------------------------------------------------------
PUBLIC_MODE = os.environ.get("PUBLIC_MODE", "true").lower() == "true"
ADMIN_USER = os.environ.get("ADMIN_USER", "admin")
ADMIN_PASS = os.environ.get("ADMIN_PASS", "changeme")

print(f"🔐 Running in {'PUBLIC' if PUBLIC_MODE else 'ADMIN'} mode.")
print(f"🌍 Locale set to: {os.environ.get('GRADIO_LOCALE', 'en')}")
print(f"🧩 Env vars loaded: PUBLIC_MODE={PUBLIC_MODE}, ADMIN_USER={ADMIN_USER}")

# ----------------------------------------------------------
# AUTH HELPER
# ----------------------------------------------------------
def check_admin_login(username, password):
    return username == ADMIN_USER and password == ADMIN_PASS

# ----------------------------------------------------------
# MAINTENANCE FUNCTIONS
# ----------------------------------------------------------
import json
import faiss
import pandas as pd
import numpy as np
import shutil as _shutil  # alias to avoid shadowed name
from sentence_transformers import SentenceTransformer
from core.vector_sync import rebuild_faiss_from_glossary, _upload_to_dataset
from huggingface_hub import hf_hub_download, list_repo_files

DATA_PATHS = [
    "/home/user/app/persistent/faiss.index",
    "/home/user/app/persistent/faiss.index.meta.json",
    "/home/user/app/data/docs_cache",
]


def clear_index():
    removed = []
    for p in DATA_PATHS:
        if os.path.isdir(p):
            _shutil.rmtree(p, ignore_errors=True)
            removed.append(f"🗑️ Deleted folder: {p}")
        elif os.path.exists(p):
            os.remove(p)
            removed.append(f"🗑️ Deleted file: {p}")
    msg = "\n".join(removed) if removed else "ℹ️ No cache files found."
    print(msg)
    return msg
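# A note on assumed contracts (inferred from how these helpers are called in
# this file, not from core.vector_sync itself): rebuild_faiss_from_glossary()
# is expected to return a (faiss.Index, list[dict]) pair, and
# _upload_to_dataset(faiss_path, meta_path, repo_id) is expected to push both
# artifacts to the given HF dataset repo.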

def rebuild_index():
    """Rebuild FAISS index from glossary + Excel + web."""
    try:
        from core.web_loader import web_crawler_loader  # may raise; handled below

        repo_id_index = "essprasad/CT-Chat-Index"
        repo_id_docs = "essprasad/CT-Chat-Docs"
        local_dir = "/home/user/app/persistent"
        os.makedirs(local_dir, exist_ok=True)

        print("🧠 Rebuilding FAISS index (Glossary + Excel + Web)…")

        # --- Ensure glossary.json exists (download if missing).
        glossary_path = os.path.join(local_dir, "glossary.json")
        if not os.path.exists(glossary_path):
            try:
                print("📥 glossary.json missing locally — downloading from HF index dataset...")
                downloaded = hf_hub_download(
                    repo_id=repo_id_index,
                    filename="persistent/glossary.json",
                    repo_type="dataset",
                )
                # Copy to the local persistent path.
                _shutil.copy2(downloaded, glossary_path)
                print("✅ Downloaded glossary.json.")
            except Exception as e:
                print(f"⚠️ Could not download glossary.json: {e}. Proceeding if available in other sources.")

        # --- Rebuild FAISS from glossary (returns an index object and a metadata list).
        index, metas = rebuild_faiss_from_glossary(glossary_path=glossary_path)
        print(f"📘 Loaded {len(metas)} glossary entries.")

        # --- Index Excel files (MRCT Glossary).
        print("📑 Scanning Excel files in dataset…")
        repo_files = list_repo_files(repo_id_docs, repo_type="dataset")
        excel_files = [f for f in repo_files if f.lower().endswith((".xlsx", ".xls"))]

        model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        excel_entries = []
        for file_name in excel_files:
            print(f"📄 Reading {file_name}…")
            try:
                path = hf_hub_download(repo_id_docs, filename=file_name, repo_type="dataset")
                xls = pd.read_excel(path, sheet_name=None)
                for sheet, df in xls.items():
                    if "Glossary Term" not in df.columns:
                        continue
                    # Drop fully empty rows first, then blank out remaining NaNs.
                    df = df.dropna(how="all").fillna("")
                    for _, row in df.iterrows():
                        term = str(row.get("Glossary Term", "")).strip()
                        if not term:
                            continue
                        # Combine all the relevant MRCT fields.
                        combined_text = (
                            f"Glossary Term: {term}\n"
                            f"Glossary Definition: {row.get('Glossary Definition', '')}\n"
                            f"Use in Context: {row.get('Use in Context', '')}\n"
                            f"More Info: {row.get('More Info', '')}\n"
                            f"Other Info to Think About When Joining a Study: {row.get('Other Info to Think About When Joining a Study', '')}\n"
                            f"Related Terms: {row.get('Related Terms', '')}\n"
                            f"Other Resources: {row.get('Other Resources', '')}\n"
                            f"Term URL: {row.get('Term URL', '')}\n"
                            f"CDISC/NCI URL: {row.get('CDISC/NCI URL', '')}\n"
                            f"Version: {row.get('Version', '')}"
                        ).strip()
                        excel_entries.append({
                            "source": file_name,
                            "sheet": sheet,
                            "term": term,
                            "type": "Excel",
                            "file": file_name,
                            "text": combined_text,
                        })
            except Exception as e:
                print(f"⚠️ Error reading {file_name}: {e}")

        if excel_entries:
            texts = [e["text"] for e in excel_entries]
            embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True).astype("float32")
            faiss.normalize_L2(embeddings)
            index.add(embeddings)
            metas.extend(excel_entries)
            print(f"✅ Added {len(excel_entries)} Excel entries to FAISS.")

        # --- Optional: load web content (may be slow).
        try:
            print("🌐 Loading and embedding web sources…")
            web_entries = web_crawler_loader(
                urls_file="/home/user/app/data/urls.txt",
                cache_path="/home/user/app/persistent/web_cache.json",
                max_pages=3,
                timeout=20,
                force_refresh=False,
            )
            if web_entries:
                # Keep only substantive pages (> 200 characters of text).
                web_entries = [e for e in web_entries if len(e.get("text", "")) > 200]
                print(f"✅ Retrieved {len(web_entries)} web entries.")
                web_texts = [e["text"] for e in web_entries]
                web_emb = model.encode(web_texts, show_progress_bar=True, convert_to_numpy=True).astype("float32")
                faiss.normalize_L2(web_emb)
                index.add(web_emb)
                metas.extend(web_entries)
                print("✅ Web content added to FAISS.")
        except Exception as e:
            print(f"⚠️ Web content embedding failed: {e}")
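        # Note: faiss.normalize_L2 above makes inner-product search behave as
        # cosine similarity. This assumes rebuild_faiss_from_glossary() built an
        # inner-product index (e.g. faiss.IndexFlatIP) from the same
        # all-MiniLM-L6-v2 embedder; if the glossary vectors use a different
        # model or metric, scores across sources will not be comparable.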
        # --- Save index + meta locally.
        faiss_path = os.path.join(local_dir, "faiss.index")
        meta_path = os.path.join(local_dir, "faiss.index.meta.json")
        faiss.write_index(index, faiss_path)
        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump(metas, f, indent=2)
        print(f"💾 Local FAISS saved ({len(metas)} entries).")

        # --- Upload artifacts back to the HF dataset (best-effort).
        try:
            _upload_to_dataset(faiss_path, meta_path, repo_id_index)
            print(f"☁️ Uploaded FAISS ({len(metas)} entries) to {repo_id_index}.")
        except Exception as e:
            print(f"⚠️ Upload failed: {e}")

        return f"✅ Rebuild complete: {len(metas)} entries (Glossary + Excel + Web)."
    except Exception as e:
        return f"⚠️ Rebuild failed: {e}"


def rebuild_glossary():
    try:
        from core.glossary_builder import rebuild_and_upload
        rebuild_and_upload()
        return "✅ Glossary rebuilt and uploaded successfully."
    except Exception as e:
        return f"⚠️ Glossary rebuild failed: {e}"


def reset_faiss_cache():
    """
    Completely clears local FAISS and glossary caches, reloads the vector_store
    module (to wipe in-memory runtime caches), then rebuilds glossary + index.
    """
    try:
        # Use the clear helper from core.vector_store if available.
        from importlib import reload
        from core import vector_store

        # If vector_store exposes clear_local_faiss, use it (safe and logged).
        if hasattr(vector_store, "clear_local_faiss"):
            vector_store.clear_local_faiss()
        else:
            # Fallback: manually delete persistent/runtime files.
            paths = [
                "/home/user/app/persistent/faiss.index",
                "/home/user/app/persistent/faiss.index.meta.json",
                "/home/user/app/persistent/glossary.json",
                "/home/user/app/runtime_faiss",
            ]
            for p in paths:
                if os.path.exists(p):
                    try:
                        if os.path.isdir(p):
                            _shutil.rmtree(p, ignore_errors=True)
                        else:
                            os.remove(p)
                        print(f"🗑️ Deleted: {p}")
                    except Exception:
                        pass

        # Reload the module to clear any in-memory caches.
        reload(vector_store)
        print("♻️ FAISS runtime module reloaded to ensure fresh index rebuild.")

        msg = "🧹 Local FAISS + glossary cache cleared. Starting full rebuild...\n\n"
        msg += rebuild_glossary() + "\n"
        msg += rebuild_index()
        return msg
    except Exception as e:
        return f"⚠️ Reset failed: {e}"

# ----------------------------------------------------------
# CHATBOT CORE
# ----------------------------------------------------------
def chat_answer(query, mode="short"):
    try:
        if not query or not str(query).strip():
            return "⚠️ Please enter a valid query."
        return summarize_combined(str(query).strip(), mode=mode)
    except Exception as e:
        print("❌ Chatbot error:", e)
        return f"⚠️ Error: {e}"
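# Note: the UI below always calls chat_answer() with its default mode="short";
# any other summary modes summarize_combined() may accept are not exposed here.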
# ----------------------------------------------------------
# GRADIO UI
# ----------------------------------------------------------
with gr.Blocks(theme="gradio/soft") as demo:
    gr.Markdown(f"# {APP_TITLE}")
    gr.Markdown(APP_DESC)

    query_box = gr.Textbox(
        label="Ask your clinical trial question",
        placeholder="e.g. What is an eCRF?",
        lines=2,
    )
    output_box = gr.HTML(label="Answer")

    with gr.Row():
        submit_btn = gr.Button("🚀 Submit", variant="primary")
        if not PUBLIC_MODE:
            rebuild_btn = gr.Button("🔁 Rebuild Index")
            rebuild_glossary_btn = gr.Button("📘 Rebuild Glossary")
            reset_btn = gr.Button("🧹 Reset FAISS Cache (Full Rebuild)")
            clear_btn = gr.Button("🗑️ Clear Index Only")

    submit_btn.click(fn=chat_answer, inputs=[query_box], outputs=output_box)
    query_box.submit(fn=chat_answer, inputs=[query_box], outputs=output_box)

    if not PUBLIC_MODE:
        rebuild_btn.click(fn=rebuild_index, outputs=output_box)
        rebuild_glossary_btn.click(fn=rebuild_glossary, outputs=output_box)
        reset_btn.click(fn=reset_faiss_cache, outputs=output_box)
        clear_btn.click(fn=clear_index, outputs=output_box)

# ----------------------------------------------------------
# LAUNCH APP
# ----------------------------------------------------------
if __name__ == "__main__":
    print("🚀 Starting Clinical Trial Chatbot…")
    print("🧠 Initializing retriever warm-up…")
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        auth=check_admin_login if not PUBLIC_MODE else None,
        ssr_mode=False,
    )
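# Local usage (a sketch; the hard-coded paths above assume the HF Spaces layout
# under /home/user/app, and "app.py" is an assumed entry-point name):
#   PUBLIC_MODE=false ADMIN_USER=me ADMIN_PASS=secret python app.py
# With PUBLIC_MODE=false, the maintenance buttons are rendered and Gradio
# prompts for the admin credentials via auth=check_admin_login.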