# ==========================================================
# SAFE-MODE PRELAUNCH CLEANUP
# ==========================================================
import os
import shutil
import glob
# Set the locale early (before gradio is imported) to prevent the
# Svelte/Gradio SSR locale warning.
os.environ["GRADIO_LOCALE"] = "en"
def _prelaunch_cleanup(threshold_gb: float = 45.0):
"""Pre-clean to avoid HF Spaces eviction while being conservative about persistent data."""
def _used_gb(path="/home/user/app"):
try:
            total, used, free = shutil.disk_usage(path)  # raw byte counts for the volume
            # Report GB used, clamped just under the ~50 GB Spaces disk quota.
            return round(min(used / (1024**3), 49.9), 2)
except Exception:
return 0.0
used = _used_gb()
print(f"\n💾 Startup disk usage: {used:.2f} GB")
# Only perform aggressive cleanup when over threshold.
if used > threshold_gb:
print(f"⚠️ Usage {used:.2f} GB > {threshold_gb} GB — performing aggressive cleanup.")
# preserve persistent / important artifacts by default
preserve = {"faiss.index", "faiss.index.meta.json", "glossary.json"}
for folder in ["/home/user/app/data/docs_cache", "/home/user/app/tmp_docs"]:
if os.path.exists(folder):
for f in glob.glob(os.path.join(folder, "*")):
name = os.path.basename(f)
if name in preserve:
continue
try:
if os.path.isdir(f):
shutil.rmtree(f, ignore_errors=True)
else:
os.remove(f)
except Exception:
pass
print("🧹 Aggressive cleanup complete.")
print(f"✨ Disk after cleanup: {_used_gb():.2f} GB\n")
_prelaunch_cleanup()
# ==========================================================
# MAIN APP — Clinical Trial Chatbot
# ==========================================================
import gradio as gr
from core.hybrid_retriever import summarize_combined
APP_TITLE = "🧠 Clinical Research Chatbot"
APP_DESC = (
"Ask any clinical research or GCP-related question. "
"Retrieves and summarizes from ICH, GCDMP, EMA, FDA, Excel, and Web datasets."
)
# ----------------------------------------------------------
# MODE & CREDENTIALS
# ----------------------------------------------------------
PUBLIC_MODE = os.environ.get("PUBLIC_MODE", "true").lower() == "true"
ADMIN_USER = os.environ.get("ADMIN_USER", "admin")
ADMIN_PASS = os.environ.get("ADMIN_PASS", "changeme")
print(f"🔐 Running in {'PUBLIC' if PUBLIC_MODE else 'ADMIN'} mode.")
print(f"🌍 Locale set to: {os.environ.get('GRADIO_LOCALE','en')}")
print(f"🧩 Env vars loaded: PUBLIC_MODE={PUBLIC_MODE}, ADMIN_USER={ADMIN_USER}")
# ----------------------------------------------------------
# AUTH HELPER
# ----------------------------------------------------------
def check_admin_login(username, password):
return username == ADMIN_USER and password == ADMIN_PASS
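
# Note: Gradio's `auth` parameter also accepts a static (username, password)
# tuple; the callable above is used so credentials come from env vars.
# Equivalent static form (illustrative only):
#   demo.launch(auth=(ADMIN_USER, ADMIN_PASS))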
# ----------------------------------------------------------
# MAINTENANCE FUNCTIONS
# ----------------------------------------------------------
import json
import faiss
import pandas as pd
import numpy as np
import shutil as _shutil  # local alias used by the maintenance helpers below
from sentence_transformers import SentenceTransformer
from core.vector_sync import rebuild_faiss_from_glossary, _upload_to_dataset
from huggingface_hub import hf_hub_download, list_repo_files
DATA_PATHS = [
"/home/user/app/persistent/faiss.index",
"/home/user/app/persistent/faiss.index.meta.json",
"/home/user/app/data/docs_cache",
]
def clear_index():
removed = []
for p in DATA_PATHS:
if os.path.isdir(p):
_shutil.rmtree(p, ignore_errors=True)
removed.append(f"🗑️ Deleted folder: {p}")
elif os.path.exists(p):
os.remove(p)
removed.append(f"🗑️ Deleted file: {p}")
msg = "\n".join(removed) if removed else "ℹ️ No cache files found."
print(msg)
return msg
def rebuild_index():
"""Rebuild FAISS index from glossary + Excel + web."""
try:
        # Heavy imports (json, pandas, faiss, numpy, SentenceTransformer) are
        # already available at module level; only the web loader needs a local
        # import. If it raises, the outer except reports the rebuild failure.
        from core.web_loader import web_crawler_loader
repo_id_index = "essprasad/CT-Chat-Index"
repo_id_docs = "essprasad/CT-Chat-Docs"
local_dir = "/home/user/app/persistent"
os.makedirs(local_dir, exist_ok=True)
print("🧠 Rebuilding FAISS index (Glossary + Excel + Web)…")
# --- Ensure glossary.json exists (download if missing)
glossary_path = os.path.join(local_dir, "glossary.json")
if not os.path.exists(glossary_path):
try:
print("📥 glossary.json missing locally — downloading from HF index dataset...")
downloaded = hf_hub_download(repo_id=repo_id_index, filename="persistent/glossary.json", repo_type="dataset")
# copy to local persistent path
_shutil.copy2(downloaded, glossary_path)
print("✅ Downloaded glossary.json.")
except Exception as e:
print(f"⚠️ Could not download glossary.json: {e}. Proceeding if available in other sources.")
# Rebuild FAISS from glossary (this returns an index object and metadata list)
index, metas = rebuild_faiss_from_glossary(glossary_path=glossary_path)
print(f"📘 Loaded {len(metas)} glossary entries.")
        # --- Index Excel sheets (MRCT glossary)
print("📑 Scanning Excel files in dataset…")
repo_files = list_repo_files(repo_id_docs, repo_type="dataset")
excel_files = [f for f in repo_files if f.lower().endswith((".xlsx", ".xls"))]
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
excel_entries = []
for file_name in excel_files:
print(f"📄 Reading {file_name}…")
try:
path = hf_hub_download(repo_id_docs, filename=file_name, repo_type="dataset")
xls = pd.read_excel(path, sheet_name=None)
for sheet, df in xls.items():
if "Glossary Term" not in df.columns:
continue
                    df = df.dropna(how="all").fillna("")  # drop empty rows first; fillna first would make dropna a no-op
for _, row in df.iterrows():
term = str(row.get("Glossary Term", "")).strip()
if not term:
continue
# Combine all the relevant MRCT fields
combined_text = (
f"Glossary Term: {term}\n"
f"Glossary Definition: {row.get('Glossary Definition','')}\n"
f"Use in Context: {row.get('Use in Context','')}\n"
f"More Info: {row.get('More Info','')}\n"
f"Other Info to Think About When Joining a Study: {row.get('Other Info to Think About When Joining a Study','')}\n"
f"Related Terms: {row.get('Related Terms','')}\n"
f"Other Resources: {row.get('Other Resources','')}\n"
f"Term URL: {row.get('Term URL','')}\n"
f"CDISC/NCI URL: {row.get('CDISC/NCI URL','')}\n"
f"Version: {row.get('Version','')}"
).strip()
excel_entries.append({
"source": file_name,
"sheet": sheet,
"term": term,
"type": "Excel",
"file": file_name,
"text": combined_text
})
except Exception as e:
print(f"⚠️ Error reading {file_name}: {e}")
if excel_entries:
texts = [e["text"] for e in excel_entries]
embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True).astype("float32")
faiss.normalize_L2(embeddings)
index.add(embeddings)
metas.extend(excel_entries)
print(f"✅ Added {len(excel_entries)} Excel entries to FAISS.")
# ---- Optional: Load web content (may be slow)
try:
print("🌐 Loading and embedding web sources…")
web_entries = web_crawler_loader(
urls_file="/home/user/app/data/urls.txt",
cache_path="/home/user/app/persistent/web_cache.json",
max_pages=3,
timeout=20,
force_refresh=False,
)
if web_entries:
                web_entries = [e for e in web_entries if len(e.get("text", "")) > 200]  # drop near-empty pages
print(f"✅ Retrieved {len(web_entries)} web entries.")
web_texts = [e["text"] for e in web_entries]
web_emb = model.encode(web_texts, show_progress_bar=True, convert_to_numpy=True).astype("float32")
faiss.normalize_L2(web_emb)
index.add(web_emb)
metas.extend(web_entries)
print("✅ Web content added to FAISS.")
except Exception as e:
print(f"⚠️ Web content embedding failed: {e}")
# --- Save index + meta locally
faiss_path = os.path.join(local_dir, "faiss.index")
meta_path = os.path.join(local_dir, "faiss.index.meta.json")
faiss.write_index(index, faiss_path)
with open(meta_path, "w", encoding="utf-8") as f:
json.dump(metas, f, indent=2)
print(f"💾 Local FAISS saved ({len(metas)} entries).")
# --- Upload artifacts back to HF dataset (best-effort)
try:
_upload_to_dataset(faiss_path, meta_path, repo_id_index)
print(f"☁️ Uploaded FAISS ({len(metas)} entries) to {repo_id_index}.")
except Exception as e:
print(f"⚠️ Upload failed: {e}")
return f"✅ Rebuild complete: {len(metas)} entries (Glossary + Excel + Web)."
except Exception as e:
return f"⚠️ Rebuild failed: {e}"
def rebuild_glossary():
try:
from core.glossary_builder import rebuild_and_upload
rebuild_and_upload()
return "✅ Glossary rebuilt and uploaded successfully."
except Exception as e:
return f"⚠️ Glossary rebuild failed: {e}"
def reset_faiss_cache():
"""
Completely clears local FAISS and glossary caches, reloads the vector_store module
(to wipe in-memory runtime caches), then rebuilds glossary + index.
"""
try:
# Use the clear helper from core.vector_store if available
from importlib import reload
from core import vector_store
# If vector_store exposes clear_local_faiss, use it (safe and logged)
if hasattr(vector_store, "clear_local_faiss"):
vector_store.clear_local_faiss()
else:
# fallback: manually delete persistent/runtime files
paths = [
"/home/user/app/persistent/faiss.index",
"/home/user/app/persistent/faiss.index.meta.json",
"/home/user/app/persistent/glossary.json",
"/home/user/app/runtime_faiss",
]
for p in paths:
if os.path.exists(p):
try:
if os.path.isdir(p):
_shutil.rmtree(p, ignore_errors=True)
else:
os.remove(p)
print(f"🗑️ Deleted: {p}")
except Exception:
pass
# reload the module to clear any in-memory caches
reload(vector_store)
print("♻️ FAISS runtime module reloaded to ensure fresh index rebuild.")
msg = "🧹 Local FAISS + glossary cache cleared. Starting full rebuild...\n\n"
msg += rebuild_glossary() + "\n"
msg += rebuild_index()
return msg
except Exception as e:
return f"⚠️ Reset failed: {e}"
# ----------------------------------------------------------
# CHATBOT CORE
# ----------------------------------------------------------
def chat_answer(query, mode="short"):
try:
if not query or not str(query).strip():
return "⚠️ Please enter a valid query."
return summarize_combined(str(query).strip(), mode=mode)
except Exception as e:
print("❌ Chatbot error:", e)
return f"⚠️ Error: {e}"
# ----------------------------------------------------------
# GRADIO UI
# ----------------------------------------------------------
with gr.Blocks(theme="gradio/soft") as demo:
gr.Markdown(f"# {APP_TITLE}")
gr.Markdown(APP_DESC)
query_box = gr.Textbox(
label="Ask your clinical trial question",
placeholder="e.g. What is an eCRF?",
lines=2,
)
output_box = gr.HTML(label="Answer")
with gr.Row():
submit_btn = gr.Button("🚀 Submit", variant="primary")
if not PUBLIC_MODE:
rebuild_btn = gr.Button("🔁 Rebuild Index")
rebuild_glossary_btn = gr.Button("📘 Rebuild Glossary")
reset_btn = gr.Button("🧹 Reset FAISS Cache (Full Rebuild)")
clear_btn = gr.Button("🗑️ Clear Index Only")
submit_btn.click(fn=chat_answer, inputs=[query_box], outputs=output_box)
query_box.submit(fn=chat_answer, inputs=[query_box], outputs=output_box)
if not PUBLIC_MODE:
rebuild_btn.click(fn=rebuild_index, outputs=output_box)
rebuild_glossary_btn.click(fn=rebuild_glossary, outputs=output_box)
reset_btn.click(fn=reset_faiss_cache, outputs=output_box)
clear_btn.click(fn=clear_index, outputs=output_box)
# ----------------------------------------------------------
# LAUNCH APP
# ----------------------------------------------------------
if __name__ == "__main__":
print("🚀 Starting Clinical Trial Chatbot…")
print("🧠 Initializing retriever warm-up…")
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=False,
auth=check_admin_login if not PUBLIC_MODE else None,
ssr_mode=False,
)