# ==========================================================
# SAFE-MODE PRELAUNCH CLEANUP
# ==========================================================
import os
import shutil
import glob

# Prevent the Svelte/Gradio SSR locale warning early.
os.environ["GRADIO_LOCALE"] = "en"

def _prelaunch_cleanup(threshold_gb: float = 45.0):
    """Pre-clean to avoid HF Spaces eviction while being conservative about persistent data."""
    def _used_gb(path="/home/user/app"):
        try:
            total, used, free = shutil.disk_usage(path)
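            # Cap the reported figure at 49.9 GB, presumably to keep logs under the
            # 50 GB Spaces quota; actual usage may be higher.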
            return round(min(used / (1024**3), 49.9), 2)
        except Exception:
            return 0.0

    used = _used_gb()
    print(f"\n💾 Startup disk usage: {used:.2f} GB")
    # Only perform aggressive cleanup when over the threshold.
    if used > threshold_gb:
        print(f"⚠️ Usage {used:.2f} GB > {threshold_gb} GB — performing aggressive cleanup.")
        # Preserve persistent / important artifacts by default.
        preserve = {"faiss.index", "faiss.index.meta.json", "glossary.json"}
        for folder in ["/home/user/app/data/docs_cache", "/home/user/app/tmp_docs"]:
            if os.path.exists(folder):
                for f in glob.glob(os.path.join(folder, "*")):
                    name = os.path.basename(f)
                    if name in preserve:
                        continue
                    try:
                        if os.path.isdir(f):
                            shutil.rmtree(f, ignore_errors=True)
                        else:
                            os.remove(f)
                    except Exception:
                        pass
        print("🧹 Aggressive cleanup complete.")
    print(f"✨ Disk after cleanup: {_used_gb():.2f} GB\n")

_prelaunch_cleanup()

# ==========================================================
# MAIN APP — Clinical Trial Chatbot
# ==========================================================
import gradio as gr
from core.hybrid_retriever import summarize_combined

APP_TITLE = "🧠 Clinical Research Chatbot"
APP_DESC = (
    "Ask any clinical research or GCP-related question. "
    "Retrieves and summarizes from ICH, GCDMP, EMA, FDA, Excel, and Web datasets."
)

# ----------------------------------------------------------
# MODE & CREDENTIALS
# ----------------------------------------------------------
PUBLIC_MODE = os.environ.get("PUBLIC_MODE", "true").lower() == "true"
ADMIN_USER = os.environ.get("ADMIN_USER", "admin")
ADMIN_PASS = os.environ.get("ADMIN_PASS", "changeme")
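# NOTE: the defaults above are placeholders; in a deployed Space, set ADMIN_USER,
# ADMIN_PASS (and PUBLIC_MODE) as Space secrets/variables instead.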
| print(f"🔐 Running in {'PUBLIC' if PUBLIC_MODE else 'ADMIN'} mode.") | |
| print(f"🌍 Locale set to: {os.environ.get('GRADIO_LOCALE','en')}") | |
| print(f"🧩 Env vars loaded: PUBLIC_MODE={PUBLIC_MODE}, ADMIN_USER={ADMIN_USER}") | |
# ----------------------------------------------------------
# AUTH HELPER
# ----------------------------------------------------------
def check_admin_login(username, password):
    return username == ADMIN_USER and password == ADMIN_PASS
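
# Gradio passes the submitted username/password to this callable and expects a boolean;
# it is wired in via launch(auth=...) at the bottom of the file.
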
# ----------------------------------------------------------
# MAINTENANCE FUNCTIONS
# ----------------------------------------------------------
import json
import faiss
import pandas as pd
import shutil as _shutil  # alias so local names cannot shadow the module
from sentence_transformers import SentenceTransformer
from core.vector_sync import rebuild_faiss_from_glossary, _upload_to_dataset
from huggingface_hub import hf_hub_download, list_repo_files

DATA_PATHS = [
    "/home/user/app/persistent/faiss.index",
    "/home/user/app/persistent/faiss.index.meta.json",
    "/home/user/app/data/docs_cache",
]
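# Note: glossary.json is not listed above, so clear_index() leaves the glossary
# intact; reset_faiss_cache() below handles the full wipe.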

def clear_index():
    removed = []
    for p in DATA_PATHS:
        if os.path.isdir(p):
            _shutil.rmtree(p, ignore_errors=True)
            removed.append(f"🗑️ Deleted folder: {p}")
        elif os.path.exists(p):
            os.remove(p)
            removed.append(f"🗑️ Deleted file: {p}")
    msg = "\n".join(removed) if removed else "ℹ️ No cache files found."
    print(msg)
    return msg

def rebuild_index():
    """Rebuild the FAISS index from glossary + Excel + web sources."""
    try:
        repo_id_index = "essprasad/CT-Chat-Index"
        repo_id_docs = "essprasad/CT-Chat-Docs"
        local_dir = "/home/user/app/persistent"
        os.makedirs(local_dir, exist_ok=True)
        print("🧠 Rebuilding FAISS index (Glossary + Excel + Web)…")

        # --- Ensure glossary.json exists (download it if missing).
        glossary_path = os.path.join(local_dir, "glossary.json")
        if not os.path.exists(glossary_path):
            try:
                print("📥 glossary.json missing locally — downloading from HF index dataset...")
                downloaded = hf_hub_download(repo_id=repo_id_index, filename="persistent/glossary.json", repo_type="dataset")
                # Copy to the local persistent path.
                _shutil.copy2(downloaded, glossary_path)
                print("✅ Downloaded glossary.json.")
            except Exception as e:
                print(f"⚠️ Could not download glossary.json: {e}. Proceeding; terms may still come from other sources.")

        # --- Rebuild FAISS from the glossary (returns an index object plus a metadata list).
        index, metas = rebuild_faiss_from_glossary(glossary_path=glossary_path)
        print(f"📘 Loaded {len(metas)} glossary entries.")

        # --- Index Excel (MRCT Glossary).
        print("📑 Scanning Excel files in dataset…")
        repo_files = list_repo_files(repo_id_docs, repo_type="dataset")
        excel_files = [f for f in repo_files if f.lower().endswith((".xlsx", ".xls"))]
        model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        excel_entries = []
        for file_name in excel_files:
            print(f"📄 Reading {file_name}…")
            try:
                path = hf_hub_download(repo_id_docs, filename=file_name, repo_type="dataset")
                xls = pd.read_excel(path, sheet_name=None)
                for sheet, df in xls.items():
                    if "Glossary Term" not in df.columns:
                        continue
                    # Drop fully empty rows first, then blank out the remaining NaNs.
                    df = df.dropna(how="all").fillna("")
                    for _, row in df.iterrows():
                        term = str(row.get("Glossary Term", "")).strip()
                        if not term:
                            continue
                        # Combine all the relevant MRCT fields into one passage.
                        combined_text = (
                            f"Glossary Term: {term}\n"
                            f"Glossary Definition: {row.get('Glossary Definition', '')}\n"
                            f"Use in Context: {row.get('Use in Context', '')}\n"
                            f"More Info: {row.get('More Info', '')}\n"
                            f"Other Info to Think About When Joining a Study: {row.get('Other Info to Think About When Joining a Study', '')}\n"
                            f"Related Terms: {row.get('Related Terms', '')}\n"
                            f"Other Resources: {row.get('Other Resources', '')}\n"
                            f"Term URL: {row.get('Term URL', '')}\n"
                            f"CDISC/NCI URL: {row.get('CDISC/NCI URL', '')}\n"
                            f"Version: {row.get('Version', '')}"
                        ).strip()
                        excel_entries.append({
                            "source": file_name,
                            "sheet": sheet,
                            "term": term,
                            "type": "Excel",
                            "file": file_name,
                            "text": combined_text,
                        })
            except Exception as e:
                print(f"⚠️ Error reading {file_name}: {e}")

        if excel_entries:
            texts = [e["text"] for e in excel_entries]
            embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True).astype("float32")
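            # L2-normalizing before add() makes inner-product search behave like cosine
            # similarity; this assumes the glossary index uses an inner-product metric.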
            faiss.normalize_L2(embeddings)
            index.add(embeddings)
            metas.extend(excel_entries)
            print(f"✅ Added {len(excel_entries)} Excel entries to FAISS.")

        # --- Optional: load web content (may be slow).
        try:
            # Imported lazily so a missing or broken web loader only skips this step.
            from core.web_loader import web_crawler_loader

            print("🌐 Loading and embedding web sources…")
            web_entries = web_crawler_loader(
                urls_file="/home/user/app/data/urls.txt",
                cache_path="/home/user/app/persistent/web_cache.json",
                max_pages=3,
                timeout=20,
                force_refresh=False,
            )
            if web_entries:
                # Keep only substantive pages (more than 200 characters of text).
                web_entries = [e for e in web_entries if len(e.get("text", "")) > 200]
                print(f"✅ Retrieved {len(web_entries)} web entries.")
                web_texts = [e["text"] for e in web_entries]
                web_emb = model.encode(web_texts, show_progress_bar=True, convert_to_numpy=True).astype("float32")
                faiss.normalize_L2(web_emb)
                index.add(web_emb)
                metas.extend(web_entries)
                print("✅ Web content added to FAISS.")
        except Exception as e:
            print(f"⚠️ Web content embedding failed: {e}")

        # --- Save the index + metadata locally.
        faiss_path = os.path.join(local_dir, "faiss.index")
        meta_path = os.path.join(local_dir, "faiss.index.meta.json")
        faiss.write_index(index, faiss_path)
        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump(metas, f, indent=2)
        print(f"💾 Local FAISS saved ({len(metas)} entries).")

        # --- Upload artifacts back to the HF dataset (best effort).
        try:
            _upload_to_dataset(faiss_path, meta_path, repo_id_index)
            print(f"☁️ Uploaded FAISS ({len(metas)} entries) to {repo_id_index}.")
        except Exception as e:
            print(f"⚠️ Upload failed: {e}")

        return f"✅ Rebuild complete: {len(metas)} entries (Glossary + Excel + Web)."
    except Exception as e:
        return f"⚠️ Rebuild failed: {e}"

def rebuild_glossary():
    try:
        from core.glossary_builder import rebuild_and_upload
        rebuild_and_upload()
        return "✅ Glossary rebuilt and uploaded successfully."
    except Exception as e:
        return f"⚠️ Glossary rebuild failed: {e}"

def reset_faiss_cache():
    """
    Completely clear the local FAISS and glossary caches, reload the vector_store
    module (to wipe in-memory runtime caches), then rebuild the glossary + index.
    """
    try:
        from importlib import reload
        from core import vector_store

        # If vector_store exposes clear_local_faiss, use it (safe and logged).
        if hasattr(vector_store, "clear_local_faiss"):
            vector_store.clear_local_faiss()
        else:
            # Fallback: manually delete the persistent/runtime files.
            paths = [
                "/home/user/app/persistent/faiss.index",
                "/home/user/app/persistent/faiss.index.meta.json",
                "/home/user/app/persistent/glossary.json",
                "/home/user/app/runtime_faiss",
            ]
            for p in paths:
                if os.path.exists(p):
                    try:
                        if os.path.isdir(p):
                            _shutil.rmtree(p, ignore_errors=True)
                        else:
                            os.remove(p)
                        print(f"🗑️ Deleted: {p}")
                    except Exception:
                        pass

        # Reload the module to clear any in-memory caches.
        reload(vector_store)
        print("♻️ FAISS runtime module reloaded to ensure a fresh index rebuild.")
| msg = "🧹 Local FAISS + glossary cache cleared. Starting full rebuild...\n\n" | |
| msg += rebuild_glossary() + "\n" | |
| msg += rebuild_index() | |
| return msg | |
| except Exception as e: | |
| return f"⚠️ Reset failed: {e}" | |
# ----------------------------------------------------------
# CHATBOT CORE
# ----------------------------------------------------------
def chat_answer(query, mode="short"):
    try:
        if not query or not str(query).strip():
            return "<i>⚠️ Please enter a valid query.</i>"
        return summarize_combined(str(query).strip(), mode=mode)
    except Exception as e:
        print("❌ Chatbot error:", e)
        return f"<i>⚠️ Error: {e}</i>"

# ----------------------------------------------------------
# GRADIO UI
# ----------------------------------------------------------
with gr.Blocks(theme="gradio/soft") as demo:
    gr.Markdown(f"# {APP_TITLE}")
    gr.Markdown(APP_DESC)

    query_box = gr.Textbox(
        label="Ask your clinical trial question",
        placeholder="e.g. What is an eCRF?",
        lines=2,
    )
    output_box = gr.HTML(label="Answer")

    with gr.Row():
        submit_btn = gr.Button("🚀 Submit", variant="primary")
        if not PUBLIC_MODE:
            rebuild_btn = gr.Button("🔁 Rebuild Index")
            rebuild_glossary_btn = gr.Button("📘 Rebuild Glossary")
            reset_btn = gr.Button("🧹 Reset FAISS Cache (Full Rebuild)")
            clear_btn = gr.Button("🗑️ Clear Index Only")

    submit_btn.click(fn=chat_answer, inputs=[query_box], outputs=output_box)
    query_box.submit(fn=chat_answer, inputs=[query_box], outputs=output_box)
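    # Only the query is wired in, so chat_answer() always runs with its default mode="short".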

    if not PUBLIC_MODE:
        rebuild_btn.click(fn=rebuild_index, outputs=output_box)
        rebuild_glossary_btn.click(fn=rebuild_glossary, outputs=output_box)
        reset_btn.click(fn=reset_faiss_cache, outputs=output_box)
        clear_btn.click(fn=clear_index, outputs=output_box)

# ----------------------------------------------------------
# LAUNCH APP
# ----------------------------------------------------------
if __name__ == "__main__":
    print("🚀 Starting Clinical Trial Chatbot…")
    print("🧠 Initializing retriever warm-up…")
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        auth=check_admin_login if not PUBLIC_MODE else None,
        ssr_mode=False,
    )