# Spaces: essprasad / ClinicalTrialBasics (Running)
#
# File size: 14,489 Bytes

# ==========================================================
# SAFE-MODE PRELAUNCH CLEANUP
# ==========================================================
import os
import shutil
import time
import glob

# Prevent Svelte/Gradio SSR locale warning early
os.environ["GRADIO_LOCALE"] = "en"


def _prelaunch_cleanup(threshold_gb: float = 45.0):
    """Pre-clean to avoid HF Spaces eviction while being conservative about persistent data."""
    def _used_gb(path="/home/user/app"):
        try:
            total, used, free = shutil.disk_usage(path)
            return round(min(used / (1024**3), 49.9), 2)
        except Exception:
            return 0.0

    used = _used_gb()
    print(f"\n💾 Startup disk usage: {used:.2f} GB")

    # Only perform aggressive cleanup when over threshold.
    if used > threshold_gb:
        print(f"⚠️ Usage {used:.2f} GB > {threshold_gb} GB — performing aggressive cleanup.")
        # preserve persistent / important artifacts by default
        preserve = {"faiss.index", "faiss.index.meta.json", "glossary.json"}
        for folder in ["/home/user/app/data/docs_cache", "/home/user/app/tmp_docs"]:
            if os.path.exists(folder):
                for f in glob.glob(os.path.join(folder, "*")):
                    name = os.path.basename(f)
                    if name in preserve:
                        continue
                    try:
                        if os.path.isdir(f):
                            shutil.rmtree(f, ignore_errors=True)
                        else:
                            os.remove(f)
                    except Exception:
                        pass
        print("🧹 Aggressive cleanup complete.")

    print(f"✨ Disk after cleanup: {_used_gb():.2f} GB\n")


_prelaunch_cleanup()


# ==========================================================
# MAIN APP — Clinical Trial Chatbot
# ==========================================================
import gradio as gr
from core.hybrid_retriever import summarize_combined

# Heading and sub-heading shown at the top of the chat page.
APP_TITLE = "🧠 Clinical Research Chatbot"
APP_DESC = " ".join(
    [
        "Ask any clinical research or GCP-related question.",
        "Retrieves and summarizes from ICH, GCDMP, EMA, FDA, Excel, and Web datasets.",
    ]
)


# ----------------------------------------------------------
# MODE & CREDENTIALS
# ----------------------------------------------------------
# PUBLIC_MODE is on only for the literal value "true" (case-insensitive,
# surrounding whitespace ignored); any other value selects ADMIN mode.
PUBLIC_MODE = os.environ.get("PUBLIC_MODE", "true").strip().lower() == "true"
ADMIN_USER = os.environ.get("ADMIN_USER", "admin")
ADMIN_PASS = os.environ.get("ADMIN_PASS", "changeme")

print(f"🔐 Running in {'PUBLIC' if PUBLIC_MODE else 'ADMIN'} mode.")
print(f"🌍 Locale set to: {os.environ.get('GRADIO_LOCALE','en')}")
print(f"🧩 Env vars loaded: PUBLIC_MODE={PUBLIC_MODE}, ADMIN_USER={ADMIN_USER}")
if not PUBLIC_MODE and ADMIN_PASS == "changeme":
    # The fallback password is public knowledge; make its use visible in the logs.
    print("⚠️ ADMIN_PASS is unset or still the default — set the ADMIN_PASS secret before using ADMIN mode.")


# ----------------------------------------------------------
# AUTH HELPER
# ----------------------------------------------------------
def check_admin_login(username, password):
    """Return True when the supplied credentials match ADMIN_USER / ADMIN_PASS.

    Both values are compared with ``hmac.compare_digest`` so the time taken
    does not depend on how many leading characters matched. Non-string input
    never authenticates.

    Args:
        username: Login name submitted by the client.
        password: Password submitted by the client.

    Returns:
        bool: True only if both username and password match.
    """
    import hmac

    if not isinstance(username, str) or not isinstance(password, str):
        return False

    # compare_digest rejects non-ASCII str, so compare the UTF-8 bytes instead.
    user_ok = hmac.compare_digest(username.encode("utf-8"), ADMIN_USER.encode("utf-8"))
    pass_ok = hmac.compare_digest(password.encode("utf-8"), ADMIN_PASS.encode("utf-8"))
    # Both comparisons have already run, so no short-circuit skips the password check.
    return user_ok and pass_ok


# ----------------------------------------------------------
# MAINTENANCE FUNCTIONS
# ----------------------------------------------------------
import json
import faiss
import pandas as pd
import numpy as np
import shutil as _shutil  # alias to avoid shadowed name
from sentence_transformers import SentenceTransformer
from core.vector_sync import rebuild_faiss_from_glossary, _upload_to_dataset
from huggingface_hub import hf_hub_download, list_repo_files

# Local artifacts removed by clear_index(): the FAISS index, its metadata
# file, and the cached documents folder.
DATA_PATHS = [
    "/home/user/app/persistent/faiss.index",
    "/home/user/app/persistent/faiss.index.meta.json",
    "/home/user/app/data/docs_cache",
]


def clear_index():
    """Delete every path listed in ``DATA_PATHS`` and report what happened.

    Folders are removed recursively, files individually, and paths that do
    not exist are skipped. A path that cannot be removed is reported as a
    warning line instead of raising, so the remaining paths are still handled.

    Returns:
        str: One line per path handled, joined by newlines, or an
        informational message when none of the paths existed. The same text
        is printed.
    """
    report = []
    for path in DATA_PATHS:
        if os.path.isdir(path):
            kind = "folder"
            _shutil.rmtree(path, ignore_errors=True)
        elif os.path.exists(path):
            kind = "file"
            try:
                os.remove(path)
            except OSError as err:
                report.append(f"⚠️ Could not delete file: {path} ({err})")
                continue
        else:
            continue

        # rmtree(ignore_errors=True) can leave the tree behind, so verify.
        if os.path.exists(path):
            report.append(f"⚠️ Could not fully delete {kind}: {path}")
        else:
            report.append(f"🗑️ Deleted {kind}: {path}")

    msg = "\n".join(report) if report else "ℹ️ No cache files found."
    print(msg)
    return msg


# Spreadsheet columns (besides "Glossary Term") folded into each entry's
# text, in output order.
_MRCT_FIELDS = (
    "Glossary Definition",
    "Use in Context",
    "More Info",
    "Other Info to Think About When Joining a Study",
    "Related Terms",
    "Other Resources",
    "Term URL",
    "CDISC/NCI URL",
    "Version",
)


def _mrct_row_text(term, row):
    """Render one glossary row as the "Label: value" text block that gets embedded."""
    parts = [f"Glossary Term: {term}"]
    parts.extend(f"{field}: {row.get(field, '')}" for field in _MRCT_FIELDS)
    return "\n".join(parts).strip()


def _collect_excel_entries(repo_id_docs, excel_files):
    """Download each workbook and return one metadata dict per row that has a glossary term."""
    import pandas as pd

    entries = []
    for file_name in excel_files:
        print(f"📄 Reading {file_name}…")
        try:
            path = hf_hub_download(repo_id_docs, filename=file_name, repo_type="dataset")
            workbook = pd.read_excel(path, sheet_name=None)  # every sheet, keyed by name
            for sheet, df in workbook.items():
                if "Glossary Term" not in df.columns:
                    continue
                # Drop fully-empty rows first; after fillna("") nothing would count as empty.
                df = df.dropna(how="all").fillna("")
                for _, row in df.iterrows():
                    term = str(row.get("Glossary Term", "")).strip()
                    if not term:
                        continue
                    entries.append({
                        "source": file_name,
                        "sheet": sheet,
                        "term": term,
                        "type": "Excel",
                        "file": file_name,
                        "text": _mrct_row_text(term, row),
                    })
        except Exception as e:
            # One unreadable workbook must not stop the others from being indexed.
            print(f"⚠️ Error reading {file_name}: {e}")
    return entries


def _embed_into_index(model, index, metas, entries):
    """Embed each entry's "text", L2-normalise the vectors, and append vectors and metadata."""
    import faiss

    vectors = model.encode(
        [entry["text"] for entry in entries],
        show_progress_bar=True,
        convert_to_numpy=True,
    ).astype("float32")
    faiss.normalize_L2(vectors)
    index.add(vectors)
    # Metadata is appended only after the vectors went in and in the same order;
    # presumably metas[i] describes vector i — verify against the retriever.
    metas.extend(entries)


def rebuild_index():
    """Rebuild the FAISS index from the glossary, Excel workbooks and web sources.

    Steps, in order:
      1. Ensure ``glossary.json`` exists in the persistent folder, downloading
         it from the index dataset when missing.
      2. Build the base index and metadata list with ``rebuild_faiss_from_glossary``.
      3. Embed every row with a "Glossary Term" from the Excel files in the
         docs dataset.
      4. Best-effort: embed crawled web entries whose text is longer than
         200 characters.
      5. Write the index and its JSON metadata locally, then try to upload both.

    Returns:
        str: A status message. Failures are caught and reported as
        "⚠️ Rebuild failed: …" rather than raised.
    """
    try:
        # Heavy / project-local dependencies are imported here so that an import
        # problem is reported through the same "Rebuild failed" path.
        import faiss
        from sentence_transformers import SentenceTransformer

        from core.web_loader import web_crawler_loader  # may raise; handled below

        repo_id_index = "essprasad/CT-Chat-Index"
        repo_id_docs = "essprasad/CT-Chat-Docs"
        local_dir = "/home/user/app/persistent"
        os.makedirs(local_dir, exist_ok=True)

        print("🧠 Rebuilding FAISS index (Glossary + Excel + Web)…")

        # --- Ensure glossary.json exists (download if missing)
        glossary_path = os.path.join(local_dir, "glossary.json")
        if not os.path.exists(glossary_path):
            try:
                print("📥 glossary.json missing locally — downloading from HF index dataset...")
                downloaded = hf_hub_download(repo_id=repo_id_index, filename="persistent/glossary.json", repo_type="dataset")
                # copy to local persistent path
                _shutil.copy2(downloaded, glossary_path)
                print("✅ Downloaded glossary.json.")
            except Exception as e:
                print(f"⚠️ Could not download glossary.json: {e}. Proceeding if available in other sources.")

        # Rebuild FAISS from glossary (this returns an index object and metadata list)
        index, metas = rebuild_faiss_from_glossary(glossary_path=glossary_path)
        print(f"📘 Loaded {len(metas)} glossary entries.")

        # --- Index Excel (MRCT Glossary)
        print("📑 Scanning Excel files in dataset…")
        repo_files = list_repo_files(repo_id_docs, repo_type="dataset")
        excel_files = [f for f in repo_files if f.lower().endswith((".xlsx", ".xls"))]

        model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

        excel_entries = _collect_excel_entries(repo_id_docs, excel_files)
        if excel_entries:
            _embed_into_index(model, index, metas, excel_entries)
            print(f"✅ Added {len(excel_entries)} Excel entries to FAISS.")

        # ---- Optional: Load web content (may be slow)
        try:
            print("🌐 Loading and embedding web sources…")
            crawled = web_crawler_loader(
                urls_file="/home/user/app/data/urls.txt",
                cache_path="/home/user/app/persistent/web_cache.json",
                max_pages=3,
                timeout=20,
                force_refresh=False,
            )
            # Filter before testing for emptiness, so an all-filtered result
            # never reaches the encoder as an empty batch.
            web_entries = [e for e in (crawled or []) if len(e.get("text", "")) > 200]
            if web_entries:
                print(f"✅ Retrieved {len(web_entries)} web entries.")
                _embed_into_index(model, index, metas, web_entries)
                print("✅ Web content added to FAISS.")
            else:
                print("ℹ️ No web entries long enough to index.")
        except Exception as e:
            print(f"⚠️ Web content embedding failed: {e}")

        # --- Save index + meta locally
        faiss_path = os.path.join(local_dir, "faiss.index")
        meta_path = os.path.join(local_dir, "faiss.index.meta.json")
        faiss.write_index(index, faiss_path)
        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump(metas, f, indent=2)
        print(f"💾 Local FAISS saved ({len(metas)} entries).")

        # --- Upload artifacts back to HF dataset (best-effort)
        try:
            _upload_to_dataset(faiss_path, meta_path, repo_id_index)
            print(f"☁️ Uploaded FAISS ({len(metas)} entries) to {repo_id_index}.")
        except Exception as e:
            print(f"⚠️ Upload failed: {e}")

        return f"✅ Rebuild complete: {len(metas)} entries (Glossary + Excel + Web)."
    except Exception as e:
        msg = f"⚠️ Rebuild failed: {e}"
        print(msg)  # keep the failure in the server log as well as the UI
        return msg


def rebuild_glossary():
    """Run the glossary builder's rebuild-and-upload step and return a status line.

    The builder is imported lazily; an import error or any failure inside the
    builder is turned into a warning message instead of being raised.
    """
    try:
        from core.glossary_builder import rebuild_and_upload as _run_builder
        _run_builder()
    except Exception as err:
        outcome = f"⚠️ Glossary rebuild failed: {err}"
    else:
        outcome = "✅ Glossary rebuilt and uploaded successfully."
    return outcome


def reset_faiss_cache():
    """Clear local FAISS and glossary files, reload ``core.vector_store``, then rebuild.

    Uses ``vector_store.clear_local_faiss()`` when the module provides it.
    Otherwise it deletes the persistent index, metadata, glossary and runtime
    folder directly, logging each path that was removed or could not be
    removed. The module is then reloaded (intended to drop its in-memory
    caches) and the glossary and index are rebuilt in that order.

    Returns:
        str: The combined status text of both rebuild steps, or
        "⚠️ Reset failed: …" if anything outside those steps raised.
    """
    try:
        # Use the clear helper from core.vector_store if available
        from importlib import reload
        from core import vector_store

        # If vector_store exposes clear_local_faiss, use it (safe and logged)
        if hasattr(vector_store, "clear_local_faiss"):
            vector_store.clear_local_faiss()
        else:
            # fallback: manually delete persistent/runtime files
            fallback_paths = [
                "/home/user/app/persistent/faiss.index",
                "/home/user/app/persistent/faiss.index.meta.json",
                "/home/user/app/persistent/glossary.json",
                "/home/user/app/runtime_faiss",
            ]
            for path in fallback_paths:
                if not os.path.exists(path):
                    continue
                try:
                    if os.path.isdir(path):
                        _shutil.rmtree(path, ignore_errors=True)
                    else:
                        os.remove(path)
                except OSError as err:
                    # Report and move on; the rebuild below overwrites what it can.
                    print(f"⚠️ Could not delete {path}: {err}")
                    continue
                # rmtree(ignore_errors=True) may leave the tree behind, so verify.
                if os.path.exists(path):
                    print(f"⚠️ Could not fully delete: {path}")
                else:
                    print(f"🗑️ Deleted: {path}")

        # reload the module to clear any in-memory caches
        reload(vector_store)
        print("♻️ FAISS runtime module reloaded to ensure fresh index rebuild.")

        msg = "🧹 Local FAISS + glossary cache cleared. Starting full rebuild...\n\n"
        msg += rebuild_glossary() + "\n"
        msg += rebuild_index()
        return msg
    except Exception as e:
        return f"⚠️ Reset failed: {e}"


# ----------------------------------------------------------
# CHATBOT CORE
# ----------------------------------------------------------
def chat_answer(query, mode="short"):
    """Answer a user query through ``summarize_combined`` and return HTML.

    Args:
        query: Text typed by the user. Empty, whitespace-only or falsy input
            is rejected with a prompt to enter a valid query.
        mode: Passed through unchanged to ``summarize_combined``.

    Returns:
        str: The retriever's answer, or an italic warning / error snippet.
        Failures are logged and reported in the returned text, never raised.
    """
    from html import escape

    try:
        if not query or not str(query).strip():
            return "<i>⚠️ Please enter a valid query.</i>"
        return summarize_combined(str(query).strip(), mode=mode)
    except Exception as e:
        print("❌ Chatbot error:", e)
        # The result is rendered as HTML and the exception text may echo user
        # input, so escape it before embedding it in markup.
        return f"<i>⚠️ Error: {escape(str(e))}</i>"


# ----------------------------------------------------------
# GRADIO UI
# ----------------------------------------------------------
with gr.Blocks(theme="gradio/soft") as demo:
    # Page header
    gr.Markdown(f"# {APP_TITLE}")
    gr.Markdown(APP_DESC)

    # Question input and the HTML pane every handler writes its result to
    query_box = gr.Textbox(
        lines=2,
        label="Ask your clinical trial question",
        placeholder="e.g. What is an eCRF?",
    )
    output_box = gr.HTML(label="Answer")

    # Maintenance actions exist only outside public mode:
    # (button label, handler) pairs in display order.
    admin_actions = [] if PUBLIC_MODE else [
        ("🔁 Rebuild Index", rebuild_index),
        ("📘 Rebuild Glossary", rebuild_glossary),
        ("🧹 Reset FAISS Cache (Full Rebuild)", reset_faiss_cache),
        ("🗑️ Clear Index Only", clear_index),
    ]

    with gr.Row():
        submit_btn = gr.Button("🚀 Submit", variant="primary")
        admin_buttons = [gr.Button(label) for label, _ in admin_actions]

    # Clicking Submit and pressing Enter in the textbox run the same query.
    for ask in (submit_btn.click, query_box.submit):
        ask(fn=chat_answer, inputs=[query_box], outputs=output_box)

    # Maintenance handlers take no inputs and report into the answer pane.
    for button, (_, handler) in zip(admin_buttons, admin_actions):
        button.click(fn=handler, outputs=output_box)


# ----------------------------------------------------------
# LAUNCH APP
# ----------------------------------------------------------
if __name__ == "__main__":
    print("🚀 Starting Clinical Trial Chatbot…")
    print("🧠 Initializing retriever warm-up…")

    # Public deployments are open; otherwise the whole UI sits behind the admin login.
    login_guard = None if PUBLIC_MODE else check_admin_login
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "auth": login_guard,
        "ssr_mode": False,
    }
    demo.launch(**launch_options)