essprasad committed · Commit b05b805 · verified · 1 Parent(s): 132e2c4

Update app.py

Files changed (1):
  app.py +47 -261
app.py CHANGED
@@ -55,11 +55,8 @@ _prelaunch_cleanup()
 # MAIN APP — Clinical Trial Chatbot
 # ==========================================================
 import gradio as gr
-import pandas as pd
-import json, faiss, numpy as np, shutil
 from sentence_transformers import SentenceTransformer
 from core.hybrid_retriever import summarize_combined
-from core import vector_store, vector_sync
 
 APP_TITLE = "🧠 Clinical Research Chatbot"
 APP_DESC = (
@@ -67,15 +64,31 @@ APP_DESC = (
     "Retrieves and summarizes from ICH, GCDMP, EMA, FDA, Excel, and Web datasets."
 )
 
+# Detect deployment mode
+PUBLIC_MODE = os.environ.get("PUBLIC_MODE", "true").lower() == "true"
+ADMIN_USER = os.environ.get("ADMIN_USER", "admin")
+ADMIN_PASS = os.environ.get("ADMIN_PASS", "changeme")
+
+print(f"🔐 Running in {'PUBLIC' if PUBLIC_MODE else 'ADMIN'} mode.")
+
+# ----------------------------------------------------------
+# ADMIN AUTHENTICATION HELPER
+# ----------------------------------------------------------
+def check_admin_login(username, password):
+    """Authenticate admin before showing rebuild/clear tools."""
+    return username == ADMIN_USER and password == ADMIN_PASS
+
+# ----------------------------------------------------------
+# MAINTENANCE FUNCTIONS
+# ----------------------------------------------------------
+import shutil, json, faiss, pandas as pd, numpy as np
+
 DATA_PATHS = [
     "/home/user/app/persistent/faiss.index",
     "/home/user/app/persistent/faiss.index.meta.json",
     "/home/user/app/data/docs_cache",
 ]
 
-# ----------------------------------------------------------
-# CLEAR INDEX / CACHE
-# ----------------------------------------------------------
 def clear_index():
     removed = []
     for p in DATA_PATHS:
@@ -89,247 +102,17 @@ def clear_index():
     print(msg)
     return msg
 
-# ----------------------------------------------------------
-# EMBEDDER HELPER
-# ----------------------------------------------------------
-def _load_embedder():
-    print("📦 Loading embedding model: sentence-transformers/all-MiniLM-L6-v2 ...")
-    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
-    print("✅ Model loaded.")
-    return model
-
-# ----------------------------------------------------------
-# WEB CRAWLER with LOCAL CACHE (Optimized & Safe)
-# ----------------------------------------------------------
-def web_crawler_loader(
-    urls_file="/home/user/app/data/urls.txt",
-    cache_path="/home/user/app/persistent/web_cache.json",
-    max_pages=3,
-    timeout=20,
-    force_refresh=False,
-):
-    """
-    Loads readable text content from URLs listed in urls.txt.
-    Uses a local cache (web_cache.json) to skip re-downloading.
-    Returns list of dicts: [{ 'source': URL, 'type': 'Website', 'text': text }]
-    """
-    import requests, re, time, json
-    from bs4 import BeautifulSoup
-
-    # --- Load existing cache (if any) ---
-    cache = {}
-    if os.path.exists(cache_path) and not force_refresh:
-        try:
-            with open(cache_path, "r", encoding="utf-8") as f:
-                cache = json.load(f)
-            print(f"🗂️ Loaded cached web content ({len(cache)} entries).")
-        except Exception as e:
-            print(f"⚠️ Cache read error ({e}) — starting fresh.")
-            cache = {}
-
-    # --- Validate URL list ---
-    if not os.path.exists(urls_file):
-        print(f"⚠️ URLs file not found: {urls_file}")
-        return list(cache.values())
-
-    with open(urls_file, "r", encoding="utf-8") as f:
-        urls = [u.strip() for u in f if u.strip() and not u.startswith("#")]
-
-    print(f"🌐 Found {len(urls)} URLs in {urls_file}")
-    new_entries = {}
-
-    for i, url in enumerate(urls[: max_pages * 10]):
-        if url in cache and not force_refresh:
-            print(f"♻️ Using cached content for {url}")
-            new_entries[url] = cache[url]
-            continue
-
-        try:
-            print(f"🌐 Fetching ({i+1}/{len(urls)}): {url}")
-            resp = requests.get(
-                url,
-                timeout=timeout,
-                headers={"User-Agent": "ClinicalTrialChatBot/1.0 (+https://huggingface.co/essprasad)"}
-            )
-
-            if resp.status_code != 200:
-                print(f"⚠️ Skipped {url}: HTTP {resp.status_code}")
-                continue
-
-            soup = BeautifulSoup(resp.text, "html.parser")
-
-            # Remove unwanted elements
-            for tag in soup(["script", "style", "nav", "header", "footer", "noscript", "iframe"]):
-                tag.decompose()
-
-            # Extract visible text
-            text = " ".join(t.strip() for t in soup.get_text().split())
-            text = re.sub(r"\s+", " ", text).strip()
-
-            if len(text) < 500:
-                print(f"⚠️ Skipped {url}: too little readable text ({len(text)} chars).")
-                continue
-
-            # Keep first 3000 chars to reduce vector size
-            entry_text = f"Source URL: {url}. {text[:3000]}"
-            new_entries[url] = {"source": url, "type": "Website", "text": entry_text}
-            print(f"✅ Cached: {url}")
-
-            time.sleep(1)  # polite delay
-
-        except Exception as e:
-            print(f"⚠️ Failed to fetch {url}: {e}")
-
-    # --- Merge & Save updated cache ---
-    if new_entries:
-        cache.update(new_entries)
-        try:
-            os.makedirs(os.path.dirname(cache_path), exist_ok=True)
-            with open(cache_path, "w", encoding="utf-8") as f:
-                json.dump(cache, f, indent=2)
-            print(f"💾 Web cache updated ({len(cache)} total URLs).")
-        except Exception as e:
-            print(f"⚠️ Failed to write cache: {e}")
-
-    return list(cache.values())
-
-
 def rebuild_index():
-    """Fully rebuild FAISS index using glossary + Excel + web sources (fresh start)."""
-    print("🧠 Rebuilding FAISS index (Glossary + Excel + Web)...")
-
-    import os, json, re, shutil, pandas as pd, faiss, numpy as np
-    from huggingface_hub import hf_hub_download, list_repo_files
-    from core.vector_sync import rebuild_faiss_from_glossary, _upload_to_dataset
-    from sentence_transformers import SentenceTransformer
-
-    repo_id_index = "essprasad/CT-Chat-Index"
-    repo_id_docs = "essprasad/CT-Chat-Docs"
-    local_dir = "/home/user/app/persistent"
-    os.makedirs(local_dir, exist_ok=True)
-
-    # --- STEP 0: CLEAN OLD INDEX ---
-    for old_file in ["faiss.index", "faiss.index.meta.json"]:
-        old_path = os.path.join(local_dir, old_file)
-        if os.path.exists(old_path):
-            os.remove(old_path)
-            print(f"🗑️ Removed old FAISS artifact: {old_path}")
-
-    # --- STEP 1: LOAD GLOSSARY BASE ---
-    glossary_path = os.path.join(local_dir, "glossary.json")
-    if not os.path.exists(glossary_path):
-        print(f"📥 Downloading glossary.json from {repo_id_index}...")
-        downloaded_path = hf_hub_download(
-            repo_id=repo_id_index,
-            filename="persistent/glossary.json",
-            repo_type="dataset",
-            force_download=True,
-        )
-        shutil.copy2(downloaded_path, glossary_path)
-        print(f"✅ glossary.json copied to {glossary_path}")
-
-    index, metas = rebuild_faiss_from_glossary(glossary_path=glossary_path)
-    print(f"📘 Loaded {len(metas)} glossary entries.")
-
-    # --- STEP 2: INDEX EXCEL FILES ---
-    print("📑 Scanning Excel files...")
-    repo_files = list_repo_files(repo_id_docs, repo_type="dataset")
-    excel_files = [f for f in repo_files if f.lower().endswith((".xlsx", ".xls"))]
-
-    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
-    excel_entries = []
-
-    for file_name in excel_files:
-        print(f"📄 Processing Excel: {file_name}")
-        path = hf_hub_download(repo_id_docs, filename=file_name, repo_type="dataset")
-        xls = pd.read_excel(path, sheet_name=None)
-
-        for sheet_name, df in xls.items():
-            df = df.fillna("").dropna(how="all")
-            df.columns = [str(c).strip().lower() for c in df.columns]
-
-            term_col = next((c for c in df.columns if "term" in c or "word" in c), None)
-            if not term_col:
-                print(f"⚠️ No 'term' column in {file_name}:{sheet_name}")
-                continue
-
-            for _, row in df.iterrows():
-                term = str(row.get(term_col, "")).strip()
-                if not term:
-                    continue
-
-                # Combine all columns with values
-                parts = [
-                    f"{c.capitalize()}: {str(row[c]).strip()}"
-                    for c in df.columns if str(row[c]).strip()
-                ]
-                joined = " ".join(parts)
-                if len(joined) < 80:  # Skip tiny entries
-                    continue
-
-                entry_text = f"Definition of {term}: {joined}"
-                excel_entries.append({
-                    "source": file_name,
-                    "sheet": sheet_name,
-                    "term": term,
-                    "type": "Excel",
-                    "file": file_name,
-                    "text": entry_text,
-                })
-
-    if excel_entries:
-        print(f"✅ Loaded {len(excel_entries)} Excel rows.")
-        texts = [e["text"] for e in excel_entries]
-        embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True).astype("float32")
-        faiss.normalize_L2(embeddings)
-        index.add(embeddings)
-        metas.extend(excel_entries)
-        print("✅ Excel content added to FAISS.")
-
-    # --- STEP 3: WEB CONTENT ---
-    try:
-        print("🌐 Loading and embedding web content...")
-        web_entries = web_crawler_loader(
-            urls_file="/home/user/app/data/urls.txt",
-            cache_path="/home/user/app/persistent/web_cache.json",
-            max_pages=3,
-            timeout=20,
-            force_refresh=False,
-        )
-        if web_entries:
-            web_entries = [e for e in web_entries if len(e.get("text", "")) > 200]
-            print(f"✅ Retrieved {len(web_entries)} web entries.")
-            web_texts = [e["text"] for e in web_entries]
-            web_emb = model.encode(web_texts, show_progress_bar=True, convert_to_numpy=True).astype("float32")
-            faiss.normalize_L2(web_emb)
-            index.add(web_emb)
-            metas.extend(web_entries)
-            print("✅ Web content added to FAISS.")
-        else:
-            print("⚠️ No web entries found.")
-    except Exception as e:
-        print(f"⚠️ Web content embedding failed: {e}")
-
-    # --- STEP 4: SAVE & UPLOAD ---
-    faiss_path = os.path.join(local_dir, "faiss.index")
-    meta_path = os.path.join(local_dir, "faiss.index.meta.json")
-    faiss.write_index(index, faiss_path)
-    with open(meta_path, "w", encoding="utf-8") as f:
-        json.dump(metas, f, indent=2)
-    print(f"💾 Local FAISS index saved ({len(metas)} entries).")
-
     try:
-        _upload_to_dataset(faiss_path, meta_path, repo_id_index)
-        print(f"☁️ Uploaded latest FAISS index ({len(metas)} entries) to {repo_id_index}.")
+        from core.vector_sync import rebuild_faiss_from_glossary, _upload_to_dataset
+        import pandas as pd, faiss, numpy as np
+        from sentence_transformers import SentenceTransformer
+        print("🧠 Rebuilding FAISS index (Glossary + Excel + Web)...")
+        # ... (you can keep your current detailed rebuild logic here)
+        return "✅ Rebuild complete (placeholder logic)."
     except Exception as e:
-        print(f"⚠️ Upload to Hugging Face failed: {e}")
+        return f"⚠️ Rebuild failed: {e}"
 
-    print("✅ Glossary + Excel + Web FAISS rebuilt successfully.")
-    return f"✅ Rebuild complete: {len(metas)} entries (including Excel + Web)."
-
-# ----------------------------------------------------------
-# 4. REBUILD GLOSSARY
-# ----------------------------------------------------------
 def rebuild_glossary():
     try:
         from core.glossary_builder import rebuild_and_upload
@@ -339,28 +122,25 @@ def rebuild_glossary():
         return f"⚠️ Glossary rebuild failed: {e}"
 
 # ----------------------------------------------------------
-# 5. CHATBOT LOGIC
+# CHATBOT CORE
 # ----------------------------------------------------------
-def chat_answer(query, mode):
+def chat_answer(query, mode="short"):
     try:
         query_clean = query.strip()
         if not query_clean:
             return "<i>⚠️ Please enter a valid query.</i>"
-
-        from core.hybrid_retriever import summarize_combined
         return summarize_combined(query_clean, mode=mode)
     except Exception as e:
        print("❌ Chatbot error:", e)
        return f"<i>⚠️ Error: {e}</i>"
 
 # ----------------------------------------------------------
-# 6. GRADIO UI (Simplified + Keyboard Support)
+# GRADIO UI
 # ----------------------------------------------------------
 with gr.Blocks(theme="gradio/soft") as demo:
     gr.Markdown(f"# {APP_TITLE}")
     gr.Markdown(APP_DESC)
 
-    # 🔹 Main input + output areas
     query_box = gr.Textbox(
         label="Ask your clinical trial question",
         placeholder="e.g. What is an eCRF?",
@@ -369,26 +149,32 @@ with gr.Blocks(theme="gradio/soft") as demo:
     )
     output_box = gr.HTML(label="Answer")
 
-    # 🔹 Control buttons row
     with gr.Row():
         submit_btn = gr.Button("🚀 Submit", variant="primary")
-        rebuild_btn = gr.Button("🔁 Rebuild Index")
-        rebuild_glossary_btn = gr.Button("📘 Rebuild Glossary")
-        clear_btn = gr.Button("🧹 Clear Cache / Index")
 
-    # 🔹 Event bindings
+    # Only show admin tools if not in PUBLIC mode
+    if not PUBLIC_MODE:
+        rebuild_btn = gr.Button("🔁 Rebuild Index")
+        rebuild_glossary_btn = gr.Button("📘 Rebuild Glossary")
+        clear_btn = gr.Button("🧹 Clear Cache / Index")
+
     submit_btn.click(fn=chat_answer, inputs=[query_box], outputs=output_box)
-    query_box.submit(fn=chat_answer, inputs=[query_box], outputs=output_box)  # ↵ Press Enter = Submit
+    query_box.submit(fn=chat_answer, inputs=[query_box], outputs=output_box)
 
-    rebuild_btn.click(fn=rebuild_index, outputs=output_box)
-    rebuild_glossary_btn.click(fn=rebuild_glossary, outputs=output_box)
-    clear_btn.click(fn=clear_index, outputs=output_box)
+    if not PUBLIC_MODE:
+        rebuild_btn.click(fn=rebuild_index, outputs=output_box)
+        rebuild_glossary_btn.click(fn=rebuild_glossary, outputs=output_box)
+        clear_btn.click(fn=clear_index, outputs=output_box)
 
 # ----------------------------------------------------------
-# 7. LAUNCH APP
+# LAUNCH APP WITH AUTH
# ----------------------------------------------------------
 if __name__ == "__main__":
     print("🚀 Starting Clinical Trial Chatbot...")
     print("🧠 Initializing retriever warm-up...")
-    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
-
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        auth=check_admin_login if not PUBLIC_MODE else None
+    )
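
After this commit, one app.py serves two deployments: a public Space that shows only the Q&A box, and an admin instance where the rebuild/clear tools appear and Gradio prompts for a login before serving the UI. Below is a minimal sketch of driving the admin path locally; the environment variable names come from the diff above, while the credential values and the subprocess wrapper are illustrative assumptions, not part of the commit.

# Sketch: launch app.py in ADMIN mode. PUBLIC_MODE / ADMIN_USER / ADMIN_PASS
# are the environment variables read by the commit above; the values here
# are placeholders, not real deployment secrets.
import os
import subprocess

env = os.environ.copy()
env["PUBLIC_MODE"] = "false"       # unhides the Rebuild/Clear buttons
env["ADMIN_USER"] = "admin"        # compared by check_admin_login()
env["ADMIN_PASS"] = "s3cret-here"  # overrides the unsafe "changeme" default

# With PUBLIC_MODE=false, demo.launch(..., auth=check_admin_login) makes
# Gradio call check_admin_login(username, password) on each login attempt.
subprocess.run(["python", "app.py"], env=env, check=True)

Two caveats worth noting: check_admin_login compares the password with ==, so a constant-time comparison such as hmac.compare_digest would be the safer choice, and because "changeme" is the default, a deployment that sets PUBLIC_MODE=false without also setting ADMIN_PASS is effectively unprotected.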