essprasad committed on
Commit f9053c5 · verified · 1 Parent(s): b816136

Upload 11 files

Files changed (11)
  1. README.md +47 -7
  2. app.py +394 -0
  3. cleanup_space.py +135 -0
  4. gitattributes +49 -0
  5. gitignore +71 -0
  6. gitignore (1) +71 -0
  7. lfsconfig +4 -0
  8. postBuild +60 -0
  9. requirements.txt +43 -0
  10. runtime.txt +1 -0
  11. runtime.yaml +26 -0
README.md CHANGED
@@ -1,13 +1,53 @@
  ---
- title: ClinicalTrialBasics
- emoji: 📉
- colorFrom: gray
- colorTo: yellow
+ title: Clinical Research Chatbot
+ emoji: 🧪
+ colorFrom: blue
+ colorTo: green
  sdk: gradio
- sdk_version: 5.49.1
+ sdk_version: 5.49.0
  app_file: app.py
  pinned: false
- short_description: 'Gives answers from trusted, credible and authentic sources '
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # 🧪 Clinical Research Chatbot
+
+ A lightweight, fully open-source chatbot for clinical research professionals.
+ Runs entirely on Hugging Face — no OpenAI dependency.
+
+ ---
+
+ ## ✅ Current Features
+
+ ### 💬 Chatbot Interface
+ - Gradio UI with chatbot + Admin Tools tab.
+ - Query pipeline: **FAQ → Glossary → Knowledge Base → APIs (PubMed → FDA → ClinicalTrials.gov)**.
+ - Answers are clearly labeled by source.
+
+ ### 🔍 Knowledge Base (Docs + URLs)
+ - Supports ingestion of: PDF, DOCX, TXT, XLSX, JSON, HTML.
+ - Auto-ingests from:
+   - `/data/public_docs/`
+   - `/data/urls.txt`
+ - Smart chunking optimized for glossary terms + long text.
+
+ ### 📦 Vector Search
+ - FAISS + `all-MiniLM-L6-v2` embeddings.
+ - Persistent storage:
+   - `/persistent/faiss.index`
+   - `/persistent/faiss.index.meta.json`
+ - Index survives restarts and can be exported/imported as `.zip`.
+
+ ### 🌐 API Integrations
+ - PubMed
+ - FDA Drug Labels
+ - ClinicalTrials.gov
+
+ ### 🧠 Query Handling
+ - Glossary-aware normalization
+   *(e.g., eCRF, e-CRF, electronic case report form → same match)*
+ - Glossary priority: if a glossary hit exists, it is always returned first.
+ - Answer flow: **FAQ → Glossary → KB → APIs**.
+ - Clear section labels, citations, and confidence notes.
+
+ ### 📜 Logging
+ All queries, answers, and sources saved in:
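
The Vector Search section above is the core of the retrieval path. As a minimal sketch of how the persisted index might be queried (assuming the paths listed above and the list-of-dicts metadata layout that app.py in this commit writes out):

```python
# Minimal retrieval sketch against the persisted artifacts described above.
# Paths come from the README; the metadata layout and the normalized
# inner-product search mirror how app.py builds the index.
import json
import faiss
from sentence_transformers import SentenceTransformer

index = faiss.read_index("/persistent/faiss.index")
with open("/persistent/faiss.index.meta.json", encoding="utf-8") as f:
    metas = json.load(f)  # one metadata dict per indexed vector

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
query_vec = model.encode(["What is an eCRF?"], convert_to_numpy=True).astype("float32")
faiss.normalize_L2(query_vec)

scores, ids = index.search(query_vec, 5)
for score, idx in zip(scores[0], ids[0]):
    if idx >= 0:
        print(f"{score:.3f}  {metas[idx].get('source', '?')}  {metas[idx].get('text', '')[:80]}")
```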
app.py ADDED
@@ -0,0 +1,394 @@
# ==========================================================
# SAFE-MODE PRELAUNCH CLEANUP (runs before any heavy imports)
# ==========================================================
import os, shutil, time, glob

def _prelaunch_cleanup(threshold_gb=45.0):
    """Early cleanup to prevent Hugging Face Space eviction (50 GB limit)."""
    def _used_gb(path="/home/user/app"):
        try:
            total, used, free = shutil.disk_usage(path)
            used_gb = max(0.0, min(used / (1024**3), 49.9))
            return used_gb
        except Exception:
            return 0.0

    used = _used_gb()
    print(f"\n💾 Startup disk usage: {used:.2f} GB")

    cache_paths = [
        os.path.expanduser("~/.cache/huggingface"),
        os.path.expanduser("~/.cache/hfhub"),
        "/home/user/.cache/huggingface",
        "/home/user/.cache",
        "/home/user/app/__pycache__",
        "/home/user/app/data/__pycache__",
    ]
    for p in cache_paths:
        if os.path.exists(p):
            shutil.rmtree(p, ignore_errors=True)

    if used > threshold_gb:
        print(f"⚠️ Usage {used:.2f} GB > {threshold_gb} GB — performing aggressive cleanup.")
        preserve = {"faiss.index", "faiss.index.meta.json", "glossary.json"}
        folders = ["/home/user/app/data/docs_cache", "/home/user/app/tmp_docs", "/home/user/app/persistent"]
        for folder in folders:
            if os.path.exists(folder):
                for f in glob.glob(os.path.join(folder, "*")):
                    if os.path.basename(f) in preserve:
                        continue
                    try:
                        if os.path.isfile(f):
                            os.remove(f)
                        else:
                            shutil.rmtree(f, ignore_errors=True)
                    except Exception:
                        pass
        print("🧹 Aggressive cleanup complete.")

    print(f"✨ Disk after cleanup: {_used_gb():.2f} GB\n")
    shutil.rmtree("/home/user/app/runtime_faiss", ignore_errors=True)

_prelaunch_cleanup()

# ==========================================================
# MAIN APP — Clinical Trial Chatbot
# ==========================================================
import gradio as gr
import pandas as pd
import json, faiss, numpy as np, shutil
from sentence_transformers import SentenceTransformer
from core.hybrid_retriever import summarize_combined
from core import vector_store, vector_sync

APP_TITLE = "🧠 Clinical Research Chatbot"
APP_DESC = (
    "Ask any clinical research or GCP-related question. "
    "Retrieves and summarizes from ICH, GCDMP, EMA, FDA, Excel, and Web datasets."
)

DATA_PATHS = [
    "/home/user/app/persistent/faiss.index",
    "/home/user/app/persistent/faiss.index.meta.json",
    "/home/user/app/data/docs_cache",
]

# ----------------------------------------------------------
# CLEAR INDEX / CACHE
# ----------------------------------------------------------
def clear_index():
    removed = []
    for p in DATA_PATHS:
        if os.path.isdir(p):
            shutil.rmtree(p, ignore_errors=True)
            removed.append(f"🗑️ Deleted folder: {p}")
        elif os.path.exists(p):
            os.remove(p)
            removed.append(f"🗑️ Deleted file: {p}")
    msg = "\n".join(removed) if removed else "ℹ️ No cache files found."
    print(msg)
    return msg

# ----------------------------------------------------------
# EMBEDDER HELPER
# ----------------------------------------------------------
def _load_embedder():
    print("📦 Loading embedding model: sentence-transformers/all-MiniLM-L6-v2 ...")
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    print("✅ Model loaded.")
    return model

# ----------------------------------------------------------
# WEB CRAWLER with LOCAL CACHE (Optimized & Safe)
# ----------------------------------------------------------
def web_crawler_loader(
    urls_file="/home/user/app/data/urls.txt",
    cache_path="/home/user/app/persistent/web_cache.json",
    max_pages=3,
    timeout=20,
    force_refresh=False,
):
    """
    Loads readable text content from URLs listed in urls.txt.
    Uses a local cache (web_cache.json) to skip re-downloading.
    Returns list of dicts: [{ 'source': URL, 'type': 'Website', 'text': text }]
    """
    import requests, re, time, json
    from bs4 import BeautifulSoup

    # --- Load existing cache (if any) ---
    cache = {}
    if os.path.exists(cache_path) and not force_refresh:
        try:
            with open(cache_path, "r", encoding="utf-8") as f:
                cache = json.load(f)
            print(f"🗂️ Loaded cached web content ({len(cache)} entries).")
        except Exception as e:
            print(f"⚠️ Cache read error ({e}) — starting fresh.")
            cache = {}

    # --- Validate URL list ---
    if not os.path.exists(urls_file):
        print(f"⚠️ URLs file not found: {urls_file}")
        return list(cache.values())

    with open(urls_file, "r", encoding="utf-8") as f:
        urls = [u.strip() for u in f if u.strip() and not u.startswith("#")]

    print(f"🌐 Found {len(urls)} URLs in {urls_file}")
    new_entries = {}

    for i, url in enumerate(urls[: max_pages * 10]):
        if url in cache and not force_refresh:
            print(f"♻️ Using cached content for {url}")
            new_entries[url] = cache[url]
            continue

        try:
            print(f"🌐 Fetching ({i+1}/{len(urls)}): {url}")
            resp = requests.get(
                url,
                timeout=timeout,
                headers={"User-Agent": "ClinicalTrialChatBot/1.0 (+https://huggingface.co/essprasad)"}
            )

            if resp.status_code != 200:
                print(f"⚠️ Skipped {url}: HTTP {resp.status_code}")
                continue

            soup = BeautifulSoup(resp.text, "html.parser")

            # Remove unwanted elements
            for tag in soup(["script", "style", "nav", "header", "footer", "noscript", "iframe"]):
                tag.decompose()

            # Extract visible text
            text = " ".join(t.strip() for t in soup.get_text().split())
            text = re.sub(r"\s+", " ", text).strip()

            if len(text) < 500:
                print(f"⚠️ Skipped {url}: too little readable text ({len(text)} chars).")
                continue

            # Keep first 3000 chars to reduce vector size
            entry_text = f"Source URL: {url}. {text[:3000]}"
            new_entries[url] = {"source": url, "type": "Website", "text": entry_text}
            print(f"✅ Cached: {url}")

            time.sleep(1)  # polite delay

        except Exception as e:
            print(f"⚠️ Failed to fetch {url}: {e}")

    # --- Merge & Save updated cache ---
    if new_entries:
        cache.update(new_entries)
        try:
            os.makedirs(os.path.dirname(cache_path), exist_ok=True)
            with open(cache_path, "w", encoding="utf-8") as f:
                json.dump(cache, f, indent=2)
            print(f"💾 Web cache updated ({len(cache)} total URLs).")
        except Exception as e:
            print(f"⚠️ Failed to write cache: {e}")

    return list(cache.values())


def rebuild_index():
    """Fully rebuild FAISS index using glossary + Excel + web sources (fresh start)."""
    print("🧠 Rebuilding FAISS index (Glossary + Excel + Web)...")

    import os, json, re, shutil, pandas as pd, faiss, numpy as np
    from huggingface_hub import hf_hub_download, list_repo_files
    from core.vector_sync import rebuild_faiss_from_glossary, _upload_to_dataset
    from sentence_transformers import SentenceTransformer

    repo_id_index = "essprasad/CT-Chat-Index"
    repo_id_docs = "essprasad/CT-Chat-Docs"
    local_dir = "/home/user/app/persistent"
    os.makedirs(local_dir, exist_ok=True)

    # --- STEP 0: CLEAN OLD INDEX ---
    for old_file in ["faiss.index", "faiss.index.meta.json"]:
        old_path = os.path.join(local_dir, old_file)
        if os.path.exists(old_path):
            os.remove(old_path)
            print(f"🗑️ Removed old FAISS artifact: {old_path}")

    # --- STEP 1: LOAD GLOSSARY BASE ---
    glossary_path = os.path.join(local_dir, "glossary.json")
    if not os.path.exists(glossary_path):
        print(f"📥 Downloading glossary.json from {repo_id_index}...")
        downloaded_path = hf_hub_download(
            repo_id=repo_id_index,
            filename="persistent/glossary.json",
            repo_type="dataset",
            force_download=True,
        )
        shutil.copy2(downloaded_path, glossary_path)
        print(f"✅ glossary.json copied to {glossary_path}")

    index, metas = rebuild_faiss_from_glossary(glossary_path=glossary_path)
    print(f"📘 Loaded {len(metas)} glossary entries.")

    # --- STEP 2: INDEX EXCEL FILES ---
    print("📑 Scanning Excel files...")
    repo_files = list_repo_files(repo_id_docs, repo_type="dataset")
    excel_files = [f for f in repo_files if f.lower().endswith((".xlsx", ".xls"))]

    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    excel_entries = []

    for file_name in excel_files:
        print(f"📄 Processing Excel: {file_name}")
        path = hf_hub_download(repo_id_docs, filename=file_name, repo_type="dataset")
        xls = pd.read_excel(path, sheet_name=None)

        for sheet_name, df in xls.items():
            df = df.fillna("").dropna(how="all")
            df.columns = [str(c).strip().lower() for c in df.columns]

            term_col = next((c for c in df.columns if "term" in c or "word" in c), None)
            if not term_col:
                print(f"⚠️ No 'term' column in {file_name}:{sheet_name}")
                continue

            for _, row in df.iterrows():
                term = str(row.get(term_col, "")).strip()
                if not term:
                    continue

                # Combine all columns with values
                parts = [
                    f"{c.capitalize()}: {str(row[c]).strip()}"
                    for c in df.columns if str(row[c]).strip()
                ]
                joined = " ".join(parts)
                if len(joined) < 80:  # Skip tiny entries
                    continue

                entry_text = f"Definition of {term}: {joined}"
                excel_entries.append({
                    "source": file_name,
                    "sheet": sheet_name,
                    "term": term,
                    "type": "Excel",
                    "file": file_name,
                    "text": entry_text,
                })

    if excel_entries:
        print(f"✅ Loaded {len(excel_entries)} Excel rows.")
        texts = [e["text"] for e in excel_entries]
        embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True).astype("float32")
        faiss.normalize_L2(embeddings)
        index.add(embeddings)
        metas.extend(excel_entries)
        print("✅ Excel content added to FAISS.")

    # --- STEP 3: WEB CONTENT ---
    try:
        print("🌐 Loading and embedding web content...")
        web_entries = web_crawler_loader(
            urls_file="/home/user/app/data/urls.txt",
            cache_path="/home/user/app/persistent/web_cache.json",
            max_pages=3,
            timeout=20,
            force_refresh=False,
        )
        if web_entries:
            web_entries = [e for e in web_entries if len(e.get("text", "")) > 200]
            print(f"✅ Retrieved {len(web_entries)} web entries.")
            web_texts = [e["text"] for e in web_entries]
            web_emb = model.encode(web_texts, show_progress_bar=True, convert_to_numpy=True).astype("float32")
            faiss.normalize_L2(web_emb)
            index.add(web_emb)
            metas.extend(web_entries)
            print("✅ Web content added to FAISS.")
        else:
            print("⚠️ No web entries found.")
    except Exception as e:
        print(f"⚠️ Web content embedding failed: {e}")

    # --- STEP 4: SAVE & UPLOAD ---
    faiss_path = os.path.join(local_dir, "faiss.index")
    meta_path = os.path.join(local_dir, "faiss.index.meta.json")
    faiss.write_index(index, faiss_path)
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(metas, f, indent=2)
    print(f"💾 Local FAISS index saved ({len(metas)} entries).")

    try:
        _upload_to_dataset(faiss_path, meta_path, repo_id_index)
        print(f"☁️ Uploaded latest FAISS index ({len(metas)} entries) to {repo_id_index}.")
    except Exception as e:
        print(f"⚠️ Upload to Hugging Face failed: {e}")

    print("✅ Glossary + Excel + Web FAISS rebuilt successfully.")
    return f"✅ Rebuild complete: {len(metas)} entries (including Excel + Web)."

# ----------------------------------------------------------
# 4. REBUILD GLOSSARY
# ----------------------------------------------------------
def rebuild_glossary():
    try:
        from core.glossary_builder import rebuild_and_upload
        rebuild_and_upload()
        return "✅ Glossary rebuilt and uploaded successfully."
    except Exception as e:
        return f"⚠️ Glossary rebuild failed: {e}"

# ----------------------------------------------------------
# 5. CHATBOT LOGIC
# ----------------------------------------------------------
def chat_answer(query, mode="default"):
    # The UI wires only the query box into this function, so `mode` needs a
    # default; "default" is an assumed value passed through to the retriever.
    try:
        query_clean = query.strip()
        if not query_clean:
            return "<i>⚠️ Please enter a valid query.</i>"

        from core.hybrid_retriever import summarize_combined
        return summarize_combined(query_clean, mode=mode)
    except Exception as e:
        print("❌ Chatbot error:", e)
        return f"<i>⚠️ Error: {e}</i>"

# ----------------------------------------------------------
# 6. GRADIO UI (Simplified + Keyboard Support)
# ----------------------------------------------------------
with gr.Blocks(theme="gradio/soft") as demo:
    gr.Markdown(f"# {APP_TITLE}")
    gr.Markdown(APP_DESC)

    # 🔹 Main input + output areas
    query_box = gr.Textbox(
        label="Ask your clinical trial question",
        placeholder="e.g. What is an eCRF?",
        lines=2,
        show_label=True
    )
    output_box = gr.HTML(label="Answer")

    # 🔹 Control buttons row
    with gr.Row():
        submit_btn = gr.Button("🚀 Submit", variant="primary")
        rebuild_btn = gr.Button("🔁 Rebuild Index")
        rebuild_glossary_btn = gr.Button("📘 Rebuild Glossary")
        clear_btn = gr.Button("🧹 Clear Cache / Index")

    # 🔹 Event bindings
    submit_btn.click(fn=chat_answer, inputs=[query_box], outputs=output_box)
    query_box.submit(fn=chat_answer, inputs=[query_box], outputs=output_box)  # ↵ Press Enter = Submit

    rebuild_btn.click(fn=rebuild_index, outputs=output_box)
    rebuild_glossary_btn.click(fn=rebuild_glossary, outputs=output_box)
    clear_btn.click(fn=clear_index, outputs=output_box)

# ----------------------------------------------------------
# 7. LAUNCH APP
# ----------------------------------------------------------
if __name__ == "__main__":
    print("🚀 Starting Clinical Trial Chatbot...")
    print("🧠 Initializing retriever warm-up...")
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
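
For a quick end-to-end check of the launched UI, the chat endpoint can be called with `gradio_client`. This is a hypothetical smoke test: the `/chat_answer` endpoint name assumes Gradio's default of deriving `api_name` from the bound function name; run `client.view_api()` to confirm the actual names.

```python
# Hypothetical smoke test against a locally running instance of the app above.
from gradio_client import Client

client = Client("http://127.0.0.1:7860")
answer_html = client.predict("What is an eCRF?", api_name="/chat_answer")
print(answer_html)
```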
cleanup_space.py ADDED
@@ -0,0 +1,135 @@
"""
cleanup_space.py
----------------
Maintenance script for Hugging Face Space cleanup.
- Removes caches, temp files, and large unneeded assets.
- Keeps only FAISS index + metadata + glossary.
- Reuploads them to CT-Chat-Index dataset.
"""
import os
import shutil
import time
from datetime import datetime
from huggingface_hub import HfApi, upload_file, HfFolder

# 🔧 Configuration
REPO_ID = "essprasad/CT-Chat-Index"  # Dataset repo
REPO_TYPE = "dataset"
PERSISTENT_DIR = "persistent"
DATA_DIR = "data"
KEEP_FILES = [
    "persistent/faiss.index",
    "persistent/faiss.index.meta.json",
    "data/glossary.json"
]

api = HfApi()
token = HfFolder.get_token() or os.getenv("HF_TOKEN", None)

def readable_size(path):
    """Return human-readable folder size."""
    total = 0
    for dirpath, _, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if os.path.exists(fp):
                total += os.path.getsize(fp)
    for unit in ["B", "KB", "MB", "GB"]:
        if total < 1024.0:
            return f"{total:.2f} {unit}"
        total /= 1024.0
    return f"{total:.2f} TB"

# --------------------------------------------------------------------
# 1. Clean caches, logs, temp files
# --------------------------------------------------------------------
def clean_temp_and_cache():
    print("🧹 Cleaning temporary and cache directories...")
    for path in ["/root/.cache", "/home/user/.cache", "/tmp"]:
        shutil.rmtree(path, ignore_errors=True)
        os.makedirs(path, exist_ok=True)

    # Remove logs larger than 5 MB
    log_dir = "logs"
    if os.path.exists(log_dir):
        for f in os.listdir(log_dir):
            fp = os.path.join(log_dir, f)
            if os.path.isfile(fp) and os.path.getsize(fp) > 5 * 1024 * 1024:
                os.remove(fp)
                print(f"🗑️ Removed oversized log: {fp}")

# --------------------------------------------------------------------
# 2. Remove large documents & orphan files
# --------------------------------------------------------------------
def trim_data():
    print("📦 Trimming large files from data/public_docs...")
    doc_dir = os.path.join(DATA_DIR, "public_docs")
    if not os.path.exists(doc_dir):
        return

    for root, _, files in os.walk(doc_dir):
        for f in files:
            fp = os.path.join(root, f)
            if os.path.getsize(fp) > 10 * 1024 * 1024:  # >10MB
                print(f"🗑️ Removing large doc: {fp}")
                os.remove(fp)

# --------------------------------------------------------------------
# 3. Verify and keep only essential files
# --------------------------------------------------------------------
def preserve_key_files():
    print("🔒 Preserving essential files (index + glossary)...")
    all_keep = []
    for f in KEEP_FILES:
        if os.path.exists(f):
            print(f"✅ Keeping: {f}")
            all_keep.append(f)
        else:
            print(f"⚠️ Missing expected file: {f}")
    return all_keep

# --------------------------------------------------------------------
# 4. Upload cleaned files to dataset
# --------------------------------------------------------------------
def upload_to_hub(files):
    if not token:
        print("❌ No HF token found. Please add HF_TOKEN with write access.")
        return
    print(f"🚀 Uploading cleaned files to {REPO_ID} ...")
    for f in files:
        try:
            upload_file(
                path_or_fileobj=f,
                path_in_repo=f,
                repo_id=REPO_ID,
                repo_type=REPO_TYPE,
                token=token,
                commit_message=f"Auto-cleanup sync {datetime.utcnow().isoformat()}"
            )
            print(f"✅ Uploaded: {f}")
        except Exception as e:
            print(f"⚠️ Failed to upload {f}: {e}")

# --------------------------------------------------------------------
# 5. Disk usage report
# --------------------------------------------------------------------
def report_usage():
    print("\n📊 Disk Usage Summary:")
    for path in ["persistent", "data", "/home/user"]:
        if os.path.exists(path):
            print(f"{path}: {readable_size(path)}")

# --------------------------------------------------------------------
# Run everything
# --------------------------------------------------------------------
if __name__ == "__main__":
    start = time.time()
    print("===== 🧹 Starting Space Cleanup =====")
    clean_temp_and_cache()
    trim_data()
    files = preserve_key_files()
    upload_to_hub(files)
    report_usage()
    print(f"\n✅ Cleanup finished in {time.time() - start:.2f}s")
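
cleanup_space.py is written to run standalone via its `__main__` block. If the same maintenance pass should also run whenever the Space boots, one hedged option is to invoke it from app.py's prelaunch phase; the sketch below assumes cleanup_space.py sits next to app.py and that an HF_TOKEN secret is configured for the upload step.

```python
# Hypothetical hook for app.py: run the maintenance script once at startup.
import os
import subprocess
import sys

script = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cleanup_space.py")
if os.path.exists(script):
    subprocess.run([sys.executable, script], check=False)  # never block app launch
```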
gitattributes ADDED
@@ -0,0 +1,49 @@
# ================================================
# ⚙️ Clinical Research Chatbot – Simplified .gitattributes
# ================================================
# Version: Safe for Hugging Face UI-only management
# (no Git LFS required)
# --------------------------------

# --------------------------------
# Code & Config Files (text mode)
# --------------------------------
*.py text eol=lf
*.txt text eol=lf
*.md text eol=lf
*.json text eol=lf
*.csv text eol=lf
*.yaml text eol=lf
*.yml text eol=lf
*.html text eol=lf
*.css text eol=lf
*.js text eol=lf
*.ini text eol=lf
*.cfg text eol=lf
*.toml text eol=lf
requirements.txt text eol=lf
runtime.txt text eol=lf
runtime.yaml text eol=lf
*.gitignore text eol=lf
*.gitattributes text eol=lf

# --------------------------------
# Binary & Data Files (no LFS)
# --------------------------------
*.pdf binary
*.docx binary
*.xlsx binary
*.zip binary
*.ppt binary
*.odt binary
*.png binary
*.jpg binary
*.jpeg binary
*.tif binary
*.tiff binary
*.gif binary

# --------------------------------
# Default handling
# --------------------------------
* text=auto eol=lf
gitignore ADDED
@@ -0,0 +1,71 @@
# =========================================
# 🧪 Clinical Research Chatbot – .gitignore
# =========================================

# -------------------------
# Python
# -------------------------
__pycache__/
*.pyc
*.pyo
*.pyd
*.pkl
*.pickle

# -------------------------
# Environment / virtualenv
# -------------------------
.venv/
env/
venv/
ENV/
*.env

# -------------------------
# Data & Logs
# -------------------------
logs/*
!logs/.gitkeep
# keep recent chatbot logs
!logs/query_log.csv

# -------------------------
# Data Folders
# -------------------------
# Keep reference docs & FAQs, ignore temporary files
data/public_docs/*
!data/public_docs/.gitkeep

data/faq/*
!data/faq/.gitkeep

# Glossary and metadata files should stay (important for chatbot)
!data/glossary.json
!data/faq_data.json
!data/clinical_faq.json

# Ignore temporary FAISS or index rebuilds
persistent/*
!persistent/.gitkeep
!persistent/faiss.index
!persistent/faiss.index.meta.json

# -------------------------
# Hugging Face + Transformers cache
# -------------------------
.cache/
datasets/
transformers_cache/
.huggingface/

# -------------------------
# IDE / Editor
# -------------------------
.vscode/
.idea/
.DS_Store

# -------------------------
# Miscellaneous
# -------------------------
*.tmp
*.bak
gitignore (1) ADDED
@@ -0,0 +1,71 @@
# =========================================
# 🧪 Clinical Research Chatbot – .gitignore
# =========================================

# -------------------------
# Python
# -------------------------
__pycache__/
*.pyc
*.pyo
*.pyd
*.pkl
*.pickle

# -------------------------
# Environment / virtualenv
# -------------------------
.venv/
env/
venv/
ENV/
*.env

# -------------------------
# Data & Logs
# -------------------------
logs/*
!logs/.gitkeep
# keep recent chatbot logs
!logs/query_log.csv

# -------------------------
# Data Folders
# -------------------------
# Keep reference docs & FAQs, ignore temporary files
data/public_docs/*
!data/public_docs/.gitkeep

data/faq/*
!data/faq/.gitkeep

# Glossary and metadata files should stay (important for chatbot)
!data/glossary.json
!data/faq_data.json
!data/clinical_faq.json

# Ignore temporary FAISS or index rebuilds
persistent/*
!persistent/.gitkeep
!persistent/faiss.index
!persistent/faiss.index.meta.json

# -------------------------
# Hugging Face + Transformers cache
# -------------------------
.cache/
datasets/
transformers_cache/
.huggingface/

# -------------------------
# IDE / Editor
# -------------------------
.vscode/
.idea/
.DS_Store

# -------------------------
# Miscellaneous
# -------------------------
*.tmp
*.bak
lfsconfig ADDED
@@ -0,0 +1,4 @@
[lfs]
    url = https://huggingface.co/
    locksverify = true
    batch = true
postBuild ADDED
@@ -0,0 +1,60 @@
#!/bin/bash
set -e

echo "🔧 PostBuild starting — optimizing CT-Chat Space..."

# -------------------------------------------------------
# 1️⃣ Fix dependency mismatches (Gradio & Websockets)
# -------------------------------------------------------
pip install --force-reinstall --no-cache-dir "websockets>=12" "gradio-client>=1.3.0"

# -------------------------------------------------------
# 2️⃣ Create and register shared NLTK data directory
# -------------------------------------------------------
echo "📁 Preparing shared NLTK data directory..."
export NLTK_DATA="/usr/local/share/nltk_data"
mkdir -p $NLTK_DATA
chmod -R 777 $NLTK_DATA

# -------------------------------------------------------
# 3️⃣ Preload all required NLTK resources (including punkt_tab)
# -------------------------------------------------------
echo "📦 Downloading NLTK resources..."
python -m nltk.downloader -d $NLTK_DATA \
    punkt punkt_tab averaged_perceptron_tagger averaged_perceptron_tagger_eng stopwords wordnet omw-1.4

# -------------------------------------------------------
# 4️⃣ Verify NLTK installs and paths
# -------------------------------------------------------
python - <<'PYCODE'
import nltk, os
print(f"NLTK data path → {nltk.data.path}")
# nltk.data.find() expects category-prefixed resource paths.
for pkg in ["tokenizers/punkt", "tokenizers/punkt_tab", "taggers/averaged_perceptron_tagger_eng", "corpora/stopwords", "corpora/wordnet"]:
    try:
        nltk.data.find(pkg)
        print(f"✅ Verified NLTK resource: {pkg}")
    except LookupError:
        print(f"⚠️ Missing NLTK resource: {pkg}")
PYCODE

# -------------------------------------------------------
# 5️⃣ Clean caches (stay <50GB)
# -------------------------------------------------------
echo "🧹 Cleaning Hugging Face + Torch caches..."
rm -rf /root/.cache/* || true
rm -rf /home/user/.cache/* || true
rm -rf /usr/local/share/nltk_data/taggers/__pycache__ || true
rm -rf /home/user/app/hf_cache/* || true
rm -rf /home/user/app/logs/* || true

# -------------------------------------------------------
# 6️⃣ Ensure writable temporary cache for runtime
# -------------------------------------------------------
echo "📦 Preparing /tmp/hf_cache..."
mkdir -p /tmp/hf_cache
chmod -R 777 /tmp/hf_cache

# -------------------------------------------------------
# ✅ Done
# -------------------------------------------------------
echo "✅ PostBuild completed successfully — NLTK preloaded (punkt_tab OK), cache ready at /tmp/hf_cache."
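
postBuild installs the NLTK data at build time, but the NLTK_DATA export made in that shell may not be visible to the running app. A small runtime sketch keeps the two in sync; the directory path matches postBuild, and the on-the-fly download is an assumed fallback for local runs where postBuild never executed.

```python
# Runtime counterpart to postBuild: make sure NLTK looks in the shared data dir.
import os
import nltk

nltk_dir = os.environ.get("NLTK_DATA", "/usr/local/share/nltk_data")
if nltk_dir not in nltk.data.path:
    nltk.data.path.append(nltk_dir)

try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    # Assumed fallback for environments where postBuild did not run.
    nltk.download("punkt_tab", download_dir=nltk_dir, quiet=True)
```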
requirements.txt ADDED
@@ -0,0 +1,43 @@
# =======================================
# 🧪 Clinical Research Chatbot Requirements
# =======================================

# --- Core Libraries ---
faiss-cpu
torch
transformers
sentence-transformers
sentencepiece
fastapi
whoosh

# --- Data Handling ---
numpy
pandas
datasets

# --- Document Parsing ---
pymupdf
python-docx
openpyxl
beautifulsoup4
requests
aiofiles
rank-bm25

# --- NLP + Text Processing ---
nltk
scikit-learn
regex
tqdm

# --- Web + Interface ---
huggingface-hub>=0.23.0
gradio
gradio-client
uvicorn
spaces
python-multipart

# --- Networking / Compatibility Fix ---
websockets>=12
runtime.txt ADDED
@@ -0,0 +1 @@
python-3.10
runtime.yaml ADDED
@@ -0,0 +1,26 @@
# =======================================
# ⚙️ Hugging Face Space Runtime Configuration
# =======================================

python: "3.10"            # Stable for FAISS + Gradio + Transformers

# App entrypoint (FastAPI with Gradio mount)
entrypoint: "app:app"

hardware: "cpu-basic"     # For small to medium FAISS indexes
# hardware: "cpu-upgrade" # Uncomment for larger index (>100 MB) or slower summaries

timeout: 600              # 10-minute build timeout
autoreload: true          # Auto-reload app on file updates (optional)

# Cache persistent resources (prevents redownload)
cache:
  - data/
  - persistent/
  - logs/

# Explicit build hook (optional, for clarity)
build:
  commands:
    - bash postBuild