essprasad committed
Commit 9788b7f · verified · 1 Parent(s): e9a707b

Upload 4 files

Files changed (4):
  1. utils/api_clients.py +194 -0
  2. utils/faq.py +112 -0
  3. utils/feedback.py +105 -0
  4. utils/nlp_helpers.py +212 -0
utils/api_clients.py ADDED
@@ -0,0 +1,194 @@
"""
utils/api_clients.py
------------------------------------------------
Enhanced API clients for:
- PubMed (NCBI)
- ClinicalTrials.gov
- FDA Open Data
- WHO ICTRP
------------------------------------------------
Optimized for hybrid VAN-based query processing:
- Automatically truncates long queries (top keywords only)
- Resilient to API downtime or malformed responses
- HTML formatted results for Gradio rendering
"""

import requests
import html
import re
import traceback

# ============================================================
# 🔹 Query Normalization
# ============================================================
def _normalize_query(query: str, max_words: int = 5) -> str:
    """
    Cleans and shortens the user query for API compatibility.
    Removes filler phrases and limits the query to key words.
    """
    q = query.lower()
    q = re.sub(
        r"(what is|define|explain|describe|in clinical trials|the meaning of|tell me about|explanation of|concept of)\b",
        "",
        q,
    )
    q = re.sub(r"[^a-z0-9\s]", "", q)
    q = re.sub(r"\s+", " ", q).strip()

    # limit to first few words (avoid 404s from overlong queries)
    words = q.split()
    q = " ".join(words[:max_words])
    return q or "clinical trial"


# ============================================================
# 🔹 PubMed API (NCBI E-Utilities)
# ============================================================
def fetch_pubmed(query: str, limit: int = 3) -> str:
    try:
        q = _normalize_query(query)
        base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        esearch = f"{base}esearch.fcgi?db=pubmed&term={q}&retmax={limit}&retmode=json"
        res = requests.get(esearch, timeout=10)
        res.raise_for_status()

        ids = res.json().get("esearchresult", {}).get("idlist", [])
        if not ids:
            return f"<i>No PubMed results found for <b>{html.escape(q)}</b>.</i>"

        summaries = []
        for pmid in ids:
            summary_url = f"{base}esummary.fcgi?db=pubmed&id={pmid}&retmode=json"
            sres = requests.get(summary_url, timeout=10)
            sres.raise_for_status()
            doc = sres.json()["result"].get(pmid, {})
            title = html.escape(doc.get("title", "Untitled"))
            source = html.escape(doc.get("source", ""))
            pubdate = html.escape(doc.get("pubdate", ""))
            link = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
            summaries.append(
                f"<b>{title}</b><br>{source} ({pubdate})<br>"
                f"<a href='{link}' target='_blank'>[PubMed]</a>"
            )

        return "<br><br>".join(summaries)

    except Exception as e:
        traceback.print_exc()
        return f"<i>PubMed fetch failed for <b>{html.escape(query)}</b>: {e}</i>"


# ============================================================
# 🔹 ClinicalTrials.gov API
# ============================================================
def fetch_clinicaltrials(query: str, limit: int = 3) -> str:
    """
    Retrieves brief summaries of matching trials from ClinicalTrials.gov.
    Automatically truncates the query to avoid 404s on long input.
    """
    try:
        q = _normalize_query(query)
        url = (
            f"https://clinicaltrials.gov/api/query/study_fields?"
            f"expr={q}&fields=NCTId,BriefTitle,Condition,OverallStatus"
            f"&max_rnk={limit}&fmt=json"
        )
        res = requests.get(url, timeout=10)
        res.raise_for_status()

        studies = res.json().get("StudyFieldsResponse", {}).get("StudyFields", [])
        if not studies:
            return f"<i>No trials found for <b>{html.escape(q)}</b>.</i>"

        formatted = []
        for s in studies:
            nct = s.get("NCTId", [""])[0]
            title = html.escape(s.get("BriefTitle", [""])[0])
            condition = html.escape(", ".join(s.get("Condition", [])))
            status = html.escape(s.get("OverallStatus", ["Unknown"])[0])
            link = f"https://clinicaltrials.gov/study/{nct}" if nct else "#"
            formatted.append(
                f"<b>{title}</b><br>"
                f"Condition: {condition or 'N/A'}<br>"
                f"Status: {status}<br>"
                f"<a href='{link}' target='_blank'>[ClinicalTrials.gov]</a>"
            )

        return "<br><br>".join(formatted)

    except Exception as e:
        traceback.print_exc()
        return f"<i>ClinicalTrials.gov fetch failed for <b>{html.escape(query)}</b>: {e}</i>"


# ============================================================
# 🔹 FDA Open Data API
# ============================================================
def fetch_fda(query: str, limit: int = 3) -> str:
    """
    Retrieves FDA label and safety data for a given compound/drug name.
    """
    try:
        q = _normalize_query(query)
        url = f"https://api.fda.gov/drug/label.json?search=openfda.brand_name:{q}&limit={limit}"
        res = requests.get(url, timeout=10)

        if res.status_code == 404:
            return f"<i>No FDA data found for <b>{html.escape(q)}</b>.</i>"

        res.raise_for_status()
        data = res.json().get("results", [])
        if not data:
            return f"<i>No FDA label results found for <b>{html.escape(q)}</b>.</i>"

        formatted = []
        for entry in data:
            brand = ", ".join(entry.get("openfda", {}).get("brand_name", []))
            generic = ", ".join(entry.get("openfda", {}).get("generic_name", []))
            purpose = html.escape(" ".join(entry.get("purpose", [])[:1]))
            warnings = html.escape(" ".join(entry.get("warnings", [])[:1]))
            link = "https://open.fda.gov/drug/label/"
            formatted.append(
                f"<b>{brand or q}</b> ({generic or 'N/A'})<br>"
                f"<u>Purpose:</u> {purpose or 'N/A'}<br>"
                f"<u>Warning:</u> {warnings or 'N/A'}<br>"
                f"<a href='{link}' target='_blank'>[FDA Label]</a>"
            )

        return "<br><br>".join(formatted)

    except Exception as e:
        traceback.print_exc()
        return f"<i>FDA fetch failed for <b>{html.escape(query)}</b>: {e}</i>"


# ============================================================
# 🔹 WHO ICTRP (Backup Trial Source)
# ============================================================
def fetch_who_trials(query: str, limit: int = 2) -> str:
    """
    Optional backup trial search from the WHO ICTRP API.
    Returns simplified summaries for readability.
    """
    try:
        q = _normalize_query(query)
        url = f"https://trialsearch.who.int/api/TrialSearch?query={q}"
        res = requests.get(url, timeout=10)

        if res.status_code != 200:
            return "<i>WHO ICTRP API unavailable or throttled.</i>"

        trials = res.json().get("TrialSearchResult", [])
        if not trials:
            return f"<i>No WHO trials found for <b>{html.escape(q)}</b>.</i>"

        formatted = []
        for t in trials[:limit]:
            title = html.escape(t.get("Scientific_title", "Untitled"))
            registry = html.escape(t.get("Register", ""))
            country = html.escape(t.get("Recruitment_Country", ""))
            formatted.append(
                f"<b>{title}</b><br>{registry or 'Registry Unknown'} — {country or 'N/A'}"
            )

        return "<br><br>".join(formatted)

    except Exception as e:
        traceback.print_exc()
        return f"<i>WHO ICTRP fetch failed for <b>{html.escape(query)}</b>: {e}</i>"
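
For reference, a minimal sketch of how these clients might be combined by the calling app. The aggregate_sources helper and its HTML layout are assumptions for illustration, not part of this commit; only the imported fetch_* functions come from the file above.

# Illustrative only — not part of the committed files.
from utils.api_clients import fetch_pubmed, fetch_clinicaltrials, fetch_fda

def aggregate_sources(query: str) -> str:
    """Collect HTML snippets from each client; failures degrade to inline <i> messages."""
    sections = {
        "PubMed": fetch_pubmed(query, limit=3),
        "ClinicalTrials.gov": fetch_clinicaltrials(query, limit=3),
        "FDA": fetch_fda(query, limit=2),
    }
    return "<br><hr><br>".join(f"<h4>{name}</h4>{body}" for name, body in sections.items())

if __name__ == "__main__":
    print(aggregate_sources("What is an adverse event in clinical trials?"))

Because every client catches its own exceptions and returns an HTML string, the aggregate view stays renderable in Gradio even when one upstream API is down.
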
utils/faq.py ADDED
@@ -0,0 +1,112 @@
import json
import os
from sentence_transformers import SentenceTransformer, util
import torch

FAQ_PATHS = ["data/faq_data.json", "data/clinical_faq.json"]
_FAQ_CACHE = None
_FAQ_EMBEDDINGS = None
_MODEL = None


def _get_model():
    """Load and cache the embedding model (shared with main app if possible)."""
    global _MODEL
    if _MODEL is None:
        print("📦 [faq] Loading embedding model: all-MiniLM-L6-v2 ...")
        _MODEL = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    return _MODEL


def load_faqs():
    """Load FAQ data from JSON files and cache them."""
    global _FAQ_CACHE
    if _FAQ_CACHE is not None:
        return _FAQ_CACHE

    all_faqs = []
    for path in FAQ_PATHS:
        if os.path.exists(path):
            try:
                with open(path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                if isinstance(data, list):
                    all_faqs.extend(data)
                elif isinstance(data, dict):
                    for k, v in data.items():
                        all_faqs.append({"question": k, "answer": v})
            except Exception as e:
                print(f"⚠️ Failed to load FAQ file {path}: {e}")

    _FAQ_CACHE = all_faqs
    print(f"✅ [faq] Loaded {len(_FAQ_CACHE)} FAQ entries.")
    return _FAQ_CACHE


def _build_embeddings():
    """Precompute embeddings for all FAQ questions."""
    global _FAQ_EMBEDDINGS
    faqs = load_faqs()
    if not faqs:
        _FAQ_EMBEDDINGS = torch.empty(0)
        return _FAQ_EMBEDDINGS

    model = _get_model()
    # Encode one question per FAQ entry (empty string if missing) so that
    # similarity indices stay aligned with the cached faqs list.
    questions = [f.get("question", "") for f in faqs]
    _FAQ_EMBEDDINGS = model.encode(questions, convert_to_tensor=True, show_progress_bar=False)
    print(f"✅ [faq] Encoded {len(_FAQ_EMBEDDINGS)} FAQ embeddings.")
    return _FAQ_EMBEDDINGS


def get_faq_answer(query: str, top_k: int = 1) -> str:
    """
    Return the most semantically similar FAQ answer to the query.
    Uses MiniLM embeddings and cosine similarity.
    """
    faqs = load_faqs()
    if not faqs:
        return ""

    if _FAQ_EMBEDDINGS is None:
        _build_embeddings()

    model = _get_model()
    query_emb = model.encode(query, convert_to_tensor=True)
    sims = util.cos_sim(query_emb, _FAQ_EMBEDDINGS)[0]
    top_idx = int(torch.argmax(sims))

    best_score = float(sims[top_idx])
    best_item = faqs[top_idx]

    if best_score < 0.45:  # threshold to avoid weak matches
        return ""

    answer = best_item.get("answer", "")
    print(f"💡 [faq] Best match: \"{best_item.get('question')}\" (score={best_score:.2f})")
    return answer


def lookup_faq(query: str, top_k: int = 3) -> str:
    """
    Return an HTML-formatted list of the top-k semantically similar FAQ matches.
    Useful for admin or verbose display.
    """
    faqs = load_faqs()
    if not faqs:
        return "<i>No FAQ data loaded.</i>"

    if _FAQ_EMBEDDINGS is None:
        _build_embeddings()

    model = _get_model()
    query_emb = model.encode(query, convert_to_tensor=True)
    sims = util.cos_sim(query_emb, _FAQ_EMBEDDINGS)[0]
    top_indices = torch.topk(sims, k=min(top_k, len(faqs))).indices.tolist()

    html = []
    for idx in top_indices:
        score = float(sims[idx])
        item = faqs[idx]
        html.append(
            f"<b>{item.get('question', '')}</b><br>{item.get('answer', '')}"
            f"<br><i>(score={score:.2f})</i>"
        )

    return "<br><br>".join(html)
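
A minimal usage sketch for this module, assuming a data/faq_data.json shaped like the dict branch of load_faqs() (question text as keys, answers as values). The example file contents and the fallback flow are illustrative assumptions.

# Illustrative only — not part of the committed files.
# data/faq_data.json (dict form accepted by load_faqs):
# {
#   "What is an eCRF?": "An electronic case report form used to capture trial data.",
#   "What does GCP stand for?": "Good Clinical Practice, the ICH E6 quality standard."
# }
from utils.faq import get_faq_answer, lookup_faq

answer = get_faq_answer("Explain what an electronic case report form is")
if answer:
    print("Direct FAQ answer:", answer)
else:
    # Similarity fell below the 0.45 threshold; show the top matches instead
    # (or fall back to other retrieval paths in the main app).
    print(lookup_faq("electronic case report form", top_k=3))
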
utils/feedback.py ADDED
@@ -0,0 +1,105 @@
"""
utils/feedback.py
Unified feedback handler for Clinical Research Chatbot.

Includes:
1️⃣ Feedback Queue (unanswered/low-confidence queries)
2️⃣ User Voting (👍 Helpful / 👎 Not Helpful)
"""

import os
import json
from datetime import datetime

# ----------------------------
# File Paths
# ----------------------------
FEEDBACK_QUEUE_LOG = "logs/feedback_queue.jsonl"
FEEDBACK_VOTES_LOG = "logs/feedback_votes.jsonl"


# ----------------------------
# Feedback Queue (for Admin Review)
# ----------------------------
def log_feedback(query: str, notes: str = "", sources=None):
    """
    Store unanswered or low-confidence queries for admin review.
    Saves to JSONL (one entry per line).
    """
    entry = {
        "timestamp": datetime.utcnow().isoformat(),
        "query": query,
        "notes": notes,
        "sources": sources or [],
    }

    os.makedirs(os.path.dirname(FEEDBACK_QUEUE_LOG), exist_ok=True)
    with open(FEEDBACK_QUEUE_LOG, "a", encoding="utf-8") as f:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    print(f"📝 Feedback queued for admin review: {query}")


def load_feedback(limit: int = 20):
    """
    Load the last N feedback entries for the admin dashboard.
    """
    if not os.path.exists(FEEDBACK_QUEUE_LOG):
        return []
    with open(FEEDBACK_QUEUE_LOG, "r", encoding="utf-8") as f:
        lines = f.readlines()
    entries = [json.loads(line) for line in lines]
    return entries[-limit:]


def clear_feedback():
    """
    Clear the feedback log (admin only).
    """
    if os.path.exists(FEEDBACK_QUEUE_LOG):
        os.remove(FEEDBACK_QUEUE_LOG)
        print("🗑️ Feedback log cleared.")


# ----------------------------
# User Voting (for “Helpful / Not Helpful”)
# ----------------------------
def save_vote_feedback(query: str, vote: str, context=None):
    """
    Log user votes (👍 / 👎) on chatbot responses.
    """
    entry = {
        "timestamp": datetime.utcnow().isoformat(),
        "query": query,
        "vote": vote,
        "context": context or {},
    }

    os.makedirs(os.path.dirname(FEEDBACK_VOTES_LOG), exist_ok=True)
    try:
        with open(FEEDBACK_VOTES_LOG, "a", encoding="utf-8") as f:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
        print(f"🗳️ User voted '{vote}' for query: {query}")
    except Exception as e:
        print(f"⚠️ Failed to save vote feedback: {e}")


def load_votes(limit: int = 50):
    """
    Load the last N user votes for analysis.
    """
    if not os.path.exists(FEEDBACK_VOTES_LOG):
        return []
    with open(FEEDBACK_VOTES_LOG, "r", encoding="utf-8") as f:
        lines = f.readlines()
    entries = [json.loads(line) for line in lines]
    return entries[-limit:]


def clear_votes():
    """
    Clear the user voting log (admin only).
    """
    if os.path.exists(FEEDBACK_VOTES_LOG):
        os.remove(FEEDBACK_VOTES_LOG)
        print("🗑️ User vote feedback cleared.")
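
A short sketch of how the two logs could be wired into the chat handler. The confidence score, the example query, and the helpfulness tally are assumptions for illustration; only the imported functions come from the file above.

# Illustrative only — not part of the committed files.
from utils.feedback import log_feedback, save_vote_feedback, load_votes

query = "Who signs off on a SUSAR narrative?"
confidence = 0.31  # hypothetical score from the retrieval layer

# When retrieval confidence is low, queue the query for admin review.
if confidence < 0.45:
    log_feedback(query, notes=f"low confidence ({confidence:.2f})", sources=["faq", "glossary"])

# When the user clicks 👍 / 👎 in the UI, record the vote with minimal context.
save_vote_feedback(query, vote="helpful", context={"confidence": confidence})

# Simple helpfulness tally for an admin view.
votes = load_votes(limit=50)
helpful = sum(1 for v in votes if v.get("vote") == "helpful")
print(f"{helpful}/{len(votes)} recent responses marked helpful")
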
utils/nlp_helpers.py ADDED
@@ -0,0 +1,212 @@
"""
utils/nlp_helpers.py — Enhanced NLP Utilities for Clinical Research Chatbot
----------------------------------------------------------------------------
✅ Domain-aware abbreviation normalization (ICH-GCP, CDISC, FDA)
✅ Glossary-synonym expansion with prioritization
✅ Improved VAN (Verb–Adjective–Noun) normalization
✅ Compatible with Hugging Face Spaces (persistent NLTK path)
"""

import os
import re
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# --------------------------------------------------------------------
# 🧠 NLTK Setup (force consistent path for Hugging Face Spaces)
# --------------------------------------------------------------------
NLTK_PATH = "/usr/local/share/nltk_data"
os.environ["NLTK_DATA"] = NLTK_PATH
nltk.data.path.clear()
nltk.data.path.append(NLTK_PATH)

# Map each downloadable package to the category-prefixed resource path
# that nltk.data.find() expects; without the prefix the lookup always
# fails and the packages are re-checked on every import.
required_pkgs = {
    "punkt": "tokenizers/punkt",
    "punkt_tab": "tokenizers/punkt_tab",
    "averaged_perceptron_tagger": "taggers/averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng": "taggers/averaged_perceptron_tagger_eng",
    "stopwords": "corpora/stopwords",
    "wordnet": "corpora/wordnet",
}

for pkg, resource in required_pkgs.items():
    try:
        nltk.data.find(resource)
    except LookupError:
        nltk.download(pkg, download_dir=NLTK_PATH, quiet=True)

STOPWORDS = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# --------------------------------------------------------------------
# ⚕️ Clinical Abbreviation & Synonym Normalization
# --------------------------------------------------------------------
NORMALIZATION_MAP = {
    # Core trial terms
    r"\be[-_ ]?crf(s)?\b": "electronic case report form",
    r"\bedc(s)?\b": "electronic data capture",
    r"\bctms\b": "clinical trial management system",
    r"\bcsr(s)?\b": "clinical study report",
    r"\bcrf\b": "case report form",
    # Data standards
    r"\bsdtm(s)?\b": "study data tabulation model",
    r"\badam(s)?\b": "analysis data model",
    r"\bdefine[-_ ]?xml\b": "define xml metadata",
    # Compliance / Ethics
    r"\bgcp\b": "good clinical practice",
    r"\biec\b": "independent ethics committee",
    r"\birb\b": "institutional review board",
    r"\bpi\b": "principal investigator",
    r"\bsub[-_ ]?investigators?\b": "sub investigator",
    r"\bsae(s)?\b": "serious adverse event",
    r"\bae(s)?\b": "adverse event",
    r"\bsusar(s)?\b": "suspected unexpected serious adverse reaction",
    # Misc
    r"\bsdv\b": "source data verification",
    r"\bsop(s)?\b": "standard operating procedure",
    r"\bqms\b": "quality management system",
    r"\bicf\b": "informed consent form",
    r"\bregulatory\b": "regulatory compliance",
}

DOMAIN_SYNONYMS = {
    "edc": ["data entry system", "data management platform"],
    "ecrf": ["electronic data entry form", "study data form"],
    "gcp": ["good clinical practice", "ich e6", "regulatory compliance"],
    "sdtm": ["data tabulation model", "cdisc standard"],
    "adam": ["analysis dataset model", "statistical dataset"],
    "ae": ["adverse event", "side effect"],
    "sae": ["serious adverse event", "life threatening event"],
    "susar": ["unexpected serious adverse reaction", "drug safety event"],
    "ctms": ["trial management tool", "site tracking system"],
    "pi": ["principal investigator", "study doctor"],
    "csr": ["clinical study report", "final study document"],
    "qms": ["quality management framework", "audit system"],
    "sop": ["standard operating procedure", "company process document"],
}

GLOSSARY_PATH = "data/glossary.json"

# --------------------------------------------------------------------
# 🧹 Text Normalization
# --------------------------------------------------------------------
def normalize_query_text(text: str) -> str:
    """Lowercase, remove punctuation, and expand known abbreviations."""
    text = text.strip().lower()
    text = re.sub(r"[^\w\s\-]", " ", text)
    text = re.sub(r"\s+", " ", text)
    for pattern, repl in NORMALIZATION_MAP.items():
        text = re.sub(pattern, repl, text)
    return text.strip()

# --------------------------------------------------------------------
# ⚙️ VAN (Verb–Adjective–Noun) Extraction — IMPROVED
# --------------------------------------------------------------------
def extract_van_tokens(text: str):
    """
    Extract and normalize core content words using VAN logic.
    - Lowercases and expands abbreviations
    - Removes stopwords and determiners ('a', 'an', 'the')
    - Keeps only Verbs, Adjectives, and Nouns
    - Lemmatizes words to singular or base form
    - Deduplicates tokens
    """
    text = normalize_query_text(text)
    if not text:
        return []

    try:
        tokens = nltk.word_tokenize(text)
        pos_tags = nltk.pos_tag(tokens)
    except LookupError:
        for pkg in ["punkt", "punkt_tab", "averaged_perceptron_tagger", "averaged_perceptron_tagger_eng"]:
            nltk.download(pkg, download_dir=NLTK_PATH, quiet=True)
        pos_tags = nltk.pos_tag(nltk.word_tokenize(text))

    filtered = []
    for w, t in pos_tags:
        if not w.isalpha():
            continue
        # Remove determiners and common auxiliaries
        if w in {"a", "an", "the", "is", "are", "was", "were", "be", "been", "being"}:
            continue
        if w in STOPWORDS:
            continue
        if len(w) <= 2:
            continue
        # Keep only N, V, J
        if t.startswith(("N", "V", "J")):
            pos = (
                "v" if t.startswith("V")
                else "a" if t.startswith("J")
                else "n"
            )
            lemma = lemmatizer.lemmatize(w, pos)
            filtered.append(lemma)

    # Deduplicate while preserving order
    seen, unique = set(), []
    for w in filtered:
        if w not in seen:
            seen.add(w)
            unique.append(w)
    return unique

# --------------------------------------------------------------------
# 📘 Glossary-based Synonym Expansion
# --------------------------------------------------------------------
def expand_with_glossary(tokens: list):
    """Expand tokens using both internal DOMAIN_SYNONYMS and glossary.json."""
    expanded = list(tokens)

    # Add domain synonym expansion
    for token in tokens:
        key = token.lower()
        if key in DOMAIN_SYNONYMS:
            expanded.extend(DOMAIN_SYNONYMS[key])

    # Glossary-driven enrichment
    if os.path.exists(GLOSSARY_PATH):
        try:
            with open(GLOSSARY_PATH, "r", encoding="utf-8") as f:
                glossary = json.load(f)
        except Exception:
            glossary = {}
        for token in tokens:
            t_norm = re.sub(r"[^a-z0-9]", "", token.lower())
            for term, definition in glossary.items():
                term_norm = re.sub(r"[^a-z0-9]", "", term.lower())
                if t_norm in term_norm or term_norm in t_norm:
                    defs = [
                        w for w in re.findall(r"[a-z]+", str(definition).lower())
                        if w not in STOPWORDS and len(w) > 3
                    ]
                    expanded.extend(defs[:3])

    # Deduplicate
    seen, out = set(), []
    for w in expanded:
        if w not in seen:
            seen.add(w)
            out.append(w)
    return out

# --------------------------------------------------------------------
# 🔍 Unified Token Extraction
# --------------------------------------------------------------------
def extract_content_words(query: str):
    """Normalize, extract VAN tokens, and expand using domain synonyms & glossary."""
    print(f"🔎 [NLP] Extracting VANs from query: {query}")
    tokens = extract_van_tokens(query)
    expanded = expand_with_glossary(tokens)
    print(f"🔎 [NLP] VAN tokens → {expanded}")
    return expanded

# --------------------------------------------------------------------
# 🧪 Self-test
# --------------------------------------------------------------------
if __name__ == "__main__":
    sample = "Explain how EDC and eCRF relate to GCP compliance in a clinical trial?"
    print(extract_content_words(sample))
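
A rough sketch of how the extracted tokens could feed the API clients from utils/api_clients.py. The committed modules do not yet call each other, so the join step and the example output comment are assumptions; fetch_clinicaltrials re-applies _normalize_query internally, which caps the expression at its top keywords.

# Illustrative only — not part of the committed files.
from utils.nlp_helpers import extract_content_words
from utils.api_clients import fetch_clinicaltrials

user_query = "What is an eCRF in GCP-compliant data capture?"
tokens = extract_content_words(user_query)
# tokens might include lemmas from the expanded abbreviations,
# e.g. 'electronic', 'case', 'report', 'form', 'good', 'clinical', 'practice', ...

# Collapse the expanded tokens back into a short search expression.
search_expr = " ".join(tokens[:5])
print(fetch_clinicaltrials(search_expr, limit=3))
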