Spaces:

essprasad
/

ClinicalTrialBasics

Running

App Files Files Community

essprasad committed 12 days ago

Commit

e9a707b

verified ·

1 Parent(s): cc3f67c

Delete utils/nlp_helpers.py

Browse files

Files changed (1) hide show

utils/nlp_helpers.py +0 -212

utils/nlp_helpers.py DELETED Viewed

@@ -1,212 +0,0 @@
-"""
-utils/nlp_helpers.py — Enhanced NLP Utilities for Clinical Research Chatbot
-----------------------------------------------------------------------------
-✅ Domain-aware abbreviation normalization (ICH-GCP, CDISC, FDA)
-✅ Glossary-synonym expansion with prioritization
-✅ Improved VAN (Verb–Adjective–Noun) normalization
-✅ Compatible with Hugging Face Spaces (persistent NLTK path)
-"""
-import os
-import re
-import json
-import nltk
-from nltk.corpus import stopwords
-from nltk.stem import WordNetLemmatizer
-# --------------------------------------------------------------------
-# 🧠 NLTK Setup (force consistent path for Hugging Face Spaces)
-# --------------------------------------------------------------------
-# Pin NLTK to a single data directory: the default search path is cleared and
-# replaced with NLTK_PATH, and the NLTK_DATA environment variable is set to match.
-NLTK_PATH = "/usr/local/share/nltk_data"
-os.environ["NLTK_DATA"] = NLTK_PATH
-nltk.data.path.clear()
-nltk.data.path.append(NLTK_PATH)
-required_pkgs = [
-    "punkt",
-    "punkt_tab",
-    "averaged_perceptron_tagger",
-    "averaged_perceptron_tagger_eng",
-    "stopwords",
-    "wordnet",
-]
-# nltk.data.find() resolves category-qualified resource paths such as
-# "corpora/stopwords". A bare package id is not found in a standard data
-# layout, which made the availability check fail and re-ran nltk.download()
-# on every import. Map each download id to the path it is installed under.
-_NLTK_RESOURCE_PATHS = {
-    "punkt": "tokenizers/punkt",
-    "punkt_tab": "tokenizers/punkt_tab",
-    "averaged_perceptron_tagger": "taggers/averaged_perceptron_tagger",
-    "averaged_perceptron_tagger_eng": "taggers/averaged_perceptron_tagger_eng",
-    "stopwords": "corpora/stopwords",
-    "wordnet": "corpora/wordnet",
-}
-for pkg in required_pkgs:
-    try:
-        # Unknown ids fall back to the bare name, i.e. the previous behaviour.
-        nltk.data.find(_NLTK_RESOURCE_PATHS.get(pkg, pkg))
-    except LookupError:
-        nltk.download(pkg, download_dir=NLTK_PATH, quiet=True)
-# Module-level singletons shared by the helpers below.
-STOPWORDS = set(stopwords.words("english"))
-lemmatizer = WordNetLemmatizer()
-# --------------------------------------------------------------------
-# ⚕️ Clinical Abbreviation & Synonym Normalization
-# --------------------------------------------------------------------
-# Regex -> replacement table consumed by normalize_query_text(), which applies
-# the entries in insertion order to text that has already been lowercased and
-# had its whitespace collapsed to single spaces. Order matters: the eCRF rule
-# must run before the plain CRF rule so "e crf" is not split.
-NORMALIZATION_MAP = {
-    # Core trial terms
-    r"\be[-_ ]?crf(s)?\b": "electronic case report form",
-    r"\bedc(s)?\b": "electronic data capture",
-    r"\bctms\b": "clinical trial management system",
-    r"\bcsr(s)?\b": "clinical study report",
-    # Accept the plural too, consistent with the other "(s)?" patterns.
-    r"\bcrf(s)?\b": "case report form",
-    # Data standards
-    r"\bsdtm(s)?\b": "study data tabulation model",
-    r"\badam(s)?\b": "analysis data model",
-    r"\bdefine[-_ ]?xml\b": "define xml metadata",
-    # Compliance / Ethics
-    r"\bgcp\b": "good clinical practice",
-    r"\biec\b": "independent ethics committee",
-    r"\birb\b": "institutional review board",
-    r"\bpi\b": "principal investigator",
-    r"\bsub[-_ ]?inv(es)?tigators?\b": "sub investigator",
-    r"\bsae(s)?\b": "serious adverse event",
-    r"\bae(s)?\b": "adverse event",
-    r"\bsusar(s)?\b": "suspected unexpected serious adverse reaction",
-    # Misc
-    r"\bsdv\b": "source data verification",
-    r"\bsop(s)?\b": "standard operating procedure",
-    r"\bqms\b": "quality management system",
-    r"\bicf\b": "informed consent form",
-    # The lookahead skips text that already reads "regulatory compliance";
-    # without it the expansion produced "regulatory compliance compliance".
-    r"\bregulatory\b(?! compliance\b)": "regulatory compliance",
-}
-# Lowercase abbreviation -> related phrases. expand_with_glossary() appends
-# these phrases whenever a token equals one of the keys.
-DOMAIN_SYNONYMS = dict(
-    edc=["data entry system", "data management platform"],
-    ecrf=["electronic data entry form", "study data form"],
-    gcp=["good clinical practice", "ich e6", "regulatory compliance"],
-    sdtm=["data tabulation model", "cdisc standard"],
-    adam=["analysis dataset model", "statistical dataset"],
-    ae=["adverse event", "side effect"],
-    sae=["serious adverse event", "life threatening event"],
-    susar=["unexpected serious adverse reaction", "drug safety event"],
-    ctms=["trial management tool", "site tracking system"],
-    pi=["principal investigator", "study doctor"],
-    csr=["clinical study report", "final study document"],
-    qms=["quality management framework", "audit system"],
-    sop=["standard operating procedure", "company process document"],
-)
-# Relative path of the optional glossary JSON read by expand_with_glossary().
-GLOSSARY_PATH = "data/glossary.json"
-# --------------------------------------------------------------------
-# 🧹 Text Normalization
-# --------------------------------------------------------------------
-def normalize_query_text(text: str) -> str:
-    """Return *text* lowercased, de-punctuated, and with abbreviations expanded.
-
-    Every character other than word characters, whitespace, and hyphens is
-    replaced by a space, whitespace runs are collapsed to one space, and each
-    NORMALIZATION_MAP pattern is then substituted in the map's own order.
-    """
-    cleaned = text.strip().lower()
-    # Hyphens survive this step so patterns such as "e-crf" can still match.
-    cleaned = re.sub(r"[^\w\s\-]", " ", cleaned)
-    cleaned = re.sub(r"\s+", " ", cleaned)
-    for abbreviation_regex in NORMALIZATION_MAP:
-        expansion = NORMALIZATION_MAP[abbreviation_regex]
-        cleaned = re.sub(abbreviation_regex, expansion, cleaned)
-    return cleaned.strip()
-# --------------------------------------------------------------------
-# ⚙️ VAN (Verb–Adjective–Noun) Extraction — IMPROVED
-# --------------------------------------------------------------------
-def extract_van_tokens(text: str):
-    """
-    Extract de-duplicated verb/adjective/noun lemmas from *text*.
-    - Normalizes the text with normalize_query_text() (lowercase, abbreviations expanded)
-    - Tokenizes and POS-tags it with NLTK
-    - Drops non-alphabetic tokens, tokens of one or two characters, stopwords,
-      determiners, and forms of "to be"
-    - Lemmatizes the remaining nouns, verbs, and adjectives
-    Returns the lemmas as a list in first-seen order; [] for empty input.
-    """
-    text = normalize_query_text(text)
-    if not text:
-        return []
-    try:
-        pos_tags = nltk.pos_tag(nltk.word_tokenize(text))
-    except LookupError:
-        # Tokenizer or tagger data is missing: fetch it and retry once. The
-        # "_eng" tagger id is requested as well, matching the startup package
-        # list; the retry previously skipped it, so a missing "_eng" tagger
-        # could not be recovered here.
-        for pkg in (
-            "punkt",
-            "punkt_tab",
-            "averaged_perceptron_tagger",
-            "averaged_perceptron_tagger_eng",
-        ):
-            nltk.download(pkg, download_dir=NLTK_PATH, quiet=True)
-        pos_tags = nltk.pos_tag(nltk.word_tokenize(text))
-    # Determiners and common auxiliaries, excluded regardless of STOPWORDS.
-    skip_words = {"a", "an", "the", "is", "are", "was", "were", "be", "been", "being"}
-    lemmas = []
-    for word, tag in pos_tags:
-        if not word.isalpha() or len(word) <= 2:
-            continue
-        if word in skip_words or word in STOPWORDS:
-            continue
-        # Map the Penn Treebank tag prefix to the WordNet POS code the
-        # lemmatizer expects; every other tag class is discarded.
-        if tag.startswith("V"):
-            wordnet_pos = "v"
-        elif tag.startswith("J"):
-            wordnet_pos = "a"
-        elif tag.startswith("N"):
-            wordnet_pos = "n"
-        else:
-            continue
-        lemmas.append(lemmatizer.lemmatize(word, wordnet_pos))
-    # dict.fromkeys de-duplicates while preserving first-seen order.
-    return list(dict.fromkeys(lemmas))
-# --------------------------------------------------------------------
-# 📘 Glossary-based Synonym Expansion
-# --------------------------------------------------------------------
-def expand_with_glossary(tokens: list):
-    """Expand *tokens* with DOMAIN_SYNONYMS phrases and glossary definition words.
-
-    For each token whose lowercase form is a DOMAIN_SYNONYMS key, the mapped
-    phrases are appended. If GLOSSARY_PATH holds a JSON object, every glossary
-    term whose alphanumeric-only form contains, or is contained in, a token's
-    alphanumeric-only form contributes the first three non-stopword words
-    (longer than three letters) of its definition.
-
-    Returns a new list: the original tokens followed by the additions,
-    de-duplicated in first-seen order. A missing, unreadable, malformed, or
-    non-object glossary is treated as empty.
-    """
-    expanded = list(tokens)
-    # Add domain synonym expansion
-    for token in tokens:
-        expanded.extend(DOMAIN_SYNONYMS.get(token.lower(), []))
-    # Glossary-driven enrichment (best effort)
-    try:
-        with open(GLOSSARY_PATH, "r", encoding="utf-8") as f:
-            glossary = json.load(f)
-    except (OSError, ValueError):
-        # OSError: file missing or unreadable; ValueError: invalid JSON or encoding.
-        glossary = {}
-    if not isinstance(glossary, dict):
-        # A JSON list or scalar has no term -> definition pairs to iterate.
-        glossary = {}
-    # Normalize each term once instead of once per token. Terms that normalize
-    # to "" are dropped: the empty string is a substring of everything and
-    # would match every token.
-    normalized_terms = []
-    for term, definition in glossary.items():
-        term_norm = re.sub(r"[^a-z0-9]", "", term.lower())
-        if term_norm:
-            normalized_terms.append((term_norm, definition))
-    for token in tokens:
-        t_norm = re.sub(r"[^a-z0-9]", "", token.lower())
-        if not t_norm:
-            # Same reasoning: an empty token would match every glossary term.
-            continue
-        for term_norm, definition in normalized_terms:
-            if t_norm in term_norm or term_norm in t_norm:
-                defs = [
-                    w for w in re.findall(r"[a-z]+", str(definition).lower())
-                    if w not in STOPWORDS and len(w) > 3
-                ]
-                expanded.extend(defs[:3])
-    # Deduplicate while preserving first-seen order
-    return list(dict.fromkeys(expanded))
-# --------------------------------------------------------------------
-# 🔍 Unified Token Extraction
-# --------------------------------------------------------------------
-def extract_content_words(query: str):
-    """Run the full pipeline on *query*: VAN extraction, then synonym/glossary expansion.
-
-    Prints the incoming query and the resulting token list to stdout, and
-    returns that token list.
-    """
-    print(f"🔎 [NLP] Extracting VANs from query: {query}")
-    expanded = expand_with_glossary(extract_van_tokens(query))
-    print(f"🔎 [NLP] VAN tokens → {expanded}")
-    return expanded
-# --------------------------------------------------------------------
-# 🧪 Self-test
-# --------------------------------------------------------------------
-if __name__ == "__main__":
-    # Manual smoke test: run one mixed-abbreviation query through the pipeline.
-    print(
-        extract_content_words(
-            "Explain how EDC and eCRF relate to GCP compliance in a clinical trial?"
-        )
-    )