| """ | |
| utils/nlp_helpers.py — Enhanced NLP Utilities for Clinical Research Chatbot | |
| ---------------------------------------------------------------------------- | |
| ✅ Domain-aware abbreviation normalization (ICH-GCP, CDISC, FDA) | |
| ✅ Glossary-synonym expansion with prioritization | |
| ✅ Improved VAN (Verb–Adjective–Noun) normalization | |
| ✅ Compatible with Hugging Face Spaces (persistent NLTK path) | |
| """ | |
import os
import re
import json

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# --------------------------------------------------------------------
# 🧠 NLTK Setup (force consistent path for Hugging Face Spaces)
# --------------------------------------------------------------------
NLTK_PATH = "/usr/local/share/nltk_data"
os.environ["NLTK_DATA"] = NLTK_PATH
nltk.data.path.clear()
nltk.data.path.append(NLTK_PATH)
# Map each downloadable package to the resource path that nltk.data.find()
# expects; passing the bare package name (e.g., "punkt") always raises
# LookupError and would force a re-download on every start.
required_pkgs = {
    "punkt": "tokenizers/punkt",
    "punkt_tab": "tokenizers/punkt_tab",
    "averaged_perceptron_tagger": "taggers/averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng": "taggers/averaged_perceptron_tagger_eng",
    "stopwords": "corpora/stopwords",
    "wordnet": "corpora/wordnet",
}
for pkg, resource in required_pkgs.items():
    try:
        nltk.data.find(resource)
    except LookupError:
        nltk.download(pkg, download_dir=NLTK_PATH, quiet=True)
STOPWORDS = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# --------------------------------------------------------------------
# ⚕️ Clinical Abbreviation & Synonym Normalization
# --------------------------------------------------------------------
NORMALIZATION_MAP = {
    # Core trial terms
    r"\be[-_ ]?crf(s)?\b": "electronic case report form",
    r"\bedc(s)?\b": "electronic data capture",
    r"\bctms\b": "clinical trial management system",
    r"\bcsr(s)?\b": "clinical study report",
    r"\bcrf\b": "case report form",
    # Data standards
    r"\bsdtm(s)?\b": "study data tabulation model",
    r"\badam(s)?\b": "analysis data model",
    r"\bdefine[-_ ]?xml\b": "define xml metadata",
    # Compliance / Ethics
    r"\bgcp\b": "good clinical practice",
    r"\biec\b": "independent ethics committee",
    r"\birb\b": "institutional review board",
    r"\bpi\b": "principal investigator",
| r"\bsub[-_ ]?inv(es)?tigators?\b": "sub investigator", | |
| r"\bsae(s)?\b": "serious adverse event", | |
| r"\bae(s)?\b": "adverse event", | |
| r"\bsusar(s)?\b": "suspected unexpected serious adverse reaction", | |
| # Misc | |
| r"\bsdv\b": "source data verification", | |
| r"\bsop(s)?\b": "standard operating procedure", | |
| r"\bqms\b": "quality management system", | |
| r"\bicf\b": "informed consent form", | |
| r"\bregulatory\b": "regulatory compliance", | |
}
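
# Note: patterns run in dict insertion order, so the "e[-_ ]?crf" rule must
# precede the bare "crf" rule (otherwise "e crf" would expand incorrectly).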
DOMAIN_SYNONYMS = {
    "edc": ["data entry system", "data management platform"],
    "ecrf": ["electronic data entry form", "study data form"],
    "gcp": ["good clinical practice", "ich e6", "regulatory compliance"],
    "sdtm": ["data tabulation model", "cdisc standard"],
    "adam": ["analysis dataset model", "statistical dataset"],
    "ae": ["adverse event", "side effect"],
    "sae": ["serious adverse event", "life threatening event"],
    "susar": ["unexpected serious adverse reaction", "drug safety event"],
    "ctms": ["trial management tool", "site tracking system"],
    "pi": ["principal investigator", "study doctor"],
    "csr": ["clinical study report", "final study document"],
    "qms": ["quality management framework", "audit system"],
    "sop": ["standard operating procedure", "company process document"],
}

GLOSSARY_PATH = "data/glossary.json"
# --------------------------------------------------------------------
# 🧹 Text Normalization
# --------------------------------------------------------------------
def normalize_query_text(text: str) -> str:
    """Lowercase, remove punctuation, and expand known abbreviations."""
    text = text.strip().lower()
    text = re.sub(r"[^\w\s\-]", " ", text)
    text = re.sub(r"\s+", " ", text)
    for pattern, repl in NORMALIZATION_MAP.items():
        text = re.sub(pattern, repl, text)
    return text.strip()
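
# Example (illustrative):
#   normalize_query_text("How does the eCRF relate to GCP?")
#   -> "how does the electronic case report form relate to good clinical practice"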
# --------------------------------------------------------------------
# ⚙️ VAN (Verb–Adjective–Noun) Extraction — IMPROVED
# --------------------------------------------------------------------
def extract_van_tokens(text: str):
    """
    Extract and normalize core content words using VAN logic.

    - Lowercases and expands abbreviations
    - Removes stopwords and determiners ('a', 'an', 'the')
    - Keeps only Verbs, Adjectives, and Nouns
    - Lemmatizes words to singular or base form
    - Deduplicates tokens
    """
    text = normalize_query_text(text)
    if not text:
        return []
    try:
        tokens = nltk.word_tokenize(text)
        pos_tags = nltk.pos_tag(tokens)
    except LookupError:
        # Retry once after fetching the tokenizer/tagger resources
        # (newer NLTK releases also need the *_eng tagger package).
        for pkg in [
            "punkt",
            "punkt_tab",
            "averaged_perceptron_tagger",
            "averaged_perceptron_tagger_eng",
        ]:
            nltk.download(pkg, download_dir=NLTK_PATH, quiet=True)
        pos_tags = nltk.pos_tag(nltk.word_tokenize(text))
    filtered = []
    for w, t in pos_tags:
        if not w.isalpha():
            continue
        # Remove determiners and common auxiliaries
        if w in {"a", "an", "the", "is", "are", "was", "were", "be", "been", "being"}:
            continue
        if w in STOPWORDS:
            continue
        if len(w) <= 2:
            continue
        # Keep only Nouns, Verbs, and Adjectives
        if t.startswith(("N", "V", "J")):
            pos = (
                "v" if t.startswith("V")
                else "a" if t.startswith("J")
                else "n"
            )
            lemma = lemmatizer.lemmatize(w, pos)
            filtered.append(lemma)

    # Deduplicate while preserving order
    seen, unique = set(), []
    for w in filtered:
        if w not in seen:
            seen.add(w)
            unique.append(w)
    return unique
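
# Example (illustrative; exact tags and lemmas depend on the NLTK models):
#   extract_van_tokens("What are the key responsibilities of the PI?")
#   -> ["key", "responsibility", "principal", "investigator"]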
# --------------------------------------------------------------------
# 📘 Glossary-based Synonym Expansion
# --------------------------------------------------------------------
def expand_with_glossary(tokens: list):
    """Expand tokens using both internal DOMAIN_SYNONYMS and glossary.json."""
    expanded = list(tokens)

    # Add domain synonym expansion
    for token in tokens:
        key = token.lower()
        if key in DOMAIN_SYNONYMS:
            expanded.extend(DOMAIN_SYNONYMS[key])

    # Glossary-driven enrichment
    if os.path.exists(GLOSSARY_PATH):
        try:
            with open(GLOSSARY_PATH, "r", encoding="utf-8") as f:
                glossary = json.load(f)
        except Exception:
            glossary = {}
        for token in tokens:
            t_norm = re.sub(r"[^a-z0-9]", "", token.lower())
            for term, definition in glossary.items():
                term_norm = re.sub(r"[^a-z0-9]", "", term.lower())
                if t_norm in term_norm or term_norm in t_norm:
                    defs = [
                        w for w in re.findall(r"[a-z]+", str(definition).lower())
                        if w not in STOPWORDS and len(w) > 3
                    ]
                    expanded.extend(defs[:3])

    # Deduplicate while preserving order
    seen, out = set(), []
    for w in expanded:
        if w not in seen:
            seen.add(w)
            out.append(w)
    return out
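
# Example (illustrative; glossary hits depend on data/glossary.json, if present):
#   expand_with_glossary(["edc", "trial"])
#   -> ["edc", "trial", "data entry system", "data management platform", ...]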
# --------------------------------------------------------------------
# 🔍 Unified Token Extraction
# --------------------------------------------------------------------
def extract_content_words(query: str):
    """Normalize, extract VAN tokens, and expand using domain synonyms & glossary."""
    print(f"🔎 [NLP] Extracting VANs from query: {query}")
    tokens = extract_van_tokens(query)
    expanded = expand_with_glossary(tokens)
    print(f"🔎 [NLP] VAN tokens → {expanded}")
    return expanded
# --------------------------------------------------------------------
# 🧪 Self-test
# --------------------------------------------------------------------
if __name__ == "__main__":
    sample = "Explain how EDC and eCRF relate to GCP compliance in a clinical trial?"
    print(extract_content_words(sample))
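
    # A couple of extra illustrative queries (assumed examples, not a test suite):
    for q in [
        "What does an SAE report require under GCP?",
        "How are SDTM and ADaM datasets related?",
    ]:
        print(extract_content_words(q))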