"""
utils/nlp_helpers.py — Enhanced NLP Utilities for Clinical Research Chatbot
----------------------------------------------------------------------------
✅ Domain-aware abbreviation normalization (ICH-GCP, CDISC, FDA)
✅ Glossary-synonym expansion with prioritization
✅ Improved VAN (Verb–Adjective–Noun) normalization
✅ Compatible with Hugging Face Spaces (persistent NLTK path)
"""
import os
import re
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# --------------------------------------------------------------------
# 🧠 NLTK Setup (force consistent path for Hugging Face Spaces)
# --------------------------------------------------------------------
NLTK_PATH = "/usr/local/share/nltk_data"
os.environ["NLTK_DATA"] = NLTK_PATH
nltk.data.path.clear()
nltk.data.path.append(NLTK_PATH)
# Map each download package to the resource path nltk.data.find() expects,
# so already-installed packages are not re-downloaded on every startup.
required_pkgs = {
    "punkt": "tokenizers/punkt",
    "punkt_tab": "tokenizers/punkt_tab",
    "averaged_perceptron_tagger": "taggers/averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng": "taggers/averaged_perceptron_tagger_eng",
    "stopwords": "corpora/stopwords",
    "wordnet": "corpora/wordnet",
}
for pkg, resource in required_pkgs.items():
    try:
        nltk.data.find(resource)
    except LookupError:
        nltk.download(pkg, download_dir=NLTK_PATH, quiet=True)
STOPWORDS = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
# --------------------------------------------------------------------
# ⚕️ Clinical Abbreviation & Synonym Normalization
# --------------------------------------------------------------------
NORMALIZATION_MAP = {
    # Core trial terms
    r"\be[-_ ]?crf(s)?\b": "electronic case report form",
    r"\bedc(s)?\b": "electronic data capture",
    r"\bctms\b": "clinical trial management system",
    r"\bcsr(s)?\b": "clinical study report",
    r"\bcrf\b": "case report form",
    # Data standards
    r"\bsdtm(s)?\b": "study data tabulation model",
    r"\badam(s)?\b": "analysis data model",
    r"\bdefine[-_ ]?xml\b": "define xml metadata",
    # Compliance / Ethics
    r"\bgcp\b": "good clinical practice",
    r"\biec\b": "independent ethics committee",
    r"\birb\b": "institutional review board",
    r"\bpi\b": "principal investigator",
    r"\bsub[-_ ]?investigators?\b": "sub investigator",
    r"\bsae(s)?\b": "serious adverse event",
    r"\bae(s)?\b": "adverse event",
    r"\bsusar(s)?\b": "suspected unexpected serious adverse reaction",
    # Misc
    r"\bsdv\b": "source data verification",
    r"\bsop(s)?\b": "standard operating procedure",
    r"\bqms\b": "quality management system",
    r"\bicf\b": "informed consent form",
    r"\bregulatory\b": "regulatory compliance",
}
DOMAIN_SYNONYMS = {
    "edc": ["data entry system", "data management platform"],
    "ecrf": ["electronic data entry form", "study data form"],
    "gcp": ["good clinical practice", "ich e6", "regulatory compliance"],
    "sdtm": ["data tabulation model", "cdisc standard"],
    "adam": ["analysis dataset model", "statistical dataset"],
    "ae": ["adverse event", "side effect"],
    "sae": ["serious adverse event", "life threatening event"],
    "susar": ["unexpected serious adverse reaction", "drug safety event"],
    "ctms": ["trial management tool", "site tracking system"],
    "pi": ["principal investigator", "study doctor"],
    "csr": ["clinical study report", "final study document"],
    "qms": ["quality management framework", "audit system"],
    "sop": ["standard operating procedure", "company process document"],
}
GLOSSARY_PATH = "data/glossary.json"
# --------------------------------------------------------------------
# 🧹 Text Normalization
# --------------------------------------------------------------------
def normalize_query_text(text: str) -> str:
    """Lowercase, remove punctuation, and expand known abbreviations."""
    text = text.strip().lower()
    text = re.sub(r"[^\w\s\-]", " ", text)
    text = re.sub(r"\s+", " ", text)
    for pattern, repl in NORMALIZATION_MAP.items():
        text = re.sub(pattern, repl, text)
    return text.strip()
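# Illustrative example (comment only, not executed at import): with the
# NORMALIZATION_MAP above, a query such as
#   "How does the eCRF feed the EDC?"
# normalizes to roughly
#   "how does the electronic case report form feed the electronic data capture"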
# --------------------------------------------------------------------
# ⚙️ VAN (Verb–Adjective–Noun) Extraction — IMPROVED
# --------------------------------------------------------------------
def extract_van_tokens(text: str):
    """
    Extract and normalize core content words using VAN logic.
    - Lowercases and expands abbreviations
    - Removes stopwords and determiners ('a', 'an', 'the')
    - Keeps only Verbs, Adjectives, and Nouns
    - Lemmatizes words to singular or base form
    - Deduplicates tokens
    """
    text = normalize_query_text(text)
    if not text:
        return []
    try:
        tokens = nltk.word_tokenize(text)
        pos_tags = nltk.pos_tag(tokens)
    except LookupError:
        for pkg in ["punkt", "punkt_tab", "averaged_perceptron_tagger"]:
            nltk.download(pkg, download_dir=NLTK_PATH, quiet=True)
        pos_tags = nltk.pos_tag(nltk.word_tokenize(text))
    filtered = []
    for w, t in pos_tags:
        if not w.isalpha():
            continue
        # Remove determiners and common auxiliaries
        if w in {"a", "an", "the", "is", "are", "was", "were", "be", "been", "being"}:
            continue
        if w in STOPWORDS:
            continue
        if len(w) <= 2:
            continue
        # Keep only N, V, J
        if t.startswith(("N", "V", "J")):
            pos = (
                "v" if t.startswith("V")
                else "a" if t.startswith("J")
                else "n"
            )
            lemma = lemmatizer.lemmatize(w, pos)
            filtered.append(lemma)
    # Deduplicate while preserving order
    seen, unique = set(), []
    for w in filtered:
        if w not in seen:
            seen.add(w)
            unique.append(w)
    return unique
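# Illustrative walk-through (assuming standard Penn Treebank tags from the
# default NLTK tagger): "What are the SAEs reported in the trial?" first
# normalizes to "what are the serious adverse event reported in the trial",
# and VAN filtering then keeps roughly
#   ["serious", "adverse", "event", "report", "trial"]
# Exact output can vary with the tagger and lemmatizer models installed.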
# --------------------------------------------------------------------
# 📘 Glossary-based Synonym Expansion
# --------------------------------------------------------------------
def expand_with_glossary(tokens: list):
    """Expand tokens using both internal DOMAIN_SYNONYMS and glossary.json."""
    expanded = list(tokens)
    # Add domain synonym expansion
    for token in tokens:
        key = token.lower()
        if key in DOMAIN_SYNONYMS:
            expanded.extend(DOMAIN_SYNONYMS[key])
    # Glossary-driven enrichment
    if os.path.exists(GLOSSARY_PATH):
        try:
            with open(GLOSSARY_PATH, "r", encoding="utf-8") as f:
                glossary = json.load(f)
        except Exception:
            glossary = {}
        for token in tokens:
            t_norm = re.sub(r"[^a-z0-9]", "", token.lower())
            for term, definition in glossary.items():
                term_norm = re.sub(r"[^a-z0-9]", "", term.lower())
                if t_norm in term_norm or term_norm in t_norm:
                    defs = [
                        w for w in re.findall(r"[a-z]+", str(definition).lower())
                        if w not in STOPWORDS and len(w) > 3
                    ]
                    expanded.extend(defs[:3])
    # Deduplicate
    seen, out = set(), []
    for w in expanded:
        if w not in seen:
            seen.add(w)
            out.append(w)
    return out
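# Note: glossary.json is assumed here to be a flat {"term": "definition", ...}
# mapping (an assumption implied by the glossary.items() loop above); each
# matching entry contributes at most three non-stopword definition words.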
# --------------------------------------------------------------------
# 🔍 Unified Token Extraction
# --------------------------------------------------------------------
def extract_content_words(query: str):
    """Normalize, extract VAN tokens, and expand using domain synonyms & glossary."""
    print(f"🔎 [NLP] Extracting VANs from query: {query}")
    tokens = extract_van_tokens(query)
    expanded = expand_with_glossary(tokens)
    print(f"🔎 [NLP] VAN tokens → {expanded}")
    return expanded
# --------------------------------------------------------------------
# 🧪 Self-test
# --------------------------------------------------------------------
if __name__ == "__main__":
    sample = "Explain how EDC and eCRF relate to GCP compliance in a clinical trial?"
    print(extract_content_words(sample))
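    # Additional hedged smoke tests: sample queries only, and the printed
    # tokens depend on the NLTK models and glossary.json present at runtime.
    for q in [
        "What are SAEs and SUSARs under ICH-GCP?",
        "How is SDTM different from ADaM in a CSR?",
    ]:
        print(extract_content_words(q))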