essprasad committed on
Commit e9a707b · verified · 1 Parent(s): cc3f67c

Delete utils/nlp_helpers.py

Files changed (1)
  1. utils/nlp_helpers.py +0 -212
utils/nlp_helpers.py DELETED
@@ -1,212 +0,0 @@
- """
- utils/nlp_helpers.py — Enhanced NLP Utilities for Clinical Research Chatbot
- ----------------------------------------------------------------------------
- ✅ Domain-aware abbreviation normalization (ICH-GCP, CDISC, FDA)
- ✅ Glossary-synonym expansion with prioritization
- ✅ Improved VAN (Verb–Adjective–Noun) normalization
- ✅ Compatible with Hugging Face Spaces (persistent NLTK path)
- """
-
- import os
- import re
- import json
- import nltk
- from nltk.corpus import stopwords
- from nltk.stem import WordNetLemmatizer
-
- # --------------------------------------------------------------------
- # 🧠 NLTK Setup (force consistent path for Hugging Face Spaces)
- # --------------------------------------------------------------------
- NLTK_PATH = "/usr/local/share/nltk_data"
- os.environ["NLTK_DATA"] = NLTK_PATH
- nltk.data.path.clear()
- nltk.data.path.append(NLTK_PATH)
-
- required_pkgs = [
-     "punkt",
-     "punkt_tab",
-     "averaged_perceptron_tagger",
-     "averaged_perceptron_tagger_eng",
-     "stopwords",
-     "wordnet",
- ]
-
- for pkg in required_pkgs:
-     try:
-         nltk.data.find(pkg)
-     except LookupError:
-         nltk.download(pkg, download_dir=NLTK_PATH, quiet=True)
-
- STOPWORDS = set(stopwords.words("english"))
- lemmatizer = WordNetLemmatizer()
-
- # --------------------------------------------------------------------
- # ⚕️ Clinical Abbreviation & Synonym Normalization
- # --------------------------------------------------------------------
- NORMALIZATION_MAP = {
-     # Core trial terms
-     r"\be[-_ ]?crf(s)?\b": "electronic case report form",
-     r"\bedc(s)?\b": "electronic data capture",
-     r"\bctms\b": "clinical trial management system",
-     r"\bcsr(s)?\b": "clinical study report",
-     r"\bcrf\b": "case report form",
-     # Data standards
-     r"\bsdtm(s)?\b": "study data tabulation model",
-     r"\badam(s)?\b": "analysis data model",
-     r"\bdefine[-_ ]?xml\b": "define xml metadata",
-     # Compliance / Ethics
-     r"\bgcp\b": "good clinical practice",
-     r"\biec\b": "independent ethics committee",
-     r"\birb\b": "institutional review board",
-     r"\bpi\b": "principal investigator",
-     r"\bsub[-_ ]?inv(es)?tigators?\b": "sub investigator",
-     r"\bsae(s)?\b": "serious adverse event",
-     r"\bae(s)?\b": "adverse event",
-     r"\bsusar(s)?\b": "suspected unexpected serious adverse reaction",
-     # Misc
-     r"\bsdv\b": "source data verification",
-     r"\bsop(s)?\b": "standard operating procedure",
-     r"\bqms\b": "quality management system",
-     r"\bicf\b": "informed consent form",
-     r"\bregulatory\b": "regulatory compliance",
- }
-
- DOMAIN_SYNONYMS = {
-     "edc": ["data entry system", "data management platform"],
-     "ecrf": ["electronic data entry form", "study data form"],
-     "gcp": ["good clinical practice", "ich e6", "regulatory compliance"],
-     "sdtm": ["data tabulation model", "cdisc standard"],
-     "adam": ["analysis dataset model", "statistical dataset"],
-     "ae": ["adverse event", "side effect"],
-     "sae": ["serious adverse event", "life threatening event"],
-     "susar": ["unexpected serious adverse reaction", "drug safety event"],
-     "ctms": ["trial management tool", "site tracking system"],
-     "pi": ["principal investigator", "study doctor"],
-     "csr": ["clinical study report", "final study document"],
-     "qms": ["quality management framework", "audit system"],
-     "sop": ["standard operating procedure", "company process document"],
- }
-
- GLOSSARY_PATH = "data/glossary.json"
-
- # --------------------------------------------------------------------
- # 🧹 Text Normalization
- # --------------------------------------------------------------------
- def normalize_query_text(text: str) -> str:
-     """Lowercase, remove punctuation, and expand known abbreviations."""
-     text = text.strip().lower()
-     text = re.sub(r"[^\w\s\-]", " ", text)
-     text = re.sub(r"\s+", " ", text)
-     for pattern, repl in NORMALIZATION_MAP.items():
-         text = re.sub(pattern, repl, text)
-     return text.strip()
-
- # --------------------------------------------------------------------
- # ⚙️ VAN (Verb–Adjective–Noun) Extraction — IMPROVED
- # --------------------------------------------------------------------
- def extract_van_tokens(text: str):
-     """
-     Extract and normalize core content words using VAN logic.
-     - Lowercases and expands abbreviations
-     - Removes stopwords and determiners ('a', 'an', 'the')
-     - Keeps only Verbs, Adjectives, and Nouns
-     - Lemmatizes words to singular or base form
-     - Deduplicates tokens
-     """
-     text = normalize_query_text(text)
-     if not text:
-         return []
-
-     try:
-         tokens = nltk.word_tokenize(text)
-         pos_tags = nltk.pos_tag(tokens)
-     except LookupError:
-         for pkg in ["punkt", "punkt_tab", "averaged_perceptron_tagger"]:
-             nltk.download(pkg, download_dir=NLTK_PATH, quiet=True)
-         pos_tags = nltk.pos_tag(nltk.word_tokenize(text))
-
-     filtered = []
-     for w, t in pos_tags:
-         if not w.isalpha():
-             continue
-         # Remove determiners and common auxiliaries
-         if w in {"a", "an", "the", "is", "are", "was", "were", "be", "been", "being"}:
-             continue
-         if w in STOPWORDS:
-             continue
-         if len(w) <= 2:
-             continue
-         # Keep only N, V, J
-         if t.startswith(("N", "V", "J")):
-             pos = (
-                 "v" if t.startswith("V")
-                 else "a" if t.startswith("J")
-                 else "n"
-             )
-             lemma = lemmatizer.lemmatize(w, pos)
-             filtered.append(lemma)
-
-     # Deduplicate while preserving order
-     seen, unique = set(), []
-     for w in filtered:
-         if w not in seen:
-             seen.add(w)
-             unique.append(w)
-     return unique
-
- # --------------------------------------------------------------------
- # 📘 Glossary-based Synonym Expansion
- # --------------------------------------------------------------------
- def expand_with_glossary(tokens: list):
-     """Expand tokens using both internal DOMAIN_SYNONYMS and glossary.json."""
-     expanded = list(tokens)
-
-     # Add domain synonym expansion
-     for token in tokens:
-         key = token.lower()
-         if key in DOMAIN_SYNONYMS:
-             expanded.extend(DOMAIN_SYNONYMS[key])
-
-     # Glossary-driven enrichment
-     if os.path.exists(GLOSSARY_PATH):
-         try:
-             with open(GLOSSARY_PATH, "r", encoding="utf-8") as f:
-                 glossary = json.load(f)
-         except Exception:
-             glossary = {}
-         for token in tokens:
-             t_norm = re.sub(r"[^a-z0-9]", "", token.lower())
-             for term, definition in glossary.items():
-                 term_norm = re.sub(r"[^a-z0-9]", "", term.lower())
-                 if t_norm in term_norm or term_norm in t_norm:
-                     defs = [
-                         w for w in re.findall(r"[a-z]+", str(definition).lower())
-                         if w not in STOPWORDS and len(w) > 3
-                     ]
-                     expanded.extend(defs[:3])
-
-     # Deduplicate
-     seen, out = set(), []
-     for w in expanded:
-         if w not in seen:
-             seen.add(w)
-             out.append(w)
-     return out
-
- # --------------------------------------------------------------------
- # 🔍 Unified Token Extraction
- # --------------------------------------------------------------------
- def extract_content_words(query: str):
-     """Normalize, extract VAN tokens, and expand using domain synonyms & glossary."""
-     print(f"🔎 [NLP] Extracting VANs from query: {query}")
-     tokens = extract_van_tokens(query)
-     expanded = expand_with_glossary(tokens)
-     print(f"🔎 [NLP] VAN tokens → {expanded}")
-     return expanded
-
- # --------------------------------------------------------------------
- # 🧪 Self-test
- # --------------------------------------------------------------------
- if __name__ == "__main__":
-     sample = "Explain how EDC and eCRF relate to GCP compliance in a clinical trial?"
-     print(extract_content_words(sample))