"""
utils/nlp_helpers.py — Enhanced NLP Utilities for Clinical Research Chatbot
----------------------------------------------------------------------------
✅ Domain-aware abbreviation normalization (ICH-GCP, CDISC, FDA)
✅ Glossary-synonym expansion with prioritization
✅ Improved VAN (Verb–Adjective–Noun) normalization
✅ Compatible with Hugging Face Spaces (persistent NLTK path)
"""

import os
import re
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# --------------------------------------------------------------------
# 🧠 NLTK Setup (force consistent path for Hugging Face Spaces)
# --------------------------------------------------------------------
NLTK_PATH = "/usr/local/share/nltk_data"
os.environ["NLTK_DATA"] = NLTK_PATH
nltk.data.path.clear()
nltk.data.path.append(NLTK_PATH)

# Map downloadable package names to the resource paths nltk.data.find() expects
# (e.g. "punkt" lives under "tokenizers/"); passing the bare package name makes
# find() fail every time, so each startup would re-download all packages.
required_pkgs = {
    "punkt": "tokenizers/punkt",
    "punkt_tab": "tokenizers/punkt_tab",
    "averaged_perceptron_tagger": "taggers/averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng": "taggers/averaged_perceptron_tagger_eng",
    "stopwords": "corpora/stopwords",
    "wordnet": "corpora/wordnet",
}

for pkg, resource in required_pkgs.items():
    try:
        nltk.data.find(resource)
    except LookupError:
        nltk.download(pkg, download_dir=NLTK_PATH, quiet=True)

STOPWORDS = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# --------------------------------------------------------------------
# ⚕️ Clinical Abbreviation & Synonym Normalization
# --------------------------------------------------------------------
NORMALIZATION_MAP = {
    # Core trial terms
    r"\be[-_ ]?crf(s)?\b": "electronic case report form",
    r"\bedc(s)?\b": "electronic data capture",
    r"\bctms\b": "clinical trial management system",
    r"\bcsr(s)?\b": "clinical study report",
    r"\bcrf\b": "case report form",
    # Data standards
    r"\bsdtm(s)?\b": "study data tabulation model",
    r"\badam(s)?\b": "analysis data model",
    r"\bdefine[-_ ]?xml\b": "define xml metadata",
    # Compliance / Ethics
    r"\bgcp\b": "good clinical practice",
    r"\biec\b": "independent ethics committee",
    r"\birb\b": "institutional review board",
    r"\bpi\b": "principal investigator",
    r"\bsub[-_ ]?inv(es)?tigators?\b": "sub investigator",
    r"\bsae(s)?\b": "serious adverse event",
    r"\bae(s)?\b": "adverse event",
    r"\bsusar(s)?\b": "suspected unexpected serious adverse reaction",
    # Misc
    r"\bsdv\b": "source data verification",
    r"\bsop(s)?\b": "standard operating procedure",
    r"\bqms\b": "quality management system",
    r"\bicf\b": "informed consent form",
    r"\bregulatory\b": "regulatory compliance",
}

DOMAIN_SYNONYMS = {
    "edc": ["data entry system", "data management platform"],
    "ecrf": ["electronic data entry form", "study data form"],
    "gcp": ["good clinical practice", "ich e6", "regulatory compliance"],
    "sdtm": ["data tabulation model", "cdisc standard"],
    "adam": ["analysis dataset model", "statistical dataset"],
    "ae": ["adverse event", "side effect"],
    "sae": ["serious adverse event", "life threatening event"],
    "susar": ["unexpected serious adverse reaction", "drug safety event"],
    "ctms": ["trial management tool", "site tracking system"],
    "pi": ["principal investigator", "study doctor"],
    "csr": ["clinical study report", "final study document"],
    "qms": ["quality management framework", "audit system"],
    "sop": ["standard operating procedure", "company process document"],
}

GLOSSARY_PATH = "data/glossary.json"

# --------------------------------------------------------------------
# 🧹 Text Normalization
# --------------------------------------------------------------------
def normalize_query_text(text: str) -> str:
    """Lowercase, remove punctuation, and expand known abbreviations."""
    text = text.strip().lower()
    text = re.sub(r"[^\w\s\-]", " ", text)
    text = re.sub(r"\s+", " ", text)
    for pattern, repl in NORMALIZATION_MAP.items():
        text = re.sub(pattern, repl, text)
    return text.strip()
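
# Quick sanity check (deterministic given NORMALIZATION_MAP above):
#   normalize_query_text("What is an eCRF in EDC?")
#   -> "what is an electronic case report form in electronic data capture"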

# --------------------------------------------------------------------
# ⚙️ VAN (Verb–Adjective–Noun) Extraction — IMPROVED
# --------------------------------------------------------------------
def extract_van_tokens(text: str):
    """
    Extract and normalize core content words using VAN logic.
    - Lowercases and expands abbreviations
    - Removes stopwords and determiners ('a', 'an', 'the')
    - Keeps only Verbs, Adjectives, and Nouns
    - Lemmatizes words to singular or base form
    - Deduplicates tokens
    """
    text = normalize_query_text(text)
    if not text:
        return []

    try:
        tokens = nltk.word_tokenize(text)
        pos_tags = nltk.pos_tag(tokens)
    except LookupError:
        for pkg in ["punkt", "punkt_tab", "averaged_perceptron_tagger"]:
            nltk.download(pkg, download_dir=NLTK_PATH, quiet=True)
        pos_tags = nltk.pos_tag(nltk.word_tokenize(text))

    filtered = []
    for w, t in pos_tags:
        if not w.isalpha():
            continue
        # Remove determiners and common auxiliaries
        if w in {"a", "an", "the", "is", "are", "was", "were", "be", "been", "being"}:
            continue
        if w in STOPWORDS:
            continue
        if len(w) <= 2:
            continue
        # Keep only N, V, J
        if t.startswith(("N", "V", "J")):
            pos = (
                "v" if t.startswith("V")
                else "a" if t.startswith("J")
                else "n"
            )
            lemma = lemmatizer.lemmatize(w, pos)
            filtered.append(lemma)

    # Deduplicate while preserving order
    seen, unique = set(), []
    for w in filtered:
        if w not in seen:
            seen.add(w)
            unique.append(w)
    return unique
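
# Illustrative example (POS tags come from the NLTK tagger, so results may vary
# slightly between model versions):
#   extract_van_tokens("Explain how EDC relates to GCP compliance")
#   -> ['explain', 'electronic', 'data', 'capture', 'relate', 'good',
#       'clinical', 'practice', 'compliance']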

# --------------------------------------------------------------------
# 📘 Glossary-based Synonym Expansion
# --------------------------------------------------------------------
def expand_with_glossary(tokens: list):
    """Expand tokens using both internal DOMAIN_SYNONYMS and glossary.json."""
    expanded = list(tokens)

    # Add domain synonym expansion
    for token in tokens:
        key = token.lower()
        if key in DOMAIN_SYNONYMS:
            expanded.extend(DOMAIN_SYNONYMS[key])

    # Glossary-driven enrichment
    if os.path.exists(GLOSSARY_PATH):
        try:
            with open(GLOSSARY_PATH, "r", encoding="utf-8") as f:
                glossary = json.load(f)
        except Exception:
            glossary = {}
        for token in tokens:
            t_norm = re.sub(r"[^a-z0-9]", "", token.lower())
            for term, definition in glossary.items():
                term_norm = re.sub(r"[^a-z0-9]", "", term.lower())
                if t_norm in term_norm or term_norm in t_norm:
                    defs = [
                        w for w in re.findall(r"[a-z]+", str(definition).lower())
                        if w not in STOPWORDS and len(w) > 3
                    ]
                    expanded.extend(defs[:3])

    # Deduplicate
    seen, out = set(), []
    for w in expanded:
        if w not in seen:
            seen.add(w)
            out.append(w)
    return out
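
# Illustrative example with raw abbreviation tokens (glossary enrichment only
# applies when data/glossary.json exists, so those extra words are omitted here):
#   expand_with_glossary(["edc", "monitoring"])
#   -> ['edc', 'monitoring', 'data entry system', 'data management platform']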

# --------------------------------------------------------------------
# 🔍 Unified Token Extraction
# --------------------------------------------------------------------
def extract_content_words(query: str):
    """Normalize, extract VAN tokens, and expand using domain synonyms & glossary."""
    print(f"🔎 [NLP] Extracting VANs from query: {query}")
    tokens = extract_van_tokens(query)
    expanded = expand_with_glossary(tokens)
    print(f"🔎 [NLP] VAN tokens → {expanded}")
    return expanded

# --------------------------------------------------------------------
# 🧪 Self-test
# --------------------------------------------------------------------
if __name__ == "__main__":
    sample = "Explain how EDC and eCRF relate to GCP compliance in a clinical trial?"
    print(extract_content_words(sample))