essprasad committed
Commit 9788b7f · verified · 1 Parent(s): e9a707b

Upload 4 files

Files changed (4):
  1. utils/api_clients.py +194 -0
  2. utils/faq.py +112 -0
  3. utils/feedback.py +105 -0
  4. utils/nlp_helpers.py +212 -0
utils/api_clients.py ADDED
@@ -0,0 +1,194 @@
"""
utils/api_clients.py
------------------------------------------------
Enhanced API clients for:
- PubMed (NCBI)
- ClinicalTrials.gov
- FDA Open Data
- WHO ICTRP
------------------------------------------------
Optimized for hybrid VAN-based query processing:
- Automatically truncates long queries (top keywords only)
- Resilient to API downtime or malformed responses
- HTML formatted results for Gradio rendering
"""

import requests
import html
import re
import traceback

# ============================================================
# 🔹 Query Normalization
# ============================================================
def _normalize_query(query: str, max_words: int = 5) -> str:
    """
    Cleans and shortens the user query for API compatibility.
    Removes filler phrases and limits the query to key words.
    """
    q = query.lower()
    q = re.sub(
        r"(what is|define|explain|describe|in clinical trials|the meaning of|tell me about|explanation of|concept of)\b",
        "",
        q,
    )
    q = re.sub(r"[^a-z0-9\s]", "", q)
    q = re.sub(r"\s+", " ", q).strip()

    # limit to first few words (avoid 404s from overlong queries)
    words = q.split()
    q = " ".join(words[:max_words])
    return q or "clinical trial"


# ============================================================
# 🔹 PubMed API (NCBI E-Utilities)
# ============================================================
def fetch_pubmed(query: str, limit: int = 3) -> str:
    try:
        q = _normalize_query(query)
        base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        esearch = f"{base}esearch.fcgi?db=pubmed&term={q}&retmax={limit}&retmode=json"
        res = requests.get(esearch, timeout=10)
        res.raise_for_status()

        ids = res.json().get("esearchresult", {}).get("idlist", [])
        if not ids:
            return f"<i>No PubMed results found for <b>{html.escape(q)}</b>.</i>"

        summaries = []
        for pmid in ids:
            summary_url = f"{base}esummary.fcgi?db=pubmed&id={pmid}&retmode=json"
            sres = requests.get(summary_url, timeout=10)
            sres.raise_for_status()
            doc = sres.json()["result"].get(pmid, {})
            title = html.escape(doc.get("title", "Untitled"))
            source = html.escape(doc.get("source", ""))
            pubdate = html.escape(doc.get("pubdate", ""))
            link = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
            summaries.append(
                f"<b>{title}</b><br>{source} ({pubdate})<br>"
                f"<a href='{link}' target='_blank'>[PubMed]</a>"
            )

        return "<br><br>".join(summaries)

    except Exception as e:
        traceback.print_exc()
        return f"<i>PubMed fetch failed for <b>{html.escape(query)}</b>: {e}</i>"


# ============================================================
# 🔹 ClinicalTrials.gov API
# ============================================================
def fetch_clinicaltrials(query: str, limit: int = 3) -> str:
    """
    Retrieves brief summaries of matching trials from ClinicalTrials.gov.
    Automatically truncates the query to avoid 404s on long input.
    """
    try:
        q = _normalize_query(query)
        url = (
            f"https://clinicaltrials.gov/api/query/study_fields?"
            f"expr={q}&fields=NCTId,BriefTitle,Condition,OverallStatus"
            f"&max_rnk={limit}&fmt=json"
        )
        res = requests.get(url, timeout=10)
        res.raise_for_status()

        studies = res.json().get("StudyFieldsResponse", {}).get("StudyFields", [])
        if not studies:
            return f"<i>No trials found for <b>{html.escape(q)}</b>.</i>"

        formatted = []
        for s in studies:
            nct = s.get("NCTId", [""])[0]
            title = html.escape(s.get("BriefTitle", [""])[0])
            condition = html.escape(", ".join(s.get("Condition", [])))
            status = html.escape(s.get("OverallStatus", ["Unknown"])[0])
            link = f"https://clinicaltrials.gov/study/{nct}" if nct else "#"
            formatted.append(
                f"<b>{title}</b><br>"
                f"Condition: {condition or 'N/A'}<br>"
                f"Status: {status}<br>"
                f"<a href='{link}' target='_blank'>[ClinicalTrials.gov]</a>"
            )

        return "<br><br>".join(formatted)

    except Exception as e:
        traceback.print_exc()
        return f"<i>ClinicalTrials.gov fetch failed for <b>{html.escape(query)}</b>: {e}</i>"


# ============================================================
# 🔹 FDA Open Data API
# ============================================================
def fetch_fda(query: str, limit: int = 3) -> str:
    """
    Retrieves FDA label and safety data for a given compound/drug name.
    """
    try:
        q = _normalize_query(query)
        url = f"https://api.fda.gov/drug/label.json?search=openfda.brand_name:{q}&limit={limit}"
        res = requests.get(url, timeout=10)

        if res.status_code == 404:
            return f"<i>No FDA data found for <b>{html.escape(q)}</b>.</i>"

        res.raise_for_status()
        data = res.json().get("results", [])
        if not data:
            return f"<i>No FDA label results found for <b>{html.escape(q)}</b>.</i>"

        formatted = []
        for entry in data:
            brand = ", ".join(entry.get("openfda", {}).get("brand_name", []))
            generic = ", ".join(entry.get("openfda", {}).get("generic_name", []))
            purpose = html.escape(" ".join(entry.get("purpose", [])[:1]))
            warnings = html.escape(" ".join(entry.get("warnings", [])[:1]))
            link = "https://open.fda.gov/drug/label/"
            formatted.append(
                f"<b>{brand or q}</b> ({generic or 'N/A'})<br>"
                f"<u>Purpose:</u> {purpose or 'N/A'}<br>"
                f"<u>Warning:</u> {warnings or 'N/A'}<br>"
                f"<a href='{link}' target='_blank'>[FDA Label]</a>"
            )

        return "<br><br>".join(formatted)

    except Exception as e:
        traceback.print_exc()
        return f"<i>FDA fetch failed for <b>{html.escape(query)}</b>: {e}</i>"


# ============================================================
# 🔹 WHO ICTRP (Backup Trial Source)
# ============================================================
def fetch_who_trials(query: str, limit: int = 2) -> str:
    """
    Optional backup trial search from the WHO ICTRP API.
    Returns simplified summaries for readability.
    """
    try:
        q = _normalize_query(query)
        url = f"https://trialsearch.who.int/api/TrialSearch?query={q}"
        res = requests.get(url, timeout=10)

        if res.status_code != 200:
            return "<i>WHO ICTRP API unavailable or throttled.</i>"

        trials = res.json().get("TrialSearchResult", [])
        if not trials:
            return f"<i>No WHO trials found for <b>{html.escape(q)}</b>.</i>"

        formatted = []
        for t in trials[:limit]:
            title = html.escape(t.get("Scientific_title", "Untitled"))
            registry = html.escape(t.get("Register", ""))
            country = html.escape(t.get("Recruitment_Country", ""))
            formatted.append(
                f"<b>{title}</b><br>{registry or 'Registry Unknown'} — {country or 'N/A'}"
            )

        return "<br><br>".join(formatted)

    except Exception as e:
        traceback.print_exc()
        return f"<i>WHO ICTRP fetch failed for <b>{html.escape(query)}</b>: {e}</i>"
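
For reference, a minimal sketch of how these clients might be combined by the calling app. The aggregate_sources helper and its HTML layout are assumptions for illustration, not part of this commit; only the imported fetch_* functions come from the file above.

# Illustrative only — not part of the committed files.
from utils.api_clients import fetch_pubmed, fetch_clinicaltrials, fetch_fda

def aggregate_sources(query: str) -> str:
    """Collect HTML snippets from each client; failures degrade to inline <i> messages."""
    sections = {
        "PubMed": fetch_pubmed(query, limit=3),
        "ClinicalTrials.gov": fetch_clinicaltrials(query, limit=3),
        "FDA": fetch_fda(query, limit=2),
    }
    return "<br><hr><br>".join(f"<h4>{name}</h4>{body}" for name, body in sections.items())

if __name__ == "__main__":
    print(aggregate_sources("What is an adverse event in clinical trials?"))

Because every client catches its own exceptions and returns an HTML string, the aggregate view stays renderable in Gradio even when one upstream API is down.
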
utils/faq.py ADDED
@@ -0,0 +1,112 @@
import json
import os
from sentence_transformers import SentenceTransformer, util
import torch

FAQ_PATHS = ["data/faq_data.json", "data/clinical_faq.json"]
_FAQ_CACHE = None
_FAQ_EMBEDDINGS = None
_MODEL = None


def _get_model():
    """Load and cache the embedding model (shared with main app if possible)."""
    global _MODEL
    if _MODEL is None:
        print("📦 [faq] Loading embedding model: all-MiniLM-L6-v2 ...")
        _MODEL = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    return _MODEL


def load_faqs():
    """Load FAQ data from JSON files and cache them."""
    global _FAQ_CACHE
    if _FAQ_CACHE is not None:
        return _FAQ_CACHE

    all_faqs = []
    for path in FAQ_PATHS:
        if os.path.exists(path):
            try:
                with open(path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                if isinstance(data, list):
                    all_faqs.extend(data)
                elif isinstance(data, dict):
                    for k, v in data.items():
                        all_faqs.append({"question": k, "answer": v})
            except Exception as e:
                print(f"⚠️ Failed to load FAQ file {path}: {e}")

    _FAQ_CACHE = all_faqs
    print(f"✅ [faq] Loaded {len(_FAQ_CACHE)} FAQ entries.")
    return _FAQ_CACHE


def _build_embeddings():
    """Precompute embeddings for all FAQ questions."""
    global _FAQ_EMBEDDINGS
    faqs = load_faqs()
    if not faqs:
        _FAQ_EMBEDDINGS = torch.empty(0)
        return _FAQ_EMBEDDINGS

    model = _get_model()
    # Encode one question per FAQ entry (empty string if missing) so that
    # similarity indices stay aligned with the cached faqs list.
    questions = [f.get("question", "") for f in faqs]
    _FAQ_EMBEDDINGS = model.encode(questions, convert_to_tensor=True, show_progress_bar=False)
    print(f"✅ [faq] Encoded {len(_FAQ_EMBEDDINGS)} FAQ embeddings.")
    return _FAQ_EMBEDDINGS


def get_faq_answer(query: str, top_k: int = 1) -> str:
    """
    Return the most semantically similar FAQ answer to the query.
    Uses MiniLM embeddings and cosine similarity.
    """
    faqs = load_faqs()
    if not faqs:
        return ""

    if _FAQ_EMBEDDINGS is None:
        _build_embeddings()

    model = _get_model()
    query_emb = model.encode(query, convert_to_tensor=True)
    sims = util.cos_sim(query_emb, _FAQ_EMBEDDINGS)[0]
    top_idx = int(torch.argmax(sims))

    best_score = float(sims[top_idx])
    best_item = faqs[top_idx]

    if best_score < 0.45:  # threshold to avoid weak matches
        return ""

    answer = best_item.get("answer", "")
    print(f"💡 [faq] Best match: \"{best_item.get('question')}\" (score={best_score:.2f})")
    return answer


def lookup_faq(query: str, top_k: int = 3) -> str:
    """
    Return an HTML-formatted list of the top-k semantically similar FAQ matches.
    Useful for admin or verbose display.
    """
    faqs = load_faqs()
    if not faqs:
        return "<i>No FAQ data loaded.</i>"

    if _FAQ_EMBEDDINGS is None:
        _build_embeddings()

    model = _get_model()
    query_emb = model.encode(query, convert_to_tensor=True)
    sims = util.cos_sim(query_emb, _FAQ_EMBEDDINGS)[0]
    top_indices = torch.topk(sims, k=min(top_k, len(faqs))).indices.tolist()

    html = []
    for idx in top_indices:
        score = float(sims[idx])
        item = faqs[idx]
        html.append(
            f"<b>{item.get('question', '')}</b><br>{item.get('answer', '')}"
            f"<br><i>(score={score:.2f})</i>"
        )

    return "<br><br>".join(html)
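
A minimal usage sketch for this module, assuming a data/faq_data.json shaped like the dict branch of load_faqs() (question text as keys, answers as values). The example file contents and the fallback flow are illustrative assumptions.

# Illustrative only — not part of the committed files.
# data/faq_data.json (dict form accepted by load_faqs):
# {
#   "What is an eCRF?": "An electronic case report form used to capture trial data.",
#   "What does GCP stand for?": "Good Clinical Practice, the ICH E6 quality standard."
# }
from utils.faq import get_faq_answer, lookup_faq

answer = get_faq_answer("Explain what an electronic case report form is")
if answer:
    print("Direct FAQ answer:", answer)
else:
    # Similarity fell below the 0.45 threshold; show the top matches instead
    # (or fall back to other retrieval paths in the main app).
    print(lookup_faq("electronic case report form", top_k=3))
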
utils/feedback.py ADDED
@@ -0,0 +1,105 @@
"""
utils/feedback.py
Unified feedback handler for Clinical Research Chatbot.

Includes:
1️⃣ Feedback Queue (unanswered/low-confidence queries)
2️⃣ User Voting (👍 Helpful / 👎 Not Helpful)
"""

import os
import json
from datetime import datetime

# ----------------------------
# File Paths
# ----------------------------
FEEDBACK_QUEUE_LOG = "logs/feedback_queue.jsonl"
FEEDBACK_VOTES_LOG = "logs/feedback_votes.jsonl"


# ----------------------------
# Feedback Queue (for Admin Review)
# ----------------------------
def log_feedback(query: str, notes: str = "", sources=None):
    """
    Store unanswered or low-confidence queries for admin review.
    Saves to JSONL (one entry per line).
    """
    entry = {
        "timestamp": datetime.utcnow().isoformat(),
        "query": query,
        "notes": notes,
        "sources": sources or [],
    }

    os.makedirs(os.path.dirname(FEEDBACK_QUEUE_LOG), exist_ok=True)
    with open(FEEDBACK_QUEUE_LOG, "a", encoding="utf-8") as f:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    print(f"📝 Feedback queued for admin review: {query}")


def load_feedback(limit: int = 20):
    """
    Load the last N feedback entries for the admin dashboard.
    """
    if not os.path.exists(FEEDBACK_QUEUE_LOG):
        return []
    with open(FEEDBACK_QUEUE_LOG, "r", encoding="utf-8") as f:
        lines = f.readlines()
    entries = [json.loads(line) for line in lines]
    return entries[-limit:]


def clear_feedback():
    """
    Clear the feedback log (admin only).
    """
    if os.path.exists(FEEDBACK_QUEUE_LOG):
        os.remove(FEEDBACK_QUEUE_LOG)
        print("🗑️ Feedback log cleared.")


# ----------------------------
# User Voting (for “Helpful / Not Helpful”)
# ----------------------------
def save_vote_feedback(query: str, vote: str, context=None):
    """
    Log user votes (👍 / 👎) on chatbot responses.
    """
    entry = {
        "timestamp": datetime.utcnow().isoformat(),
        "query": query,
        "vote": vote,
        "context": context or {},
    }

    os.makedirs(os.path.dirname(FEEDBACK_VOTES_LOG), exist_ok=True)
    try:
        with open(FEEDBACK_VOTES_LOG, "a", encoding="utf-8") as f:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
        print(f"🗳️ User voted '{vote}' for query: {query}")
    except Exception as e:
        print(f"⚠️ Failed to save vote feedback: {e}")


def load_votes(limit: int = 50):
    """
    Load the last N user votes for analysis.
    """
    if not os.path.exists(FEEDBACK_VOTES_LOG):
        return []
    with open(FEEDBACK_VOTES_LOG, "r", encoding="utf-8") as f:
        lines = f.readlines()
    entries = [json.loads(line) for line in lines]
    return entries[-limit:]


def clear_votes():
    """
    Clear the user voting log (admin only).
    """
    if os.path.exists(FEEDBACK_VOTES_LOG):
        os.remove(FEEDBACK_VOTES_LOG)
        print("🗑️ User vote feedback cleared.")
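
A short sketch of how the two logs could be wired into the chat handler. The confidence score, the example query, and the helpfulness tally are assumptions for illustration; only the imported functions come from the file above.

# Illustrative only — not part of the committed files.
from utils.feedback import log_feedback, save_vote_feedback, load_votes

query = "Who signs off on a SUSAR narrative?"
confidence = 0.31  # hypothetical score from the retrieval layer

# When retrieval confidence is low, queue the query for admin review.
if confidence < 0.45:
    log_feedback(query, notes=f"low confidence ({confidence:.2f})", sources=["faq", "glossary"])

# When the user clicks 👍 / 👎 in the UI, record the vote with minimal context.
save_vote_feedback(query, vote="helpful", context={"confidence": confidence})

# Simple helpfulness tally for an admin view.
votes = load_votes(limit=50)
helpful = sum(1 for v in votes if v.get("vote") == "helpful")
print(f"{helpful}/{len(votes)} recent responses marked helpful")
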
utils/nlp_helpers.py ADDED
@@ -0,0 +1,212 @@
"""
utils/nlp_helpers.py — Enhanced NLP Utilities for Clinical Research Chatbot
----------------------------------------------------------------------------
✅ Domain-aware abbreviation normalization (ICH-GCP, CDISC, FDA)
✅ Glossary-synonym expansion with prioritization
✅ Improved VAN (Verb–Adjective–Noun) normalization
✅ Compatible with Hugging Face Spaces (persistent NLTK path)
"""

import os
import re
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# --------------------------------------------------------------------
# 🧠 NLTK Setup (force consistent path for Hugging Face Spaces)
# --------------------------------------------------------------------
NLTK_PATH = "/usr/local/share/nltk_data"
os.environ["NLTK_DATA"] = NLTK_PATH
nltk.data.path.clear()
nltk.data.path.append(NLTK_PATH)

# Map each downloadable package to the category-prefixed resource path
# that nltk.data.find() expects; without the prefix the lookup always
# fails and the packages are re-checked on every import.
required_pkgs = {
    "punkt": "tokenizers/punkt",
    "punkt_tab": "tokenizers/punkt_tab",
    "averaged_perceptron_tagger": "taggers/averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng": "taggers/averaged_perceptron_tagger_eng",
    "stopwords": "corpora/stopwords",
    "wordnet": "corpora/wordnet",
}

for pkg, resource in required_pkgs.items():
    try:
        nltk.data.find(resource)
    except LookupError:
        nltk.download(pkg, download_dir=NLTK_PATH, quiet=True)

STOPWORDS = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# --------------------------------------------------------------------
# ⚕️ Clinical Abbreviation & Synonym Normalization
# --------------------------------------------------------------------
NORMALIZATION_MAP = {
    # Core trial terms
    r"\be[-_ ]?crf(s)?\b": "electronic case report form",
    r"\bedc(s)?\b": "electronic data capture",
    r"\bctms\b": "clinical trial management system",
    r"\bcsr(s)?\b": "clinical study report",
    r"\bcrf\b": "case report form",
    # Data standards
    r"\bsdtm(s)?\b": "study data tabulation model",
    r"\badam(s)?\b": "analysis data model",
    r"\bdefine[-_ ]?xml\b": "define xml metadata",
    # Compliance / Ethics
    r"\bgcp\b": "good clinical practice",
    r"\biec\b": "independent ethics committee",
    r"\birb\b": "institutional review board",
    r"\bpi\b": "principal investigator",
    r"\bsub[-_ ]?investigators?\b": "sub investigator",
    r"\bsae(s)?\b": "serious adverse event",
    r"\bae(s)?\b": "adverse event",
    r"\bsusar(s)?\b": "suspected unexpected serious adverse reaction",
    # Misc
    r"\bsdv\b": "source data verification",
    r"\bsop(s)?\b": "standard operating procedure",
    r"\bqms\b": "quality management system",
    r"\bicf\b": "informed consent form",
    r"\bregulatory\b": "regulatory compliance",
}

DOMAIN_SYNONYMS = {
    "edc": ["data entry system", "data management platform"],
    "ecrf": ["electronic data entry form", "study data form"],
    "gcp": ["good clinical practice", "ich e6", "regulatory compliance"],
    "sdtm": ["data tabulation model", "cdisc standard"],
    "adam": ["analysis dataset model", "statistical dataset"],
    "ae": ["adverse event", "side effect"],
    "sae": ["serious adverse event", "life threatening event"],
    "susar": ["unexpected serious adverse reaction", "drug safety event"],
    "ctms": ["trial management tool", "site tracking system"],
    "pi": ["principal investigator", "study doctor"],
    "csr": ["clinical study report", "final study document"],
    "qms": ["quality management framework", "audit system"],
    "sop": ["standard operating procedure", "company process document"],
}

GLOSSARY_PATH = "data/glossary.json"

# --------------------------------------------------------------------
# 🧹 Text Normalization
# --------------------------------------------------------------------
def normalize_query_text(text: str) -> str:
    """Lowercase, remove punctuation, and expand known abbreviations."""
    text = text.strip().lower()
    text = re.sub(r"[^\w\s\-]", " ", text)
    text = re.sub(r"\s+", " ", text)
    for pattern, repl in NORMALIZATION_MAP.items():
        text = re.sub(pattern, repl, text)
    return text.strip()

# --------------------------------------------------------------------
# ⚙️ VAN (Verb–Adjective–Noun) Extraction — IMPROVED
# --------------------------------------------------------------------
def extract_van_tokens(text: str):
    """
    Extract and normalize core content words using VAN logic.
    - Lowercases and expands abbreviations
    - Removes stopwords and determiners ('a', 'an', 'the')
    - Keeps only Verbs, Adjectives, and Nouns
    - Lemmatizes words to singular or base form
    - Deduplicates tokens
    """
    text = normalize_query_text(text)
    if not text:
        return []

    try:
        tokens = nltk.word_tokenize(text)
        pos_tags = nltk.pos_tag(tokens)
    except LookupError:
        for pkg in ["punkt", "punkt_tab", "averaged_perceptron_tagger", "averaged_perceptron_tagger_eng"]:
            nltk.download(pkg, download_dir=NLTK_PATH, quiet=True)
        pos_tags = nltk.pos_tag(nltk.word_tokenize(text))

    filtered = []
    for w, t in pos_tags:
        if not w.isalpha():
            continue
        # Remove determiners and common auxiliaries
        if w in {"a", "an", "the", "is", "are", "was", "were", "be", "been", "being"}:
            continue
        if w in STOPWORDS:
            continue
        if len(w) <= 2:
            continue
        # Keep only N, V, J
        if t.startswith(("N", "V", "J")):
            pos = (
                "v" if t.startswith("V")
                else "a" if t.startswith("J")
                else "n"
            )
            lemma = lemmatizer.lemmatize(w, pos)
            filtered.append(lemma)

    # Deduplicate while preserving order
    seen, unique = set(), []
    for w in filtered:
        if w not in seen:
            seen.add(w)
            unique.append(w)
    return unique

# --------------------------------------------------------------------
# 📘 Glossary-based Synonym Expansion
# --------------------------------------------------------------------
def expand_with_glossary(tokens: list):
    """Expand tokens using both internal DOMAIN_SYNONYMS and glossary.json."""
    expanded = list(tokens)

    # Add domain synonym expansion
    for token in tokens:
        key = token.lower()
        if key in DOMAIN_SYNONYMS:
            expanded.extend(DOMAIN_SYNONYMS[key])

    # Glossary-driven enrichment
    if os.path.exists(GLOSSARY_PATH):
        try:
            with open(GLOSSARY_PATH, "r", encoding="utf-8") as f:
                glossary = json.load(f)
        except Exception:
            glossary = {}
        for token in tokens:
            t_norm = re.sub(r"[^a-z0-9]", "", token.lower())
            for term, definition in glossary.items():
                term_norm = re.sub(r"[^a-z0-9]", "", term.lower())
                if t_norm in term_norm or term_norm in t_norm:
                    defs = [
                        w for w in re.findall(r"[a-z]+", str(definition).lower())
                        if w not in STOPWORDS and len(w) > 3
                    ]
                    expanded.extend(defs[:3])

    # Deduplicate
    seen, out = set(), []
    for w in expanded:
        if w not in seen:
            seen.add(w)
            out.append(w)
    return out

# --------------------------------------------------------------------
# 🔍 Unified Token Extraction
# --------------------------------------------------------------------
def extract_content_words(query: str):
    """Normalize, extract VAN tokens, and expand using domain synonyms & glossary."""
    print(f"🔎 [NLP] Extracting VANs from query: {query}")
    tokens = extract_van_tokens(query)
    expanded = expand_with_glossary(tokens)
    print(f"🔎 [NLP] VAN tokens → {expanded}")
    return expanded

# --------------------------------------------------------------------
# 🧪 Self-test
# --------------------------------------------------------------------
if __name__ == "__main__":
    sample = "Explain how EDC and eCRF relate to GCP compliance in a clinical trial?"
    print(extract_content_words(sample))
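
A rough sketch of how the extracted tokens could feed the API clients from utils/api_clients.py. The committed modules do not yet call each other, so the join step and the example output comment are assumptions; fetch_clinicaltrials re-applies _normalize_query internally, which caps the expression at its top keywords.

# Illustrative only — not part of the committed files.
from utils.nlp_helpers import extract_content_words
from utils.api_clients import fetch_clinicaltrials

user_query = "What is an eCRF in GCP-compliant data capture?"
tokens = extract_content_words(user_query)
# tokens might include lemmas from the expanded abbreviations,
# e.g. 'electronic', 'case', 'report', 'form', 'good', 'clinical', 'practice', ...

# Collapse the expanded tokens back into a short search expression.
search_expr = " ".join(tokens[:5])
print(fetch_clinicaltrials(search_expr, limit=3))
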