essprasad committed · Commit b816136 · verified · 1 Parent(s): 00341b3

Upload 9 files
core/bm25.py ADDED
@@ -0,0 +1,58 @@
+ import os
+ import json
+ import re
+ import math
+ from collections import defaultdict, Counter
+
+ class BM25:
+     def __init__(self, corpus):
+         self.corpus = corpus
+         self.tokenized_corpus = [self._tokenize(doc['text']) for doc in corpus]
+         self.doc_lens = [len(doc) for doc in self.tokenized_corpus]
+         self.avgdl = sum(self.doc_lens) / len(self.doc_lens)
+         self.doc_freqs = self._calc_doc_freqs()
+         self.k1 = 1.5
+         self.b = 0.75
+
+     def _tokenize(self, text):
+         return re.findall(r"\w+", text.lower())
+
+     def _calc_doc_freqs(self):
+         freqs = defaultdict(int)
+         for doc in self.tokenized_corpus:
+             for word in set(doc):
+                 freqs[word] += 1
+         return freqs
+
+     def _idf(self, term):
+         N = len(self.tokenized_corpus)
+         df = self.doc_freqs.get(term, 0)
+         return math.log(1 + (N - df + 0.5) / (df + 0.5))
+
+     def get_scores(self, query_tokens):
+         scores = [0.0] * len(self.tokenized_corpus)
+         for idx, doc in enumerate(self.tokenized_corpus):
+             freqs = Counter(doc)
+             dl = self.doc_lens[idx]
+             for term in query_tokens:
+                 idf = self._idf(term)
+                 tf = freqs[term]
+                 denom = tf + self.k1 * (1 - self.b + self.b * dl / self.avgdl)
+                 score = idf * ((tf * (self.k1 + 1)) / denom) if denom != 0 else 0
+                 scores[idx] += score
+         return scores
+
+ def search_bm25(query, docs=None, top_n=10):
+     from core.vector_store import load_all_text_chunks
+     if docs is None:
+         docs = load_all_text_chunks()
+     bm25 = BM25(docs)
+     query_tokens = re.findall(r"\w+", query.lower())
+     scores = bm25.get_scores(query_tokens)
+     top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]
+     results = []
+     for i in top_indices:
+         doc = docs[i].copy()
+         doc['score'] = scores[i]
+         results.append(doc)
+     return results
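
For reference, get_scores() above implements the standard Okapi BM25 weighting with the k1 and b constants set in __init__; per document D and query Q it accumulates

\[
\mathrm{score}(D, Q) = \sum_{t \in Q} \mathrm{IDF}(t)\cdot\frac{f(t, D)\,(k_1 + 1)}{f(t, D) + k_1\left(1 - b + b\,\frac{|D|}{\mathrm{avgdl}}\right)},
\qquad
\mathrm{IDF}(t) = \ln\!\left(1 + \frac{N - \mathrm{df}(t) + 0.5}{\mathrm{df}(t) + 0.5}\right)
\]

where f(t, D) is the term frequency in the document, |D| its token length, avgdl the average document length, N the corpus size, and df(t) the document frequency, matching _idf() and the denom/score lines in get_scores().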
core/glossary.py ADDED
@@ -0,0 +1,109 @@
+ # core/glossary.py
+
+ import json
+ import os
+ import re
+ from difflib import get_close_matches
+ from huggingface_hub import hf_hub_download
+
+ GLOSSARY = None
+ GLOSSARY_TERMS_CACHE = []  # 🧠 Cache of glossary keys for fuzzy matching
+ DATASET_REPO = "essprasad/CT-Chat-Index"
+ GLOSSARY_FILENAME = "persistent/glossary.json"
+
+
+ def _normalize_term(term: str) -> str:
+     """Normalize glossary terms for matching, with fuzzy fallback."""
+     if not term:
+         return ""
+     term = term.lower().strip()
+     term = re.sub(r'[\-_/\\.,;:]+', ' ', term)
+     term = re.sub(r'\s+', ' ', term)
+
+     # Common clinical research synonym normalization
+     term = term.replace("e crf", "ecrf").replace("e-crf", "ecrf").replace("e/crf", "ecrf").replace("e_crf", "ecrf")
+     term = term.replace("electronic case report form", "ecrf")
+     term = term.replace("case report form", "crf")
+     term = term.replace("informed consent form", "icf")
+     term = term.replace("good clinical practice", "gcp")
+     term = term.replace("serious adverse event", "sae")
+     term = term.replace("adverse event", "ae")
+     term = term.replace("21 cfr part 11", "21cfrpart11")
+     term = term.replace("clinical study report", "csr")
+
+     term = term.strip()
+
+     # 🧩 Fuzzy matching fallback (for plural/singular or typos)
+     if GLOSSARY_TERMS_CACHE:
+         if term not in GLOSSARY_TERMS_CACHE:
+             close = get_close_matches(term, GLOSSARY_TERMS_CACHE, n=1, cutoff=0.85)
+             if close:
+                 # return the closest key for better recall
+                 return close[0]
+
+     return term
+
+
+ def _load_glossary():
+     """Load glossary.json from Hugging Face Hub (cached)."""
+     global GLOSSARY, GLOSSARY_TERMS_CACHE
+     if GLOSSARY is not None:
+         return GLOSSARY
+     try:
+         path = hf_hub_download(
+             repo_id=DATASET_REPO,
+             filename=GLOSSARY_FILENAME,
+             repo_type="dataset",
+         )
+         with open(path, "r", encoding="utf-8") as f:
+             raw = json.load(f)
+
+         GLOSSARY = {}
+         for k, vlist in raw.items():
+             if not isinstance(k, str) or len(k.split()) > 12 or re.search(r'\d{4}', k):
+                 continue
+
+             candidate_key = k
+             if isinstance(vlist, dict):
+                 candidate_key = vlist.get("term") or vlist.get("name") or vlist.get("title") or k
+
+             norm = _normalize_term(candidate_key)
+             if not norm:
+                 continue
+
+             if isinstance(vlist, dict):
+                 dfn = vlist.get("definition") or vlist.get("text") or ""
+                 sources = vlist.get("sources", [])
+             elif isinstance(vlist, str):
+                 dfn = vlist
+                 sources = []
+             else:
+                 dfn, sources = "", []
+
+             if not dfn or len(dfn.strip()) < 5:
+                 continue
+
+             if norm not in GLOSSARY:
+                 GLOSSARY[norm] = {
+                     "term": candidate_key.strip(),
+                     "definition": dfn.strip(),
+                     "sources": sources if isinstance(sources, list) else []
+                 }
+             else:
+                 # Merge sources if already exists
+                 existing = GLOSSARY[norm]
+                 existing_sources = set(existing.get("sources", []))
+                 new_sources = set(sources) if sources else set()
+                 existing["sources"] = list(existing_sources.union(new_sources))
+
+         # 🧠 Store all glossary keys for fuzzy fallback
+         GLOSSARY_TERMS_CACHE = list(GLOSSARY.keys())
+
+         print(f"✅ Glossary downloaded and cached ({len(GLOSSARY)} entries).")
+         return GLOSSARY
+     except Exception as e:
+         print(f"⚠️ Failed to load glossary from Hugging Face: {e}")
+         return {}
+
+
+ __all__ = ["_load_glossary", "_normalize_term"]
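
A minimal usage sketch of the two exported helpers (not part of the commit; the lookup key shown is illustrative and depends on what persistent/glossary.json actually contains):

# Illustrative use of the glossary helpers; assumes the dataset download succeeds.
from core.glossary import _load_glossary, _normalize_term

glossary = _load_glossary()                             # downloads persistent/glossary.json once, then caches it
key = _normalize_term("Electronic Case Report Form")    # folds synonyms, typically -> "ecrf"
entry = glossary.get(key)
if entry:
    print(entry["term"], "->", entry["definition"][:80], entry.get("sources", []))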
core/glossary_builder.py ADDED
@@ -0,0 +1,256 @@
+ """
+ 📘 glossary_builder.py
+ Builds a unified glossary from PDFs and Excel files.
+ - Extracts terms & definitions from PDFs.
+ - Merges Excel glossary (uses all definition-related columns with labeled formatting).
+ - Saves combined glossary.json locally and uploads to Hugging Face.
+ """
+
+ import os
+ import re
+ import json
+ import time
+ import fitz
+ import pandas as pd
+ from huggingface_hub import upload_file, HfFolder, list_repo_files, hf_hub_download
+
+ # --- Configuration ---
+ DATASET_REPO = "essprasad/CT-Chat-Index"
+ DOCS_REPO = "essprasad/CT-Chat-Docs"
+ LOCAL_GLOSSARY = "/home/user/app/persistent/glossary.json"
+ REMOTE_GLOSSARY = "persistent/glossary.json"
+ TOKEN = HfFolder.get_token() or os.getenv("HF_TOKEN")
+
+
+ # --- Helpers ---
+ def normalize_term(term: str) -> str:
+     if not term:
+         return ""
+     s = term.lower().strip()
+     s = re.sub(r"[\-_/\\.,;:]+", " ", s)
+     s = re.sub(r"\s+", " ", s)
+     synonyms = {
+         "electronic case report form": "ecrf",
+         "case report form": "crf",
+         "informed consent form": "icf",
+         "good clinical practice": "gcp",
+         "serious adverse event": "sae",
+         "adverse event": "ae",
+         "21 cfr part 11": "21cfrpart11",
+         "clinical study report": "csr",
+     }
+     return synonyms.get(s, s)
+
+
+ def extract_text_from_pdf(pdf_path):
+     """Extract plain text from a PDF file."""
+     try:
+         doc = fitz.open(pdf_path)
+         text = "\n".join(page.get_text("text") for page in doc)
+         doc.close()
+         return text.strip()
+     except Exception as e:
+         print(f"⚠️ Failed to read {pdf_path}: {e}")
+         return ""
+
+
+ def extract_definitions_from_text(text):
+     """Extract glossary-like term-definition pairs from raw PDF text."""
+     glossary = {}
+     text = re.sub(r"\r", "", text)
+     lines = [ln.strip() for ln in text.split("\n") if ln.strip()]
+     i = 0
+     while i < len(lines):
+         term = lines[i].strip()
+         if len(term) == 1 or term.isdigit() or re.fullmatch(r"[ivxlcdm]+", term.lower()):
+             i += 1
+             continue
+         if any(x in term.lower() for x in ["contents", "chapter", "appendix", "index", "revision", "table of"]):
+             i += 1
+             continue
+         if term.lower().startswith(("acronym for", "definition", "terms of")):
+             i += 1
+             continue
+
+         defn_lines = []
+         j = i + 1
+         while j < len(lines):
+             nxt = lines[j].strip()
+             if re.match(r"^[A-Za-z]", nxt) and len(nxt.split()) <= 6 and not nxt.endswith("."):
+                 if not nxt.lower().startswith(("see also", "for example", "for instance")):
+                     break
+             defn_lines.append(nxt)
+             j += 1
+
+         definition = " ".join(defn_lines)
+         definition = re.sub(r"(\w)-\s+(\w)", r"\1\2", definition)
+         definition = re.sub(r"\s{2,}", " ", definition).strip()
+
+         if len(definition.split()) < 5 or "." not in definition:
+             i += 1
+             continue
+
+         norm = normalize_term(term)
+         glossary[norm] = {"term": term, "definition": definition}
+         i = j
+
+     return glossary
+
+
+ # --- Main Rebuild Function ---
+ def rebuild_and_upload():
+     start = time.time()
+     print("📘 Starting glossary rebuild...")
+     try:
+         all_files = list_repo_files(repo_id=DOCS_REPO, repo_type="dataset", token=TOKEN)
+         pdfs = [f for f in all_files if f.lower().endswith(".pdf")]
+         excels = [f for f in all_files if f.lower().endswith((".xls", ".xlsx"))]
+     except Exception as e:
+         raise RuntimeError(f"Cannot list repo files: {e}")
+
+     all_defs = {}
+
+     # --- 1️⃣ Process PDFs ---
+     for pdf in pdfs:
+         skip_patterns = ["topic_", "template", "protocol", "schedule", "painac", "sas", "glossary_printable"]
+         if any(sp in pdf.lower() for sp in skip_patterns):
+             print(f"⏩ Skipping non-glossary or template file: {pdf}")
+             continue
+         try:
+             print(f"🔍 Processing {pdf}...")
+             path = hf_hub_download(repo_id=DOCS_REPO, filename=pdf, repo_type="dataset", token=TOKEN)
+             text = extract_text_from_pdf(path)
+             defs = extract_definitions_from_text(text)
+             for k, v in defs.items():
+                 v.setdefault("sources", []).append(pdf)
+                 if k in all_defs:
+                     all_defs[k]["sources"] = list(set(all_defs[k].get("sources", []) + v["sources"]))
+                 else:
+                     all_defs[k] = v
+         except Exception as e:
+             print(f"⚠️ Failed {pdf}: {e}")
+
+     # --- 2️⃣ Process Excel Glossaries (MRCT etc.) ---
+     for excel in excels:
+         try:
+             print(f"📗 Checking Excel file in dataset: {excel}")
+             excel_path = hf_hub_download(
+                 repo_id=DOCS_REPO,
+                 filename=excel,
+                 repo_type="dataset",
+                 token=TOKEN
+             )
+             print(f"✅ Downloaded Excel file to {excel_path}")
+             xls = pd.read_excel(excel_path, sheet_name=None)
+
+             total_rows = 0
+             excel_entries = []
+
+             for sheet_name, df in xls.items():
+                 df = df.fillna("").dropna(how="all")
+                 if df.shape[0] == 0:
+                     continue
+                 df.columns = [str(c).strip() for c in df.columns]
+
+                 # 🧠 Detect the correct 'Glossary Term' column
+                 term_col = None
+                 for c in df.columns:
+                     if "glossary term" in c.lower():
+                         term_col = c
+                         break
+                 if not term_col:
+                     for c in df.columns:
+                         if "cdisc" in c.lower() or c.lower() == "term":
+                             term_col = c
+                             break
+                 if not term_col:
+                     print(f"⚠️ No 'Glossary Term' column found in sheet {sheet_name} of {excel_path}")
+                     continue
+
+                 # Concatenate all relevant columns with labels for clarity
+                 for _, row in df.iterrows():
+                     term = str(row.get(term_col, "")).strip()
+                     if not term:
+                         continue
+
+                     def_cols = [
+                         c for c in df.columns
+                         if any(k in c.lower() for k in [
+                             "definition", "context", "info", "related", "resource", "use in context"
+                         ])
+                     ]
+
+                     def_parts = []
+                     for c in def_cols:
+                         val = str(row.get(c, "")).strip()
+                         if val:
+                             def_parts.append(f"<b>{c}:</b> {val}")
+
+                     full_definition = "<br>".join(def_parts).strip()
+                     if not full_definition:
+                         continue
+
+                     entry = {
+                         "term": term,
+                         "definition": full_definition,
+                         "source": os.path.basename(excel_path),
+                         "sheet": sheet_name,
+                         "type": "Excel",
+                     }
+                     excel_entries.append(entry)
+                     total_rows += 1
+
+             print(f"✅ Added {total_rows} Excel rows from {excel_path}")
+
+             # Merge into main glossary dictionary
+             for e in excel_entries:
+                 norm = normalize_term(e["term"])
+                 payload = {
+                     "term": e["term"],
+                     "definition": e["definition"],
+                     "sources": [e["source"]],
+                     "type": e.get("type", "Excel"),
+                     "sheet": e.get("sheet"),
+                 }
+                 # Each term+source pair stored uniquely to preserve different definitions
+                 unique_key = f"{norm}__{os.path.basename(payload['sources'][0])}" if payload.get("sources") else norm
+
+                 if unique_key not in all_defs:
+                     all_defs[unique_key] = payload
+                 else:
+                     # Avoid duplicate merges — just append any new sources
+                     existing = all_defs[unique_key]
+                     existing_sources = set(existing.get("sources", []))
+                     new_sources = set(payload.get("sources", []))
+                     existing["sources"] = list(existing_sources.union(new_sources))
+
+         except Exception as e:
+             print(f"⚠️ Failed to process Excel {excel}: {e}")
+
+     # --- 3️⃣ Save combined glossary ---
+     os.makedirs(os.path.dirname(LOCAL_GLOSSARY), exist_ok=True)
+     with open(LOCAL_GLOSSARY, "w", encoding="utf-8") as f:
+         json.dump(all_defs, f, indent=2, ensure_ascii=False)
+
+     print(f"✅ Saved {len(all_defs)} entries → {LOCAL_GLOSSARY}")
+
+     # --- 4️⃣ Upload to Hugging Face ---
+     if TOKEN:
+         try:
+             upload_file(
+                 path_or_fileobj=LOCAL_GLOSSARY,
+                 path_in_repo=REMOTE_GLOSSARY,
+                 repo_id=DATASET_REPO,
+                 repo_type="dataset",
+                 token=TOKEN,
+                 commit_message="Glossary updated with PDF + Excel (column-labeled definitions)",
+             )
+             print(f"🚀 Uploaded to {DATASET_REPO}/{REMOTE_GLOSSARY}")
+         except Exception as e:
+             print(f"⚠️ Upload error: {e}")
+
+     print(f"✨ Done in {time.time() - start:.1f}s.")
+
+
+ if __name__ == "__main__":
+     rebuild_and_upload()
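
For orientation, rebuild_and_upload() writes glossary entries shaped roughly like the sketch below, shown as a Python literal (the definition strings and the Excel filename are invented for illustration; PDF entries are keyed by the normalized term, Excel entries by a term__source compound key):

# Hypothetical glossary.json fragment produced by rebuild_and_upload().
{
    "crf": {                                  # PDF-derived entry (key = normalize_term("Case Report Form"))
        "term": "Case Report Form",
        "definition": "A printed, optical, or electronic document designed to record ...",
        "sources": ["GCDMP_Glossary.pdf"]
    },
    "crf__MRCT_Glossary.xlsx": {              # Excel-derived entry (key = term__source; filename hypothetical)
        "term": "Case Report Form",
        "definition": "<b>Definition:</b> A document used to record protocol-required data ...",
        "sources": ["MRCT_Glossary.xlsx"],
        "type": "Excel",
        "sheet": "Glossary"
    }
}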
core/hybrid_retriever.py ADDED
@@ -0,0 +1,269 @@
+ """
+ Hybrid Retriever with Glossary + FAISS + BM25 support.
+ Includes full-paragraph glossary definitions, acronym expansion, and Excel prioritization.
+ """
+
+ import os
+ import re
+ import time
+ from urllib.parse import urlparse
+
+ from core.glossary import _load_glossary, _normalize_term
+ from core.vector_store import _ensure_faiss_index, search_index, load_all_text_chunks
+ from core.bm25 import search_bm25
+ from utils.nlp_helpers import extract_van_tokens, normalize_query_text
+
+ DOCS_REPO = "essprasad/CT-Chat-Docs"
+ DENSE_TOP_K = 10
+
+
+ def _find_sentence_containing(text: str, phrase: str) -> str:
+     """Return the sentence that contains the given phrase."""
+     if not text or not phrase:
+         return ""
+     sentences = re.split(r"[.!?\n\r]", text)
+     phrase = phrase.lower()
+     for s in sentences:
+         if phrase in s.lower():
+             return s.strip()
+     return ""
+
+
+ def summarize_combined(query: str, mode="short") -> str:
+     start = time.time()
+     if not query.strip():
+         return "<i>No query provided.</i>"
+
+     # ------------------------------------------------------------------
+     # 🧠 VAN-Based Query Normalization
+     # ------------------------------------------------------------------
+     expanded_query = normalize_query_text(query)
+     van_tokens = extract_van_tokens(expanded_query)
+     van_query = " ".join(van_tokens).strip()
+     normalized_query = van_query or query
+
+     print(f"🔍 summarize_combined() query='{query}' van_query='{van_query}' normalized_query='{normalized_query}'")
+
+     glossary = _load_glossary()
+
+     # ------------------------------------------------------------------
+     # 1️⃣ Acronym Map (derived from GCDMP_Glossary.pdf)
+     # ------------------------------------------------------------------
+     acronym_map = {
+         "adr": "adverse drug reaction",
+         "ae": "adverse event",
+         "asp": "application service provider",
+         "asq": "american society for quality",
+         "ca": "corrective action",
+         "cdisc": "clinical data interchange standards consortium",
+         "clia": "clinical laboratory improvement amendments",
+         "crf": "case report form",
+         "cro": "contract research organization",
+         "cs": "clinically significant",
+         "ehr": "electronic health record",
+         "emr": "electronic medical record",
+         "eu": "european union",
+         "gcp": "good clinical practice",
+         "idmc": "independent data-monitoring committee",
+         "iec": "independent ethics committee",
+         "ind": "investigational new drug application",
+         "irb": "institutional review board",
+         "iso": "international organization for standardization",
+         "iom": "institute of medicine",
+         "iss": "integrated summary of safety",
+         "ise": "integrated summary of efficacy",
+         "meddra": "medical dictionary for regulatory activities",
+         "mrct": "multi-regional clinical trials",
+         "ncs": "non clinically significant",
+         "nda": "new drug application",
+         "ocr": "optical character recognition",
+         "qa": "quality assurance",
+         "qc": "quality control",
+         "sae": "serious adverse event",
+         "sla": "service level agreement",
+         "sop": "standard operating procedure",
+         "spc": "statistical process control",
+         "sqc": "statistical quality control",
+         "uat": "user acceptance testing",
+         "vcl": "virtual central lab",
+         "whodrug": "world health organization drug dictionary",
+     }
+
+     acronym_glossary_hits = []
+
+     # ------------------------------------------------------------------
+     # 2️⃣ Direct Glossary Match (and handle acronyms)
+     # ------------------------------------------------------------------
+     short_candidate = (van_query or normalized_query).strip().lower()
+     glossary_key = _normalize_term(short_candidate)
+
+     # If query matches acronym, expand it
+     if glossary_key in acronym_map:
+         expansion = acronym_map[glossary_key]
+         glossary_key = _normalize_term(expansion)
+         print(f"🔁 Acronym expansion: '{short_candidate}' → '{expansion}'")
+
+     if glossary and glossary_key in glossary:
+         entry = glossary[glossary_key]
+         term_display = entry.get("term", glossary_key)
+         dfn = entry.get("definition") or entry.get("text") or ""
+         sources = entry.get("sources", []) or ["unspecified"]
+
+         html = f"<h3>🧠 Definitions for '{term_display}':</h3>"
+         for src in sources:
+             html += f"🔹 <b>Source:</b> {src}<br><blockquote>{dfn}</blockquote>"
+         print(f"✅ Glossary match for '{glossary_key}' in {time.time() - start:.2f}s")
+         return html
+
+     # ------------------------------------------------------------------
+     # 3️⃣ FAISS Dense Retrieval
+     # ------------------------------------------------------------------
+     dense_query = normalized_query
+     dense_hits = []
+     try:
+         if _ensure_faiss_index():
+             dense_hits = search_index(dense_query, top_k=DENSE_TOP_K) or []
+     except Exception as e:
+         print(f"⚠️ FAISS search failed: {e}")
+     print(f"📚 Dense hits before filtering: {len(dense_hits)}")
+
+     # ------------------------------------------------------------------
+     # 4️⃣ Acronym Filtering (Lenient Match)
+     # ------------------------------------------------------------------
+     if len(normalized_query.split()) == 1 and len(normalized_query) <= 5:
+         key = normalized_query.lower()
+         expansion = acronym_map.get(key, key)
+         pattern = re.compile(
+             rf"\b{re.escape(key)}\b|{re.escape(expansion)}|{key}[\s\-\.:;)]|[\(\s]{key}[\s\-\.:;)]",
+             re.IGNORECASE,
+         )
+         filtered_hits = [h for h in dense_hits if pattern.search((h.get("definition") or h.get("text") or "").lower())]
+         print(f"🔍 Filtered acronym hits: {len(filtered_hits)} (lenient match incl. '{expansion}')")
+         dense_hits = filtered_hits
+     else:
+         print("ℹ️ No acronym filtering applied (multi-word query or longer than 5 chars)")
+     print(f"📚 Dense hits after filtering: {len(dense_hits)}")
+
+     # ------------------------------------------------------------------
+     # 5️⃣ BM25 Fallback (Lexical)
+     # ------------------------------------------------------------------
+     bm25_hits = []
+     try:
+         docs = load_all_text_chunks()
+         if docs:
+             bm25_hits = search_bm25(normalized_query, docs, top_n=10)
+     except Exception as e:
+         print(f"⚠️ BM25 fallback failed: {e}")
+     print(f"📑 BM25 hits: {len(bm25_hits)}")
+
+     # ------------------------------------------------------------------
+     # 🧩 Merge & Prioritize — Keep all per-source definitions
+     # ------------------------------------------------------------------
+     hits = dense_hits + bm25_hits
+     if not hits:
+         return "<i>No relevant information found.</i>"
+
+     def score_hit(h):
+         """Prioritize Excel > PDF > Web > Other"""
+         text = (h.get("definition") or h.get("text") or "").lower()
+         src = h.get("file") or h.get("source") or ""
+         src_type = (h.get("type") or "").lower()
+         key = (van_query or normalized_query).lower()
+         score = 0
+         if "excel" in src_type or src.lower().endswith((".xls", ".xlsx")):
+             score += 10
+         elif src.lower().endswith(".pdf"):
+             score += 5
+         elif src.startswith("http"):
+             score += 2
+         if key in text:
+             score += 3
+         return -score
+
+     hits = sorted(hits, key=score_hit)
+
+     # ------------------------------------------------------------------
+     # 6️⃣ Compose Final Answer — With Icons, Tooltips, & Hyperlinks
+     # ------------------------------------------------------------------
+     answers = []
+     src_counts = {"web": 0, "pdf": 0, "excel": 0, "other": 0}
+
+     for h in hits:
+         txt = h.get("definition") or h.get("text") or ""
+         if not txt.strip():
+             continue
+
+         src = h.get("file") or h.get("source") or "unknown"
+         src_base = os.path.basename(src)
+         src_type = (h.get("type") or "").lower()
+         term_name = h.get("term") or (van_query or normalized_query)
+
+         # --- Categorize source
+         if "excel" in src_type or src.lower().endswith((".xls", ".xlsx")):
+             icon, cat = "📘", "excel"
+         elif "website" in src_type or src.startswith("http"):
+             icon, cat = "🌐", "web"
+         elif src.lower().endswith(".pdf"):
+             icon, cat = "📄", "pdf"
+         else:
+             icon, cat = "📁", "other"
+
+         src_counts[cat] += 1
+
+         # --- Extract URL if present
+         url = ""
+         if "http" in src:
+             url = src
+         elif "http" in txt:
+             match = re.search(r"https?://\S+", txt)
+             if match:
+                 url = match.group(0).rstrip(".,)")
+
+         # --- Extract relevant paragraph
+         paragraphs = re.split(r"\n\s*\n", txt)
+         matched_paragraph = ""
+         for p in paragraphs:
+             if normalized_query.lower() in p.lower() or (van_query and van_query.lower() in p.lower()):
+                 matched_paragraph = p.strip()
+                 break
+         excerpt = matched_paragraph or txt.strip()
+
+         if len(excerpt) > 2000:
+             excerpt = excerpt[:2000] + "..."
+
+         # --- Convert URLs and highlight terms
+         excerpt = re.sub(r"(?i)source url:\s*", "", excerpt)
+         excerpt = re.sub(r"(https?://[^\s<>'\"]+)", r"<a href='\1' target='_blank'>\1</a>", excerpt)
+         excerpt = re.sub(f"(?i)({re.escape(normalized_query)})", r"<mark>\1</mark>", excerpt)
+         if term_name:
+             excerpt = re.sub(f"(?i)({re.escape(term_name)})", r"<b>\1</b>", excerpt)
+
+         # --- Build formatted output
+         if url:
+             parsed = urlparse(url)
+             display_name = parsed.netloc or src_base
+             link_html = f"<b>{icon} <a href='{url}' target='_blank'>{display_name}</a></b>"
+         else:
+             link_html = f"<b>{icon} {src_base}</b>"
+
+         answers.append(f"{link_html}<br><blockquote>{excerpt}</blockquote>")
+
+         if len(answers) >= 6:
+             break
+
+     # ------------------------------------------------------------------
+     # 7️⃣ Final HTML Output
+     # ------------------------------------------------------------------
+     summary_counts = " | ".join(f"{k.capitalize()}: {v}" for k, v in src_counts.items() if v > 0)
+     print(f"✅ Answers from {len(answers)} sources in {time.time() - start:.2f}s")
+
+     expansion_note = ""
+     if normalized_query.lower() in acronym_map:
+         expansion_note = f"<p><i>🔁 Acronym expanded: <b>{normalized_query.upper()}</b> → {acronym_map[normalized_query.lower()]}</i></p>"
+
+     return (
+         "<h3>🧠 Answers (one per source):</h3>"
+         + expansion_note
+         + f"<p><i>Sources → {summary_counts}</i></p>"
+         + "<br>".join(answers)
+     )
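
A call sketch for the retriever above (not part of the commit; the query is arbitrary and the return value is an HTML string):

# Hypothetical driver: glossary hit first, then FAISS dense retrieval, then BM25 fallback.
from core.hybrid_retriever import summarize_combined

html = summarize_combined("What is an eCRF?")
print(html[:300])   # e.g. "<h3>🧠 Definitions for 'eCRF':</h3>🔹 <b>Source:</b> ..." when the glossary matches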
core/retrieval.py ADDED
@@ -0,0 +1,25 @@
+ import os
+ import re
+ import json
+ from whoosh.index import open_dir
+ from whoosh.qparser import MultifieldParser
+
+ WHOOSH_INDEX_PATH = "/home/user/app/persistent/whoosh_index"
+
+ _ix = None
+
+ def _load_whoosh():
+     global _ix
+     if _ix is None and os.path.exists(WHOOSH_INDEX_PATH):
+         _ix = open_dir(WHOOSH_INDEX_PATH)
+     return _ix
+
+ def _bm25_search(query, top_n=10):
+     ix = _load_whoosh()
+     if not ix:
+         return []
+     parser = MultifieldParser(["text", "title"], schema=ix.schema)
+     q = parser.parse(query)
+     with ix.searcher() as s:
+         results = s.search(q, limit=top_n)
+         return [{"text": r["text"], "file": r.get("file", "")} for r in results]
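
_bm25_search() assumes a Whoosh index already exists at WHOOSH_INDEX_PATH with at least text, title, and file fields. A minimal sketch of building a compatible index (not part of the commit; the schema and sample document are inferred, not taken from the repo):

# Hypothetical index-building script for the Whoosh searcher above.
import os
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT, ID

schema = Schema(title=TEXT(stored=True), text=TEXT(stored=True), file=ID(stored=True))
os.makedirs("/home/user/app/persistent/whoosh_index", exist_ok=True)
ix = create_in("/home/user/app/persistent/whoosh_index", schema)
writer = ix.writer()
writer.add_document(
    title="Adverse Event",
    text="An adverse event (AE) is any untoward medical occurrence in a trial participant ...",
    file="glossary.json",
)
writer.commit()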
core/van_normalizer.py ADDED
@@ -0,0 +1,57 @@
+ # core/van_normalizer.py
+ import re
+ import nltk
+ from nltk import pos_tag, word_tokenize
+ from nltk.stem import WordNetLemmatizer
+
+ # make sure you have these (run once if missing):
+ # python -m nltk.downloader punkt averaged_perceptron_tagger wordnet omw-1.4
+
+ lemmatizer = WordNetLemmatizer()
+
+ def normalize_to_van(text: str) -> str:
+     """
+     VAN-based normalization (optimized for clinical trial domain):
+     - Lowercases and removes punctuation
+     - Tokenizes and POS-tags
+     - Keeps only Nouns (N), Adjectives (J), and key Verbs (V)
+     - Explicitly removes determiners/articles (a, an, the)
+     - Lemmatizes each token to its base form
+     - Returns a space-joined string suitable for FAISS embedding
+     """
+     if not text:
+         return ""
+
+     # Basic cleanup
+     text = text.lower().strip()
+     text = re.sub(r"[^a-z0-9\s-]", " ", text)  # remove punctuation
+     tokens = word_tokenize(text)
+
+     # POS tagging
+     tagged = pos_tag(tokens)
+
+     filtered = []
+     for word, tag in tagged:
+         # Skip common determiners, articles, and auxiliary verbs
+         if word in {"a", "an", "the", "is", "are", "was", "were", "be", "been", "being"}:
+             continue
+
+         # Keep only verbs, adjectives, and nouns
+         if tag.startswith("V") or tag.startswith("J") or tag.startswith("N"):
+             filtered.append((word, tag))
+
+     # Lemmatize each word to its appropriate part of speech
+     lemmas = []
+     for word, tag in filtered:
+         pos = (
+             "v" if tag.startswith("V")
+             else "a" if tag.startswith("J")
+             else "n"
+         )
+         lemmas.append(lemmatizer.lemmatize(word, pos))
+
+     # Join and clean
+     normalized = " ".join(lemmas).strip()
+     normalized = re.sub(r"\s+", " ", normalized)  # collapse multiple spaces
+     return normalized
+
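
A usage sketch (not part of the commit); the exact tokens kept depend on the NLTK tagger, but a typical result looks like this:

# Hypothetical example of VAN (Verb/Adjective/Noun) normalization.
from core.van_normalizer import normalize_to_van

print(normalize_to_van("What were the serious adverse events reported in the trial?"))
# tagger-dependent, typically: "serious adverse event report trial"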
core/vector_search.py ADDED
@@ -0,0 +1,107 @@
+ """
+ core/vector_search.py
+ -----------------------------------------------------
+ Performs FAISS semantic search for hybrid retrieval.
+ Includes:
+ - SentenceTransformer embedding for query
+ - FAISS similarity search
+ - Metadata + citation extraction
+ - Robust fallback if index missing
+ """
+
+ import os
+ import json
+ import numpy as np
+ import faiss
+ from sentence_transformers import SentenceTransformer
+
+ # Paths (shared with vector_store/vector_sync)
+ FAISS_INDEX = "persistent/faiss.index"
+ FAISS_META = "persistent/faiss.index.meta.json"
+
+ _model = None
+ _index = None
+ _meta = []
+
+
+ # ----------------------------
+ # 🔹 Loaders
+ # ----------------------------
+ def _load_model():
+     """Lazy-load embedding model."""
+     global _model
+     if _model is None:
+         print("📥 Loading embedding model for retrieval...")
+         _model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+         print("✅ Model loaded.")
+     return _model
+
+
+ def _load_faiss():
+     """Load FAISS index + metadata, prefer local persistent copy."""
+     global _index, _meta
+     if _index is not None:
+         return _index, _meta
+
+     local_index = "/home/user/app/persistent/faiss.index"
+     local_meta = "/home/user/app/persistent/faiss.index.meta.json"
+
+     if os.path.exists(local_index) and os.path.exists(local_meta):
+         print("📂 [vector_search] Using local FAISS index.")
+         _index = faiss.read_index(local_index)
+         with open(local_meta, "r", encoding="utf-8") as f:
+             _meta = json.load(f)
+         print(f"✅ Loaded local FAISS index ({len(_meta)} entries).")
+         return _index, _meta
+
+     print("☁️ [vector_search] Local FAISS missing, using fallback remote index.")
+     return _index, _meta
+
+ # ----------------------------
+ # 🔹 Core Query Function
+ # ----------------------------
+ def query_faiss(query: str, top_k: int = 5):
+     """
+     Perform FAISS semantic similarity search.
+     Returns:
+         results: list of matched text chunks
+         citations: list of formatted citation strings
+     """
+     index, meta = _load_faiss()
+     if index is None or len(meta) == 0:
+         return [], []
+
+     model = _load_model()
+     q_emb = np.array(model.encode([query]), dtype=np.float32)
+     D, I = index.search(q_emb, top_k)
+
+     results, citations = [], []
+     for idx in I[0]:
+         if 0 <= idx < len(meta):
+             doc = meta[idx]
+             text = clean_text(doc.get("text", ""))
+             src = doc.get("source", "Unknown Source")
+
+             citation = f"📄 <b>Source:</b> {os.path.basename(src)}"
+             results.append(text)
+             citations.append(citation)
+
+     return results, citations
+
+
+ # ----------------------------
+ # 🔹 Utilities
+ # ----------------------------
+ def clean_text(text: str, max_len: int = 800):
+     """
+     Truncate and clean text for readability.
+     """
+     text = text.replace("\n", " ").replace("  ", " ").strip()
+     if len(text) > max_len:
+         text = text[:max_len].rsplit(" ", 1)[0] + "..."
+     return text
+
+
+ def has_index():
+     """Check if FAISS index is available."""
+     return os.path.exists(FAISS_INDEX) and os.path.exists(FAISS_META)
@@ -0,0 +1,181 @@
 
+ """
+ core/vector_store.py
+ ------------------------------------------------------------
+ Unified FAISS + BM25 storage utility for Clinical-Trial Chatbot.
+
+ ✅ Works with glossary.json or FAISS metadata
+ ✅ Returns normalized dicts for hybrid_retriever
+ ✅ Adds load_all_text_chunks() for BM25 fallback
+ ✅ Safe against missing files
+ """
+
+ import os
+ import re
+ import json
+ import faiss
+ from sentence_transformers import SentenceTransformer
+
+ # Globals used by retriever
+ _index = None
+ _model = None
+ _meta = None
+
+
+ # --------------------------------------------------------------------
+ # 1️⃣ Utility: load FAISS index + metadata (MVP version)
+ # --------------------------------------------------------------------
+ def _ensure_faiss_index():
+     """Load FAISS index and metadata — prefer local persistent files, fallback to Hugging Face dataset."""
+     global _index, _model, _meta
+     if _index is not None and _meta is not None:
+         return True
+
+     import json
+     from huggingface_hub import hf_hub_download
+
+     local_dir = "/home/user/app/persistent"
+     local_index = os.path.join(local_dir, "faiss.index")
+     local_meta = os.path.join(local_dir, "faiss.index.meta.json")
+
+     # 1️⃣ Prefer local FAISS (rebuilt and includes URL + Excel)
+     if os.path.exists(local_index) and os.path.exists(local_meta):
+         print("📂 Using local FAISS index (includes Excel + Web sources).")
+         _index = faiss.read_index(local_index)
+         with open(local_meta, "r", encoding="utf-8") as f:
+             _meta = json.load(f)
+         _model = SentenceTransformer("all-MiniLM-L6-v2")
+         print(f"✅ [vector_store] Loaded local FAISS ({len(_meta)} vectors).")
+         return True
+
+     # 2️⃣ Fallback: remote dataset
+     print("☁️ Local FAISS missing — downloading from Hugging Face dataset...")
+     repo_id = "essprasad/CT-Chat-Index"
+     repo_type = "dataset"
+     runtime_dir = "/home/user/app/runtime_faiss"
+     os.makedirs(runtime_dir, exist_ok=True)
+
+     index_path = hf_hub_download(
+         repo_id=repo_id,
+         filename="persistent/faiss.index",
+         repo_type=repo_type,
+         local_dir=runtime_dir,
+         cache_dir=runtime_dir,
+         force_download=True,
+     )
+     meta_path = hf_hub_download(
+         repo_id=repo_id,
+         filename="persistent/faiss.index.meta.json",
+         repo_type=repo_type,
+         local_dir=runtime_dir,
+         cache_dir=runtime_dir,
+         force_download=True,
+     )
+
+     print(f"🧠 [vector_store] Loading FAISS index + metadata from {runtime_dir} ...")
+     _index = faiss.read_index(index_path)
+     with open(meta_path, "r", encoding="utf-8") as f:
+         _meta = json.load(f)
+     _model = SentenceTransformer("all-MiniLM-L6-v2")
+     print(f"✅ [vector_store] Loaded remote FAISS ({len(_meta)} vectors).")
+     return True
+
+ # --------------------------------------------------------------------
+ # 2️⃣ Helper: Load all text chunks (for BM25 fallback)
+ # --------------------------------------------------------------------
+ def load_all_text_chunks():
+     """
+     Return list of dicts for BM25 fallback and inspection.
+     Each dict: {'text', 'file', 'source', 'term', '_meta'}
+     """
+     meta_path = os.path.join("persistent", "faiss.index.meta.json")
+     gloss_path = os.path.join("persistent", "glossary.json")
+     docs = []
+
+     # Prefer FAISS meta (vector_sync output)
+     if os.path.exists(meta_path):
+         try:
+             with open(meta_path, "r", encoding="utf-8") as f:
+                 meta = json.load(f)
+             for m in meta:
+                 text = m.get("definition") or m.get("text") or m.get("chunk") or ""
+                 sources = m.get("sources") or m.get("source") or m.get("file") or []
+                 if isinstance(sources, list) and sources:
+                     src = sources[0]
+                 elif isinstance(sources, str) and sources:
+                     src = sources
+                 else:
+                     src = m.get("file") or m.get("source") or "unknown"
+                 docs.append({
+                     "text": text,
+                     "file": src,
+                     "source": src,
+                     "term": m.get("term") or m.get("normalized") or "",
+                     "_meta": m
+                 })
+             return docs
+         except Exception as e:
+             print(f"⚠️ [vector_store] Failed to read meta.json: {e}")
+
+     # fallback: glossary.json
+     if os.path.exists(gloss_path):
+         try:
+             with open(gloss_path, "r", encoding="utf-8") as f:
+                 gloss = json.load(f)
+             for k, v in gloss.items():
+                 term = v.get("term", k)
+                 definition = v.get("definition", "")
+                 srcs = v.get("sources", [])
+                 src = srcs[0] if isinstance(srcs, list) and srcs else (srcs if isinstance(srcs, str) else "glossary")
+                 docs.append({
+                     "text": definition,
+                     "file": src,
+                     "source": src,
+                     "term": term,
+                     "_meta": {"glossary_key": k}
+                 })
+             return docs
+         except Exception as e:
+             print(f"⚠️ [vector_store] Failed to read glossary.json: {e}")
+
+     return docs
+
+
+ # --------------------------------------------------------------------
+ # 3️⃣ FAISS Search
+ # --------------------------------------------------------------------
+ def search_index(query, top_k=10):
+     """
+     Search FAISS and return a list of dict hits for hybrid_retriever.
+     Each hit: {'text','file','source','term','_score','_meta'}
+     """
+     global _index, _model, _meta
+     if not _ensure_faiss_index():
+         return []
+
+     q_emb = _model.encode([query], convert_to_numpy=True).astype("float32")
+     faiss.normalize_L2(q_emb)
+     D, I = _index.search(q_emb, top_k)
+
+     results = []
+     for score, idx in zip(D[0].tolist(), I[0].tolist()):
+         if idx < 0 or idx >= len(_meta):
+             continue
+         m = _meta[idx] if isinstance(_meta[idx], dict) else {"raw": str(_meta[idx])}
+         text = m.get("definition") or m.get("text") or m.get("chunk") or ""
+         srcs = m.get("sources") or m.get("source") or m.get("file") or []
+         if isinstance(srcs, list) and srcs:
+             src = srcs[0]
+         elif isinstance(srcs, str) and srcs:
+             src = srcs
+         else:
+             src = m.get("file") or m.get("source") or "unknown"
+
+         results.append({
+             "text": text,
+             "file": src,
+             "source": src,
+             "term": m.get("term") or m.get("normalized") or "",
+             "_score": float(score),
+             "_meta": m
+         })
+     return results
@@ -0,0 +1,200 @@
 
+ """
+ core/vector_sync.py
+ ------------------------------------------------------------
+ Handles FAISS index rebuild + upload to Hugging Face dataset
+ without caching, optimized for limited HF Space storage.
+ """
+
+ import os
+ import re
+ import json
+ import faiss
+ import numpy as np
+ from pathlib import Path
+ from huggingface_hub import HfApi, hf_hub_download, upload_file, HfFolder
+ from sentence_transformers import SentenceTransformer
+ from nltk.stem import WordNetLemmatizer
+ from core.van_normalizer import normalize_to_van
+
+ # ==========================================================
+ # Helper: Upload FAISS index + metadata to dataset safely
+ # ==========================================================
+ from huggingface_hub import HfApi
+
+ def _upload_to_dataset(index_path: str, meta_path: str, repo_id: str):
+     """
+     Upload FAISS index + metadata to Hugging Face dataset safely.
+     Used by rebuild_index() in app.py.
+     """
+     try:
+         print(f"🚀 [vector_sync] Uploading FAISS index + metadata to {repo_id}...")
+         api = HfApi()
+
+         for path in [index_path, meta_path]:
+             if not os.path.exists(path):
+                 print(f"⚠️ [vector_sync] Skipping {os.path.basename(path)} (not found locally).")
+                 continue
+
+             api.upload_file(
+                 path_or_fileobj=path,
+                 path_in_repo=f"persistent/{os.path.basename(path)}",
+                 repo_id=repo_id,
+                 repo_type="dataset",
+                 commit_message=f"Auto-upload {os.path.basename(path)}",
+             )
+             print(f"✅ [vector_sync] Uploaded {os.path.basename(path)}")
+
+     except Exception as e:
+         print(f"⚠️ [vector_sync] Upload failed: {e}")
+
+ # --------------------------------------------------------------------
+ # ⚙️ CONFIGURATION
+ # --------------------------------------------------------------------
+ REPO_ID = "essprasad/CT-Chat-Index"
+ REPO_TYPE = "dataset"
+ REMOTE_DIR = "persistent/"
+ FILES = ["faiss.index", "faiss.index.meta.json"]
+
+ api = HfApi()
+ token = HfFolder.get_token() or os.getenv("HF_TOKEN")
+
+ # --------------------------------------------------------------------
+ # 🔹 NORMALIZATION HELPERS
+ # --------------------------------------------------------------------
+ lemmatizer = WordNetLemmatizer()
+
+ def normalize_for_index(term: str) -> str:
+     """Normalize term for embedding."""
+     if not term:
+         return ""
+     s = term.lower().strip()
+     s = re.sub(r"[\-_/\\.,;:()]+", " ", s)
+     s = re.sub(r"\s+", " ", s)
+     words = s.split()
+     s = " ".join([lemmatizer.lemmatize(w) for w in words])
+     return s.strip()
+
+ def prepare_text_for_embedding(term: str, definition: str) -> str:
+     """Prepare text for embedding with VAN normalization."""
+     if not term:
+         return ""
+     t = term.lower().strip()
+     t = re.sub(r"[^\w\s-]", " ", t)
+     d = re.sub(r"\s+", " ", definition.strip())
+     t_van = normalize_to_van(t)
+     return f"{t_van}. {d}".strip()
+
+ # --------------------------------------------------------------------
+ # 🔹 1. IMPORT: Download FAISS from Hub (on-demand)
+ # --------------------------------------------------------------------
+ def auto_import_from_hub(force=False):
+     print(f"📥 [vector_sync] Checking for FAISS index on {REPO_ID}...")
+     try:
+         for fname in FILES:
+             print(f"⬇️ Downloading {fname} ...")
+             hf_hub_download(
+                 repo_id=REPO_ID,
+                 filename=f"{REMOTE_DIR}{fname}",
+                 repo_type=REPO_TYPE,
+                 local_dir="/home/user/app/tmp",
+                 cache_dir="/home/user/app/tmp",
+                 local_dir_use_symlinks=False,
+                 token=token,
+                 force_download=True,
+             )
+         print("✅ FAISS index + metadata downloaded.")
+     except Exception as e:
+         print(f"⚠️ [vector_sync] Could not import FAISS files: {e}")
+
+ # --------------------------------------------------------------------
+ # 🔹 2. EXPORT: Upload FAISS to Hub
+ # --------------------------------------------------------------------
+ def auto_export_to_hub(commit_msg="Auto-sync after rebuild"):
+     """Uploads FAISS index + metadata from /tmp/ to the dataset."""
+     if not token:
+         print("⚠️ [vector_sync] No HF token found. Skipping upload.")
+         return
+     print(f"🚀 [vector_sync] Uploading FAISS index + metadata to {REPO_ID}...")
+
+     try:
+         api.upload_file(
+             path_or_fileobj="/home/user/app/tmp/faiss.index",
+             path_in_repo="persistent/faiss.index",
+             repo_id=REPO_ID,
+             repo_type=REPO_TYPE,
+             token=token,
+             commit_message=commit_msg,
+         )
+         api.upload_file(
+             path_or_fileobj="/home/user/app/tmp/faiss.index.meta.json",
+             path_in_repo="persistent/faiss.index.meta.json",
+             repo_id=REPO_ID,
+             repo_type=REPO_TYPE,
+             token=token,
+             commit_message=commit_msg,
+         )
+         print("✅ [vector_sync] Upload complete.")
+     except Exception as e:
+         print(f"⚠️ [vector_sync] Upload failed: {e}")
+
+ # --------------------------------------------------------------------
+ # 🔹 3. REBUILD: Create FAISS from glossary.json
+ # --------------------------------------------------------------------
+ def rebuild_faiss_from_glossary(
+     glossary_path="/home/user/app/persistent/glossary.json",
+     model_name="all-MiniLM-L6-v2",
+ ):
+     """Rebuild FAISS index from glossary.json (no caching, low footprint)."""
+     try:
+         print(f"🧠 [vector_sync] Rebuilding FAISS from: {glossary_path}")
+         if not os.path.isfile(glossary_path):
+             print(f"⚠️ Glossary not found: {glossary_path}")
+             return None, None
+
+         with open(glossary_path, "r", encoding="utf-8") as f:
+             glossary = json.load(f)
+         print(f"📘 Loaded {len(glossary)} glossary entries.")
+
+         model = SentenceTransformer(model_name)
+         texts, metas = [], []
+         for k, v in glossary.items():
+             term = v.get("term", k)
+             definition = v.get("definition", "")
+             sources = v.get("sources", [])
+             if not definition.strip():
+                 continue
+             combined = prepare_text_for_embedding(term, definition)
+             texts.append(combined)
+             metas.append({"term": term, "definition": definition, "sources": sources})
+
+         if not texts:
+             print("⚠️ No valid glossary entries for embedding.")
+             return None, None
+
+         print(f"🧩 Encoding {len(texts)} entries with {model_name}...")
+         embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True).astype("float32")
+         faiss.normalize_L2(embeddings)
+         dim = embeddings.shape[1]
+         index = faiss.IndexFlatIP(dim)
+         index.add(embeddings)
+
+         tmp_dir = "/home/user/app/tmp"
+         os.makedirs(tmp_dir, exist_ok=True)
+         tmp_index = os.path.join(tmp_dir, "faiss.index")
+         tmp_meta = os.path.join(tmp_dir, "faiss.index.meta.json")
+
+         faiss.write_index(index, tmp_index)
+         with open(tmp_meta, "w", encoding="utf-8") as f:
+             json.dump(metas, f, indent=2, ensure_ascii=False)
+
+         # Upload and cleanup
+         auto_export_to_hub("Glossary-based FAISS rebuild")
+         os.remove(tmp_index)
+         os.remove(tmp_meta)
+
+         print(f"✅ [vector_sync] Rebuild complete — {len(texts)} vectors uploaded to dataset.")
+         return index, metas
+
+     except Exception as e:
+         print(f"⚠️ Error in rebuild_faiss_from_glossary: {e}")
+         return None, None
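
A rebuild-and-publish sketch (not part of the commit), assuming HF_TOKEN is set and persistent/glossary.json exists at the default path:

# Hypothetical one-shot FAISS rebuild from the glossary.
from core.vector_sync import rebuild_faiss_from_glossary

index, metas = rebuild_faiss_from_glossary()    # encodes entries, writes to /home/user/app/tmp, uploads to CT-Chat-Index
if index is not None:
    print(f"Rebuilt index with {index.ntotal} vectors ({len(metas)} metadata records).")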