Upload 9 files

- core/bm25.py +58 -0
- core/glossary.py +109 -0
- core/glossary_builder.py +256 -0
- core/hybrid_retriever.py +269 -0
- core/retrieval.py +25 -0
- core/van_normalizer.py +57 -0
- core/vector_search.py +107 -0
- core/vector_store.py +181 -0
- core/vector_sync.py +200 -0
core/bm25.py
ADDED
@@ -0,0 +1,58 @@
import os
import json
import re
import math
from collections import defaultdict, Counter

class BM25:
    def __init__(self, corpus):
        self.corpus = corpus
        self.tokenized_corpus = [self._tokenize(doc['text']) for doc in corpus]
        self.doc_lens = [len(doc) for doc in self.tokenized_corpus]
        self.avgdl = sum(self.doc_lens) / len(self.doc_lens)
        self.doc_freqs = self._calc_doc_freqs()
        self.k1 = 1.5
        self.b = 0.75

    def _tokenize(self, text):
        return re.findall(r"\w+", text.lower())

    def _calc_doc_freqs(self):
        freqs = defaultdict(int)
        for doc in self.tokenized_corpus:
            for word in set(doc):
                freqs[word] += 1
        return freqs

    def _idf(self, term):
        N = len(self.tokenized_corpus)
        df = self.doc_freqs.get(term, 0)
        return math.log(1 + (N - df + 0.5) / (df + 0.5))

    def get_scores(self, query_tokens):
        scores = [0.0] * len(self.tokenized_corpus)
        for idx, doc in enumerate(self.tokenized_corpus):
            freqs = Counter(doc)
            dl = self.doc_lens[idx]
            for term in query_tokens:
                idf = self._idf(term)
                tf = freqs[term]
                denom = tf + self.k1 * (1 - self.b + self.b * dl / self.avgdl)
                score = idf * ((tf * (self.k1 + 1)) / denom) if denom != 0 else 0
                scores[idx] += score
        return scores

def search_bm25(query, docs=None, top_n=10):  # docs may be passed in (as hybrid_retriever does) or loaded lazily
    from core.vector_store import load_all_text_chunks
    if docs is None:
        docs = load_all_text_chunks()
    bm25 = BM25(docs)
    query_tokens = re.findall(r"\w+", query.lower())
    scores = bm25.get_scores(query_tokens)
    top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]
    results = []
    for i in top_indices:
        doc = docs[i].copy()
        doc['score'] = scores[i]
        results.append(doc)
    return results
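
A minimal usage sketch of the scorer above (illustrative only, not part of the upload); it assumes a tiny in-memory corpus of dicts with a "text" key:

    from core.bm25 import BM25, search_bm25

    corpus = [
        {"text": "An adverse event is any untoward medical occurrence in a trial subject."},
        {"text": "The case report form captures data for each enrolled participant."},
    ]
    scores = BM25(corpus).get_scores(["adverse", "event"])
    print(scores)  # the first chunk should score higher
    top = search_bm25("adverse event", docs=corpus, top_n=1)  # same chunk, with a 'score' field added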
core/glossary.py
ADDED
@@ -0,0 +1,109 @@
# core/glossary.py

import json
import os
import re
from difflib import get_close_matches
from huggingface_hub import hf_hub_download

GLOSSARY = None
GLOSSARY_TERMS_CACHE = []  # 🧠 Cache of glossary keys for fuzzy matching
DATASET_REPO = "essprasad/CT-Chat-Index"
GLOSSARY_FILENAME = "persistent/glossary.json"


def _normalize_term(term: str) -> str:
    """Normalize glossary terms for matching, with fuzzy fallback."""
    if not term:
        return ""
    term = term.lower().strip()
    term = re.sub(r'[\-_/\\.,;:]+', ' ', term)
    term = re.sub(r'\s+', ' ', term)

    # Common clinical research synonym normalization
    term = term.replace("e crf", "ecrf").replace("e-crf", "ecrf").replace("e/crf", "ecrf").replace("e_crf", "ecrf")
    term = term.replace("electronic case report form", "ecrf")
    term = term.replace("case report form", "crf")
    term = term.replace("informed consent form", "icf")
    term = term.replace("good clinical practice", "gcp")
    term = term.replace("serious adverse event", "sae")
    term = term.replace("adverse event", "ae")
    term = term.replace("21 cfr part 11", "21cfrpart11")
    term = term.replace("clinical study report", "csr")

    term = term.strip()

    # 🧩 Fuzzy matching fallback (for plural/singular or typos)
    if GLOSSARY_TERMS_CACHE:
        if term not in GLOSSARY_TERMS_CACHE:
            close = get_close_matches(term, GLOSSARY_TERMS_CACHE, n=1, cutoff=0.85)
            if close:
                # return the closest key for better recall
                return close[0]

    return term


def _load_glossary():
    """Load glossary.json from Hugging Face Hub (cached)."""
    global GLOSSARY, GLOSSARY_TERMS_CACHE
    if GLOSSARY is not None:
        return GLOSSARY
    try:
        path = hf_hub_download(
            repo_id=DATASET_REPO,
            filename=GLOSSARY_FILENAME,
            repo_type="dataset",
        )
        with open(path, "r", encoding="utf-8") as f:
            raw = json.load(f)

        GLOSSARY = {}
        for k, vlist in raw.items():
            if not isinstance(k, str) or len(k.split()) > 12 or re.search(r'\d{4}', k):
                continue

            candidate_key = k
            if isinstance(vlist, dict):
                candidate_key = vlist.get("term") or vlist.get("name") or vlist.get("title") or k

            norm = _normalize_term(candidate_key)
            if not norm:
                continue

            if isinstance(vlist, dict):
                dfn = vlist.get("definition") or vlist.get("text") or ""
                sources = vlist.get("sources", [])
            elif isinstance(vlist, str):
                dfn = vlist
                sources = []
            else:
                dfn, sources = "", []

            if not dfn or len(dfn.strip()) < 5:
                continue

            if norm not in GLOSSARY:
                GLOSSARY[norm] = {
                    "term": candidate_key.strip(),
                    "definition": dfn.strip(),
                    "sources": sources if isinstance(sources, list) else []
                }
            else:
                # Merge sources if already exists
                existing = GLOSSARY[norm]
                existing_sources = set(existing.get("sources", []))
                new_sources = set(sources) if sources else set()
                existing["sources"] = list(existing_sources.union(new_sources))

        # 🧠 Store all glossary keys for fuzzy fallback
        GLOSSARY_TERMS_CACHE = list(GLOSSARY.keys())

        print(f"✅ Glossary downloaded and cached ({len(GLOSSARY)} entries).")
        return GLOSSARY
    except Exception as e:
        print(f"⚠️ Failed to load glossary from Hugging Face: {e}")
        return {}


__all__ = ["_load_glossary", "_normalize_term"]
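
A sketch of the lookup flow the retriever relies on (illustrative only); it assumes persistent/glossary.json is reachable on the essprasad/CT-Chat-Index dataset:

    from core.glossary import _load_glossary, _normalize_term

    glossary = _load_glossary()                             # downloads and caches glossary.json
    key = _normalize_term("Electronic Case Report Form")    # -> "ecrf" (fuzzy fallback may adjust it)
    entry = glossary.get(key)
    if entry:
        print(entry["term"], "-", entry["definition"][:80])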
core/glossary_builder.py
ADDED
@@ -0,0 +1,256 @@
"""
📘 glossary_builder.py
Builds a unified glossary from PDFs and Excel files.
- Extracts terms & definitions from PDFs.
- Merges Excel glossary (uses all definition-related columns with labeled formatting).
- Saves combined glossary.json locally and uploads to Hugging Face.
"""

import os
import re
import json
import time
import fitz
import pandas as pd
from huggingface_hub import upload_file, HfFolder, list_repo_files, hf_hub_download

# --- Configuration ---
DATASET_REPO = "essprasad/CT-Chat-Index"
DOCS_REPO = "essprasad/CT-Chat-Docs"
LOCAL_GLOSSARY = "/home/user/app/persistent/glossary.json"
REMOTE_GLOSSARY = "persistent/glossary.json"
TOKEN = HfFolder.get_token() or os.getenv("HF_TOKEN")


# --- Helpers ---
def normalize_term(term: str) -> str:
    if not term:
        return ""
    s = term.lower().strip()
    s = re.sub(r"[\-_/\\.,;:]+", " ", s)
    s = re.sub(r"\s+", " ", s)
    synonyms = {
        "electronic case report form": "ecrf",
        "case report form": "crf",
        "informed consent form": "icf",
        "good clinical practice": "gcp",
        "serious adverse event": "sae",
        "adverse event": "ae",
        "21 cfr part 11": "21cfrpart11",
        "clinical study report": "csr",
    }
    return synonyms.get(s, s)


def extract_text_from_pdf(pdf_path):
    """Extract plain text from a PDF file."""
    try:
        doc = fitz.open(pdf_path)
        text = "\n".join(page.get_text("text") for page in doc)
        doc.close()
        return text.strip()
    except Exception as e:
        print(f"⚠️ Failed to read {pdf_path}: {e}")
        return ""


def extract_definitions_from_text(text):
    """Extract glossary-like term-definition pairs from raw PDF text."""
    glossary = {}
    text = re.sub(r"\r", "", text)
    lines = [ln.strip() for ln in text.split("\n") if ln.strip()]
    i = 0
    while i < len(lines):
        term = lines[i].strip()
        if len(term) == 1 or term.isdigit() or re.fullmatch(r"[ivxlcdm]+", term.lower()):
            i += 1
            continue
        if any(x in term.lower() for x in ["contents", "chapter", "appendix", "index", "revision", "table of"]):
            i += 1
            continue
        if term.lower().startswith(("acronym for", "definition", "terms of")):
            i += 1
            continue

        defn_lines = []
        j = i + 1
        while j < len(lines):
            nxt = lines[j].strip()
            if re.match(r"^[A-Za-z]", nxt) and len(nxt.split()) <= 6 and not nxt.endswith("."):
                if not nxt.lower().startswith(("see also", "for example", "for instance")):
                    break
            defn_lines.append(nxt)
            j += 1

        definition = " ".join(defn_lines)
        definition = re.sub(r"(\w)-\s+(\w)", r"\1\2", definition)
        definition = re.sub(r"\s{2,}", " ", definition).strip()

        if len(definition.split()) < 5 or "." not in definition:
            i += 1
            continue

        norm = normalize_term(term)
        glossary[norm] = {"term": term, "definition": definition}
        i = j

    return glossary


# --- Main Rebuild Function ---
def rebuild_and_upload():
    start = time.time()
    print("📘 Starting glossary rebuild...")
    try:
        all_files = list_repo_files(repo_id=DOCS_REPO, repo_type="dataset", token=TOKEN)
        pdfs = [f for f in all_files if f.lower().endswith(".pdf")]
        excels = [f for f in all_files if f.lower().endswith((".xls", ".xlsx"))]
    except Exception as e:
        raise RuntimeError(f"Cannot list repo files: {e}")

    all_defs = {}

    # --- 1️⃣ Process PDFs ---
    for pdf in pdfs:
        skip_patterns = ["topic_", "template", "protocol", "schedule", "painac", "sas", "glossary_printable"]
        if any(sp in pdf.lower() for sp in skip_patterns):
            print(f"⏩ Skipping non-glossary or template file: {pdf}")
            continue
        try:
            print(f"🔍 Processing {pdf}...")
            path = hf_hub_download(repo_id=DOCS_REPO, filename=pdf, repo_type="dataset", token=TOKEN)
            text = extract_text_from_pdf(path)
            defs = extract_definitions_from_text(text)
            for k, v in defs.items():
                v.setdefault("sources", []).append(pdf)
                if k in all_defs:
                    all_defs[k]["sources"] = list(set(all_defs[k].get("sources", []) + v["sources"]))
                else:
                    all_defs[k] = v
        except Exception as e:
            print(f"⚠️ Failed {pdf}: {e}")

    # --- 2️⃣ Process Excel Glossaries (MRCT etc.) ---
    for excel in excels:
        try:
            print(f"📗 Checking Excel file in dataset: {excel}")
            excel_path = hf_hub_download(
                repo_id=DOCS_REPO,
                filename=excel,
                repo_type="dataset",
                token=TOKEN
            )
            print(f"✅ Downloaded Excel file to {excel_path}")
            xls = pd.read_excel(excel_path, sheet_name=None)

            total_rows = 0
            excel_entries = []

            for sheet_name, df in xls.items():
                df = df.fillna("").dropna(how="all")
                if df.shape[0] == 0:
                    continue
                df.columns = [str(c).strip() for c in df.columns]

                # 🧠 Detect the correct 'Glossary Term' column
                term_col = None
                for c in df.columns:
                    if "glossary term" in c.lower():
                        term_col = c
                        break
                if not term_col:
                    for c in df.columns:
                        if "cdisc" in c.lower() or c.lower() == "term":
                            term_col = c
                            break
                if not term_col:
                    print(f"⚠️ No 'Glossary Term' column found in sheet {sheet_name} of {excel_path}")
                    continue

                # Concatenate all relevant columns with labels for clarity
                for _, row in df.iterrows():
                    term = str(row.get(term_col, "")).strip()
                    if not term:
                        continue

                    def_cols = [
                        c for c in df.columns
                        if any(k in c.lower() for k in [
                            "definition", "context", "info", "related", "resource", "use in context"
                        ])
                    ]

                    def_parts = []
                    for c in def_cols:
                        val = str(row.get(c, "")).strip()
                        if val:
                            def_parts.append(f"<b>{c}:</b> {val}")

                    full_definition = "<br>".join(def_parts).strip()
                    if not full_definition:
                        continue

                    entry = {
                        "term": term,
                        "definition": full_definition,
                        "source": os.path.basename(excel_path),
                        "sheet": sheet_name,
                        "type": "Excel",
                    }
                    excel_entries.append(entry)
                    total_rows += 1

            print(f"✅ Added {total_rows} Excel rows from {excel_path}")

            # Merge into main glossary dictionary
            for e in excel_entries:
                norm = normalize_term(e["term"])
                payload = {
                    "term": e["term"],
                    "definition": e["definition"],
                    "sources": [e["source"]],
                    "type": e.get("type", "Excel"),
                    "sheet": e.get("sheet"),
                }
                # Each term+source pair stored uniquely to preserve different definitions
                unique_key = f"{norm}__{os.path.basename(payload['sources'][0])}" if payload.get("sources") else norm

                if unique_key not in all_defs:
                    all_defs[unique_key] = payload
                else:
                    # Avoid duplicate merges — just append any new sources
                    existing = all_defs[unique_key]
                    existing_sources = set(existing.get("sources", []))
                    new_sources = set(payload.get("sources", []))
                    existing["sources"] = list(existing_sources.union(new_sources))

        except Exception as e:
            print(f"⚠️ Failed to process Excel {excel}: {e}")

    # --- 3️⃣ Save combined glossary ---
    os.makedirs(os.path.dirname(LOCAL_GLOSSARY), exist_ok=True)
    with open(LOCAL_GLOSSARY, "w", encoding="utf-8") as f:
        json.dump(all_defs, f, indent=2, ensure_ascii=False)

    print(f"✅ Saved {len(all_defs)} entries → {LOCAL_GLOSSARY}")

    # --- 4️⃣ Upload to Hugging Face ---
    if TOKEN:
        try:
            upload_file(
                path_or_fileobj=LOCAL_GLOSSARY,
                path_in_repo=REMOTE_GLOSSARY,
                repo_id=DATASET_REPO,
                repo_type="dataset",
                token=TOKEN,
                commit_message="Glossary updated with PDF + Excel (column-labeled definitions)",
            )
            print(f"🚀 Uploaded to {DATASET_REPO}/{REMOTE_GLOSSARY}")
        except Exception as e:
            print(f"⚠️ Upload error: {e}")

    print(f"✨ Done in {time.time() - start:.1f}s.")


if __name__ == "__main__":
    rebuild_and_upload()
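
The PDF parser above treats a short line as a term and the following longer lines as its definition. A quick local check of that heuristic (illustrative only, made-up text; it assumes PyMuPDF and pandas are installed, since the module imports them at load time):

    from core.glossary_builder import extract_definitions_from_text

    sample = (
        "Adverse Event\n"
        "Any untoward medical occurrence in a subject, whether or not related to the product under study.\n"
    )
    print(extract_definitions_from_text(sample))
    # {'ae': {'term': 'Adverse Event', 'definition': 'Any untoward medical occurrence in a subject, ...'}}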
core/hybrid_retriever.py
ADDED
@@ -0,0 +1,269 @@
"""
Hybrid Retriever with Glossary + FAISS + BM25 support.
Includes full-paragraph glossary definitions, acronym expansion, and Excel prioritization.
"""

import os
import re
import time
from urllib.parse import urlparse

from core.glossary import _load_glossary, _normalize_term
from core.vector_store import _ensure_faiss_index, search_index, load_all_text_chunks
from core.bm25 import search_bm25
from utils.nlp_helpers import extract_van_tokens, normalize_query_text

DOCS_REPO = "essprasad/CT-Chat-Docs"
DENSE_TOP_K = 10


def _find_sentence_containing(text: str, phrase: str) -> str:
    """Return the sentence that contains the given phrase."""
    if not text or not phrase:
        return ""
    sentences = re.split(r"[.!?\n\r]", text)
    phrase = phrase.lower()
    for s in sentences:
        if phrase in s.lower():
            return s.strip()
    return ""


def summarize_combined(query: str, mode="short") -> str:
    start = time.time()
    if not query.strip():
        return "<i>No query provided.</i>"

    # ------------------------------------------------------------------
    # 🧠 VAN-Based Query Normalization
    # ------------------------------------------------------------------
    expanded_query = normalize_query_text(query)
    van_tokens = extract_van_tokens(expanded_query)
    van_query = " ".join(van_tokens).strip()
    normalized_query = van_query or query

    print(f"🔍 summarize_combined() query='{query}' van_query='{van_query}' normalized_query='{normalized_query}'")

    glossary = _load_glossary()

    # ------------------------------------------------------------------
    # 1️⃣ Acronym Map (derived from GCDMP_Glossary.pdf)
    # ------------------------------------------------------------------
    acronym_map = {
        "adr": "adverse drug reaction",
        "ae": "adverse event",
        "asp": "application service provider",
        "asq": "american society for quality",
        "ca": "corrective action",
        "cdisc": "clinical data interchange standards consortium",
        "clia": "clinical laboratory improvement amendments",
        "crf": "case report form",
        "cro": "contract research organization",
        "cs": "clinically significant",
        "ehr": "electronic health record",
        "emr": "electronic medical record",
        "eu": "european union",
        "gcp": "good clinical practice",
        "idmc": "independent data-monitoring committee",
        "iec": "independent ethics committee",
        "ind": "investigational new drug application",
        "irb": "institutional review board",
        "iso": "international organization for standardization",
        "iom": "institute of medicine",
        "iss": "integrated summary of safety",
        "ise": "integrated summary of efficacy",
        "meddra": "medical dictionary for regulatory activities",
        "mrct": "multi-regional clinical trials",
        "ncs": "non clinically significant",
        "nda": "new drug application",
        "ocr": "optical character recognition",
        "qa": "quality assurance",
        "qc": "quality control",
        "sae": "serious adverse event",
        "sla": "service level agreement",
        "sop": "standard operating procedure",
        "spc": "statistical process control",
        "sqc": "statistical quality control",
        "uat": "user acceptance testing",
        "vcl": "virtual central lab",
        "whodrug": "world health organization drug dictionary",
    }

    acronym_glossary_hits = []

    # ------------------------------------------------------------------
    # 2️⃣ Direct Glossary Match (and handle acronyms)
    # ------------------------------------------------------------------
    short_candidate = (van_query or normalized_query).strip().lower()
    glossary_key = _normalize_term(short_candidate)

    # If query matches acronym, expand it
    if glossary_key in acronym_map:
        expansion = acronym_map[glossary_key]
        glossary_key = _normalize_term(expansion)
        print(f"🔁 Acronym expansion: '{short_candidate}' → '{expansion}'")

    if glossary and glossary_key in glossary:
        entry = glossary[glossary_key]
        term_display = entry.get("term", glossary_key)
        dfn = entry.get("definition") or entry.get("text") or ""
        sources = entry.get("sources", []) or ["unspecified"]

        html = f"<h3>🧠 Definitions for '{term_display}':</h3>"
        for src in sources:
            html += f"🔹 <b>Source:</b> {src}<br><blockquote>{dfn}</blockquote>"
        print(f"✅ Glossary match for '{glossary_key}' in {time.time() - start:.2f}s")
        return html

    # ------------------------------------------------------------------
    # 3️⃣ FAISS Dense Retrieval
    # ------------------------------------------------------------------
    dense_query = normalized_query
    dense_hits = []
    try:
        if _ensure_faiss_index():
            dense_hits = search_index(dense_query, top_k=DENSE_TOP_K) or []
    except Exception as e:
        print(f"⚠️ FAISS search failed: {e}")
    print(f"📚 Dense hits before filtering: {len(dense_hits)}")

    # ------------------------------------------------------------------
    # 4️⃣ Acronym Filtering (Lenient Match)
    # ------------------------------------------------------------------
    if len(normalized_query.split()) == 1 and len(normalized_query) <= 5:
        key = normalized_query.lower()
        expansion = acronym_map.get(key, key)
        pattern = re.compile(
            rf"\b{re.escape(key)}\b|{re.escape(expansion)}|{key}[\s\-\.:;)]|[\(\s]{key}[\s\-\.:;)]",
            re.IGNORECASE,
        )
        filtered_hits = [h for h in dense_hits if pattern.search((h.get("definition") or h.get("text") or "").lower())]
        print(f"🔍 Filtered acronym hits: {len(filtered_hits)} (lenient match incl. '{expansion}')")
        dense_hits = filtered_hits
    else:
        print(f"ℹ️ No acronym filtering applied (query length > 5 chars)")
    print(f"📚 Dense hits after filtering: {len(dense_hits)}")

    # ------------------------------------------------------------------
    # 5️⃣ BM25 Fallback (Lexical)
    # ------------------------------------------------------------------
    bm25_hits = []
    try:
        docs = load_all_text_chunks()
        if docs:
            bm25_hits = search_bm25(normalized_query, docs, top_n=10)
    except Exception as e:
        print(f"⚠️ BM25 fallback failed: {e}")
    print(f"📑 BM25 hits: {len(bm25_hits)}")

    # ------------------------------------------------------------------
    # 🧩 Merge & Prioritize — Keep all per-source definitions
    # ------------------------------------------------------------------
    hits = dense_hits + bm25_hits
    if not hits:
        return "<i>No relevant information found.</i>"

    def score_hit(h):
        """Prioritize Excel > PDF > Web > Other"""
        text = (h.get("definition") or h.get("text") or "").lower()
        src = h.get("file") or h.get("source") or ""
        src_type = (h.get("type") or "").lower()
        key = (van_query or normalized_query).lower()
        score = 0
        if "excel" in src_type or src.lower().endswith((".xls", ".xlsx")):
            score += 10
        elif src.lower().endswith(".pdf"):
            score += 5
        elif src.startswith("http"):
            score += 2
        if key in text:
            score += 3
        return -score

    hits = sorted(hits, key=score_hit)

    # ------------------------------------------------------------------
    # 6️⃣ Compose Final Answer — With Icons, Tooltips, & Hyperlinks
    # ------------------------------------------------------------------
    answers = []
    src_counts = {"web": 0, "pdf": 0, "excel": 0, "other": 0}

    for h in hits:
        txt = h.get("definition") or h.get("text") or ""
        if not txt.strip():
            continue

        src = h.get("file") or h.get("source") or "unknown"
        src_base = os.path.basename(src)
        src_type = (h.get("type") or "").lower()
        term_name = h.get("term") or (van_query or normalized_query)

        # --- Categorize source
        if "excel" in src_type or src.lower().endswith((".xls", ".xlsx")):
            icon, cat = "📘", "excel"
        elif "website" in src_type or src.startswith("http"):
            icon, cat = "🌐", "web"
        elif src.lower().endswith(".pdf"):
            icon, cat = "📄", "pdf"
        else:
            icon, cat = "📁", "other"

        src_counts[cat] += 1

        # --- Extract URL if present
        url = ""
        if "http" in src:
            url = src
        elif "http" in txt:
            match = re.search(r"https?://\S+", txt)
            if match:
                url = match.group(0).rstrip(".,)")

        # --- Extract relevant paragraph
        paragraphs = re.split(r"\n\s*\n", txt)
        matched_paragraph = ""
        for p in paragraphs:
            if normalized_query.lower() in p.lower() or (van_query and van_query.lower() in p.lower()):
                matched_paragraph = p.strip()
                break
        excerpt = matched_paragraph or txt.strip()

        if len(excerpt) > 2000:
            excerpt = excerpt[:2000] + "..."

        # --- Convert URLs and highlight terms
        excerpt = re.sub(r"(?i)source url:\s*", "", excerpt)
        excerpt = re.sub(r"(https?://[^\s<>'\"]+)", r"<a href='\1' target='_blank'>\1</a>", excerpt)
        excerpt = re.sub(f"(?i)({re.escape(normalized_query)})", r"<mark>\1</mark>", excerpt)
        if term_name:
            excerpt = re.sub(f"(?i)({re.escape(term_name)})", r"<b>\1</b>", excerpt)

        # --- Build formatted output
        if url:
            parsed = urlparse(url)
            display_name = parsed.netloc or src_base
            link_html = f"<b>{icon} <a href='{url}' target='_blank'>{display_name}</a></b>"
        else:
            link_html = f"<b>{icon} {src_base}</b>"

        answers.append(f"{link_html}<br><blockquote>{excerpt}</blockquote>")

        if len(answers) >= 6:
            break

    # ------------------------------------------------------------------
    # 7️⃣ Final HTML Output
    # ------------------------------------------------------------------
    summary_counts = " | ".join(f"{k.capitalize()}: {v}" for k, v in src_counts.items() if v > 0)
    print(f"✅ Answers from {len(answers)} sources in {time.time() - start:.2f}s")

    expansion_note = ""
    if normalized_query.lower() in acronym_map:
        expansion_note = f"<p><i>🔁 Acronym expanded: <b>{normalized_query.upper()}</b> → {acronym_map[normalized_query.lower()]}</i></p>"

    return (
        f"<h3>🧠 Answers (one per source):</h3>"
        + expansion_note
        + f"<p><i>Sources → {summary_counts}</i></p>"
        + "<br>".join(answers)
    )
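
summarize_combined() is the single entry point a chat UI would call. A sketch of a call (illustrative only; it assumes the glossary and FAISS index described above are available, and that utils.nlp_helpers, which is not part of this upload, provides normalize_query_text and extract_van_tokens):

    from core.hybrid_retriever import summarize_combined

    html = summarize_combined("sae")   # acronym expands to "serious adverse event"
    print(html[:200])                  # HTML answer block with per-source blockquotes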
core/retrieval.py
ADDED
@@ -0,0 +1,25 @@
import os
import re
import json
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser

WHOOSH_INDEX_PATH = "/home/user/app/persistent/whoosh_index"

_ix = None

def _load_whoosh():
    global _ix
    if _ix is None and os.path.exists(WHOOSH_INDEX_PATH):
        _ix = open_dir(WHOOSH_INDEX_PATH)
    return _ix

def _bm25_search(query, top_n=10):
    ix = _load_whoosh()
    if not ix:
        return []
    parser = MultifieldParser(["text", "title"], schema=ix.schema)
    q = parser.parse(query)
    with ix.searcher() as s:
        results = s.search(q, limit=top_n)
        return [{"text": r["text"], "file": r.get("file", "")} for r in results]
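
This module is a Whoosh-based lexical path; it expects a pre-built index at WHOOSH_INDEX_PATH with at least "text" and "title" fields. A sketch of a call against such an index (illustrative only):

    from core.retrieval import _bm25_search

    for hit in _bm25_search("informed consent", top_n=3):
        print(hit["file"], "->", hit["text"][:80])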
core/van_normalizer.py
ADDED
@@ -0,0 +1,57 @@
# core/van_normalizer.py
import re
import nltk
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer

# make sure you have these (run once if missing):
# python -m nltk.downloader punkt averaged_perceptron_tagger wordnet omw-1.4

lemmatizer = WordNetLemmatizer()

def normalize_to_van(text: str) -> str:
    """
    VAN-based normalization (optimized for clinical trial domain):
    - Lowercases and removes punctuation
    - Tokenizes and POS-tags
    - Keeps only Nouns (N), Adjectives (J), and key Verbs (V)
    - Explicitly removes determiners/articles (a, an, the)
    - Lemmatizes each token to its base form
    - Returns a space-joined string suitable for FAISS embedding
    """
    if not text:
        return ""

    # Basic cleanup
    text = text.lower().strip()
    text = re.sub(r"[^a-z0-9\s-]", " ", text)  # remove punctuation
    tokens = word_tokenize(text)

    # POS tagging
    tagged = pos_tag(tokens)

    filtered = []
    for word, tag in tagged:
        # Skip common determiners, articles, and auxiliary verbs
        if word in {"a", "an", "the", "is", "are", "was", "were", "be", "been", "being"}:
            continue

        # Keep only verbs, adjectives, and nouns
        if tag.startswith("V") or tag.startswith("J") or tag.startswith("N"):
            filtered.append((word, tag))

    # Lemmatize each word to its appropriate part of speech
    lemmas = []
    for word, tag in filtered:
        pos = (
            "v" if tag.startswith("V")
            else "a" if tag.startswith("J")
            else "n"
        )
        lemmas.append(lemmatizer.lemmatize(word, pos))

    # Join and clean
    normalized = " ".join(lemmas).strip()
    normalized = re.sub(r"\s+", " ", normalized)  # collapse multiple spaces
    return normalized
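
A quick check of the VAN (verb/adjective/noun) normalization (illustrative only; it assumes the NLTK resources listed in the comment above have been downloaded):

    from core.van_normalizer import normalize_to_van

    print(normalize_to_van("What are the serious adverse events reported in the trial?"))
    # e.g. "serious adverse event report trial" (exact lemmas depend on the NLTK tagger)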
core/vector_search.py
ADDED
@@ -0,0 +1,107 @@
"""
core/vector_search.py
-----------------------------------------------------
Performs FAISS semantic search for hybrid retrieval.
Includes:
- SentenceTransformer embedding for query
- FAISS similarity search
- Metadata + citation extraction
- Robust fallback if index missing
"""

import os
import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# Paths (shared with vector_store/vector_sync)
FAISS_INDEX = "persistent/faiss.index"
FAISS_META = "persistent/faiss.index.meta.json"

_model = None
_index = None
_meta = []


# ----------------------------
# 🔹 Loaders
# ----------------------------
def _load_model():
    """Lazy-load embedding model."""
    global _model
    if _model is None:
        print("📥 Loading embedding model for retrieval...")
        _model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        print("✅ Model loaded.")
    return _model


def _load_faiss():
    """Load FAISS index + metadata, prefer local persistent copy."""
    global _index, _meta
    if _index is not None:
        return _index, _meta

    local_index = "/home/user/app/persistent/faiss.index"
    local_meta = "/home/user/app/persistent/faiss.index.meta.json"

    if os.path.exists(local_index) and os.path.exists(local_meta):
        print("📂 [vector_search] Using local FAISS index.")
        _index = faiss.read_index(local_index)
        with open(local_meta, "r", encoding="utf-8") as f:
            _meta = json.load(f)
        print(f"✅ Loaded local FAISS index ({len(_meta)} entries).")
        return _index, _meta

    print("☁️ [vector_search] Local FAISS missing, using fallback remote index.")
    return _index, _meta

# ----------------------------
# 🔹 Core Query Function
# ----------------------------
def query_faiss(query: str, top_k: int = 5):
    """
    Perform FAISS semantic similarity search.
    Returns:
        results: list of matched text chunks
        meta: list of metadata dicts (with citations)
    """
    index, meta = _load_faiss()
    if index is None or len(meta) == 0:
        return [], []

    model = _load_model()
    q_emb = np.array(model.encode([query]), dtype=np.float32)
    D, I = index.search(q_emb, top_k)

    results, citations = [], []
    for idx in I[0]:
        if 0 <= idx < len(meta):
            doc = meta[idx]
            text = clean_text(doc.get("text", ""))
            src = doc.get("source", "Unknown Source")

            citation = f"📄 <b>Source:</b> {os.path.basename(src)}"
            results.append(text)
            citations.append(citation)

    return results, citations


# ----------------------------
# 🔹 Utilities
# ----------------------------
def clean_text(text: str, max_len: int = 800):
    """
    Truncate and clean text for readability.
    """
    text = text.replace("\n", " ").replace("  ", " ").strip()
    if len(text) > max_len:
        text = text[:max_len].rsplit(" ", 1)[0] + "..."
    return text


def has_index():
    """Check if FAISS index is available."""
    return os.path.exists(FAISS_INDEX) and os.path.exists(FAISS_META)
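
query_faiss() returns parallel lists of chunk texts and citation strings. A sketch of a call (illustrative only; it assumes persistent/faiss.index and its meta.json exist locally):

    from core.vector_search import query_faiss, has_index

    if has_index():
        chunks, citations = query_faiss("data management plan", top_k=3)
        for text, cite in zip(chunks, citations):
            print(cite, "|", text[:80])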
core/vector_store.py
ADDED
@@ -0,0 +1,181 @@
"""
core/vector_store.py
------------------------------------------------------------
Unified FAISS + BM25 storage utility for Clinical-Trial Chatbot.

✅ Works with glossary.json or FAISS metadata
✅ Returns normalized dicts for hybrid_retriever
✅ Adds load_all_text_chunks() for BM25 fallback
✅ Safe against missing files
"""

import os
import re
import json
import faiss
from sentence_transformers import SentenceTransformer

# Globals used by retriever
_index = None
_model = None
_meta = None


# --------------------------------------------------------------------
# 1️⃣ Utility: load FAISS index + metadata (MVP version)
# --------------------------------------------------------------------
def _ensure_faiss_index():
    """Load FAISS index and metadata — prefer local persistent files, fallback to Hugging Face dataset."""
    global _index, _model, _meta
    if _index is not None and _meta is not None:
        return True

    import json
    from huggingface_hub import hf_hub_download

    local_dir = "/home/user/app/persistent"
    local_index = os.path.join(local_dir, "faiss.index")
    local_meta = os.path.join(local_dir, "faiss.index.meta.json")

    # 1️⃣ Prefer local FAISS (rebuilt and includes URL + Excel)
    if os.path.exists(local_index) and os.path.exists(local_meta):
        print("📂 Using local FAISS index (includes Excel + Web sources).")
        _index = faiss.read_index(local_index)
        with open(local_meta, "r", encoding="utf-8") as f:
            _meta = json.load(f)
        _model = SentenceTransformer("all-MiniLM-L6-v2")
        print(f"✅ [vector_store] Loaded local FAISS ({len(_meta)} vectors).")
        return True

    # 2️⃣ Fallback: remote dataset
    print("☁️ Local FAISS missing — downloading from Hugging Face dataset...")
    repo_id = "essprasad/CT-Chat-Index"
    repo_type = "dataset"
    runtime_dir = "/home/user/app/runtime_faiss"
    os.makedirs(runtime_dir, exist_ok=True)

    index_path = hf_hub_download(
        repo_id=repo_id,
        filename="persistent/faiss.index",
        repo_type=repo_type,
        local_dir=runtime_dir,
        cache_dir=runtime_dir,
        force_download=True,
    )
    meta_path = hf_hub_download(
        repo_id=repo_id,
        filename="persistent/faiss.index.meta.json",
        repo_type=repo_type,
        local_dir=runtime_dir,
        cache_dir=runtime_dir,
        force_download=True,
    )

    print(f"🧠 [vector_store] Loading FAISS index + metadata from {runtime_dir} ...")
    _index = faiss.read_index(index_path)
    with open(meta_path, "r", encoding="utf-8") as f:
        _meta = json.load(f)
    _model = SentenceTransformer("all-MiniLM-L6-v2")
    print(f"✅ [vector_store] Loaded remote FAISS ({len(_meta)} vectors).")
    return True

# --------------------------------------------------------------------
# 2️⃣ Helper: Load all text chunks (for BM25 fallback)
# --------------------------------------------------------------------
def load_all_text_chunks():
    """
    Return list of dicts for BM25 fallback and inspection.
    Each dict: {'text', 'file', 'source', 'term', '_meta'}
    """
    meta_path = os.path.join("persistent", "faiss.index.meta.json")
    gloss_path = os.path.join("persistent", "glossary.json")
    docs = []

    # Prefer FAISS meta (vector_sync output)
    if os.path.exists(meta_path):
        try:
            with open(meta_path, "r", encoding="utf-8") as f:
                meta = json.load(f)
            for m in meta:
                text = m.get("definition") or m.get("text") or m.get("chunk") or ""
                sources = m.get("sources") or m.get("source") or m.get("file") or []
                if isinstance(sources, list) and sources:
                    src = sources[0]
                elif isinstance(sources, str) and sources:
                    src = sources
                else:
                    src = m.get("file") or m.get("source") or "unknown"
                docs.append({
                    "text": text,
                    "file": src,
                    "source": src,
                    "term": m.get("term") or m.get("normalized") or "",
                    "_meta": m
                })
            return docs
        except Exception as e:
            print(f"⚠️ [vector_store] Failed to read meta.json: {e}")

    # fallback: glossary.json
    if os.path.exists(gloss_path):
        try:
            with open(gloss_path, "r", encoding="utf-8") as f:
                gloss = json.load(f)
            for k, v in gloss.items():
                term = v.get("term", k)
                definition = v.get("definition", "")
                srcs = v.get("sources", [])
                src = srcs[0] if isinstance(srcs, list) and srcs else (srcs if isinstance(srcs, str) else "glossary")
                docs.append({
                    "text": definition,
                    "file": src,
                    "source": src,
                    "term": term,
                    "_meta": {"glossary_key": k}
                })
            return docs
        except Exception as e:
            print(f"⚠️ [vector_store] Failed to read glossary.json: {e}")

    return docs


# --------------------------------------------------------------------
# 3️⃣ FAISS Search
# --------------------------------------------------------------------
def search_index(query, top_k=10):
    """
    Search FAISS and return a list of dict hits for hybrid_retriever.
    Each hit: {'text','file','source','term','_score','_meta'}
    """
    global _index, _model, _meta
    if not _ensure_faiss_index():
        return []

    q_emb = _model.encode([query], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(q_emb)
    D, I = _index.search(q_emb, top_k)

    results = []
    for score, idx in zip(D[0].tolist(), I[0].tolist()):
        if idx < 0 or idx >= len(_meta):
            continue
        m = _meta[idx] if isinstance(_meta[idx], dict) else {"raw": str(_meta[idx])}
        text = m.get("definition") or m.get("text") or m.get("chunk") or ""
        srcs = m.get("sources") or m.get("source") or m.get("file") or []
        if isinstance(srcs, list) and srcs:
            src = srcs[0]
        elif isinstance(srcs, str) and srcs:
            src = srcs
        else:
            src = m.get("file") or m.get("source") or "unknown"

        results.append({
            "text": text,
            "file": src,
            "source": src,
            "term": m.get("term") or m.get("normalized") or "",
            "_score": float(score),
            "_meta": m
        })
    return results
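
search_index() produces the hit dicts that hybrid_retriever consumes. A sketch of the call and the hit shape (illustrative only; it assumes a FAISS index can be loaded locally or pulled from the dataset):

    from core.vector_store import search_index

    for hit in search_index("case report form", top_k=3):
        print(round(hit["_score"], 3), hit["source"], "->", hit["text"][:80])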
core/vector_sync.py
ADDED
@@ -0,0 +1,200 @@
"""
core/vector_sync.py
------------------------------------------------------------
Handles FAISS index rebuild + upload to Hugging Face dataset
without caching, optimized for limited HF Space storage.
"""

import os
import re
import json
import faiss
import numpy as np
from pathlib import Path
from huggingface_hub import HfApi, hf_hub_download, upload_file, HfFolder
from sentence_transformers import SentenceTransformer
from nltk.stem import WordNetLemmatizer
from core.van_normalizer import normalize_to_van

# ==========================================================
# Helper: Upload FAISS index + metadata to dataset safely
# ==========================================================
from huggingface_hub import HfApi

def _upload_to_dataset(index_path: str, meta_path: str, repo_id: str):
    """
    Upload FAISS index + metadata to Hugging Face dataset safely.
    Used by rebuild_index() in app.py.
    """
    try:
        print(f"🚀 [vector_sync] Uploading FAISS index + metadata to {repo_id}...")
        api = HfApi()

        for path in [index_path, meta_path]:
            if not os.path.exists(path):
                print(f"⚠️ [vector_sync] Skipping {os.path.basename(path)} (not found locally).")
                continue

            api.upload_file(
                path_or_fileobj=path,
                path_in_repo=f"persistent/{os.path.basename(path)}",
                repo_id=repo_id,
                repo_type="dataset",
                commit_message=f"Auto-upload {os.path.basename(path)}",
            )
            print(f"✅ [vector_sync] Uploaded {os.path.basename(path)}")

    except Exception as e:
        print(f"⚠️ [vector_sync] Upload failed: {e}")

# --------------------------------------------------------------------
# ⚙️ CONFIGURATION
# --------------------------------------------------------------------
REPO_ID = "essprasad/CT-Chat-Index"
REPO_TYPE = "dataset"
REMOTE_DIR = "persistent/"
FILES = ["faiss.index", "faiss.index.meta.json"]

api = HfApi()
token = HfFolder.get_token() or os.getenv("HF_TOKEN")

# --------------------------------------------------------------------
# 🔹 NORMALIZATION HELPERS
# --------------------------------------------------------------------
lemmatizer = WordNetLemmatizer()

def normalize_for_index(term: str) -> str:
    """Normalize term for embedding."""
    if not term:
        return ""
    s = term.lower().strip()
    s = re.sub(r"[\-_/\\.,;:()]+", " ", s)
    s = re.sub(r"\s+", " ", s)
    words = s.split()
    s = " ".join([lemmatizer.lemmatize(w) for w in words])
    return s.strip()

def prepare_text_for_embedding(term: str, definition: str) -> str:
    """Prepare text for embedding with VAN normalization."""
    if not term:
        return ""
    t = term.lower().strip()
    t = re.sub(r"[^\w\s-]", " ", t)
    d = re.sub(r"\s+", " ", definition.strip())
    t_van = normalize_to_van(t)
    return f"{t_van}. {d}".strip()

# --------------------------------------------------------------------
# 🔹 1. IMPORT: Download FAISS from Hub (on-demand)
# --------------------------------------------------------------------
def auto_import_from_hub(force=False):
    print(f"📥 [vector_sync] Checking for FAISS index on {REPO_ID}...")
    try:
        for fname in FILES:
            print(f"⬇️ Downloading {fname} ...")
            hf_hub_download(
                repo_id=REPO_ID,
                filename=f"{REMOTE_DIR}{fname}",
                repo_type=REPO_TYPE,
                local_dir="/home/user/app/tmp",
                cache_dir="/home/user/app/tmp",
                local_dir_use_symlinks=False,
                token=token,
                force_download=True,
            )
        print("✅ FAISS index + metadata downloaded.")
    except Exception as e:
        print(f"⚠️ [vector_sync] Could not import FAISS files: {e}")

# --------------------------------------------------------------------
# 🔹 2. EXPORT: Upload FAISS to Hub
# --------------------------------------------------------------------
def auto_export_to_hub(commit_msg="Auto-sync after rebuild"):
    """Uploads FAISS index + metadata from /tmp/ to the dataset."""
    if not token:
        print("⚠️ [vector_sync] No HF token found. Skipping upload.")
        return
    print(f"🚀 [vector_sync] Uploading FAISS index + metadata to {REPO_ID}...")

    try:
        api.upload_file(
            path_or_fileobj="/home/user/app/tmp/faiss.index",
            path_in_repo="persistent/faiss.index",
            repo_id=REPO_ID,
            repo_type=REPO_TYPE,
            token=token,
            commit_message=commit_msg,
        )
        api.upload_file(
            path_or_fileobj="/home/user/app/tmp/faiss.index.meta.json",
            path_in_repo="persistent/faiss.index.meta.json",
            repo_id=REPO_ID,
            repo_type=REPO_TYPE,
            token=token,
            commit_message=commit_msg,
        )
        print("✅ [vector_sync] Upload complete.")
    except Exception as e:
        print(f"⚠️ [vector_sync] Upload failed: {e}")

# --------------------------------------------------------------------
# 🔹 3. REBUILD: Create FAISS from glossary.json
# --------------------------------------------------------------------
def rebuild_faiss_from_glossary(
    glossary_path="/home/user/app/persistent/glossary.json",
    model_name="all-MiniLM-L6-v2",
):
    """Rebuild FAISS index from glossary.json (no caching, low footprint)."""
    try:
        print(f"🧠 [vector_sync] Rebuilding FAISS from: {glossary_path}")
        if not os.path.isfile(glossary_path):
            print(f"⚠️ Glossary not found: {glossary_path}")
            return None, None

        with open(glossary_path, "r", encoding="utf-8") as f:
            glossary = json.load(f)
        print(f"📘 Loaded {len(glossary)} glossary entries.")

        model = SentenceTransformer(model_name)
        texts, metas = [], []
        for k, v in glossary.items():
            term = v.get("term", k)
            definition = v.get("definition", "")
            sources = v.get("sources", [])
            if not definition.strip():
                continue
            combined = prepare_text_for_embedding(term, definition)
            texts.append(combined)
            metas.append({"term": term, "definition": definition, "sources": sources})

        if not texts:
            print("⚠️ No valid glossary entries for embedding.")
            return None, None

        print(f"🧩 Encoding {len(texts)} entries with {model_name}...")
        embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True).astype("float32")
        faiss.normalize_L2(embeddings)
        dim = embeddings.shape[1]
        index = faiss.IndexFlatIP(dim)
        index.add(embeddings)

        tmp_dir = "/home/user/app/tmp"
        os.makedirs(tmp_dir, exist_ok=True)
        tmp_index = os.path.join(tmp_dir, "faiss.index")
        tmp_meta = os.path.join(tmp_dir, "faiss.index.meta.json")

        faiss.write_index(index, tmp_index)
        with open(tmp_meta, "w", encoding="utf-8") as f:
            json.dump(metas, f, indent=2, ensure_ascii=False)

        # Upload and cleanup
        auto_export_to_hub("Glossary-based FAISS rebuild")
        os.remove(tmp_index)
        os.remove(tmp_meta)

        print(f"✅ [vector_sync] Rebuild complete — {len(texts)} vectors uploaded to dataset.")
        return index, metas

    except Exception as e:
        print(f"⚠️ Error in rebuild_faiss_from_glossary: {e}")
        return None, None
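
rebuild_faiss_from_glossary() encodes every glossary definition, writes faiss.index plus its metadata to /home/user/app/tmp, uploads both to the dataset, and removes the temporary files. A sketch of a manual rebuild run (illustrative only; it assumes glossary.json exists at the default path and HF_TOKEN is set for the upload step):

    from core.vector_sync import rebuild_faiss_from_glossary

    index, metas = rebuild_faiss_from_glossary()   # default glossary path and MiniLM model
    if index is not None:
        print(f"{index.ntotal} vectors, first term: {metas[0]['term']}")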