Upload 9 files

- core/bm25.py +58 -0
- core/glossary.py +109 -0
- core/glossary_builder.py +256 -0
- core/hybrid_retriever.py +269 -0
- core/retrieval.py +25 -0
- core/van_normalizer.py +57 -0
- core/vector_search.py +107 -0
- core/vector_store.py +181 -0
- core/vector_sync.py +200 -0
core/bm25.py
ADDED
@@ -0,0 +1,58 @@
import os
import json
import re
import math
from collections import defaultdict, Counter

class BM25:
    def __init__(self, corpus):
        self.corpus = corpus
        self.tokenized_corpus = [self._tokenize(doc['text']) for doc in corpus]
        self.doc_lens = [len(doc) for doc in self.tokenized_corpus]
        self.avgdl = sum(self.doc_lens) / len(self.doc_lens)
        self.doc_freqs = self._calc_doc_freqs()
        self.k1 = 1.5
        self.b = 0.75

    def _tokenize(self, text):
        return re.findall(r"\w+", text.lower())

    def _calc_doc_freqs(self):
        freqs = defaultdict(int)
        for doc in self.tokenized_corpus:
            for word in set(doc):
                freqs[word] += 1
        return freqs

    def _idf(self, term):
        N = len(self.tokenized_corpus)
        df = self.doc_freqs.get(term, 0)
        return math.log(1 + (N - df + 0.5) / (df + 0.5))

    def get_scores(self, query_tokens):
        scores = [0.0] * len(self.tokenized_corpus)
        for idx, doc in enumerate(self.tokenized_corpus):
            freqs = Counter(doc)
            dl = self.doc_lens[idx]
            for term in query_tokens:
                idf = self._idf(term)
                tf = freqs[term]
                denom = tf + self.k1 * (1 - self.b + self.b * dl / self.avgdl)
                score = idf * ((tf * (self.k1 + 1)) / denom) if denom != 0 else 0
                scores[idx] += score
        return scores

def search_bm25(query, docs=None, top_n=10):  # docs may be passed in (as hybrid_retriever does) or loaded lazily
    from core.vector_store import load_all_text_chunks
    if docs is None:
        docs = load_all_text_chunks()
    bm25 = BM25(docs)
    query_tokens = re.findall(r"\w+", query.lower())
    scores = bm25.get_scores(query_tokens)
    top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]
    results = []
    for i in top_indices:
        doc = docs[i].copy()
        doc['score'] = scores[i]
        results.append(doc)
    return results
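
A minimal usage sketch of the scorer above (illustrative only, not part of the upload); it assumes a tiny in-memory corpus of dicts with a "text" key:

    from core.bm25 import BM25, search_bm25

    corpus = [
        {"text": "An adverse event is any untoward medical occurrence in a trial subject."},
        {"text": "The case report form captures data for each enrolled participant."},
    ]
    scores = BM25(corpus).get_scores(["adverse", "event"])
    print(scores)  # the first chunk should score higher
    top = search_bm25("adverse event", docs=corpus, top_n=1)  # same chunk, with a 'score' field added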
core/glossary.py
ADDED
@@ -0,0 +1,109 @@
# core/glossary.py

import json
import os
import re
from difflib import get_close_matches
from huggingface_hub import hf_hub_download

GLOSSARY = None
GLOSSARY_TERMS_CACHE = []  # 🧠 Cache of glossary keys for fuzzy matching
DATASET_REPO = "essprasad/CT-Chat-Index"
GLOSSARY_FILENAME = "persistent/glossary.json"


def _normalize_term(term: str) -> str:
    """Normalize glossary terms for matching, with fuzzy fallback."""
    if not term:
        return ""
    term = term.lower().strip()
    term = re.sub(r'[\-_/\\.,;:]+', ' ', term)
    term = re.sub(r'\s+', ' ', term)

    # Common clinical research synonym normalization
    term = term.replace("e crf", "ecrf").replace("e-crf", "ecrf").replace("e/crf", "ecrf").replace("e_crf", "ecrf")
    term = term.replace("electronic case report form", "ecrf")
    term = term.replace("case report form", "crf")
    term = term.replace("informed consent form", "icf")
    term = term.replace("good clinical practice", "gcp")
    term = term.replace("serious adverse event", "sae")
    term = term.replace("adverse event", "ae")
    term = term.replace("21 cfr part 11", "21cfrpart11")
    term = term.replace("clinical study report", "csr")

    term = term.strip()

    # 🧩 Fuzzy matching fallback (for plural/singular or typos)
    if GLOSSARY_TERMS_CACHE:
        if term not in GLOSSARY_TERMS_CACHE:
            close = get_close_matches(term, GLOSSARY_TERMS_CACHE, n=1, cutoff=0.85)
            if close:
                # return the closest key for better recall
                return close[0]

    return term


def _load_glossary():
    """Load glossary.json from Hugging Face Hub (cached)."""
    global GLOSSARY, GLOSSARY_TERMS_CACHE
    if GLOSSARY is not None:
        return GLOSSARY
    try:
        path = hf_hub_download(
            repo_id=DATASET_REPO,
            filename=GLOSSARY_FILENAME,
            repo_type="dataset",
        )
        with open(path, "r", encoding="utf-8") as f:
            raw = json.load(f)

        GLOSSARY = {}
        for k, vlist in raw.items():
            if not isinstance(k, str) or len(k.split()) > 12 or re.search(r'\d{4}', k):
                continue

            candidate_key = k
            if isinstance(vlist, dict):
                candidate_key = vlist.get("term") or vlist.get("name") or vlist.get("title") or k

            norm = _normalize_term(candidate_key)
            if not norm:
                continue

            if isinstance(vlist, dict):
                dfn = vlist.get("definition") or vlist.get("text") or ""
                sources = vlist.get("sources", [])
            elif isinstance(vlist, str):
                dfn = vlist
                sources = []
            else:
                dfn, sources = "", []

            if not dfn or len(dfn.strip()) < 5:
                continue

            if norm not in GLOSSARY:
                GLOSSARY[norm] = {
                    "term": candidate_key.strip(),
                    "definition": dfn.strip(),
                    "sources": sources if isinstance(sources, list) else []
                }
            else:
                # Merge sources if already exists
                existing = GLOSSARY[norm]
                existing_sources = set(existing.get("sources", []))
                new_sources = set(sources) if sources else set()
                existing["sources"] = list(existing_sources.union(new_sources))

        # 🧠 Store all glossary keys for fuzzy fallback
        GLOSSARY_TERMS_CACHE = list(GLOSSARY.keys())

        print(f"✅ Glossary downloaded and cached ({len(GLOSSARY)} entries).")
        return GLOSSARY
    except Exception as e:
        print(f"⚠️ Failed to load glossary from Hugging Face: {e}")
        return {}


__all__ = ["_load_glossary", "_normalize_term"]
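
A sketch of the lookup flow the retriever relies on (illustrative only); it assumes persistent/glossary.json is reachable on the essprasad/CT-Chat-Index dataset:

    from core.glossary import _load_glossary, _normalize_term

    glossary = _load_glossary()                             # downloads and caches glossary.json
    key = _normalize_term("Electronic Case Report Form")    # -> "ecrf" (fuzzy fallback may adjust it)
    entry = glossary.get(key)
    if entry:
        print(entry["term"], "-", entry["definition"][:80])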
core/glossary_builder.py
ADDED
@@ -0,0 +1,256 @@
"""
📘 glossary_builder.py
Builds a unified glossary from PDFs and Excel files.
- Extracts terms & definitions from PDFs.
- Merges Excel glossary (uses all definition-related columns with labeled formatting).
- Saves combined glossary.json locally and uploads to Hugging Face.
"""

import os
import re
import json
import time
import fitz
import pandas as pd
from huggingface_hub import upload_file, HfFolder, list_repo_files, hf_hub_download

# --- Configuration ---
DATASET_REPO = "essprasad/CT-Chat-Index"
DOCS_REPO = "essprasad/CT-Chat-Docs"
LOCAL_GLOSSARY = "/home/user/app/persistent/glossary.json"
REMOTE_GLOSSARY = "persistent/glossary.json"
TOKEN = HfFolder.get_token() or os.getenv("HF_TOKEN")


# --- Helpers ---
def normalize_term(term: str) -> str:
    if not term:
        return ""
    s = term.lower().strip()
    s = re.sub(r"[\-_/\\.,;:]+", " ", s)
    s = re.sub(r"\s+", " ", s)
    synonyms = {
        "electronic case report form": "ecrf",
        "case report form": "crf",
        "informed consent form": "icf",
        "good clinical practice": "gcp",
        "serious adverse event": "sae",
        "adverse event": "ae",
        "21 cfr part 11": "21cfrpart11",
        "clinical study report": "csr",
    }
    return synonyms.get(s, s)


def extract_text_from_pdf(pdf_path):
    """Extract plain text from a PDF file."""
    try:
        doc = fitz.open(pdf_path)
        text = "\n".join(page.get_text("text") for page in doc)
        doc.close()
        return text.strip()
    except Exception as e:
        print(f"⚠️ Failed to read {pdf_path}: {e}")
        return ""


def extract_definitions_from_text(text):
    """Extract glossary-like term-definition pairs from raw PDF text."""
    glossary = {}
    text = re.sub(r"\r", "", text)
    lines = [ln.strip() for ln in text.split("\n") if ln.strip()]
    i = 0
    while i < len(lines):
        term = lines[i].strip()
        if len(term) == 1 or term.isdigit() or re.fullmatch(r"[ivxlcdm]+", term.lower()):
            i += 1
            continue
        if any(x in term.lower() for x in ["contents", "chapter", "appendix", "index", "revision", "table of"]):
            i += 1
            continue
        if term.lower().startswith(("acronym for", "definition", "terms of")):
            i += 1
            continue

        defn_lines = []
        j = i + 1
        while j < len(lines):
            nxt = lines[j].strip()
            if re.match(r"^[A-Za-z]", nxt) and len(nxt.split()) <= 6 and not nxt.endswith("."):
                if not nxt.lower().startswith(("see also", "for example", "for instance")):
                    break
            defn_lines.append(nxt)
            j += 1

        definition = " ".join(defn_lines)
        definition = re.sub(r"(\w)-\s+(\w)", r"\1\2", definition)
        definition = re.sub(r"\s{2,}", " ", definition).strip()

        if len(definition.split()) < 5 or "." not in definition:
            i += 1
            continue

        norm = normalize_term(term)
        glossary[norm] = {"term": term, "definition": definition}
        i = j

    return glossary


# --- Main Rebuild Function ---
def rebuild_and_upload():
    start = time.time()
    print("📘 Starting glossary rebuild...")
    try:
        all_files = list_repo_files(repo_id=DOCS_REPO, repo_type="dataset", token=TOKEN)
        pdfs = [f for f in all_files if f.lower().endswith(".pdf")]
        excels = [f for f in all_files if f.lower().endswith((".xls", ".xlsx"))]
    except Exception as e:
        raise RuntimeError(f"Cannot list repo files: {e}")

    all_defs = {}

    # --- 1️⃣ Process PDFs ---
    for pdf in pdfs:
        skip_patterns = ["topic_", "template", "protocol", "schedule", "painac", "sas", "glossary_printable"]
        if any(sp in pdf.lower() for sp in skip_patterns):
            print(f"⏩ Skipping non-glossary or template file: {pdf}")
            continue
        try:
            print(f"🔍 Processing {pdf}...")
            path = hf_hub_download(repo_id=DOCS_REPO, filename=pdf, repo_type="dataset", token=TOKEN)
            text = extract_text_from_pdf(path)
            defs = extract_definitions_from_text(text)
            for k, v in defs.items():
                v.setdefault("sources", []).append(pdf)
                if k in all_defs:
                    all_defs[k]["sources"] = list(set(all_defs[k].get("sources", []) + v["sources"]))
                else:
                    all_defs[k] = v
        except Exception as e:
            print(f"⚠️ Failed {pdf}: {e}")

    # --- 2️⃣ Process Excel Glossaries (MRCT etc.) ---
    for excel in excels:
        try:
            print(f"📗 Checking Excel file in dataset: {excel}")
            excel_path = hf_hub_download(
                repo_id=DOCS_REPO,
                filename=excel,
                repo_type="dataset",
                token=TOKEN
            )
            print(f"✅ Downloaded Excel file to {excel_path}")
            xls = pd.read_excel(excel_path, sheet_name=None)

            total_rows = 0
            excel_entries = []

            for sheet_name, df in xls.items():
                df = df.fillna("").dropna(how="all")
                if df.shape[0] == 0:
                    continue
                df.columns = [str(c).strip() for c in df.columns]

                # 🧠 Detect the correct 'Glossary Term' column
                term_col = None
                for c in df.columns:
                    if "glossary term" in c.lower():
                        term_col = c
                        break
                if not term_col:
                    for c in df.columns:
                        if "cdisc" in c.lower() or c.lower() == "term":
                            term_col = c
                            break
                if not term_col:
                    print(f"⚠️ No 'Glossary Term' column found in sheet {sheet_name} of {excel_path}")
                    continue

                # Concatenate all relevant columns with labels for clarity
                for _, row in df.iterrows():
                    term = str(row.get(term_col, "")).strip()
                    if not term:
                        continue

                    def_cols = [
                        c for c in df.columns
                        if any(k in c.lower() for k in [
                            "definition", "context", "info", "related", "resource", "use in context"
                        ])
                    ]

                    def_parts = []
                    for c in def_cols:
                        val = str(row.get(c, "")).strip()
                        if val:
                            def_parts.append(f"<b>{c}:</b> {val}")

                    full_definition = "<br>".join(def_parts).strip()
                    if not full_definition:
                        continue

                    entry = {
                        "term": term,
                        "definition": full_definition,
                        "source": os.path.basename(excel_path),
                        "sheet": sheet_name,
                        "type": "Excel",
                    }
                    excel_entries.append(entry)
                    total_rows += 1

            print(f"✅ Added {total_rows} Excel rows from {excel_path}")

            # Merge into main glossary dictionary
            for e in excel_entries:
                norm = normalize_term(e["term"])
                payload = {
                    "term": e["term"],
                    "definition": e["definition"],
                    "sources": [e["source"]],
                    "type": e.get("type", "Excel"),
                    "sheet": e.get("sheet"),
                }
                # Each term+source pair stored uniquely to preserve different definitions
                unique_key = f"{norm}__{os.path.basename(payload['sources'][0])}" if payload.get("sources") else norm

                if unique_key not in all_defs:
                    all_defs[unique_key] = payload
                else:
                    # Avoid duplicate merges — just append any new sources
                    existing = all_defs[unique_key]
                    existing_sources = set(existing.get("sources", []))
                    new_sources = set(payload.get("sources", []))
                    existing["sources"] = list(existing_sources.union(new_sources))

        except Exception as e:
            print(f"⚠️ Failed to process Excel {excel}: {e}")

    # --- 3️⃣ Save combined glossary ---
    os.makedirs(os.path.dirname(LOCAL_GLOSSARY), exist_ok=True)
    with open(LOCAL_GLOSSARY, "w", encoding="utf-8") as f:
        json.dump(all_defs, f, indent=2, ensure_ascii=False)

    print(f"✅ Saved {len(all_defs)} entries → {LOCAL_GLOSSARY}")

    # --- 4️⃣ Upload to Hugging Face ---
    if TOKEN:
        try:
            upload_file(
                path_or_fileobj=LOCAL_GLOSSARY,
                path_in_repo=REMOTE_GLOSSARY,
                repo_id=DATASET_REPO,
                repo_type="dataset",
                token=TOKEN,
                commit_message="Glossary updated with PDF + Excel (column-labeled definitions)",
            )
            print(f"🚀 Uploaded to {DATASET_REPO}/{REMOTE_GLOSSARY}")
        except Exception as e:
            print(f"⚠️ Upload error: {e}")

    print(f"✨ Done in {time.time() - start:.1f}s.")


if __name__ == "__main__":
    rebuild_and_upload()
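
The PDF parser above treats a short line as a term and the following longer lines as its definition. A quick local check of that heuristic (illustrative only, made-up text; it assumes PyMuPDF and pandas are installed, since the module imports them at load time):

    from core.glossary_builder import extract_definitions_from_text

    sample = (
        "Adverse Event\n"
        "Any untoward medical occurrence in a subject, whether or not related to the product under study.\n"
    )
    print(extract_definitions_from_text(sample))
    # {'ae': {'term': 'Adverse Event', 'definition': 'Any untoward medical occurrence in a subject, ...'}}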
core/hybrid_retriever.py
ADDED
@@ -0,0 +1,269 @@
"""
Hybrid Retriever with Glossary + FAISS + BM25 support.
Includes full-paragraph glossary definitions, acronym expansion, and Excel prioritization.
"""

import os
import re
import time
from urllib.parse import urlparse

from core.glossary import _load_glossary, _normalize_term
from core.vector_store import _ensure_faiss_index, search_index, load_all_text_chunks
from core.bm25 import search_bm25
from utils.nlp_helpers import extract_van_tokens, normalize_query_text

DOCS_REPO = "essprasad/CT-Chat-Docs"
DENSE_TOP_K = 10


def _find_sentence_containing(text: str, phrase: str) -> str:
    """Return the sentence that contains the given phrase."""
    if not text or not phrase:
        return ""
    sentences = re.split(r"[.!?\n\r]", text)
    phrase = phrase.lower()
    for s in sentences:
        if phrase in s.lower():
            return s.strip()
    return ""


def summarize_combined(query: str, mode="short") -> str:
    start = time.time()
    if not query.strip():
        return "<i>No query provided.</i>"

    # ------------------------------------------------------------------
    # 🧠 VAN-Based Query Normalization
    # ------------------------------------------------------------------
    expanded_query = normalize_query_text(query)
    van_tokens = extract_van_tokens(expanded_query)
    van_query = " ".join(van_tokens).strip()
    normalized_query = van_query or query

    print(f"🔍 summarize_combined() query='{query}' van_query='{van_query}' normalized_query='{normalized_query}'")

    glossary = _load_glossary()

    # ------------------------------------------------------------------
    # 1️⃣ Acronym Map (derived from GCDMP_Glossary.pdf)
    # ------------------------------------------------------------------
    acronym_map = {
        "adr": "adverse drug reaction",
        "ae": "adverse event",
        "asp": "application service provider",
        "asq": "american society for quality",
        "ca": "corrective action",
        "cdisc": "clinical data interchange standards consortium",
        "clia": "clinical laboratory improvement amendments",
        "crf": "case report form",
        "cro": "contract research organization",
        "cs": "clinically significant",
        "ehr": "electronic health record",
        "emr": "electronic medical record",
        "eu": "european union",
        "gcp": "good clinical practice",
        "idmc": "independent data-monitoring committee",
        "iec": "independent ethics committee",
        "ind": "investigational new drug application",
        "irb": "institutional review board",
        "iso": "international organization for standardization",
        "iom": "institute of medicine",
        "iss": "integrated summary of safety",
        "ise": "integrated summary of efficacy",
        "meddra": "medical dictionary for regulatory activities",
        "mrct": "multi-regional clinical trials",
        "ncs": "non clinically significant",
        "nda": "new drug application",
        "ocr": "optical character recognition",
        "qa": "quality assurance",
        "qc": "quality control",
        "sae": "serious adverse event",
        "sla": "service level agreement",
        "sop": "standard operating procedure",
        "spc": "statistical process control",
        "sqc": "statistical quality control",
        "uat": "user acceptance testing",
        "vcl": "virtual central lab",
        "whodrug": "world health organization drug dictionary",
    }

    acronym_glossary_hits = []

    # ------------------------------------------------------------------
    # 2️⃣ Direct Glossary Match (and handle acronyms)
    # ------------------------------------------------------------------
    short_candidate = (van_query or normalized_query).strip().lower()
    glossary_key = _normalize_term(short_candidate)

    # If query matches acronym, expand it
    if glossary_key in acronym_map:
        expansion = acronym_map[glossary_key]
        glossary_key = _normalize_term(expansion)
        print(f"🔁 Acronym expansion: '{short_candidate}' → '{expansion}'")

    if glossary and glossary_key in glossary:
        entry = glossary[glossary_key]
        term_display = entry.get("term", glossary_key)
        dfn = entry.get("definition") or entry.get("text") or ""
        sources = entry.get("sources", []) or ["unspecified"]

        html = f"<h3>🧠 Definitions for '{term_display}':</h3>"
        for src in sources:
            html += f"🔹 <b>Source:</b> {src}<br><blockquote>{dfn}</blockquote>"
        print(f"✅ Glossary match for '{glossary_key}' in {time.time() - start:.2f}s")
        return html

    # ------------------------------------------------------------------
    # 3️⃣ FAISS Dense Retrieval
    # ------------------------------------------------------------------
    dense_query = normalized_query
    dense_hits = []
    try:
        if _ensure_faiss_index():
            dense_hits = search_index(dense_query, top_k=DENSE_TOP_K) or []
    except Exception as e:
        print(f"⚠️ FAISS search failed: {e}")
    print(f"📚 Dense hits before filtering: {len(dense_hits)}")

    # ------------------------------------------------------------------
    # 4️⃣ Acronym Filtering (Lenient Match)
    # ------------------------------------------------------------------
    if len(normalized_query.split()) == 1 and len(normalized_query) <= 5:
        key = normalized_query.lower()
        expansion = acronym_map.get(key, key)
        pattern = re.compile(
            rf"\b{re.escape(key)}\b|{re.escape(expansion)}|{key}[\s\-\.:;)]|[\(\s]{key}[\s\-\.:;)]",
            re.IGNORECASE,
        )
        filtered_hits = [h for h in dense_hits if pattern.search((h.get("definition") or h.get("text") or "").lower())]
        print(f"🔍 Filtered acronym hits: {len(filtered_hits)} (lenient match incl. '{expansion}')")
        dense_hits = filtered_hits
    else:
        print(f"ℹ️ No acronym filtering applied (query length > 5 chars)")
    print(f"📚 Dense hits after filtering: {len(dense_hits)}")

    # ------------------------------------------------------------------
    # 5️⃣ BM25 Fallback (Lexical)
    # ------------------------------------------------------------------
    bm25_hits = []
    try:
        docs = load_all_text_chunks()
        if docs:
            bm25_hits = search_bm25(normalized_query, docs, top_n=10)
    except Exception as e:
        print(f"⚠️ BM25 fallback failed: {e}")
    print(f"📑 BM25 hits: {len(bm25_hits)}")

    # ------------------------------------------------------------------
    # 🧩 Merge & Prioritize — Keep all per-source definitions
    # ------------------------------------------------------------------
    hits = dense_hits + bm25_hits
    if not hits:
        return "<i>No relevant information found.</i>"

    def score_hit(h):
        """Prioritize Excel > PDF > Web > Other"""
        text = (h.get("definition") or h.get("text") or "").lower()
        src = h.get("file") or h.get("source") or ""
        src_type = (h.get("type") or "").lower()
        key = (van_query or normalized_query).lower()
        score = 0
        if "excel" in src_type or src.lower().endswith((".xls", ".xlsx")):
            score += 10
        elif src.lower().endswith(".pdf"):
            score += 5
        elif src.startswith("http"):
            score += 2
        if key in text:
            score += 3
        return -score

    hits = sorted(hits, key=score_hit)

    # ------------------------------------------------------------------
    # 6️⃣ Compose Final Answer — With Icons, Tooltips, & Hyperlinks
    # ------------------------------------------------------------------
    answers = []
    src_counts = {"web": 0, "pdf": 0, "excel": 0, "other": 0}

    for h in hits:
        txt = h.get("definition") or h.get("text") or ""
        if not txt.strip():
            continue

        src = h.get("file") or h.get("source") or "unknown"
        src_base = os.path.basename(src)
        src_type = (h.get("type") or "").lower()
        term_name = h.get("term") or (van_query or normalized_query)

        # --- Categorize source
        if "excel" in src_type or src.lower().endswith((".xls", ".xlsx")):
            icon, cat = "📘", "excel"
        elif "website" in src_type or src.startswith("http"):
            icon, cat = "🌐", "web"
        elif src.lower().endswith(".pdf"):
            icon, cat = "📄", "pdf"
        else:
            icon, cat = "📁", "other"

        src_counts[cat] += 1

        # --- Extract URL if present
        url = ""
        if "http" in src:
            url = src
        elif "http" in txt:
            match = re.search(r"https?://\S+", txt)
            if match:
                url = match.group(0).rstrip(".,)")

        # --- Extract relevant paragraph
        paragraphs = re.split(r"\n\s*\n", txt)
        matched_paragraph = ""
        for p in paragraphs:
            if normalized_query.lower() in p.lower() or (van_query and van_query.lower() in p.lower()):
                matched_paragraph = p.strip()
                break
        excerpt = matched_paragraph or txt.strip()

        if len(excerpt) > 2000:
            excerpt = excerpt[:2000] + "..."

        # --- Convert URLs and highlight terms
        excerpt = re.sub(r"(?i)source url:\s*", "", excerpt)
        excerpt = re.sub(r"(https?://[^\s<>'\"]+)", r"<a href='\1' target='_blank'>\1</a>", excerpt)
        excerpt = re.sub(f"(?i)({re.escape(normalized_query)})", r"<mark>\1</mark>", excerpt)
        if term_name:
            excerpt = re.sub(f"(?i)({re.escape(term_name)})", r"<b>\1</b>", excerpt)

        # --- Build formatted output
        if url:
            parsed = urlparse(url)
            display_name = parsed.netloc or src_base
            link_html = f"<b>{icon} <a href='{url}' target='_blank'>{display_name}</a></b>"
        else:
            link_html = f"<b>{icon} {src_base}</b>"

        answers.append(f"{link_html}<br><blockquote>{excerpt}</blockquote>")

        if len(answers) >= 6:
            break

    # ------------------------------------------------------------------
    # 7️⃣ Final HTML Output
    # ------------------------------------------------------------------
    summary_counts = " | ".join(f"{k.capitalize()}: {v}" for k, v in src_counts.items() if v > 0)
    print(f"✅ Answers from {len(answers)} sources in {time.time() - start:.2f}s")

    expansion_note = ""
    if normalized_query.lower() in acronym_map:
        expansion_note = f"<p><i>🔁 Acronym expanded: <b>{normalized_query.upper()}</b> → {acronym_map[normalized_query.lower()]}</i></p>"

    return (
        f"<h3>🧠 Answers (one per source):</h3>"
        + expansion_note
        + f"<p><i>Sources → {summary_counts}</i></p>"
        + "<br>".join(answers)
    )
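
summarize_combined() is the single entry point a chat UI would call. A sketch of a call (illustrative only; it assumes the glossary and FAISS index described above are available, and that utils.nlp_helpers, which is not part of this upload, provides normalize_query_text and extract_van_tokens):

    from core.hybrid_retriever import summarize_combined

    html = summarize_combined("sae")   # acronym expands to "serious adverse event"
    print(html[:200])                  # HTML answer block with per-source blockquotes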
core/retrieval.py
ADDED
@@ -0,0 +1,25 @@
import os
import re
import json
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser

WHOOSH_INDEX_PATH = "/home/user/app/persistent/whoosh_index"

_ix = None

def _load_whoosh():
    global _ix
    if _ix is None and os.path.exists(WHOOSH_INDEX_PATH):
        _ix = open_dir(WHOOSH_INDEX_PATH)
    return _ix

def _bm25_search(query, top_n=10):
    ix = _load_whoosh()
    if not ix:
        return []
    parser = MultifieldParser(["text", "title"], schema=ix.schema)
    q = parser.parse(query)
    with ix.searcher() as s:
        results = s.search(q, limit=top_n)
        return [{"text": r["text"], "file": r.get("file", "")} for r in results]
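
This module is a Whoosh-based lexical path; it expects a pre-built index at WHOOSH_INDEX_PATH with at least "text" and "title" fields. A sketch of a call against such an index (illustrative only):

    from core.retrieval import _bm25_search

    for hit in _bm25_search("informed consent", top_n=3):
        print(hit["file"], "->", hit["text"][:80])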
core/van_normalizer.py
ADDED
@@ -0,0 +1,57 @@
# core/van_normalizer.py
import re
import nltk
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer

# make sure you have these (run once if missing):
# python -m nltk.downloader punkt averaged_perceptron_tagger wordnet omw-1.4

lemmatizer = WordNetLemmatizer()

def normalize_to_van(text: str) -> str:
    """
    VAN-based normalization (optimized for clinical trial domain):
    - Lowercases and removes punctuation
    - Tokenizes and POS-tags
    - Keeps only Nouns (N), Adjectives (J), and key Verbs (V)
    - Explicitly removes determiners/articles (a, an, the)
    - Lemmatizes each token to its base form
    - Returns a space-joined string suitable for FAISS embedding
    """
    if not text:
        return ""

    # Basic cleanup
    text = text.lower().strip()
    text = re.sub(r"[^a-z0-9\s-]", " ", text)  # remove punctuation
    tokens = word_tokenize(text)

    # POS tagging
    tagged = pos_tag(tokens)

    filtered = []
    for word, tag in tagged:
        # Skip common determiners, articles, and auxiliary verbs
        if word in {"a", "an", "the", "is", "are", "was", "were", "be", "been", "being"}:
            continue

        # Keep only verbs, adjectives, and nouns
        if tag.startswith("V") or tag.startswith("J") or tag.startswith("N"):
            filtered.append((word, tag))

    # Lemmatize each word to its appropriate part of speech
    lemmas = []
    for word, tag in filtered:
        pos = (
            "v" if tag.startswith("V")
            else "a" if tag.startswith("J")
            else "n"
        )
        lemmas.append(lemmatizer.lemmatize(word, pos))

    # Join and clean
    normalized = " ".join(lemmas).strip()
    normalized = re.sub(r"\s+", " ", normalized)  # collapse multiple spaces
    return normalized
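
A quick check of the VAN (verb/adjective/noun) normalization (illustrative only; it assumes the NLTK resources listed in the comment above have been downloaded):

    from core.van_normalizer import normalize_to_van

    print(normalize_to_van("What are the serious adverse events reported in the trial?"))
    # e.g. "serious adverse event report trial" (exact lemmas depend on the NLTK tagger)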
core/vector_search.py
ADDED
@@ -0,0 +1,107 @@
"""
core/vector_search.py
-----------------------------------------------------
Performs FAISS semantic search for hybrid retrieval.
Includes:
- SentenceTransformer embedding for query
- FAISS similarity search
- Metadata + citation extraction
- Robust fallback if index missing
"""

import os
import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# Paths (shared with vector_store/vector_sync)
FAISS_INDEX = "persistent/faiss.index"
FAISS_META = "persistent/faiss.index.meta.json"

_model = None
_index = None
_meta = []


# ----------------------------
# 🔹 Loaders
# ----------------------------
def _load_model():
    """Lazy-load embedding model."""
    global _model
    if _model is None:
        print("📥 Loading embedding model for retrieval...")
        _model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        print("✅ Model loaded.")
    return _model


def _load_faiss():
    """Load FAISS index + metadata, prefer local persistent copy."""
    global _index, _meta
    if _index is not None:
        return _index, _meta

    local_index = "/home/user/app/persistent/faiss.index"
    local_meta = "/home/user/app/persistent/faiss.index.meta.json"

    if os.path.exists(local_index) and os.path.exists(local_meta):
        print("📂 [vector_search] Using local FAISS index.")
        _index = faiss.read_index(local_index)
        with open(local_meta, "r", encoding="utf-8") as f:
            _meta = json.load(f)
        print(f"✅ Loaded local FAISS index ({len(_meta)} entries).")
        return _index, _meta

    print("☁️ [vector_search] Local FAISS missing, using fallback remote index.")
    return _index, _meta

# ----------------------------
# 🔹 Core Query Function
# ----------------------------
def query_faiss(query: str, top_k: int = 5):
    """
    Perform FAISS semantic similarity search.
    Returns:
        results: list of matched text chunks
        meta: list of metadata dicts (with citations)
    """
    index, meta = _load_faiss()
    if index is None or len(meta) == 0:
        return [], []

    model = _load_model()
    q_emb = np.array(model.encode([query]), dtype=np.float32)
    D, I = index.search(q_emb, top_k)

    results, citations = [], []
    for idx in I[0]:
        if 0 <= idx < len(meta):
            doc = meta[idx]
            text = clean_text(doc.get("text", ""))
            src = doc.get("source", "Unknown Source")

            citation = f"📄 <b>Source:</b> {os.path.basename(src)}"
            results.append(text)
            citations.append(citation)

    return results, citations


# ----------------------------
# 🔹 Utilities
# ----------------------------
def clean_text(text: str, max_len: int = 800):
    """
    Truncate and clean text for readability.
    """
    text = text.replace("\n", " ").replace("  ", " ").strip()
    if len(text) > max_len:
        text = text[:max_len].rsplit(" ", 1)[0] + "..."
    return text


def has_index():
    """Check if FAISS index is available."""
    return os.path.exists(FAISS_INDEX) and os.path.exists(FAISS_META)
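
query_faiss() returns parallel lists of chunk texts and citation strings. A sketch of a call (illustrative only; it assumes persistent/faiss.index and its meta.json exist locally):

    from core.vector_search import query_faiss, has_index

    if has_index():
        chunks, citations = query_faiss("data management plan", top_k=3)
        for text, cite in zip(chunks, citations):
            print(cite, "|", text[:80])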
core/vector_store.py
ADDED
@@ -0,0 +1,181 @@
"""
core/vector_store.py
------------------------------------------------------------
Unified FAISS + BM25 storage utility for Clinical-Trial Chatbot.

✅ Works with glossary.json or FAISS metadata
✅ Returns normalized dicts for hybrid_retriever
✅ Adds load_all_text_chunks() for BM25 fallback
✅ Safe against missing files
"""

import os
import re
import json
import faiss
from sentence_transformers import SentenceTransformer

# Globals used by retriever
_index = None
_model = None
_meta = None


# --------------------------------------------------------------------
# 1️⃣ Utility: load FAISS index + metadata (MVP version)
# --------------------------------------------------------------------
def _ensure_faiss_index():
    """Load FAISS index and metadata — prefer local persistent files, fallback to Hugging Face dataset."""
    global _index, _model, _meta
    if _index is not None and _meta is not None:
        return True

    import json
    from huggingface_hub import hf_hub_download

    local_dir = "/home/user/app/persistent"
    local_index = os.path.join(local_dir, "faiss.index")
    local_meta = os.path.join(local_dir, "faiss.index.meta.json")

    # 1️⃣ Prefer local FAISS (rebuilt and includes URL + Excel)
    if os.path.exists(local_index) and os.path.exists(local_meta):
        print("📂 Using local FAISS index (includes Excel + Web sources).")
        _index = faiss.read_index(local_index)
        with open(local_meta, "r", encoding="utf-8") as f:
            _meta = json.load(f)
        _model = SentenceTransformer("all-MiniLM-L6-v2")
        print(f"✅ [vector_store] Loaded local FAISS ({len(_meta)} vectors).")
        return True

    # 2️⃣ Fallback: remote dataset
    print("☁️ Local FAISS missing — downloading from Hugging Face dataset...")
    repo_id = "essprasad/CT-Chat-Index"
    repo_type = "dataset"
    runtime_dir = "/home/user/app/runtime_faiss"
    os.makedirs(runtime_dir, exist_ok=True)

    index_path = hf_hub_download(
        repo_id=repo_id,
        filename="persistent/faiss.index",
        repo_type=repo_type,
        local_dir=runtime_dir,
        cache_dir=runtime_dir,
        force_download=True,
    )
    meta_path = hf_hub_download(
        repo_id=repo_id,
        filename="persistent/faiss.index.meta.json",
        repo_type=repo_type,
        local_dir=runtime_dir,
        cache_dir=runtime_dir,
        force_download=True,
    )

    print(f"🧠 [vector_store] Loading FAISS index + metadata from {runtime_dir} ...")
    _index = faiss.read_index(index_path)
    with open(meta_path, "r", encoding="utf-8") as f:
        _meta = json.load(f)
    _model = SentenceTransformer("all-MiniLM-L6-v2")
    print(f"✅ [vector_store] Loaded remote FAISS ({len(_meta)} vectors).")
    return True

# --------------------------------------------------------------------
# 2️⃣ Helper: Load all text chunks (for BM25 fallback)
# --------------------------------------------------------------------
def load_all_text_chunks():
    """
    Return list of dicts for BM25 fallback and inspection.
    Each dict: {'text', 'file', 'source', 'term', '_meta'}
    """
    meta_path = os.path.join("persistent", "faiss.index.meta.json")
    gloss_path = os.path.join("persistent", "glossary.json")
    docs = []

    # Prefer FAISS meta (vector_sync output)
    if os.path.exists(meta_path):
        try:
            with open(meta_path, "r", encoding="utf-8") as f:
                meta = json.load(f)
            for m in meta:
                text = m.get("definition") or m.get("text") or m.get("chunk") or ""
                sources = m.get("sources") or m.get("source") or m.get("file") or []
                if isinstance(sources, list) and sources:
                    src = sources[0]
                elif isinstance(sources, str) and sources:
                    src = sources
                else:
                    src = m.get("file") or m.get("source") or "unknown"
                docs.append({
                    "text": text,
                    "file": src,
                    "source": src,
                    "term": m.get("term") or m.get("normalized") or "",
                    "_meta": m
                })
            return docs
        except Exception as e:
            print(f"⚠️ [vector_store] Failed to read meta.json: {e}")

    # fallback: glossary.json
    if os.path.exists(gloss_path):
        try:
            with open(gloss_path, "r", encoding="utf-8") as f:
                gloss = json.load(f)
            for k, v in gloss.items():
                term = v.get("term", k)
                definition = v.get("definition", "")
                srcs = v.get("sources", [])
                src = srcs[0] if isinstance(srcs, list) and srcs else (srcs if isinstance(srcs, str) else "glossary")
                docs.append({
                    "text": definition,
                    "file": src,
                    "source": src,
                    "term": term,
                    "_meta": {"glossary_key": k}
                })
            return docs
        except Exception as e:
            print(f"⚠️ [vector_store] Failed to read glossary.json: {e}")

    return docs


# --------------------------------------------------------------------
# 3️⃣ FAISS Search
# --------------------------------------------------------------------
def search_index(query, top_k=10):
    """
    Search FAISS and return a list of dict hits for hybrid_retriever.
    Each hit: {'text','file','source','term','_score','_meta'}
    """
    global _index, _model, _meta
    if not _ensure_faiss_index():
        return []

    q_emb = _model.encode([query], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(q_emb)
    D, I = _index.search(q_emb, top_k)

    results = []
    for score, idx in zip(D[0].tolist(), I[0].tolist()):
        if idx < 0 or idx >= len(_meta):
            continue
        m = _meta[idx] if isinstance(_meta[idx], dict) else {"raw": str(_meta[idx])}
        text = m.get("definition") or m.get("text") or m.get("chunk") or ""
        srcs = m.get("sources") or m.get("source") or m.get("file") or []
        if isinstance(srcs, list) and srcs:
            src = srcs[0]
        elif isinstance(srcs, str) and srcs:
            src = srcs
        else:
            src = m.get("file") or m.get("source") or "unknown"

        results.append({
            "text": text,
            "file": src,
            "source": src,
            "term": m.get("term") or m.get("normalized") or "",
            "_score": float(score),
            "_meta": m
        })
    return results
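
search_index() produces the hit dicts that hybrid_retriever consumes. A sketch of the call and the hit shape (illustrative only; it assumes a FAISS index can be loaded locally or pulled from the dataset):

    from core.vector_store import search_index

    for hit in search_index("case report form", top_k=3):
        print(round(hit["_score"], 3), hit["source"], "->", hit["text"][:80])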
core/vector_sync.py
ADDED
@@ -0,0 +1,200 @@
"""
core/vector_sync.py
------------------------------------------------------------
Handles FAISS index rebuild + upload to Hugging Face dataset
without caching, optimized for limited HF Space storage.
"""

import os
import re
import json
import faiss
import numpy as np
from pathlib import Path
from huggingface_hub import HfApi, hf_hub_download, upload_file, HfFolder
from sentence_transformers import SentenceTransformer
from nltk.stem import WordNetLemmatizer
from core.van_normalizer import normalize_to_van

# ==========================================================
# Helper: Upload FAISS index + metadata to dataset safely
# ==========================================================
from huggingface_hub import HfApi

def _upload_to_dataset(index_path: str, meta_path: str, repo_id: str):
    """
    Upload FAISS index + metadata to Hugging Face dataset safely.
    Used by rebuild_index() in app.py.
    """
    try:
        print(f"🚀 [vector_sync] Uploading FAISS index + metadata to {repo_id}...")
        api = HfApi()

        for path in [index_path, meta_path]:
            if not os.path.exists(path):
                print(f"⚠️ [vector_sync] Skipping {os.path.basename(path)} (not found locally).")
                continue

            api.upload_file(
                path_or_fileobj=path,
                path_in_repo=f"persistent/{os.path.basename(path)}",
                repo_id=repo_id,
                repo_type="dataset",
                commit_message=f"Auto-upload {os.path.basename(path)}",
            )
            print(f"✅ [vector_sync] Uploaded {os.path.basename(path)}")

    except Exception as e:
        print(f"⚠️ [vector_sync] Upload failed: {e}")

# --------------------------------------------------------------------
# ⚙️ CONFIGURATION
# --------------------------------------------------------------------
REPO_ID = "essprasad/CT-Chat-Index"
REPO_TYPE = "dataset"
REMOTE_DIR = "persistent/"
FILES = ["faiss.index", "faiss.index.meta.json"]

api = HfApi()
token = HfFolder.get_token() or os.getenv("HF_TOKEN")

# --------------------------------------------------------------------
# 🔹 NORMALIZATION HELPERS
# --------------------------------------------------------------------
lemmatizer = WordNetLemmatizer()

def normalize_for_index(term: str) -> str:
    """Normalize term for embedding."""
    if not term:
        return ""
    s = term.lower().strip()
    s = re.sub(r"[\-_/\\.,;:()]+", " ", s)
    s = re.sub(r"\s+", " ", s)
    words = s.split()
    s = " ".join([lemmatizer.lemmatize(w) for w in words])
    return s.strip()

def prepare_text_for_embedding(term: str, definition: str) -> str:
    """Prepare text for embedding with VAN normalization."""
    if not term:
        return ""
    t = term.lower().strip()
    t = re.sub(r"[^\w\s-]", " ", t)
    d = re.sub(r"\s+", " ", definition.strip())
    t_van = normalize_to_van(t)
    return f"{t_van}. {d}".strip()

# --------------------------------------------------------------------
# 🔹 1. IMPORT: Download FAISS from Hub (on-demand)
# --------------------------------------------------------------------
def auto_import_from_hub(force=False):
    print(f"📥 [vector_sync] Checking for FAISS index on {REPO_ID}...")
    try:
        for fname in FILES:
            print(f"⬇️ Downloading {fname} ...")
            hf_hub_download(
                repo_id=REPO_ID,
                filename=f"{REMOTE_DIR}{fname}",
                repo_type=REPO_TYPE,
                local_dir="/home/user/app/tmp",
                cache_dir="/home/user/app/tmp",
                local_dir_use_symlinks=False,
                token=token,
                force_download=True,
            )
        print("✅ FAISS index + metadata downloaded.")
    except Exception as e:
        print(f"⚠️ [vector_sync] Could not import FAISS files: {e}")

# --------------------------------------------------------------------
# 🔹 2. EXPORT: Upload FAISS to Hub
# --------------------------------------------------------------------
def auto_export_to_hub(commit_msg="Auto-sync after rebuild"):
    """Uploads FAISS index + metadata from /tmp/ to the dataset."""
    if not token:
        print("⚠️ [vector_sync] No HF token found. Skipping upload.")
        return
    print(f"🚀 [vector_sync] Uploading FAISS index + metadata to {REPO_ID}...")

    try:
        api.upload_file(
            path_or_fileobj="/home/user/app/tmp/faiss.index",
            path_in_repo="persistent/faiss.index",
            repo_id=REPO_ID,
            repo_type=REPO_TYPE,
            token=token,
            commit_message=commit_msg,
        )
        api.upload_file(
            path_or_fileobj="/home/user/app/tmp/faiss.index.meta.json",
            path_in_repo="persistent/faiss.index.meta.json",
            repo_id=REPO_ID,
            repo_type=REPO_TYPE,
            token=token,
            commit_message=commit_msg,
        )
        print("✅ [vector_sync] Upload complete.")
    except Exception as e:
        print(f"⚠️ [vector_sync] Upload failed: {e}")

# --------------------------------------------------------------------
# 🔹 3. REBUILD: Create FAISS from glossary.json
# --------------------------------------------------------------------
def rebuild_faiss_from_glossary(
    glossary_path="/home/user/app/persistent/glossary.json",
    model_name="all-MiniLM-L6-v2",
):
    """Rebuild FAISS index from glossary.json (no caching, low footprint)."""
    try:
        print(f"🧠 [vector_sync] Rebuilding FAISS from: {glossary_path}")
        if not os.path.isfile(glossary_path):
            print(f"⚠️ Glossary not found: {glossary_path}")
            return None, None

        with open(glossary_path, "r", encoding="utf-8") as f:
            glossary = json.load(f)
        print(f"📘 Loaded {len(glossary)} glossary entries.")

        model = SentenceTransformer(model_name)
        texts, metas = [], []
        for k, v in glossary.items():
            term = v.get("term", k)
            definition = v.get("definition", "")
            sources = v.get("sources", [])
            if not definition.strip():
                continue
            combined = prepare_text_for_embedding(term, definition)
            texts.append(combined)
            metas.append({"term": term, "definition": definition, "sources": sources})

        if not texts:
            print("⚠️ No valid glossary entries for embedding.")
            return None, None

        print(f"🧩 Encoding {len(texts)} entries with {model_name}...")
        embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True).astype("float32")
        faiss.normalize_L2(embeddings)
        dim = embeddings.shape[1]
        index = faiss.IndexFlatIP(dim)
        index.add(embeddings)

        tmp_dir = "/home/user/app/tmp"
        os.makedirs(tmp_dir, exist_ok=True)
        tmp_index = os.path.join(tmp_dir, "faiss.index")
        tmp_meta = os.path.join(tmp_dir, "faiss.index.meta.json")

        faiss.write_index(index, tmp_index)
        with open(tmp_meta, "w", encoding="utf-8") as f:
            json.dump(metas, f, indent=2, ensure_ascii=False)

        # Upload and cleanup
        auto_export_to_hub("Glossary-based FAISS rebuild")
        os.remove(tmp_index)
        os.remove(tmp_meta)

        print(f"✅ [vector_sync] Rebuild complete — {len(texts)} vectors uploaded to dataset.")
        return index, metas

    except Exception as e:
        print(f"⚠️ Error in rebuild_faiss_from_glossary: {e}")
        return None, None
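
rebuild_faiss_from_glossary() encodes every glossary definition, writes faiss.index plus its metadata to /home/user/app/tmp, uploads both to the dataset, and removes the temporary files. A sketch of a manual rebuild run (illustrative only; it assumes glossary.json exists at the default path and HF_TOKEN is set for the upload step):

    from core.vector_sync import rebuild_faiss_from_glossary

    index, metas = rebuild_faiss_from_glossary()   # default glossary path and MiniLM model
    if index is not None:
        print(f"{index.ntotal} vectors, first term: {metas[0]['term']}")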