# core/glossary.py
"""Glossary loading and term normalization for clinical-trials chat.

Downloads ``glossary.json`` from a Hugging Face dataset repo, cleans and
de-duplicates its entries, and exposes fuzzy-tolerant term normalization.
"""

import json
import os
import re
from difflib import get_close_matches

# Lazily populated by _load_glossary(); maps normalized term -> entry dict
# of shape {"term": str, "definition": str, "sources": list}.
GLOSSARY = None
# 🧠 Cache of glossary keys, used by _normalize_term() for fuzzy matching.
GLOSSARY_TERMS_CACHE = []

DATASET_REPO = "essprasad/CT-Chat-Index"
GLOSSARY_FILENAME = "persistent/glossary.json"

# Long-form clinical-research phrases mapped to their standard acronyms.
# Order matters: "serious adverse event" must be rewritten before
# "adverse event", which is a substring of it. The punctuation regex in
# _normalize_term() has already collapsed "e-crf"/"e/crf"/"e_crf" into
# "e crf", so a single entry covers all those variants.
_SYNONYMS = [
    ("e crf", "ecrf"),
    ("electronic case report form", "ecrf"),
    ("case report form", "crf"),
    ("informed consent form", "icf"),
    ("good clinical practice", "gcp"),
    ("serious adverse event", "sae"),
    ("adverse event", "ae"),
    ("21 cfr part 11", "21cfrpart11"),
    ("clinical study report", "csr"),
]


def _normalize_term(term: str) -> str:
    """Normalize a glossary term for matching, with a fuzzy fallback.

    Lower-cases, collapses punctuation and whitespace, rewrites common
    clinical-research phrases to their acronyms, and — once the glossary
    has been loaded — falls back to difflib fuzzy matching so plurals and
    small typos still resolve to a known glossary key.

    Args:
        term: Raw term text (may be empty or None).

    Returns:
        The normalized term, or the closest cached glossary key when a
        sufficiently similar one (similarity >= 0.85) exists.
    """
    if not term:
        return ""
    term = term.lower().strip()
    term = re.sub(r'[\-_/\\.,;:]+', ' ', term)  # punctuation -> single space
    term = re.sub(r'\s+', ' ', term)
    for phrase, acronym in _SYNONYMS:
        term = term.replace(phrase, acronym)
    term = term.strip()

    # 🧩 Fuzzy fallback (plural/singular forms, typos) against the keys
    # cached by _load_glossary(); empty before the first successful load.
    if GLOSSARY_TERMS_CACHE and term not in GLOSSARY_TERMS_CACHE:
        close = get_close_matches(term, GLOSSARY_TERMS_CACHE, n=1, cutoff=0.85)
        if close:
            # Return the closest known key for better recall.
            return close[0]
    return term


def _load_glossary():
    """Load glossary.json from the Hugging Face Hub (cached after first call).

    Returns:
        Dict mapping normalized term -> {"term", "definition", "sources"}.
        Returns {} on any download/parse failure (best-effort; the error
        is printed and the cache is reset so a later call can retry).
    """
    global GLOSSARY, GLOSSARY_TERMS_CACHE
    if GLOSSARY is not None:
        return GLOSSARY
    try:
        # Imported lazily so this module stays importable (and
        # _normalize_term usable) when huggingface_hub is not installed.
        from huggingface_hub import hf_hub_download

        path = hf_hub_download(
            repo_id=DATASET_REPO,
            filename=GLOSSARY_FILENAME,
            repo_type="dataset",
        )
        with open(path, "r", encoding="utf-8") as f:
            raw = json.load(f)

        GLOSSARY = {}
        for k, vlist in raw.items():
            # Skip non-string keys, overly long phrases, and keys that
            # look like years / section numbers (4 consecutive digits).
            if not isinstance(k, str) or len(k.split()) > 12 or re.search(r'\d{4}', k):
                continue
            candidate_key = k
            if isinstance(vlist, dict):
                candidate_key = vlist.get("term") or vlist.get("name") or vlist.get("title") or k
            norm = _normalize_term(candidate_key)
            if not norm:
                continue
            if isinstance(vlist, dict):
                dfn = vlist.get("definition") or vlist.get("text") or ""
                sources = vlist.get("sources", [])
            elif isinstance(vlist, str):
                dfn = vlist
                sources = []
            else:
                dfn, sources = "", []
            # Discard missing, non-string, or trivially short definitions.
            # (A non-string dfn would otherwise crash .strip() and abort
            # the whole load via the except below.)
            if not isinstance(dfn, str) or len(dfn.strip()) < 5:
                continue
            # Guard once for both branches: a malformed "sources" (e.g. a
            # bare string) would otherwise be iterated char-by-char when
            # merged via set() below.
            sources = sources if isinstance(sources, list) else []
            if norm not in GLOSSARY:
                GLOSSARY[norm] = {
                    "term": candidate_key.strip(),
                    "definition": dfn.strip(),
                    "sources": sources,
                }
            else:
                # Duplicate normalized key: keep the first definition,
                # merge (de-duplicate) the sources.
                existing = GLOSSARY[norm]
                merged = set(existing.get("sources", [])).union(sources)
                existing["sources"] = list(merged)

        # 🧠 Store all glossary keys for the fuzzy fallback in _normalize_term().
        GLOSSARY_TERMS_CACHE = list(GLOSSARY.keys())
        print(f"✅ Glossary downloaded and cached ({len(GLOSSARY)} entries).")
        return GLOSSARY
    except Exception as e:
        # Reset so a later call retries instead of serving a partial dict.
        GLOSSARY = None
        print(f"⚠️ Failed to load glossary from Hugging Face: {e}")
        return {}


__all__ = ["_load_glossary", "_normalize_term"]