Spaces:
Running
Running
| # core/glossary.py | |
| import json | |
| import os | |
| import re | |
| from difflib import get_close_matches | |
| from huggingface_hub import hf_hub_download | |
# Lazily-populated glossary cache: None until _load_glossary() succeeds,
# then a dict of normalized term -> {"term", "definition", "sources"}.
GLOSSARY = None
GLOSSARY_TERMS_CACHE = []  # 🧠 Cache of glossary keys for fuzzy matching
# Hugging Face dataset repo and in-repo path of the glossary JSON file.
DATASET_REPO = "essprasad/CT-Chat-Index"
GLOSSARY_FILENAME = "persistent/glossary.json"
def _normalize_term(term: str) -> str:
    """Normalize glossary terms for matching, with fuzzy fallback.

    Lowercases, collapses punctuation/whitespace, rewrites common clinical
    research long forms to their acronyms, then — if the module-level
    GLOSSARY_TERMS_CACHE is populated — snaps near-misses (plurals, typos)
    onto the closest known glossary key.
    """
    if not term:
        return ""
    # Lowercase, turn separator punctuation into spaces, collapse runs.
    cleaned = re.sub(r'[\-_/\\.,;:]+', ' ', term.lower().strip())
    cleaned = re.sub(r'\s+', ' ', cleaned)
    # Ordered synonym rewrites: long forms before their substrings so the
    # more specific rule always fires first (e.g. "serious adverse event"
    # before "adverse event").
    synonym_rules = (
        ("e crf", "ecrf"),
        ("e-crf", "ecrf"),
        ("e/crf", "ecrf"),
        ("e_crf", "ecrf"),
        ("electronic case report form", "ecrf"),
        ("case report form", "crf"),
        ("informed consent form", "icf"),
        ("good clinical practice", "gcp"),
        ("serious adverse event", "sae"),
        ("adverse event", "ae"),
        ("21 cfr part 11", "21cfrpart11"),
        ("clinical study report", "csr"),
    )
    for long_form, short_form in synonym_rules:
        cleaned = cleaned.replace(long_form, short_form)
    cleaned = cleaned.strip()
    # 🧩 Fuzzy matching fallback (for plural/singular or typos): prefer an
    # exact cache hit; otherwise return the closest cached key, if any.
    if GLOSSARY_TERMS_CACHE and cleaned not in GLOSSARY_TERMS_CACHE:
        nearest = get_close_matches(cleaned, GLOSSARY_TERMS_CACHE, n=1, cutoff=0.85)
        if nearest:
            return nearest[0]
    return cleaned
def _load_glossary():
    """Load glossary.json from Hugging Face Hub (cached).

    Downloads the JSON once per process, normalizes each key through
    _normalize_term(), and memoizes the result in the module-level
    GLOSSARY dict; GLOSSARY_TERMS_CACHE is refreshed with the normalized
    keys for fuzzy matching. On any failure, returns {} without caching,
    so the next call retries the download.

    Returns:
        dict: normalized term -> {"term", "definition", "sources"},
        or {} if the glossary could not be loaded.
    """
    global GLOSSARY, GLOSSARY_TERMS_CACHE
    if GLOSSARY is not None:
        return GLOSSARY
    try:
        path = hf_hub_download(
            repo_id=DATASET_REPO,
            filename=GLOSSARY_FILENAME,
            repo_type="dataset",
        )
        with open(path, "r", encoding="utf-8") as f:
            raw = json.load(f)
        GLOSSARY = {}
        for k, vlist in raw.items():
            # Skip non-string keys, overly long phrases, and keys containing
            # year-like digit runs — these are not glossary terms.
            if not isinstance(k, str) or len(k.split()) > 12 or re.search(r'\d{4}', k):
                continue
            candidate_key = k
            if isinstance(vlist, dict):
                candidate_key = vlist.get("term") or vlist.get("name") or vlist.get("title") or k
            if not isinstance(candidate_key, str):
                # Malformed "term"/"name"/"title" value: fall back to the raw key
                # instead of crashing in _normalize_term()/.strip() below.
                candidate_key = k
            norm = _normalize_term(candidate_key)
            if not norm:
                continue
            if isinstance(vlist, dict):
                dfn = vlist.get("definition") or vlist.get("text") or ""
                sources = vlist.get("sources", [])
            elif isinstance(vlist, str):
                dfn = vlist
                sources = []
            else:
                dfn, sources = "", []
            # Type-guard per entry: without these, a single malformed record
            # (non-string definition, non-list sources) would raise inside the
            # try and the broad except below would discard EVERY entry parsed
            # so far — the whole glossary would silently load as {}.
            if not isinstance(dfn, str) or len(dfn.strip()) < 5:
                continue
            if not isinstance(sources, list):
                sources = []
            if norm not in GLOSSARY:
                GLOSSARY[norm] = {
                    "term": candidate_key.strip(),
                    "definition": dfn.strip(),
                    "sources": sources,
                }
            else:
                # Duplicate normalized key: keep the first definition, merge
                # source lists (deduplicated; only hashable string sources are
                # merged so an odd entry cannot abort the load).
                existing = GLOSSARY[norm]
                merged = set(existing.get("sources", []))
                merged.update(s for s in sources if isinstance(s, str))
                existing["sources"] = list(merged)
        # 🧠 Store all glossary keys for fuzzy fallback in _normalize_term().
        GLOSSARY_TERMS_CACHE = list(GLOSSARY.keys())
        print(f"✅ Glossary downloaded and cached ({len(GLOSSARY)} entries).")
        return GLOSSARY
    except Exception as e:
        print(f"⚠️ Failed to load glossary from Hugging Face: {e}")
        return {}
# Public API: names are underscore-prefixed but intentionally exported.
__all__ = ["_load_glossary", "_normalize_term"]