# ClinicalTrialBasics — core/glossary.py
# NOTE: Hugging Face file-viewer chrome (uploader name, commit hash e61e934,
# "raw / history / blame", file size) was captured with this file and has been
# folded into this comment so the module parses.
# core/glossary.py
import json
import os
import re
from difflib import get_close_matches
from huggingface_hub import hf_hub_download
GLOSSARY = None
GLOSSARY_TERMS_CACHE = [] # 🧠 Cache of glossary keys for fuzzy matching
DATASET_REPO = "essprasad/CT-Chat-Index"
GLOSSARY_FILENAME = "persistent/glossary.json"
def _normalize_term(term: str) -> str:
    """Normalize glossary terms for matching, with fuzzy fallback.

    Lowercases, collapses separator punctuation and whitespace, maps common
    clinical-research phrases onto their canonical acronyms, and finally — if
    the glossary key cache is populated — snaps near-misses (plurals, typos)
    onto the closest known key.
    """
    if not term:
        return ""

    cleaned = term.lower().strip()
    cleaned = re.sub(r'[\-_/\\.,;:]+', ' ', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned)

    # Canonical acronym substitutions. Order matters: the "e crf" variants and
    # the full "electronic case report form" phrase must run before the bare
    # "case report form" rule, and "serious adverse event" before "adverse event".
    _SYNONYMS = (
        ("e crf", "ecrf"),
        ("e-crf", "ecrf"),
        ("e/crf", "ecrf"),
        ("e_crf", "ecrf"),
        ("electronic case report form", "ecrf"),
        ("case report form", "crf"),
        ("informed consent form", "icf"),
        ("good clinical practice", "gcp"),
        ("serious adverse event", "sae"),
        ("adverse event", "ae"),
        ("21 cfr part 11", "21cfrpart11"),
        ("clinical study report", "csr"),
    )
    for phrase, acronym in _SYNONYMS:
        cleaned = cleaned.replace(phrase, acronym)
    cleaned = cleaned.strip()

    # 🧩 Fuzzy fallback: if the cleaned term isn't a known glossary key,
    # return the closest cached key (helps with plural/singular and typos).
    if GLOSSARY_TERMS_CACHE and cleaned not in GLOSSARY_TERMS_CACHE:
        nearest = get_close_matches(cleaned, GLOSSARY_TERMS_CACHE, n=1, cutoff=0.85)
        if nearest:
            return nearest[0]
    return cleaned
def _load_glossary():
    """Load glossary.json from the Hugging Face Hub and cache it in-process.

    Returns:
        dict: normalized term -> {"term", "definition", "sources"}. The same
        cached dict is returned on subsequent calls. On any failure an empty
        dict is returned and the module cache is left unset, so the next call
        retries the download.
    """
    global GLOSSARY, GLOSSARY_TERMS_CACHE
    if GLOSSARY is not None:
        return GLOSSARY
    try:
        path = hf_hub_download(
            repo_id=DATASET_REPO,
            filename=GLOSSARY_FILENAME,
            repo_type="dataset",
        )
        with open(path, "r", encoding="utf-8") as f:
            raw = json.load(f)

        # Build into a LOCAL dict and publish only on success. The previous
        # version assigned GLOSSARY = {} before parsing, so an exception
        # mid-parse left a half-populated cache that later calls happily
        # returned (the `GLOSSARY is not None` guard short-circuits).
        glossary = {}
        for key, value in raw.items():
            # Skip non-string keys, overly long phrases, and keys containing a
            # 4-digit run (years — likely citations rather than glossary terms).
            if not isinstance(key, str) or len(key.split()) > 12 or re.search(r'\d{4}', key):
                continue
            candidate_key = key
            if isinstance(value, dict):
                candidate_key = value.get("term") or value.get("name") or value.get("title") or key
            norm = _normalize_term(candidate_key)
            if not norm:
                continue
            if isinstance(value, dict):
                dfn = value.get("definition") or value.get("text") or ""
                sources = value.get("sources", [])
            elif isinstance(value, str):
                dfn = value
                sources = []
            else:
                dfn, sources = "", []
            # Normalize sources ONCE so both branches below see a list.
            # (Previously the merge branch ran set(sources) on unvalidated
            # data, exploding a string value into individual characters.)
            if not isinstance(sources, list):
                sources = []
            # Drop entries with no usable definition.
            if not dfn or len(dfn.strip()) < 5:
                continue
            if norm not in glossary:
                glossary[norm] = {
                    "term": candidate_key.strip(),
                    "definition": dfn.strip(),
                    "sources": sources,
                }
            else:
                # Duplicate normalized term: keep the first definition,
                # merge the source lists (deduplicated).
                existing = glossary[norm]
                existing["sources"] = list(set(existing.get("sources", [])).union(sources))

        GLOSSARY = glossary
        # 🧠 Store all glossary keys for fuzzy fallback
        GLOSSARY_TERMS_CACHE = list(GLOSSARY.keys())
        print(f"✅ Glossary downloaded and cached ({len(GLOSSARY)} entries).")
        return GLOSSARY
    except Exception as e:
        print(f"⚠️ Failed to load glossary from Hugging Face: {e}")
        return {}
__all__ = ["_load_glossary", "_normalize_term"]