# ClinicalTrialBasics — core/glossary.py
# NOTE: Hugging Face file-viewer chrome (uploader name, commit hash e61e934,
# "raw / history / blame", file size) was captured with this file and has been
# folded into this comment so the module parses.
# core/glossary.py
import json
import os
import re
from difflib import get_close_matches
from huggingface_hub import hf_hub_download
GLOSSARY = None
GLOSSARY_TERMS_CACHE = [] # 🧠 Cache of glossary keys for fuzzy matching
DATASET_REPO = "essprasad/CT-Chat-Index"
GLOSSARY_FILENAME = "persistent/glossary.json"
def _normalize_term(term: str) -> str:
    """Normalize glossary terms for matching, with fuzzy fallback.

    Lowercases, collapses separator punctuation and whitespace, maps common
    clinical-research phrases onto their canonical acronyms, and finally — if
    the glossary key cache is populated — snaps near-misses (plurals, typos)
    onto the closest known key.
    """
    if not term:
        return ""

    cleaned = term.lower().strip()
    cleaned = re.sub(r'[\-_/\\.,;:]+', ' ', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned)

    # Canonical acronym substitutions. Order matters: the "e crf" variants and
    # the full "electronic case report form" phrase must run before the bare
    # "case report form" rule, and "serious adverse event" before "adverse event".
    _SYNONYMS = (
        ("e crf", "ecrf"),
        ("e-crf", "ecrf"),
        ("e/crf", "ecrf"),
        ("e_crf", "ecrf"),
        ("electronic case report form", "ecrf"),
        ("case report form", "crf"),
        ("informed consent form", "icf"),
        ("good clinical practice", "gcp"),
        ("serious adverse event", "sae"),
        ("adverse event", "ae"),
        ("21 cfr part 11", "21cfrpart11"),
        ("clinical study report", "csr"),
    )
    for phrase, acronym in _SYNONYMS:
        cleaned = cleaned.replace(phrase, acronym)
    cleaned = cleaned.strip()

    # 🧩 Fuzzy fallback: if the cleaned term isn't a known glossary key,
    # return the closest cached key (helps with plural/singular and typos).
    if GLOSSARY_TERMS_CACHE and cleaned not in GLOSSARY_TERMS_CACHE:
        nearest = get_close_matches(cleaned, GLOSSARY_TERMS_CACHE, n=1, cutoff=0.85)
        if nearest:
            return nearest[0]
    return cleaned
def _load_glossary():
    """Load glossary.json from the Hugging Face Hub and cache it in-process.

    Returns:
        dict: normalized term -> {"term", "definition", "sources"}. The same
        cached dict is returned on subsequent calls. On any failure an empty
        dict is returned and the module cache is left unset, so the next call
        retries the download.
    """
    global GLOSSARY, GLOSSARY_TERMS_CACHE
    if GLOSSARY is not None:
        return GLOSSARY
    try:
        path = hf_hub_download(
            repo_id=DATASET_REPO,
            filename=GLOSSARY_FILENAME,
            repo_type="dataset",
        )
        with open(path, "r", encoding="utf-8") as f:
            raw = json.load(f)

        # Build into a LOCAL dict and publish only on success. The previous
        # version assigned GLOSSARY = {} before parsing, so an exception
        # mid-parse left a half-populated cache that later calls happily
        # returned (the `GLOSSARY is not None` guard short-circuits).
        glossary = {}
        for key, value in raw.items():
            # Skip non-string keys, overly long phrases, and keys containing a
            # 4-digit run (years — likely citations rather than glossary terms).
            if not isinstance(key, str) or len(key.split()) > 12 or re.search(r'\d{4}', key):
                continue
            candidate_key = key
            if isinstance(value, dict):
                candidate_key = value.get("term") or value.get("name") or value.get("title") or key
            norm = _normalize_term(candidate_key)
            if not norm:
                continue
            if isinstance(value, dict):
                dfn = value.get("definition") or value.get("text") or ""
                sources = value.get("sources", [])
            elif isinstance(value, str):
                dfn = value
                sources = []
            else:
                dfn, sources = "", []
            # Normalize sources ONCE so both branches below see a list.
            # (Previously the merge branch ran set(sources) on unvalidated
            # data, exploding a string value into individual characters.)
            if not isinstance(sources, list):
                sources = []
            # Drop entries with no usable definition.
            if not dfn or len(dfn.strip()) < 5:
                continue
            if norm not in glossary:
                glossary[norm] = {
                    "term": candidate_key.strip(),
                    "definition": dfn.strip(),
                    "sources": sources,
                }
            else:
                # Duplicate normalized term: keep the first definition,
                # merge the source lists (deduplicated).
                existing = glossary[norm]
                existing["sources"] = list(set(existing.get("sources", [])).union(sources))

        GLOSSARY = glossary
        # 🧠 Store all glossary keys for fuzzy fallback
        GLOSSARY_TERMS_CACHE = list(GLOSSARY.keys())
        print(f"✅ Glossary downloaded and cached ({len(GLOSSARY)} entries).")
        return GLOSSARY
    except Exception as e:
        print(f"⚠️ Failed to load glossary from Hugging Face: {e}")
        return {}
__all__ = ["_load_glossary", "_normalize_term"]