Spaces:
Running
Running
| """ | |
| π glossary_builder.py | |
| Builds a unified glossary from PDFs, Excel, and Web sources. | |
| - Extracts terms & definitions from PDFs. | |
| - Merges Excel glossary (with labeled formatting). | |
| - Optionally fetches glossary or definitions from known web sources. | |
| - Adds source typing (pdf/excel/web/other). | |
| - Saves combined glossary.json locally and uploads to Hugging Face. | |
| """ | |
| import os | |
| import re | |
| import json | |
| import time | |
| import fitz | |
| import requests | |
| import pandas as pd | |
| from bs4 import BeautifulSoup | |
| from huggingface_hub import upload_file, HfFolder, list_repo_files, hf_hub_download | |
# --- Configuration ---
DATASET_REPO = "essprasad/CT-Chat-Index"  # HF dataset that receives the built glossary.json
DOCS_REPO = "essprasad/CT-Chat-Docs"  # HF dataset holding the source PDFs / Excel files
LOCAL_GLOSSARY = "/home/user/app/persistent/glossary.json"  # local output path
REMOTE_GLOSSARY = "persistent/glossary.json"  # upload destination inside DATASET_REPO
# Prefer the cached hub token; fall back to the HF_TOKEN environment variable.
TOKEN = HfFolder.get_token() or os.getenv("HF_TOKEN")
# Known web glossary sources (can expand)
WEB_SOURCES = [
    "https://mrctcenter.org/glossaryterm/clinical-research/",
    "https://www.fda.gov/patients/drug-development-process/step-3-clinical-research",
    "https://www.cdisc.org/",
    "https://www.ich.org/",
    "https://www.ema.europa.eu/",
    "https://www.who.int/",
    "https://clinicaltrials.gov/",
]
| # --- Helpers --- | |
def normalize_term(term: str) -> str:
    """Canonicalize a glossary term for use as a lookup key.

    Lowercases and strips the input, collapses punctuation and whitespace
    runs to single spaces, then maps well-known clinical-research phrases
    onto their standard acronyms.

    Returns "" for empty/None input.
    """
    if not term:
        return ""
    canonical = re.sub(r"[\-_/\\.,;:]+", " ", term.lower().strip())
    canonical = re.sub(r"\s+", " ", canonical)
    known_acronyms = {
        "electronic case report form": "ecrf",
        "case report form": "crf",
        "informed consent form": "icf",
        "good clinical practice": "gcp",
        "serious adverse event": "sae",
        "adverse event": "ae",
        "21 cfr part 11": "21cfrpart11",
        "clinical study report": "csr",
    }
    return known_acronyms.get(canonical, canonical)
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in *pdf_path*.

    Args:
        pdf_path: Filesystem path to a PDF readable by PyMuPDF (fitz).

    Returns:
        The stripped page text joined with newlines, or "" on any failure
        (missing file, corrupt PDF) — errors are logged, never raised.
    """
    try:
        # Context manager guarantees the document handle is released even if
        # text extraction raises; the original only closed on the success path.
        with fitz.open(pdf_path) as doc:
            text = "\n".join(page.get_text("text") for page in doc)
        return text.strip()
    except Exception as e:
        print(f"β οΈ Failed to read {pdf_path}: {e}")
        return ""
def extract_definitions_from_text(text):
    """Extract glossary-like term-definition pairs from raw PDF text.

    Heuristic line-based parser: a short line (<= 6 words, starting with a
    letter, not ending in '.') is treated as a candidate term; the lines that
    follow, up to the next candidate term, are joined into its definition.

    Args:
        text: Raw newline-separated text (as produced by extract_text_from_pdf).

    Returns:
        dict mapping normalized term -> {"term": original, "definition": str}.
        Candidates whose definition has fewer than 5 words or no period are
        discarded as noise; later duplicates of a normalized term overwrite
        earlier ones.
    """
    glossary = {}
    text = re.sub(r"\r", "", text)
    lines = [ln.strip() for ln in text.split("\n") if ln.strip()]
    i = 0
    while i < len(lines):
        term = lines[i].strip()
        # Skip single characters, bare page numbers, and roman-numeral labels.
        if len(term) == 1 or term.isdigit() or re.fullmatch(r"[ivxlcdm]+", term.lower()):
            i += 1
            continue
        # Skip structural / front-matter headings (TOC, chapters, appendices...).
        if any(x in term.lower() for x in ["contents", "chapter", "appendix", "index", "revision", "table of"]):
            i += 1
            continue
        # Skip lines that are definition lead-ins rather than terms themselves.
        if term.lower().startswith(("acronym for", "definition", "terms of")):
            i += 1
            continue
        # Accumulate definition lines until the next term-like line appears.
        defn_lines = []
        j = i + 1
        while j < len(lines):
            nxt = lines[j].strip()
            # A short, letter-initial line without a trailing period looks like
            # the next term -- stop, unless it is a continuation phrase.
            if re.match(r"^[A-Za-z]", nxt) and len(nxt.split()) <= 6 and not nxt.endswith("."):
                if not nxt.lower().startswith(("see also", "for example", "for instance")):
                    break
            defn_lines.append(nxt)
            j += 1
        definition = " ".join(defn_lines)
        # Re-join words hyphenated across line breaks ("regu- lation" -> "regulation").
        definition = re.sub(r"(\w)-\s+(\w)", r"\1\2", definition)
        definition = re.sub(r"\s{2,}", " ", definition).strip()
        # Reject definitions that are too short or lack sentence punctuation;
        # note i advances by only 1 so the rejected lines are re-scanned as terms.
        if len(definition.split()) < 5 or "." not in definition:
            i += 1
            continue
        norm = normalize_term(term)
        glossary[norm] = {"term": term, "definition": definition}
        i = j
    return glossary
def detect_source_type(src: str) -> str:
    """Classify a source string as 'pdf', 'excel', 'web' (http/https URL), or 'other'."""
    if not src:
        return "other"
    lowered = src.lower()
    # First matching predicate wins; falls through to "other".
    for predicate, label in (
        (lambda s: s.endswith(".pdf"), "pdf"),
        (lambda s: s.endswith((".xlsx", ".xls")), "excel"),
        (lambda s: s.startswith("http"), "web"),
    ):
        if predicate(lowered):
            return label
    return "other"
def extract_web_glossary(url):
    """Heuristically scrape term/definition pairs from a glossary-like web page.

    Fetches *url*, flattens the page to text, and pattern-matches capitalised
    short phrases followed by ':' or '-' as term/definition candidates
    (first 50 matches only).

    Returns:
        A list of entry dicts (term/definition/sources/file/type="web");
        any network or parse failure yields [] instead of raising.
    """
    entries = []
    try:
        print(f"π Fetching {url}...")
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"β οΈ Skipped {url}: HTTP {response.status_code}")
            return []
        page_text = BeautifulSoup(response.text, "html.parser").get_text(separator="\n")
        page_text = re.sub(r"\s{2,}", " ", page_text).strip()
        # Heuristic: pick possible term-definition snippets
        pairs = re.findall(r"([A-Z][A-Za-z0-9\s]{2,30})[:\-]\s*(.{10,200})", page_text)
        for raw_term, raw_definition in pairs[:50]:
            cleaned = re.sub(r"\s{2,}", " ", raw_definition).strip()
            if len(cleaned.split()) <= 3:
                continue  # too short to be a real definition
            entries.append({
                "term": raw_term.strip(),
                "definition": cleaned,
                "sources": [url],
                "file": url,
                "type": "web",
            })
    except Exception as e:
        print(f"β οΈ Web fetch failed for {url}: {e}")
    return entries
| # --- Main Rebuild Function --- | |
def _collect_pdf_definitions(pdfs, all_defs):
    """Download each candidate glossary PDF and merge its term/definition pairs into *all_defs*."""
    # Hoisted out of the loop (the original rebuilt this list on every iteration).
    skip_patterns = ["topic_", "template", "protocol", "schedule", "painac", "sas", "glossary_printable"]
    for pdf in pdfs:
        if any(sp in pdf.lower() for sp in skip_patterns):
            print(f"β© Skipping non-glossary or template file: {pdf}")
            continue
        try:
            print(f"π Processing {pdf}...")
            path = hf_hub_download(repo_id=DOCS_REPO, filename=pdf, repo_type="dataset", token=TOKEN)
            defs = extract_definitions_from_text(extract_text_from_pdf(path))
            for k, v in defs.items():
                v.setdefault("sources", []).append(pdf)
                v["type"] = "pdf"
                v["file"] = os.path.basename(pdf)
                # Key on term + file so the same term from different PDFs survives.
                all_defs[f"{k}__{v['file']}"] = v
        except Exception as e:
            print(f"β οΈ Failed {pdf}: {e}")


def _collect_excel_definitions(excels, all_defs):
    """Merge every row of every sheet of each Excel glossary into *all_defs*."""
    for excel in excels:
        try:
            print(f"π Checking Excel file in dataset: {excel}")
            excel_path = hf_hub_download(repo_id=DOCS_REPO, filename=excel, repo_type="dataset", token=TOKEN)
            print(f"β Downloaded Excel file to {excel_path}")
            xls = pd.read_excel(excel_path, sheet_name=None)
            for sheet_name, df in xls.items():
                # BUGFIX: drop all-empty rows BEFORE filling NaNs with "" --
                # the original called fillna("") first, which made
                # dropna(how="all") a no-op.
                df = df.dropna(how="all").fillna("")
                if df.empty:
                    continue
                df.columns = [str(c).strip() for c in df.columns]
                term_col = next((c for c in df.columns if "glossary term" in c.lower() or "term" == c.lower()), None)
                if not term_col:
                    continue
                # Definition-like columns are identical for every row; compute once per sheet.
                def_cols = [c for c in df.columns if any(k in c.lower() for k in ["definition", "context", "info", "related", "resource"])]
                for _, row in df.iterrows():
                    term = str(row.get(term_col, "")).strip()
                    if not term:
                        continue
                    def_parts = [f"<b>{c}:</b> {row[c]}" for c in def_cols if str(row[c]).strip()]
                    if not def_parts:
                        continue
                    entry = {
                        "term": term,
                        "definition": "<br>".join(def_parts),
                        "sources": [os.path.basename(excel_path)],
                        "file": os.path.basename(excel_path),
                        "sheet": sheet_name,
                        "type": "excel",
                    }
                    all_defs[f"{normalize_term(term)}__{entry['file']}"] = entry
            print(f"β Processed Excel file {excel}")
        except Exception as e:
            print(f"β οΈ Failed Excel {excel}: {e}")


def _collect_web_definitions(all_defs):
    """Scrape every configured WEB_SOURCES page and merge its entries into *all_defs*."""
    web_entries = []
    for url in WEB_SOURCES:
        for e in extract_web_glossary(url):
            all_defs[f"{normalize_term(e['term'])}__{e['file']}"] = e
            web_entries.append(e)
    print(f"β Added {len(web_entries)} web glossary entries.")


def _finalize_entries(all_defs):
    """Normalize 'sources' to a list and backfill a missing 'type' on each entry."""
    for v in all_defs.values():
        v["sources"] = v.get("sources", [])
        if not isinstance(v["sources"], list):
            v["sources"] = [v["sources"]]
        if not v.get("type"):
            v["type"] = detect_source_type(v["sources"][0] if v["sources"] else "")


def rebuild_and_upload():
    """Rebuild the combined glossary (PDF + Excel + web) and upload it to Hugging Face.

    Saves the merged entries to LOCAL_GLOSSARY as a JSON list and, when a
    token is available, uploads it to DATASET_REPO/REMOTE_GLOSSARY.

    Raises:
        RuntimeError: if the source document repo cannot be listed.
    """
    start = time.time()
    print("π Starting glossary rebuild...")
    try:
        all_files = list_repo_files(repo_id=DOCS_REPO, repo_type="dataset", token=TOKEN)
        pdfs = [f for f in all_files if f.lower().endswith(".pdf")]
        excels = [f for f in all_files if f.lower().endswith((".xls", ".xlsx"))]
    except Exception as e:
        raise RuntimeError(f"Cannot list repo files: {e}")
    all_defs = {}
    # 1) PDFs, 2) Excel workbooks, 3) web sources, 4) cleanup.
    _collect_pdf_definitions(pdfs, all_defs)
    _collect_excel_definitions(excels, all_defs)
    _collect_web_definitions(all_defs)
    _finalize_entries(all_defs)
    # Save locally.
    os.makedirs(os.path.dirname(LOCAL_GLOSSARY), exist_ok=True)
    with open(LOCAL_GLOSSARY, "w", encoding="utf-8") as f:
        json.dump(list(all_defs.values()), f, indent=2, ensure_ascii=False)
    print(f"β Saved {len(all_defs)} entries β {LOCAL_GLOSSARY}")
    # Upload to Hugging Face (best-effort: failures are logged, not raised).
    if TOKEN:
        try:
            upload_file(
                path_or_fileobj=LOCAL_GLOSSARY,
                path_in_repo=REMOTE_GLOSSARY,
                repo_id=DATASET_REPO,
                repo_type="dataset",
                token=TOKEN,
                commit_message="Glossary updated (PDF + Excel + Web sources)",
            )
            print(f"π Uploaded to {DATASET_REPO}/{REMOTE_GLOSSARY}")
        except Exception as e:
            print(f"β οΈ Upload error: {e}")
    print(f"β¨ Done in {time.time() - start:.1f}s.")
# Allow running this module as a standalone script.
if __name__ == "__main__":
    rebuild_and_upload()