"""
📘 glossary_builder.py
Builds a unified glossary from PDFs and Excel files.
- Extracts terms & definitions from PDFs.
- Merges Excel glossary (uses all definition-related columns with labeled formatting).
- Saves combined glossary.json locally and uploads to Hugging Face.
"""
import os
import re
import json
import time
import fitz  # PyMuPDF
import pandas as pd
from huggingface_hub import upload_file, HfFolder, list_repo_files, hf_hub_download
# --- Configuration ---
DATASET_REPO = "essprasad/CT-Chat-Index"
DOCS_REPO = "essprasad/CT-Chat-Docs"
LOCAL_GLOSSARY = "/home/user/app/persistent/glossary.json"
REMOTE_GLOSSARY = "persistent/glossary.json"
TOKEN = HfFolder.get_token() or os.getenv("HF_TOKEN")
# --- Helpers ---
def normalize_term(term: str) -> str:
    """Lowercase, strip punctuation, collapse whitespace, and map known synonyms to canonical abbreviations."""
    if not term:
        return ""
    s = term.lower().strip()
    s = re.sub(r"[\-_/\\.,;:]+", " ", s)
    s = re.sub(r"\s+", " ", s)
    synonyms = {
        "electronic case report form": "ecrf",
        "case report form": "crf",
        "informed consent form": "icf",
        "good clinical practice": "gcp",
        "serious adverse event": "sae",
        "adverse event": "ae",
        "21 cfr part 11": "21cfrpart11",
        "clinical study report": "csr",
    }
    return synonyms.get(s, s)
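# A few illustrative normalize_term calls (derived from the rules above, not from an external spec):
#   normalize_term("Case Report Form")  -> "crf"          (synonym mapping)
#   normalize_term("e-CRF")             -> "e crf"        (punctuation replaced by spaces)
#   normalize_term("21 CFR Part 11")    -> "21cfrpart11"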
def extract_text_from_pdf(pdf_path):
    """Extract plain text from a PDF file."""
    try:
        doc = fitz.open(pdf_path)
        text = "\n".join(page.get_text("text") for page in doc)
        doc.close()
        return text.strip()
    except Exception as e:
        print(f"⚠️ Failed to read {pdf_path}: {e}")
        return ""
def extract_definitions_from_text(text):
    """Extract glossary-like term-definition pairs from raw PDF text."""
    glossary = {}
    text = re.sub(r"\r", "", text)
    lines = [ln.strip() for ln in text.split("\n") if ln.strip()]
    i = 0
    while i < len(lines):
        term = lines[i].strip()
        # Skip single characters, page numbers, and roman numerals.
        if len(term) == 1 or term.isdigit() or re.fullmatch(r"[ivxlcdm]+", term.lower()):
            i += 1
            continue
        # Skip table-of-contents and heading lines.
        if any(x in term.lower() for x in ["contents", "chapter", "appendix", "index", "revision", "table of"]):
            i += 1
            continue
        if term.lower().startswith(("acronym for", "definition", "terms of")):
            i += 1
            continue
        # Collect definition lines until the next short, term-like line.
        defn_lines = []
        j = i + 1
        while j < len(lines):
            nxt = lines[j].strip()
            if re.match(r"^[A-Za-z]", nxt) and len(nxt.split()) <= 6 and not nxt.endswith("."):
                if not nxt.lower().startswith(("see also", "for example", "for instance")):
                    break
            defn_lines.append(nxt)
            j += 1
        definition = " ".join(defn_lines)
        definition = re.sub(r"(\w)-\s+(\w)", r"\1\2", definition)  # re-join words hyphenated across line breaks
        definition = re.sub(r"\s{2,}", " ", definition).strip()
        # Reject fragments too short to be real definitions.
        if len(definition.split()) < 5 or "." not in definition:
            i += 1
            continue
        norm = normalize_term(term)
        glossary[norm] = {"term": term, "definition": definition}
        i = j
    return glossary
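# Example of the mapping returned by extract_definitions_from_text (illustrative placeholders,
# not text from a real PDF):
#   {
#     "sae": {
#       "term": "Serious Adverse Event",
#       "definition": "<definition sentence(s) collected from the lines following the term>"
#     }
#   }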
# --- Main Rebuild Function ---
def rebuild_and_upload():
    start = time.time()
    print("📘 Starting glossary rebuild...")
    try:
        all_files = list_repo_files(repo_id=DOCS_REPO, repo_type="dataset", token=TOKEN)
        pdfs = [f for f in all_files if f.lower().endswith(".pdf")]
        excels = [f for f in all_files if f.lower().endswith((".xls", ".xlsx"))]
    except Exception as e:
        raise RuntimeError(f"Cannot list repo files: {e}")
    all_defs = {}
    # --- 1️⃣ Process PDFs ---
    skip_patterns = ["topic_", "template", "protocol", "schedule", "painac", "sas", "glossary_printable"]
    for pdf in pdfs:
        if any(sp in pdf.lower() for sp in skip_patterns):
            print(f"⏩ Skipping non-glossary or template file: {pdf}")
            continue
        try:
            print(f"🔍 Processing {pdf}...")
            path = hf_hub_download(repo_id=DOCS_REPO, filename=pdf, repo_type="dataset", token=TOKEN)
            text = extract_text_from_pdf(path)
            defs = extract_definitions_from_text(text)
            for k, v in defs.items():
                v.setdefault("sources", []).append(pdf)
                if k in all_defs:
                    all_defs[k]["sources"] = list(set(all_defs[k].get("sources", []) + v["sources"]))
                else:
                    all_defs[k] = v
        except Exception as e:
            print(f"⚠️ Failed {pdf}: {e}")
    # --- 2️⃣ Process Excel Glossaries (MRCT etc.) ---
    for excel in excels:
        try:
            print(f"📗 Checking Excel file in dataset: {excel}")
            excel_path = hf_hub_download(
                repo_id=DOCS_REPO,
                filename=excel,
                repo_type="dataset",
                token=TOKEN
            )
            print(f"✅ Downloaded Excel file to {excel_path}")
            xls = pd.read_excel(excel_path, sheet_name=None)
            total_rows = 0
            excel_entries = []
            for sheet_name, df in xls.items():
                # Drop fully empty rows first, then blank out any remaining NaNs.
                df = df.dropna(how="all").fillna("")
                if df.shape[0] == 0:
                    continue
                df.columns = [str(c).strip() for c in df.columns]
                # 🧠 Detect the correct 'Glossary Term' column
                term_col = None
                for c in df.columns:
                    if "glossary term" in c.lower():
                        term_col = c
                        break
                if not term_col:
                    for c in df.columns:
                        if "cdisc" in c.lower() or c.lower() == "term":
                            term_col = c
                            break
                if not term_col:
                    print(f"⚠️ No 'Glossary Term' column found in sheet {sheet_name} of {excel_path}")
                    continue
                # Definition-related columns are the same for every row in the sheet.
                def_cols = [
                    c for c in df.columns
                    if any(k in c.lower() for k in [
                        "definition", "context", "info", "related", "resource", "use in context"
                    ])
                ]
                # Concatenate all relevant columns with labels for clarity
                for _, row in df.iterrows():
                    term = str(row.get(term_col, "")).strip()
                    if not term:
                        continue
                    def_parts = []
                    for c in def_cols:
                        val = str(row.get(c, "")).strip()
                        if val:
                            def_parts.append(f"<b>{c}:</b> {val}")
                    full_definition = "<br>".join(def_parts).strip()
                    if not full_definition:
                        continue
                    entry = {
                        "term": term,
                        "definition": full_definition,
                        "source": os.path.basename(excel_path),
                        "sheet": sheet_name,
                        "type": "Excel",
                    }
                    excel_entries.append(entry)
                    total_rows += 1
            print(f"✅ Added {total_rows} Excel rows from {excel_path}")
            # Merge into main glossary dictionary
            for e in excel_entries:
                norm = normalize_term(e["term"])
                payload = {
                    "term": e["term"],
                    "definition": e["definition"],
                    "sources": [e["source"]],
                    "type": e.get("type", "Excel"),
                    "sheet": e.get("sheet"),
                }
                # Each term+source pair stored uniquely to preserve different definitions
                unique_key = f"{norm}__{os.path.basename(payload['sources'][0])}" if payload.get("sources") else norm
                if unique_key not in all_defs:
                    all_defs[unique_key] = payload
                else:
                    # Avoid duplicate merges — just append any new sources
                    existing = all_defs[unique_key]
                    existing_sources = set(existing.get("sources", []))
                    new_sources = set(payload.get("sources", []))
                    existing["sources"] = list(existing_sources.union(new_sources))
        except Exception as e:
            print(f"⚠️ Failed to process Excel {excel}: {e}")
    # --- 3️⃣ Save combined glossary ---
    os.makedirs(os.path.dirname(LOCAL_GLOSSARY), exist_ok=True)
    with open(LOCAL_GLOSSARY, "w", encoding="utf-8") as f:
        json.dump(all_defs, f, indent=2, ensure_ascii=False)
    print(f"✅ Saved {len(all_defs)} entries → {LOCAL_GLOSSARY}")
    # --- 4️⃣ Upload to Hugging Face ---
    if TOKEN:
        try:
            upload_file(
                path_or_fileobj=LOCAL_GLOSSARY,
                path_in_repo=REMOTE_GLOSSARY,
                repo_id=DATASET_REPO,
                repo_type="dataset",
                token=TOKEN,
                commit_message="Glossary updated with PDF + Excel (column-labeled definitions)",
            )
            print(f"🚀 Uploaded to {DATASET_REPO}/{REMOTE_GLOSSARY}")
        except Exception as e:
            print(f"⚠️ Upload error: {e}")
    print(f"✨ Done in {time.time() - start:.1f}s.")
if __name__ == "__main__":
    rebuild_and_upload()
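# --- Example usage (illustrative; paths and token handling are assumptions, adjust as needed) ---
# As a script, with a Hugging Face token available in the environment:
#   HF_TOKEN=hf_xxx python core/glossary_builder.py
# Or from other application code:
#   from core.glossary_builder import rebuild_and_upload
#   rebuild_and_upload()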