"""
📘 glossary_builder.py
Builds a unified glossary from PDFs and Excel files.
- Extracts terms & definitions from PDFs.
- Merges Excel glossary (uses all definition-related columns with labeled formatting).
- Saves combined glossary.json locally and uploads to Hugging Face.
"""
import os
import re
import json
import time
import fitz  # PyMuPDF
import pandas as pd
from huggingface_hub import upload_file, HfFolder, list_repo_files, hf_hub_download
# --- Configuration ---
DATASET_REPO = "essprasad/CT-Chat-Index"
DOCS_REPO = "essprasad/CT-Chat-Docs"
LOCAL_GLOSSARY = "/home/user/app/persistent/glossary.json"
REMOTE_GLOSSARY = "persistent/glossary.json"
TOKEN = HfFolder.get_token() or os.getenv("HF_TOKEN")
# --- Helpers ---
def normalize_term(term: str) -> str:
    """Lowercase, strip punctuation, collapse whitespace, and map known synonyms to canonical abbreviations."""
    if not term:
        return ""
    s = term.lower().strip()
    s = re.sub(r"[\-_/\\.,;:]+", " ", s)
    s = re.sub(r"\s+", " ", s)
    synonyms = {
        "electronic case report form": "ecrf",
        "case report form": "crf",
        "informed consent form": "icf",
        "good clinical practice": "gcp",
        "serious adverse event": "sae",
        "adverse event": "ae",
        "21 cfr part 11": "21cfrpart11",
        "clinical study report": "csr",
    }
    return synonyms.get(s, s)
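# A few illustrative normalize_term calls (derived from the rules above, not from an external spec):
#   normalize_term("Case Report Form")  -> "crf"          (synonym mapping)
#   normalize_term("e-CRF")             -> "e crf"        (punctuation replaced by spaces)
#   normalize_term("21 CFR Part 11")    -> "21cfrpart11"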
def extract_text_from_pdf(pdf_path):
    """Extract plain text from a PDF file."""
    try:
        doc = fitz.open(pdf_path)
        text = "\n".join(page.get_text("text") for page in doc)
        doc.close()
        return text.strip()
    except Exception as e:
        print(f"⚠️ Failed to read {pdf_path}: {e}")
        return ""
def extract_definitions_from_text(text):
    """Extract glossary-like term-definition pairs from raw PDF text."""
    glossary = {}
    text = re.sub(r"\r", "", text)
    lines = [ln.strip() for ln in text.split("\n") if ln.strip()]
    i = 0
    while i < len(lines):
        term = lines[i].strip()
        # Skip single characters, page numbers, and roman numerals.
        if len(term) == 1 or term.isdigit() or re.fullmatch(r"[ivxlcdm]+", term.lower()):
            i += 1
            continue
        # Skip table-of-contents and heading lines.
        if any(x in term.lower() for x in ["contents", "chapter", "appendix", "index", "revision", "table of"]):
            i += 1
            continue
        if term.lower().startswith(("acronym for", "definition", "terms of")):
            i += 1
            continue
        # Collect definition lines until the next short, term-like line.
        defn_lines = []
        j = i + 1
        while j < len(lines):
            nxt = lines[j].strip()
            if re.match(r"^[A-Za-z]", nxt) and len(nxt.split()) <= 6 and not nxt.endswith("."):
                if not nxt.lower().startswith(("see also", "for example", "for instance")):
                    break
            defn_lines.append(nxt)
            j += 1
        definition = " ".join(defn_lines)
        definition = re.sub(r"(\w)-\s+(\w)", r"\1\2", definition)  # re-join words hyphenated across line breaks
        definition = re.sub(r"\s{2,}", " ", definition).strip()
        # Reject fragments too short to be real definitions.
        if len(definition.split()) < 5 or "." not in definition:
            i += 1
            continue
        norm = normalize_term(term)
        glossary[norm] = {"term": term, "definition": definition}
        i = j
    return glossary
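# Example of the mapping returned by extract_definitions_from_text (illustrative placeholders,
# not text from a real PDF):
#   {
#     "sae": {
#       "term": "Serious Adverse Event",
#       "definition": "<definition sentence(s) collected from the lines following the term>"
#     }
#   }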
# --- Main Rebuild Function ---
def rebuild_and_upload():
    start = time.time()
    print("📘 Starting glossary rebuild...")
    try:
        all_files = list_repo_files(repo_id=DOCS_REPO, repo_type="dataset", token=TOKEN)
        pdfs = [f for f in all_files if f.lower().endswith(".pdf")]
        excels = [f for f in all_files if f.lower().endswith((".xls", ".xlsx"))]
    except Exception as e:
        raise RuntimeError(f"Cannot list repo files: {e}")
    all_defs = {}
    # --- 1️⃣ Process PDFs ---
    skip_patterns = ["topic_", "template", "protocol", "schedule", "painac", "sas", "glossary_printable"]
    for pdf in pdfs:
        if any(sp in pdf.lower() for sp in skip_patterns):
            print(f"⏩ Skipping non-glossary or template file: {pdf}")
            continue
        try:
            print(f"🔍 Processing {pdf}...")
            path = hf_hub_download(repo_id=DOCS_REPO, filename=pdf, repo_type="dataset", token=TOKEN)
            text = extract_text_from_pdf(path)
            defs = extract_definitions_from_text(text)
            for k, v in defs.items():
                v.setdefault("sources", []).append(pdf)
                if k in all_defs:
                    all_defs[k]["sources"] = list(set(all_defs[k].get("sources", []) + v["sources"]))
                else:
                    all_defs[k] = v
        except Exception as e:
            print(f"⚠️ Failed {pdf}: {e}")
    # --- 2️⃣ Process Excel Glossaries (MRCT etc.) ---
    for excel in excels:
        try:
            print(f"📗 Checking Excel file in dataset: {excel}")
            excel_path = hf_hub_download(
                repo_id=DOCS_REPO,
                filename=excel,
                repo_type="dataset",
                token=TOKEN
            )
            print(f"✅ Downloaded Excel file to {excel_path}")
            xls = pd.read_excel(excel_path, sheet_name=None)
            total_rows = 0
            excel_entries = []
            for sheet_name, df in xls.items():
                # Drop fully empty rows first, then blank out any remaining NaNs.
                df = df.dropna(how="all").fillna("")
                if df.shape[0] == 0:
                    continue
                df.columns = [str(c).strip() for c in df.columns]
                # 🧠 Detect the correct 'Glossary Term' column
                term_col = None
                for c in df.columns:
                    if "glossary term" in c.lower():
                        term_col = c
                        break
                if not term_col:
                    for c in df.columns:
                        if "cdisc" in c.lower() or c.lower() == "term":
                            term_col = c
                            break
                if not term_col:
                    print(f"⚠️ No 'Glossary Term' column found in sheet {sheet_name} of {excel_path}")
                    continue
                # Definition-related columns are the same for every row in the sheet.
                def_cols = [
                    c for c in df.columns
                    if any(k in c.lower() for k in [
                        "definition", "context", "info", "related", "resource", "use in context"
                    ])
                ]
                # Concatenate all relevant columns with labels for clarity
                for _, row in df.iterrows():
                    term = str(row.get(term_col, "")).strip()
                    if not term:
                        continue
                    def_parts = []
                    for c in def_cols:
                        val = str(row.get(c, "")).strip()
                        if val:
                            def_parts.append(f"<b>{c}:</b> {val}")
                    full_definition = "<br>".join(def_parts).strip()
                    if not full_definition:
                        continue
                    entry = {
                        "term": term,
                        "definition": full_definition,
                        "source": os.path.basename(excel_path),
                        "sheet": sheet_name,
                        "type": "Excel",
                    }
                    excel_entries.append(entry)
                    total_rows += 1
            print(f"✅ Added {total_rows} Excel rows from {excel_path}")
            # Merge into main glossary dictionary
            for e in excel_entries:
                norm = normalize_term(e["term"])
                payload = {
                    "term": e["term"],
                    "definition": e["definition"],
                    "sources": [e["source"]],
                    "type": e.get("type", "Excel"),
                    "sheet": e.get("sheet"),
                }
                # Each term+source pair stored uniquely to preserve different definitions
                unique_key = f"{norm}__{os.path.basename(payload['sources'][0])}" if payload.get("sources") else norm
                if unique_key not in all_defs:
                    all_defs[unique_key] = payload
                else:
                    # Avoid duplicate merges — just append any new sources
                    existing = all_defs[unique_key]
                    existing_sources = set(existing.get("sources", []))
                    new_sources = set(payload.get("sources", []))
                    existing["sources"] = list(existing_sources.union(new_sources))
        except Exception as e:
            print(f"⚠️ Failed to process Excel {excel}: {e}")
    # --- 3️⃣ Save combined glossary ---
    os.makedirs(os.path.dirname(LOCAL_GLOSSARY), exist_ok=True)
    with open(LOCAL_GLOSSARY, "w", encoding="utf-8") as f:
        json.dump(all_defs, f, indent=2, ensure_ascii=False)
    print(f"✅ Saved {len(all_defs)} entries → {LOCAL_GLOSSARY}")
    # --- 4️⃣ Upload to Hugging Face ---
    if TOKEN:
        try:
            upload_file(
                path_or_fileobj=LOCAL_GLOSSARY,
                path_in_repo=REMOTE_GLOSSARY,
                repo_id=DATASET_REPO,
                repo_type="dataset",
                token=TOKEN,
                commit_message="Glossary updated with PDF + Excel (column-labeled definitions)",
            )
            print(f"🚀 Uploaded to {DATASET_REPO}/{REMOTE_GLOSSARY}")
        except Exception as e:
            print(f"⚠️ Upload error: {e}")
    print(f"✨ Done in {time.time() - start:.1f}s.")
if __name__ == "__main__":
    rebuild_and_upload()
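# --- Example usage (illustrative; paths and token handling are assumptions, adjust as needed) ---
# As a script, with a Hugging Face token available in the environment:
#   HF_TOKEN=hf_xxx python core/glossary_builder.py
# Or from other application code:
#   from core.glossary_builder import rebuild_and_upload
#   rebuild_and_upload()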