"""
📘 glossary_builder.py
Builds a unified glossary from PDFs, Excel, and Web sources.
- Extracts terms & definitions from PDFs.
- Merges Excel glossaries, labeling each definition fragment with its source column.
- Fetches term-definition pairs from known web glossary sources.
- Adds source typing (pdf/excel/web/other).
- Saves combined glossary.json locally and uploads to Hugging Face.
"""
import os
import re
import json
import time
import fitz
import requests
import pandas as pd
from bs4 import BeautifulSoup
from huggingface_hub import upload_file, HfFolder, list_repo_files, hf_hub_download
# --- Configuration ---
DATASET_REPO = "essprasad/CT-Chat-Index"
DOCS_REPO = "essprasad/CT-Chat-Docs"
LOCAL_GLOSSARY = "/home/user/app/persistent/glossary.json"
REMOTE_GLOSSARY = "persistent/glossary.json"
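# Prefer the token cached by `huggingface-cli login`; fall back to the HF_TOKEN env var.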
TOKEN = HfFolder.get_token() or os.getenv("HF_TOKEN")
# Known web glossary sources (can expand)
WEB_SOURCES = [
"https://mrctcenter.org/glossaryterm/clinical-research/",
"https://www.fda.gov/patients/drug-development-process/step-3-clinical-research",
"https://www.cdisc.org/",
"https://www.ich.org/",
"https://www.ema.europa.eu/",
"https://www.who.int/",
"https://clinicaltrials.gov/",
]
# --- Helpers ---
def normalize_term(term: str) -> str:
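    """Lower-case a term, replace punctuation with spaces, collapse whitespace,
    and map common long forms to their acronyms (e.g. "serious adverse event" -> "sae")."""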
if not term:
return ""
s = term.lower().strip()
s = re.sub(r"[\-_/\\.,;:]+", " ", s)
s = re.sub(r"\s+", " ", s)
synonyms = {
"electronic case report form": "ecrf",
"case report form": "crf",
"informed consent form": "icf",
"good clinical practice": "gcp",
"serious adverse event": "sae",
"adverse event": "ae",
"21 cfr part 11": "21cfrpart11",
"clinical study report": "csr",
}
return synonyms.get(s, s)
def extract_text_from_pdf(pdf_path):
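    """Return the full plain text of a PDF via PyMuPDF (fitz), or "" if the file cannot be read."""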
try:
doc = fitz.open(pdf_path)
text = "\n".join(page.get_text("text") for page in doc)
doc.close()
return text.strip()
except Exception as e:
print(f"⚠️ Failed to read {pdf_path}: {e}")
return ""
def extract_definitions_from_text(text):
"""Extract glossary-like term-definition pairs from raw PDF text."""
glossary = {}
text = re.sub(r"\r", "", text)
lines = [ln.strip() for ln in text.split("\n") if ln.strip()]
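    # Heuristic walk: a short heading-like line is treated as a term, and the
    # lines that follow (until the next term-like line) become its definition.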
i = 0
while i < len(lines):
term = lines[i].strip()
if len(term) == 1 or term.isdigit() or re.fullmatch(r"[ivxlcdm]+", term.lower()):
i += 1
continue
if any(x in term.lower() for x in ["contents", "chapter", "appendix", "index", "revision", "table of"]):
i += 1
continue
if term.lower().startswith(("acronym for", "definition", "terms of")):
i += 1
continue
defn_lines = []
j = i + 1
while j < len(lines):
nxt = lines[j].strip()
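            # A short line (<= 6 words) that starts with a letter and does not end in a
            # period is assumed to begin the next term, unless it is a cross-reference
            # such as "See also ...".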
if re.match(r"^[A-Za-z]", nxt) and len(nxt.split()) <= 6 and not nxt.endswith("."):
if not nxt.lower().startswith(("see also", "for example", "for instance")):
break
defn_lines.append(nxt)
j += 1
definition = " ".join(defn_lines)
definition = re.sub(r"(\w)-\s+(\w)", r"\1\2", definition)
definition = re.sub(r"\s{2,}", " ", definition).strip()
if len(definition.split()) < 5 or "." not in definition:
i += 1
continue
norm = normalize_term(term)
glossary[norm] = {"term": term, "definition": definition}
i = j
return glossary
def detect_source_type(src: str) -> str:
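    """Classify a source string as "pdf", "excel", "web", or "other" based on its extension or URL scheme."""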
if not src:
return "other"
src = src.lower()
if src.endswith(".pdf"):
return "pdf"
if src.endswith((".xlsx", ".xls")):
return "excel"
if src.startswith("http"):
return "web"
return "other"
def extract_web_glossary(url):
"""Scrape simple web glossary or definition pages."""
results = []
try:
print(f"🌐 Fetching {url}...")
resp = requests.get(url, timeout=10)
if resp.status_code != 200:
print(f"⚠️ Skipped {url}: HTTP {resp.status_code}")
return []
soup = BeautifulSoup(resp.text, "html.parser")
text = soup.get_text(separator="\n")
text = re.sub(r"\s{2,}", " ", text).strip()
# Heuristic: pick possible term-definition snippets
matches = re.findall(r"([A-Z][A-Za-z0-9\s]{2,30})[:\-]\s*(.{10,200})", text)
for term, definition in matches[:50]:
definition = re.sub(r"\s{2,}", " ", definition).strip()
if len(definition.split()) > 3:
results.append({
"term": term.strip(),
"definition": definition,
"sources": [url],
"file": url,
"type": "web",
})
except Exception as e:
print(f"⚠️ Web fetch failed for {url}: {e}")
return results
# --- Main Rebuild Function ---
def rebuild_and_upload():
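    """Rebuild the glossary from PDF, Excel, and web sources, write it to
    LOCAL_GLOSSARY, and upload the result to the Hugging Face dataset repo."""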
start = time.time()
print("📘 Starting glossary rebuild...")
try:
all_files = list_repo_files(repo_id=DOCS_REPO, repo_type="dataset", token=TOKEN)
pdfs = [f for f in all_files if f.lower().endswith(".pdf")]
excels = [f for f in all_files if f.lower().endswith((".xls", ".xlsx"))]
except Exception as e:
raise RuntimeError(f"Cannot list repo files: {e}")
all_defs = {}
# --- 1️⃣ Process PDFs ---
    skip_patterns = ["topic_", "template", "protocol", "schedule", "painac", "sas", "glossary_printable"]
    for pdf in pdfs:
        if any(sp in pdf.lower() for sp in skip_patterns):
            print(f"⏩ Skipping non-glossary or template file: {pdf}")
            continue
try:
print(f"🔍 Processing {pdf}...")
path = hf_hub_download(repo_id=DOCS_REPO, filename=pdf, repo_type="dataset", token=TOKEN)
text = extract_text_from_pdf(path)
defs = extract_definitions_from_text(text)
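            # Key entries as "<normalized term>__<file>" so the same term coming
            # from different documents is kept as a separate entry.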
for k, v in defs.items():
v.setdefault("sources", []).append(pdf)
v["type"] = "pdf"
v["file"] = os.path.basename(pdf)
all_defs[f"{k}__{v['file']}"] = v
except Exception as e:
print(f"⚠️ Failed {pdf}: {e}")
# --- 2️⃣ Process Excel Glossaries ---
for excel in excels:
try:
print(f"📗 Checking Excel file in dataset: {excel}")
excel_path = hf_hub_download(repo_id=DOCS_REPO, filename=excel, repo_type="dataset", token=TOKEN)
print(f"✅ Downloaded Excel file to {excel_path}")
xls = pd.read_excel(excel_path, sheet_name=None)
for sheet_name, df in xls.items():
df = df.fillna("").dropna(how="all")
if df.empty:
continue
df.columns = [str(c).strip() for c in df.columns]
term_col = next((c for c in df.columns if "glossary term" in c.lower() or "term" == c.lower()), None)
if not term_col:
continue
                # Definition-like columns are the same for every row in the sheet.
                def_cols = [c for c in df.columns if any(k in c.lower() for k in ["definition", "context", "info", "related", "resource"])]
                for _, row in df.iterrows():
                    term = str(row.get(term_col, "")).strip()
                    if not term:
                        continue
                    def_parts = [f"{c}: {row[c]}" for c in def_cols if str(row[c]).strip()]
if not def_parts:
continue
entry = {
"term": term,
"definition": "
".join(def_parts),
"sources": [os.path.basename(excel_path)],
"file": os.path.basename(excel_path),
"sheet": sheet_name,
"type": "excel",
}
all_defs[f"{normalize_term(term)}__{entry['file']}"] = entry
print(f"✅ Processed Excel file {excel}")
except Exception as e:
print(f"⚠️ Failed Excel {excel}: {e}")
# --- 3️⃣ Process Web Glossaries ---
web_entries = []
for url in WEB_SOURCES:
entries = extract_web_glossary(url)
for e in entries:
key = f"{normalize_term(e['term'])}__{e['file']}"
all_defs[key] = e
web_entries.append(e)
print(f"✅ Added {len(web_entries)} web glossary entries.")
# --- 4️⃣ Cleanup & Save ---
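    # Ensure every entry carries a list-valued "sources" and a source type before saving.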
for k, v in all_defs.items():
v["sources"] = v.get("sources", [])
if not isinstance(v["sources"], list):
v["sources"] = [v["sources"]]
if not v.get("type"):
v["type"] = detect_source_type(v["sources"][0] if v["sources"] else "")
os.makedirs(os.path.dirname(LOCAL_GLOSSARY), exist_ok=True)
with open(LOCAL_GLOSSARY, "w", encoding="utf-8") as f:
json.dump(list(all_defs.values()), f, indent=2, ensure_ascii=False)
print(f"✅ Saved {len(all_defs)} entries → {LOCAL_GLOSSARY}")
# --- 5️⃣ Upload to Hugging Face ---
if TOKEN:
try:
upload_file(
path_or_fileobj=LOCAL_GLOSSARY,
path_in_repo=REMOTE_GLOSSARY,
repo_id=DATASET_REPO,
repo_type="dataset",
token=TOKEN,
commit_message="Glossary updated (PDF + Excel + Web sources)",
)
print(f"🚀 Uploaded to {DATASET_REPO}/{REMOTE_GLOSSARY}")
except Exception as e:
print(f"⚠️ Upload error: {e}")
print(f"✨ Done in {time.time() - start:.1f}s.")
if __name__ == "__main__":
rebuild_and_upload()