""" 📘 glossary_builder.py Builds a unified glossary from PDFs, Excel, and Web sources. - Extracts terms & definitions from PDFs. - Merges Excel glossary (with labeled formatting). - Optionally fetches glossary or definitions from known web sources. - Adds source typing (pdf/excel/web/other). - Saves combined glossary.json locally and uploads to Hugging Face. """ import os import re import json import time import fitz import requests import pandas as pd from bs4 import BeautifulSoup from huggingface_hub import upload_file, HfFolder, list_repo_files, hf_hub_download # --- Configuration --- DATASET_REPO = "essprasad/CT-Chat-Index" DOCS_REPO = "essprasad/CT-Chat-Docs" LOCAL_GLOSSARY = "/home/user/app/persistent/glossary.json" REMOTE_GLOSSARY = "persistent/glossary.json" TOKEN = HfFolder.get_token() or os.getenv("HF_TOKEN") # Known web glossary sources (can expand) WEB_SOURCES = [ "https://mrctcenter.org/glossaryterm/clinical-research/", "https://www.fda.gov/patients/drug-development-process/step-3-clinical-research", "https://www.cdisc.org/", "https://www.ich.org/", "https://www.ema.europa.eu/", "https://www.who.int/", "https://clinicaltrials.gov/", ] # --- Helpers --- def normalize_term(term: str) -> str: if not term: return "" s = term.lower().strip() s = re.sub(r"[\-_/\\.,;:]+", " ", s) s = re.sub(r"\s+", " ", s) synonyms = { "electronic case report form": "ecrf", "case report form": "crf", "informed consent form": "icf", "good clinical practice": "gcp", "serious adverse event": "sae", "adverse event": "ae", "21 cfr part 11": "21cfrpart11", "clinical study report": "csr", } return synonyms.get(s, s) def extract_text_from_pdf(pdf_path): try: doc = fitz.open(pdf_path) text = "\n".join(page.get_text("text") for page in doc) doc.close() return text.strip() except Exception as e: print(f"⚠️ Failed to read {pdf_path}: {e}") return "" def extract_definitions_from_text(text): """Extract glossary-like term-definition pairs from raw PDF text.""" glossary = {} text = re.sub(r"\r", "", text) lines = [ln.strip() for ln in text.split("\n") if ln.strip()] i = 0 while i < len(lines): term = lines[i].strip() if len(term) == 1 or term.isdigit() or re.fullmatch(r"[ivxlcdm]+", term.lower()): i += 1 continue if any(x in term.lower() for x in ["contents", "chapter", "appendix", "index", "revision", "table of"]): i += 1 continue if term.lower().startswith(("acronym for", "definition", "terms of")): i += 1 continue defn_lines = [] j = i + 1 while j < len(lines): nxt = lines[j].strip() if re.match(r"^[A-Za-z]", nxt) and len(nxt.split()) <= 6 and not nxt.endswith("."): if not nxt.lower().startswith(("see also", "for example", "for instance")): break defn_lines.append(nxt) j += 1 definition = " ".join(defn_lines) definition = re.sub(r"(\w)-\s+(\w)", r"\1\2", definition) definition = re.sub(r"\s{2,}", " ", definition).strip() if len(definition.split()) < 5 or "." not in definition: i += 1 continue norm = normalize_term(term) glossary[norm] = {"term": term, "definition": definition} i = j return glossary def detect_source_type(src: str) -> str: if not src: return "other" src = src.lower() if src.endswith(".pdf"): return "pdf" if src.endswith((".xlsx", ".xls")): return "excel" if src.startswith("http"): return "web" return "other" def extract_web_glossary(url): """Scrape simple web glossary or definition pages.""" results = [] try: print(f"🌐 Fetching {url}...") resp = requests.get(url, timeout=10) if resp.status_code != 200: print(f"⚠️ Skipped {url}: HTTP {resp.status_code}") return [] soup = BeautifulSoup(resp.text, "html.parser") text = soup.get_text(separator="\n") text = re.sub(r"\s{2,}", " ", text).strip() # Heuristic: pick possible term-definition snippets matches = re.findall(r"([A-Z][A-Za-z0-9\s]{2,30})[:\-]\s*(.{10,200})", text) for term, definition in matches[:50]: definition = re.sub(r"\s{2,}", " ", definition).strip() if len(definition.split()) > 3: results.append({ "term": term.strip(), "definition": definition, "sources": [url], "file": url, "type": "web", }) except Exception as e: print(f"⚠️ Web fetch failed for {url}: {e}") return results # --- Main Rebuild Function --- def rebuild_and_upload(): start = time.time() print("📘 Starting glossary rebuild...") try: all_files = list_repo_files(repo_id=DOCS_REPO, repo_type="dataset", token=TOKEN) pdfs = [f for f in all_files if f.lower().endswith(".pdf")] excels = [f for f in all_files if f.lower().endswith((".xls", ".xlsx"))] except Exception as e: raise RuntimeError(f"Cannot list repo files: {e}") all_defs = {} # --- 1️⃣ Process PDFs --- for pdf in pdfs: skip_patterns = ["topic_", "template", "protocol", "schedule", "painac", "sas", "glossary_printable"] if any(sp in pdf.lower() for sp in skip_patterns): print(f"⏩ Skipping non-glossary or template file: {pdf}") continue try: print(f"🔍 Processing {pdf}...") path = hf_hub_download(repo_id=DOCS_REPO, filename=pdf, repo_type="dataset", token=TOKEN) text = extract_text_from_pdf(path) defs = extract_definitions_from_text(text) for k, v in defs.items(): v.setdefault("sources", []).append(pdf) v["type"] = "pdf" v["file"] = os.path.basename(pdf) all_defs[f"{k}__{v['file']}"] = v except Exception as e: print(f"⚠️ Failed {pdf}: {e}") # --- 2️⃣ Process Excel Glossaries --- for excel in excels: try: print(f"📗 Checking Excel file in dataset: {excel}") excel_path = hf_hub_download(repo_id=DOCS_REPO, filename=excel, repo_type="dataset", token=TOKEN) print(f"✅ Downloaded Excel file to {excel_path}") xls = pd.read_excel(excel_path, sheet_name=None) for sheet_name, df in xls.items(): df = df.fillna("").dropna(how="all") if df.empty: continue df.columns = [str(c).strip() for c in df.columns] term_col = next((c for c in df.columns if "glossary term" in c.lower() or "term" == c.lower()), None) if not term_col: continue for _, row in df.iterrows(): term = str(row.get(term_col, "")).strip() if not term: continue def_cols = [c for c in df.columns if any(k in c.lower() for k in ["definition", "context", "info", "related", "resource"])] def_parts = [f"{c}: {row[c]}" for c in def_cols if str(row[c]).strip()] if not def_parts: continue entry = { "term": term, "definition": "
".join(def_parts), "sources": [os.path.basename(excel_path)], "file": os.path.basename(excel_path), "sheet": sheet_name, "type": "excel", } all_defs[f"{normalize_term(term)}__{entry['file']}"] = entry print(f"✅ Processed Excel file {excel}") except Exception as e: print(f"⚠️ Failed Excel {excel}: {e}") # --- 3️⃣ Process Web Glossaries --- web_entries = [] for url in WEB_SOURCES: entries = extract_web_glossary(url) for e in entries: key = f"{normalize_term(e['term'])}__{e['file']}" all_defs[key] = e web_entries.append(e) print(f"✅ Added {len(web_entries)} web glossary entries.") # --- 4️⃣ Cleanup & Save --- for k, v in all_defs.items(): v["sources"] = v.get("sources", []) if not isinstance(v["sources"], list): v["sources"] = [v["sources"]] if not v.get("type"): v["type"] = detect_source_type(v["sources"][0] if v["sources"] else "") os.makedirs(os.path.dirname(LOCAL_GLOSSARY), exist_ok=True) with open(LOCAL_GLOSSARY, "w", encoding="utf-8") as f: json.dump(list(all_defs.values()), f, indent=2, ensure_ascii=False) print(f"✅ Saved {len(all_defs)} entries → {LOCAL_GLOSSARY}") # --- 5️⃣ Upload to Hugging Face --- if TOKEN: try: upload_file( path_or_fileobj=LOCAL_GLOSSARY, path_in_repo=REMOTE_GLOSSARY, repo_id=DATASET_REPO, repo_type="dataset", token=TOKEN, commit_message="Glossary updated (PDF + Excel + Web sources)", ) print(f"🚀 Uploaded to {DATASET_REPO}/{REMOTE_GLOSSARY}") except Exception as e: print(f"⚠️ Upload error: {e}") print(f"✨ Done in {time.time() - start:.1f}s.") if __name__ == "__main__": rebuild_and_upload()