| """ | |
| 📘 glossary_builder.py | |
| Builds a unified glossary from PDFs and Excel files. | |
| - Extracts terms & definitions from PDFs. | |
| - Merges Excel glossary (uses all definition-related columns with labeled formatting). | |
| - Saves combined glossary.json locally and uploads to Hugging Face. | |
| """ | |
import os
import re
import json
import time

import fitz
import pandas as pd
from huggingface_hub import upload_file, HfFolder, list_repo_files, hf_hub_download

# --- Configuration ---
DATASET_REPO = "essprasad/CT-Chat-Index"
DOCS_REPO = "essprasad/CT-Chat-Docs"
LOCAL_GLOSSARY = "/home/user/app/persistent/glossary.json"
REMOTE_GLOSSARY = "persistent/glossary.json"
TOKEN = HfFolder.get_token() or os.getenv("HF_TOKEN")

# --- Helpers ---
def normalize_term(term: str) -> str:
    """Normalize a term: lowercase, collapse punctuation/whitespace, map known synonyms."""
    if not term:
        return ""
    s = term.lower().strip()
    s = re.sub(r"[\-_/\\.,;:]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    synonyms = {
        "electronic case report form": "ecrf",
        "case report form": "crf",
        "informed consent form": "icf",
        "good clinical practice": "gcp",
        "serious adverse event": "sae",
        "adverse event": "ae",
        "21 cfr part 11": "21cfrpart11",
        "clinical study report": "csr",
    }
    return synonyms.get(s, s)
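
# For example, with the rules above, normalize_term("Serious Adverse Event") yields "sae",
# while normalize_term("e-CRF") yields "e crf": only exact matches against the synonym map
# are collapsed to their abbreviation; everything else is just lowercased and cleaned.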

def extract_text_from_pdf(pdf_path):
    """Extract plain text from a PDF file."""
    try:
        doc = fitz.open(pdf_path)
        text = "\n".join(page.get_text("text") for page in doc)
        doc.close()
        return text.strip()
    except Exception as e:
        print(f"⚠️ Failed to read {pdf_path}: {e}")
        return ""

def extract_definitions_from_text(text):
    """Extract glossary-like term-definition pairs from raw PDF text."""
    glossary = {}
    text = re.sub(r"\r", "", text)
    lines = [ln.strip() for ln in text.split("\n") if ln.strip()]
    i = 0
    while i < len(lines):
        term = lines[i].strip()
        # Skip single characters, page numbers, and roman numerals (front-matter noise).
        if len(term) == 1 or term.isdigit() or re.fullmatch(r"[ivxlcdm]+", term.lower()):
            i += 1
            continue
        # Skip table-of-contents and structural headings.
        if any(x in term.lower() for x in ["contents", "chapter", "appendix", "index", "revision", "table of"]):
            i += 1
            continue
        if term.lower().startswith(("acronym for", "definition", "terms of")):
            i += 1
            continue
        # Collect the following lines as the definition until the next term-like line
        # (short, starts with a letter, no trailing period) is reached.
        defn_lines = []
        j = i + 1
        while j < len(lines):
            nxt = lines[j].strip()
            if re.match(r"^[A-Za-z]", nxt) and len(nxt.split()) <= 6 and not nxt.endswith("."):
                if not nxt.lower().startswith(("see also", "for example", "for instance")):
                    break
            defn_lines.append(nxt)
            j += 1
        definition = " ".join(defn_lines)
        definition = re.sub(r"(\w)-\s+(\w)", r"\1\2", definition)  # re-join words hyphenated across line breaks
        definition = re.sub(r"\s{2,}", " ", definition).strip()
        # Keep only plausible definitions: at least five words and a full sentence.
        if len(definition.split()) < 5 or "." not in definition:
            i += 1
            continue
        norm = normalize_term(term)
        glossary[norm] = {"term": term, "definition": definition}
        i = j
    return glossary
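
# Shape of the mapping returned above (placeholder values): keys are normalized terms, e.g.
#   {"sae": {"term": "Serious Adverse Event", "definition": "<sentences copied from the PDF>"}}
# The "sources" list is attached later, when rebuild_and_upload() merges the per-PDF results.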

# --- Main Rebuild Function ---
def rebuild_and_upload():
    start = time.time()
    print("📘 Starting glossary rebuild...")
    try:
        all_files = list_repo_files(repo_id=DOCS_REPO, repo_type="dataset", token=TOKEN)
        pdfs = [f for f in all_files if f.lower().endswith(".pdf")]
        excels = [f for f in all_files if f.lower().endswith((".xls", ".xlsx"))]
    except Exception as e:
        raise RuntimeError(f"Cannot list repo files: {e}")

    all_defs = {}

    # --- 1️⃣ Process PDFs ---
    for pdf in pdfs:
        skip_patterns = ["topic_", "template", "protocol", "schedule", "painac", "sas", "glossary_printable"]
        if any(sp in pdf.lower() for sp in skip_patterns):
            print(f"⏩ Skipping non-glossary or template file: {pdf}")
            continue
        try:
            print(f"🔍 Processing {pdf}...")
            path = hf_hub_download(repo_id=DOCS_REPO, filename=pdf, repo_type="dataset", token=TOKEN)
            text = extract_text_from_pdf(path)
            defs = extract_definitions_from_text(text)
            for k, v in defs.items():
                v.setdefault("sources", []).append(pdf)
                if k in all_defs:
                    all_defs[k]["sources"] = list(set(all_defs[k].get("sources", []) + v["sources"]))
                else:
                    all_defs[k] = v
        except Exception as e:
            print(f"⚠️ Failed {pdf}: {e}")
    # --- 2️⃣ Process Excel Glossaries (MRCT etc.) ---
    for excel in excels:
        try:
            print(f"📗 Checking Excel file in dataset: {excel}")
            excel_path = hf_hub_download(
                repo_id=DOCS_REPO,
                filename=excel,
                repo_type="dataset",
                token=TOKEN,
            )
            print(f"✅ Downloaded Excel file to {excel_path}")
            xls = pd.read_excel(excel_path, sheet_name=None)
            total_rows = 0
            excel_entries = []
            for sheet_name, df in xls.items():
                # Drop fully empty rows first, then blank out any remaining NaNs.
                df = df.dropna(how="all").fillna("")
                if df.shape[0] == 0:
                    continue
                df.columns = [str(c).strip() for c in df.columns]

                # 🧠 Detect the correct 'Glossary Term' column
                term_col = None
                for c in df.columns:
                    if "glossary term" in c.lower():
                        term_col = c
                        break
                if not term_col:
                    for c in df.columns:
                        if "cdisc" in c.lower() or c.lower() == "term":
                            term_col = c
                            break
                if not term_col:
                    print(f"⚠️ No 'Glossary Term' column found in sheet {sheet_name} of {excel_path}")
                    continue

                # Concatenate all relevant columns with labels for clarity
                for _, row in df.iterrows():
                    term = str(row.get(term_col, "")).strip()
                    if not term:
                        continue
                    def_cols = [
                        c for c in df.columns
                        if any(k in c.lower() for k in [
                            "definition", "context", "info", "related", "resource", "use in context"
                        ])
                    ]
                    def_parts = []
                    for c in def_cols:
                        val = str(row.get(c, "")).strip()
                        if val:
                            def_parts.append(f"<b>{c}:</b> {val}")
                    full_definition = "<br>".join(def_parts).strip()
                    if not full_definition:
                        continue
                    entry = {
                        "term": term,
                        "definition": full_definition,
                        "source": os.path.basename(excel_path),
                        "sheet": sheet_name,
                        "type": "Excel",
                    }
                    excel_entries.append(entry)
                    total_rows += 1

            print(f"✅ Added {total_rows} Excel rows from {excel_path}")

            # Merge into main glossary dictionary
            for e in excel_entries:
                norm = normalize_term(e["term"])
                payload = {
                    "term": e["term"],
                    "definition": e["definition"],
                    "sources": [e["source"]],
                    "type": e.get("type", "Excel"),
                    "sheet": e.get("sheet"),
                }
                # Each term+source pair stored uniquely to preserve different definitions
                unique_key = f"{norm}__{os.path.basename(payload['sources'][0])}" if payload.get("sources") else norm
                if unique_key not in all_defs:
                    all_defs[unique_key] = payload
                else:
                    # Avoid duplicate merges; just append any new sources
                    existing = all_defs[unique_key]
                    existing_sources = set(existing.get("sources", []))
                    new_sources = set(payload.get("sources", []))
                    existing["sources"] = list(existing_sources.union(new_sources))
        except Exception as e:
            print(f"⚠️ Failed to process Excel {excel}: {e}")
    # --- 3️⃣ Save combined glossary ---
    os.makedirs(os.path.dirname(LOCAL_GLOSSARY), exist_ok=True)
    with open(LOCAL_GLOSSARY, "w", encoding="utf-8") as f:
        json.dump(all_defs, f, indent=2, ensure_ascii=False)
    print(f"✅ Saved {len(all_defs)} entries → {LOCAL_GLOSSARY}")

    # --- 4️⃣ Upload to Hugging Face ---
    if TOKEN:
        try:
            upload_file(
                path_or_fileobj=LOCAL_GLOSSARY,
                path_in_repo=REMOTE_GLOSSARY,
                repo_id=DATASET_REPO,
                repo_type="dataset",
                token=TOKEN,
                commit_message="Glossary updated with PDF + Excel (column-labeled definitions)",
            )
            print(f"🚀 Uploaded to {DATASET_REPO}/{REMOTE_GLOSSARY}")
        except Exception as e:
            print(f"⚠️ Upload error: {e}")

    print(f"✨ Done in {time.time() - start:.1f}s.")


if __name__ == "__main__":
    rebuild_and_upload()
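
# Rough shape of the resulting glossary.json (placeholder values; field names as built above):
# {
#   "sae": {"term": "Serious Adverse Event", "definition": "...", "sources": ["<some glossary>.pdf"]},
#   "sae__<workbook>.xlsx": {"term": "Serious Adverse Event",
#                            "definition": "<b>Definition:</b> ...",
#                            "sources": ["<workbook>.xlsx"], "type": "Excel", "sheet": "<sheet name>"}
# }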