Spaces:
Running
Running
| """ | |
| π glossary_builder.py | |
| Builds a unified glossary from PDFs, Excel, and Web sources. | |
| - Extracts terms & definitions from PDFs. | |
| - Merges Excel glossary (with labeled formatting). | |
| - Optionally fetches glossary or definitions from known web sources. | |
| - Adds source typing (pdf/excel/web/other). | |
| - Saves combined glossary.json locally and uploads to Hugging Face. | |
| """ | |
| import os | |
| import re | |
| import json | |
| import time | |
| import fitz | |
| import requests | |
| import pandas as pd | |
| from bs4 import BeautifulSoup | |
| from huggingface_hub import upload_file, HfFolder, list_repo_files, hf_hub_download | |
# --- Configuration ---
DATASET_REPO = "essprasad/CT-Chat-Index"  # HF dataset that receives the built glossary.json
DOCS_REPO = "essprasad/CT-Chat-Docs"  # HF dataset holding the source PDFs / Excel files
LOCAL_GLOSSARY = "/home/user/app/persistent/glossary.json"  # local output path
REMOTE_GLOSSARY = "persistent/glossary.json"  # upload destination inside DATASET_REPO
# Prefer the cached hub token; fall back to the HF_TOKEN environment variable.
TOKEN = HfFolder.get_token() or os.getenv("HF_TOKEN")
# Known web glossary sources (can expand)
WEB_SOURCES = [
    "https://mrctcenter.org/glossaryterm/clinical-research/",
    "https://www.fda.gov/patients/drug-development-process/step-3-clinical-research",
    "https://www.cdisc.org/",
    "https://www.ich.org/",
    "https://www.ema.europa.eu/",
    "https://www.who.int/",
    "https://clinicaltrials.gov/",
]
| # --- Helpers --- | |
def normalize_term(term: str) -> str:
    """Canonicalize a glossary term for use as a lookup key.

    Lowercases and strips the input, collapses punctuation and whitespace
    runs to single spaces, then maps well-known clinical-research phrases
    onto their standard acronyms.

    Returns "" for empty/None input.
    """
    if not term:
        return ""
    canonical = re.sub(r"[\-_/\\.,;:]+", " ", term.lower().strip())
    canonical = re.sub(r"\s+", " ", canonical)
    known_acronyms = {
        "electronic case report form": "ecrf",
        "case report form": "crf",
        "informed consent form": "icf",
        "good clinical practice": "gcp",
        "serious adverse event": "sae",
        "adverse event": "ae",
        "21 cfr part 11": "21cfrpart11",
        "clinical study report": "csr",
    }
    return known_acronyms.get(canonical, canonical)
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in *pdf_path*.

    Args:
        pdf_path: Filesystem path to a PDF readable by PyMuPDF (fitz).

    Returns:
        The stripped page text joined with newlines, or "" on any failure
        (missing file, corrupt PDF) — errors are logged, never raised.
    """
    try:
        # Context manager guarantees the document handle is released even if
        # text extraction raises; the original only closed on the success path.
        with fitz.open(pdf_path) as doc:
            text = "\n".join(page.get_text("text") for page in doc)
        return text.strip()
    except Exception as e:
        print(f"β οΈ Failed to read {pdf_path}: {e}")
        return ""
def extract_definitions_from_text(text):
    """Extract glossary-like term-definition pairs from raw PDF text.

    Heuristic line-based parser: a short line (<= 6 words, starting with a
    letter, not ending in '.') is treated as a candidate term; the lines that
    follow, up to the next candidate term, are joined into its definition.

    Args:
        text: Raw newline-separated text (as produced by extract_text_from_pdf).

    Returns:
        dict mapping normalized term -> {"term": original, "definition": str}.
        Candidates whose definition has fewer than 5 words or no period are
        discarded as noise; later duplicates of a normalized term overwrite
        earlier ones.
    """
    glossary = {}
    text = re.sub(r"\r", "", text)
    lines = [ln.strip() for ln in text.split("\n") if ln.strip()]
    i = 0
    while i < len(lines):
        term = lines[i].strip()
        # Skip single characters, bare page numbers, and roman-numeral labels.
        if len(term) == 1 or term.isdigit() or re.fullmatch(r"[ivxlcdm]+", term.lower()):
            i += 1
            continue
        # Skip structural / front-matter headings (TOC, chapters, appendices...).
        if any(x in term.lower() for x in ["contents", "chapter", "appendix", "index", "revision", "table of"]):
            i += 1
            continue
        # Skip lines that are definition lead-ins rather than terms themselves.
        if term.lower().startswith(("acronym for", "definition", "terms of")):
            i += 1
            continue
        # Accumulate definition lines until the next term-like line appears.
        defn_lines = []
        j = i + 1
        while j < len(lines):
            nxt = lines[j].strip()
            # A short, letter-initial line without a trailing period looks like
            # the next term -- stop, unless it is a continuation phrase.
            if re.match(r"^[A-Za-z]", nxt) and len(nxt.split()) <= 6 and not nxt.endswith("."):
                if not nxt.lower().startswith(("see also", "for example", "for instance")):
                    break
            defn_lines.append(nxt)
            j += 1
        definition = " ".join(defn_lines)
        # Re-join words hyphenated across line breaks ("regu- lation" -> "regulation").
        definition = re.sub(r"(\w)-\s+(\w)", r"\1\2", definition)
        definition = re.sub(r"\s{2,}", " ", definition).strip()
        # Reject definitions that are too short or lack sentence punctuation;
        # note i advances by only 1 so the rejected lines are re-scanned as terms.
        if len(definition.split()) < 5 or "." not in definition:
            i += 1
            continue
        norm = normalize_term(term)
        glossary[norm] = {"term": term, "definition": definition}
        i = j
    return glossary
def detect_source_type(src: str) -> str:
    """Classify a source string as 'pdf', 'excel', 'web' (http/https URL), or 'other'."""
    if not src:
        return "other"
    lowered = src.lower()
    # First matching predicate wins; falls through to "other".
    for predicate, label in (
        (lambda s: s.endswith(".pdf"), "pdf"),
        (lambda s: s.endswith((".xlsx", ".xls")), "excel"),
        (lambda s: s.startswith("http"), "web"),
    ):
        if predicate(lowered):
            return label
    return "other"
def extract_web_glossary(url):
    """Heuristically scrape term/definition pairs from a glossary-like web page.

    Fetches *url*, flattens the page to text, and pattern-matches capitalised
    short phrases followed by ':' or '-' as term/definition candidates
    (first 50 matches only).

    Returns:
        A list of entry dicts (term/definition/sources/file/type="web");
        any network or parse failure yields [] instead of raising.
    """
    entries = []
    try:
        print(f"π Fetching {url}...")
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"β οΈ Skipped {url}: HTTP {response.status_code}")
            return []
        page_text = BeautifulSoup(response.text, "html.parser").get_text(separator="\n")
        page_text = re.sub(r"\s{2,}", " ", page_text).strip()
        # Heuristic: pick possible term-definition snippets
        pairs = re.findall(r"([A-Z][A-Za-z0-9\s]{2,30})[:\-]\s*(.{10,200})", page_text)
        for raw_term, raw_definition in pairs[:50]:
            cleaned = re.sub(r"\s{2,}", " ", raw_definition).strip()
            if len(cleaned.split()) <= 3:
                continue  # too short to be a real definition
            entries.append({
                "term": raw_term.strip(),
                "definition": cleaned,
                "sources": [url],
                "file": url,
                "type": "web",
            })
    except Exception as e:
        print(f"β οΈ Web fetch failed for {url}: {e}")
    return entries
| # --- Main Rebuild Function --- | |
def _collect_pdf_definitions(pdfs, all_defs):
    """Download each candidate glossary PDF and merge its term/definition pairs into *all_defs*."""
    # Hoisted out of the loop (the original rebuilt this list on every iteration).
    skip_patterns = ["topic_", "template", "protocol", "schedule", "painac", "sas", "glossary_printable"]
    for pdf in pdfs:
        if any(sp in pdf.lower() for sp in skip_patterns):
            print(f"β© Skipping non-glossary or template file: {pdf}")
            continue
        try:
            print(f"π Processing {pdf}...")
            path = hf_hub_download(repo_id=DOCS_REPO, filename=pdf, repo_type="dataset", token=TOKEN)
            defs = extract_definitions_from_text(extract_text_from_pdf(path))
            for k, v in defs.items():
                v.setdefault("sources", []).append(pdf)
                v["type"] = "pdf"
                v["file"] = os.path.basename(pdf)
                # Key on term + file so the same term from different PDFs survives.
                all_defs[f"{k}__{v['file']}"] = v
        except Exception as e:
            print(f"β οΈ Failed {pdf}: {e}")


def _collect_excel_definitions(excels, all_defs):
    """Merge every row of every sheet of each Excel glossary into *all_defs*."""
    for excel in excels:
        try:
            print(f"π Checking Excel file in dataset: {excel}")
            excel_path = hf_hub_download(repo_id=DOCS_REPO, filename=excel, repo_type="dataset", token=TOKEN)
            print(f"β Downloaded Excel file to {excel_path}")
            xls = pd.read_excel(excel_path, sheet_name=None)
            for sheet_name, df in xls.items():
                # BUGFIX: drop all-empty rows BEFORE filling NaNs with "" --
                # the original called fillna("") first, which made
                # dropna(how="all") a no-op.
                df = df.dropna(how="all").fillna("")
                if df.empty:
                    continue
                df.columns = [str(c).strip() for c in df.columns]
                term_col = next((c for c in df.columns if "glossary term" in c.lower() or "term" == c.lower()), None)
                if not term_col:
                    continue
                # Definition-like columns are identical for every row; compute once per sheet.
                def_cols = [c for c in df.columns if any(k in c.lower() for k in ["definition", "context", "info", "related", "resource"])]
                for _, row in df.iterrows():
                    term = str(row.get(term_col, "")).strip()
                    if not term:
                        continue
                    def_parts = [f"<b>{c}:</b> {row[c]}" for c in def_cols if str(row[c]).strip()]
                    if not def_parts:
                        continue
                    entry = {
                        "term": term,
                        "definition": "<br>".join(def_parts),
                        "sources": [os.path.basename(excel_path)],
                        "file": os.path.basename(excel_path),
                        "sheet": sheet_name,
                        "type": "excel",
                    }
                    all_defs[f"{normalize_term(term)}__{entry['file']}"] = entry
            print(f"β Processed Excel file {excel}")
        except Exception as e:
            print(f"β οΈ Failed Excel {excel}: {e}")


def _collect_web_definitions(all_defs):
    """Scrape every configured WEB_SOURCES page and merge its entries into *all_defs*."""
    web_entries = []
    for url in WEB_SOURCES:
        for e in extract_web_glossary(url):
            all_defs[f"{normalize_term(e['term'])}__{e['file']}"] = e
            web_entries.append(e)
    print(f"β Added {len(web_entries)} web glossary entries.")


def _finalize_entries(all_defs):
    """Normalize 'sources' to a list and backfill a missing 'type' on each entry."""
    for v in all_defs.values():
        v["sources"] = v.get("sources", [])
        if not isinstance(v["sources"], list):
            v["sources"] = [v["sources"]]
        if not v.get("type"):
            v["type"] = detect_source_type(v["sources"][0] if v["sources"] else "")


def rebuild_and_upload():
    """Rebuild the combined glossary (PDF + Excel + web) and upload it to Hugging Face.

    Saves the merged entries to LOCAL_GLOSSARY as a JSON list and, when a
    token is available, uploads it to DATASET_REPO/REMOTE_GLOSSARY.

    Raises:
        RuntimeError: if the source document repo cannot be listed.
    """
    start = time.time()
    print("π Starting glossary rebuild...")
    try:
        all_files = list_repo_files(repo_id=DOCS_REPO, repo_type="dataset", token=TOKEN)
        pdfs = [f for f in all_files if f.lower().endswith(".pdf")]
        excels = [f for f in all_files if f.lower().endswith((".xls", ".xlsx"))]
    except Exception as e:
        raise RuntimeError(f"Cannot list repo files: {e}")
    all_defs = {}
    # 1) PDFs, 2) Excel workbooks, 3) web sources, 4) cleanup.
    _collect_pdf_definitions(pdfs, all_defs)
    _collect_excel_definitions(excels, all_defs)
    _collect_web_definitions(all_defs)
    _finalize_entries(all_defs)
    # Save locally.
    os.makedirs(os.path.dirname(LOCAL_GLOSSARY), exist_ok=True)
    with open(LOCAL_GLOSSARY, "w", encoding="utf-8") as f:
        json.dump(list(all_defs.values()), f, indent=2, ensure_ascii=False)
    print(f"β Saved {len(all_defs)} entries β {LOCAL_GLOSSARY}")
    # Upload to Hugging Face (best-effort: failures are logged, not raised).
    if TOKEN:
        try:
            upload_file(
                path_or_fileobj=LOCAL_GLOSSARY,
                path_in_repo=REMOTE_GLOSSARY,
                repo_id=DATASET_REPO,
                repo_type="dataset",
                token=TOKEN,
                commit_message="Glossary updated (PDF + Excel + Web sources)",
            )
            print(f"π Uploaded to {DATASET_REPO}/{REMOTE_GLOSSARY}")
        except Exception as e:
            print(f"β οΈ Upload error: {e}")
    print(f"β¨ Done in {time.time() - start:.1f}s.")
# Allow running this module as a standalone script.
if __name__ == "__main__":
    rebuild_and_upload()