"""
📘 glossary_builder.py
Builds a unified glossary from PDFs and Excel files.
- Extracts terms & definitions from PDFs.
- Merges Excel glossary (uses all definition-related columns with labeled formatting).
- Saves combined glossary.json locally and uploads to Hugging Face.
"""

import os
import re
import json
import time
import fitz  # PyMuPDF
import pandas as pd
from huggingface_hub import upload_file, HfFolder, list_repo_files, hf_hub_download

# --- Configuration ---
DATASET_REPO = "essprasad/CT-Chat-Index"
DOCS_REPO = "essprasad/CT-Chat-Docs"
LOCAL_GLOSSARY = "/home/user/app/persistent/glossary.json"
REMOTE_GLOSSARY = "persistent/glossary.json"
TOKEN = HfFolder.get_token() or os.getenv("HF_TOKEN")


# --- Helpers ---
def normalize_term(term: str) -> str:
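    """Lowercase, collapse punctuation and whitespace, and fold known synonyms to a canonical key."""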
    if not term:
        return ""
    s = term.lower().strip()
    s = re.sub(r"[\-_/\\.,;:]+", " ", s)
    s = re.sub(r"\s+", " ", s)
    synonyms = {
        "electronic case report form": "ecrf",
        "case report form": "crf",
        "informed consent form": "icf",
        "good clinical practice": "gcp",
        "serious adverse event": "sae",
        "adverse event": "ae",
        "21 cfr part 11": "21cfrpart11",
        "clinical study report": "csr",
    }
    return synonyms.get(s, s)


def extract_text_from_pdf(pdf_path):
    """Extract plain text from a PDF file."""
    try:
        doc = fitz.open(pdf_path)
        text = "\n".join(page.get_text("text") for page in doc)
        doc.close()
        return text.strip()
    except Exception as e:
        print(f"⚠️ Failed to read {pdf_path}: {e}")
        return ""


def extract_definitions_from_text(text):
    """Extract glossary-like term-definition pairs from raw PDF text."""
    glossary = {}
    text = re.sub(r"\r", "", text)
    lines = [ln.strip() for ln in text.split("\n") if ln.strip()]
    i = 0
    while i < len(lines):
        term = lines[i].strip()
        if len(term) == 1 or term.isdigit() or re.fullmatch(r"[ivxlcdm]+", term.lower()):
            i += 1
            continue
        if any(x in term.lower() for x in ["contents", "chapter", "appendix", "index", "revision", "table of"]):
            i += 1
            continue
        if term.lower().startswith(("acronym for", "definition", "terms of")):
            i += 1
            continue

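        # Collect definition lines until the next probable term: a short line
        # (six words or fewer) that starts with a letter and has no trailing period.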
        defn_lines = []
        j = i + 1
        while j < len(lines):
            nxt = lines[j].strip()
            if re.match(r"^[A-Za-z]", nxt) and len(nxt.split()) <= 6 and not nxt.endswith("."):
                if not nxt.lower().startswith(("see also", "for example", "for instance")):
                    break
            defn_lines.append(nxt)
            j += 1

        definition = " ".join(defn_lines)
        definition = re.sub(r"(\w)-\s+(\w)", r"\1\2", definition)
        definition = re.sub(r"\s{2,}", " ", definition).strip()

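        # Keep only substantive definitions: at least five words and a sentence-ending period.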
        if len(definition.split()) < 5 or "." not in definition:
            i += 1
            continue

        norm = normalize_term(term)
        glossary[norm] = {"term": term, "definition": definition}
        i = j

    return glossary


# --- Main Rebuild Function ---
def rebuild_and_upload():
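    """Rebuild glossary.json from the PDFs and Excel files in DOCS_REPO and upload it to DATASET_REPO."""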
    start = time.time()
    print("πŸ“˜ Starting glossary rebuild...")
    try:
        all_files = list_repo_files(repo_id=DOCS_REPO, repo_type="dataset", token=TOKEN)
        pdfs = [f for f in all_files if f.lower().endswith(".pdf")]
        excels = [f for f in all_files if f.lower().endswith((".xls", ".xlsx"))]
    except Exception as e:
        raise RuntimeError(f"Cannot list repo files: {e}")

    all_defs = {}
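    # Illustrative entry shapes (not real data):
    #   PDF-derived:   "crf": {"term": "CRF", "definition": "...", "sources": ["file.pdf"]}
    #   Excel-derived: "crf__file.xlsx": {"term": "CRF", "definition": "<b>Definition:</b> ...",
    #                                     "sources": ["file.xlsx"], "type": "Excel", "sheet": "Sheet1"}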

    # --- 1️⃣ Process PDFs ---
    skip_patterns = ["topic_", "template", "protocol", "schedule", "painac", "sas", "glossary_printable"]
    for pdf in pdfs:
        if any(sp in pdf.lower() for sp in skip_patterns):
            print(f"⏩ Skipping non-glossary or template file: {pdf}")
            continue
        try:
            print(f"πŸ” Processing {pdf}...")
            path = hf_hub_download(repo_id=DOCS_REPO, filename=pdf, repo_type="dataset", token=TOKEN)
            text = extract_text_from_pdf(path)
            defs = extract_definitions_from_text(text)
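            # Merge into the global dict; a term seen in several PDFs keeps every source file.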
            for k, v in defs.items():
                v.setdefault("sources", []).append(pdf)
                if k in all_defs:
                    all_defs[k]["sources"] = list(set(all_defs[k].get("sources", []) + v["sources"]))
                else:
                    all_defs[k] = v
        except Exception as e:
            print(f"⚠️ Failed {pdf}: {e}")

    # --- 2️⃣ Process Excel Glossaries (MRCT etc.) ---
    for excel in excels:
        try:
            print(f"πŸ“— Checking Excel file in dataset: {excel}")
            excel_path = hf_hub_download(
                repo_id=DOCS_REPO,
                filename=excel,
                repo_type="dataset",
                token=TOKEN
            )
            print(f"βœ… Downloaded Excel file to {excel_path}")
            xls = pd.read_excel(excel_path, sheet_name=None)

            total_rows = 0
            excel_entries = []

            for sheet_name, df in xls.items():
                df = df.fillna("").dropna(how="all")
                if df.shape[0] == 0:
                    continue
                df.columns = [str(c).strip() for c in df.columns]

                # 🧠 Detect the correct 'Glossary Term' column
                term_col = None
                for c in df.columns:
                    if "glossary term" in c.lower():
                        term_col = c
                        break
                if not term_col:
                    for c in df.columns:
                        if "cdisc" in c.lower() or c.lower() == "term":
                            term_col = c
                            break
                if not term_col:
                    print(f"⚠️ No 'Glossary Term' column found in sheet {sheet_name} of {excel_path}")
                    continue

                # Definition-related columns are the same for every row; find them once.
                def_cols = [
                    c for c in df.columns
                    if any(k in c.lower() for k in [
                        "definition", "context", "info", "related", "resource", "use in context"
                    ])
                ]

                # Concatenate the relevant columns with labels for clarity.
                for _, row in df.iterrows():
                    term = str(row.get(term_col, "")).strip()
                    if not term:
                        continue

                    def_parts = []
                    for c in def_cols:
                        val = str(row.get(c, "")).strip()
                        if val:
                            def_parts.append(f"<b>{c}:</b> {val}")

                    full_definition = "<br>".join(def_parts).strip()
                    if not full_definition:
                        continue

                    entry = {
                        "term": term,
                        "definition": full_definition,
                        "source": os.path.basename(excel_path),
                        "sheet": sheet_name,
                        "type": "Excel",
                    }
                    excel_entries.append(entry)
                    total_rows += 1

            print(f"βœ… Added {total_rows} Excel rows from {excel_path}")

            # Merge into main glossary dictionary
            for e in excel_entries:
                norm = normalize_term(e["term"])
                payload = {
                    "term": e["term"],
                    "definition": e["definition"],
                    "sources": [e["source"]],
                    "type": e.get("type", "Excel"),
                    "sheet": e.get("sheet"),
                }
                # Each term+source pair is stored under its own key so that
                # differing definitions of the same term are preserved.
                unique_key = f"{norm}__{os.path.basename(payload['sources'][0])}" if payload.get("sources") else norm

                if unique_key not in all_defs:
                    all_defs[unique_key] = payload
                else:
                    # Avoid duplicate merges: just union in any new sources.
                    existing = all_defs[unique_key]
                    existing_sources = set(existing.get("sources", []))
                    new_sources = set(payload.get("sources", []))
                    existing["sources"] = list(existing_sources.union(new_sources))
                    
        except Exception as e:
            print(f"⚠️ Failed to process Excel {excel}: {e}")

    # --- 3️⃣ Save combined glossary ---
    os.makedirs(os.path.dirname(LOCAL_GLOSSARY), exist_ok=True)
    with open(LOCAL_GLOSSARY, "w", encoding="utf-8") as f:
        json.dump(all_defs, f, indent=2, ensure_ascii=False)

    print(f"βœ… Saved {len(all_defs)} entries β†’ {LOCAL_GLOSSARY}")

    # --- 4️⃣ Upload to Hugging Face ---
    if TOKEN:
        try:
            upload_file(
                path_or_fileobj=LOCAL_GLOSSARY,
                path_in_repo=REMOTE_GLOSSARY,
                repo_id=DATASET_REPO,
                repo_type="dataset",
                token=TOKEN,
                commit_message="Glossary updated with PDF + Excel (column-labeled definitions)",
            )
            print(f"πŸš€ Uploaded to {DATASET_REPO}/{REMOTE_GLOSSARY}")
        except Exception as e:
            print(f"⚠️ Upload error: {e}")

    print(f"✨ Done in {time.time() - start:.1f}s.")


if __name__ == "__main__":
    rebuild_and_upload()
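# Usage sketch (assumes a Hugging Face token is available via `huggingface-cli login`
# or the HF_TOKEN environment variable, and write access to the repos above):
#   python glossary_builder.py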