"""
📘 glossary_builder.py
Builds a unified glossary from PDFs and Excel files.
- Extracts terms & definitions from PDFs.
- Merges Excel glossary (uses all definition-related columns with labeled formatting).
- Saves combined glossary.json locally and uploads to Hugging Face.
"""

import os
import re
import json
import time
import fitz  # PyMuPDF
import pandas as pd
from huggingface_hub import upload_file, HfFolder, list_repo_files, hf_hub_download

# --- Configuration ---
DATASET_REPO = "essprasad/CT-Chat-Index"
DOCS_REPO = "essprasad/CT-Chat-Docs"
LOCAL_GLOSSARY = "/home/user/app/persistent/glossary.json"
REMOTE_GLOSSARY = "persistent/glossary.json"
TOKEN = HfFolder.get_token() or os.getenv("HF_TOKEN")


# --- Helpers ---
def normalize_term(term: str) -> str:
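    """Lowercase, collapse punctuation and whitespace, and fold known synonyms to a canonical key."""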
    if not term:
        return ""
    s = term.lower().strip()
    s = re.sub(r"[\-_/\\.,;:]+", " ", s)
    s = re.sub(r"\s+", " ", s)
    synonyms = {
        "electronic case report form": "ecrf",
        "case report form": "crf",
        "informed consent form": "icf",
        "good clinical practice": "gcp",
        "serious adverse event": "sae",
        "adverse event": "ae",
        "21 cfr part 11": "21cfrpart11",
        "clinical study report": "csr",
    }
    return synonyms.get(s, s)


def extract_text_from_pdf(pdf_path):
    """Extract plain text from a PDF file."""
    try:
        doc = fitz.open(pdf_path)
        text = "\n".join(page.get_text("text") for page in doc)
        doc.close()
        return text.strip()
    except Exception as e:
        print(f"⚠️ Failed to read {pdf_path}: {e}")
        return ""


def extract_definitions_from_text(text):
    """Extract glossary-like term-definition pairs from raw PDF text."""
    glossary = {}
    text = re.sub(r"\r", "", text)
    lines = [ln.strip() for ln in text.split("\n") if ln.strip()]
    i = 0
    while i < len(lines):
        term = lines[i].strip()
        if len(term) == 1 or term.isdigit() or re.fullmatch(r"[ivxlcdm]+", term.lower()):
            i += 1
            continue
        if any(x in term.lower() for x in ["contents", "chapter", "appendix", "index", "revision", "table of"]):
            i += 1
            continue
        if term.lower().startswith(("acronym for", "definition", "terms of")):
            i += 1
            continue

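        # Collect definition lines until the next probable term: a short line
        # (six words or fewer) that starts with a letter and has no trailing period.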
        defn_lines = []
        j = i + 1
        while j < len(lines):
            nxt = lines[j].strip()
            if re.match(r"^[A-Za-z]", nxt) and len(nxt.split()) <= 6 and not nxt.endswith("."):
                if not nxt.lower().startswith(("see also", "for example", "for instance")):
                    break
            defn_lines.append(nxt)
            j += 1

        definition = " ".join(defn_lines)
        definition = re.sub(r"(\w)-\s+(\w)", r"\1\2", definition)
        definition = re.sub(r"\s{2,}", " ", definition).strip()

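        # Keep only substantive definitions: at least five words and a sentence-ending period.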
        if len(definition.split()) < 5 or "." not in definition:
            i += 1
            continue

        norm = normalize_term(term)
        glossary[norm] = {"term": term, "definition": definition}
        i = j

    return glossary


# --- Main Rebuild Function ---
def rebuild_and_upload():
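    """Rebuild glossary.json from the PDFs and Excel files in DOCS_REPO and upload it to DATASET_REPO."""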
    start = time.time()
    print("πŸ“˜ Starting glossary rebuild...")
    try:
        all_files = list_repo_files(repo_id=DOCS_REPO, repo_type="dataset", token=TOKEN)
        pdfs = [f for f in all_files if f.lower().endswith(".pdf")]
        excels = [f for f in all_files if f.lower().endswith((".xls", ".xlsx"))]
    except Exception as e:
        raise RuntimeError(f"Cannot list repo files: {e}")

    all_defs = {}
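    # Illustrative entry shapes (not real data):
    #   PDF-derived:   "crf": {"term": "CRF", "definition": "...", "sources": ["file.pdf"]}
    #   Excel-derived: "crf__file.xlsx": {"term": "CRF", "definition": "<b>Definition:</b> ...",
    #                                     "sources": ["file.xlsx"], "type": "Excel", "sheet": "Sheet1"}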

    # --- 1️⃣ Process PDFs ---
    skip_patterns = ["topic_", "template", "protocol", "schedule", "painac", "sas", "glossary_printable"]
    for pdf in pdfs:
        if any(sp in pdf.lower() for sp in skip_patterns):
            print(f"⏩ Skipping non-glossary or template file: {pdf}")
            continue
        try:
            print(f"πŸ” Processing {pdf}...")
            path = hf_hub_download(repo_id=DOCS_REPO, filename=pdf, repo_type="dataset", token=TOKEN)
            text = extract_text_from_pdf(path)
            defs = extract_definitions_from_text(text)
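            # Merge into the global dict; a term seen in several PDFs keeps every source file.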
            for k, v in defs.items():
                v.setdefault("sources", []).append(pdf)
                if k in all_defs:
                    all_defs[k]["sources"] = list(set(all_defs[k].get("sources", []) + v["sources"]))
                else:
                    all_defs[k] = v
        except Exception as e:
            print(f"⚠️ Failed {pdf}: {e}")

    # --- 2️⃣ Process Excel Glossaries (MRCT etc.) ---
    for excel in excels:
        try:
            print(f"πŸ“— Checking Excel file in dataset: {excel}")
            excel_path = hf_hub_download(
                repo_id=DOCS_REPO,
                filename=excel,
                repo_type="dataset",
                token=TOKEN
            )
            print(f"βœ… Downloaded Excel file to {excel_path}")
            xls = pd.read_excel(excel_path, sheet_name=None)

            total_rows = 0
            excel_entries = []

            for sheet_name, df in xls.items():
                df = df.fillna("").dropna(how="all")
                if df.shape[0] == 0:
                    continue
                df.columns = [str(c).strip() for c in df.columns]

                # 🧠 Detect the correct 'Glossary Term' column
                term_col = None
                for c in df.columns:
                    if "glossary term" in c.lower():
                        term_col = c
                        break
                if not term_col:
                    for c in df.columns:
                        if "cdisc" in c.lower() or c.lower() == "term":
                            term_col = c
                            break
                if not term_col:
                    print(f"⚠️ No 'Glossary Term' column found in sheet {sheet_name} of {excel_path}")
                    continue

                # Definition-related columns are the same for every row; find them once.
                def_cols = [
                    c for c in df.columns
                    if any(k in c.lower() for k in [
                        "definition", "context", "info", "related", "resource", "use in context"
                    ])
                ]

                # Concatenate the relevant columns with labels for clarity.
                for _, row in df.iterrows():
                    term = str(row.get(term_col, "")).strip()
                    if not term:
                        continue

                    def_parts = []
                    for c in def_cols:
                        val = str(row.get(c, "")).strip()
                        if val:
                            def_parts.append(f"<b>{c}:</b> {val}")

                    full_definition = "<br>".join(def_parts).strip()
                    if not full_definition:
                        continue

                    entry = {
                        "term": term,
                        "definition": full_definition,
                        "source": os.path.basename(excel_path),
                        "sheet": sheet_name,
                        "type": "Excel",
                    }
                    excel_entries.append(entry)
                    total_rows += 1

            print(f"βœ… Added {total_rows} Excel rows from {excel_path}")

            # Merge into main glossary dictionary
            for e in excel_entries:
                norm = normalize_term(e["term"])
                payload = {
                    "term": e["term"],
                    "definition": e["definition"],
                    "sources": [e["source"]],
                    "type": e.get("type", "Excel"),
                    "sheet": e.get("sheet"),
                }
                # Each term+source pair is stored under its own key so that
                # differing definitions of the same term are preserved.
                unique_key = f"{norm}__{os.path.basename(payload['sources'][0])}" if payload.get("sources") else norm

                if unique_key not in all_defs:
                    all_defs[unique_key] = payload
                else:
                    # Avoid duplicate merges: just union in any new sources.
                    existing = all_defs[unique_key]
                    existing_sources = set(existing.get("sources", []))
                    new_sources = set(payload.get("sources", []))
                    existing["sources"] = list(existing_sources.union(new_sources))
                    
        except Exception as e:
            print(f"⚠️ Failed to process Excel {excel}: {e}")

    # --- 3️⃣ Save combined glossary ---
    os.makedirs(os.path.dirname(LOCAL_GLOSSARY), exist_ok=True)
    with open(LOCAL_GLOSSARY, "w", encoding="utf-8") as f:
        json.dump(all_defs, f, indent=2, ensure_ascii=False)

    print(f"βœ… Saved {len(all_defs)} entries β†’ {LOCAL_GLOSSARY}")

    # --- 4️⃣ Upload to Hugging Face ---
    if TOKEN:
        try:
            upload_file(
                path_or_fileobj=LOCAL_GLOSSARY,
                path_in_repo=REMOTE_GLOSSARY,
                repo_id=DATASET_REPO,
                repo_type="dataset",
                token=TOKEN,
                commit_message="Glossary updated with PDF + Excel (column-labeled definitions)",
            )
            print(f"πŸš€ Uploaded to {DATASET_REPO}/{REMOTE_GLOSSARY}")
        except Exception as e:
            print(f"⚠️ Upload error: {e}")

    print(f"✨ Done in {time.time() - start:.1f}s.")


if __name__ == "__main__":
    rebuild_and_upload()
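# Usage sketch (assumes a Hugging Face token is available via `huggingface-cli login`
# or the HF_TOKEN environment variable, and write access to the repos above):
#   python glossary_builder.py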