"""
📘 glossary_builder.py
Builds a unified glossary from PDFs, Excel, and Web sources.
- Extracts terms & definitions from PDFs.
- Merges Excel glossary (with labeled formatting).
- Optionally fetches glossary or definitions from known web sources.
- Adds source typing (pdf/excel/web/other).
- Saves combined glossary.json locally and uploads to Hugging Face.
"""

import os
import re
import json
import time
import fitz
import requests
import pandas as pd
from bs4 import BeautifulSoup
from huggingface_hub import upload_file, HfFolder, list_repo_files, hf_hub_download

# --- Configuration ---
# Hugging Face dataset repo that receives the generated glossary file.
DATASET_REPO = "essprasad/CT-Chat-Index"
# Hugging Face dataset repo whose PDF and Excel files are scanned for terms.
DOCS_REPO = "essprasad/CT-Chat-Docs"
# Local path the combined glossary JSON is written to before it is uploaded.
LOCAL_GLOSSARY = "/home/user/app/persistent/glossary.json"
# Destination path of the glossary file inside DATASET_REPO.
REMOTE_GLOSSARY = "persistent/glossary.json"
# Access token: whatever HfFolder.get_token() returns, falling back to the
# HF_TOKEN environment variable. When both are empty the upload step in
# rebuild_and_upload() is skipped.
TOKEN = HfFolder.get_token() or os.getenv("HF_TOKEN")

# Known web glossary sources (can expand).
# Each URL is passed to extract_web_glossary() during a rebuild.
WEB_SOURCES = [
    "https://mrctcenter.org/glossaryterm/clinical-research/",
    "https://www.fda.gov/patients/drug-development-process/step-3-clinical-research",
    "https://www.cdisc.org/",
    "https://www.ich.org/",
    "https://www.ema.europa.eu/",
    "https://www.who.int/",
    "https://clinicaltrials.gov/",
]


# --- Helpers ---
def normalize_term(term: str) -> str:
    """Return a canonical lookup key for a glossary term.

    The term is lower-cased, separator punctuation (hyphens, underscores,
    slashes, backslashes, dots, commas, semicolons and colons) is replaced
    by spaces, whitespace runs are collapsed to a single space, and the
    result is trimmed. A few well-known long forms are then mapped to their
    abbreviation so that both spellings share one key.

    Args:
        term: Raw term text. ``None`` or an empty string is accepted.

    Returns:
        The normalized key, or ``""`` when ``term`` is falsy.
    """
    if not term:
        return ""
    s = re.sub(r"[\-_/\\.,;:]+", " ", term.lower())
    # Trim only after punctuation has become spaces: leading/trailing
    # punctuation ("Adverse Event.") would otherwise leave a stray space that
    # both pollutes the key and defeats the synonym lookup below.
    s = re.sub(r"\s+", " ", s).strip()
    synonyms = {
        "electronic case report form": "ecrf",
        "case report form": "crf",
        "informed consent form": "icf",
        "good clinical practice": "gcp",
        "serious adverse event": "sae",
        "adverse event": "ae",
        "21 cfr part 11": "21cfrpart11",
        "clinical study report": "csr",
    }
    return synonyms.get(s, s)


def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, joined by newlines.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The concatenated page text with surrounding whitespace stripped, or
        ``""`` if the file cannot be opened or read (the error is printed,
        not raised).
    """
    try:
        # The context manager closes the document even when a page fails to
        # extract, which a trailing doc.close() call would not.
        with fitz.open(pdf_path) as doc:
            text = "\n".join(page.get_text("text") for page in doc)
        return text.strip()
    except Exception as e:
        # Best-effort by design: one unreadable PDF must not abort the build.
        print(f"⚠️ Failed to read {pdf_path}: {e}")
        return ""


def extract_definitions_from_text(text):
    """Extract glossary-like term-definition pairs from raw PDF text.

    The text is split into non-blank, stripped lines. Each line is tried as
    a term heading; the lines after it are gathered as its definition until
    something that looks like the next heading appears. A pair is kept only
    when the definition has at least five words and contains a period.

    Args:
        text: Raw text as extracted from a PDF.

    Returns:
        A dict keyed by ``normalize_term(term)`` whose values are
        ``{"term": ..., "definition": ...}`` dicts.
    """
    noise_words = ["contents", "chapter", "appendix", "index", "revision", "table of"]
    boilerplate_prefixes = ("acronym for", "definition", "terms of")
    continuation_prefixes = ("see also", "for example", "for instance")

    cleaned = re.sub(r"\r", "", text)
    rows = [piece.strip() for piece in cleaned.split("\n")]
    rows = [row for row in rows if row]
    total = len(rows)

    found = {}
    pos = 0
    while pos < total:
        heading = rows[pos]
        lowered = heading.lower()

        # Single characters, page numbers, roman numerals, table-of-contents
        # style lines and boilerplate lead-ins are never treated as terms.
        is_noise = (
            len(heading) == 1
            or heading.isdigit()
            or re.fullmatch(r"[ivxlcdm]+", lowered) is not None
            or any(word in lowered for word in noise_words)
            or lowered.startswith(boilerplate_prefixes)
        )
        if is_noise:
            pos += 1
            continue

        # Gather definition lines until a short, letter-initial line without a
        # trailing period shows up (taken to be the next heading), unless that
        # line is a "see also" / "for example" style continuation.
        body = []
        end = pos + 1
        while end < total:
            candidate = rows[end]
            looks_like_heading = (
                re.match(r"^[A-Za-z]", candidate) is not None
                and len(candidate.split()) <= 6
                and not candidate.endswith(".")
                and not candidate.lower().startswith(continuation_prefixes)
            )
            if looks_like_heading:
                break
            body.append(candidate)
            end += 1

        # Re-join words hyphenated across line breaks, then squeeze spaces.
        merged = re.sub(r"(\w)-\s+(\w)", r"\1\2", " ".join(body))
        merged = re.sub(r"\s{2,}", " ", merged).strip()

        if "." in merged and len(merged.split()) >= 5:
            found[normalize_term(heading)] = {"term": heading, "definition": merged}
            pos = end  # resume at the line that ended this definition
        else:
            pos += 1  # not a usable pair; retry from the following line
    return found


def detect_source_type(src: str) -> str:
    """Classify a source reference as ``pdf``, ``excel``, ``web`` or ``other``.

    File extensions are checked first (case-insensitively), so a URL ending
    in ``.pdf`` is reported as ``pdf`` rather than ``web``.

    Args:
        src: File name, path or URL. A falsy value yields ``"other"``.

    Returns:
        One of ``"pdf"``, ``"excel"``, ``"web"`` or ``"other"``.
    """
    if not src:
        return "other"

    lowered = src.lower()
    # Checked in order; the first matching suffix group wins.
    suffix_kinds = (
        ((".pdf",), "pdf"),
        ((".xlsx", ".xls"), "excel"),
    )
    for suffixes, kind in suffix_kinds:
        if lowered.endswith(suffixes):
            return kind
    return "web" if lowered.startswith("http") else "other"


def extract_web_glossary(url):
    """Scrape simple web glossary or definition pages.

    The page is downloaded, flattened to text, and searched with a heuristic
    pattern: a capitalised phrase of up to about 30 characters, followed by
    ``:`` or ``-``, followed by 10-200 characters of definition. Only the
    first 50 matches are considered, and definitions of three words or fewer
    are dropped.

    Args:
        url: Address of the page to fetch.

    Returns:
        A list of entry dicts with the keys ``term``, ``definition``,
        ``sources``, ``file`` and ``type`` (always ``"web"``). The list is
        empty when the response status is not 200 or when fetching/parsing
        fails; failures are printed, never raised.
    """
    results = []
    try:
        print(f"🌐 Fetching {url}...")
        resp = requests.get(url, timeout=10)
        if resp.status_code != 200:
            print(f"⚠️ Skipped {url}: HTTP {resp.status_code}")
            return []
        soup = BeautifulSoup(resp.text, "html.parser")
        text = soup.get_text(separator="\n")
        text = re.sub(r"\s{2,}", " ", text).strip()

        # Heuristic: pick possible term-definition snippets
        matches = re.findall(r"([A-Z][A-Za-z0-9\s]{2,30})[:\-]\s*(.{10,200})", text)
        for term, definition in matches[:50]:
            # Single newlines/tabs survive the collapse above and the term
            # pattern allows \s, so a captured term can span several lines.
            # Flatten it to one space-separated line (this also trims it).
            term = " ".join(term.split())
            definition = re.sub(r"\s{2,}", " ", definition).strip()
            if len(definition.split()) > 3:
                results.append({
                    "term": term,
                    "definition": definition,
                    "sources": [url],
                    "file": url,
                    "type": "web",
                })
    except Exception as e:
        # Best-effort by design: an unreachable site must not abort the build.
        print(f"⚠️ Web fetch failed for {url}: {e}")
    return results


# --- Main Rebuild Function ---
def _collect_pdf_entries(pdfs):
    """Download each glossary-like PDF and return its entries keyed by ``<normalized term>__<file name>``."""
    # File-name fragments that mark a PDF as a non-glossary or template file.
    skip_patterns = ["topic_", "template", "protocol", "schedule", "painac", "sas", "glossary_printable"]
    entries = {}
    for pdf in pdfs:
        pdf_lower = pdf.lower()
        if any(sp in pdf_lower for sp in skip_patterns):
            print(f"⏩ Skipping non-glossary or template file: {pdf}")
            continue
        try:
            print(f"🔍 Processing {pdf}...")
            path = hf_hub_download(repo_id=DOCS_REPO, filename=pdf, repo_type="dataset", token=TOKEN)
            text = extract_text_from_pdf(path)
            defs = extract_definitions_from_text(text)
            for k, v in defs.items():
                v.setdefault("sources", []).append(pdf)
                v["type"] = "pdf"
                v["file"] = os.path.basename(pdf)
                # The file name is part of the key so the same term coming
                # from different documents is kept as separate entries.
                entries[f"{k}__{v['file']}"] = v
        except Exception as e:
            # Best-effort: one bad PDF must not abort the whole rebuild.
            print(f"⚠️ Failed {pdf}: {e}")
    return entries


def _collect_excel_entries(excels):
    """Download each Excel workbook and return one entry per term row, keyed by ``<normalized term>__<file name>``."""
    # A column contributes to the definition when its header contains any of these.
    def_keywords = ["definition", "context", "info", "related", "resource"]
    entries = {}
    for excel in excels:
        try:
            print(f"📗 Checking Excel file in dataset: {excel}")
            excel_path = hf_hub_download(repo_id=DOCS_REPO, filename=excel, repo_type="dataset", token=TOKEN)
            print(f"✅ Downloaded Excel file to {excel_path}")
            file_name = os.path.basename(excel_path)
            # sheet_name=None loads every sheet as {sheet name: DataFrame}.
            xls = pd.read_excel(excel_path, sheet_name=None)
            for sheet_name, df in xls.items():
                # Drop fully blank rows BEFORE filling NaN: once NaN has been
                # replaced by "", dropna(how="all") can no longer match any row.
                df = df.dropna(how="all").fillna("")
                if df.empty:
                    continue
                df.columns = [str(c).strip() for c in df.columns]
                term_col = next((c for c in df.columns if "glossary term" in c.lower() or "term" == c.lower()), None)
                if not term_col:
                    continue
                # Depends only on the header, so compute it once per sheet.
                def_cols = [c for c in df.columns if any(k in c.lower() for k in def_keywords)]
                for _, row in df.iterrows():
                    term = str(row.get(term_col, "")).strip()
                    if not term:
                        continue
                    # Each non-empty definition column becomes a labelled HTML fragment.
                    def_parts = [f"<b>{c}:</b> {row[c]}" for c in def_cols if str(row[c]).strip()]
                    if not def_parts:
                        continue
                    entry = {
                        "term": term,
                        "definition": "<br>".join(def_parts),
                        "sources": [file_name],
                        "file": file_name,
                        "sheet": sheet_name,
                        "type": "excel",
                    }
                    entries[f"{normalize_term(term)}__{entry['file']}"] = entry
            print(f"✅ Processed Excel file {excel}")
        except Exception as e:
            # Best-effort: one bad workbook must not abort the whole rebuild.
            print(f"⚠️ Failed Excel {excel}: {e}")
    return entries


def rebuild_and_upload():
    """Rebuild the combined glossary and publish it.

    Steps:
      1. List the files of ``DOCS_REPO`` and pick out the PDFs and Excel files.
      2. Extract term/definition entries from the PDFs, the Excel sheets and
         every URL in ``WEB_SOURCES``.
      3. Make sure every entry has a ``sources`` list and a ``type``.
      4. Write all entries as a JSON list to ``LOCAL_GLOSSARY``.
      5. If a token is available, upload that file to ``DATASET_REPO`` at
         ``REMOTE_GLOSSARY``; upload errors are printed, not raised.

    Raises:
        RuntimeError: If the files of ``DOCS_REPO`` cannot be listed. The
            underlying error is attached as the exception's cause.
    """
    start = time.time()
    print("📘 Starting glossary rebuild...")

    try:
        all_files = list_repo_files(repo_id=DOCS_REPO, repo_type="dataset", token=TOKEN)
        pdfs = [f for f in all_files if f.lower().endswith(".pdf")]
        excels = [f for f in all_files if f.lower().endswith((".xls", ".xlsx"))]
    except Exception as e:
        raise RuntimeError(f"Cannot list repo files: {e}") from e

    all_defs = {}

    # --- 1️⃣ Process PDFs ---
    all_defs.update(_collect_pdf_entries(pdfs))

    # --- 2️⃣ Process Excel Glossaries ---
    all_defs.update(_collect_excel_entries(excels))

    # --- 3️⃣ Process Web Glossaries ---
    web_entries = []
    for url in WEB_SOURCES:
        for e in extract_web_glossary(url):
            all_defs[f"{normalize_term(e['term'])}__{e['file']}"] = e
            web_entries.append(e)
    print(f"✅ Added {len(web_entries)} web glossary entries.")

    # --- 4️⃣ Cleanup & Save ---
    for v in all_defs.values():
        v["sources"] = v.get("sources", [])
        if not isinstance(v["sources"], list):
            v["sources"] = [v["sources"]]
        if not v.get("type"):
            # Fall back to guessing the type from the first source reference.
            v["type"] = detect_source_type(v["sources"][0] if v["sources"] else "")

    os.makedirs(os.path.dirname(LOCAL_GLOSSARY), exist_ok=True)
    with open(LOCAL_GLOSSARY, "w", encoding="utf-8") as f:
        # Keys were only needed for de-duplication; the file holds a plain list.
        json.dump(list(all_defs.values()), f, indent=2, ensure_ascii=False)

    print(f"✅ Saved {len(all_defs)} entries → {LOCAL_GLOSSARY}")

    # --- 5️⃣ Upload to Hugging Face ---
    if TOKEN:
        try:
            upload_file(
                path_or_fileobj=LOCAL_GLOSSARY,
                path_in_repo=REMOTE_GLOSSARY,
                repo_id=DATASET_REPO,
                repo_type="dataset",
                token=TOKEN,
                commit_message="Glossary updated (PDF + Excel + Web sources)",
            )
            print(f"🚀 Uploaded to {DATASET_REPO}/{REMOTE_GLOSSARY}")
        except Exception as e:
            # The local file is already saved, so a failed upload is reported
            # but does not fail the rebuild.
            print(f"⚠️ Upload error: {e}")
    else:
        # Say so explicitly instead of skipping the upload without a trace.
        print("⚠️ No Hugging Face token available; skipping upload.")

    print(f"✨ Done in {time.time() - start:.1f}s.")


# Script entry point: perform a full glossary rebuild and upload when this
# file is executed directly; importing the module does not trigger a rebuild.
if __name__ == "__main__":
    rebuild_and_upload()