# core/glossary.py
"""Glossary loading and term normalization for clinical-trials chat.

Downloads ``glossary.json`` from a Hugging Face dataset repo, cleans and
de-duplicates its entries, and exposes fuzzy-tolerant term normalization.
"""

import json
import os
import re
from difflib import get_close_matches

# Lazily populated by _load_glossary(); maps normalized term -> entry dict
# of shape {"term": str, "definition": str, "sources": list}.
GLOSSARY = None
# 🧠 Cache of glossary keys, used by _normalize_term() for fuzzy matching.
GLOSSARY_TERMS_CACHE = []

DATASET_REPO = "essprasad/CT-Chat-Index"
GLOSSARY_FILENAME = "persistent/glossary.json"

# Long-form clinical-research phrases mapped to their standard acronyms.
# Order matters: "serious adverse event" must be rewritten before
# "adverse event", which is a substring of it. The punctuation regex in
# _normalize_term() has already collapsed "e-crf"/"e/crf"/"e_crf" into
# "e crf", so a single entry covers all those variants.
_SYNONYMS = [
    ("e crf", "ecrf"),
    ("electronic case report form", "ecrf"),
    ("case report form", "crf"),
    ("informed consent form", "icf"),
    ("good clinical practice", "gcp"),
    ("serious adverse event", "sae"),
    ("adverse event", "ae"),
    ("21 cfr part 11", "21cfrpart11"),
    ("clinical study report", "csr"),
]


def _normalize_term(term: str) -> str:
    """Normalize a glossary term for matching, with a fuzzy fallback.

    Lower-cases, collapses punctuation and whitespace, rewrites common
    clinical-research phrases to their acronyms, and — once the glossary
    has been loaded — falls back to difflib fuzzy matching so plurals and
    small typos still resolve to a known glossary key.

    Args:
        term: Raw term text (may be empty or None).

    Returns:
        The normalized term, or the closest cached glossary key when a
        sufficiently similar one (similarity >= 0.85) exists.
    """
    if not term:
        return ""
    term = term.lower().strip()
    term = re.sub(r'[\-_/\\.,;:]+', ' ', term)  # punctuation -> single space
    term = re.sub(r'\s+', ' ', term)
    for phrase, acronym in _SYNONYMS:
        term = term.replace(phrase, acronym)
    term = term.strip()

    # 🧩 Fuzzy fallback (plural/singular forms, typos) against the keys
    # cached by _load_glossary(); empty before the first successful load.
    if GLOSSARY_TERMS_CACHE and term not in GLOSSARY_TERMS_CACHE:
        close = get_close_matches(term, GLOSSARY_TERMS_CACHE, n=1, cutoff=0.85)
        if close:
            # Return the closest known key for better recall.
            return close[0]
    return term


def _load_glossary():
    """Load glossary.json from the Hugging Face Hub (cached after first call).

    Returns:
        Dict mapping normalized term -> {"term", "definition", "sources"}.
        Returns {} on any download/parse failure (best-effort; the error
        is printed and the cache is reset so a later call can retry).
    """
    global GLOSSARY, GLOSSARY_TERMS_CACHE
    if GLOSSARY is not None:
        return GLOSSARY
    try:
        # Imported lazily so this module stays importable (and
        # _normalize_term usable) when huggingface_hub is not installed.
        from huggingface_hub import hf_hub_download

        path = hf_hub_download(
            repo_id=DATASET_REPO,
            filename=GLOSSARY_FILENAME,
            repo_type="dataset",
        )
        with open(path, "r", encoding="utf-8") as f:
            raw = json.load(f)

        GLOSSARY = {}
        for k, vlist in raw.items():
            # Skip non-string keys, overly long phrases, and keys that
            # look like years / section numbers (4 consecutive digits).
            if not isinstance(k, str) or len(k.split()) > 12 or re.search(r'\d{4}', k):
                continue
            candidate_key = k
            if isinstance(vlist, dict):
                candidate_key = vlist.get("term") or vlist.get("name") or vlist.get("title") or k
            norm = _normalize_term(candidate_key)
            if not norm:
                continue
            if isinstance(vlist, dict):
                dfn = vlist.get("definition") or vlist.get("text") or ""
                sources = vlist.get("sources", [])
            elif isinstance(vlist, str):
                dfn = vlist
                sources = []
            else:
                dfn, sources = "", []
            # Discard missing, non-string, or trivially short definitions.
            # (A non-string dfn would otherwise crash .strip() and abort
            # the whole load via the except below.)
            if not isinstance(dfn, str) or len(dfn.strip()) < 5:
                continue
            # Guard once for both branches: a malformed "sources" (e.g. a
            # bare string) would otherwise be iterated char-by-char when
            # merged via set() below.
            sources = sources if isinstance(sources, list) else []
            if norm not in GLOSSARY:
                GLOSSARY[norm] = {
                    "term": candidate_key.strip(),
                    "definition": dfn.strip(),
                    "sources": sources,
                }
            else:
                # Duplicate normalized key: keep the first definition,
                # merge (de-duplicate) the sources.
                existing = GLOSSARY[norm]
                merged = set(existing.get("sources", [])).union(sources)
                existing["sources"] = list(merged)

        # 🧠 Store all glossary keys for the fuzzy fallback in _normalize_term().
        GLOSSARY_TERMS_CACHE = list(GLOSSARY.keys())
        print(f"✅ Glossary downloaded and cached ({len(GLOSSARY)} entries).")
        return GLOSSARY
    except Exception as e:
        # Reset so a later call retries instead of serving a partial dict.
        GLOSSARY = None
        print(f"⚠️ Failed to load glossary from Hugging Face: {e}")
        return {}


__all__ = ["_load_glossary", "_normalize_term"]