"""
vector_sync.py
Responsibilities:
- rebuild_faiss_from_glossary(glossary_path) -> builds a new faiss.Index + meta list
- _upload_to_dataset(index_path, meta_path, repo_id) -> upload via huggingface_hub
- safe helpers for creating normalized metadata entries
"""

import os
import re
import json
from functools import lru_cache
from typing import Tuple, List, Dict, Any

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from huggingface_hub import upload_file

# default embedder (same model used elsewhere)
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# directories
PERSISTENT_DIR = "/home/user/app/persistent"
TMP_DIR = "/home/user/app/tmp"
os.makedirs(PERSISTENT_DIR, exist_ok=True)
os.makedirs(TMP_DIR, exist_ok=True)


@lru_cache(maxsize=1)
def _ensure_model() -> SentenceTransformer:
    """Return a shared sentence-transformer model, loaded once and cached."""
    return SentenceTransformer(EMBED_MODEL_NAME)


def _normalize_meta_row(row: Dict[str, Any]) -> Dict[str, Any]:
    """Ensure consistent meta record fields."""
    # Prefer an explicit 'sources' list; otherwise fall back to a single 'source'.
    sources = row.get("sources")
    if not isinstance(sources, list):
        sources = [row.get("source")] if row.get("source") else []
    return {
        "term": row.get("term") or row.get("Term") or row.get("name") or "",
        "text": row.get("text") or row.get("definition") or row.get("content") or "",
        # keep both 'file' (local/basename) and the full 'sources' list
        "file": row.get("file") or row.get("source") or "",
        "type": row.get("type") or "",
        "sources": sources,
    }


# ==========================================================
# 🧠 Main Function: Rebuild FAISS from glossary.json
# ==========================================================
def rebuild_faiss_from_glossary(glossary_path: str) -> Tuple[faiss.Index, List[Dict[str, Any]]]:
    """
    Build a FAISS index plus an aligned metadata list from a glossary JSON file.
    Handles mixed entries (PDF, Excel, Web, Other); malformed or oversized rows
    are skipped and reported instead of aborting the build.
    Returns (index, metas), where metas[i] describes vector i.
    """
    print(f"🧩 Building FAISS from glossary: {glossary_path}")
    if not os.path.exists(glossary_path):
        raise FileNotFoundError(f"Glossary not found: {glossary_path}")

    # --- Load JSON safely
    with open(glossary_path, "r", encoding="utf-8") as f:
        try:
            glossary_data = json.load(f)
        except Exception as e:
            raise RuntimeError(f"❌ Failed to load glossary JSON: {e}") from e

    # Normalize structure
    if isinstance(glossary_data, dict):
        glossary_items = list(glossary_data.values())
    elif isinstance(glossary_data, list):
        glossary_items = glossary_data
    else:
        raise ValueError("Invalid glossary format — must be list or dict.")

    model = _ensure_model()  # reuse the shared embedder rather than reloading it
    entries, metas, bad_entries, long_entries = [], [], [], []

    # helper: normalized type inference
    def infer_type_from_source(src: str, declared_type: str = "") -> str:
        src_l = (src or "").lower()
        declared = (declared_type or "").lower()
        if src_l.endswith(".pdf") or "pdf" in declared:
            return "pdf"
        if src_l.endswith((".xlsx", ".xls")) or "excel" in declared or "xls" in src_l:
            return "excel"
        if src_l.startswith("http") or declared == "web" or "http" in src_l:
            return "web"
        return "other"

    # --- Process glossary items
    for i, item in enumerate(glossary_items):
        try:
            if not isinstance(item, dict):
                bad_entries.append(item)
                continue

            term = str(item.get("term") or item.get("Term") or item.get("name") or "").strip()
            definition = str(item.get("definition") or item.get("text") or item.get("content") or "").strip()

            # Normalize sources (keep list)
            src_field = item.get("sources") or item.get("source") or item.get("file") or ""
            if isinstance(src_field, list):
                src_list = [str(s).strip() for s in src_field if s]
                src = ", ".join(src_list)
            else:
                src_list = [str(src_field).strip()] if src_field else []
                src = str(src_field).strip()

            declared_type = str(item.get("type") or "").strip().lower()
            entry_type = infer_type_from_source(src, declared_type)

            # Clean up noisy HTML tags and whitespace
            definition_clean = re.sub(r"<[^>]*>", "", definition)
            definition_clean = re.sub(r"\s+", " ", definition_clean).strip()

            # Skip if missing essentials
            if not term or not definition_clean:
                bad_entries.append(item)
                continue

            # Skip extremely long definitions (likely raw HTML or large web content)
            if len(definition_clean) > 3000:
                long_entries.append({
                    "term": term,
                    "len": len(definition_clean),
                    "source": src
                })
                continue

            text = f"Definition of {term}: {definition_clean}"

            entries.append(text)
            metas.append({
                "term": term,
                "definition": definition_clean,
                # preserve the original source list and file name
                "sources": src_list,
                "source": src,
                "type": entry_type,
                "file": os.path.basename(glossary_path)
            })

        except Exception as e:
            bad_entries.append({
                "index": i,
                "error": str(e),
                "raw": str(item)[:300]
            })
            continue

    # --- Diagnostics
    pdf_count = sum(1 for m in metas if m["type"].lower() == "pdf")
    excel_count = sum(1 for m in metas if m["type"].lower() == "excel")
    web_count = sum(1 for m in metas if m["type"].lower() == "web")
    other_count = len(metas) - (pdf_count + excel_count + web_count)

    print(f"🧠 Encoding {len(entries)} entries (PDF={pdf_count}, Excel={excel_count}, Web={web_count}, Other={other_count})…")

    if bad_entries:
        print(f"⚠️ {len(bad_entries)} malformed entries skipped.")
        for b in bad_entries[:3]:
            print("  →", json.dumps(b, ensure_ascii=False)[:300])

    if long_entries:
        print(f"⚠️ {len(long_entries)} very long entries (>3000 chars) skipped.")
        for l in long_entries[:3]:
            print(f"  → Skipped {l['term']} ({l['len']} chars) from {l['source']}")

    if not entries:
        raise RuntimeError("❌ No valid glossary entries found after cleanup!")

    # --- Encoding
    embeddings = model.encode(entries, show_progress_bar=True, convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(embeddings)

    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    print(f"✅ Glossary vectors built ({len(entries)} total entries).")

    # metas is a list of dicts aligned one-to-one with the index vectors
    return index, metas
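
# ==========================================================
# 💾 Persistence sketch (illustrative)
# ==========================================================
def save_index_artifacts(index: faiss.Index, metas: List[Dict[str, Any]],
                         index_path: str = os.path.join(PERSISTENT_DIR, "glossary.index"),
                         meta_path: str = os.path.join(PERSISTENT_DIR, "glossary_meta.json")) -> Tuple[str, str]:
    """
    Minimal sketch of one way to persist the (index, metas) pair returned above
    so that _upload_to_dataset() below can ship the resulting files. The helper
    name and default file names are assumptions, not part of a fixed contract.
    """
    faiss.write_index(index, index_path)               # binary FAISS index file
    with open(meta_path, "w", encoding="utf-8") as f:  # aligned metadata as JSON
        json.dump(metas, f, ensure_ascii=False, indent=2)
    return index_path, meta_path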

# ==========================================================
# ☁️ Upload Helper
# ==========================================================
def _upload_to_dataset(index_path: str, meta_path: str, repo_id: str) -> None:
    """
    Upload FAISS index and metadata JSON to Hugging Face dataset.
    """
    try:
        print(f"☁️ Uploading {index_path} and {meta_path} to {repo_id}...")
        upload_file(
            path_or_fileobj=index_path,
            path_in_repo=f"persistent/{os.path.basename(index_path)}",
            repo_id=repo_id,
            repo_type="dataset"
        )
        upload_file(
            path_or_fileobj=meta_path,
            path_in_repo=f"persistent/{os.path.basename(meta_path)}",
            repo_id=repo_id,
            repo_type="dataset"
        )
        print("✅ Upload complete.")
    except Exception as e:
        print(f"⚠️ Upload failed: {e}")
        raise
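
# ==========================================================
# 🚀 Usage sketch
# ==========================================================
if __name__ == "__main__":
    # Minimal end-to-end sketch, assuming a glossary.json under PERSISTENT_DIR
    # and a placeholder repo id; replace both with real values. Uploading needs
    # write access to the dataset repo (see _upload_to_dataset's docstring).
    idx, meta_rows = rebuild_faiss_from_glossary(os.path.join(PERSISTENT_DIR, "glossary.json"))
    index_path, meta_path = save_index_artifacts(idx, meta_rows)
    _upload_to_dataset(index_path, meta_path, repo_id="your-username/your-dataset")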