"""
vector_sync.py
Responsibilities:
- rebuild_faiss_from_glossary(glossary_path) -> builds a new faiss.Index + meta list
- _upload_to_dataset(index_path, meta_path, repo_id) -> upload via huggingface_hub
- safe helpers for creating normalized metadata entries
"""
import os
import re
import json
import shutil
from typing import Tuple, List, Dict, Any
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from huggingface_hub import upload_file
# default embedder (same model used elsewhere)
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# directories
PERSISTENT_DIR = "/home/user/app/persistent"
TMP_DIR = "/home/user/app/tmp"
os.makedirs(PERSISTENT_DIR, exist_ok=True)
os.makedirs(TMP_DIR, exist_ok=True)
_MODEL = None  # lazily-initialized shared embedding model
def _ensure_model() -> SentenceTransformer:
    """Return the shared SentenceTransformer model, loading it on first use."""
    global _MODEL
    if _MODEL is None:
        _MODEL = SentenceTransformer(EMBED_MODEL_NAME)
    return _MODEL
def _normalize_meta_row(row: Dict[str, Any]) -> Dict[str, Any]:
"""Ensure consistent meta record fields."""
out = {
"term": row.get("term") or row.get("Term") or row.get("name") or "",
"text": row.get("text") or row.get("definition") or row.get("content") or "",
# keep both 'file' (local/basename) and full 'sources' list
"file": row.get("file") or row.get("source") or "",
"type": row.get("type") or "",
"sources": row.get("sources") if isinstance(row.get("sources"), list) else [row.get("source")] if row.get("source") else []
}
return out
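# Illustrative example (not executed): a raw glossary row with mixed key casing
# is mapped onto the canonical schema used for index metadata:
#   _normalize_meta_row({"Term": "FAISS", "definition": "Vector similarity search library",
#                        "source": "glossary.xlsx"})
#   -> {"term": "FAISS", "text": "Vector similarity search library",
#       "file": "glossary.xlsx", "type": "", "sources": ["glossary.xlsx"]}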
# ==========================================================
# 🧠 Main Function: Rebuild FAISS from glossary.json
# ==========================================================
def rebuild_faiss_from_glossary(glossary_path: str) -> Tuple[faiss.Index, List[Dict[str, Any]]]:
    """
    Build a FAISS index plus an aligned metadata list from a glossary JSON file.
    Handles mixed entries (PDF, Excel, Web, Other) and skips malformed or
    oversized rows instead of failing.
    """
print(f"🧩 Building FAISS from glossary: {glossary_path}")
if not os.path.exists(glossary_path):
raise FileNotFoundError(f"Glossary not found: {glossary_path}")
# --- Load JSON safely
with open(glossary_path, "r", encoding="utf-8") as f:
try:
glossary_data = json.load(f)
        except Exception as e:
            raise RuntimeError(f"❌ Failed to load glossary JSON: {e}") from e
# Normalize structure
if isinstance(glossary_data, dict):
glossary_items = list(glossary_data.values())
elif isinstance(glossary_data, list):
glossary_items = glossary_data
else:
raise ValueError("Invalid glossary format — must be list or dict.")
    model = _ensure_model()
entries, metas, bad_entries, long_entries = [], [], [], []
# helper: normalized type inference
def infer_type_from_source(src: str, declared_type: str = "") -> str:
src_l = (src or "").lower()
declared = (declared_type or "").lower()
if src_l.endswith(".pdf") or "pdf" in declared:
return "pdf"
if src_l.endswith((".xlsx", ".xls")) or "excel" in declared or "xls" in src_l:
return "excel"
if src_l.startswith("http") or declared == "web" or "http" in src_l:
return "web"
return "other"
# --- Process glossary items
for i, item in enumerate(glossary_items):
try:
if not isinstance(item, dict):
bad_entries.append(item)
continue
term = str(item.get("term") or item.get("Term") or item.get("name") or "").strip()
definition = str(item.get("definition") or item.get("text") or item.get("content") or "").strip()
# Normalize sources (keep list)
src_field = item.get("sources") or item.get("source") or item.get("file") or ""
if isinstance(src_field, list):
src_list = [str(s).strip() for s in src_field if s]
src = ", ".join(src_list)
else:
src_list = [str(src_field).strip()] if src_field else []
src = str(src_field).strip()
declared_type = str(item.get("type") or "").strip().lower()
entry_type = infer_type_from_source(src, declared_type)
# Clean up noisy HTML tags and whitespace
definition_clean = re.sub(r"<[^>]*>", "", definition)
definition_clean = re.sub(r"\s+", " ", definition_clean).strip()
# Skip if missing essentials
if not term or not definition_clean:
bad_entries.append(item)
continue
# Skip extremely long definitions (likely raw HTML or large web content)
if len(definition_clean) > 3000:
long_entries.append({
"term": term,
"len": len(definition_clean),
"source": src
})
continue
text = f"Definition of {term}: {definition_clean}"
entries.append(text)
metas.append({
"term": term,
"definition": definition_clean,
# preserve the original source list and file name
"sources": src_list if src_list else [src] if src else [],
"source": src,
"type": entry_type,
"file": os.path.basename(glossary_path)
})
except Exception as e:
bad_entries.append({
"index": i,
"error": str(e),
"raw": str(item)[:300]
})
continue
# --- Diagnostics
pdf_count = sum(1 for m in metas if m["type"].lower() == "pdf")
excel_count = sum(1 for m in metas if m["type"].lower() == "excel")
web_count = sum(1 for m in metas if m["type"].lower() == "web")
other_count = len(metas) - (pdf_count + excel_count + web_count)
print(f"🧠 Encoding {len(entries)} entries (PDF={pdf_count}, Excel={excel_count}, Web={web_count}, Other={other_count})…")
if bad_entries:
print(f"⚠️ {len(bad_entries)} malformed entries skipped.")
for b in bad_entries[:3]:
print(" →", json.dumps(b, ensure_ascii=False)[:300])
if long_entries:
print(f"⚠️ {len(long_entries)} very long entries (>3000 chars) skipped.")
for l in long_entries[:3]:
print(f" → Skipped {l['term']} ({l['len']} chars) from {l['source']}")
if not entries:
raise RuntimeError("❌ No valid glossary entries found after cleanup!")
# --- Encoding
embeddings = model.encode(entries, show_progress_bar=True, convert_to_numpy=True).astype("float32")
faiss.normalize_L2(embeddings)
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)
print(f"✅ Glossary vectors built ({len(entries)} total entries).")
# metas is list of dicts aligned with vectors — return exactly as before
return index, metas
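# ----------------------------------------------------------
# Persistence sketch (assumption: this module itself never writes the index to
# disk, so the helper below and its file-name arguments are illustrative only).
# It shows one way to produce the index/metadata pair that _upload_to_dataset
# expects.
# ----------------------------------------------------------
def _save_index_and_meta(index: faiss.Index, metas: List[Dict[str, Any]],
                         index_path: str, meta_path: str) -> None:
    """Write a FAISS index and its aligned metadata list to disk (illustrative helper)."""
    faiss.write_index(index, index_path)  # binary FAISS index file
    with open(meta_path, "w", encoding="utf-8") as f:
        json.dump(metas, f, ensure_ascii=False, indent=2)  # human-readable metadata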
# ==========================================================
# ☁️ Upload Helper
# ==========================================================
def _upload_to_dataset(index_path: str, meta_path: str, repo_id: str) -> None:
"""
Upload FAISS index and metadata JSON to Hugging Face dataset.
"""
try:
print(f"☁️ Uploading {index_path} and {meta_path} to {repo_id}...")
upload_file(
path_or_fileobj=index_path,
path_in_repo=f"persistent/{os.path.basename(index_path)}",
repo_id=repo_id,
repo_type="dataset"
)
upload_file(
path_or_fileobj=meta_path,
path_in_repo=f"persistent/{os.path.basename(meta_path)}",
repo_id=repo_id,
repo_type="dataset"
)
print("✅ Upload complete.")
except Exception as e:
print(f"⚠️ Upload failed: {e}")
raise
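# ==========================================================
# 🧪 Usage sketch
# ==========================================================
# A minimal end-to-end flow, assuming the glossary lives in PERSISTENT_DIR and
# that the dataset repo id is supplied via an HF_DATASET_REPO environment
# variable; both names are placeholders, not part of this module's contract.
if __name__ == "__main__":
    glossary_path = os.path.join(PERSISTENT_DIR, "glossary.json")   # assumed location
    index_path = os.path.join(TMP_DIR, "glossary_index.faiss")      # assumed file name
    meta_path = os.path.join(TMP_DIR, "glossary_meta.json")         # assumed file name
    index, metas = rebuild_faiss_from_glossary(glossary_path)
    _save_index_and_meta(index, metas, index_path, meta_path)       # illustrative helper above
    repo_id = os.environ.get("HF_DATASET_REPO", "")
    if repo_id:
        _upload_to_dataset(index_path, meta_path, repo_id)
    else:
        print("ℹ️ Set HF_DATASET_REPO to upload the rebuilt index.")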