# NOTE: removed page-scrape artifacts that preceded this file
# (Hugging Face Spaces UI chrome, "Sleeping" status, file-size line,
# commit hash, and the line-number gutter) — they are not Python source.
"""
core/vector_search.py
-----------------------------------------------------
Performs FAISS semantic search for hybrid retrieval.
Includes:
- SentenceTransformer embedding for query
- FAISS similarity search
- Metadata + citation extraction
- Robust fallback if index missing
"""
import os
import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
# Paths (shared with vector_store/vector_sync)
FAISS_INDEX = "persistent/faiss.index"
FAISS_META = "persistent/faiss.index.meta.json"
_model = None
_index = None
_meta = []
# ----------------------------
# πΉ Loaders
# ----------------------------
def _load_model():
    """Lazy-load and cache the sentence-embedding model.

    The model is loaded on first call and kept in the module-level
    ``_model`` global so repeated queries reuse one instance.

    Returns:
        The cached ``SentenceTransformer`` instance.
    """
    global _model
    if _model is None:
        print("📥 Loading embedding model for retrieval...")
        # NOTE(review): assumed to be the same model used when the index
        # was built (vector_store/vector_sync) so query and passage
        # embeddings share a vector space — confirm there.
        _model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
        # Original literal was mojibake-garbled and split across two
        # source lines (a syntax error); restored here.
        print("✅ Model loaded.")
    return _model
def _load_faiss():
    """Load the FAISS index and its metadata once, caching both.

    Tries the absolute in-container path first, then falls back to the
    module-level ``FAISS_INDEX`` / ``FAISS_META`` relative paths (which
    were declared as the shared paths but previously never consulted).

    Returns:
        tuple: ``(index, meta)`` — the FAISS index (or ``None`` if no
        index file was found) and the list of metadata dicts (empty on
        failure).
    """
    global _index, _meta
    if _index is not None:
        # Already loaded — reuse the cached copies.
        return _index, _meta

    # Candidate (index, meta) path pairs, highest priority first.
    candidates = [
        ("/home/user/app/persistent/faiss.index",
         "/home/user/app/persistent/faiss.index.meta.json"),
        (FAISS_INDEX, FAISS_META),  # shared module-level relative paths
    ]
    for index_path, meta_path in candidates:
        if os.path.exists(index_path) and os.path.exists(meta_path):
            print(f"📂 [vector_search] Using FAISS index at {index_path}.")
            _index = faiss.read_index(index_path)
            with open(meta_path, "r", encoding="utf-8") as f:
                _meta = json.load(f)
            print(f"✅ Loaded FAISS index ({len(_meta)} entries).")
            return _index, _meta

    # No index anywhere: callers (query_faiss) treat (None, []) as
    # "nothing to search". The old message claimed a remote fallback
    # existed, but none was ever used — corrected here.
    print("⚠️ [vector_search] FAISS index missing; returning empty fallback.")
    return _index, _meta
# ----------------------------
# πΉ Core Query Function
# ----------------------------
def query_faiss(query: str, top_k: int = 5):
    """
    Run a FAISS semantic-similarity search for *query*.

    Returns:
        results: list of matched, cleaned text chunks
        citations: list of HTML citation strings, one per result
    """
    index, meta = _load_faiss()
    if index is None or not meta:
        # No usable index or metadata — nothing to search against.
        return [], []

    embedding = np.array(_load_model().encode([query]), dtype=np.float32)
    _scores, neighbor_ids = index.search(embedding, top_k)

    results, citations = [], []
    for neighbor in neighbor_ids[0]:
        # Guard against out-of-range ids (FAISS reports -1 when it has
        # fewer than top_k hits).
        if not (0 <= neighbor < len(meta)):
            continue
        entry = meta[neighbor]
        results.append(clean_text(entry.get("text", "")))
        src = entry.get("source", "Unknown Source")
        citations.append(f"π <b>Source:</b> {os.path.basename(src)}")
    return results, citations
# ----------------------------
# πΉ Utilities
# ----------------------------
def clean_text(text: str, max_len: int = 800) -> str:
    """
    Normalize whitespace and truncate text for readable display.

    Collapses every run of whitespace (newlines, tabs, repeated spaces)
    into a single space and strips the ends, then truncates to at most
    ``max_len`` characters, cutting at the last word boundary and
    appending an ellipsis.

    Args:
        text: Raw chunk text (may contain newlines / repeated spaces).
        max_len: Maximum length of the returned snippet (before the
            appended ``"..."``).

    Returns:
        The cleaned (and possibly truncated) text.
    """
    # str.split() with no argument splits on ANY whitespace run, so this
    # single pass replaces newlines and collapses repeated spaces. The
    # previous `.replace(" ", " ")` was a no-op (space-for-space), and a
    # single double-space replace would miss triple+ spaces anyway.
    text = " ".join(text.split())
    if len(text) > max_len:
        # Cut at the last space so we never end mid-word, then mark the cut.
        text = text[:max_len].rsplit(" ", 1)[0] + "..."
    return text
def has_index() -> bool:
    """Report whether both the FAISS index file and its metadata exist.

    Checks the shared module-level paths (``FAISS_INDEX``,
    ``FAISS_META``) on disk; does not load anything.
    """
    return all(os.path.exists(p) for p in (FAISS_INDEX, FAISS_META))
# (stray "|" line-gutter residue from the page scrape removed)