Spaces:
Sleeping
Sleeping
Update rag_utils.py
Browse files- rag_utils.py +54 -50
rag_utils.py
CHANGED
|
@@ -1,23 +1,15 @@
|
|
| 1 |
-
# rag_utils.py
|
| 2 |
-
import os, json
|
| 3 |
-
import numpy as np
|
| 4 |
-
from typing import List, Tuple, Dict
|
| 5 |
-
|
| 6 |
-
import faiss
|
| 7 |
from sentence_transformers import SentenceTransformer
|
| 8 |
|
| 9 |
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
|
| 10 |
-
CACHE_DIR = "/tmp"
|
| 11 |
-
|
| 12 |
TAROT_JSON = os.path.join(DATA_DIR, "tarot_data_full.json")
|
| 13 |
NUM_JSON = os.path.join(DATA_DIR, "numerology_data_full.json")
|
| 14 |
-
|
| 15 |
TAROT_IDX = os.path.join(CACHE_DIR, "faiss_tarot.index")
|
| 16 |
TAROT_META = os.path.join(CACHE_DIR, "faiss_tarot_meta.json")
|
| 17 |
NUM_IDX = os.path.join(CACHE_DIR, "faiss_num.index")
|
| 18 |
NUM_META = os.path.join(CACHE_DIR, "faiss_num_meta.json")
|
| 19 |
-
|
| 20 |
-
# Lightweight and easy to use
|
| 21 |
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
|
| 22 |
|
| 23 |
_model = None
|
|
@@ -28,42 +20,56 @@ def get_model():
|
|
| 28 |
return _model
|
| 29 |
|
| 30 |
def _build_tarot():
|
| 31 |
-
with open(TAROT_JSON) as f:
|
| 32 |
data = json.load(f)
|
| 33 |
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
| 44 |
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
| 48 |
|
| 49 |
def _build_num():
|
| 50 |
-
with open(NUM_JSON) as f:
|
| 51 |
data = json.load(f)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
-
texts = [
|
| 54 |
-
(i, n["number"],
|
| 55 |
-
(str(n["number"]) + " " + n.get("life_path_meaning","") + " " + n.get("advice","")).strip())
|
| 56 |
-
for i, n in enumerate(data)
|
| 57 |
-
]
|
| 58 |
model = get_model()
|
| 59 |
-
embs = model.encode([t[2] for t in
|
|
|
|
|
|
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
with open(NUM_META, "w") as f:
|
| 66 |
-
json.dump([{"i":i, "number":num, "text":txt} for (i,num,txt) in texts], f, indent=2)
|
| 67 |
|
| 68 |
def ensure_indexes():
|
| 69 |
os.makedirs(CACHE_DIR, exist_ok=True)
|
|
@@ -72,22 +78,20 @@ def ensure_indexes():
|
|
| 72 |
if not (os.path.exists(NUM_IDX) and os.path.exists(NUM_META)):
|
| 73 |
_build_num()
|
| 74 |
|
| 75 |
-
def _search(
|
| 76 |
model = get_model()
|
| 77 |
-
idx = faiss.read_index(
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
q = model.encode([query], normalize_embeddings=True).astype("float32")
|
| 82 |
D, I = idx.search(q, k)
|
| 83 |
-
|
| 84 |
-
for rank, (score, j) in enumerate(zip(D[0], I[0]),
|
| 85 |
-
m = meta[j]
|
| 86 |
-
m =
|
| 87 |
-
m
|
| 88 |
-
|
| 89 |
-
results.append(m)
|
| 90 |
-
return results
|
| 91 |
|
| 92 |
def search_tarot(query: str, k: int = 3):
|
| 93 |
ensure_indexes()
|
|
|
|
| 1 |
+
# rag_utils.py (replaces _build_tarot / _build_num / ensure / search; the rest is the same as before)
|
| 2 |
+
import os, json, numpy as np, faiss
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
from sentence_transformers import SentenceTransformer
|
| 4 |
|
| 5 |
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
|
| 6 |
+
CACHE_DIR = "/tmp"
|
|
|
|
| 7 |
TAROT_JSON = os.path.join(DATA_DIR, "tarot_data_full.json")
|
| 8 |
NUM_JSON = os.path.join(DATA_DIR, "numerology_data_full.json")
|
|
|
|
| 9 |
TAROT_IDX = os.path.join(CACHE_DIR, "faiss_tarot.index")
|
| 10 |
TAROT_META = os.path.join(CACHE_DIR, "faiss_tarot_meta.json")
|
| 11 |
NUM_IDX = os.path.join(CACHE_DIR, "faiss_num.index")
|
| 12 |
NUM_META = os.path.join(CACHE_DIR, "faiss_num_meta.json")
|
|
|
|
|
|
|
| 13 |
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
|
| 14 |
|
| 15 |
_model = None
|
|
|
|
| 20 |
return _model
|
| 21 |
|
| 22 |
def _build_tarot():
    """Build the tarot FAISS index and its JSON metadata sidecar.

    Reads the card list from ``TAROT_JSON``, derives one text per card,
    embeds the texts with the shared sentence-transformer model, and writes
    an inner-product index to ``TAROT_IDX`` plus a parallel metadata list to
    ``TAROT_META``. Embeddings are L2-normalised, so inner-product scores
    behave as cosine similarity.

    Raises:
        ValueError: if the dataset contains no cards.
    """
    with open(TAROT_JSON, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Accept either "name" or "card_name"; the text is built from
    # meaning_upright / meaning_reversed / advice.
    items = []
    for i, c in enumerate(data):
        name = c.get("name") or c.get("card_name") or f"Card {i}"
        # `or ""` guards against explicit JSON nulls, which str.join rejects.
        text = " ".join([
            c.get("meaning_upright") or "",
            c.get("meaning_reversed") or "",
            c.get("advice") or "",
        ]).strip()
        if not text:
            # Fallback: no prose available, so embed the name plus keywords.
            kws = " ".join(
                (c.get("keywords_upright") or [])
                + (c.get("keywords_reversed") or [])
            )
            text = f"{name} {kws}".strip()
        items.append((i, name, text))

    if not items:
        # Without this, the failure surfaces later as an opaque IndexError
        # on embs.shape[1].
        raise ValueError(f"no tarot cards found in {TAROT_JSON}")

    model = get_model()
    embs = model.encode([text for _, _, text in items], normalize_embeddings=True)
    index = faiss.IndexFlatIP(embs.shape[1])
    # FAISS requires a contiguous float32 matrix.
    index.add(np.ascontiguousarray(embs, dtype="float32"))

    faiss.write_index(index, TAROT_IDX)
    with open(TAROT_META, "w", encoding="utf-8") as f:
        json.dump(
            [{"i": i, "card_name": n, "text": t} for (i, n, t) in items],
            f,
            ensure_ascii=False,
            indent=2,
        )
|
| 50 |
|
| 51 |
def _build_num():
    """Build the numerology FAISS index and its JSON metadata sidecar.

    Reads the entries from ``NUM_JSON``, derives one text per entry from its
    number, description and advice, embeds the texts with the shared
    sentence-transformer model, and writes an inner-product index to
    ``NUM_IDX`` plus a parallel metadata list to ``NUM_META``. Embeddings are
    L2-normalised, so inner-product scores behave as cosine similarity.

    Raises:
        ValueError: if the dataset contains no entries.
    """
    with open(NUM_JSON, "r", encoding="utf-8") as f:
        data = json.load(f)

    items = []
    for i, n in enumerate(data):
        num = n.get("number")
        text = " ".join([
            # A missing number must not leak the literal "None" into the text.
            "" if num is None else str(num),
            # `or ""` guards against explicit JSON nulls, which str.join rejects.
            n.get("description") or "",
            n.get("advice") or "",
        ]).strip()
        items.append((i, num, text))

    if not items:
        # Without this, the failure surfaces later as an opaque IndexError
        # on embs.shape[1].
        raise ValueError(f"no numerology entries found in {NUM_JSON}")

    model = get_model()
    embs = model.encode([text for _, _, text in items], normalize_embeddings=True)
    index = faiss.IndexFlatIP(embs.shape[1])
    # FAISS requires a contiguous float32 matrix.
    index.add(np.ascontiguousarray(embs, dtype="float32"))

    faiss.write_index(index, NUM_IDX)
    with open(NUM_META, "w", encoding="utf-8") as f:
        json.dump(
            [{"i": i, "number": num, "text": t} for (i, num, t) in items],
            f,
            ensure_ascii=False,
            indent=2,
        )
|
|
|
|
|
|
|
| 73 |
|
| 74 |
def ensure_indexes():
|
| 75 |
os.makedirs(CACHE_DIR, exist_ok=True)
|
|
|
|
| 78 |
if not (os.path.exists(NUM_IDX) and os.path.exists(NUM_META)):
|
| 79 |
_build_num()
|
| 80 |
|
| 81 |
+
def _search(idx_path, meta_path, query, k):
    """Return up to ``k`` metadata entries most similar to ``query``.

    Args:
        idx_path: path of a FAISS index written by one of the ``_build_*``
            functions.
        meta_path: path of the JSON metadata list whose positions match the
            index rows.
        query: free-text query to embed.
        k: maximum number of hits to return.

    Returns:
        A list of copies of the matching metadata dicts, best match first,
        each extended with ``"score"`` (float inner product of normalised
        embeddings) and ``"rank"`` (1-based). The list is shorter than ``k``
        when fewer entries are indexed, and empty when ``k <= 0``.
    """
    if k <= 0:
        return []

    model = get_model()
    idx = faiss.read_index(idx_path)
    with open(meta_path, "r", encoding="utf-8") as f:
        meta = json.load(f)

    # Never request more neighbours than exist: FAISS pads missing results
    # with label -1, and meta[-1] would silently return the last entry.
    k = min(k, idx.ntotal, len(meta))
    if k <= 0:
        return []

    q = model.encode([query], normalize_embeddings=True).astype("float32")
    scores, labels = idx.search(q, k)

    out = []
    for score, j in zip(scores[0], labels[0]):
        j = int(j)
        if j < 0 or j >= len(meta):
            # Defensive: skip padding labels and any index/meta mismatch.
            continue
        m = dict(meta[j])  # copy so the loaded metadata is not mutated
        m["score"] = float(score)
        m["rank"] = len(out) + 1
        out.append(m)
    return out
|
|
|
|
|
|
|
| 95 |
|
| 96 |
def search_tarot(query: str, k: int = 3):
|
| 97 |
ensure_indexes()
|