Spaces:
Sleeping
Sleeping
File size: 5,026 Bytes
dcf08ea ceda26c f081737 dcf08ea f081737 cc23211 ceda26c bd917a1 691dfff bd917a1 691dfff b88c230 bd917a1 f081737 ceda26c f081737 ceda26c f081737 ceda26c f081737 bd917a1 7ce351b f081737 f9e2481 dcf08ea ceda26c f081737 691dfff dcf08ea f081737 ceda26c f081737 ceda26c dcf08ea 82a60fd 691dfff 82a60fd 691dfff cc23211 691dfff bd917a1 82a60fd ceda26c 9c58f40 ceda26c 2adf1be 0ce457c 2adf1be 0ce457c ceda26c 7855967 691dfff 7855967 dcf08ea bd917a1 691dfff bd917a1 dcf08ea f081737 cc23211 dcf08ea 691dfff f081737 cc23211 dcf08ea cc23211 dcf08ea f081737 bd917a1 f081737 dcf08ea f081737 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
import hashlib
import json
import re

import gradio as gr
import numpy as np
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# -------------------- Config --------------------
TOP_K = 5            # number of TF-IDF hits retrieved per query
FINAL_TOP_N = 1      # how many of those hits are placed into the LLM context
MIN_CONF = 0.14      # minimum cosine similarity of the best hit; below -> fallback reply
CHUNK_LIMIT = 300    # max characters of a context chunk before truncation
MAX_TOKENS = 256     # LLM generation budget
TEMP = 0.2           # low temperature: keep answers close to the context
QUALITY_LOG = "quality_feedback.jsonl"  # NOTE(review): not used anywhere in this file — confirm
# -------------------- Load Dataset --------------------
DATASET_PATH = "nbb_merged_full.json"
# Knowledge base: a JSON list of records (see normalize_record for accepted shapes).
with open(DATASET_PATH, "r", encoding="utf-8") as f:
    RAW_DATA = json.load(f)
def normalize_record(d):
    """Normalize a raw dataset record into ``{id, title, section, content.lo}``.

    Two raw shapes are supported: ``{"content": {"lo": ...}}`` and
    ``{"data": {"answer": ...}}``. Records matching neither shape get an
    empty ``lo`` string (callers filter those out).

    Fixes vs. the original:
    - the ``"data"`` branch now guards with ``isinstance`` (the original
      crashed with AttributeError when ``d["data"]`` was not a dict, while
      the ``"content"`` branch already guarded);
    - the fallback id uses a content digest instead of ``str(hash(...))``,
      because ``hash()`` of strings is randomized per process
      (PYTHONHASHSEED), so ids were not stable across runs.
    """
    lo_text = ""
    if isinstance(d.get("content"), dict):
        lo_text = d["content"].get("lo", "")
    elif isinstance(d.get("data"), dict):
        lo_text = d["data"].get("answer", "")
    if "id" in d:
        rec_id = d["id"]
    else:
        # Deterministic fallback id: sort_keys makes the digest independent
        # of key insertion order.
        payload = json.dumps(d, ensure_ascii=False, sort_keys=True)
        rec_id = hashlib.md5(payload.encode("utf-8")).hexdigest()
    return {
        "id": rec_id,
        "title": d.get("title", ""),
        "section": d.get("section", ""),
        "content": {"lo": lo_text},
    }
# Normalize each record exactly once (the original called normalize_record
# twice per element: once to build, once to filter) and keep only records
# with non-empty Lao text.
DOCS = [r for r in map(normalize_record, RAW_DATA) if r["content"]["lo"].strip()]
if not DOCS:
    # Explicit raise instead of assert: asserts are stripped under `python -O`.
    raise ValueError("Dataset ບໍ່ມີ content.lo")
CORPUS = [d["content"]["lo"] for d in DOCS]
IDS = [d["id"] for d in DOCS]
ID2DOC = {d["id"]: d for d in DOCS}
# Word uni/bi-grams; sublinear tf dampens repeated-term dominance in short chunks.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=1, max_df=0.95, sublinear_tf=True)
X = vectorizer.fit_transform(CORPUS)
# -------------------- Search --------------------
def search(query, k=TOP_K):
    """Return the top-*k* corpus hits for *query* as ``[{"id", "score"}, ...]``.

    Scores are cosine similarities between the TF-IDF vector of *query*
    and the pre-fitted corpus matrix ``X``; results come back best-first.
    """
    query_vec = vectorizer.transform([query])
    scores = cosine_similarity(query_vec, X)[0]
    best = np.argsort(-scores)[:k]  # negate for descending order
    return [{"id": IDS[pos], "score": scores[pos]} for pos in best]
# -------------------- Load LLM --------------------
# Download the quantized (q4_k_m) Qwen2.5 1.5B instruct model from the Hub;
# hf_hub_download caches locally and returns the on-disk path.
MODEL_PATH = hf_hub_download(
    repo_id="Qwen/Qwen2.5-1.5B-Instruct-GGUF",
    filename="qwen2.5-1.5b-instruct-q4_k_m.gguf"
)
# CPU-only llama.cpp instance sized for a free Space: small context window,
# 4 threads, no GPU offload.
LLM = Llama(
    model_path=MODEL_PATH,
    n_ctx=1024,          # context window; build_prompt + MAX_TOKENS must fit
    n_threads=4,
    n_gpu_layers=0,      # pure CPU inference
    n_batch=128,
    logits_all=False,    # only need logits for the sampled tokens
    verbose=False
)
# Prompt preamble prepended to every LLM call (see build_prompt). This is a
# runtime string sent to the model — kept verbatim; the Lao sentence in rule 2
# is the exact refusal message the model must emit when context is missing.
SYSTEM_RULES = """
You are a Lao banking assistant for NAYOBY BANK (NBB).
HARD RULES (do not break):
1) Answer ONLY from the provided Context. Do NOT use outside knowledge or make assumptions.
2) If the answer is not clearly in the Context, reply in Lao: "ຂໍອະໄພ ຂ້ອຍບໍ່ພົບຂໍ້ມູນໃນຖານຄວາມຮູ້."
3) Cite the evidence ids at the end in square brackets (1–3 ids).
4) Default reply in Lao; if the whole user question is Thai/English, reply with that language; keep product terms exactly as in Context.
5) Never invent numbers, dates, fees, branches, or contacts beyond the Context.
STYLE:
- Concise (≤ 100 Lao words). Direct answer first, bullets if needed.
- Keep terminology exactly as in Context.
FORMAT:
- End the last line with citations like: [id_a, id_b]
"""
def truncate(text, limit=CHUNK_LIMIT):
    """Clip *text* to at most *limit* characters, appending "..." when clipped."""
    if len(text) <= limit:
        return text
    return text[:limit] + "..."
def build_prompt(question, hits):
    """Assemble system rules, retrieved context, and the question into one prompt.

    Only the first FINAL_TOP_N hits are used; each chunk is truncated to
    CHUNK_LIMIT characters before being joined into the Context section.
    """
    chunks = []
    for hit in hits[:FINAL_TOP_N]:
        chunks.append(truncate(ID2DOC[hit["id"]]["content"]["lo"]))
    ctx = "\n\n".join(chunks)
    return f"{SYSTEM_RULES}\n\nContext:\n{ctx}\n\nQuestion:\n{question}\n\nAnswer:"
# -------------------- Helper functions --------------------
def limit_words(text, max_words=100):
    """Return at most the first *max_words* whitespace-separated words of *text*.

    Note: runs of whitespace are collapsed to single spaces as a side effect
    of str.split()/join.
    """
    return " ".join(text.split()[:max_words])
def clean_citations(text):
# เอา citation ซ้ำ ๆ ออก และจำกัดไม่เกิน 3 id
match = re.findall(r"\[(.*?)\]", text)
if not match:
return text
ids = match[-1].split(",") # ใช้ citation ชุดสุดท้าย
ids = [x.strip() for x in ids if x.strip()]
ids = list(dict.fromkeys(ids))[:3] # ลบซ้ำ + จำกัด 3
text = re.sub(r"\[.*?\]$", "", text).strip()
return f"{text} [{', '.join(ids)}]"
# -------------------- Answer --------------------
def smart_answer(message):
    """Retrieve context for *message*, query the LLM, and post-process the reply.

    Returns the Lao refusal message when retrieval confidence is below
    MIN_CONF; otherwise the LLM answer, word-limited and with cleaned
    citations.
    """
    hits = search(message, k=TOP_K)
    if not hits or hits[0]["score"] < MIN_CONF:
        # Consistency fix: use the exact refusal string mandated by
        # SYSTEM_RULES rule 2, so low-confidence retrieval and in-context
        # refusal read identically to the user (the original dropped "ຂ້ອຍ").
        return "ຂໍອະໄພ ຂ້ອຍບໍ່ພົບຂໍ້ມູນໃນຖານຄວາມຮູ້."
    prompt = build_prompt(message, hits)
    out = LLM(
        prompt,
        max_tokens=MAX_TOKENS,
        temperature=TEMP,
        # Stop sequences prevent the model from hallucinating a follow-up
        # Question/Context section.
        stop=["\n\nQuestion:", "Context:", "Answer:", "</s>"]
    )
    answer = out["choices"][0]["text"].strip()
    answer = limit_words(answer, 100)
    answer = clean_citations(answer)
    return answer
# -------------------- Gradio Chatbot --------------------
def respond(message, history):
    """Gradio submit callback: append the (user, bot) pair to the chat history.

    Returns a new list (does not mutate *history* in place), which Gradio
    uses to refresh the Chatbot component.
    """
    reply = smart_answer(message)
    return history + [(message, reply)]
# Minimal chat UI: a Chatbot panel plus a textbox; pressing Enter submits.
with gr.Blocks() as demo:
    gr.Markdown("## ທົດລອງ RDB Chatbot")  # page title (Lao: "Try RDB Chatbot")
    chatbot_ui = gr.Chatbot()
    msg = gr.Textbox(placeholder="ພິມຄຳຖາມບ່ອນນີ້...")  # Lao: "Type your question here..."
    # (message, history) -> respond() -> updated history replaces the Chatbot
    # content. NOTE(review): the textbox is not cleared after submit — confirm
    # whether that is intended.
    msg.submit(respond, [msg, chatbot_ui], chatbot_ui)
if __name__ == "__main__":
    demo.launch()
|