File size: 5,026 Bytes
dcf08ea
ceda26c
 
 
f081737
dcf08ea
f081737
cc23211
ceda26c
bd917a1
691dfff
bd917a1
 
691dfff
b88c230
bd917a1
 
 
f081737
 
ceda26c
 
 
f081737
 
 
 
 
 
ceda26c
f081737
 
 
 
ceda26c
 
f081737
bd917a1
7ce351b
f081737
 
 
f9e2481
dcf08ea
 
ceda26c
f081737
691dfff
dcf08ea
f081737
ceda26c
f081737
ceda26c
dcf08ea
82a60fd
691dfff
 
82a60fd
 
 
 
691dfff
cc23211
 
691dfff
bd917a1
 
82a60fd
ceda26c
9c58f40
ceda26c
2adf1be
0ce457c
 
 
 
 
 
2adf1be
 
0ce457c
 
 
 
 
ceda26c
 
7855967
691dfff
7855967
dcf08ea
bd917a1
691dfff
bd917a1
 
dcf08ea
f081737
cc23211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dcf08ea
691dfff
 
f081737
cc23211
dcf08ea
cc23211
 
 
 
 
 
 
 
 
 
dcf08ea
 
 
 
 
 
f081737
 
bd917a1
f081737
 
dcf08ea
f081737
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import hashlib
import json
import re

import gradio as gr
import numpy as np
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# -------------------- Config --------------------
TOP_K        = 5            # number of TF-IDF candidates retrieved per query
FINAL_TOP_N  = 1            # how many of those candidates go into the LLM prompt
MIN_CONF     = 0.14         # minimum cosine similarity; below this we answer "not found"
CHUNK_LIMIT  = 300          # max characters of each context chunk (see truncate)
MAX_TOKENS   = 256          # generation cap for the LLM
TEMP         = 0.2          # LLM sampling temperature (low = near-deterministic)
QUALITY_LOG  = "quality_feedback.jsonl"   # NOTE(review): not referenced anywhere in this file — confirm it is used elsewhere

# -------------------- Load Dataset --------------------
# Knowledge base: a JSON array of records, loaded once at import time.
DATASET_PATH = "nbb_merged_full.json"
with open(DATASET_PATH, "r", encoding="utf-8") as f:
    RAW_DATA = json.load(f)

def normalize_record(d):
    """Normalize a raw dataset record to the canonical shape.

    Two source schemas are supported: {"content": {"lo": ...}} and
    {"data": {"answer": ...}}. Anything else yields an empty "lo",
    which the caller filters out.

    Args:
        d: one raw record (dict) from the dataset.

    Returns:
        dict with keys "id", "title", "section", "content" (content.lo is str).
    """
    lo_text = ""
    content = d.get("content")
    data = d.get("data")
    if isinstance(content, dict):
        lo_text = content.get("lo", "")
    elif isinstance(data, dict):  # guard: original crashed when "data" was not a dict
        lo_text = data.get("answer", "")
    if not isinstance(lo_text, str):
        # Coerce so the downstream .strip() filter never raises.
        lo_text = str(lo_text)
    # Stable fallback id: builtin hash() varies per process (PYTHONHASHSEED),
    # so use a content digest with a deterministic serialization instead.
    fallback_id = hashlib.md5(
        json.dumps(d, sort_keys=True, ensure_ascii=False).encode("utf-8")
    ).hexdigest()
    return {
        "id": d.get("id", fallback_id),
        "title": d.get("title", ""),
        "section": d.get("section", ""),
        "content": {"lo": lo_text},
    }

# Normalize each raw record exactly once (the original called normalize_record
# twice per element), keeping only records with non-empty Lao text.
DOCS = [rec for rec in map(normalize_record, RAW_DATA) if rec["content"]["lo"].strip()]
# Explicit raise instead of assert: asserts are stripped under `python -O`.
if not DOCS:
    raise RuntimeError("Dataset ບໍ່ມີ content.lo")

CORPUS = [d["content"]["lo"] for d in DOCS]        # retrieval corpus (Lao text)
IDS = [d["id"] for d in DOCS]                      # parallel list of record ids
ID2DOC = {d["id"]: d for d in DOCS}                # id -> normalized record

# Word uni/bi-gram TF-IDF index over the whole corpus, built once at startup.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=1, max_df=0.95, sublinear_tf=True)
X = vectorizer.fit_transform(CORPUS)

# -------------------- Search --------------------
def search(query, k=TOP_K):
    """Rank the corpus against *query* by TF-IDF cosine similarity.

    Returns a list of {"id", "score"} dicts for the k best documents,
    highest score first.
    """
    similarities = cosine_similarity(vectorizer.transform([query]), X)[0]
    # argsort on the negated scores gives a descending ranking.
    best = np.argsort(-similarities)[:k]
    return [{"id": IDS[i], "score": similarities[i]} for i in best]

# -------------------- Load LLM --------------------
# Download the quantized GGUF model from the HF Hub (cached locally after the
# first run by hf_hub_download).
MODEL_PATH = hf_hub_download(
    repo_id="Qwen/Qwen2.5-1.5B-Instruct-GGUF",  
    filename="qwen2.5-1.5b-instruct-q4_k_m.gguf"
)

LLM = Llama(
    model_path=MODEL_PATH,
    n_ctx=1024,       # context window (tokens); prompt + completion must fit
    n_threads=4,      # CPU threads for inference
    n_gpu_layers=0,   # CPU-only
    n_batch=128,      # prompt-processing batch size
    logits_all=False,
    verbose=False
)

# System-prompt preamble prepended to every LLM call (see build_prompt).
# Runtime string — do not reword without re-validating model behavior.
SYSTEM_RULES = """
You are a Lao banking assistant for NAYOBY BANK (NBB).

HARD RULES (do not break):
1) Answer ONLY from the provided Context. Do NOT use outside knowledge or make assumptions.
2) If the answer is not clearly in the Context, reply in Lao: "ຂໍອະໄພ ຂ້ອຍບໍ່ພົບຂໍ້ມູນໃນຖານຄວາມຮູ້."
3) Cite the evidence ids at the end in square brackets (1–3 ids).
4) Default reply in Lao; if the whole user question is Thai/English, reply with that language; keep product terms exactly as in Context.
5) Never invent numbers, dates, fees, branches, or contacts beyond the Context.

STYLE:
- Concise (≤ 100 Lao words). Direct answer first, bullets if needed.
- Keep terminology exactly as in Context.

FORMAT:
- End the last line with citations like: [id_a, id_b]
"""

def truncate(text, limit=CHUNK_LIMIT):
    """Clip *text* to *limit* characters, appending "..." when clipped."""
    if len(text) <= limit:
        return text
    return text[:limit] + "..."

def build_prompt(question, hits):
    """Assemble the LLM prompt: system rules + top retrieved context + question."""
    selected = hits[:FINAL_TOP_N]
    chunks = [truncate(ID2DOC[hit["id"]]["content"]["lo"]) for hit in selected]
    ctx = "\n\n".join(chunks)
    return f"{SYSTEM_RULES}\n\nContext:\n{ctx}\n\nQuestion:\n{question}\n\nAnswer:"

# -------------------- Helper functions --------------------
def limit_words(text, max_words=100):
    """Return at most *max_words* whitespace-delimited words of *text*.

    Note: runs of whitespace collapse to single spaces (split/join round trip).
    """
    return " ".join(text.split()[:max_words])

def clean_citations(text):
    """Normalize the trailing citation block of an answer.

    Takes the LAST bracketed citation group in *text*, de-duplicates its ids
    (order-preserving), caps them at 3, and re-appends them as a single
    trailing "[id_a, id_b]" block.

    Bug fixed: the original only stripped brackets anchored at end-of-text,
    so a citation sitting mid-text was left in place AND re-appended,
    duplicating it. We now remove the last citation wherever it occurs,
    then strip any remaining trailing bracket groups.
    """
    matches = list(re.finditer(r"\[([^\]]*)\]", text))
    if not matches:
        return text
    last = matches[-1]
    # De-duplicate while keeping first-seen order; cap at 3 ids.
    ids = [x.strip() for x in last.group(1).split(",") if x.strip()]
    ids = list(dict.fromkeys(ids))[:3]
    # Drop the last citation in place (rstrip avoids a doubled space when it
    # was mid-text), then drop any other bracket groups left at the end.
    text = (text[: last.start()].rstrip() + text[last.end():]).strip()
    text = re.sub(r"(\s*\[[^\]]*\])+\s*$", "", text).strip()
    return f"{text} [{', '.join(ids)}]"

# -------------------- Answer --------------------
def smart_answer(message):
    """Retrieve context for *message*, query the LLM, and post-process the reply.

    Returns a Lao "not found" apology when retrieval confidence is below
    MIN_CONF; otherwise the LLM completion, word-limited and with a
    normalized trailing citation block.
    """
    hits = search(message, k=TOP_K)
    # NOTE(review): this fallback wording differs slightly from rule 2 in
    # SYSTEM_RULES ("ຂ້ອຍບໍ່ພົບ..." vs "ບໍ່ພົບ...") — confirm which is intended.
    if not hits or hits[0]["score"] < MIN_CONF:
        return "ຂໍອະໄພ ບໍ່ພົບຂໍ້ມູນໃນຖານຄວາມຮູ້."

    completion = LLM(
        build_prompt(message, hits),
        max_tokens=MAX_TOKENS,
        temperature=TEMP,
        stop=["\n\nQuestion:", "Context:", "Answer:", "</s>"],
    )
    raw = completion["choices"][0]["text"].strip()
    return clean_citations(limit_words(raw, 100))

# -------------------- Gradio Chatbot --------------------
def respond(message, history):
    """Gradio callback: answer *message* and append the (user, bot) pair to history."""
    reply = smart_answer(message)
    return history + [(message, reply)]

with gr.Blocks() as demo:
    gr.Markdown("## ທົດລອງ RDB Chatbot")
    chatbot_ui = gr.Chatbot()   # tuple-format history: list of (user, bot) pairs
    msg = gr.Textbox(placeholder="ພິມຄຳຖາມບ່ອນນີ້...")
    # On Enter: respond(msg, chatbot_ui) -> chatbot_ui. The textbox is not
    # cleared after submit — presumably intentional; confirm desired UX.
    msg.submit(respond, [msg, chatbot_ui], chatbot_ui)

if __name__ == "__main__":
    demo.launch()