Spaces:

sibthinon
/

environment

Sleeping

App Files Files Community

sibthinon committed on Jun 11

Commit

dbd7784

verified ·

1 Parent(s): 8933ccb

add rerank

Browse files

Files changed (1) hide show

app.py +27 -3

app.py CHANGED Viewed

@@ -13,6 +13,7 @@ from pyairtable import Api
 import pickle
 import re
 import unicodedata
 # Setup Qdrant Client
 qdrant_client = QdrantClient(
@@ -43,15 +44,23 @@ models = {
     "BGE M3": {
         "model": SentenceTransformer("BAAI/bge-m3"),
         "collection": "product_bge-m3",
-        "threshold": 0.5,
         "prefix": ""
     }
 }
 # Utils
 def is_non_thai(text):
     """Return True when ``text`` is non-empty and consists only of ASCII
     letters, digits, ``&``, ``-`` and whitespace characters."""
     ascii_only = re.match(r'^[A-Za-z0-9&\-\s]+$', text)
     if ascii_only is None:
         return False
     return True
 def normalize(text: str) -> str:
     if is_non_thai(text):
         return text.strip()
@@ -90,7 +99,7 @@ def correct_query_merge_phrases(query: str, whitelist, threshold=80, max_ngram=3
         if not matched:
             corrected.append(tokens[i])
             i += 1
-    return "".join([word for word in corrected if len(word) > 1 or word in whitelist])
 # Global state
 # Module-level record with the keys "query", "result", "raw_query" and "time",
 # all initialised to empty strings. Presumably overwritten by search_product
 # after each search -- confirm against the rest of the file.
 latest_query_result = {"query": "", "result": "", "raw_query": "", "time": ""}
@@ -110,6 +119,7 @@ def search_product(query, model_choice):
     query_embed = model.encode(prefix + corrected_query)
     try:
         result = qdrant_client.query_points(
             collection_name=collection_name,
             query=query_embed.tolist(),
@@ -120,11 +130,25 @@ def search_product(query, model_choice):
     except Exception as e:
         return f"<p>❌ Qdrant error: {str(e)}</p>"
     elapsed = time.time() - start_time
     html_output = f"<p>⏱ <strong>{elapsed:.2f} วินาที</strong></p>"
     if corrected_query != query:
         html_output += f"<p>🔧 แก้คำค้นจาก: <code>{query}</code> → <code>{corrected_query}</code></p>"
     html_output += '<div style="display: grid; grid-template-columns: repeat(auto-fill, minmax(220px, 1fr)); gap: 20px;">'
     result_summary, found = "", False

 import pickle
 import re
 import unicodedata
+from FlagEmbedding import FlagReranker
 # Setup Qdrant Client
 qdrant_client = QdrantClient(
     "BGE M3": {
         "model": SentenceTransformer("BAAI/bge-m3"),
         "collection": "product_bge-m3",
+        "threshold": 0.45,
         "prefix": ""
     }
 }
+# Module-level reranker instance, created once at import time; search_product
+# calls reranker.compute_score on (query, product name) pairs for "BGE M3".
+reranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=True)
 # Utils
 def is_non_thai(text):
     """Return True when ``text`` is non-empty and consists only of ASCII
     letters, digits, ``&``, ``-`` and whitespace characters."""
     ascii_only = re.match(r'^[A-Za-z0-9&\-\s]+$', text)
     if ascii_only is None:
         return False
     return True
+def join_corrected_tokens(corrected: list) -> str:
+    if corrected and is_non_thai("".join(corrected)):
+        return " ".join([w for w in corrected if len(w) > 1 or w in keyword_whitelist])
+    else:
+        return "".join([w for w in corrected if len(w) > 1 or w in keyword_whitelist])
 def normalize(text: str) -> str:
     if is_non_thai(text):
         return text.strip()
         if not matched:
             corrected.append(tokens[i])
             i += 1
+    return join_corrected_tokens(corrected)
 # Global state
 # Module-level record with the keys "query", "result", "raw_query" and "time",
 # all initialised to empty strings. Presumably overwritten by search_product
 # after each search -- confirm against the rest of the file.
 latest_query_result = {"query": "", "result": "", "raw_query": "", "time": ""}
     query_embed = model.encode(prefix + corrected_query)
     try:
+        # 🔍 Fetch the top-50 before reranking
         result = qdrant_client.query_points(
             collection_name=collection_name,
             query=query_embed.tolist(),
     except Exception as e:
         return f"<p>❌ Qdrant error: {str(e)}</p>"
+    # ✅ Rerank the top 10 with a cross-encoder (BGE M3 only)
+    if model_choice == "BGE M3" and len(result) > 0:
+        topk = 10
+        docs = [r.payload.get("name", "") for r in result[:topk]]
+        pairs = [[corrected_query, d] for d in docs]
+        scores = reranker.compute_score(pairs, normalize=True)
+        # Blend scores: 0.6 from the embedding, 0.4 from the reranker
+        result[:topk] = sorted(
+            zip(result[:topk], scores),
+            key=lambda x: 0.6 * x[0].score + 0.4 * x[1],
+            reverse=True
+        )
+        result[:topk] = [r[0] for r in result[:topk]]
     elapsed = time.time() - start_time
     html_output = f"<p>⏱ <strong>{elapsed:.2f} วินาที</strong></p>"
     if corrected_query != query:
         html_output += f"<p>🔧 แก้คำค้นจาก: <code>{query}</code> → <code>{corrected_query}</code></p>"
     html_output += '<div style="display: grid; grid-template-columns: repeat(auto-fill, minmax(220px, 1fr)); gap: 20px;">'
     result_summary, found = "", False