Spaces:
Running
Running
update fuzzy method
Browse files
app.py
CHANGED
|
@@ -37,7 +37,12 @@ with open("keyword_whitelist.pkl", "rb") as f:
|
|
| 37 |
keyword_whitelist = pickle.load(f)
|
| 38 |
|
| 39 |
# Utils
|
|
|
|
|
|
|
|
|
|
| 40 |
def normalize(text: str) -> str:
    """Return a canonical form of *text* for keyword comparison.

    The text is Unicode-normalized to NFC, Thai vowel sequences that
    stand in for sara ae are collapsed, surrounding whitespace is
    stripped, and the result is lowercased.

    Args:
        text: Raw input string.

    Returns:
        The normalized string.
    """
    text = unicodedata.normalize("NFC", text)
    # Sara e followed by sara ae, or a doubled sara e, are both rewritten
    # to a single sara ae (presumably to absorb a common typing variant).
    return text.replace("เแ", "แ").replace("เเ", "แ").strip().lower()
|
| 43 |
|
|
@@ -54,7 +59,17 @@ def correct_query_merge_phrases(query: str, whitelist, threshold=80, max_ngram=3
|
|
| 54 |
matched = False
|
| 55 |
for n in range(min(max_ngram, len(tokens) - i), 0, -1):
|
| 56 |
phrase = "".join(tokens[i:i+n])
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
if score >= threshold:
|
| 59 |
corrected.append(match)
|
| 60 |
i += n
|
|
|
|
| 37 |
keyword_whitelist = pickle.load(f)
|
| 38 |
|
| 39 |
# Utils
|
| 40 |
+
def is_non_thai(text: str) -> bool:
    """Return True when *text* is made up only of plain ASCII keyword characters.

    The accepted characters are ASCII letters, digits, ``&``, ``-`` and
    whitespace. Despite the name, nothing Thai-specific is tested: any
    character outside that set (Thai, accented Latin, other punctuation)
    makes the result False, as does the empty string.

    Args:
        text: String to classify.

    Returns:
        True if *text* is non-empty and every character is in the
        accepted set, otherwise False.
    """
    return re.fullmatch(r'[A-Za-z0-9&\-\s]+', text) is not None
|
| 42 |
+
|
| 43 |
def normalize(text: str) -> str:
    """Return a canonical form of *text* for keyword comparison.

    Text that ``is_non_thai`` accepts is only stripped of surrounding
    whitespace; its casing is preserved and no Unicode normalization is
    applied. All other text is normalized to NFC, has Thai vowel
    sequences that stand in for sara ae collapsed, is stripped, and is
    lowercased.

    Args:
        text: Raw input string.

    Returns:
        The normalized string.
    """
    if is_non_thai(text):
        # NOTE(review): case is kept on this branch, presumably so ASCII
        # keywords keep their original spelling -- confirm with callers.
        return text.strip()
    text = unicodedata.normalize("NFC", text)
    # Sara e followed by sara ae, or a doubled sara e, are both rewritten
    # to a single sara ae (presumably to absorb a common typing variant).
    return text.replace("เแ", "แ").replace("เเ", "แ").strip().lower()
|
| 48 |
|
|
|
|
| 59 |
matched = False
|
| 60 |
for n in range(min(max_ngram, len(tokens) - i), 0, -1):
|
| 61 |
phrase = "".join(tokens[i:i+n])
|
| 62 |
+
if phrase in whitelist:
|
| 63 |
+
corrected.append(phrase)
|
| 64 |
+
i += n
|
| 65 |
+
matched = True
|
| 66 |
+
break
|
| 67 |
+
match, score, _ = process.extractOne(
|
| 68 |
+
phrase,
|
| 69 |
+
whitelist,
|
| 70 |
+
scorer=fuzz.token_sort_ratio,
|
| 71 |
+
processor=lambda x: x.lower()
|
| 72 |
+
)
|
| 73 |
if score >= threshold:
|
| 74 |
corrected.append(match)
|
| 75 |
i += n
|