Spaces: Build error
Update app.py

app.py CHANGED
@@ -146,6 +146,30 @@ def phonetic_match(text, query, method='levenshtein_distance'):
         return jellyfish.levenshtein_distance(text_phonetic, query_phonetic)
     return 0
 
+def create_custom_embedding(texts, model_type='word2vec', vector_size=100, window=5, min_count=1):
+    # Tokenize the texts
+    tokenized_texts = [text.split() for text in texts]
+
+    if model_type == 'word2vec':
+        model = Word2Vec(sentences=tokenized_texts, vector_size=vector_size, window=window, min_count=min_count, workers=4)
+    elif model_type == 'fasttext':
+        model = FastText(sentences=tokenized_texts, vector_size=vector_size, window=window, min_count=min_count, workers=4)
+    else:
+        raise ValueError("Unsupported model type")
+
+    return model
+
+class CustomEmbeddings(HuggingFaceEmbeddings):
+    def __init__(self, model_path):
+        self.model = Word2Vec.load(model_path)  # or FastText.load() for FastText models
+
+    def embed_documents(self, texts):
+        return [self.model.wv[text.split()] for text in texts]
+
+    def embed_query(self, text):
+        return self.model.wv[text.split()]
+
+
 # Custom Tokenizer
 def create_custom_tokenizer(file_path):
     with open(file_path, 'r', encoding='utf-8') as f:
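Review note on the hunk above: `CustomEmbeddings` subclasses LangChain's `HuggingFaceEmbeddings`, which is a pydantic model, so overriding `__init__` and assigning `self.model` directly will most likely fail at runtime; the constructor also expects a saved-model path for `Word2Vec.load()`, while the call site further down passes the in-memory model returned by `create_custom_embedding()`; and `embed_documents()` indexes `model.wv` with a token list, which yields a matrix of per-token vectors rather than one vector per text. The new code also needs `from gensim.models import Word2Vec, FastText` at the top of app.py. Below is a minimal standalone sketch of what the wrapper could look like, assuming it receives the trained gensim model object directly and that mean pooling over token vectors is an acceptable choice; the class and helper names are illustrative, not part of the commit.

import numpy as np
from gensim.models import Word2Vec, FastText  # also needed by create_custom_embedding()

class GensimAverageEmbeddings:
    # Hypothetical stand-in for CustomEmbeddings: wraps an already trained gensim model
    # and exposes the embed_documents/embed_query methods that LangChain retrievers call.

    def __init__(self, model):
        # `model` is the Word2Vec or FastText instance returned by create_custom_embedding()
        self.model = model

    def _embed(self, text):
        # Average the vectors of in-vocabulary tokens; fall back to a zero vector.
        tokens = [t for t in text.split() if t in self.model.wv]
        if not tokens:
            return [0.0] * self.model.vector_size
        return np.mean(self.model.wv[tokens], axis=0).tolist()

    def embed_documents(self, texts):
        # One fixed-size vector per input text.
        return [self._embed(t) for t in texts]

    def embed_query(self, text):
        return self._embed(text)

Mean pooling is only the simplest option; weighting tokens by TF-IDF, or using gensim's own mean-vector helpers where available, would be reasonable refinements.
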
@@ -396,7 +420,7 @@ def compare_embeddings(file, query, model_types, model_names, split_strategy, ch
 
     # Custom embedding handling
     if use_custom_embedding:
-        custom_model = create_custom_embedding(chunks)
+        custom_model = create_custom_embedding(chunks)  # add custom model by name, must come from gradio FE
         embedding_model = CustomEmbeddings(custom_model)
 
     # Optimizing vocabulary if required
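As committed, this hunk hands the freshly trained model object to `CustomEmbeddings`, whose `__init__` in the first hunk expects a file path for `Word2Vec.load()`, so the two halves do not fit together yet; the inline comment also suggests the model type should come from the Gradio frontend rather than being hard-coded. A small usage sketch of how the pieces could line up, reusing `create_custom_embedding()` from the first hunk and the hypothetical `GensimAverageEmbeddings` wrapper sketched above (variable names are illustrative):

# Assumes `chunks` is the list of text chunks produced by the app's splitter and
# `custom_model_type` is the value chosen in the Gradio frontend.
chunks = ["a tiny example chunk", "another example chunk"]
custom_model_type = 'word2vec'

custom_model = create_custom_embedding(chunks, model_type=custom_model_type)
embedding_model = GensimAverageEmbeddings(custom_model)  # pass the model object, not a file path

doc_vectors = embedding_model.embed_documents(chunks)        # one vector per chunk (100-dim by default)
query_vector = embedding_model.embed_query("example chunk")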