Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -343,6 +343,23 @@ def visualize_results(results_df, stats_df):
|
|
| 343 |
plt.tight_layout()
|
| 344 |
return fig
|
| 345 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 346 |
# Main Comparison Function
|
| 347 |
def compare_embeddings(file, query, model_types, model_names, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', use_custom_embedding=False, optimize_vocab=False, phonetic_weight=0.3, custom_tokenizer_file=None):
|
| 348 |
all_results = []
|
|
|
|
| 343 |
plt.tight_layout()
|
| 344 |
return fig
|
| 345 |
|
| 346 |
+
def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
    """Drop rare words from *texts* and train a BPE tokenizer on the result.

    Words are defined by ``str.split()`` (runs of non-whitespace). A word is
    kept only if it occurs at least ``min_frequency`` times across all of
    ``texts`` combined.

    Args:
        texts: Iterable of strings. It is consumed exactly once, so
            generators and other single-pass iterables are supported.
        vocab_size: Target vocabulary size handed to ``BpeTrainer``.
        min_frequency: Minimum corpus-wide occurrence count for a word to
            survive the filtering step.

    Returns:
        A ``(tokenizer, optimized_texts)`` tuple: the trained ``Tokenizer``
        and the list of filtered texts (one entry per input text, words
        re-joined with single spaces; a text whose words are all rare
        becomes an empty string).
    """
    # Local import keeps this block self-contained; the module already
    # depends on the `tokenizers` package for Tokenizer/BPE/BpeTrainer.
    from tokenizers.pre_tokenizers import WhitespaceSplit

    # Split every text once and keep the result: this avoids re-splitting in
    # the filter pass and makes the function safe for one-shot iterables.
    split_texts = [text.split() for text in texts]

    # Count word frequencies across the whole corpus.
    word_freq = Counter(word for words in split_texts for word in words)

    # Remove rare words.
    optimized_texts = [
        ' '.join(word for word in words if word_freq[word] >= min_frequency)
        for words in split_texts
    ]

    # Train BPE tokenizer.
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    # Without a pre-tokenizer each whole text is treated as a single "word"
    # and learned merges can span spaces. Split on whitespace only, which is
    # the same word definition used for the frequency filter above.
    tokenizer.pre_tokenizer = WhitespaceSplit()
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    )
    tokenizer.train_from_iterator(optimized_texts, trainer)

    return tokenizer, optimized_texts
|
| 362 |
+
|
| 363 |
# Main Comparison Function
|
| 364 |
def compare_embeddings(file, query, model_types, model_names, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k, lang='german', use_custom_embedding=False, optimize_vocab=False, phonetic_weight=0.3, custom_tokenizer_file=None):
|
| 365 |
all_results = []
|