Runtime error
Update app.py
app.py
CHANGED
@@ -20,7 +20,7 @@ import jellyfish
 from gensim.models import Word2Vec
 from gensim.models.fasttext import FastText
 from collections import Counter
-from tokenizers import Tokenizer
+from tokenizers import Tokenizer, models
 from tokenizers.models import WordLevel
 from tokenizers.trainers import WordLevelTrainer
 from tokenizers.pre_tokenizers import Whitespace
@@ -344,6 +344,8 @@ def visualize_results(results_df, stats_df):
     return fig
 
 def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
+    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
+
     # Count word frequencies
     word_freq = Counter(word for text in texts for word in text.split())
 
@@ -354,7 +356,7 @@ def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
     ]
 
     # Train BPE tokenizer
-    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
+    # tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
     trainer = BpeTrainer(vocab_size=vocab_size, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
     tokenizer.train_from_iterator(optimized_texts, trainer)
 
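For reference, a minimal, self-contained sketch of optimize_vocabulary as it stands after this commit. This is a reconstruction, not the Space's exact code: the diff only shows the closing bracket of the filtering comprehension, so the body of optimized_texts is an assumed min_frequency filter, the return value is assumed, and BpeTrainer is assumed to come from tokenizers.trainers (the hunks shown never import it).

# Hypothetical reconstruction of the patched function; the frequency filter,
# the return statement, and the BpeTrainer import are assumptions, not
# content taken from the diff.
from collections import Counter

from tokenizers import Tokenizer, models
from tokenizers.trainers import BpeTrainer  # assumed import; this commit does not add it

def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
    # The commit builds the BPE model at the top of the function via
    # models.BPE, so the bare (never-imported) BPE name is no longer used.
    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

    # Count word frequencies
    word_freq = Counter(word for text in texts for word in text.split())

    # Assumed filtering step: drop words rarer than min_frequency
    # (the diff shows only the closing "]" of this comprehension).
    optimized_texts = [
        " ".join(w for w in text.split() if word_freq[w] >= min_frequency)
        for text in texts
    ]

    # Train BPE tokenizer
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    )
    tokenizer.train_from_iterator(optimized_texts, trainer)
    return tokenizer  # assumed return value

Note that unless BpeTrainer is imported in a part of app.py outside these hunks, the committed code would still raise NameError: name 'BpeTrainer' is not defined when the function runs, which would leave the Space in the Runtime error state this commit appears to be addressing.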