Added a function to count the occurrences of lemmas per time slice
Browse files- app.py +6 -1
- word2vec.py +17 -1
app.py
CHANGED
|
@@ -30,16 +30,21 @@ def load_models_for_word_dict():
|
|
| 30 |
def load_all_lemmas():
    """Load the complete lemma vocabulary from its compressed pickle file.

    Returns:
        The word list stored in ``all_lemmas.pkl.gz``, as produced by
        ``load_compressed_word_list``.
    """
    lemmas = load_compressed_word_list('all_lemmas.pkl.gz')
    return lemmas
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
# Load compressed word list
|
| 34 |
all_models_words = load_all_models_words()
|
| 35 |
|
| 36 |
-
|
| 37 |
# Prepare lsj dictionary
|
| 38 |
lemma_dict = load_lsj_dict()
|
| 39 |
|
| 40 |
# Load dictionary with words as keys and eligible models as values
|
| 41 |
models_for_word_dict = load_models_for_word_dict()
|
| 42 |
|
|
|
|
|
|
|
| 43 |
|
| 44 |
# Set styles for menu
|
| 45 |
styles = {
|
|
|
|
| 30 |
def load_all_lemmas():
|
| 31 |
return load_compressed_word_list('all_lemmas.pkl.gz')
|
| 32 |
|
| 33 |
+
@st.cache_data
def load_lemma_count_dict(directory='lemma_list_raw'):
    """Count lemma occurrences per time-slice corpus, cached by Streamlit.

    Args:
        directory: Folder containing one ``.txt`` corpus file per time
            slice. Defaults to ``'lemma_list_raw'``, preserving the
            original call signature for existing callers.

    Returns:
        Dict mapping each corpus filename to a Counter of lemma
        frequencies, as produced by ``count_lemmas``.
    """
    # st.cache_data keys the cache on the argument, so distinct
    # directories are cached independently.
    return count_lemmas(directory)
|
| 36 |
+
|
| 37 |
# Load compressed word list
|
| 38 |
all_models_words = load_all_models_words()
|
| 39 |
|
|
|
|
| 40 |
# Prepare lsj dictionary
|
| 41 |
lemma_dict = load_lsj_dict()
|
| 42 |
|
| 43 |
# Load dictionary with words as keys and eligible models as values
|
| 44 |
models_for_word_dict = load_models_for_word_dict()
|
| 45 |
|
| 46 |
+
lemma_counts = load_lemma_count_dict()
|
| 47 |
+
|
| 48 |
|
| 49 |
# Set styles for menu
|
| 50 |
styles = {
|
word2vec.py
CHANGED
|
@@ -8,6 +8,7 @@ import xlsxwriter
|
|
| 8 |
from sklearn.preprocessing import StandardScaler
|
| 9 |
from sklearn.manifold import TSNE
|
| 10 |
import plotly.express as px
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
|
|
@@ -457,6 +458,21 @@ def print_3d_model(model_name):
|
|
| 457 |
print(f'{word}: {vector}')
|
| 458 |
|
| 459 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 460 |
|
| 461 |
def main():
|
| 462 |
# model = load_word2vec_model('models/archaic_cbow.model')
|
|
@@ -481,7 +497,7 @@ def main():
|
|
| 481 |
# Iterate over all words and print their vectors
|
| 482 |
# iterate_over_words(model)
|
| 483 |
|
| 484 |
-
|
| 485 |
|
| 486 |
|
| 487 |
if __name__ == "__main__":
|
|
|
|
| 8 |
from sklearn.preprocessing import StandardScaler
|
| 9 |
from sklearn.manifold import TSNE
|
| 10 |
import plotly.express as px
|
| 11 |
+
from collections import Counter
|
| 12 |
|
| 13 |
|
| 14 |
|
|
|
|
| 458 |
print(f'{word}: {vector}')
|
| 459 |
|
| 460 |
|
| 461 |
+
def count_lemmas(directory):
    """
    Build a per-file Counter of lemma occurrences for every ``.txt``
    corpus in *directory*.

    Args:
        directory: Path to a folder whose ``.txt`` files each hold one
            time slice's lemmas as whitespace-separated tokens.

    Returns:
        Dict mapping each ``.txt`` filename to a ``collections.Counter``
        of its whitespace-split tokens.
    """
    counts_by_file = {}
    for filename in os.listdir(directory):
        # Only the .txt corpora are counted; anything else is skipped.
        if not filename.endswith(".txt"):
            continue
        filepath = os.path.join(directory, filename)
        with open(filepath, 'r', encoding='utf-8') as handle:
            counts_by_file[filename] = Counter(handle.read().split())
    return counts_by_file
|
| 474 |
+
|
| 475 |
+
|
| 476 |
|
| 477 |
def main():
|
| 478 |
# model = load_word2vec_model('models/archaic_cbow.model')
|
|
|
|
| 497 |
# Iterate over all words and print their vectors
|
| 498 |
# iterate_over_words(model)
|
| 499 |
|
| 500 |
+
count_lemmas('lemma_list_raw')
|
| 501 |
|
| 502 |
|
| 503 |
if __name__ == "__main__":
|