Added a function to count the occurrences of lemmas per time slice
Browse files- app.py +6 -1
- word2vec.py +17 -1
app.py
CHANGED
|
@@ -30,16 +30,21 @@ def load_models_for_word_dict():
|
|
| 30 |
def load_all_lemmas():
    """Load the complete lemma vocabulary from its compressed pickle file.

    Returns:
        The word list stored in ``all_lemmas.pkl.gz``, as produced by
        ``load_compressed_word_list``.
    """
    lemmas = load_compressed_word_list('all_lemmas.pkl.gz')
    return lemmas
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
# Load compressed word list
|
| 34 |
all_models_words = load_all_models_words()
|
| 35 |
|
| 36 |
-
|
| 37 |
# Prepare lsj dictionary
|
| 38 |
lemma_dict = load_lsj_dict()
|
| 39 |
|
| 40 |
# Load dictionary with words as keys and eligible models as values
|
| 41 |
models_for_word_dict = load_models_for_word_dict()
|
| 42 |
|
|
|
|
|
|
|
| 43 |
|
| 44 |
# Set styles for menu
|
| 45 |
styles = {
|
|
|
|
| 30 |
def load_all_lemmas():
|
| 31 |
return load_compressed_word_list('all_lemmas.pkl.gz')
|
| 32 |
|
| 33 |
+
@st.cache_data
def load_lemma_count_dict(directory='lemma_list_raw'):
    """Count lemma occurrences per time-slice corpus, cached by Streamlit.

    Args:
        directory: Folder containing one ``.txt`` corpus file per time
            slice. Defaults to ``'lemma_list_raw'``, preserving the
            original call signature for existing callers.

    Returns:
        Dict mapping each corpus filename to a Counter of lemma
        frequencies, as produced by ``count_lemmas``.
    """
    # st.cache_data keys the cache on the argument, so distinct
    # directories are cached independently.
    return count_lemmas(directory)
|
| 36 |
+
|
| 37 |
# Load compressed word list
|
| 38 |
all_models_words = load_all_models_words()
|
| 39 |
|
|
|
|
| 40 |
# Prepare lsj dictionary
|
| 41 |
lemma_dict = load_lsj_dict()
|
| 42 |
|
| 43 |
# Load dictionary with words as keys and eligible models as values
|
| 44 |
models_for_word_dict = load_models_for_word_dict()
|
| 45 |
|
| 46 |
+
lemma_counts = load_lemma_count_dict()
|
| 47 |
+
|
| 48 |
|
| 49 |
# Set styles for menu
|
| 50 |
styles = {
|
word2vec.py
CHANGED
|
@@ -8,6 +8,7 @@ import xlsxwriter
|
|
| 8 |
from sklearn.preprocessing import StandardScaler
|
| 9 |
from sklearn.manifold import TSNE
|
| 10 |
import plotly.express as px
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
|
|
@@ -457,6 +458,21 @@ def print_3d_model(model_name):
|
|
| 457 |
print(f'{word}: {vector}')
|
| 458 |
|
| 459 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 460 |
|
| 461 |
def main():
|
| 462 |
# model = load_word2vec_model('models/archaic_cbow.model')
|
|
@@ -481,7 +497,7 @@ def main():
|
|
| 481 |
# Iterate over all words and print their vectors
|
| 482 |
# iterate_over_words(model)
|
| 483 |
|
| 484 |
-
|
| 485 |
|
| 486 |
|
| 487 |
if __name__ == "__main__":
|
|
|
|
| 8 |
from sklearn.preprocessing import StandardScaler
|
| 9 |
from sklearn.manifold import TSNE
|
| 10 |
import plotly.express as px
|
| 11 |
+
from collections import Counter
|
| 12 |
|
| 13 |
|
| 14 |
|
|
|
|
| 458 |
print(f'{word}: {vector}')
|
| 459 |
|
| 460 |
|
| 461 |
+
def count_lemmas(directory):
    """
    Build a per-file Counter of lemma occurrences for every ``.txt``
    corpus in *directory*.

    Args:
        directory: Path to a folder whose ``.txt`` files each hold one
            time slice's lemmas as whitespace-separated tokens.

    Returns:
        Dict mapping each ``.txt`` filename to a ``collections.Counter``
        of its whitespace-split tokens.
    """
    counts_by_file = {}
    for filename in os.listdir(directory):
        # Only the .txt corpora are counted; anything else is skipped.
        if not filename.endswith(".txt"):
            continue
        filepath = os.path.join(directory, filename)
        with open(filepath, 'r', encoding='utf-8') as handle:
            counts_by_file[filename] = Counter(handle.read().split())
    return counts_by_file
|
| 474 |
+
|
| 475 |
+
|
| 476 |
|
| 477 |
def main():
|
| 478 |
# model = load_word2vec_model('models/archaic_cbow.model')
|
|
|
|
| 497 |
# Iterate over all words and print their vectors
|
| 498 |
# iterate_over_words(model)
|
| 499 |
|
| 500 |
+
count_lemmas('lemma_list_raw')
|
| 501 |
|
| 502 |
|
| 503 |
if __name__ == "__main__":
|