Spaces:

GroNLP
/

agalma

Running

App Files Files Community

Mark7549 committed on May 13, 2024

Commit

cdb0a70

1 Parent(s): ee625fc

removed forms for first 2 tabs and used cache to make program faster

Browse files

Files changed (4) hide show

app.py +20 -12
autocomplete.py +58 -1
corpora/compass_filtered_v2.pkl.gz +3 -0
word2vec.py +5 -5

app.py CHANGED Viewed

@@ -21,6 +21,11 @@ def load_lsj_dict():
 def load_all_models_words():
     return sorted(load_compressed_word_list('corpora/compass_filtered.pkl.gz'), key=custom_sort)
 # Load compressed word list
 all_models_words = load_all_models_words()
@@ -28,6 +33,9 @@ all_models_words = load_all_models_words()
 # Prepare lsj dictionary
 lemma_dict = load_lsj_dict()
 # Horizontal menu
 active_tab = option_menu(None, ["Nearest neighbours", "Cosine similarity", "3D graph", 'Dictionary'],
@@ -41,13 +49,13 @@ if active_tab == "Nearest neighbours":
     eligible_models = ["Archaic", "Classical", "Hellenistic", "Early Roman", "Late Roman"]
     all_models_words = load_all_models_words()
-    with st.form("nn_form"):
         st.markdown("## Nearest Neighbours")
         target_word = st.multiselect("Enter a word", options=all_models_words, max_selections=1)
         if len(target_word) > 0:
             target_word = target_word[0]
-            eligible_models = check_word_in_models(target_word)
         models = st.multiselect(
             "Select models to search for neighbours",
@@ -55,8 +63,8 @@ if active_tab == "Nearest neighbours":
             )
         n = st.slider("Number of neighbours", 1, 50, 15)
-        nearest_neighbours_button = st.form_submit_button("Find nearest neighbours")
     if nearest_neighbours_button:
         if validate_nearest_neighbours(target_word, n, models) == False:
             st.error('Please fill in all fields')
@@ -98,11 +106,11 @@ if active_tab == "Nearest neighbours":
 # Cosine similarity tab
 elif active_tab == "Cosine similarity":
-    eligible_models_1 = []
-    eligible_models_2 = []
     all_models_words = load_all_models_words()
-    with st.form("cosine_similarity_form"):
         st.markdown("## Cosine similarity")
         col1, col2 = st.columns(2)
         col3, col4 = st.columns(2)
@@ -110,24 +118,24 @@ elif active_tab == "Cosine similarity":
             word_1 = st.multiselect("Enter a word", placeholder="πατήρ", max_selections=1, options=all_models_words)
             if len(word_1) > 0:
                 word_1 = word_1[0]
-                eligible_models_1 = check_word_in_models(word_1)
-            time_slice_1 = st.selectbox("Time slice word 1", eligible_models_1)
         with st.container():
             with col3:
                 word_2 = st.multiselect("Enter a word", placeholder="μήτηρ", max_selections=1, options=all_models_words)
                 if len(word_2) > 0:
                     word_2 = word_2[0]
-                    eligible_models_2 = check_word_in_models(word_2)
             with col4:
                 time_slice_2 = st.selectbox("Time slice word 2", eligible_models_2)
         # Create button for calculating cosine similarity
-        cosine_similarity_button = st.form_submit_button("Calculate cosine similarity")
     # If the button is clicked, execute calculation
     if cosine_similarity_button:

 def load_all_models_words():
     return sorted(load_compressed_word_list('corpora/compass_filtered.pkl.gz'), key=custom_sort)
+@st.cache_data
+def load_models_for_word_dict():
+    return word_in_models_dict('corpora/compass_filtered.pkl.gz')
 # Load compressed word list
 all_models_words = load_all_models_words()
 # Prepare lsj dictionary
 lemma_dict = load_lsj_dict()
+# Load dictionary with words as keys and eligible models as values
+models_for_word_dict = load_models_for_word_dict()
 # Horizontal menu
 active_tab = option_menu(None, ["Nearest neighbours", "Cosine similarity", "3D graph", 'Dictionary'],
     eligible_models = ["Archaic", "Classical", "Hellenistic", "Early Roman", "Late Roman"]
     all_models_words = load_all_models_words()
+    with st.container():
         st.markdown("## Nearest Neighbours")
         target_word = st.multiselect("Enter a word", options=all_models_words, max_selections=1)
         if len(target_word) > 0:
             target_word = target_word[0]
+            eligible_models = models_for_word_dict[target_word]
         models = st.multiselect(
             "Select models to search for neighbours",
             )
         n = st.slider("Number of neighbours", 1, 50, 15)
+        nearest_neighbours_button = st.button("Find nearest neighbours")
     if nearest_neighbours_button:
         if validate_nearest_neighbours(target_word, n, models) == False:
             st.error('Please fill in all fields')
 # Cosine similarity tab
 elif active_tab == "Cosine similarity":
     all_models_words = load_all_models_words()
+    with st.container():
+        eligible_models_1 = []
+        eligible_models_2 = []
         st.markdown("## Cosine similarity")
         col1, col2 = st.columns(2)
         col3, col4 = st.columns(2)
             word_1 = st.multiselect("Enter a word", placeholder="πατήρ", max_selections=1, options=all_models_words)
             if len(word_1) > 0:
                 word_1 = word_1[0]
+                eligible_models_1 = models_for_word_dict[word_1]
+        with col2:
+            time_slice_1 = st.selectbox("Time slice word 1", options = eligible_models_1)
         with st.container():
             with col3:
                 word_2 = st.multiselect("Enter a word", placeholder="μήτηρ", max_selections=1, options=all_models_words)
                 if len(word_2) > 0:
                     word_2 = word_2[0]
+                    eligible_models_2 = models_for_word_dict[word_2]
             with col4:
                 time_slice_2 = st.selectbox("Time slice word 2", eligible_models_2)
         # Create button for calculating cosine similarity
+        cosine_similarity_button = st.button("Calculate cosine similarity")
     # If the button is clicked, execute calculation
     if cosine_similarity_button:

autocomplete.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import pickle
 import gzip
 def get_unique_words(corpus_filename):
@@ -34,4 +35,60 @@ def get_autocomplete(input_word=" ", all_words=" "):
     """
     Get a list of words that start with the input word
     """
-    return [word for word in all_words if word.startswith(input_word)]

 import pickle
 import gzip
+from word2vec import *
 def get_unique_words(corpus_filename):
     """
     Get a list of words that start with the input word
     """
+    return [word for word in all_words if word.startswith(input_word)]
def custom_sort(item):
    """
    Sort key that orders words alphabetically and places numbers last.

    Parameters:
        item (str): The word to build a sort key for.

    Returns:
        tuple: ``(2, item)`` when ``item`` consists only of digits, otherwise
        ``(0, item.lower())`` so that words compare case-insensitively and
        sort before every all-digit entry.
    """
    # A sort key is evaluated once per element, so it must stay free of side
    # effects: the former debug print wrote every numeric entry to stdout on
    # each sort.
    if item.isdigit():
        return (2, item)  # Place numbers last
    return (0, item.lower())
def order_compressed_list(filename):
    """
    Order the compressed list of words alphabetically and put numbers at the end

    Parameters:
        filename (str): Path to a gzip-compressed pickle file holding the words
            (presumably a list of strings -- verify against the corpus files).

    Returns:
        list: The words sorted with ``custom_sort`` as the sort key.
    """
    # NOTE: pickle can execute arbitrary code while loading; only open files
    # from a trusted source.
    with gzip.open(filename, 'rb') as file:
        words = pickle.load(file)

    # Sort the words
    return sorted(words, key=custom_sort)
def read_compressed_list(filename):
    """
    Read the compressed list of words

    The loaded object is printed to stdout and also returned, so the function
    can be used both for inspection and programmatically.

    Parameters:
        filename (str): Path to a gzip-compressed pickle file.

    Returns:
        The object stored in the pickle file.
    """
    # NOTE: pickle can execute arbitrary code while loading; only open files
    # from a trusted source.
    with gzip.open(filename, 'rb') as file:
        words = pickle.load(file)

    print(words)
    return words
def word_in_models_dict(words_file):
    """
    Create a dictionary with words as keys and models in which the word occurs as values

    Parameters:
        words_file (str): Path to a gzip-compressed pickle file holding the words.

    Returns:
        dict: Maps every word from the file to a list of the names returned by
        ``convert_model_to_time_name`` for each model whose vocabulary
        contains the word. Words found in no model map to an empty list.
    """
    # NOTE: pickle can execute arbitrary code while loading; only open files
    # from a trusted source.
    with gzip.open(words_file, 'rb') as file:
        words = pickle.load(file)

    models = load_all_models()
    word_models = {word: [] for word in words}  # Initialize word_models dictionary with empty lists

    # presumably each entry is a (model name, trained model) pair -- verify
    # against load_all_models
    for model in models:
        model_name = convert_model_to_time_name(model[0])
        vocabulary = model[1].wv.key_to_index  # looked up once per model

        # Walk the dictionary instead of the raw word list so that a word
        # repeated in the file receives each model name only once.
        for word, found_in in word_models.items():
            if word in vocabulary:
                found_in.append(model_name)

    return word_models

corpora/compass_filtered_v2.pkl.gz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:32818a420a9458c7e8be4919f78a2623ffca704cd93340b05b4825f209c01b61
+size 127623

word2vec.py CHANGED Viewed

@@ -161,15 +161,15 @@ def convert_model_to_time_name(model_name):
     '''
         Convert the model name to the time slice name
     '''
-    if model_name == 'archaic_cbow':
         return 'Archaic'
-    elif model_name == 'classical_cbow':
         return 'Classical'
-    elif model_name == 'early_roman_cbow':
         return 'Early Roman'
-    elif model_name == 'hellen_cbow':
         return 'Hellenistic'
-    elif model_name == 'late_roman_cbow':
         return 'Late Roman'

     '''
         Convert the model name to the time slice name
     '''
+    if model_name == 'archaic_cbow' or model_name == 'archaic':
         return 'Archaic'
+    elif model_name == 'classical_cbow' or model_name == 'classical':
         return 'Classical'
+    elif model_name == 'early_roman_cbow' or model_name == 'early_roman':
         return 'Early Roman'
+    elif model_name == 'hellen_cbow' or model_name == 'hellen':
         return 'Hellenistic'
+    elif model_name == 'late_roman_cbow' or model_name == 'late_roman':
         return 'Late Roman'