Spaces:

huggingface
/

data-measurements-tool

Build error

App Files Community

meg-huggingface committed on Dec 5, 2021

Commit

a2ae370

1 Parent(s): 335424f

More modularizing; npmi and labels

Browse files

Files changed (3) hide show

app.py +5 -12
data_measurements/dataset_statistics.py +20 -20
data_measurements/streamlit_utils.py +4 -5

app.py CHANGED Viewed

@@ -118,9 +118,8 @@ def load_or_prepare(ds_args, show_embeddings, use_cache=False):
     if show_embeddings:
         logs.warning("Loading Embeddings")
         dstats.load_or_prepare_embeddings()
-    # TODO: This has now been moved to calculation when the npmi widget is loaded.
-    logs.warning("Loading Terms for nPMI")
-    dstats.load_or_prepare_npmi_terms()
     logs.warning("Loading Zipf")
     dstats.load_or_prepare_zipf()
     return dstats
@@ -156,6 +155,8 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
         # Embeddings widget
         dstats.load_or_prepare_embeddings()
     dstats.load_or_prepare_text_duplicates()
 def show_column(dstats, ds_name_to_dict, show_embeddings, column_id, use_cache=True):
     """
@@ -179,17 +180,9 @@ def show_column(dstats, ds_name_to_dict, show_embeddings, column_id, use_cache=T
     st_utils.expander_label_distribution(dstats.fig_labels, column_id)
     st_utils.expander_text_lengths(dstats, column_id)
     st_utils.expander_text_duplicates(dstats, column_id)
-    # We do the loading of these after the others in order to have some time
-    # to compute while the user works with the details above.
     # Uses an interaction; handled a bit differently than other widgets.
     logs.info("showing npmi widget")
-    npmi_stats = dataset_statistics.nPMIStatisticsCacheClass(
-        dstats, use_cache=use_cache
-    )
-    available_terms = npmi_stats.get_available_terms()
-    st_utils.npmi_widget(
-        column_id, available_terms, npmi_stats, _MIN_VOCAB_COUNT)
     logs.info("showing zipf")
     st_utils.expander_zipf(dstats.z, dstats.zipf_fig, column_id)
     if show_embeddings:

     if show_embeddings:
         logs.warning("Loading Embeddings")
         dstats.load_or_prepare_embeddings()
+    logs.warning("Loading nPMI")
+    dstats.load_or_prepare_npmi()
     logs.warning("Loading Zipf")
     dstats.load_or_prepare_zipf()
     return dstats
         # Embeddings widget
         dstats.load_or_prepare_embeddings()
     dstats.load_or_prepare_text_duplicates()
+    dstats.load_or_prepare_npmi()
+    dstats.load_or_prepare_zipf()
 def show_column(dstats, ds_name_to_dict, show_embeddings, column_id, use_cache=True):
     """
     st_utils.expander_label_distribution(dstats.fig_labels, column_id)
     st_utils.expander_text_lengths(dstats, column_id)
     st_utils.expander_text_duplicates(dstats, column_id)
     # Uses an interaction; handled a bit differently than other widgets.
     logs.info("showing npmi widget")
+    st_utils.npmi_widget(dstats.npmi_stats, _MIN_VOCAB_COUNT, column_id)
     logs.info("showing zipf")
     st_utils.expander_zipf(dstats.z, dstats.zipf_fig, column_id)
     if show_embeddings:

data_measurements/dataset_statistics.py CHANGED Viewed

@@ -231,10 +231,6 @@ class DatasetStatisticsCacheClass:
         # nPMI
         # Holds a nPMIStatisticsCacheClass object
         self.npmi_stats = None
-        # TODO: Users ideally can type in whatever words they want.
-        self.termlist = _IDENTITY_TERMS
-        # termlist terms that are available more than _MIN_VOCAB_COUNT times
-        self.available_terms = _IDENTITY_TERMS
         # TODO: Have lowercase be an option for a user to set.
         self.to_lowercase = True
         # The minimum amount of times a word should occur to be included in
@@ -627,24 +623,27 @@ class DatasetStatisticsCacheClass:
                 if save:
                     write_plotly(self.fig_labels, self.fig_labels_fid)
             else:
-                self.get_base_dataset()
-                self.label_dset = self.dset.map(
-                    lambda examples: extract_field(
-                        examples, self.label_field, OUR_LABEL_FIELD
-                    ),
-                    batched=True,
-                    remove_columns=list(self.dset.features),
-                )
-                self.label_df = self.label_dset.to_pandas()
-                self.fig_labels = make_fig_labels(
-                    self.label_df, self.label_names, OUR_LABEL_FIELD
-                )
                 if save:
                     # save extracted label instances
                     self.label_dset.save_to_disk(self.label_dset_fid)
                     write_plotly(self.fig_labels, self.fig_labels_fid)
-    def load_or_prepare_npmi_terms(self):
         self.npmi_stats = nPMIStatisticsCacheClass(self, use_cache=self.use_cache)
         self.npmi_stats.load_or_prepare_npmi_terms()
@@ -693,7 +692,10 @@ class nPMIStatisticsCacheClass:
             # We need to preprocess everything.
             mkdir(self.pmi_cache_path)
         self.joint_npmi_df_dict = {}
-        self.termlist = self.dstats.termlist
         logs.info(self.termlist)
         self.use_cache = use_cache
         # TODO: Let users specify
@@ -701,8 +703,6 @@ class nPMIStatisticsCacheClass:
         self.min_vocab_count = self.dstats.min_vocab_count
         self.subgroup_files = {}
         self.npmi_terms_fid = pjoin(self.dstats.cache_path, "npmi_terms.json")
-        self.available_terms = self.dstats.available_terms
-        logs.info(self.available_terms)
     def load_or_prepare_npmi_terms(self):
         """

         # nPMI
         # Holds a nPMIStatisticsCacheClass object
         self.npmi_stats = None
         # TODO: Have lowercase be an option for a user to set.
         self.to_lowercase = True
         # The minimum amount of times a word should occur to be included in
                 if save:
                     write_plotly(self.fig_labels, self.fig_labels_fid)
             else:
+                self.prepare_labels()
                 if save:
                     # save extracted label instances
                     self.label_dset.save_to_disk(self.label_dset_fid)
                     write_plotly(self.fig_labels, self.fig_labels_fid)
+    def prepare_labels(self):
+        self.get_base_dataset()
+        self.label_dset = self.dset.map(
+            lambda examples: extract_field(
+                examples, self.label_field, OUR_LABEL_FIELD
+            ),
+            batched=True,
+            remove_columns=list(self.dset.features),
+        )
+        self.label_df = self.label_dset.to_pandas()
+        self.fig_labels = make_fig_labels(
+            self.label_df, self.label_names, OUR_LABEL_FIELD
+        )
+    def load_or_prepare_npmi(self):
         self.npmi_stats = nPMIStatisticsCacheClass(self, use_cache=self.use_cache)
         self.npmi_stats.load_or_prepare_npmi_terms()
             # We need to preprocess everything.
             mkdir(self.pmi_cache_path)
         self.joint_npmi_df_dict = {}
+        # TODO: Users ideally can type in whatever words they want.
+        self.termlist = _IDENTITY_TERMS
+        # termlist terms that are available more than _MIN_VOCAB_COUNT times
+        self.available_terms = _IDENTITY_TERMS
         logs.info(self.termlist)
         self.use_cache = use_cache
         # TODO: Let users specify
         self.min_vocab_count = self.dstats.min_vocab_count
         self.subgroup_files = {}
         self.npmi_terms_fid = pjoin(self.dstats.cache_path, "npmi_terms.json")
     def load_or_prepare_npmi_terms(self):
         """

data_measurements/streamlit_utils.py CHANGED Viewed

@@ -273,7 +273,6 @@ def expander_text_duplicates(dstats, column_id):
         st.write(
             "### Here is the list of all the duplicated items and their counts in your dataset:"
         )
-        # Eh...adding 1 because otherwise it looks too weird for duplicate counts when the value is just 1.
         if dstats.dup_counts_df is None:
             st.write("There are no duplicates in this dataset! 🥳")
         else:
@@ -393,7 +392,7 @@ with an ideal α value of 1."""
 ### Finally finally finally, show nPMI stuff.
-def npmi_widget(column_id, available_terms, npmi_stats, min_vocab):
     """
     Part of the main app, but uses a user interaction so pulled out as its own f'n.
     :param use_cache:
@@ -403,16 +402,16 @@ def npmi_widget(column_id, available_terms, npmi_stats, min_vocab):
     :return:
     """
     with st.expander(f"Word Association{column_id}: nPMI", expanded=False):
-        if len(available_terms) > 0:
             expander_npmi_description(min_vocab)
             st.markdown("-----")
             term1 = st.selectbox(
                 f"What is the first term you want to select?{column_id}",
-                available_terms,
             )
             term2 = st.selectbox(
                 f"What is the second term you want to select?{column_id}",
-                reversed(available_terms),
             )
             # We calculate/grab nPMI data based on a canonical (alphabetic)
             # subgroup ordering.

         st.write(
             "### Here is the list of all the duplicated items and their counts in your dataset:"
         )
         if dstats.dup_counts_df is None:
             st.write("There are no duplicates in this dataset! 🥳")
         else:
 ### Finally finally finally, show nPMI stuff.
+def npmi_widget(npmi_stats, min_vocab, column_id):
     """
     Part of the main app, but uses a user interaction so pulled out as its own f'n.
     :param use_cache:
     :return:
     """
     with st.expander(f"Word Association{column_id}: nPMI", expanded=False):
+        if len(npmi_stats.available_terms) > 0:
             expander_npmi_description(min_vocab)
             st.markdown("-----")
             term1 = st.selectbox(
                 f"What is the first term you want to select?{column_id}",
+                npmi_stats.available_terms,
             )
             term2 = st.selectbox(
                 f"What is the second term you want to select?{column_id}",
+                reversed(npmi_stats.available_terms),
             )
             # We calculate/grab nPMI data based on a canonical (alphabetic)
             # subgroup ordering.