Spaces:

huggingface
/

data-measurements-tool

Running

App Files Files Community

meg-huggingface committed on Dec 7, 2021

Commit

d508e46

1 Parent(s): dffdb92

More doc stringing and printing stuff

Browse files

Files changed (1) hide show

run_data_measurements.py +14 -5

run_data_measurements.py CHANGED Viewed

@@ -11,8 +11,10 @@ from data_measurements import dataset_utils
 def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
     """
-    Loader specifically for the widgets used in the app.
-    Does not take specifications from user.
     Args:
         ds_args: Dataset configuration settings (config name, split, etc)
         show_embeddings: Whether to compute embeddings (slow)
@@ -62,6 +64,10 @@ def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
 def load_or_prepare(dataset_args, use_cache=False):
     """
     Users can specify which aspects of the dataset they would like to compute.
     Args:
         dataset_args: Dataset configuration settings (config name, split, etc)
         use_cache: Whether to grab files that have already been computed
@@ -70,7 +76,8 @@ def load_or_prepare(dataset_args, use_cache=False):
         Saves files to disk in cache_dir, if user has not specified another dir.
     """
     all = False
-    dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args, use_cache=use_cache)
     print("Loading dataset.")
     dstats.load_or_prepare_dataset()
     print("Dataset loaded.  Preparing vocab.")
@@ -84,7 +91,8 @@ def load_or_prepare(dataset_args, use_cache=False):
         print("\n* Calculating general statistics.")
         dstats.load_or_prepare_general_stats()
         print("Done!")
-        print("Basic text statistics now available at %s." % dstats.general_stats_json_fid)
         print(
             "Text duplicates now available at %s." % dstats.dup_counts_df_fid
         )
@@ -108,7 +116,8 @@ def load_or_prepare(dataset_args, use_cache=False):
             with open(fig_label_json, "w+") as f:
                 json.dump(dstats.fig_labels.to_json(), f)
             print("Done!")
-            print("Label distribution now available at %s." % dstats.label_dset_fid)
             print("Figure saved to %s." % fig_label_html)
     if all or dataset_args["calculation"] == "npmi":

 def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
     """
+    Loader specifically for the widgets used in the app -- does not compute
+    intermediate files, unless they are not there and are needed for a file
+    used in the UI.
+    Does not take specifications from user; does all widgets.
     Args:
         ds_args: Dataset configuration settings (config name, split, etc)
         show_embeddings: Whether to compute embeddings (slow)
 def load_or_prepare(dataset_args, use_cache=False):
     """
     Users can specify which aspects of the dataset they would like to compute.
+    This additionally computes intermediate files not used in the UI.
+    If the calculation flag is not specified by the user (-w), calculates all
+    except for embeddings, as those are quite time consuming so should be
+    specified separately.
     Args:
         dataset_args: Dataset configuration settings (config name, split, etc)
         use_cache: Whether to grab files that have already been computed
         Saves files to disk in cache_dir, if user has not specified another dir.
     """
     all = False
+    dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args,
+                                                            use_cache=use_cache)
     print("Loading dataset.")
     dstats.load_or_prepare_dataset()
     print("Dataset loaded.  Preparing vocab.")
         print("\n* Calculating general statistics.")
         dstats.load_or_prepare_general_stats()
         print("Done!")
+        print("Basic text statistics now available at %s." %
+              dstats.general_stats_json_fid)
         print(
             "Text duplicates now available at %s." % dstats.dup_counts_df_fid
         )
             with open(fig_label_json, "w+") as f:
                 json.dump(dstats.fig_labels.to_json(), f)
             print("Done!")
+            print("Label distribution now available at %s." %
+                  dstats.label_dset_fid)
             print("Figure saved to %s." % fig_label_html)
     if all or dataset_args["calculation"] == "npmi":