Spaces:

huggingface
/

data-measurements-tool

Build error

App Files Files Community

meg-huggingface committed on Dec 5, 2021

Commit

0803ab3

1 Parent(s): a2ae370

Standardizing filenaming a bit.

Browse files

Files changed (2) hide show

data_measurements/dataset_statistics.py +56 -33
run_data_measurements.py +279 -0

data_measurements/dataset_statistics.py CHANGED Viewed

@@ -244,38 +244,61 @@ class DatasetStatisticsCacheClass:
         # path to the directory used for caching
         if not isinstance(text_field, str):
             text_field = "-".join(text_field)
-        if isinstance(label_field, str):
-            label_field = label_field
-        else:
-            label_field = "-".join(label_field)
         self.cache_path = pjoin(
             self.cache_dir,
-            f"{dset_name}_{dset_config}_{split_name}_{text_field}_{label_field}",
         )
         if not isdir(self.cache_path):
             logs.warning("Creating cache directory %s." % self.cache_path)
             mkdir(self.cache_path)
         self.dset_fid = pjoin(self.cache_path, "base_dset")
-        self.dset_peek_fid = pjoin(self.cache_path, "dset_peek.json")
-        self.text_dset_fid = pjoin(self.cache_path, "text_dset")
         self.tokenized_df_fid = pjoin(self.cache_path, "tokenized_df.feather")
         self.label_dset_fid = pjoin(self.cache_path, "label_dset")
         self.length_df_fid = pjoin(self.cache_path, "length_df.feather")
-        self.length_stats_fid = pjoin(self.cache_path, "length_stats.json")
         self.vocab_counts_df_fid = pjoin(self.cache_path, "vocab_counts.feather")
-        self.general_stats_fid = pjoin(self.cache_path, "general_stats_dict.json")
-        self.dup_counts_df_fid = pjoin(
-            self.cache_path, "dup_counts_df.feather"
-        )
         self.sorted_top_vocab_df_fid = pjoin(self.cache_path,
                                              "sorted_top_vocab.feather")
-        self.fig_tok_length_fid = pjoin(self.cache_path, "fig_tok_length.json")
-        self.fig_labels_fid = pjoin(self.cache_path, "fig_labels.json")
-        self.node_list_fid = pjoin(self.cache_path, "node_list.th")
-        self.fig_tree_fid = pjoin(self.cache_path, "fig_tree.json")
         self.zipf_fid = pjoin(self.cache_path, "zipf_basic_stats.json")
         self.zipf_fig_fid = pjoin(self.cache_path, "zipf_fig.json")
     def get_base_dataset(self):
         """Gets a pointer to the truncated base dataset object."""
         if not self.dset:
@@ -301,7 +324,7 @@ class DatasetStatisticsCacheClass:
         # General statistics
         if (
             self.use_cache
-            and exists(self.general_stats_fid)
             and exists(self.dup_counts_df_fid)
             and exists(self.sorted_top_vocab_df_fid)
         ):
@@ -313,7 +336,7 @@ class DatasetStatisticsCacheClass:
             if save:
                 write_df(self.sorted_top_vocab_df, self.sorted_top_vocab_df_fid)
                 write_df(self.dup_counts_df, self.dup_counts_df_fid)
-                write_json(self.general_stats_dict, self.general_stats_fid)
     def load_or_prepare_text_lengths(self, save=True):
@@ -343,8 +366,8 @@ class DatasetStatisticsCacheClass:
                 write_df(self.length_df, self.length_df_fid)
         # Text length stats.
-        if self.use_cache and exists(self.length_stats_fid):
-            with open(self.length_stats_fid, "r") as f:
                 self.length_stats_dict = json.load(f)
             self.avg_length = self.length_stats_dict["avg length"]
             self.std_length = self.length_stats_dict["std length"]
@@ -352,7 +375,7 @@ class DatasetStatisticsCacheClass:
         else:
             self.prepare_text_length_stats()
             if save:
-                write_json(self.length_stats_dict, self.length_stats_fid)
     def prepare_length_df(self):
         if self.tokenized_df is None:
@@ -382,15 +405,15 @@ class DatasetStatisticsCacheClass:
         self.fig_tok_length = make_fig_lengths(self.tokenized_df, LENGTH_FIELD)
     def load_or_prepare_embeddings(self, save=True):
-        if self.use_cache and exists(self.node_list_fid) and exists(self.fig_tree_fid):
             self.node_list = torch.load(self.node_list_fid)
-            self.fig_tree = read_plotly(self.fig_tree_fid)
         elif self.use_cache and exists(self.node_list_fid):
             self.node_list = torch.load(self.node_list_fid)
             self.fig_tree = make_tree_plot(self.node_list,
                                            self.text_dset)
             if save:
-                write_plotly(self.fig_tree, self.fig_tree_fid)
         else:
             self.embeddings = Embeddings(self, use_cache=self.use_cache)
             self.embeddings.make_hierarchical_clustering()
@@ -399,7 +422,7 @@ class DatasetStatisticsCacheClass:
                                            self.text_dset)
             if save:
                 torch.save(self.node_list, self.node_list_fid)
-                write_plotly(self.fig_tree, self.fig_tree_fid)
     # get vocab with word counts
     def load_or_prepare_vocab(self, save=True):
@@ -457,7 +480,7 @@ class DatasetStatisticsCacheClass:
                 write_df(self.dup_counts_df, self.dup_counts_df_fid)
     def load_general_stats(self):
-        self.general_stats_dict = json.load(open(self.general_stats_fid, encoding="utf-8"))
         with open(self.sorted_top_vocab_df_fid, "rb") as f:
             self.sorted_top_vocab_df = feather.read_feather(f)
         self.text_nan_count = self.general_stats_dict[TEXT_NAN_CNT]
@@ -520,15 +543,15 @@ class DatasetStatisticsCacheClass:
         self.load_or_prepare_dset_peek(save)
     def load_or_prepare_dset_peek(self, save=True):
-        if self.use_cache and exists(self.dset_peek_fid):
-            with open(self.dset_peek_fid, "r") as f:
                 self.dset_peek = json.load(f)["dset peek"]
         else:
             if self.dset is None:
                 self.get_base_dataset()
             self.dset_peek = self.dset[:100]
             if save:
-                write_json({"dset peek": self.dset_peek}, self.dset_peek_fid)
     def load_or_prepare_tokenized_df(self, save=True):
         if (self.use_cache and exists(self.tokenized_df_fid)):
@@ -611,8 +634,8 @@ class DatasetStatisticsCacheClass:
         """
         # extracted labels
         if len(self.label_field) > 0:
-            if self.use_cache and exists(self.fig_labels_fid):
-                self.fig_labels = read_plotly(self.fig_labels_fid)
             elif self.use_cache and exists(self.label_dset_fid):
                 # load extracted labels
                 self.label_dset = load_from_disk(self.label_dset_fid)
@@ -621,13 +644,13 @@ class DatasetStatisticsCacheClass:
                     self.label_df, self.label_names, OUR_LABEL_FIELD
                 )
                 if save:
-                    write_plotly(self.fig_labels, self.fig_labels_fid)
             else:
                 self.prepare_labels()
                 if save:
                     # save extracted label instances
                     self.label_dset.save_to_disk(self.label_dset_fid)
-                    write_plotly(self.fig_labels, self.fig_labels_fid)
     def prepare_labels(self):
         self.get_base_dataset()

         # path to the directory used for caching
         if not isinstance(text_field, str):
             text_field = "-".join(text_field)
+        #if isinstance(label_field, str):
+        #    label_field = label_field
+        #else:
+        #    label_field = "-".join(label_field)
         self.cache_path = pjoin(
             self.cache_dir,
+            f"{dset_name}_{dset_config}_{split_name}_{text_field}", #{label_field},
         )
         if not isdir(self.cache_path):
             logs.warning("Creating cache directory %s." % self.cache_path)
             mkdir(self.cache_path)
+        # Cache files not needed for UI
         self.dset_fid = pjoin(self.cache_path, "base_dset")
         self.tokenized_df_fid = pjoin(self.cache_path, "tokenized_df.feather")
         self.label_dset_fid = pjoin(self.cache_path, "label_dset")
+        # Needed for UI -- embeddings
+        self.text_dset_fid = pjoin(self.cache_path, "text_dset")
+        # Needed for UI
+        self.dset_peek_json_fid = pjoin(self.cache_path, "dset_peek.json")
+        ## Label cache files.
+        # Needed for UI
+        self.fig_labels_json_fid = pjoin(self.cache_path, "fig_labels.json")
+        ## Length cache files
+        # Needed for UI
         self.length_df_fid = pjoin(self.cache_path, "length_df.feather")
+        # Needed for UI
+        self.length_stats_json_fid = pjoin(self.cache_path, "length_stats.json")
         self.vocab_counts_df_fid = pjoin(self.cache_path, "vocab_counts.feather")
+        # Needed for UI
+        self.dup_counts_df_fid = pjoin(self.cache_path, "dup_counts_df.feather")
+        # Needed for UI
+        self.fig_tok_length_fid = pjoin(self.cache_path, "fig_tok_length.json")
+        ## General text stats
+        # Needed for UI
+        self.general_stats_json_fid = pjoin(self.cache_path, "general_stats_dict.json")
+        # Needed for UI
         self.sorted_top_vocab_df_fid = pjoin(self.cache_path,
                                              "sorted_top_vocab.feather")
+        ## Zipf cache files
+        # Needed for UI
         self.zipf_fid = pjoin(self.cache_path, "zipf_basic_stats.json")
+        # Needed for UI
         self.zipf_fig_fid = pjoin(self.cache_path, "zipf_fig.json")
+        ## Embeddings cache files
+        # Needed for UI
+        self.node_list_fid = pjoin(self.cache_path, "node_list.th")
+        # Needed for UI
+        self.fig_tree_json_fid = pjoin(self.cache_path, "fig_tree.json")
     def get_base_dataset(self):
         """Gets a pointer to the truncated base dataset object."""
         if not self.dset:
         # General statistics
         if (
             self.use_cache
+            and exists(self.general_stats_json_fid)
             and exists(self.dup_counts_df_fid)
             and exists(self.sorted_top_vocab_df_fid)
         ):
             if save:
                 write_df(self.sorted_top_vocab_df, self.sorted_top_vocab_df_fid)
                 write_df(self.dup_counts_df, self.dup_counts_df_fid)
+                write_json(self.general_stats_dict, self.general_stats_json_fid)
     def load_or_prepare_text_lengths(self, save=True):
                 write_df(self.length_df, self.length_df_fid)
         # Text length stats.
+        if self.use_cache and exists(self.length_stats_json_fid):
+            with open(self.length_stats_json_fid, "r") as f:
                 self.length_stats_dict = json.load(f)
             self.avg_length = self.length_stats_dict["avg length"]
             self.std_length = self.length_stats_dict["std length"]
         else:
             self.prepare_text_length_stats()
             if save:
+                write_json(self.length_stats_dict, self.length_stats_json_fid)
     def prepare_length_df(self):
         if self.tokenized_df is None:
         self.fig_tok_length = make_fig_lengths(self.tokenized_df, LENGTH_FIELD)
     def load_or_prepare_embeddings(self, save=True):
+        if self.use_cache and exists(self.node_list_fid) and exists(self.fig_tree_json_fid):
             self.node_list = torch.load(self.node_list_fid)
+            self.fig_tree = read_plotly(self.fig_tree_json_fid)
         elif self.use_cache and exists(self.node_list_fid):
             self.node_list = torch.load(self.node_list_fid)
             self.fig_tree = make_tree_plot(self.node_list,
                                            self.text_dset)
             if save:
+                write_plotly(self.fig_tree, self.fig_tree_json_fid)
         else:
             self.embeddings = Embeddings(self, use_cache=self.use_cache)
             self.embeddings.make_hierarchical_clustering()
                                            self.text_dset)
             if save:
                 torch.save(self.node_list, self.node_list_fid)
+                write_plotly(self.fig_tree, self.fig_tree_json_fid)
     # get vocab with word counts
     def load_or_prepare_vocab(self, save=True):
                 write_df(self.dup_counts_df, self.dup_counts_df_fid)
     def load_general_stats(self):
+        self.general_stats_dict = json.load(open(self.general_stats_json_fid, encoding="utf-8"))
         with open(self.sorted_top_vocab_df_fid, "rb") as f:
             self.sorted_top_vocab_df = feather.read_feather(f)
         self.text_nan_count = self.general_stats_dict[TEXT_NAN_CNT]
         self.load_or_prepare_dset_peek(save)
     def load_or_prepare_dset_peek(self, save=True):
+        if self.use_cache and exists(self.dset_peek_json_fid):
+            with open(self.dset_peek_json_fid, "r") as f:
                 self.dset_peek = json.load(f)["dset peek"]
         else:
             if self.dset is None:
                 self.get_base_dataset()
             self.dset_peek = self.dset[:100]
             if save:
+                write_json({"dset peek": self.dset_peek}, self.dset_peek_json_fid)
     def load_or_prepare_tokenized_df(self, save=True):
         if (self.use_cache and exists(self.tokenized_df_fid)):
         """
         # extracted labels
         if len(self.label_field) > 0:
+            if self.use_cache and exists(self.fig_labels_json_fid):
+                self.fig_labels = read_plotly(self.fig_labels_json_fid)
             elif self.use_cache and exists(self.label_dset_fid):
                 # load extracted labels
                 self.label_dset = load_from_disk(self.label_dset_fid)
                     self.label_df, self.label_names, OUR_LABEL_FIELD
                 )
                 if save:
+                    write_plotly(self.fig_labels, self.fig_labels_json_fid)
             else:
                 self.prepare_labels()
                 if save:
                     # save extracted label instances
                     self.label_dset.save_to_disk(self.label_dset_fid)
+                    write_plotly(self.fig_labels, self.fig_labels_json_fid)
     def prepare_labels(self):
         self.get_base_dataset()

run_data_measurements.py ADDED Viewed

	@@ -0,0 +1,279 @@

+import argparse
+import itertools
+import json
+import textwrap
+from os.path import join as pjoin
+
+from data_measurements import dataset_statistics
+from data_measurements import dataset_utils
+def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
+    """
+    Loader specifically for the widgets used in the app.
+    Args:
+        ds_args:
+        show_embeddings:
+        use_cache:
+    Returns:
+    """
+    dstats = dataset_statistics.DatasetStatisticsCacheClass(**ds_args,
+                                                            use_cache=use_cache)
+    # Header widget
+    dstats.load_or_prepare_dset_peek()
+    # General stats widget
+    dstats.load_or_prepare_general_stats()
+    # Labels widget
+    dstats.load_or_prepare_labels()
+    # Text lengths widget
+    dstats.load_or_prepare_text_lengths()
+    if show_embeddings:
+        # Embeddings widget
+        dstats.load_or_prepare_embeddings()
+    # Text duplicates widget
+    dstats.load_or_prepare_text_duplicates()
+    # nPMI widget
+    dstats.load_or_prepare_npmi()
+    npmi_stats = dstats.npmi_stats
+    # Handling for all pairs; in the UI, people select.
+    do_npmi(npmi_stats)
+    # Zipf widget
+    dstats.load_or_prepare_zipf()
+def load_or_prepare(dataset_args, do_html=False, use_cache=False):
+    all = False
+    dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args, use_cache=use_cache)
+    print("Loading dataset.")
+    dstats.load_or_prepare_dataset()
+    print("Dataset loaded.  Preparing vocab.")
+    dstats.load_or_prepare_vocab()
+    print("Vocab prepared.")
+    if not dataset_args["calculation"]:
+        all = True
+    if all or dataset_args["calculation"] == "general":
+        print("\n* Calculating general statistics.")
+        dstats.load_or_prepare_general_stats()
+        print("Done!")
+        print("Basic text statistics now available at %s." % dstats.general_stats_json_fid)
+        print(
+            "Text duplicates now available at %s." % dstats.dup_counts_df_fid
+        )
+    if all or dataset_args["calculation"] == "lengths":
+        print("\n* Calculating text lengths.")
+        fig_tok_length_fid = pjoin(dstats.cache_path, "lengths_fig.html")
+        tok_length_json_fid = pjoin(dstats.cache_path, "lengths.json")
+        dstats.load_or_prepare_text_lengths()
+        with open(tok_length_json_fid, "w+") as f:
+            json.dump(dstats.fig_tok_length.to_json(), f)
+            print("Token lengths now available at %s." % tok_length_json_fid)
+        if do_html:
+            dstats.fig_tok_length.write_html(fig_tok_length_fid)
+            print("Figure saved to %s." % fig_tok_length_fid)
+        print("Done!")
+    if (all and dstats.label_field) or dataset_args["calculation"] == "labels":
+        if not dstats.label_field:
+            print("Warning: You asked for label calculation, but didn't provide the labels field name.  Assuming it is 'label'...")
+            dstats.set_label_field("label")
+            print("\n* Calculating label distribution.")
+            dstats.load_or_prepare_labels()
+            fig_label_html = pjoin(dstats.cache_path, "labels_fig.html")
+            fig_label_json = pjoin(dstats.cache_path, "labels.json")
+            dstats.fig_labels.write_html(fig_label_html)
+            with open(fig_label_json, "w+") as f:
+                json.dump(dstats.fig_labels.to_json(), f)
+            print("Done!")
+            print("Label distribution now available at %s." % dstats.label_dset_fid)
+            print("Figure saved to %s." % fig_label_html)
+    if all or dataset_args["calculation"] == "npmi":
+        print("\n* Preparing nPMI.")
+        npmi_stats = dataset_statistics.nPMIStatisticsCacheClass(
+            dstats, use_cache=use_cache
+        )
+        do_npmi(npmi_stats, use_cache=use_cache)
+        print("Done!")
+        print(
+            "nPMI results now available in %s for all identity terms that "
+            "occur more than 10 times and all words that "
+            "co-occur with both terms."
+            % npmi_stats.pmi_cache_path
+        )
+    if all or dataset_args["calculation"] == "zipf":
+        print("\n* Preparing Zipf.")
+        zipf_fig_fid = pjoin(dstats.cache_path, "zipf_fig.html")
+        zipf_json_fid = pjoin(dstats.cache_path, "zipf_fig.json")
+        dstats.load_or_prepare_zipf()
+        zipf_fig = dstats.zipf_fig
+        with open(zipf_json_fid, "w+") as f:
+            json.dump(zipf_fig.to_json(), f)
+        zipf_fig.write_html(zipf_fig_fid)
+        print("Done!")
+        print("Zipf results now available at %s." % dstats.zipf_fid)
+        print(
+            "Figure saved to %s, with corresponding json at %s."
+            % (zipf_fig_fid, zipf_json_fid)
+        )
+    # Don't do this one until someone specifically asks for it -- takes awhile.
+    if dataset_args["calculation"] == "embeddings":
+        print("\n* Preparing text embeddings.")
+        dstats.load_or_prepare_embeddings()
+def do_npmi(npmi_stats, use_cache=True):
+    available_terms = npmi_stats.load_or_prepare_npmi_terms()
+    completed_pairs = {}
+    print("Iterating through terms for joint npmi.")
+    for term1 in available_terms:
+        for term2 in available_terms:
+            if term1 != term2:
+                sorted_terms = tuple(sorted([term1, term2]))
+                if sorted_terms not in completed_pairs:
+                    term1, term2 = sorted_terms
+                    print("Computing nPMI statistics for %s and %s" % (term1, term2))
+                    _ = npmi_stats.load_or_prepare_joint_npmi(sorted_terms)
+                    completed_pairs[tuple(sorted_terms)] = {}
+def get_text_label_df(
+    ds_name,
+    config_name,
+    split_name,
+    text_field,
+    label_field,
+    calculation,
+    out_dir,
+    do_html=False,
+    use_cache=True,
+):
+    if not use_cache:
+        print("Not using any cache; starting afresh")
+    ds_name_to_dict = dataset_utils.get_dataset_info_dicts(ds_name)
+    if label_field:
+        label_field, label_names = (
+            ds_name_to_dict[ds_name][config_name]["features"][label_field][0]
+            if len(ds_name_to_dict[ds_name][config_name]["features"][label_field]) > 0
+            else ((), [])
+        )
+    else:
+        label_field = ()
+        label_names = []
+    dataset_args = {
+        "dset_name": ds_name,
+        "dset_config": config_name,
+        "split_name": split_name,
+        "text_field": text_field,
+        "label_field": label_field,
+        "label_names": label_names,
+        "calculation": calculation,
+        "cache_dir": out_dir,
+    }
+    load_or_prepare_widgets(dataset_args, use_cache=use_cache)
+def main():
+    # TODO: Make this the Hugging Face arg parser
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=textwrap.dedent(
+            """
+         Example for hate speech18 dataset:
+         python3 run_data_measurements.py --dataset="hate_speech18" --config="default" --split="train" --feature="text"
+         Example for Glue dataset:
+         python3 run_data_measurements.py --dataset="glue" --config="ax" --split="train" --feature="premise"
+         Example for IMDB dataset:
+         python3 run_data_measurements.py --dataset="imdb" --config="plain_text" --split="train" --label_field="label" --feature="text"
+         """
+        ),
+    )
+    parser.add_argument(
+        "-d", "--dataset", required=True, help="Name of dataset to prepare"
+    )
+    parser.add_argument(
+        "-c", "--config", required=True, help="Dataset configuration to prepare"
+    )
+    parser.add_argument(
+        "-s", "--split", required=True, type=str, help="Dataset split to prepare"
+    )
+    parser.add_argument(
+        "-f",
+        "--feature",
+        required=True,
+        type=str,
+        default="text",
+        help="Text column to prepare",
+    )
+    parser.add_argument(
+        "-w",
+        "--calculation",
+        help="""What to calculate (defaults to everything except embeddings).\n
+                                                    Options are:\n
+                                                    - `general` (for duplicate counts, missing values, length statistics.)\n
+                                                    - `lengths` for text length distribution\n
+                                                    - `labels` for label distribution\n
+                                                    - `embeddings` (Warning: Slow.)\n
+                                                    - `npmi` for word associations\n
+                                                    - `zipf` for zipfian statistics
+                                                    """,
+    )
+    parser.add_argument(
+        "-l",
+        "--label_field",
+        type=str,
+        required=False,
+        default="",
+        help="Field name for label column in dataset (Required if there is a label field that you want information about)",
+    )
+    parser.add_argument(
+        "--cached",
+        default=False,
+        required=False,
+        action="store_true",
+        help="Whether to use cached files (Optional)",
+    )
+    parser.add_argument(
+        "--do_html",
+        default=False,
+        required=False,
+        action="store_true",
+        help="Whether to write out corresponding HTML files (Optional)",
+    )
+    parser.add_argument("--out_dir", default="cache_dir", help="Where to write out to.")
+    args = parser.parse_args()
+    print("Proceeding with the following arguments:")
+    print(args)
+    # run_data_measurements.py -n hate_speech18 -c default -s train -f text -w npmi
+    get_text_label_df(
+        args.dataset,
+        args.config,
+        args.split,
+        args.feature,
+        args.label_field,
+        args.calculation,
+        args.out_dir,
+        do_html=args.do_html,
+        use_cache=args.cached,
+    )
+    print()
+# Run the command-line entry point only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()