Commit f9936fb
meg-huggingface committed
1 Parent(s): deefca3

Update from rollback
data_measurements/dataset_statistics.py
CHANGED
@@ -303,6 +303,7 @@ class DatasetStatisticsCacheClass:
        self.node_list_fid = pjoin(self.cache_path, "node_list.th")
        # Needed for UI
        self.fig_tree_json_fid = pjoin(self.cache_path, "fig_tree.json")
+       self.zipf_counts = None

        self.live = False

@@ -366,6 +367,7 @@ class DatasetStatisticsCacheClass:
        """
        # Text length figure
        if (self.use_cache and exists(self.fig_tok_length_fid)):
+           self.fig_tok_length_png = mpimg.imread(self.fig_tok_length_fid)
            self.fig_tok_length = read_plotly(self.fig_tok_length_fid)
        else:
            if not self.live:
@@ -709,6 +711,8 @@ class DatasetStatisticsCacheClass:
                zipf_dict = json.load(f)
                self.z = Zipf()
                self.z.load(zipf_dict)
+               # TODO: Should this be cached?
+               self.zipf_counts = self.z.calc_zipf_counts(self.vocab_counts_df)
            self.zipf_fig = read_plotly(self.zipf_fig_fid)
        elif self.use_cache and exists(self.zipf_fid):
            # TODO: Read zipf data so that the vocab is there.
@@ -771,26 +775,30 @@ class nPMIStatisticsCacheClass:
            and exists(self.npmi_terms_fid)
            and json.load(open(self.npmi_terms_fid))["available terms"] != []
        ):
-           available_terms = json.load(open(self.npmi_terms_fid))["available terms"]
+           self.available_terms = json.load(open(self.npmi_terms_fid))["available terms"]
        else:
-           [18 removed lines; their content was not preserved in this rendering]
+           if not self.live:
+               if self.dstats.vocab_counts_df is None:
+                   self.dstats.load_or_prepare_vocab()
+
+               true_false = [
+                   term in self.dstats.vocab_counts_df.index for term in self.termlist
+               ]
+               word_list_tmp = [x for x, y in zip(self.termlist, true_false) if y]
+               true_false_counts = [
+                   self.dstats.vocab_counts_df.loc[word, CNT] >= self.min_vocab_count
+                   for word in word_list_tmp
+               ]
+               available_terms = [
+                   word for word, y in zip(word_list_tmp, true_false_counts) if y
+               ]
+               logs.info(available_terms)
+               with open(self.npmi_terms_fid, "w+") as f:
+                   json.dump({"available terms": available_terms}, f)
+               self.available_terms = available_terms
+       return self.available_terms
+
+   def load_or_prepare_joint_npmi(self, subgroup_pair, save=True):
        """
        Run on-the fly, while the app is already open,
        as it depends on the subgroup terms that the user chooses
@@ -824,12 +832,14 @@ class nPMIStatisticsCacheClass:
            joint_npmi_df, subgroup_dict = self.prepare_joint_npmi_df(
                subgroup_pair, subgroup_files
            )
-           [6 removed lines; their content was not preserved in this rendering]
+           if save:
+               if joint_npmi_df is not None:
+                   # Cache new results
+                   logs.info("Writing out.")
+                   for subgroup in subgroup_pair:
+                       write_subgroup_npmi_data(subgroup, subgroup_dict, subgroup_files)
+                   with open(joint_npmi_fid, "w+") as f:
+                       joint_npmi_df.to_csv(f)
        else:
            joint_npmi_df = pd.DataFrame()
        logs.info("The joint npmi df is")
@@ -871,7 +881,7 @@ class nPMIStatisticsCacheClass:
                subgroup_dict[subgroup] = cached_results
        logs.info("Calculating for subgroup list")
        joint_npmi_df, subgroup_dict = self.do_npmi(subgroup_pair, subgroup_dict)
-       return joint_npmi_df
+       return joint_npmi_df, subgroup_dict

    # TODO: Update pairwise assumption
    def do_npmi(self, subgroup_pair, subgroup_dict):
@@ -882,6 +892,7 @@ class nPMIStatisticsCacheClass:
        :return: Selected identity term's co-occurrence counts with
                 other words, pmi per word, and nPMI per word.
        """
+       no_results = False
        logs.info("Initializing npmi class")
        npmi_obj = self.set_npmi_obj()
        # Canonical ordering used
@@ -889,18 +900,26 @@ class nPMIStatisticsCacheClass:
        # Calculating nPMI statistics
        for subgroup in subgroup_pair:
            # If the subgroup data is already computed, grab it.
-           # TODO: Should we set idx and column names similarly to
+           # TODO: Should we set idx and column names similarly to
+           # how we set them for cached files?
            if subgroup not in subgroup_dict:
                logs.info("Calculating statistics for %s" % subgroup)
                vocab_cooc_df, pmi_df, npmi_df = npmi_obj.calc_metrics(subgroup)
-               [8 removed lines; their content was not preserved in this rendering]
+               if vocab_cooc_df is None:
+                   no_results = True
+               else:
+                   # Store the nPMI information for the current subgroups
+                   subgroup_dict[subgroup] = (vocab_cooc_df, pmi_df, npmi_df)
+       if no_results:
+           logs.warning("Couldn't grap the npmi files -- Under construction")
+           return None, None
+       else:
+           # Pair the subgroups together, indexed by all words that
+           # co-occur between them.
+           logs.info("Computing pairwise npmi bias")
+           paired_results = npmi_obj.calc_paired_metrics(subgroup_pair, subgroup_dict)
+           UI_results = make_npmi_fig(paired_results, subgroup_pair)
+           return UI_results.dropna(), subgroup_dict

    def set_npmi_obj(self):
        """
@@ -1291,3 +1310,4 @@ def write_zipf_data(z, zipf_fid):
    zipf_dict["uniq_ranks"] = [int(rank) for rank in z.uniq_ranks]
    with open(zipf_fid, "w+", encoding="utf-8") as f:
        json.dump(zipf_dict, f)
+
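The first three hunks add a zipf_counts attribute: it is initialized to None in the constructor and recomputed from the cached vocabulary via self.z.calc_zipf_counts(self.vocab_counts_df) when the Zipf figure is loaded from cache. The body of calc_zipf_counts is not part of this diff; purely as an illustration of the kind of rank-ordered count vector a Zipf analysis is built from (not the repo's implementation):

import numpy as np
import pandas as pd

# Illustration only: NOT the repo's Zipf.calc_zipf_counts, whose body is not
# shown in this diff. It demonstrates a rank-ordered count vector of the sort
# a Zipf analysis typically starts from.
def observed_counts_by_rank(vocab_counts_df, count_col="count"):
    """Return term counts sorted in descending order, so position i is rank i + 1."""
    return np.sort(vocab_counts_df[count_col].to_numpy())[::-1]

vocab = pd.DataFrame({"count": [7, 120, 45, 3]}, index=["d", "a", "b", "c"])
print(observed_counts_by_rank(vocab))  # [120  45   7   3]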
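The new load_or_prepare_npmi_terms body filters the configured identity terms down to those that occur in the dataset vocabulary at least self.min_vocab_count times, then caches the surviving terms in self.npmi_terms_fid. A minimal standalone sketch of that filtering step, assuming a pandas DataFrame of vocabulary counts; the function name and the "count" column are illustrative (the class uses self.termlist, self.dstats.vocab_counts_df, the CNT column, and self.min_vocab_count):

import pandas as pd

def filter_available_terms(termlist, vocab_counts_df, count_col="count", min_count=20):
    """Keep only terms that appear in the vocabulary at least min_count times.

    Hypothetical standalone version of the filtering added in
    load_or_prepare_npmi_terms.
    """
    # Terms that occur in the dataset vocabulary at all.
    in_vocab = [term for term in termlist if term in vocab_counts_df.index]
    # Of those, keep the ones with enough occurrences to give stable statistics.
    return [
        term for term in in_vocab
        if vocab_counts_df.loc[term, count_col] >= min_count
    ]

# Toy example.
vocab = pd.DataFrame({"count": [120, 3, 45]}, index=["woman", "nonbinary", "man"])
print(filter_available_terms(["woman", "man", "nonbinary", "robot"], vocab))
# ['woman', 'man']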
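load_or_prepare_joint_npmi gains a save flag: freshly computed results are written to disk only when save is true and the computation actually returned a DataFrame. A sketch of that guard, where joint_npmi_fid and write_subgroup_fn stand in for the real cache path and the write_subgroup_npmi_data helper:

import pandas as pd

def cache_joint_npmi(joint_npmi_df, joint_npmi_fid, subgroup_pair,
                     write_subgroup_fn, save=True):
    """Write freshly computed joint-nPMI results to disk, but only when asked
    to and only when the computation actually produced a DataFrame.

    Sketch of the guard added to load_or_prepare_joint_npmi; the argument names
    are stand-ins, not the class's attributes.
    """
    if save and joint_npmi_df is not None:
        # Per-subgroup data first, then the joint table.
        for subgroup in subgroup_pair:
            write_subgroup_fn(subgroup)
        with open(joint_npmi_fid, "w+") as f:
            joint_npmi_df.to_csv(f)

# Toy example.
df = pd.DataFrame({"npmi-bias": [0.10, -0.25]}, index=["nurse", "pilot"])
cache_joint_npmi(df, "joint_npmi.csv", ("term1", "term2"), lambda subgroup: None)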
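The reworked do_npmi computes per-subgroup metrics once, stores them in subgroup_dict, and returns (None, None) if any subgroup produced no co-occurrence data; only when every subgroup succeeded are the results paired and turned into a figure. A simplified sketch of that control flow, with metrics_fn and pair_fn standing in for npmi_obj.calc_metrics and the calc_paired_metrics / make_npmi_fig pair:

def compute_pairwise_npmi(subgroup_pair, subgroup_dict, metrics_fn, pair_fn):
    """Compute (or reuse) per-subgroup metrics, then combine them pairwise.

    Sketch of do_npmi's control flow; metrics_fn may return None when a
    subgroup has no usable co-occurrence data.
    """
    no_results = False
    for subgroup in subgroup_pair:
        if subgroup not in subgroup_dict:
            result = metrics_fn(subgroup)
            if result is None:
                no_results = True
            else:
                # Cache the per-subgroup metrics for later calls.
                subgroup_dict[subgroup] = result
    if no_results:
        # Mirror the new early exit: signal "no results" to the caller.
        return None, None
    return pair_fn(subgroup_pair, subgroup_dict), subgroup_dict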