Spaces:
Build error
Build error
meg-huggingface
commited on
Commit
·
0803ab3
1
Parent(s):
a2ae370
Standardizing filenaming a bit.
Browse files- data_measurements/dataset_statistics.py +56 -33
- run_data_measurements.py +279 -0
data_measurements/dataset_statistics.py
CHANGED
|
@@ -244,38 +244,61 @@ class DatasetStatisticsCacheClass:
|
|
| 244 |
# path to the directory used for caching
|
| 245 |
if not isinstance(text_field, str):
|
| 246 |
text_field = "-".join(text_field)
|
| 247 |
-
if isinstance(label_field, str):
|
| 248 |
-
|
| 249 |
-
else:
|
| 250 |
-
|
| 251 |
self.cache_path = pjoin(
|
| 252 |
self.cache_dir,
|
| 253 |
-
f"{dset_name}_{dset_config}_{split_name}_{text_field}
|
| 254 |
)
|
| 255 |
if not isdir(self.cache_path):
|
| 256 |
logs.warning("Creating cache directory %s." % self.cache_path)
|
| 257 |
mkdir(self.cache_path)
|
|
|
|
|
|
|
| 258 |
self.dset_fid = pjoin(self.cache_path, "base_dset")
|
| 259 |
-
self.dset_peek_fid = pjoin(self.cache_path, "dset_peek.json")
|
| 260 |
-
self.text_dset_fid = pjoin(self.cache_path, "text_dset")
|
| 261 |
self.tokenized_df_fid = pjoin(self.cache_path, "tokenized_df.feather")
|
| 262 |
self.label_dset_fid = pjoin(self.cache_path, "label_dset")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
self.length_df_fid = pjoin(self.cache_path, "length_df.feather")
|
| 264 |
-
|
|
|
|
| 265 |
self.vocab_counts_df_fid = pjoin(self.cache_path, "vocab_counts.feather")
|
| 266 |
-
|
| 267 |
-
self.dup_counts_df_fid = pjoin(
|
| 268 |
-
|
| 269 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
self.sorted_top_vocab_df_fid = pjoin(self.cache_path,
|
| 271 |
"sorted_top_vocab.feather")
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
self.node_list_fid = pjoin(self.cache_path, "node_list.th")
|
| 275 |
-
self.fig_tree_fid = pjoin(self.cache_path, "fig_tree.json")
|
| 276 |
self.zipf_fid = pjoin(self.cache_path, "zipf_basic_stats.json")
|
|
|
|
| 277 |
self.zipf_fig_fid = pjoin(self.cache_path, "zipf_fig.json")
|
| 278 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
def get_base_dataset(self):
|
| 280 |
"""Gets a pointer to the truncated base dataset object."""
|
| 281 |
if not self.dset:
|
|
@@ -301,7 +324,7 @@ class DatasetStatisticsCacheClass:
|
|
| 301 |
# General statistics
|
| 302 |
if (
|
| 303 |
self.use_cache
|
| 304 |
-
and exists(self.
|
| 305 |
and exists(self.dup_counts_df_fid)
|
| 306 |
and exists(self.sorted_top_vocab_df_fid)
|
| 307 |
):
|
|
@@ -313,7 +336,7 @@ class DatasetStatisticsCacheClass:
|
|
| 313 |
if save:
|
| 314 |
write_df(self.sorted_top_vocab_df, self.sorted_top_vocab_df_fid)
|
| 315 |
write_df(self.dup_counts_df, self.dup_counts_df_fid)
|
| 316 |
-
write_json(self.general_stats_dict, self.
|
| 317 |
|
| 318 |
|
| 319 |
def load_or_prepare_text_lengths(self, save=True):
|
|
@@ -343,8 +366,8 @@ class DatasetStatisticsCacheClass:
|
|
| 343 |
write_df(self.length_df, self.length_df_fid)
|
| 344 |
|
| 345 |
# Text length stats.
|
| 346 |
-
if self.use_cache and exists(self.
|
| 347 |
-
with open(self.
|
| 348 |
self.length_stats_dict = json.load(f)
|
| 349 |
self.avg_length = self.length_stats_dict["avg length"]
|
| 350 |
self.std_length = self.length_stats_dict["std length"]
|
|
@@ -352,7 +375,7 @@ class DatasetStatisticsCacheClass:
|
|
| 352 |
else:
|
| 353 |
self.prepare_text_length_stats()
|
| 354 |
if save:
|
| 355 |
-
write_json(self.length_stats_dict, self.
|
| 356 |
|
| 357 |
def prepare_length_df(self):
|
| 358 |
if self.tokenized_df is None:
|
|
@@ -382,15 +405,15 @@ class DatasetStatisticsCacheClass:
|
|
| 382 |
self.fig_tok_length = make_fig_lengths(self.tokenized_df, LENGTH_FIELD)
|
| 383 |
|
| 384 |
def load_or_prepare_embeddings(self, save=True):
|
| 385 |
-
if self.use_cache and exists(self.node_list_fid) and exists(self.
|
| 386 |
self.node_list = torch.load(self.node_list_fid)
|
| 387 |
-
self.fig_tree = read_plotly(self.
|
| 388 |
elif self.use_cache and exists(self.node_list_fid):
|
| 389 |
self.node_list = torch.load(self.node_list_fid)
|
| 390 |
self.fig_tree = make_tree_plot(self.node_list,
|
| 391 |
self.text_dset)
|
| 392 |
if save:
|
| 393 |
-
write_plotly(self.fig_tree, self.
|
| 394 |
else:
|
| 395 |
self.embeddings = Embeddings(self, use_cache=self.use_cache)
|
| 396 |
self.embeddings.make_hierarchical_clustering()
|
|
@@ -399,7 +422,7 @@ class DatasetStatisticsCacheClass:
|
|
| 399 |
self.text_dset)
|
| 400 |
if save:
|
| 401 |
torch.save(self.node_list, self.node_list_fid)
|
| 402 |
-
write_plotly(self.fig_tree, self.
|
| 403 |
|
| 404 |
# get vocab with word counts
|
| 405 |
def load_or_prepare_vocab(self, save=True):
|
|
@@ -457,7 +480,7 @@ class DatasetStatisticsCacheClass:
|
|
| 457 |
write_df(self.dup_counts_df, self.dup_counts_df_fid)
|
| 458 |
|
| 459 |
def load_general_stats(self):
|
| 460 |
-
self.general_stats_dict = json.load(open(self.
|
| 461 |
with open(self.sorted_top_vocab_df_fid, "rb") as f:
|
| 462 |
self.sorted_top_vocab_df = feather.read_feather(f)
|
| 463 |
self.text_nan_count = self.general_stats_dict[TEXT_NAN_CNT]
|
|
@@ -520,15 +543,15 @@ class DatasetStatisticsCacheClass:
|
|
| 520 |
self.load_or_prepare_dset_peek(save)
|
| 521 |
|
| 522 |
def load_or_prepare_dset_peek(self, save=True):
|
| 523 |
-
if self.use_cache and exists(self.
|
| 524 |
-
with open(self.
|
| 525 |
self.dset_peek = json.load(f)["dset peek"]
|
| 526 |
else:
|
| 527 |
if self.dset is None:
|
| 528 |
self.get_base_dataset()
|
| 529 |
self.dset_peek = self.dset[:100]
|
| 530 |
if save:
|
| 531 |
-
write_json({"dset peek": self.dset_peek}, self.
|
| 532 |
|
| 533 |
def load_or_prepare_tokenized_df(self, save=True):
|
| 534 |
if (self.use_cache and exists(self.tokenized_df_fid)):
|
|
@@ -611,8 +634,8 @@ class DatasetStatisticsCacheClass:
|
|
| 611 |
"""
|
| 612 |
# extracted labels
|
| 613 |
if len(self.label_field) > 0:
|
| 614 |
-
if self.use_cache and exists(self.
|
| 615 |
-
self.fig_labels = read_plotly(self.
|
| 616 |
elif self.use_cache and exists(self.label_dset_fid):
|
| 617 |
# load extracted labels
|
| 618 |
self.label_dset = load_from_disk(self.label_dset_fid)
|
|
@@ -621,13 +644,13 @@ class DatasetStatisticsCacheClass:
|
|
| 621 |
self.label_df, self.label_names, OUR_LABEL_FIELD
|
| 622 |
)
|
| 623 |
if save:
|
| 624 |
-
write_plotly(self.fig_labels, self.
|
| 625 |
else:
|
| 626 |
self.prepare_labels()
|
| 627 |
if save:
|
| 628 |
# save extracted label instances
|
| 629 |
self.label_dset.save_to_disk(self.label_dset_fid)
|
| 630 |
-
write_plotly(self.fig_labels, self.
|
| 631 |
|
| 632 |
def prepare_labels(self):
|
| 633 |
self.get_base_dataset()
|
|
|
|
| 244 |
# path to the directory used for caching
|
| 245 |
if not isinstance(text_field, str):
|
| 246 |
text_field = "-".join(text_field)
|
| 247 |
+
#if isinstance(label_field, str):
|
| 248 |
+
# label_field = label_field
|
| 249 |
+
#else:
|
| 250 |
+
# label_field = "-".join(label_field)
|
| 251 |
self.cache_path = pjoin(
|
| 252 |
self.cache_dir,
|
| 253 |
+
f"{dset_name}_{dset_config}_{split_name}_{text_field}", #{label_field},
|
| 254 |
)
|
| 255 |
if not isdir(self.cache_path):
|
| 256 |
logs.warning("Creating cache directory %s." % self.cache_path)
|
| 257 |
mkdir(self.cache_path)
|
| 258 |
+
|
| 259 |
+
# Cache files not needed for UI
|
| 260 |
self.dset_fid = pjoin(self.cache_path, "base_dset")
|
|
|
|
|
|
|
| 261 |
self.tokenized_df_fid = pjoin(self.cache_path, "tokenized_df.feather")
|
| 262 |
self.label_dset_fid = pjoin(self.cache_path, "label_dset")
|
| 263 |
+
|
| 264 |
+
# Needed for UI -- embeddings
|
| 265 |
+
self.text_dset_fid = pjoin(self.cache_path, "text_dset")
|
| 266 |
+
# Needed for UI
|
| 267 |
+
self.dset_peek_json_fid = pjoin(self.cache_path, "dset_peek.json")
|
| 268 |
+
|
| 269 |
+
## Label cache files.
|
| 270 |
+
# Needed for UI
|
| 271 |
+
self.fig_labels_json_fid = pjoin(self.cache_path, "fig_labels.json")
|
| 272 |
+
|
| 273 |
+
## Length cache files
|
| 274 |
+
# Needed for UI
|
| 275 |
self.length_df_fid = pjoin(self.cache_path, "length_df.feather")
|
| 276 |
+
# Needed for UI
|
| 277 |
+
self.length_stats_json_fid = pjoin(self.cache_path, "length_stats.json")
|
| 278 |
self.vocab_counts_df_fid = pjoin(self.cache_path, "vocab_counts.feather")
|
| 279 |
+
# Needed for UI
|
| 280 |
+
self.dup_counts_df_fid = pjoin(self.cache_path, "dup_counts_df.feather")
|
| 281 |
+
# Needed for UI
|
| 282 |
+
self.fig_tok_length_fid = pjoin(self.cache_path, "fig_tok_length.json")
|
| 283 |
+
|
| 284 |
+
## General text stats
|
| 285 |
+
# Needed for UI
|
| 286 |
+
self.general_stats_json_fid = pjoin(self.cache_path, "general_stats_dict.json")
|
| 287 |
+
# Needed for UI
|
| 288 |
self.sorted_top_vocab_df_fid = pjoin(self.cache_path,
|
| 289 |
"sorted_top_vocab.feather")
|
| 290 |
+
## Zipf cache files
|
| 291 |
+
# Needed for UI
|
|
|
|
|
|
|
| 292 |
self.zipf_fid = pjoin(self.cache_path, "zipf_basic_stats.json")
|
| 293 |
+
# Needed for UI
|
| 294 |
self.zipf_fig_fid = pjoin(self.cache_path, "zipf_fig.json")
|
| 295 |
|
| 296 |
+
## Embeddings cache files
|
| 297 |
+
# Needed for UI
|
| 298 |
+
self.node_list_fid = pjoin(self.cache_path, "node_list.th")
|
| 299 |
+
# Needed for UI
|
| 300 |
+
self.fig_tree_json_fid = pjoin(self.cache_path, "fig_tree.json")
|
| 301 |
+
|
| 302 |
def get_base_dataset(self):
|
| 303 |
"""Gets a pointer to the truncated base dataset object."""
|
| 304 |
if not self.dset:
|
|
|
|
| 324 |
# General statistics
|
| 325 |
if (
|
| 326 |
self.use_cache
|
| 327 |
+
and exists(self.general_stats_json_fid)
|
| 328 |
and exists(self.dup_counts_df_fid)
|
| 329 |
and exists(self.sorted_top_vocab_df_fid)
|
| 330 |
):
|
|
|
|
| 336 |
if save:
|
| 337 |
write_df(self.sorted_top_vocab_df, self.sorted_top_vocab_df_fid)
|
| 338 |
write_df(self.dup_counts_df, self.dup_counts_df_fid)
|
| 339 |
+
write_json(self.general_stats_dict, self.general_stats_json_fid)
|
| 340 |
|
| 341 |
|
| 342 |
def load_or_prepare_text_lengths(self, save=True):
|
|
|
|
| 366 |
write_df(self.length_df, self.length_df_fid)
|
| 367 |
|
| 368 |
# Text length stats.
|
| 369 |
+
if self.use_cache and exists(self.length_stats_json_fid):
|
| 370 |
+
with open(self.length_stats_json_fid, "r") as f:
|
| 371 |
self.length_stats_dict = json.load(f)
|
| 372 |
self.avg_length = self.length_stats_dict["avg length"]
|
| 373 |
self.std_length = self.length_stats_dict["std length"]
|
|
|
|
| 375 |
else:
|
| 376 |
self.prepare_text_length_stats()
|
| 377 |
if save:
|
| 378 |
+
write_json(self.length_stats_dict, self.length_stats_json_fid)
|
| 379 |
|
| 380 |
def prepare_length_df(self):
|
| 381 |
if self.tokenized_df is None:
|
|
|
|
| 405 |
self.fig_tok_length = make_fig_lengths(self.tokenized_df, LENGTH_FIELD)
|
| 406 |
|
| 407 |
def load_or_prepare_embeddings(self, save=True):
|
| 408 |
+
if self.use_cache and exists(self.node_list_fid) and exists(self.fig_tree_json_fid):
|
| 409 |
self.node_list = torch.load(self.node_list_fid)
|
| 410 |
+
self.fig_tree = read_plotly(self.fig_tree_json_fid)
|
| 411 |
elif self.use_cache and exists(self.node_list_fid):
|
| 412 |
self.node_list = torch.load(self.node_list_fid)
|
| 413 |
self.fig_tree = make_tree_plot(self.node_list,
|
| 414 |
self.text_dset)
|
| 415 |
if save:
|
| 416 |
+
write_plotly(self.fig_tree, self.fig_tree_json_fid)
|
| 417 |
else:
|
| 418 |
self.embeddings = Embeddings(self, use_cache=self.use_cache)
|
| 419 |
self.embeddings.make_hierarchical_clustering()
|
|
|
|
| 422 |
self.text_dset)
|
| 423 |
if save:
|
| 424 |
torch.save(self.node_list, self.node_list_fid)
|
| 425 |
+
write_plotly(self.fig_tree, self.fig_tree_json_fid)
|
| 426 |
|
| 427 |
# get vocab with word counts
|
| 428 |
def load_or_prepare_vocab(self, save=True):
|
|
|
|
| 480 |
write_df(self.dup_counts_df, self.dup_counts_df_fid)
|
| 481 |
|
| 482 |
def load_general_stats(self):
|
| 483 |
+
self.general_stats_dict = json.load(open(self.general_stats_json_fid, encoding="utf-8"))
|
| 484 |
with open(self.sorted_top_vocab_df_fid, "rb") as f:
|
| 485 |
self.sorted_top_vocab_df = feather.read_feather(f)
|
| 486 |
self.text_nan_count = self.general_stats_dict[TEXT_NAN_CNT]
|
|
|
|
| 543 |
self.load_or_prepare_dset_peek(save)
|
| 544 |
|
| 545 |
def load_or_prepare_dset_peek(self, save=True):
|
| 546 |
+
if self.use_cache and exists(self.dset_peek_json_fid):
|
| 547 |
+
with open(self.dset_peek_json_fid, "r") as f:
|
| 548 |
self.dset_peek = json.load(f)["dset peek"]
|
| 549 |
else:
|
| 550 |
if self.dset is None:
|
| 551 |
self.get_base_dataset()
|
| 552 |
self.dset_peek = self.dset[:100]
|
| 553 |
if save:
|
| 554 |
+
write_json({"dset peek": self.dset_peek}, self.dset_peek_json_fid)
|
| 555 |
|
| 556 |
def load_or_prepare_tokenized_df(self, save=True):
|
| 557 |
if (self.use_cache and exists(self.tokenized_df_fid)):
|
|
|
|
| 634 |
"""
|
| 635 |
# extracted labels
|
| 636 |
if len(self.label_field) > 0:
|
| 637 |
+
if self.use_cache and exists(self.fig_labels_json_fid):
|
| 638 |
+
self.fig_labels = read_plotly(self.fig_labels_json_fid)
|
| 639 |
elif self.use_cache and exists(self.label_dset_fid):
|
| 640 |
# load extracted labels
|
| 641 |
self.label_dset = load_from_disk(self.label_dset_fid)
|
|
|
|
| 644 |
self.label_df, self.label_names, OUR_LABEL_FIELD
|
| 645 |
)
|
| 646 |
if save:
|
| 647 |
+
write_plotly(self.fig_labels, self.fig_labels_json_fid)
|
| 648 |
else:
|
| 649 |
self.prepare_labels()
|
| 650 |
if save:
|
| 651 |
# save extracted label instances
|
| 652 |
self.label_dset.save_to_disk(self.label_dset_fid)
|
| 653 |
+
write_plotly(self.fig_labels, self.fig_labels_json_fid)
|
| 654 |
|
| 655 |
def prepare_labels(self):
|
| 656 |
self.get_base_dataset()
|
run_data_measurements.py
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import json
|
| 3 |
+
import textwrap
|
| 4 |
+
from os.path import join as pjoin
|
| 5 |
+
|
| 6 |
+
from data_measurements import dataset_statistics
|
| 7 |
+
from data_measurements import dataset_utils
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def load_or_prepare_widgets(ds_args, show_embeddings=False, use_cache=False):
|
| 11 |
+
"""
|
| 12 |
+
Loader specifically for the widgets used in the app.
|
| 13 |
+
Args:
|
| 14 |
+
ds_args:
|
| 15 |
+
show_embeddings:
|
| 16 |
+
use_cache:
|
| 17 |
+
|
| 18 |
+
Returns:
|
| 19 |
+
|
| 20 |
+
"""
|
| 21 |
+
dstats = dataset_statistics.DatasetStatisticsCacheClass(**ds_args,
|
| 22 |
+
use_cache=use_cache)
|
| 23 |
+
# Header widget
|
| 24 |
+
dstats.load_or_prepare_dset_peek()
|
| 25 |
+
# General stats widget
|
| 26 |
+
dstats.load_or_prepare_general_stats()
|
| 27 |
+
# Labels widget
|
| 28 |
+
dstats.load_or_prepare_labels()
|
| 29 |
+
# Text lengths widget
|
| 30 |
+
dstats.load_or_prepare_text_lengths()
|
| 31 |
+
if show_embeddings:
|
| 32 |
+
# Embeddings widget
|
| 33 |
+
dstats.load_or_prepare_embeddings()
|
| 34 |
+
# Text duplicates widget
|
| 35 |
+
dstats.load_or_prepare_text_duplicates()
|
| 36 |
+
# nPMI widget
|
| 37 |
+
dstats.load_or_prepare_npmi()
|
| 38 |
+
npmi_stats = dstats.npmi_stats
|
| 39 |
+
# Handling for all pairs; in the UI, people select.
|
| 40 |
+
do_npmi(npmi_stats)
|
| 41 |
+
# Zipf widget
|
| 42 |
+
dstats.load_or_prepare_zipf()
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def load_or_prepare(dataset_args, do_html=False, use_cache=False):
|
| 46 |
+
all = False
|
| 47 |
+
dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args, use_cache=use_cache)
|
| 48 |
+
print("Loading dataset.")
|
| 49 |
+
dstats.load_or_prepare_dataset()
|
| 50 |
+
print("Dataset loaded. Preparing vocab.")
|
| 51 |
+
dstats.load_or_prepare_vocab()
|
| 52 |
+
print("Vocab prepared.")
|
| 53 |
+
|
| 54 |
+
if not dataset_args["calculation"]:
|
| 55 |
+
all = True
|
| 56 |
+
|
| 57 |
+
if all or dataset_args["calculation"] == "general":
|
| 58 |
+
print("\n* Calculating general statistics.")
|
| 59 |
+
dstats.load_or_prepare_general_stats()
|
| 60 |
+
print("Done!")
|
| 61 |
+
print("Basic text statistics now available at %s." % dstats.general_stats_json_fid)
|
| 62 |
+
print(
|
| 63 |
+
"Text duplicates now available at %s." % dstats.dup_counts_df_fid
|
| 64 |
+
)
|
| 65 |
+
|
| 66 |
+
if all or dataset_args["calculation"] == "lengths":
|
| 67 |
+
print("\n* Calculating text lengths.")
|
| 68 |
+
fig_tok_length_fid = pjoin(dstats.cache_path, "lengths_fig.html")
|
| 69 |
+
tok_length_json_fid = pjoin(dstats.cache_path, "lengths.json")
|
| 70 |
+
dstats.load_or_prepare_text_lengths()
|
| 71 |
+
with open(tok_length_json_fid, "w+") as f:
|
| 72 |
+
json.dump(dstats.fig_tok_length.to_json(), f)
|
| 73 |
+
print("Token lengths now available at %s." % tok_length_json_fid)
|
| 74 |
+
if do_html:
|
| 75 |
+
dstats.fig_tok_length.write_html(fig_tok_length_fid)
|
| 76 |
+
print("Figure saved to %s." % fig_tok_length_fid)
|
| 77 |
+
print("Done!")
|
| 78 |
+
|
| 79 |
+
if (all and dstats.label_field) or dataset_args["calculation"] == "labels":
|
| 80 |
+
if not dstats.label_field:
|
| 81 |
+
print("Warning: You asked for label calculation, but didn't provide the labels field name. Assuming it is 'label'...")
|
| 82 |
+
dstats.set_label_field("label")
|
| 83 |
+
print("\n* Calculating label distribution.")
|
| 84 |
+
dstats.load_or_prepare_labels()
|
| 85 |
+
fig_label_html = pjoin(dstats.cache_path, "labels_fig.html")
|
| 86 |
+
fig_label_json = pjoin(dstats.cache_path, "labels.json")
|
| 87 |
+
dstats.fig_labels.write_html(fig_label_html)
|
| 88 |
+
with open(fig_label_json, "w+") as f:
|
| 89 |
+
json.dump(dstats.fig_labels.to_json(), f)
|
| 90 |
+
print("Done!")
|
| 91 |
+
print("Label distribution now available at %s." % dstats.label_dset_fid)
|
| 92 |
+
print("Figure saved to %s." % fig_label_html)
|
| 93 |
+
|
| 94 |
+
if all or dataset_args["calculation"] == "npmi":
|
| 95 |
+
print("\n* Preparing nPMI.")
|
| 96 |
+
npmi_stats = dataset_statistics.nPMIStatisticsCacheClass(
|
| 97 |
+
dstats, use_cache=use_cache
|
| 98 |
+
)
|
| 99 |
+
do_npmi(npmi_stats, use_cache=use_cache)
|
| 100 |
+
print("Done!")
|
| 101 |
+
print(
|
| 102 |
+
"nPMI results now available in %s for all identity terms that "
|
| 103 |
+
"occur more than 10 times and all words that "
|
| 104 |
+
"co-occur with both terms."
|
| 105 |
+
% npmi_stats.pmi_cache_path
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
if all or dataset_args["calculation"] == "zipf":
|
| 109 |
+
print("\n* Preparing Zipf.")
|
| 110 |
+
zipf_fig_fid = pjoin(dstats.cache_path, "zipf_fig.html")
|
| 111 |
+
zipf_json_fid = pjoin(dstats.cache_path, "zipf_fig.json")
|
| 112 |
+
dstats.load_or_prepare_zipf()
|
| 113 |
+
zipf_fig = dstats.zipf_fig
|
| 114 |
+
with open(zipf_json_fid, "w+") as f:
|
| 115 |
+
json.dump(zipf_fig.to_json(), f)
|
| 116 |
+
zipf_fig.write_html(zipf_fig_fid)
|
| 117 |
+
print("Done!")
|
| 118 |
+
print("Zipf results now available at %s." % dstats.zipf_fid)
|
| 119 |
+
print(
|
| 120 |
+
"Figure saved to %s, with corresponding json at %s."
|
| 121 |
+
% (zipf_fig_fid, zipf_json_fid)
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
# Don't do this one until someone specifically asks for it -- takes awhile.
|
| 125 |
+
if dataset_args["calculation"] == "embeddings":
|
| 126 |
+
print("\n* Preparing text embeddings.")
|
| 127 |
+
dstats.load_or_prepare_embeddings()
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def do_npmi(npmi_stats, use_cache=True):
|
| 131 |
+
available_terms = npmi_stats.load_or_prepare_npmi_terms()
|
| 132 |
+
completed_pairs = {}
|
| 133 |
+
print("Iterating through terms for joint npmi.")
|
| 134 |
+
for term1 in available_terms:
|
| 135 |
+
for term2 in available_terms:
|
| 136 |
+
if term1 != term2:
|
| 137 |
+
sorted_terms = tuple(sorted([term1, term2]))
|
| 138 |
+
if sorted_terms not in completed_pairs:
|
| 139 |
+
term1, term2 = sorted_terms
|
| 140 |
+
print("Computing nPMI statistics for %s and %s" % (term1, term2))
|
| 141 |
+
_ = npmi_stats.load_or_prepare_joint_npmi(sorted_terms)
|
| 142 |
+
completed_pairs[tuple(sorted_terms)] = {}
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def get_text_label_df(
|
| 146 |
+
ds_name,
|
| 147 |
+
config_name,
|
| 148 |
+
split_name,
|
| 149 |
+
text_field,
|
| 150 |
+
label_field,
|
| 151 |
+
calculation,
|
| 152 |
+
out_dir,
|
| 153 |
+
do_html=False,
|
| 154 |
+
use_cache=True,
|
| 155 |
+
):
|
| 156 |
+
if not use_cache:
|
| 157 |
+
print("Not using any cache; starting afresh")
|
| 158 |
+
ds_name_to_dict = dataset_utils.get_dataset_info_dicts(ds_name)
|
| 159 |
+
if label_field:
|
| 160 |
+
label_field, label_names = (
|
| 161 |
+
ds_name_to_dict[ds_name][config_name]["features"][label_field][0]
|
| 162 |
+
if len(ds_name_to_dict[ds_name][config_name]["features"][label_field]) > 0
|
| 163 |
+
else ((), [])
|
| 164 |
+
)
|
| 165 |
+
else:
|
| 166 |
+
label_field = ()
|
| 167 |
+
label_names = []
|
| 168 |
+
dataset_args = {
|
| 169 |
+
"dset_name": ds_name,
|
| 170 |
+
"dset_config": config_name,
|
| 171 |
+
"split_name": split_name,
|
| 172 |
+
"text_field": text_field,
|
| 173 |
+
"label_field": label_field,
|
| 174 |
+
"label_names": label_names,
|
| 175 |
+
"calculation": calculation,
|
| 176 |
+
"cache_dir": out_dir,
|
| 177 |
+
}
|
| 178 |
+
load_or_prepare_widgets(dataset_args, use_cache=use_cache)
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def main():
|
| 182 |
+
# TODO: Make this the Hugging Face arg parser
|
| 183 |
+
parser = argparse.ArgumentParser(
|
| 184 |
+
formatter_class=argparse.RawDescriptionHelpFormatter,
|
| 185 |
+
description=textwrap.dedent(
|
| 186 |
+
"""
|
| 187 |
+
|
| 188 |
+
Example for hate speech18 dataset:
|
| 189 |
+
python3 run_data_measurements.py --dataset="hate_speech18" --config="default" --split="train" --feature="text"
|
| 190 |
+
|
| 191 |
+
Example for Glue dataset:
|
| 192 |
+
python3 run_data_measurements.py --dataset="glue" --config="ax" --split="train" --feature="premise"
|
| 193 |
+
|
| 194 |
+
Example for IMDB dataset:
|
| 195 |
+
python3 run_data_measurements.py --dataset="imdb" --config="plain_text" --split="train" --label_field="label" --feature="text"
|
| 196 |
+
"""
|
| 197 |
+
),
|
| 198 |
+
)
|
| 199 |
+
|
| 200 |
+
parser.add_argument(
|
| 201 |
+
"-d", "--dataset", required=True, help="Name of dataset to prepare"
|
| 202 |
+
)
|
| 203 |
+
parser.add_argument(
|
| 204 |
+
"-c", "--config", required=True, help="Dataset configuration to prepare"
|
| 205 |
+
)
|
| 206 |
+
parser.add_argument(
|
| 207 |
+
"-s", "--split", required=True, type=str, help="Dataset split to prepare"
|
| 208 |
+
)
|
| 209 |
+
parser.add_argument(
|
| 210 |
+
"-f",
|
| 211 |
+
"--feature",
|
| 212 |
+
required=True,
|
| 213 |
+
type=str,
|
| 214 |
+
default="text",
|
| 215 |
+
help="Text column to prepare",
|
| 216 |
+
)
|
| 217 |
+
parser.add_argument(
|
| 218 |
+
"-w",
|
| 219 |
+
"--calculation",
|
| 220 |
+
help="""What to calculate (defaults to everything except embeddings).\n
|
| 221 |
+
Options are:\n
|
| 222 |
+
|
| 223 |
+
- `general` (for duplicate counts, missing values, length statistics.)\n
|
| 224 |
+
|
| 225 |
+
- `lengths` for text length distribution\n
|
| 226 |
+
|
| 227 |
+
- `labels` for label distribution\n
|
| 228 |
+
|
| 229 |
+
- `embeddings` (Warning: Slow.)\n
|
| 230 |
+
|
| 231 |
+
- `npmi` for word associations\n
|
| 232 |
+
|
| 233 |
+
- `zipf` for zipfian statistics
|
| 234 |
+
""",
|
| 235 |
+
)
|
| 236 |
+
parser.add_argument(
|
| 237 |
+
"-l",
|
| 238 |
+
"--label_field",
|
| 239 |
+
type=str,
|
| 240 |
+
required=False,
|
| 241 |
+
default="",
|
| 242 |
+
help="Field name for label column in dataset (Required if there is a label field that you want information about)",
|
| 243 |
+
)
|
| 244 |
+
parser.add_argument(
|
| 245 |
+
"--cached",
|
| 246 |
+
default=False,
|
| 247 |
+
required=False,
|
| 248 |
+
action="store_true",
|
| 249 |
+
help="Whether to use cached files (Optional)",
|
| 250 |
+
)
|
| 251 |
+
parser.add_argument(
|
| 252 |
+
"--do_html",
|
| 253 |
+
default=False,
|
| 254 |
+
required=False,
|
| 255 |
+
action="store_true",
|
| 256 |
+
help="Whether to write out corresponding HTML files (Optional)",
|
| 257 |
+
)
|
| 258 |
+
parser.add_argument("--out_dir", default="cache_dir", help="Where to write out to.")
|
| 259 |
+
|
| 260 |
+
args = parser.parse_args()
|
| 261 |
+
print("Proceeding with the following arguments:")
|
| 262 |
+
print(args)
|
| 263 |
+
# run_data_measurements.py -n hate_speech18 -c default -s train -f text -w npmi
|
| 264 |
+
get_text_label_df(
|
| 265 |
+
args.dataset,
|
| 266 |
+
args.config,
|
| 267 |
+
args.split,
|
| 268 |
+
args.feature,
|
| 269 |
+
args.label_field,
|
| 270 |
+
args.calculation,
|
| 271 |
+
args.out_dir,
|
| 272 |
+
do_html=args.do_html,
|
| 273 |
+
use_cache=args.cached,
|
| 274 |
+
)
|
| 275 |
+
print()
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
if __name__ == "__main__":
|
| 279 |
+
main()
|