meg-huggingface committed
Commit e1f2cc3 · 1 Parent(s): 6af9ef6
Removing any need for a dataframe in expander_general_stats; instead making sure to cache and load the small amount of details needed for this widget. Note I also moved around a couple functions -- same content, just moved -- so that it was easier for me to navigate through the code. I also pulled out a couple of sub-functions from larger functions, again to make the code easier to work with/understand, as well as helping to further modularize so we can limit what needs to be cached.
Browse files
- app.py +2 -3
- data_measurements/dataset_statistics.py +120 -83
- data_measurements/dataset_utils.py +2 -0
- data_measurements/streamlit_utils.py +17 -15
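The caching approach described in the commit message follows a simple load-or-prepare pattern: each widget's handful of summary values is written to its own small cache file and read back on later runs, so the UI no longer needs a full dataframe. Below is a minimal, self-contained sketch of that pattern; the function name, cache file name, and stat fields here are illustrative stand-ins, not the app's exact implementation.

import json
import os
from os.path import exists
from os.path import join as pjoin


def load_or_prepare_general_stats(cache_dir, texts, use_cache=True, save=True):
    """Load the widget's summary stats from cache when present; otherwise compute and save them."""
    stats_fid = pjoin(cache_dir, "general_stats_dict.json")  # illustrative cache file name
    if use_cache and exists(stats_fid):
        with open(stats_fid, encoding="utf-8") as f:
            return json.load(f)
    # Compute only the handful of values the widget displays, not a full dataframe.
    counts = {}
    for text in texts:
        counts[text] = counts.get(text, 0) + 1
    stats = {
        "total_items": len(texts),
        # Total occurrences of items that appear more than once.
        "dedup_total": sum(c for c in counts.values() if c > 1),
    }
    if save:
        os.makedirs(cache_dir, exist_ok=True)
        with open(stats_fid, "w", encoding="utf-8") as f:
            json.dump(stats, f)
    return stats

Keeping each widget's cached payload this small is what lets expander_general_stats take only dstats and column_id, as in the app.py change below.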
app.py
CHANGED
@@ -143,7 +143,7 @@ def show_column(dstats, ds_name_to_dict, show_embeddings, column_id, use_cache=T
    logs.info("showing header")
    st_utils.expander_header(dstats, ds_name_to_dict, column_id)
    logs.info("showing general stats")
-   st_utils.expander_general_stats(dstats, …
+   st_utils.expander_general_stats(dstats, column_id)
    st_utils.expander_label_distribution(dstats.label_df, dstats.fig_labels, column_id)
    st_utils.expander_text_lengths(
        dstats.tokenized_df,
@@ -154,7 +154,7 @@ def show_column(dstats, ds_name_to_dict, show_embeddings, column_id, use_cache=T
        LENGTH_FIELD,
        column_id,
    )
-   st_utils.expander_text_duplicates(dstats…
+   st_utils.expander_text_duplicates(dstats, column_id)

    # We do the loading of these after the others in order to have some time
    # to compute while the user works with the details above.
@@ -191,7 +191,6 @@ def main():

    # When not doing new development, use the cache.
    use_cache = True
-   # TODO: Better handling of this eg, st.sidebar.checkbox("Show clustering")=
    show_embeddings = st.sidebar.checkbox("Show embeddings")
    # List of datasets for which embeddings are hard to compute:

data_measurements/dataset_statistics.py
CHANGED
@@ -33,6 +33,8 @@ from nltk.corpus import stopwords
 from sklearn.feature_extraction.text import CountVectorizer

 from .dataset_utils import (
+    TOT_WORDS,
+    TOT_OPEN_WORDS,
     CNT,
     DEDUP_TOT,
     EMBEDDING_FIELD,
@@ -143,13 +145,9 @@ _TREE_MIN_NODES = 250
 # as long as we're using sklearn - already pushing the resources
 _MAX_CLUSTER_EXAMPLES = 5000
 _NUM_VOCAB_BATCHES = 2000
-
-
+_TOP_N = 100
 _CVEC = CountVectorizer(token_pattern="(?u)\\b\\w+\\b", lowercase=True)

-num_rows = 200000
-
-
 class DatasetStatisticsCacheClass:
     def __init__(
         self,
@@ -193,7 +191,7 @@ class DatasetStatisticsCacheClass:
         self.label_dset = None
         ## Data frames
         # Tokenized text
-        self.tokenized_df = …
+        self.tokenized_df = None
         # save sentence length histogram in the class so it doesn't ge re-computed
         self.fig_tok_length = None
         # Data Frame version of self.label_dset
@@ -205,12 +203,14 @@ class DatasetStatisticsCacheClass:
         # Vocabulary filtered to remove stopwords
         self.vocab_counts_filtered_df = None
         ## General statistics and duplicates
+        self.total_words = 0
+        self.total_open_words = 0
         # Number of NaN values (NOT empty strings)
         self.text_nan_count = 0
         # Number of text items that appear more than once in the dataset
         self.dedup_total = 0
         # Duplicated text items along with their number of occurences ("count")
-        self.…
+        self.dup_counts_df = None
         self.avg_length = None
         self.std_length = None
         self.general_stats_dict = None
@@ -258,10 +258,12 @@
         self.tokenized_df_fid = pjoin(self.cache_path, "tokenized_df.feather")
         self.label_dset_fid = pjoin(self.cache_path, "label_dset")
         self.vocab_counts_df_fid = pjoin(self.cache_path, "vocab_counts.feather")
-        self.general_stats_fid = pjoin(self.cache_path, "…
-        self.…
-            self.cache_path, "…
+        self.general_stats_fid = pjoin(self.cache_path, "general_stats_dict.json")
+        self.dup_counts_df_fid = pjoin(
+            self.cache_path, "dup_counts_df.feather"
         )
+        self.sorted_top_vocab_df_fid = pjoin(self.cache_path,
+                                             "sorted_top_vocab.feather")
         self.fig_tok_length_fid = pjoin(self.cache_path, "fig_tok_length.json")
         self.fig_labels_fid = pjoin(self.cache_path, "fig_labels.json")
         self.node_list_fid = pjoin(self.cache_path, "node_list.th")
@@ -285,38 +287,47 @@
         self.get_base_dataset()
         return self.dset[:100]

-    def load_or_prepare_general_stats(self, use_cache=False):
-        """…
+    def load_or_prepare_general_stats(self, use_cache=False, save=True):
+        """
+        Content for expander_general_stats widget.
+        Provides statistics for total words, total open words,
+        the sorted top vocab, the NaN count, and the duplicate count.
+        Args:
+            use_cache:

-        …
+        Returns:
+
+        """
         # General statistics
         if (
             use_cache
             and exists(self.general_stats_fid)
-            and exists(self.…
+            and exists(self.dup_counts_df_fid)
+            and exists(self.sorted_top_vocab_df_fid)
         ):
-            …
-            …
-            )
+            print('Loading cached general stats')
+            self.load_general_stats()
         else:
-            (
-            …
-            self.…
-            …
+            print('Preparing general stats')
+            self.prepare_general_stats()
+            if save:
+                print(self.sorted_top_vocab_df)
+                print(self.sorted_top_vocab_df_fid)
+                write_df(self.sorted_top_vocab_df, self.sorted_top_vocab_df_fid)
+                print(self.dup_counts_df)
+                print(self.dup_counts_df_fid)
+                write_df(self.dup_counts_df, self.dup_counts_df_fid)
+                print(self.general_stats_dict)
+                print(self.general_stats_fid)
+                write_json(self.general_stats_dict, self.general_stats_fid)
+

     def load_or_prepare_text_lengths(self, use_cache=False, save=True):
         # TODO: Everything here can be read from cache; it's in a transitory
         # state atm where just the fig is cached. Clean up.
         if use_cache and exists(self.fig_tok_length_fid):
             self.fig_tok_length = read_plotly(self.fig_tok_length_fid)
-        if …
+        if self.tokenized_df is None:
             self.tokenized_df = self.do_tokenization()
         self.tokenized_df[LENGTH_FIELD] = self.tokenized_df[TOKENIZED_FIELD].apply(len)
         self.avg_length = round(
@@ -385,56 +396,54 @@
         logs.info("filtered vocab")
         logs.info(self.vocab_counts_filtered_df)

-    def …
+    def load_vocab(self):
+        with open(self.vocab_counts_df_fid, "rb") as f:
+            self.vocab_counts_df = feather.read_feather(f)
+        # Handling for changes in how the index is saved.
+        self.vocab_counts_df = self._set_idx_col_names(self.vocab_counts_df)

-        # TODO: Current UI only uses the fig, meaning the self.z here is irrelevant
-        # when only reading from cache. Either the UI should use it, or it should
-        # be removed when reading in cache
-        if use_cache and exists(self.zipf_fig_fid) and exists(self.zipf_fid):
-            with open(self.zipf_fid, "r") as f:
-                zipf_dict = json.load(f)
-            self.z = Zipf()
-            self.z.load(zipf_dict)
-            self.zipf_fig = read_plotly(self.zipf_fig_fid)
-        elif use_cache and exists(self.zipf_fid):
-            # TODO: Read zipf data so that the vocab is there.
-            with open(self.zipf_fid, "r") as f:
-                zipf_dict = json.load(f)
-            self.z = Zipf()
-            self.z.load(zipf_dict)
-            self.zipf_fig = make_zipf_fig(self.vocab_counts_df, self.z)
-            if save:
-                write_plotly(self.zipf_fig, self.zipf_fig_fid)
-        else:
-            self.z = Zipf(self.vocab_counts_df)
-            self.zipf_fig = make_zipf_fig(self.vocab_counts_df, self.z)
-            if save:
-                write_zipf_data(self.z, self.zipf_fid)
-                write_plotly(self.zipf_fig, self.zipf_fig_fid)
+    def load_general_stats(self):
+        self.general_stats_dict = json.load(open(self.general_stats_fid, encoding="utf-8"))
+        with open(self.dup_counts_df_fid, "rb") as f:
+            self.dup_counts_df = feather.read_feather(f)
+        with open(self.sorted_top_vocab_df_fid, "rb") as f:
+            self.sorted_top_vocab_df = feather.read_feather(f)
+        self.text_nan_count = self.general_stats_dict[TEXT_NAN_CNT]
+        self.dedup_total = self.general_stats_dict[DEDUP_TOT]
+        self.total_words = self.general_stats_dict[TOT_WORDS]
+        self.total_open_words = self.general_stats_dict[TOT_OPEN_WORDS]

-    def …
-        …
+    def prepare_general_stats(self):
+        if self.tokenized_df is None:
+            logs.warning("Tokenized dataset not yet loaded; doing so.")
+            self.load_or_prepare_dataset()
+        if self.vocab_counts_df is None:
+            logs.warning("Vocab not yet loaded; doing so.")
+            self.load_or_prepare_vocab()
+        self.sorted_top_vocab_df = self.vocab_counts_filtered_df.sort_values(
+            "count", ascending=False
+        ).head(_TOP_N)
+        print('basics')
+        self.total_words = len(self.vocab_counts_df)
+        self.total_open_words = len(self.vocab_counts_filtered_df)
+        self.text_nan_count = int(self.tokenized_df.isnull().sum().sum())
+        dup_df = self.tokenized_df[self.tokenized_df.duplicated([OUR_TEXT_FIELD])]
+        print('dup df')
+        self.dup_counts_df = pd.DataFrame(
             dup_df.pivot_table(
-                columns=[…
+                columns=[OUR_TEXT_FIELD], aggfunc="size"
             ).sort_values(ascending=False),
             columns=[CNT],
         )
-        …
-        dedup_total = sum(…
-        …
-        with open(text_duplicate_counts_df_fid, "rb") as f:
-            self.text_dup_counts_df = feather.read_feather(f)
+        print('deddup df')
+        self.dup_counts_df[OUR_TEXT_FIELD] = self.dup_counts_df.index.copy()
+        self.dedup_total = sum(self.dup_counts_df[CNT])
+        self.general_stats_dict = {
+            TOT_WORDS: self.total_words,
+            TOT_OPEN_WORDS: self.total_open_words,
+            TEXT_NAN_CNT: self.text_nan_count,
+            DEDUP_TOT: self.dedup_total,
+        }

     def load_or_prepare_dataset(self, use_cache=True, save=True):
         """
@@ -449,20 +458,24 @@
         Returns:

         """
-        …
-        self.…
+        logs.info("Doing text dset.")
+        self.load_or_prepare_text_dset(use_cache, save)
+        logs.info("Doing tokenized dataframe")
+        self.load_or_prepare_tokenized_df(use_cache, save)

-        …
+    def load_or_prepare_tokenized_df(self, use_cache, save):
         if (use_cache and exists(self.tokenized_df_fid)):
             self.tokenized_df = feather.read_feather(self.tokenized_df_fid)
         else:
             # tokenize all text instances
             self.tokenized_df = self.do_tokenization()
             if save:
+                logs.warning("Saving tokenized dataset to disk")
                 # save tokenized text
                 write_df(self.tokenized_df, self.tokenized_df_fid)

-    def load_or_prepare_text_dset(self, …
+    def load_or_prepare_text_dset(self, use_cache, save):
         if (use_cache and exists(self.text_dset_fid)):
             # load extracted text
             self.text_dset = load_from_disk(self.text_dset_fid)
@@ -557,11 +570,35 @@
             self.label_dset.save_to_disk(self.label_dset_fid)
             write_plotly(self.fig_labels, self.fig_labels_fid)

-    def …
-        …
+    def load_or_prepare_npmi_terms(self, use_cache=False):
+        self.npmi_stats = nPMIStatisticsCacheClass(self, use_cache=use_cache)
+        self.npmi_stats.load_or_prepare_npmi_terms()
+
+    def load_or_prepare_zipf(self, use_cache=False, save=True):
+        # TODO: Current UI only uses the fig, meaning the self.z here is irrelevant
+        # when only reading from cache. Either the UI should use it, or it should
+        # be removed when reading in cache
+        if use_cache and exists(self.zipf_fig_fid) and exists(self.zipf_fid):
+            with open(self.zipf_fid, "r") as f:
+                zipf_dict = json.load(f)
+            self.z = Zipf()
+            self.z.load(zipf_dict)
+            self.zipf_fig = read_plotly(self.zipf_fig_fid)
+        elif use_cache and exists(self.zipf_fid):
+            # TODO: Read zipf data so that the vocab is there.
+            with open(self.zipf_fid, "r") as f:
+                zipf_dict = json.load(f)
+            self.z = Zipf()
+            self.z.load(zipf_dict)
+            self.zipf_fig = make_zipf_fig(self.vocab_counts_df, self.z)
+            if save:
+                write_plotly(self.zipf_fig, self.zipf_fig_fid)
+        else:
+            self.z = Zipf(self.vocab_counts_df)
+            self.zipf_fig = make_zipf_fig(self.vocab_counts_df, self.z)
+            if save:
+                write_zipf_data(self.z, self.zipf_fid)
+                write_plotly(self.zipf_fig, self.zipf_fig_fid)

     def _set_idx_col_names(self, input_vocab_df):
         if input_vocab_df.index.name != VOCAB and VOCAB in input_vocab_df.columns:

data_measurements/dataset_utils.py
CHANGED
@@ -43,6 +43,8 @@ PROP = "proportion"
 TEXT_NAN_CNT = "text_nan_count"
 TXT_LEN = "text lengths"
 DEDUP_TOT = "dedup_total"
+TOT_WORDS = "total words"
+TOT_OPEN_WORDS = "total open words"

 _DATASET_LIST = [
     "c4",

data_measurements/streamlit_utils.py
CHANGED
@@ -102,32 +102,34 @@ def expander_header(dstats, ds_name_to_dict, column_id):
         st.dataframe(dstats.get_dataset_peek())


-def expander_general_stats(dstats, …
+def expander_general_stats(dstats, column_id):
     with st.expander(f"General Text Statistics{column_id}"):
         st.caption(
-            "Use this widget to check whether the terms you see most represented…
+            "Use this widget to check whether the terms you see most represented"
+            " in the dataset make sense for the goals of the dataset."
         )
         st.markdown(
-            "There are {0} total words".format(str(…
+            "There are {0} total words".format(str(dstats.total_words))
         )
         st.markdown(
             "There are {0} words after removing closed "
-            "class words".format(str(…
+            "class words".format(str(dstats.total_open_words))
         )
-        sorted_top_vocab_df = dstats.vocab_counts_filtered_df.sort_values(
-            "count", ascending=False
-        ).head(top_n)
         st.markdown(
-            "The most common …
+            "The most common "
+            "[open class words](https://dictionary.apa.org/open-class-words) "
+            "and their counts are: "
         )
-        st.dataframe(sorted_top_vocab_df)
+        st.dataframe(dstats.sorted_top_vocab_df)
         st.markdown(
             "There are {0} missing values in the dataset.".format(
                 str(dstats.text_nan_count)
             )
         )
         st.markdown(
-            "There are {0} duplicate items in the dataset.…
+            "There are {0} duplicate items in the dataset. "
+            "For more information about the duplicates, "
+            "click the 'Duplicates' tab below.".format(
                 str(dstats.dedup_total)
             )
         )
@@ -269,7 +271,8 @@ def expander_text_embeddings(


 ### Then, show duplicates
-def expander_text_duplicates(…
+def expander_text_duplicates(dstats, column_id):
+    # TODO: Saving/loading figure
     with st.expander(f"Text Duplicates{column_id}", expanded=False):
         st.caption(
             "Use this widget to identify text strings that appear more than once."
@@ -277,16 +280,15 @@ def expander_text_duplicates(dedup_df, column_id):
         st.markdown(
             "A model's training and testing may be negatively affected by unwarranted duplicates ([Lee et al., 2021](https://arxiv.org/abs/2107.06499))."
         )
-        dedup_df["count"] = dedup_df["count"] + 1
         st.markdown("------")
         st.write(
             "### Here is the list of all the duplicated items and their counts in your dataset:"
         )
         # Eh...adding 1 because otherwise it looks too weird for duplicate counts when the value is just 1.
-        if len(…
+        if len(dstats.dup_counts_df) == 0:
             st.write("There are no duplicates in this dataset! 🥳")
         else:
-            gb = GridOptionsBuilder.from_dataframe(…
+            gb = GridOptionsBuilder.from_dataframe(dstats.dup_counts_df)
             gb.configure_column(
                 f"text{column_id}",
                 wrapText=True,
@@ -296,7 +298,7 @@ def expander_text_duplicates(dedup_df, column_id):
                 use_container_width=True,
             )
             go = gb.build()
-            AgGrid(…
+            AgGrid(dstats.dup_counts_df, gridOptions=go)


 def expander_npmi_description(min_vocab):