meg-huggingface committed · Commit 14e5c2a · 1 Parent: 1a4c18a
Try..except catching for errors

Files changed:
- app.py (+48, -18)
- data_measurements/streamlit_utils.py (+67, -64)
app.py
CHANGED

@@ -150,25 +150,55 @@ def load_or_prepare_widgets(ds_args, show_embeddings, use_cache=False):
         mkdir(CACHE_DIR)
     if use_cache:
         logs.warning("Using cache")
-    dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
-    # Don't recalculate; we're live
-    dstats.set_deployment(True)
-    # We need to have the text_dset loaded for further load_or_prepare
-    dstats.load_or_prepare_dataset()
-    # Header widget
-    dstats.load_or_prepare_dset_peek()
-    # General stats widget
-    dstats.load_or_prepare_general_stats()
-    # Labels widget
-    dstats.load_or_prepare_labels()
-    # Text lengths widget
-    dstats.load_or_prepare_text_lengths()
+    try:
+        dstats = dataset_statistics.DatasetStatisticsCacheClass(CACHE_DIR, **ds_args, use_cache=use_cache)
+        # Don't recalculate; we're live
+        dstats.set_deployment(True)
+    except:
+        logs.warning("We're screwed")
+    try:
+        # We need to have the text_dset loaded for further load_or_prepare
+        dstats.load_or_prepare_dataset()
+    except:
+        logs.warning("Missing a cache for load or prepare dataset")
+    try:
+        # Header widget
+        dstats.load_or_prepare_dset_peek()
+    except:
+        logs.warning("Missing a cache for dset peek")
+    try:
+        # General stats widget
+        dstats.load_or_prepare_general_stats()
+    except:
+        logs.warning("Missing a cache for general stats")
+    try:
+        # Labels widget
+        dstats.load_or_prepare_labels()
+    except:
+        logs.warning("Missing a cache for prepare labels")
+    try:
+        # Text lengths widget
+        dstats.load_or_prepare_text_lengths()
+    except:
+        logs.warning("Missing a cache for text lengths")
     if show_embeddings:
-        # Embeddings widget
-        dstats.load_or_prepare_embeddings()
-    dstats.load_or_prepare_text_duplicates()
-    dstats.load_or_prepare_npmi()
-    dstats.load_or_prepare_zipf()
+        try:
+            # Embeddings widget
+            dstats.load_or_prepare_embeddings()
+        except:
+            logs.warning("Missing a cache for embeddings")
+    try:
+        dstats.load_or_prepare_text_duplicates()
+    except:
+        logs.warning("Missing a cache for text duplicates")
+    try:
+        dstats.load_or_prepare_npmi()
+    except:
+        logs.warning("Missing a cache for npmi")
+    try:
+        dstats.load_or_prepare_zipf()
+    except:
+        logs.warning("Missing a cache for zipf")
     return dstats
 
 def show_column(dstats, ds_name_to_dict, show_embeddings, column_id):
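A note on the pattern above: the bare except: clauses catch everything, including the NameError that follows when dstats was never assigned because the first block failed. A minimal sketch of a tighter variant, assuming the same module-level logs logger and assuming missing caches surface as OSError (the exception type and the try_load helper are illustrative, not part of this commit):

import logging

logs = logging.getLogger(__name__)

def try_load(step_fn, description):
    # Run one load_or_prepare step; log and continue if its cache is missing.
    # OSError (which covers FileNotFoundError) is an assumed failure mode;
    # a bare except would also hide real bugs in step_fn itself.
    try:
        step_fn()
    except OSError as err:
        logs.warning("Missing a cache for %s: %s", description, err)

# Hypothetical usage inside load_or_prepare_widgets:
#   try_load(dstats.load_or_prepare_dataset, "load or prepare dataset")
#   try_load(dstats.load_or_prepare_dset_peek, "dset peek")

Each widget then degrades independently, which is the behavior the commit is after, with less repetition.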
data_measurements/streamlit_utils.py
CHANGED

@@ -319,72 +319,75 @@ def expander_npmi_description(min_vocab):
 
 ### Finally, show Zipf stuff
 def expander_zipf(z, zipf_fig, column_id):
-    _ZIPF_CAPTION = """This shows how close the observed language is to an ideal
-    natural language distribution following [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law),
-    calculated by minimizing the [Kolmogorov-Smirnov (KS) statistic](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test)."""
-
-    powerlaw_eq = r"""p(x) \propto x^{- \alpha}"""
-    zipf_summary = (
-        "The optimal alpha based on this dataset is: **"
-        + str(round(z.alpha, 2))
-        + "**, with a KS distance of: **"
-        + str(round(z.distance, 2))
-    )
-    zipf_summary += (
-        "**. This was fit with a minimum rank value of: **"
-        + str(int(z.xmin))
-        + "**, which is the optimal rank *beyond which* the scaling regime of the power law fits best."
-    )
-
-    alpha_warning = "Your alpha value is a bit on the high side, which means that the distribution over words in this dataset is a bit unnatural. This could be due to non-language items throughout the dataset."
-    xmin_warning = "The minimum rank for this fit is a bit on the high side, which means that the frequencies of your most common words aren't distributed as would be expected by Zipf's law."
-    fit_results_table = pd.DataFrame.from_dict(
-        {
-            r"Alpha:": [str("%.2f" % z.alpha)],
-            "KS distance:": [str("%.2f" % z.distance)],
-            "Min rank:": [str("%s" % int(z.xmin))],
-        },
-        columns=["Results"],
-        orient="index",
-    )
-    fit_results_table.index.name = column_id
     with st.expander(
         f"Vocabulary Distribution{column_id}: Zipf's Law Fit", expanded=False
     ):
-        st.caption(
-            "Use this widget for the counts of different words in your dataset, measuring the difference between the observed count and the expected count under Zipf's law."
-        )
-        st.markdown(_ZIPF_CAPTION)
-        st.write(
-            """
-        A Zipfian distribution follows the power law: $p(x) \propto x^{-α}$
-        with an ideal α value of 1."""
-        )
-        st.markdown(
-            "In general, an alpha greater than 2 or a minimum rank greater than 10 (take with a grain of salt) means that your distribution is relativaly _unnatural_ for natural language. This can be a sign of mixed artefacts in the dataset, such as HTML markup."
-        )
-        st.markdown(
-            "Below, you can see the counts of each word in your dataset vs. the expected number of counts following a Zipfian distribution."
-        )
-        st.markdown("-----")
-        st.write("### Here is your dataset's Zipf results:")
-        st.dataframe(fit_results_table)
-        st.write(zipf_summary)
-        # TODO: Nice UI version of the content in the comments.
-        # st.markdown("\nThe KS test p-value is < %.2f" % z.ks_test.pvalue)
-        # if z.ks_test.pvalue < 0.01:
-        #     st.markdown(
-        #         "\n Great news! Your data fits a powerlaw with a minimum KS " "distance of %.4f" % z.distance)
-        # else:
-        #     st.markdown("\n Sadly, your data does not fit a powerlaw. =(")
-        # st.markdown("Checking the goodness of fit of our observed distribution")
-        # st.markdown("to the hypothesized power law distribution")
-        # st.markdown("using a Kolmogorov–Smirnov (KS) test.")
-        st.plotly_chart(zipf_fig, use_container_width=True)
-        if z.alpha > 2:
-            st.markdown(alpha_warning)
-        if z.xmin > 5:
-            st.markdown(xmin_warning)
+        try:
+            _ZIPF_CAPTION = """This shows how close the observed language is to an ideal
+            natural language distribution following [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law),
+            calculated by minimizing the [Kolmogorov-Smirnov (KS) statistic](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test)."""
+
+            powerlaw_eq = r"""p(x) \propto x^{- \alpha}"""
+            zipf_summary = (
+                "The optimal alpha based on this dataset is: **"
+                + str(round(z.alpha, 2))
+                + "**, with a KS distance of: **"
+                + str(round(z.distance, 2))
+            )
+            zipf_summary += (
+                "**. This was fit with a minimum rank value of: **"
+                + str(int(z.xmin))
+                + "**, which is the optimal rank *beyond which* the scaling regime of the power law fits best."
+            )
+
+            alpha_warning = "Your alpha value is a bit on the high side, which means that the distribution over words in this dataset is a bit unnatural. This could be due to non-language items throughout the dataset."
+            xmin_warning = "The minimum rank for this fit is a bit on the high side, which means that the frequencies of your most common words aren't distributed as would be expected by Zipf's law."
+            fit_results_table = pd.DataFrame.from_dict(
+                {
+                    r"Alpha:": [str("%.2f" % z.alpha)],
+                    "KS distance:": [str("%.2f" % z.distance)],
+                    "Min rank:": [str("%s" % int(z.xmin))],
+                },
+                columns=["Results"],
+                orient="index",
+            )
+            fit_results_table.index.name = column_id
+            st.caption(
+                "Use this widget for the counts of different words in your dataset, measuring the difference between the observed count and the expected count under Zipf's law."
+            )
+            st.markdown(_ZIPF_CAPTION)
+            st.write(
+                """
+            A Zipfian distribution follows the power law: $p(x) \propto x^{-α}$
+            with an ideal α value of 1."""
+            )
+            st.markdown(
+                "In general, an alpha greater than 2 or a minimum rank greater than 10 (take with a grain of salt) means that your distribution is relativaly _unnatural_ for natural language. This can be a sign of mixed artefacts in the dataset, such as HTML markup."
+            )
+            st.markdown(
+                "Below, you can see the counts of each word in your dataset vs. the expected number of counts following a Zipfian distribution."
+            )
+            st.markdown("-----")
+            st.write("### Here is your dataset's Zipf results:")
+            st.dataframe(fit_results_table)
+            st.write(zipf_summary)
+            # TODO: Nice UI version of the content in the comments.
+            # st.markdown("\nThe KS test p-value is < %.2f" % z.ks_test.pvalue)
+            # if z.ks_test.pvalue < 0.01:
+            #     st.markdown(
+            #         "\n Great news! Your data fits a powerlaw with a minimum KS " "distance of %.4f" % z.distance)
+            # else:
+            #     st.markdown("\n Sadly, your data does not fit a powerlaw. =(")
+            # st.markdown("Checking the goodness of fit of our observed distribution")
+            # st.markdown("to the hypothesized power law distribution")
+            # st.markdown("using a Kolmogorov–Smirnov (KS) test.")
+            st.plotly_chart(zipf_fig, use_container_width=True)
+            if z.alpha > 2:
+                st.markdown(alpha_warning)
+            if z.xmin > 5:
+                st.markdown(xmin_warning)
+        except:
+            st.write("Under construction!")
 
 
 ### Finally finally finally, show nPMI stuff.
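For reference while reading the widget text above: alpha is the exponent of the fitted power law p(x) ∝ x^(−α), xmin is the cutoff beyond which the power-law regime fits best, and the KS distance measures how far the observed distribution sits from the fit. This diff does not show how the z fit object is built; below is a minimal sketch of an equivalent fit with the powerlaw package (the package choice and the mapping onto alpha/xmin/distance are assumptions, not taken from this repo):

import numpy as np
import powerlaw  # pip install powerlaw

# Toy word counts, roughly Zipfian: count proportional to 1/rank.
word_counts = np.array([1000, 500, 333, 250, 200, 167, 143, 125, 111, 100])
fit = powerlaw.Fit(word_counts, discrete=True)
alpha = fit.power_law.alpha    # exponent of p(x) ∝ x^(-alpha)
xmin = fit.power_law.xmin      # fitted cutoff beyond which the power law holds
distance = fit.power_law.KS()  # Kolmogorov-Smirnov distance of the fit
print(f"alpha={alpha:.2f}, xmin={int(xmin)}, KS distance={distance:.2f}")

The widget flags a fit as suspicious when alpha > 2 or the minimum rank > 5, on the logic that natural language should sit near the ideal exponent the caption describes.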
@@ -427,7 +430,7 @@ def npmi_widget(npmi_stats, min_vocab, column_id):
 
 def npmi_show(paired_results):
     if paired_results.empty:
-        st.markdown("No words that co-occur enough times for results! Or there's a 🐛.")
+        st.markdown("No words that co-occur enough times for results! Or there's a 🐛. Or we're still computing this one. 🤷")
     else:
         s = pd.DataFrame(paired_results.sort_values(by="npmi-bias", ascending=True))
         # s.columns=pd.MultiIndex.from_arrays([['npmi','npmi','npmi','count', 'count'],['bias','man','straight','man','straight']])
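nPMI itself is not defined in this diff. The standard normalized PMI divides PMI by −log p(x, y), bounding scores to [−1, 1], and the commented-out MultiIndex above hints that "npmi-bias" is the gap between a word's nPMI scores for two identity terms. A minimal sketch under those assumptions (the helper and the probabilities are made up for illustration, not from the repo):

import math

def npmi(p_xy, p_x, p_y):
    # Normalized pointwise mutual information, in [-1, 1]:
    # npmi = log(p_xy / (p_x * p_y)) / -log(p_xy)
    if p_xy == 0:
        return -1.0  # the pair never co-occurs
    return math.log(p_xy / (p_x * p_y)) / -math.log(p_xy)

# A word that co-occurs with one identity term above chance (npmi > 0)
# and with the other exactly at chance (npmi = 0):
bias = npmi(0.002, 0.01, 0.05) - npmi(0.0005, 0.01, 0.05)
print(f"npmi-bias = {bias:.3f}")  # ~0.223

Sorting by that difference, as npmi_show does, surfaces the words whose associations are most skewed toward one of the two terms.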