Commit bfbcd60
Parent(s): 649ea6a

button to download parameters

Files changed:
- app.py +114 -88
- explanation_filtering_pipeline.pdf +0 -0
app.py
CHANGED
@@ -162,9 +162,7 @@ class Visualization:
             if "10" in val_repetitions_lengths
             else 0
         )
-        label_selectbox = (
-            "Length of the repetitions (that will determine the repetitions ratio)."
-        )
+        label_selectbox = "Length of the repetitions (that will determine the repetitions ratio)."
         repetitions_length = st.sidebar.selectbox(
             label=label_selectbox,
             options=val_repetitions_lengths,
@@ -261,6 +259,7 @@ class Visualization:
             return keys, conds

         self.keys, conds = set_sliders()
+        self.parameters = self.keys * 1

         all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
         all_conds = np.all(all_conds, axis=0)
@@ -347,10 +346,14 @@ class Visualization:
         cutoff_def = "If the length of a word is higher than this number, the word is removed."
         max_len_word = min(int(np.max(self.words["len_word"])) + 1, 200)
         cutoff_word = st.sidebar.slider(cutoff_def, 0, max_len_word, max_len_word)
+        self.parameters.append(("len_word", cutoff_word, True))
+        st.sidebar.caption("---------")

         incorrect_substrings = st.sidebar.checkbox(
             "Remove words with incorrect substrings."
         )
+        self.parameters.append(("incorrect_substrings", incorrect_substrings))
+        st.sidebar.caption("---------")

         cond_words = self.words["len_word"] <= cutoff_word
         if incorrect_substrings:
@@ -381,6 +384,13 @@ class Visualization:
             )
             st.dataframe(retained_words)

+    def download_parameters(self):
+        btn = st.sidebar.download_button(
+            label="Download current parameters as json",
+            data=json.dumps(self.parameters),
+            file_name=f"parameters_{self.lang_dataset_id}.json",
+        )
+
     def plot_distributions_filtering_parameters(self):
         st.header("Distributions of the filtering parameters")

@@ -437,94 +447,109 @@ class Visualization:
         is_discarded = False

         def is_doc_discarded(key, score):
-            if key[2]:
+            if key[2]:  # max cutoff
                 return score > key[1]
             else:
                 return score < key[1]

-        [old lines 445-527: replaced block whose content is not preserved in this rendering]
+        if personal_doc:
+
+            st.markdown("Statistics of the document:")
+
+            for key in self.keys:
+                if key[0] == "number_words":
+                    words = ModifyingDocuments.get_words_from_document(
+                        personal_doc,
+                        self.sentencepiece_model_tok,
+                        lower_case=False,
+                        strip_characters=self.param["strip_characters"],
+                    )
+                    if key[2]:
+                        st.markdown(f"Number of words: {len(words)}")
+                    if is_doc_discarded(key, len(words)):
+                        is_discarded = True
+
+                elif key[0] == "repetitions_ratio":
+                    repetitions_ratio = Filtering.compute_repetitions_ratio(
+                        personal_doc, int(key[3])
+                    )
+                    repetitions_ratio = round(repetitions_ratio, 3)
+                    st.markdown(f"Repetitions ratio: {repetitions_ratio}")
+                    if is_doc_discarded(key, repetitions_ratio):
+                        is_discarded = True
+
+                elif key[0] == "special_characters_ratio":
+                    special_characters_ratio = (
+                        Filtering.compute_special_characters_ratio(
+                            personal_doc, self.param["special_characters"]
+                        )
+                    )
+                    special_characters_ratio = round(special_characters_ratio, 3)
+                    st.markdown(f"Special characters ratio: {special_characters_ratio}")
+                    if is_doc_discarded(key, special_characters_ratio):
+                        is_discarded = True
+
+                elif key[0] == "stopwords_ratio":
+                    stopwords_ratio = Filtering.compute_stopwords_ratio(
+                        personal_doc,
+                        self.sentencepiece_model_tok,
+                        self.param["strip_characters"],
+                        self.param["cond_words_augmentation"],
+                        self.param["words_augmentation_group_sizes"],
+                        self.param["words_augmentation_join_char"],
+                        self.stopwords,
+                    )
+                    stopwords_ratio = round(stopwords_ratio, 3)
+                    st.markdown(f"Stop words ratio: {stopwords_ratio}")
+                    if is_doc_discarded(key, stopwords_ratio):
+                        is_discarded = True
+
+                elif key[0] == "badwords_ratio":
+                    badwords_ratio = Filtering.compute_badwords_ratio(
+                        personal_doc,
+                        self.sentencepiece_model_tok,
+                        self.param["strip_characters"],
+                        self.param["cond_words_augmentation"],
+                        self.param["words_augmentation_group_sizes"],
+                        self.param["words_augmentation_join_char"],
+                        self.badwords,
+                    )
+                    badwords_ratio = round(badwords_ratio, 3)
+                    st.markdown(f"Flagged words ratio: {badwords_ratio}")
+                    if is_doc_discarded(key, badwords_ratio):
+                        is_discarded = True
+
+                elif key[0] == "lang_id_score":
+                    (
+                        lang_pred_dataset_id,
+                        lang_id_score,
+                    ) = Filtering.compute_lang_id_pred_score(
+                        personal_doc, self.model_lang_id
+                    )
+                    lang_id_score = round(lang_id_score, 3)
+                    st.markdown(
+                        f"Language identification confidence score: {lang_id_score}"
+                    )
+                    if is_doc_discarded(key, badwords_ratio) or (
+                        self.lang_dataset_id != lang_pred_dataset_id
+                    ):
+                        is_discarded = True
+
+                elif key[0] == "perplexity_score":
+                    perplexity_score = Filtering.compute_perplexity_score(
+                        personal_doc,
+                        self.sentencepiece_model,
+                        self.kenlm_model,
+                    )
+                    perplexity_score = round(perplexity_score, 3)
+                    st.markdown(f"Perplexity score: {perplexity_score}")
+                    if is_doc_discarded(key, perplexity_score):
+                        is_discarded = True
+
+            is_discarded = "" if is_discarded else "not "
+            st.markdown(
+                f"With the current filtering parameters, this document **is {is_discarded}discarded**."
+            )

     def download_data(self):
         st.header("Download data")
@@ -543,8 +568,9 @@ class Visualization:
         self.set_title()
         self.filtering_of_docs()
         self.filtering_of_words()
+        self.download_parameters()
         self.plot_distributions_filtering_parameters()
-        #self.plot_zipf_law()
+        # self.plot_zipf_law()
         self.analyse_personal_doc()
         self.download_data()
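A note on the exported file: json.dumps(self.parameters) serializes the key tuples built by set_sliders() (copied with self.keys * 1) plus the two word-level entries appended in filtering_of_words(). As is_doc_discarded shows, the document-level tuples start with (name, cutoff, max_cutoff_flag); repetitions_ratio additionally carries its repetition length in key[3], and the incorrect_substrings entry is just (name, flag) with no cutoff. (One apparent slip in the new code: the lang_id_score branch tests is_doc_discarded(key, badwords_ratio) rather than lang_id_score.) Below is a minimal sketch of loading such a file back and re-applying the cutoffs; the field layout and the parameters_{lang}.json name pattern come from the diff, while load_parameters, the example scores, and the "en" id are hypothetical.

import json

def is_doc_discarded(key, score):
    # Mirror of the helper in app.py: key = (name, cutoff, max_cutoff_flag, ...).
    if key[2]:  # max cutoff: discard when the score exceeds the threshold
        return score > key[1]
    return score < key[1]  # min cutoff: discard when the score falls below it

def load_parameters(path):
    # Hypothetical helper: the tuples were written out as JSON arrays,
    # so they come back as plain lists, which index the same way.
    with open(path) as f:
        return json.load(f)

# Made-up scores for one document, keyed by criterion name.
scores = {"repetitions_ratio": 0.15, "special_characters_ratio": 0.32}

# "parameters_en.json" assumes lang_dataset_id == "en".
for key in load_parameters("parameters_en.json"):
    name = key[0]
    # Skip entries without a cutoff, e.g. ["incorrect_substrings", flag].
    if name in scores and len(key) >= 3 and is_doc_discarded(key, scores[name]):
        print(f"Document discarded by {name} (cutoff {key[1]})")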
explanation_filtering_pipeline.pdf
CHANGED

Binary files a/explanation_filtering_pipeline.pdf and b/explanation_filtering_pipeline.pdf differ