Spaces: Runtime error
Commit · 6f25c5c
Parent(s): d463071
new tool to analyse our own doc
Files changed:
- .gitignore +2 -0
- app.py +132 -4
- parameters_filtering.py +2 -2
.gitignore ADDED
@@ -0,0 +1,2 @@
+*cpython-39.pyc
+.DS_Store
app.py CHANGED
@@ -13,7 +13,7 @@ import numpy as np
 
 import matplotlib.pyplot as plt
 
-from filtering import Filtering
+from filtering import LoadParameters, ModifyingDocuments, Filtering
 
 
 class Visualization:
@@ -25,6 +25,10 @@ class Visualization:
         num_docs,
         num_docs_for_words,
         max_len_text_display,
+        lang_dataset_id,
+        path_fasttext_model,
+        path_sentencepiece_model,
+        path_kenlm_model,
     ):
         self.path_instructions = path_instructions
         self.path_data = path_data
@@ -33,6 +37,23 @@ class Visualization:
         self.num_docs_for_words = num_docs_for_words
         self.max_len_text_display = max_len_text_display
 
+        self.lang_dataset_id = lang_dataset_id
+        self.param = LoadParameters.load_parameters(lang_dataset_id)
+        self.stopwords = LoadParameters.load_stopwords(lang_dataset_id)
+        self.badwords = LoadParameters.load_badwords(lang_dataset_id)
+        self.model_lang_id = LoadParameters.load_model_lang_id(
+            lang_dataset_id, path_fasttext_model
+        )
+        self.sentencepiece_model = LoadParameters.load_sentencepiece_model(
+            lang_dataset_id, path_sentencepiece_model
+        )
+        self.sentencepiece_model_tok = (
+            self.sentencepiece_model if self.param["tokenization"] else None
+        )
+        self.kenlm_model = LoadParameters.load_kenlm_model(
+            lang_dataset_id, path_kenlm_model
+        )
+
     def preamble(self):
         st.markdown(
             "Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail."
@@ -159,6 +180,7 @@ class Visualization:
                 "repetitions_ratio",
                 cutoff_repetitions_ratio,
                 True,
+                repetitions_length,
             )
             keys.append(new_key)
             cond = get_cond(new_key[0], new_key[1], new_key[2])
@@ -392,8 +414,104 @@ class Visualization:
         ax.set_ylabel("frequency in the documents")
         st.pyplot(fig)
 
-    def 
-
+    def analyse_personal_doc(self):
+        st.header("Analyse your own document")
+
+        personal_doc = st.text_area(
+            label="Paste here the document you want to analyse",
+            value="",
+            max_chars=10000,
+        )
+
+        is_discarded = False
+
+        def is_doc_discarded(key, score):
+            if key[2]:  # max cutoff
+                return score > key[1]
+            else:
+                return score < key[1]
+
+        for key in self.keys:
+            if key[0] == "number_words":
+                words = ModifyingDocuments.get_words_from_document(
+                    personal_doc,
+                    self.sentencepiece_model_tok,
+                    lower_case=False,
+                    strip_characters=self.param["strip_characters"],
+                )
+                if key[2]:
+                    st.markdown(f"Number of words: {len(words)}")
+                if is_doc_discarded(key, len(words)):
+                    is_discarded = True
+
+            elif key[0] == "repetitions_ratio":
+                repetitions_ratio = Filtering.compute_repetitions_ratio(personal_doc, int(key[3]))
+                repetitions_ratio = round(repetitions_ratio, 3)
+                st.markdown(f"Repetitions ratio: {repetitions_ratio}")
+                if is_doc_discarded(key, repetitions_ratio):
+                    is_discarded = True
+
+            elif key[0] == "special_characters_ratio":
+                special_characters_ratio = Filtering.compute_special_characters_ratio(
+                    personal_doc, self.param["special_characters"]
+                )
+                special_characters_ratio = round(special_characters_ratio, 3)
+                st.markdown(f"Special characters ratio: {special_characters_ratio}")
+                if is_doc_discarded(key, special_characters_ratio):
+                    is_discarded = True
+
+            elif key[0] == "stopwords_ratio":
+                stopwords_ratio = Filtering.compute_stopwords_ratio(
+                    personal_doc,
+                    self.sentencepiece_model_tok,
+                    self.param["strip_characters"],
+                    self.param["cond_words_augmentation"],
+                    self.param["words_augmentation_group_sizes"],
+                    self.param["words_augmentation_join_char"],
+                    self.stopwords,
+                )
+                stopwords_ratio = round(stopwords_ratio, 3)
+                st.markdown(f"Stop words ratio: {stopwords_ratio}")
+                if is_doc_discarded(key, stopwords_ratio):
+                    is_discarded = True
+
+            elif key[0] == "badwords_ratio":
+                badwords_ratio = Filtering.compute_badwords_ratio(
+                    personal_doc,
+                    self.sentencepiece_model_tok,
+                    self.param["strip_characters"],
+                    self.param["cond_words_augmentation"],
+                    self.param["words_augmentation_group_sizes"],
+                    self.param["words_augmentation_join_char"],
+                    self.badwords,
+                )
+                badwords_ratio = round(badwords_ratio, 3)
+                st.markdown(f"Flagged words ratio: {badwords_ratio}")
+                if is_doc_discarded(key, badwords_ratio):
+                    is_discarded = True
+
+            elif key[0] == "lang_id_score":
+                lang_pred_dataset_id, lang_id_score = Filtering.compute_lang_id_pred_score(
+                    personal_doc, self.model_lang_id
+                )
+                lang_id_score = round(lang_id_score, 3)
+                st.markdown(f"Language identification confidence score: {lang_id_score}")
+                if is_doc_discarded(key, lang_id_score) or (self.lang_dataset_id != lang_pred_dataset_id):
+                    is_discarded = True
+
+            elif key[0] == "perplexity_score":
+                perplexity_score = Filtering.compute_perplexity_score(
+                    personal_doc,
+                    self.sentencepiece_model,
+                    self.kenlm_model,
+                )
+                perplexity_score = round(perplexity_score, 3)
+                st.markdown(f"Perplexity score: {perplexity_score}")
+                if is_doc_discarded(key, perplexity_score):
+                    is_discarded = True
+
+        is_discarded = "" if is_discarded else "not "
+        st.markdown(f"With the current filtering parameters, this document is {is_discarded}discarded.")
 
     def download_data(self):
         st.header("Download data")
@@ -413,7 +531,7 @@ class Visualization:
         self.filtering_of_words()
         self.plot_distributions_filtering_parameters()
         #self.plot_zipf_law()
-        self.
+        self.analyse_personal_doc()
        self.download_data()
 
 
@@ -424,6 +542,12 @@ num_docs = 5000
 num_docs_for_words = 500
 max_len_text_display = 10000
 
+# Only useful for analyse_personal_doc
+lang_dataset_id = "en"
+path_fasttext_model = "./lid.176.bin"
+path_sentencepiece_model = "./en.sp.model"
+path_kenlm_model = "./en.arpa.bin"
+
 visualization = Visualization(
     path_instructions,
     path_data,
@@ -431,5 +555,9 @@ visualization = Visualization(
     num_docs,
     num_docs_for_words,
     max_len_text_display,
+    lang_dataset_id,
+    path_fasttext_model,
+    path_sentencepiece_model,
+    path_kenlm_model,
 )
 visualization.visualization()
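The new analyse_personal_doc section recomputes each filtering statistic (number of words, repetitions ratio, special characters ratio, stop words ratio, flagged words ratio, language identification score, perplexity) on a pasted document and checks it against the cutoffs chosen in the sidebar, which app.py stores as tuples in self.keys. The standalone sketch below isolates that cutoff rule; the key tuples follow the (metric_name, cutoff, is_max_cutoff) shape visible in the diff, but the cutoff and score values are illustrative placeholders, not the Space's real defaults.

    # Standalone sketch of the discard rule used by analyse_personal_doc.
    # key = (metric_name, cutoff, is_max_cutoff); a fourth element
    # (repetitions_length) is only present for "repetitions_ratio".
    def is_doc_discarded(key, score):
        if key[2]:  # max cutoff: discard when the score exceeds it
            return score > key[1]
        return score < key[1]  # min cutoff: discard when the score falls below it

    # Illustrative cutoffs only, not the Space's actual defaults.
    keys = [
        ("number_words", 10, False),              # fewer than 10 words -> discard
        ("special_characters_ratio", 0.4, True),  # more than 40% special characters -> discard
        ("perplexity_score", 1500, True),         # KenLM perplexity above 1500 -> discard
    ]
    scores = {"number_words": 6, "special_characters_ratio": 0.12, "perplexity_score": 830}

    is_discarded = any(is_doc_discarded(key, scores[key[0]]) for key in keys)
    print("discarded" if is_discarded else "kept")  # prints "discarded": only 6 words

Note that the new section also relies on the model files configured at the bottom of app.py: ./lid.176.bin is presumably fastText's pretrained language-identification model, and ./en.sp.model and ./en.arpa.bin would be the English SentencePiece and KenLM models used by the filtering pipeline; these files have to be available locally for analyse_personal_doc to work.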
parameters_filtering.py CHANGED
@@ -7,8 +7,8 @@ other_special_characters = (
     " ’“”–ー一▬…✦�£•€«»°·═"
     "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰‑≤≥‖"
     "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†上ン:∼⁄・♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
-    "
-    "
+    "゜ʼ≖ʼ¤ッツシ℃√!【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
+    "」﴾》"
 )
 emoji = list(emoji.UNICODE_EMOJI["en"].keys())
 
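For context, other_special_characters presumably contributes to the special-characters set that Filtering.compute_special_characters_ratio checks documents against; this commit replaces two lines of that list (whose previous contents did not render in the diff view) with an extended set of symbols. A simplified, hypothetical reading of that ratio, assuming it is just the share of characters belonging to the special set (the repository's actual implementation may differ), is:

    # Hypothetical illustration, not the repository's implementation: ratio of
    # characters in a document that belong to a "special characters" set.
    special_characters = set("▬★✦«»°·")  # abbreviated sample of the full list

    def special_characters_ratio(document: str, special_characters: set) -> float:
        if not document:
            return 0.0
        return sum(char in special_characters for char in document) / len(document)

    print(round(special_characters_ratio("Hello ▬▬▬ world ★★★", special_characters), 3))  # 0.316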