Commit 0610f9d
Parent(s): 0319ee2
visualization: choose between several languages
Files changed:
- app.py (+115, -91)
- zh.arpa.bin (+3, -0)
- zh.sp.model (+3, -0)
- zh_examples_with_stats.json (+3, -0)
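In short: the old single-language `Visualization` class is renamed to `Visualization_for_lang` and keeps the per-language filtering UI (sliders, histograms, discarded/retained tables), while a new top-level `Visualization` class takes over the preamble, the speed warning, and a `st.selectbox` that lets the user pick a language before dispatching to the per-language view. The available languages are driven by a `param_visu_langs` dict built for `["en", "zh"]`, and the Chinese KenLM model, SentencePiece model, and example statistics are added as Git LFS files.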
    	
    app.py
    CHANGED

@@ -16,12 +16,12 @@ import numpy as np
 import matplotlib.pyplot as plt
 
 from filtering import LoadParameters, ModifyingDocuments, Filtering
+from languages_id import langs_id
 
 
-class Visualization:
+class Visualization_for_lang:
     def __init__(
         self,
-        path_instructions,
         path_data,
         lang,
         num_docs,
@@ -32,7 +32,6 @@ class Visualization:
         path_sentencepiece_model,
         path_kenlm_model,
     ):
-        self.path_instructions = path_instructions
        self.path_data = path_data
         self.lang = lang
         self.num_docs = num_docs
@@ -56,32 +55,8 @@ class Visualization:
             lang_dataset_id, path_kenlm_model
         )
 
-    def warning_preamble(self):
-        st.markdown(
-            "This demo can be a little slow, and only allows you to process up to 5000 documents "
-            "for a decent speed. If you want to display up to three times more documents and have "
-            "a faster visualization, we invite you to run this "
-            "[code](https://github.com/bigscience-workshop/data_tooling/tree/master/ac_dc/visualization) "
-            "on your computer."
-        )
-
-    def preamble(self):
-        def get_binary_file_downloader_html(bin_file, file_label="File"):
-            with open(bin_file, "rb") as f:
-                data = f.read()
-            bin_str = base64.b64encode(data).decode()
-            href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{os.path.basename(bin_file)}">{file_label}</a>'
-            return href
-
-        st.markdown(
-            "Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail in this "
-            + get_binary_file_downloader_html(
-                self.path_instructions,
-                "pdf",
-            )
-            + ".",
-            unsafe_allow_html=True,
-        )
+    def set_title(self):
+        st.title(f"Filtering visualization for {self.lang}")
 
     def open_data(self):
         with open(self.path_data) as json_file:
@@ -109,9 +84,6 @@ class Visualization:
         self.docs_checkpoint = pd.DataFrame(docs)
         self.docs = self.docs_checkpoint
 
-    def set_title(self):
-        st.title(f"Filtering visualization")
-
     @staticmethod
     def print_discarded_by_cond(cond):
         st.caption(
@@ -169,9 +141,9 @@ class Visualization:
                 )
                 new_key = ("number_words", cutoff_min_number_words, False)
                 keys.append(new_key)
-                Visualization.plot_hist(self.docs, new_key)
+                Visualization_for_lang.plot_hist(self.docs, new_key)
                 cond_1 = get_cond(new_key[0], new_key[1], new_key[2])
-                Visualization.print_discarded_by_cond(cond_1)
+                Visualization_for_lang.print_discarded_by_cond(cond_1)
 
                 cutoff_def = "If the number of words of a document is higher than this number, the document is removed."
                 cutoff_max_number_words = st.slider(
@@ -180,7 +152,7 @@ class Visualization:
                 new_key = ("number_words", cutoff_max_number_words, True)
                 keys.append(new_key)
                 cond_2 = get_cond(new_key[0], new_key[1], new_key[2])
-                Visualization.print_discarded_by_cond(cond_2)
+                Visualization_for_lang.print_discarded_by_cond(cond_2)
 
                 conds["number_words"] = [cond_1, cond_2]
 
@@ -226,9 +198,9 @@ class Visualization:
                     repetitions_length,
                 )
                 keys.append(new_key)
-                Visualization.plot_hist(self.docs, new_key)
+                Visualization_for_lang.plot_hist(self.docs, new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
-                Visualization.print_discarded_by_cond(cond)
+                Visualization_for_lang.print_discarded_by_cond(cond)
                 conds["repetitions_ratio"] = [cond]
 
         if "special_characters_ratio" in columns:
@@ -243,9 +215,9 @@ class Visualization:
                     True,
                 )
                 keys.append(new_key)
-                Visualization.plot_hist(self.docs, new_key)
+                Visualization_for_lang.plot_hist(self.docs, new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
-                Visualization.print_discarded_by_cond(cond)
+                Visualization_for_lang.print_discarded_by_cond(cond)
                 conds["special_characters_ratio"] = [cond]
 
         if "stopwords_ratio" in columns:
@@ -279,9 +251,9 @@ class Visualization:
                 )
                 new_key = ("stopwords_ratio", cutoff_stopwords_ratio, False)
                 keys.append(new_key)
-                Visualization.plot_hist(self.docs, new_key)
+                Visualization_for_lang.plot_hist(self.docs, new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
-                Visualization.print_discarded_by_cond(cond)
+                Visualization_for_lang.print_discarded_by_cond(cond)
                 conds["stopwords_ratio"] = [cond]
 
         if "flagged_words_ratio" in columns:
@@ -316,9 +288,9 @@ class Visualization:
                 )
                 new_key = ("flagged_words_ratio", cutoff_flagged_words_ratio, True)
                 keys.append(new_key)
-                Visualization.plot_hist(self.docs, new_key)
+                Visualization_for_lang.plot_hist(self.docs, new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
-                Visualization.print_discarded_by_cond(cond)
+                Visualization_for_lang.print_discarded_by_cond(cond)
                 conds["flagged_words_ratio"] = [cond]
 
         if "lang_id_score" in columns:
@@ -329,9 +301,9 @@ class Visualization:
                 )
                 new_key = ("lang_id_score", cutoff_lang_id_score, False)
                 keys.append(new_key)
-                Visualization.plot_hist(self.docs, new_key)
+                Visualization_for_lang.plot_hist(self.docs, new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
-                Visualization.print_discarded_by_cond(cond)
+                Visualization_for_lang.print_discarded_by_cond(cond)
                 conds["lang_id_score"] = [cond]
 
         if "perplexity_score" in columns:
@@ -341,9 +313,9 @@ class Visualization:
                 cutoff_perplexity_score = st.slider(cutoff_def, 0, max_pp, max_pp)
                 new_key = ("perplexity_score", cutoff_perplexity_score, True)
                 keys.append(new_key)
-                Visualization.plot_hist(self.docs, new_key)
+                Visualization_for_lang.plot_hist(self.docs, new_key)
                 cond = get_cond(new_key[0], new_key[1], new_key[2])
-                Visualization.print_discarded_by_cond(cond)
+                Visualization_for_lang.print_discarded_by_cond(cond)
                 conds["perplexity_score"] = [cond]
 
         return keys, conds
@@ -361,7 +333,7 @@ class Visualization:
             f"Filtering on documents, for {self.num_docs} {self.lang} documents"
         )
 
-        Visualization.display_dataset(
+        Visualization_for_lang.display_dataset(
             self.docs, np.invert(all_conds), "Discarded documents", "docs"
         )
 
@@ -375,7 +347,7 @@ class Visualization:
 
             if "number_words" in columns:
                 cond_filter = np.invert(np.all(conds["number_words"], axis=0))
-                Visualization.display_dataset(
+                Visualization_for_lang.display_dataset(
                     self.docs,
                     cond_filter,
                     "Discarded documents for the filter on the number of words",
@@ -384,7 +356,7 @@ class Visualization:
 
             if "repetitions_ratio" in columns:
                 cond_filter = np.invert(np.all(conds["repetitions_ratio"], axis=0))
-                Visualization.display_dataset(
+                Visualization_for_lang.display_dataset(
                     self.docs,
                     cond_filter,
                     "Discarded documents for the filter on the repetitions ratio",
@@ -395,7 +367,7 @@ class Visualization:
                 cond_filter = np.invert(
                     np.all(conds["special_characters_ratio"], axis=0)
                 )
-                Visualization.display_dataset(
+                Visualization_for_lang.display_dataset(
                     self.docs,
                     cond_filter,
                     "Discarded documents for the filter on the special characters ratio",
@@ -404,7 +376,7 @@ class Visualization:
 
             if "stopwords_ratio" in columns:
                 cond_filter = np.invert(np.all(conds["stopwords_ratio"], axis=0))
-                Visualization.display_dataset(
+                Visualization_for_lang.display_dataset(
                     self.docs,
                     cond_filter,
                     "Discarded documents for the filter on the stop words ratio",
@@ -415,7 +387,7 @@ class Visualization:
                 cond_filter = np.invert(
                     np.all(conds["flagged_words_ratio"], axis=0)
                 )
-                Visualization.display_dataset(
+                Visualization_for_lang.display_dataset(
                     self.docs,
                     cond_filter,
                     "Discarded documents for the filter on the flagged words ratio",
@@ -424,7 +396,7 @@ class Visualization:
 
             if "lang_id_score" in columns:
                 cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0))
-                Visualization.display_dataset(
+                Visualization_for_lang.display_dataset(
                     self.docs,
                     cond_filter,
                     "Discarded documents for the filter on the language identification confidence score",
@@ -433,14 +405,14 @@ class Visualization:
 
             if "perplexity_score" in columns:
                 cond_filter = np.invert(np.all(conds["perplexity_score"], axis=0))
-                Visualization.display_dataset(
+                Visualization_for_lang.display_dataset(
                     self.docs,
                     cond_filter,
                     "Discarded documents for the filter on the perplexity score",
                     "docs",
                 )
 
-        Visualization.display_dataset(
+        Visualization_for_lang.display_dataset(
            self.docs, all_conds, "Retained documents", "docs"
         )
 
@@ -468,9 +440,9 @@ class Visualization:
                 cutoff_word = st.slider(cutoff_def, 0, max_len_word, max_len_word)
                 new_key = ("len_word", cutoff_word, True)
                 self.parameters.append(new_key)
-                Visualization.plot_hist(self.words, new_key)
+                Visualization_for_lang.plot_hist(self.words, new_key)
                 cond_len_words = self.words["len_word"] <= cutoff_word
-                Visualization.print_discarded_by_cond(cond_len_words)
+                Visualization_for_lang.print_discarded_by_cond(cond_len_words)
                 conds_words["len_word"] = cond_len_words
 
         if "incorrect_substrings" in columns:
@@ -509,7 +481,7 @@ class Visualization:
                             for i in range(len(self.words["incorrect_substrings"]))
                         ]
                     )
-                Visualization.print_discarded_by_cond(cond_incorrect_substrings)
+                Visualization_for_lang.print_discarded_by_cond(cond_incorrect_substrings)
                 conds_words["incorrect_substrings"] = cond_incorrect_substrings
 
         all_conds_words = np.all(list(conds_words.values()), axis=0)
@@ -526,7 +498,7 @@ class Visualization:
                 f"we consider in this section words for only {self.num_docs_for_words} documents."
             )
 
-            Visualization.display_dataset(
+            Visualization_for_lang.display_dataset(
                 self.words, np.invert(all_conds_words), "Discarded words", "words"
             )
 
@@ -539,7 +511,7 @@ class Visualization:
 
                 if "len_word" in columns:
                     cond_filter = np.invert(conds_words["len_word"])
-                    Visualization.display_dataset(
+                    Visualization_for_lang.display_dataset(
                         self.words,
                         cond_filter,
                         "Discarded words for the filter on length",
@@ -548,14 +520,14 @@ class Visualization:
 
                 if "incorrect_substrings" in columns:
                     cond_filter = np.invert(conds_words["incorrect_substrings"])
-                    Visualization.display_dataset(
+                    Visualization_for_lang.display_dataset(
                         self.words,
                         cond_filter,
                         "Discarded words for the filter on incorrect substrings",
                         "words",
                     )
 
-            Visualization.display_dataset(
+            Visualization_for_lang.display_dataset(
                 self.words, all_conds_words, "Retained words", "words"
             )
 
@@ -709,40 +681,92 @@ class Visualization:
                 f"With the current filtering parameters, this document **is {is_discarded}discarded**."
             )
 
-    def visualization(self):
-        self.warning_preamble()
-        self.preamble()
-        self.open_data()
+    def visualization_for_lang(self):
         self.set_title()
+        self.open_data()
         self.filtering_of_docs()
         self.filtering_of_words()
         self.download_parameters()
         self.analyse_personal_doc()
 
 
+class Visualization:
+    def __init__(self, path_instructions, param_visu_langs):
+        self.path_instructions = path_instructions
+        self.param_visu_langs = param_visu_langs
+
+    def preamble(self):
+        def get_binary_file_downloader_html(bin_file, file_label="File"):
+            with open(bin_file, "rb") as f:
+                data = f.read()
+            bin_str = base64.b64encode(data).decode()
+            href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{os.path.basename(bin_file)}">{file_label}</a>'
+            return href
+
+        st.markdown(
+            "Before diving into this demo, you might want to take a look at how the filtering pipeline looks like in more detail in this "
+            + get_binary_file_downloader_html(
+                self.path_instructions,
+                "pdf",
+            )
+            + ".",
+            unsafe_allow_html=True,
+        )
+
+    def warning_preamble(self):
+        st.markdown(
+            "This demo can be a little slow, and only allows you to process up to 5000 documents "
+            "for a decent speed. If you want to display up to three times more documents and have "
+            "a faster visualization, we invite you to run this "
+            "[code](https://github.com/bigscience-workshop/data_tooling/tree/master/ac_dc/visualization) "
+            "on your computer."
+        )
+
+    def choose_lang(self):
+        options = [self.param_visu_langs[lang_dataset_id]["lang"] for lang_dataset_id in self.param_visu_langs]
+        index = options.index("English") if ("English" in options) else 0
+        lang_chosen = st.selectbox(
+            label="Select the language for visualization",
+            options=options,
+            index=index,
+        )
+        if lang_chosen != "None":
+            lang_chosen_dataset_id = langs_id.loc[langs_id["lang"] == lang_chosen, "dataset_id"].iloc[0]
+            visualization_for_lang = Visualization_for_lang(
+                path_data = self.param_visu_langs[lang_chosen_dataset_id]["path_data"],
+                lang = self.param_visu_langs[lang_chosen_dataset_id]["lang"],
+                num_docs = self.param_visu_langs[lang_chosen_dataset_id]["num_docs"],
+                num_docs_for_words = self.param_visu_langs[lang_chosen_dataset_id]["num_docs_for_words"],
+                max_len_text_display = self.param_visu_langs[lang_chosen_dataset_id]["max_len_text_display"],
+                lang_dataset_id = self.param_visu_langs[lang_chosen_dataset_id]["lang_dataset_id"],
+                path_fasttext_model = self.param_visu_langs[lang_chosen_dataset_id]["path_fasttext_model"],
+                path_sentencepiece_model = self.param_visu_langs[lang_chosen_dataset_id]["path_sentencepiece_model"],
+                path_kenlm_model = self.param_visu_langs[lang_chosen_dataset_id]["path_kenlm_model"],
+            )
+            visualization_for_lang.visualization_for_lang()
+
+    def visualization(self):
+        self.preamble()
+        self.warning_preamble()
+        self.choose_lang()
+
+
 path_instructions = "./explanation_filtering_pipeline.pdf"
-
-path_data = "./en_examples_with_stats.json"
-lang = "English"
-num_docs = 5000
-num_docs_for_words = 500
-max_len_text_display = 10000
-lang_dataset_id = "en"
-path_fasttext_model = "./lid.176.bin"
-path_sentencepiece_model = "./en.sp.model"
-path_kenlm_model = "./en.arpa.bin"
-
-
-visualization = Visualization(
-    path_instructions,
-    path_data,
-    lang,
-    num_docs,
-    num_docs_for_words,
-    max_len_text_display,
-    lang_dataset_id,
-    path_fasttext_model,
-    path_sentencepiece_model,
-    path_kenlm_model,
-)
+
+param_visu_langs = {
+    lang_dataset_id: {
+        "path_data": f"./{lang_dataset_id}_examples_with_stats.json",
+        "lang": langs_id.loc[langs_id["dataset_id"] == lang_dataset_id, "lang"].iloc[0],
+        "num_docs": 5000,
+        "num_docs_for_words": 500,
+        "max_len_text_display": 10000,
+        "lang_dataset_id": lang_dataset_id,
+        "path_fasttext_model": "./lid.176.bin",
+        "path_sentencepiece_model": f"./{lang_dataset_id}.sp.model",
+        "path_kenlm_model": f"./{lang_dataset_id}.arpa.bin",
+    }
+    for lang_dataset_id in ["en", "zh"]
+}
+
+visualization = Visualization(path_instructions, param_visu_langs)
 visualization.visualization()
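The dict comprehension at the bottom of the new app.py is now the single registration point for languages. As a minimal sketch of how a further language would be wired in (hypothetical: only "en" and "zh" ship with this commit, and the corresponding fr_* data and model files would have to exist), one would only widen the list the comprehension iterates over:

# Hypothetical sketch: registering a third language. Only "en" and "zh"
# are shipped by this commit; "fr" and its files are assumptions.
from languages_id import langs_id  # maps dataset ids to language names

param_visu_langs = {
    lang_dataset_id: {
        "path_data": f"./{lang_dataset_id}_examples_with_stats.json",
        "lang": langs_id.loc[langs_id["dataset_id"] == lang_dataset_id, "lang"].iloc[0],
        "num_docs": 5000,
        "num_docs_for_words": 500,
        "max_len_text_display": 10000,
        "lang_dataset_id": lang_dataset_id,
        "path_fasttext_model": "./lid.176.bin",  # fastText LID model is shared
        "path_sentencepiece_model": f"./{lang_dataset_id}.sp.model",
        "path_kenlm_model": f"./{lang_dataset_id}.arpa.bin",
    }
    for lang_dataset_id in ["en", "zh", "fr"]  # "fr" added for illustration
}

`choose_lang` would then pick up the new entry automatically, since its selectbox options are read from this dict.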
    	
    zh.arpa.bin
    ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f157d94cb2828bbb44b5dddf38e7eb7f62a47d317917646a73fe2af50a3dad68
+size 3392018416
    	
    zh.sp.model
    ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b30b883dfac9927edeb1fba8894ebc8ca4452aa3e26fb4ff3ff0e653ba011db7
+size 1366946
    	
    zh_examples_with_stats.json
    ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90ffaf5e5c7b556587c8b2b97ad49c752bea5608d5cc56b7ea03fb0d96a71fd2
+size 62914634
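The three ADDED files are Git LFS pointer stubs rather than the binaries themselves: each records only the spec version, a SHA-256 object id, and the payload size (about 3.4 GB for the zh KenLM model), with the real content materialized by git-lfs at checkout. A minimal sketch of reading those fields from a pointer as it appears in the repository (parse_lfs_pointer is a hypothetical helper, not part of this commit):

# Parse a Git LFS pointer file into its three "key value" fields.
def parse_lfs_pointer(path):
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

pointer = parse_lfs_pointer("zh.arpa.bin")
print(pointer["oid"])                    # sha256:f157d9...
print(int(pointer["size"]) / 1e9, "GB")  # ~3.39 GB for the zh KenLM model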

