Upload processor (#8)
Browse files
- Upload processor (5794e69355c505dff2995927cbf2667005b76a44)
- added_tokens.json +0 -3
- preprocessor_config.json +98 -98
- special_tokens_map.json +5 -4
- tokenizer_config.json +5 -128
    	
        added_tokens.json
    CHANGED
    
    | @@ -1,8 +1,5 @@ | |
| 1 | 
             
            {
         | 
| 2 | 
             
              "</s>": 3,
         | 
| 3 | 
            -
              "<MINED_DATA>": 256099,
         | 
| 4 | 
            -
              "<MMT_BT_DATA>": 256100,
         | 
| 5 | 
            -
              "<SMT_BT_DATA>": 256101,
         | 
| 6 | 
             
              "<pad>": 0,
         | 
| 7 | 
             
              "<s>": 2,
         | 
| 8 | 
             
              "<unk>": 1,
         | 
|  | |
| 1 | 
             
            {
         | 
| 2 | 
             
              "</s>": 3,
         | 
|  | |
|  | |
|  | |
| 3 | 
             
              "<pad>": 0,
         | 
| 4 | 
             
              "<s>": 2,
         | 
| 5 | 
             
              "<unk>": 1,
         | 
    	
        preprocessor_config.json
    CHANGED
    
    | @@ -2,104 +2,104 @@ | |
| 2 | 
             
              "feature_extractor_type": "SeamlessM4TFeatureExtractor",
         | 
| 3 | 
             
              "feature_size": 80,
         | 
| 4 | 
             
              "language_code": [
         | 
| 5 | 
            -
                " | 
| 6 | 
            -
                " | 
| 7 | 
            -
                " | 
| 8 | 
            -
                " | 
| 9 | 
            -
                " | 
| 10 | 
            -
                " | 
| 11 | 
            -
                " | 
| 12 | 
            -
                " | 
| 13 | 
            -
                " | 
| 14 | 
            -
                " | 
| 15 | 
            -
                " | 
| 16 | 
            -
                " | 
| 17 | 
            -
                " | 
| 18 | 
            -
                " | 
| 19 | 
            -
                " | 
| 20 | 
            -
                " | 
| 21 | 
            -
                " | 
| 22 | 
            -
                " | 
| 23 | 
            -
                " | 
| 24 | 
            -
                " | 
| 25 | 
            -
                " | 
| 26 | 
            -
                " | 
| 27 | 
            -
                " | 
| 28 | 
            -
                " | 
| 29 | 
            -
                " | 
| 30 | 
            -
                " | 
| 31 | 
            -
                " | 
| 32 | 
            -
                " | 
| 33 | 
            -
                " | 
| 34 | 
            -
                " | 
| 35 | 
            -
                " | 
| 36 | 
            -
                " | 
| 37 | 
            -
                " | 
| 38 | 
            -
                " | 
| 39 | 
            -
                " | 
| 40 | 
            -
                " | 
| 41 | 
            -
                " | 
| 42 | 
            -
                " | 
| 43 | 
            -
                " | 
| 44 | 
            -
                " | 
| 45 | 
            -
                " | 
| 46 | 
            -
                " | 
| 47 | 
            -
                " | 
| 48 | 
            -
                " | 
| 49 | 
            -
                " | 
| 50 | 
            -
                " | 
| 51 | 
            -
                " | 
| 52 | 
            -
                " | 
| 53 | 
            -
                " | 
| 54 | 
            -
                " | 
| 55 | 
            -
                " | 
| 56 | 
            -
                " | 
| 57 | 
            -
                " | 
| 58 | 
            -
                " | 
| 59 | 
            -
                " | 
| 60 | 
            -
                " | 
| 61 | 
            -
                " | 
| 62 | 
            -
                " | 
| 63 | 
            -
                " | 
| 64 | 
            -
                " | 
| 65 | 
            -
                " | 
| 66 | 
            -
                " | 
| 67 | 
            -
                " | 
| 68 | 
            -
                " | 
| 69 | 
            -
                " | 
| 70 | 
            -
                " | 
| 71 | 
            -
                " | 
| 72 | 
            -
                " | 
| 73 | 
            -
                " | 
| 74 | 
            -
                " | 
| 75 | 
            -
                " | 
| 76 | 
            -
                " | 
| 77 | 
            -
                " | 
| 78 | 
            -
                " | 
| 79 | 
            -
                " | 
| 80 | 
            -
                " | 
| 81 | 
            -
                " | 
| 82 | 
            -
                " | 
| 83 | 
            -
                " | 
| 84 | 
            -
                " | 
| 85 | 
            -
                " | 
| 86 | 
            -
                " | 
| 87 | 
            -
                " | 
| 88 | 
            -
                " | 
| 89 | 
            -
                " | 
| 90 | 
            -
                " | 
| 91 | 
            -
                " | 
| 92 | 
            -
                " | 
| 93 | 
            -
                " | 
| 94 | 
            -
                " | 
| 95 | 
            -
                " | 
| 96 | 
            -
                " | 
| 97 | 
            -
                " | 
| 98 | 
            -
                " | 
| 99 | 
            -
                " | 
| 100 | 
            -
                " | 
| 101 | 
            -
                " | 
| 102 | 
            -
                " | 
| 103 | 
             
              ],
         | 
| 104 | 
             
              "num_mel_bins": 80,
         | 
| 105 | 
             
              "padding_side": "right",
         | 
|  | |
| 2 | 
             
              "feature_extractor_type": "SeamlessM4TFeatureExtractor",
         | 
| 3 | 
             
              "feature_size": 80,
         | 
| 4 | 
             
              "language_code": [
         | 
| 5 | 
            +
                "__afr__",
         | 
| 6 | 
            +
                "__amh__",
         | 
| 7 | 
            +
                "__arb__",
         | 
| 8 | 
            +
                "__ary__",
         | 
| 9 | 
            +
                "__arz__",
         | 
| 10 | 
            +
                "__asm__",
         | 
| 11 | 
            +
                "__azj__",
         | 
| 12 | 
            +
                "__bel__",
         | 
| 13 | 
            +
                "__ben__",
         | 
| 14 | 
            +
                "__bos__",
         | 
| 15 | 
            +
                "__bul__",
         | 
| 16 | 
            +
                "__cat__",
         | 
| 17 | 
            +
                "__ceb__",
         | 
| 18 | 
            +
                "__ces__",
         | 
| 19 | 
            +
                "__ckb__",
         | 
| 20 | 
            +
                "__cmn__",
         | 
| 21 | 
            +
                "__cmn_Hant__",
         | 
| 22 | 
            +
                "__cym__",
         | 
| 23 | 
            +
                "__dan__",
         | 
| 24 | 
            +
                "__deu__",
         | 
| 25 | 
            +
                "__ell__",
         | 
| 26 | 
            +
                "__eng__",
         | 
| 27 | 
            +
                "__est__",
         | 
| 28 | 
            +
                "__eus__",
         | 
| 29 | 
            +
                "__fin__",
         | 
| 30 | 
            +
                "__fra__",
         | 
| 31 | 
            +
                "__fuv__",
         | 
| 32 | 
            +
                "__gaz__",
         | 
| 33 | 
            +
                "__gle__",
         | 
| 34 | 
            +
                "__glg__",
         | 
| 35 | 
            +
                "__guj__",
         | 
| 36 | 
            +
                "__heb__",
         | 
| 37 | 
            +
                "__hin__",
         | 
| 38 | 
            +
                "__hrv__",
         | 
| 39 | 
            +
                "__hun__",
         | 
| 40 | 
            +
                "__hye__",
         | 
| 41 | 
            +
                "__ibo__",
         | 
| 42 | 
            +
                "__ind__",
         | 
| 43 | 
            +
                "__isl__",
         | 
| 44 | 
            +
                "__ita__",
         | 
| 45 | 
            +
                "__jav__",
         | 
| 46 | 
            +
                "__jpn__",
         | 
| 47 | 
            +
                "__kan__",
         | 
| 48 | 
            +
                "__kat__",
         | 
| 49 | 
            +
                "__kaz__",
         | 
| 50 | 
            +
                "__khk__",
         | 
| 51 | 
            +
                "__khm__",
         | 
| 52 | 
            +
                "__kir__",
         | 
| 53 | 
            +
                "__kor__",
         | 
| 54 | 
            +
                "__lao__",
         | 
| 55 | 
            +
                "__lit__",
         | 
| 56 | 
            +
                "__lug__",
         | 
| 57 | 
            +
                "__luo__",
         | 
| 58 | 
            +
                "__lvs__",
         | 
| 59 | 
            +
                "__mai__",
         | 
| 60 | 
            +
                "__mal__",
         | 
| 61 | 
            +
                "__mar__",
         | 
| 62 | 
            +
                "__mkd__",
         | 
| 63 | 
            +
                "__mlt__",
         | 
| 64 | 
            +
                "__mni__",
         | 
| 65 | 
            +
                "__mya__",
         | 
| 66 | 
            +
                "__nld__",
         | 
| 67 | 
            +
                "__nno__",
         | 
| 68 | 
            +
                "__nob__",
         | 
| 69 | 
            +
                "__npi__",
         | 
| 70 | 
            +
                "__nya__",
         | 
| 71 | 
            +
                "__ory__",
         | 
| 72 | 
            +
                "__pan__",
         | 
| 73 | 
            +
                "__pbt__",
         | 
| 74 | 
            +
                "__pes__",
         | 
| 75 | 
            +
                "__pol__",
         | 
| 76 | 
            +
                "__por__",
         | 
| 77 | 
            +
                "__ron__",
         | 
| 78 | 
            +
                "__rus__",
         | 
| 79 | 
            +
                "__sat__",
         | 
| 80 | 
            +
                "__slk__",
         | 
| 81 | 
            +
                "__slv__",
         | 
| 82 | 
            +
                "__sna__",
         | 
| 83 | 
            +
                "__snd__",
         | 
| 84 | 
            +
                "__som__",
         | 
| 85 | 
            +
                "__spa__",
         | 
| 86 | 
            +
                "__srp__",
         | 
| 87 | 
            +
                "__swe__",
         | 
| 88 | 
            +
                "__swh__",
         | 
| 89 | 
            +
                "__tam__",
         | 
| 90 | 
            +
                "__tel__",
         | 
| 91 | 
            +
                "__tgk__",
         | 
| 92 | 
            +
                "__tgl__",
         | 
| 93 | 
            +
                "__tha__",
         | 
| 94 | 
            +
                "__tur__",
         | 
| 95 | 
            +
                "__ukr__",
         | 
| 96 | 
            +
                "__urd__",
         | 
| 97 | 
            +
                "__uzn__",
         | 
| 98 | 
            +
                "__vie__",
         | 
| 99 | 
            +
                "__yor__",
         | 
| 100 | 
            +
                "__yue__",
         | 
| 101 | 
            +
                "__zlm__",
         | 
| 102 | 
            +
                "__zul__"
         | 
| 103 | 
             
              ],
         | 
| 104 | 
             
              "num_mel_bins": 80,
         | 
| 105 | 
             
              "padding_side": "right",
         | 
    	
        special_tokens_map.json
    CHANGED
    
    | @@ -1,5 +1,9 @@ | |
| 1 | 
             
            {
         | 
| 2 | 
             
              "additional_special_tokens": [
         | 
|  | |
|  | |
|  | |
|  | |
| 3 | 
             
                "__afr__",
         | 
| 4 | 
             
                "__amh__",
         | 
| 5 | 
             
                "__arb__",
         | 
| @@ -97,10 +101,7 @@ | |
| 97 | 
             
                "__yor__",
         | 
| 98 | 
             
                "__yue__",
         | 
| 99 | 
             
                "__zlm__",
         | 
| 100 | 
            -
                "__zul__",
         | 
| 101 | 
            -
                "<MINED_DATA>",
         | 
| 102 | 
            -
                "<MMT_BT_DATA>",
         | 
| 103 | 
            -
                "<SMT_BT_DATA>"
         | 
| 104 | 
             
              ],
         | 
| 105 | 
             
              "bos_token": "<s>",
         | 
| 106 | 
             
              "cls_token": "<s>",
         | 
|  | |
| 1 | 
             
            {
         | 
| 2 | 
             
              "additional_special_tokens": [
         | 
| 3 | 
            +
                "<pad>",
         | 
| 4 | 
            +
                "<unk>",
         | 
| 5 | 
            +
                "<s>",
         | 
| 6 | 
            +
                "</s>",
         | 
| 7 | 
             
                "__afr__",
         | 
| 8 | 
             
                "__amh__",
         | 
| 9 | 
             
                "__arb__",
         | 
|  | |
| 101 | 
             
                "__yor__",
         | 
| 102 | 
             
                "__yue__",
         | 
| 103 | 
             
                "__zlm__",
         | 
| 104 | 
            +
                "__zul__"
         | 
|  | |
|  | |
|  | |
| 105 | 
             
              ],
         | 
| 106 | 
             
              "bos_token": "<s>",
         | 
| 107 | 
             
              "cls_token": "<s>",
         | 
    	
        tokenizer_config.json
    CHANGED
    
    | @@ -815,33 +815,13 @@ | |
| 815 | 
             
                  "rstrip": true,
         | 
| 816 | 
             
                  "single_word": false,
         | 
| 817 | 
             
                  "special": true
         | 
| 818 | 
            -
                },
         | 
| 819 | 
            -
                "256099": {
         | 
| 820 | 
            -
                  "content": "<MINED_DATA>",
         | 
| 821 | 
            -
                  "lstrip": true,
         | 
| 822 | 
            -
                  "normalized": false,
         | 
| 823 | 
            -
                  "rstrip": true,
         | 
| 824 | 
            -
                  "single_word": false,
         | 
| 825 | 
            -
                  "special": true
         | 
| 826 | 
            -
                },
         | 
| 827 | 
            -
                "256100": {
         | 
| 828 | 
            -
                  "content": "<MMT_BT_DATA>",
         | 
| 829 | 
            -
                  "lstrip": true,
         | 
| 830 | 
            -
                  "normalized": false,
         | 
| 831 | 
            -
                  "rstrip": true,
         | 
| 832 | 
            -
                  "single_word": false,
         | 
| 833 | 
            -
                  "special": true
         | 
| 834 | 
            -
                },
         | 
| 835 | 
            -
                "256101": {
         | 
| 836 | 
            -
                  "content": "<SMT_BT_DATA>",
         | 
| 837 | 
            -
                  "lstrip": true,
         | 
| 838 | 
            -
                  "normalized": false,
         | 
| 839 | 
            -
                  "rstrip": true,
         | 
| 840 | 
            -
                  "single_word": false,
         | 
| 841 | 
            -
                  "special": true
         | 
| 842 | 
             
                }
         | 
| 843 | 
             
              },
         | 
| 844 | 
             
              "additional_special_tokens": [
         | 
|  | |
|  | |
|  | |
|  | |
| 845 | 
             
                "__afr__",
         | 
| 846 | 
             
                "__amh__",
         | 
| 847 | 
             
                "__arb__",
         | 
| @@ -939,115 +919,12 @@ | |
| 939 | 
             
                "__yor__",
         | 
| 940 | 
             
                "__yue__",
         | 
| 941 | 
             
                "__zlm__",
         | 
| 942 | 
            -
                "__zul__",
         | 
| 943 | 
            -
                "<MINED_DATA>",
         | 
| 944 | 
            -
                "<MMT_BT_DATA>",
         | 
| 945 | 
            -
                "<SMT_BT_DATA>"
         | 
| 946 | 
             
              ],
         | 
| 947 | 
             
              "bos_token": "<s>",
         | 
| 948 | 
             
              "clean_up_tokenization_spaces": true,
         | 
| 949 | 
             
              "cls_token": "<s>",
         | 
| 950 | 
             
              "eos_token": "</s>",
         | 
| 951 | 
            -
              "language_code": [
         | 
| 952 | 
            -
                "afr",
         | 
| 953 | 
            -
                "amh",
         | 
| 954 | 
            -
                "arb",
         | 
| 955 | 
            -
                "ary",
         | 
| 956 | 
            -
                "arz",
         | 
| 957 | 
            -
                "asm",
         | 
| 958 | 
            -
                "azj",
         | 
| 959 | 
            -
                "bel",
         | 
| 960 | 
            -
                "ben",
         | 
| 961 | 
            -
                "bos",
         | 
| 962 | 
            -
                "bul",
         | 
| 963 | 
            -
                "cat",
         | 
| 964 | 
            -
                "ceb",
         | 
| 965 | 
            -
                "ces",
         | 
| 966 | 
            -
                "ckb",
         | 
| 967 | 
            -
                "cmn",
         | 
| 968 | 
            -
                "cmn_Hant",
         | 
| 969 | 
            -
                "cym",
         | 
| 970 | 
            -
                "dan",
         | 
| 971 | 
            -
                "deu",
         | 
| 972 | 
            -
                "ell",
         | 
| 973 | 
            -
                "eng",
         | 
| 974 | 
            -
                "est",
         | 
| 975 | 
            -
                "eus",
         | 
| 976 | 
            -
                "fin",
         | 
| 977 | 
            -
                "fra",
         | 
| 978 | 
            -
                "fuv",
         | 
| 979 | 
            -
                "gaz",
         | 
| 980 | 
            -
                "gle",
         | 
| 981 | 
            -
                "glg",
         | 
| 982 | 
            -
                "guj",
         | 
| 983 | 
            -
                "heb",
         | 
| 984 | 
            -
                "hin",
         | 
| 985 | 
            -
                "hrv",
         | 
| 986 | 
            -
                "hun",
         | 
| 987 | 
            -
                "hye",
         | 
| 988 | 
            -
                "ibo",
         | 
| 989 | 
            -
                "ind",
         | 
| 990 | 
            -
                "isl",
         | 
| 991 | 
            -
                "ita",
         | 
| 992 | 
            -
                "jav",
         | 
| 993 | 
            -
                "jpn",
         | 
| 994 | 
            -
                "kan",
         | 
| 995 | 
            -
                "kat",
         | 
| 996 | 
            -
                "kaz",
         | 
| 997 | 
            -
                "khk",
         | 
| 998 | 
            -
                "khm",
         | 
| 999 | 
            -
                "kir",
         | 
| 1000 | 
            -
                "kor",
         | 
| 1001 | 
            -
                "lao",
         | 
| 1002 | 
            -
                "lit",
         | 
| 1003 | 
            -
                "lug",
         | 
| 1004 | 
            -
                "luo",
         | 
| 1005 | 
            -
                "lvs",
         | 
| 1006 | 
            -
                "mai",
         | 
| 1007 | 
            -
                "mal",
         | 
| 1008 | 
            -
                "mar",
         | 
| 1009 | 
            -
                "mkd",
         | 
| 1010 | 
            -
                "mlt",
         | 
| 1011 | 
            -
                "mni",
         | 
| 1012 | 
            -
                "mya",
         | 
| 1013 | 
            -
                "nld",
         | 
| 1014 | 
            -
                "nno",
         | 
| 1015 | 
            -
                "nob",
         | 
| 1016 | 
            -
                "npi",
         | 
| 1017 | 
            -
                "nya",
         | 
| 1018 | 
            -
                "ory",
         | 
| 1019 | 
            -
                "pan",
         | 
| 1020 | 
            -
                "pbt",
         | 
| 1021 | 
            -
                "pes",
         | 
| 1022 | 
            -
                "pol",
         | 
| 1023 | 
            -
                "por",
         | 
| 1024 | 
            -
                "ron",
         | 
| 1025 | 
            -
                "rus",
         | 
| 1026 | 
            -
                "sat",
         | 
| 1027 | 
            -
                "slk",
         | 
| 1028 | 
            -
                "slv",
         | 
| 1029 | 
            -
                "sna",
         | 
| 1030 | 
            -
                "snd",
         | 
| 1031 | 
            -
                "som",
         | 
| 1032 | 
            -
                "spa",
         | 
| 1033 | 
            -
                "srp",
         | 
| 1034 | 
            -
                "swe",
         | 
| 1035 | 
            -
                "swh",
         | 
| 1036 | 
            -
                "tam",
         | 
| 1037 | 
            -
                "tel",
         | 
| 1038 | 
            -
                "tgk",
         | 
| 1039 | 
            -
                "tgl",
         | 
| 1040 | 
            -
                "tha",
         | 
| 1041 | 
            -
                "tur",
         | 
| 1042 | 
            -
                "ukr",
         | 
| 1043 | 
            -
                "urd",
         | 
| 1044 | 
            -
                "uzn",
         | 
| 1045 | 
            -
                "vie",
         | 
| 1046 | 
            -
                "yor",
         | 
| 1047 | 
            -
                "yue",
         | 
| 1048 | 
            -
                "zlm",
         | 
| 1049 | 
            -
                "zul"
         | 
| 1050 | 
            -
              ],
         | 
| 1051 | 
             
              "model_max_length": 1000000000000000019884624838656,
         | 
| 1052 | 
             
              "pad_token": "<pad>",
         | 
| 1053 | 
             
              "processor_class": "SeamlessM4TProcessor",
         | 
|  | |
| 815 | 
             
                  "rstrip": true,
         | 
| 816 | 
             
                  "single_word": false,
         | 
| 817 | 
             
                  "special": true
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 818 | 
             
                }
         | 
| 819 | 
             
              },
         | 
| 820 | 
             
              "additional_special_tokens": [
         | 
| 821 | 
            +
                "<pad>",
         | 
| 822 | 
            +
                "<unk>",
         | 
| 823 | 
            +
                "<s>",
         | 
| 824 | 
            +
                "</s>",
         | 
| 825 | 
             
                "__afr__",
         | 
| 826 | 
             
                "__amh__",
         | 
| 827 | 
             
                "__arb__",
         | 
|  | |
| 919 | 
             
                "__yor__",
         | 
| 920 | 
             
                "__yue__",
         | 
| 921 | 
             
                "__zlm__",
         | 
| 922 | 
            +
                "__zul__"
         | 
|  | |
|  | |
|  | |
| 923 | 
             
              ],
         | 
| 924 | 
             
              "bos_token": "<s>",
         | 
| 925 | 
             
              "clean_up_tokenization_spaces": true,
         | 
| 926 | 
             
              "cls_token": "<s>",
         | 
| 927 | 
             
              "eos_token": "</s>",
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 928 | 
             
              "model_max_length": 1000000000000000019884624838656,
         | 
| 929 | 
             
              "pad_token": "<pad>",
         | 
| 930 | 
             
              "processor_class": "SeamlessM4TProcessor",
         | 

