Align tokenizer with mistral-common (#158)
- Align tokenizer with mistral-common (5b244f98e11d4b832d5cbcc5ffd9b1a6d36afcaa)
Co-authored-by: Matthew Carrigan <Rocketknight1@users.noreply.huggingface.co>
- README.md +3 -3
- special_tokens_map.json +21 -3
- tokenizer.json +7 -16
- tokenizer_config.json +3 -1
    	
README.md CHANGED

@@ -1,10 +1,10 @@
 ---
-license: apache-2.0
-pipeline_tag: text-generation
 language:
-- en
+- en
+license: apache-2.0
 tags:
 - pretrained
+pipeline_tag: text-generation
 inference:
   parameters:
     temperature: 0.7
    	
special_tokens_map.json CHANGED

@@ -1,5 +1,23 @@
 {
-  "bos_token": "<s>",
-  "eos_token": "</s>",
-  "unk_token": "<unk>"
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
 }
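For reference, the expanded entries are simply the JSON form of `AddedToken` objects, with every flag written out instead of left to library defaults. A minimal sketch of the correspondence (the constructions below are illustrative, not taken from this commit):

```python
from tokenizers import AddedToken

# Each expanded special_tokens_map.json entry spells out the flags of an
# AddedToken; previously the file stored bare strings and defaults applied.
bos = AddedToken("<s>", single_word=False, lstrip=False, rstrip=False, normalized=False)
eos = AddedToken("</s>", single_word=False, lstrip=False, rstrip=False, normalized=False)
unk = AddedToken("<unk>", single_word=False, lstrip=False, rstrip=False, normalized=False)

# normalized=False is the part that matters here: the special tokens are
# matched against the raw text, before any normalization runs.
print(bos.content, eos.content, unk.content)
```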
    	
tokenizer.json CHANGED

@@ -31,23 +31,13 @@
       "special": true
     }
   ],
-  "normalizer": {
-    "type": "Sequence",
-    "normalizers": [
-      {
-        "type": "Prepend",
-        "prepend": "▁"
-      },
-      {
-        "type": "Replace",
-        "pattern": {
-          "String": " "
-        },
-        "content": "▁"
-      }
-    ]
+  "normalizer": null,
+  "pre_tokenizer": {
+    "type": "Metaspace",
+    "replacement": "▁",
+    "prepend_scheme": "first",
+    "split": false
   },
-  "pre_tokenizer": null,
   "post_processor": {
     "type": "TemplateProcessing",
     "single": [

@@ -134,6 +124,7 @@
     "end_of_word_suffix": null,
     "fuse_unk": true,
     "byte_fallback": true,
+    "ignore_merges": false,
     "vocab": {
       "<unk>": 0,
       "<s>": 1,
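The net effect of the first hunk: instead of normalizing the whole string (prepend `▁`, then replace every space with `▁`), the tokenizer now uses a Metaspace pre-tokenizer with `prepend_scheme: "first"`, which prepends the meta-space only once, at the start of the text, in line with the commit's goal of matching mistral-common. A minimal sketch of what the new pre-tokenizer does, assuming a recent `tokenizers` release (0.15+, where `Metaspace` takes `prepend_scheme` and `split`):

```python
from tokenizers import pre_tokenizers

# Mirror of the new "pre_tokenizer" block in tokenizer.json.
pre = pre_tokenizers.Metaspace(
    replacement="▁",
    prepend_scheme="first",  # prepend "▁" only before the first word
    split=False,             # do not split the text on the replacement
)

print(pre.pre_tokenize_str("Hello world"))
# e.g. [("▁Hello▁world", (0, 11))] -- one piece, spaces rewritten to "▁"
```

The second hunk writes `ignore_merges: false` explicitly into the BPE model block; since `false` appears to be the existing default, this pins the behavior rather than changing it.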
    	
tokenizer_config.json CHANGED

@@ -1,6 +1,7 @@
 {
   "add_bos_token": true,
   "add_eos_token": false,
+  "add_prefix_space": null,
   "added_tokens_decoder": {
     "0": {
       "content": "<unk>",

@@ -29,9 +30,10 @@
   },
   "additional_special_tokens": [],
   "bos_token": "<s>",
+  "chat_template": "{%- if messages[0]['role'] == 'system' %}\n    {%- set system_message = messages[0]['content'] %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n        {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n    {%- endif %}\n    {%- if message['role'] == 'user' %}\n        {%- if loop.first and system_message is defined %}\n            {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n        {%- else %}\n            {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n        {%- endif %}\n    {%- elif message['role'] == 'assistant' %}\n        {{- ' ' + message['content'] + eos_token}}\n    {%- else %}\n        {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n    {%- endif %}\n{%- endfor %}\n",
   "clean_up_tokenization_spaces": false,
   "eos_token": "</s>",
-  "legacy": true,
+  "legacy": false,
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": null,
   "sp_model_kwargs": {},
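The added `chat_template` folds an optional leading system message into the first user turn and raises an exception unless the remaining roles strictly alternate user/assistant. A sketch of how it renders, assuming a local checkout of this repository (`path/to/this-repo` is a placeholder, not a real repo id):

```python
from transformers import AutoTokenizer

# Placeholder path -- point this at a local checkout of this repository.
tok = AutoTokenizer.from_pretrained("path/to/this-repo")

messages = [
    {"role": "system", "content": "You are a helpful bot."},
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello!"},
    {"role": "user", "content": "How are you?"},
]

print(tok.apply_chat_template(messages, tokenize=False))
# The system message is merged into the first [INST] block:
# <s> [INST] You are a helpful bot.\n\nHi [/INST] Hello!</s> [INST] How are you? [/INST]
```

As for the other two changes in this file: `legacy: false` opts the slow, SentencePiece-based tokenizer into the fixed handling of text following special tokens, and `add_prefix_space: null` appears to leave the prefix-space decision to the pre-tokenizer configuration set in tokenizer.json above.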

 
		