from transformers import LlamaTokenizerFast

# The Ultra-FineWeb classifier uses the "deepseek-ai/DeepSeek-V2" tokenizer.
# path = "deepseek-ai/DeepSeek-V2"
path = "local_tokenizer"
tokenizer = LlamaTokenizerFast.from_pretrained(path, trust_remote_code=True)

# Test the tokenizer on a sample string.
content = "MiniCPM4: Ultra-Efficient LLMs on End Devices"
token_ids = tokenizer.encode(content, add_special_tokens=False)
print(token_ids)

# Decode and print each token individually.
for token_id in token_ids:
    print(tokenizer.decode([token_id]))
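
# Alternatively (a minimal sketch, assuming the same `tokenizer` and
# `token_ids` as above): convert_ids_to_tokens returns the raw token
# strings, including subword markers, without a full decode round-trip.
tokens = tokenizer.convert_ids_to_tokens(token_ids)
for token_id, token in zip(token_ids, tokens):
    print(token_id, token)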