# -*- coding: utf-8 -*-
# @Time   : 2021/12/8 12:07 a.m.
# @Author : JianingWang
# @File   : JiebaTokenizer
import jieba

from transformers import BertTokenizer


class JiebaTokenizer(BertTokenizer):
    """BertTokenizer that pre-segments Chinese text with jieba before WordPiece."""

    def __init__(
        self, pre_tokenizer=lambda x: jieba.cut(x, HMM=False), *args, **kwargs
    ):
        super().__init__(*args, **kwargs)
        # Callable that splits raw text into coarse-grained words (jieba by default,
        # with the HMM-based new-word discovery disabled).
        self.pre_tokenizer = pre_tokenizer

    def _tokenize(self, text, *args, **kwargs):
        split_tokens = []
        # Segment the text into words first; keep a word as a single token when it
        # appears in the vocabulary, otherwise fall back to BERT's WordPiece splitting.
        for word in self.pre_tokenizer(text):
            if word in self.vocab:
                split_tokens.append(word)
            else:
                split_tokens.extend(super()._tokenize(word))
        return split_tokens
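

# Minimal usage sketch (not part of the original file). The checkpoint name
# "bert-base-chinese" is an assumption; in practice this tokenizer is most useful
# with a vocabulary that contains whole Chinese words, so substitute whichever
# vocab/checkpoint you actually train with.
if __name__ == "__main__":
    tokenizer = JiebaTokenizer.from_pretrained("bert-base-chinese")
    # Words segmented by jieba that exist in the vocab are kept intact;
    # everything else is split by the inherited WordPiece tokenizer.
    print(tokenizer.tokenize("我爱自然语言处理"))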