Spaces:
Runtime error
Runtime error
| from os import PathLike | |
| from typing import Dict, List, Union | |
| from wenet.text.base_tokenizer import BaseTokenizer, T as Type | |
| class HuggingFaceTokenizer(BaseTokenizer): | |
| def __init__(self, model: Union[str, PathLike], *args, **kwargs) -> None: | |
| # NOTE(Mddct): don't build here, pickle issues | |
| self.model = model | |
| self.tokenizer = None | |
| self.args = args | |
| self.kwargs = kwargs | |
| def __getstate__(self): | |
| state = self.__dict__.copy() | |
| del state['tokenizer'] | |
| return state | |
| def __setstate__(self, state): | |
| self.__dict__.update(state) | |
| recovery = {'tokenizer': None} | |
| self.__dict__.update(recovery) | |
| def _build_hugging_face(self): | |
| from transformers import AutoTokenizer | |
| if self.tokenizer is None: | |
| self.tokenizer = AutoTokenizer.from_pretrained( | |
| self.model, **self.kwargs) | |
| self.t2i = self.tokenizer.get_vocab() | |
| def text2tokens(self, line: str) -> List[Type]: | |
| self._build_hugging_face() | |
| return self.tokenizer.tokenize(line) | |
| def tokens2text(self, tokens: List[Type]) -> str: | |
| self._build_hugging_face() | |
| ids = self.tokens2ids(tokens) | |
| return self.tokenizer.decode(ids) | |
| def tokens2ids(self, tokens: List[Type]) -> List[int]: | |
| self._build_hugging_face() | |
| return self.tokenizer.convert_tokens_to_ids(tokens) | |
| def ids2tokens(self, ids: List[int]) -> List[Type]: | |
| self._build_hugging_face() | |
| return self.tokenizer.convert_ids_to_tokens(ids) | |
| def vocab_size(self) -> int: | |
| self._build_hugging_face() | |
| # TODO: we need special tokenize size in future | |
| return len(self.tokenizer) | |
| def symbol_table(self) -> Dict[Type, int]: | |
| self._build_hugging_face() | |
| return self.t2i | |