Spaces:
Runtime error
Runtime error
| from os import PathLike | |
| from typing import Dict, List, Optional, Union | |
| from wenet.text.char_tokenizer import CharTokenizer | |
| from wenet.text.tokenize_utils import tokenize_by_bpe_model | |
class BpeTokenizer(CharTokenizer):
    """Tokenizer that segments text into sub-word pieces with a BPE model.

    All constructor arguments except ``bpe_model`` are forwarded unchanged to
    ``CharTokenizer.__init__``. The SentencePiece processor is not created in
    the constructor; it is built on first use by ``_build_sp``.
    """

    def __init__(
        self,
        bpe_model: Union[PathLike, str],
        symbol_table: Union[str, PathLike, Dict],
        non_lang_syms: Optional[Union[str, PathLike, List]] = None,
        split_with_space: bool = False,
        connect_symbol: str = '',
        unk='<unk>',
    ) -> None:
        """Store the model path and defer loading the BPE model.

        Args:
            bpe_model: Path of the SentencePiece model file.
            symbol_table: Forwarded to ``CharTokenizer``.
            non_lang_syms: Forwarded to ``CharTokenizer``.
            split_with_space: Forwarded to ``CharTokenizer``.
            connect_symbol: Forwarded to ``CharTokenizer``.
            unk: Forwarded to ``CharTokenizer``.
        """
        super().__init__(symbol_table, non_lang_syms, split_with_space,
                         connect_symbol, unk)
        self._model = bpe_model
        # NOTE(Mddct): multiprocessing.Process() issues
        #              don't build sp here
        self.bpe_model = None

    def _build_sp(self):
        """Create and load the SentencePiece processor on first call.

        Later calls are no-ops once ``self.bpe_model`` is set. If loading
        raises, ``self.bpe_model`` stays ``None`` so the next call retries
        instead of reusing a processor that never loaded a model.
        """
        if self.bpe_model is not None:
            return
        # Local imports: sentencepiece is only needed once a model is
        # actually built, matching the lazy construction above.
        from os import fspath

        import sentencepiece as spm

        processor = spm.SentencePieceProcessor()
        # The constructor accepts os.PathLike; hand the loader a plain path
        # string rather than the path object itself.
        processor.load(fspath(self._model))
        # Publish only after a successful load.
        self.bpe_model = processor

    def text2tokens(self, line: str) -> List[str]:
        """Split ``line`` into BPE tokens.

        When ``self.non_lang_syms_pattern`` is set, the stripped line is
        upper-cased and split with that pattern; whitespace-only fragments
        are dropped. Fragments found in ``self.non_lang_syms`` are emitted
        as single tokens, every other fragment is passed to
        ``tokenize_by_bpe_model``.

        Args:
            line: Input text.

        Returns:
            The list of tokens, in input order.
        """
        self._build_sp()
        line = line.strip()
        if self.non_lang_syms_pattern is not None:
            # Upper-casing happens only on this branch, before the split.
            parts = self.non_lang_syms_pattern.split(line.upper())
            parts = [w for w in parts if w.strip()]
        else:
            parts = [line]

        tokens = []
        for part in parts:
            if part in self.non_lang_syms:
                # Keep a non-linguistic symbol intact as one token.
                tokens.append(part)
            else:
                tokens.extend(tokenize_by_bpe_model(self.bpe_model, part))
        return tokens

    def tokens2text(self, tokens: List[str]) -> str:
        """Join ``tokens`` back into text.

        The joining itself is delegated to ``CharTokenizer.tokens2text``;
        every "▁" in the result is then replaced by a space and the
        surrounding whitespace is stripped.

        Args:
            tokens: Tokens to join.

        Returns:
            The reconstructed text.
        """
        # The processor is not referenced directly below; the call is kept
        # so this method behaves as it did before (model loaded on use).
        self._build_sp()
        text = super().tokens2text(tokens)
        return text.replace("▁", ' ').strip()