# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import json
import re

import LangSegment
from tokenizers import Tokenizer

from diffrhythm.g2p.g2p import cleaners
from diffrhythm.g2p.g2p.text_tokenizers import TextTokenizer
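

# PhonemeBpeTokenizer converts raw text into a phoneme string using
# language-specific cleaners (with optional automatic language segmentation
# via LangSegment), then maps each "|"-separated phoneme to its integer id
# from vocab.json.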
class PhonemeBpeTokenizer:
    def __init__(self, vocab_path="./diffrhythm/g2p/g2p/vocab.json"):
        # Map language codes to the backend language names expected by
        # TextTokenizer.
        self.lang2backend = {
            "zh": "cmn",
            "ja": "ja",
            "en": "en-us",
            "fr": "fr-fr",
            "ko": "ko",
            "de": "de",
        }
        self.text_tokenizers = {}
        self.init_text_tokenizers()
        # Load the phoneme-to-id vocabulary.
        with open(vocab_path, "r") as f:
            data = json.load(f)
        self.vocab = data["vocab"]
        # Restrict automatic language detection to the supported languages.
        LangSegment.setfilters(["en", "zh", "ja", "ko", "fr", "de"])

    def init_text_tokenizers(self):
        # Build one TextTokenizer per supported language.
        for key, value in self.lang2backend.items():
            self.text_tokenizers[key] = TextTokenizer(language=value)

    def tokenize(self, text, sentence, language):
        # 1. Convert text to phonemes. With language="auto", split the input
        #    into monolingual segments first, clean each segment separately,
        #    and rejoin the results with the "|_|" separator.
        if language == "auto":
            seglist = LangSegment.getTexts(text)
            tmp_ph = []
            for seg in seglist:
                tmp_ph.append(
                    self._clean_text(
                        seg["text"], sentence, seg["lang"], ["cjekfd_cleaners"]
                    )
                )
            phonemes = "|_|".join(tmp_ph)
        else:
            phonemes = self._clean_text(text, sentence, language, ["cjekfd_cleaners"])

        # 2. Tokenize phonemes into vocabulary ids.
        phoneme_tokens = self.phoneme2token(phonemes)

        # 3. Decoding tokens back to text is optional:
        # decoded_text = self.tokenizer.decode(phoneme_tokens)
        return phonemes, phoneme_tokens

    def _clean_text(self, text, sentence, language, cleaner_names):
        # Apply each named cleaner from the cleaners module in order.
        for name in cleaner_names:
            cleaner = getattr(cleaners, name, None)
            if cleaner is None:
                raise ValueError("Unknown cleaner: %s" % name)
            text = cleaner(text, sentence, language, self.text_tokenizers)
        return text

    def phoneme2token(self, phonemes):
        # Phonemes are "|"-separated; anything after a tab is dropped.
        # Phonemes missing from the vocabulary are skipped rather than
        # raising KeyError.
        tokens = []
        if isinstance(phonemes, list):
            # One token list per phoneme string.
            for phone in phonemes:
                phone = phone.split("\t")[0]
                phonemes_split = phone.split("|")
                tokens.append(
                    [self.vocab[p] for p in phonemes_split if p in self.vocab]
                )
        else:
            phonemes = phonemes.split("\t")[0]
            phonemes_split = phonemes.split("|")
            tokens = [self.vocab[p] for p in phonemes_split if p in self.vocab]
        return tokens