Update analyzer modules and tokenizer
app.py CHANGED

@@ -13,7 +13,7 @@ MODEL = None
 
 LANGUAGE_CONFIG = {
     "ar": {
-        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/
+        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ar_f/ar_prompts2.flac",
         "text": "في الشهر الماضي، وصلنا إلى معلم جديد بمليارين من المشاهدات على قناتنا على يوتيوب."
     },
     "da": {

@@ -57,7 +57,7 @@ LANGUAGE_CONFIG = {
         "text": "Il mese scorso abbiamo raggiunto un nuovo traguardo: due miliardi di visualizzazioni sul nostro canale YouTube."
     },
     "ja": {
-        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/
+        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ja/ja_prompts1.flac",
         "text": "先月、私たちのYouTubeチャンネルで二十億回の再生回数という新たなマイルストーンに到達しました。"
     },
     "ko": {

@@ -101,8 +101,8 @@ LANGUAGE_CONFIG = {
         "text": "Geçen ay YouTube kanalımızda iki milyar görüntüleme ile yeni bir dönüm noktasına ulaştık."
     },
     "zh": {
-        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/
-        "text": "
+        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/zh_f2.flac",
+        "text": "上个月,我们达到了一个新的里程碑. 我们的YouTube频道观看次数达到了二十亿次,这绝对令人难以置信。"
     },
 }
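Each LANGUAGE_CONFIG entry pairs a reference-audio URL with the transcript spoken in that clip, so the demo can seed voice cloning per language. A minimal lookup sketch, assuming only the dict shown above (the helper name is hypothetical, not part of app.py):

# Hypothetical helper (not in app.py): look up the demo prompt for a language.
def demo_prompt(language_id: str):
    cfg = LANGUAGE_CONFIG[language_id]
    return cfg["audio"], cfg["text"]   # reference-audio URL and its transcript

audio_url, transcript = demo_prompt("zh")   # resolves to the new zh_f2.flac prompt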
src/chatterbox/models/t3/inference/alignment_stream_analyzer.py CHANGED

@@ -155,12 +155,12 @@ class AlignmentStreamAnalyzer:
         token_repetition = (
             # self.complete and
             len(self.generated_tokens) >= 3 and
-            len(set(self.generated_tokens[-
+            len(set(self.generated_tokens[-2:])) == 1
         )
 
         if token_repetition:
             repeated_token = self.generated_tokens[-1]
-            logger.warning(f"🚨 Detected
+            logger.warning(f"🚨 Detected 2x repetition of token {repeated_token}")
 
             # Suppress EoS to prevent early termination
             if cur_text_posn < S - 3 and S > 5:  # Only suppress if text is longer than 5 tokens
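The repetition guard now fires as soon as the last two generated speech tokens are identical. A self-contained sketch of the same predicate:

def is_token_repetition(generated_tokens: list) -> bool:
    # Mirrors the check above: at least 3 tokens generated,
    # and the last two share the same token id.
    return len(generated_tokens) >= 3 and len(set(generated_tokens[-2:])) == 1

assert is_token_repetition([12, 7, 7])        # last two repeat -> flagged
assert not is_token_repetition([12, 7, 9])    # no repetition
assert not is_token_repetition([7, 7])        # too few tokens to judge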
src/chatterbox/models/t3/modules/t3_config.py CHANGED

@@ -25,6 +25,10 @@ class T3Config:
     @property
     def n_channels(self):
         return LLAMA_CONFIGS[self.llama_config_name]["hidden_size"]
+
+    @property
+    def is_multilingual(self):
+        return self.text_tokens_dict_size == 2352
 
     @classmethod
     def english_only(cls):
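The new property infers the model flavor from the text-token vocabulary size: 2352 is taken as the multilingual tokenizer's dict size. A quick sketch of the idea (the 704 English-only size is an assumption for illustration):

# Stub with just the relevant field; the real T3Config carries much more.
class ConfigSketch:
    def __init__(self, text_tokens_dict_size):
        self.text_tokens_dict_size = text_tokens_dict_size

    @property
    def is_multilingual(self):
        return self.text_tokens_dict_size == 2352  # multilingual vocab size per the diff

assert ConfigSketch(2352).is_multilingual
assert not ConfigSketch(704).is_multilingual  # assumed English-only vocab size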
src/chatterbox/models/t3/t3.py CHANGED

@@ -257,14 +257,17 @@ class T3(nn.Module):
         # TODO? synchronize the expensive compile function
         # with self.compile_lock:
         if not self.compiled:
-            alignment_stream_analyzer = AlignmentStreamAnalyzer(
-                self.tfmr,
-                None,
-                text_tokens_slice=(len_cond, len_cond + text_tokens.size(-1)),
-                alignment_layer_idx=9,  # TODO: hparam or something?
-                eos_idx=self.hp.stop_speech_token,
-            )
-            assert alignment_stream_analyzer.eos_idx == self.hp.stop_speech_token
+            # Default to None for English models, only create for multilingual
+            alignment_stream_analyzer = None
+            if self.hp.is_multilingual:
+                alignment_stream_analyzer = AlignmentStreamAnalyzer(
+                    self.tfmr,
+                    None,
+                    text_tokens_slice=(len_cond, len_cond + text_tokens.size(-1)),
+                    alignment_layer_idx=9,  # TODO: hparam or something?
+                    eos_idx=self.hp.stop_speech_token,
+                )
+                assert alignment_stream_analyzer.eos_idx == self.hp.stop_speech_token
 
             patched_model = T3HuggingfaceBackend(
                 config=self.cfg,
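Since English models now pass alignment_stream_analyzer=None through to the backend, any consumer has to guard its use. A hedged sketch of the consuming side (class and method names here are illustrative, not taken from T3HuggingfaceBackend):

# Illustrative consumer: every use of the analyzer is None-guarded.
class BackendSketch:
    def __init__(self, alignment_stream_analyzer=None):
        self.alignment_stream_analyzer = alignment_stream_analyzer

    def filter_logits(self, logits):
        if self.alignment_stream_analyzer is not None:
            # multilingual path: analyzer may suppress EoS / repeated tokens
            logits = self.alignment_stream_analyzer.step(logits)
        return logits  # English path: logits pass through untouched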
src/chatterbox/models/tokenizers/tokenizer.py CHANGED

@@ -151,9 +151,7 @@ def korean_normalize(text: str) -> str:
         return initial + medial + final
 
     # Decompose syllables and normalize punctuation
-    result = ''.join(decompose_hangul(char) for char in text)
-    result = re.sub(r'[…~?!,:;()「」『』]', '.', result)  # Korean punctuation
-
+    result = ''.join(decompose_hangul(char) for char in text)
     return result.strip()
 

@@ -201,81 +199,39 @@ class ChineseCangjieConverter:
 
     def _cangjie_encode(self, glyph: str):
         """Encode a single Chinese glyph to Cangjie code."""
-
-
-            return None
-
-        index =
-
-        return code + index_suffix
+        normed_glyph = glyph
+        code = self.word2cj.get(normed_glyph, None)
+        if code is None:  # e.g. Japanese hiragana
+            return None
+        index = self.cj2word[code].index(normed_glyph)
+        index = str(index) if index > 0 else ""
+        return code + str(index)
 
-    def _normalize_numbers(self, text):
-        """Convert Arabic numerals (1-99) to Chinese characters."""
-        digit_map = {'0': '零', '1': '一', '2': '二', '3': '三', '4': '四',
-                     '5': '五', '6': '六', '7': '七', '8': '八', '9': '九'}
-
-        pattern = re.compile(r'(?<!\d)(\d{1,2})(?!\d)')
-
-        def convert_number(match):
-            num = int(match.group(1))
-
-            if num == 0:
-                return '零'
-            elif 1 <= num <= 9:
-                return digit_map[str(num)]
-            elif num == 10:
-                return '十'
-            elif 11 <= num <= 19:
-                return '十' + digit_map[str(num % 10)]
-            elif 20 <= num <= 99:
-                tens, ones = divmod(num, 10)
-                if ones == 0:
-                    return digit_map[str(tens)] + '十'
-                else:
-                    return digit_map[str(tens)] + '十' + digit_map[str(ones)]
-            else:
-                return match.group(1)
-
-        return pattern.sub(convert_number, text)
-
-    def
-        """Convert Chinese characters in text to Cangjie tokens."""
-
-        text = re.sub('(。|…)', '.', text)
-        text = self._normalize_numbers(text)
-
-        # Skip segmentation for simple sequences (numbers, punctuation, short phrases)
-        if self.segmenter is not None:
-
-
-
-
-            )
-
-            # Only segment complex Chinese text (longer sentences without enumeration patterns)
-            if not is_simple_sequence and len(text) > 10:
-                chinese_chars = sum(1 for c in text if category(c) == "Lo")
-                total_chars = len([c for c in text if c.strip()])
-
-                if chinese_chars > 5 and chinese_chars / total_chars > 0.7:
-                    segmented_words = self.segmenter.cut(text)
-                    text = " ".join(segmented_words)
-
-            cangjie = self._cangjie_encode(char)
-            if cangjie is None:
-                output.append(
-                continue
-
-
-
-
-
-            else:
-                output.append(
-
-        return "".join(output)
+    def __call__(self, text):
+        """Convert Chinese characters in text to Cangjie tokens."""
+        output = []
+        if self.segmenter is not None:
+            segmented_words = self.segmenter.cut(text)
+            full_text = " ".join(segmented_words)
+        else:
+            full_text = text
+
+        for t in full_text:
+            if category(t) == "Lo":
+                cangjie = self._cangjie_encode(t)
+                if cangjie is None:
+                    output.append(t)
+                    continue
+                code = []
+                for c in cangjie:
+                    code.append(f"[cj_{c}]")
+                code.append("[cj_.]")
+                code = "".join(code)
+                output.append(code)
+            else:
+                output.append(t)
+        return "".join(output)
 

@@ -299,7 +255,7 @@ class MTLTokenizer:
     def encode(self, txt: str, language_id: str = None):
         # Language-specific text processing
         if language_id == 'zh':
-            txt = self.cangjie_converter
+            txt = self.cangjie_converter(txt)
         elif language_id == 'ja':
             txt = hiragana_normalize(txt)
         elif language_id == 'he':
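The rewritten __call__ drops the number-normalization and conditional-segmentation heuristics: text is always segmented when a segmenter exists, and every Lo-category glyph is Cangjie-encoded, each code letter emitted as a [cj_*] token with [cj_.] closing the glyph. A toy end-to-end sketch (the two-entry tables are illustrative stand-ins for the converter's real word2cj/cj2word data):

# Toy stand-ins for the converter's real word2cj / cj2word tables.
word2cj = {"你": "onf", "好": "vnd"}
cj2word = {"onf": ["你"], "vnd": ["好"]}

def encode_glyph(glyph):
    code = word2cj.get(glyph)
    if code is None:                      # e.g. kana or Latin letters
        return None
    index = cj2word[code].index(glyph)    # disambiguate glyphs sharing a code
    return code + (str(index) if index > 0 else "")

out = []
for ch in "你好!":
    code = encode_glyph(ch)
    out.append(ch if code is None else "".join(f"[cj_{c}]" for c in code) + "[cj_.]")
print("".join(out))  # -> [cj_o][cj_n][cj_f][cj_.][cj_v][cj_n][cj_d][cj_.]!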
src/chatterbox/mtl_tts.py CHANGED

@@ -83,7 +83,7 @@ def punc_norm(text: str) -> str:
 
     # Add full stop if no ending punc
     text = text.rstrip(" ")
-    sentence_enders = {".", "!", "?", "-", ","}
+    sentence_enders = {".", "!", "?", "-", ",", "、", ",", "。", "?", "!"}
     if not any(text.endswith(p) for p in sentence_enders):
         text += "."
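With the full-width CJK marks in the set, text that already ends in, say, "。" or "?" no longer gets an extra ASCII period tacked on. A quick standalone check of just this branch:

def needs_full_stop(text: str) -> bool:
    # The ending-punctuation test from punc_norm, including the CJK enders.
    sentence_enders = {".", "!", "?", "-", ",", "、", ",", "。", "?", "!"}
    return not any(text.endswith(p) for p in sentence_enders)

assert not needs_full_stop("我们达到了一个新的里程碑。")  # full-width stop accepted
assert needs_full_stop("Hello world")                      # still gets "." appended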