Spaces:

amphion
/

maskgct

Configuration error

App Files Files Community

raoyonghui committed on Nov 6, 2024

Commit

0faafc9

1 Parent(s): 6ec52a1

support long text synthesis

Browse files

Files changed (1) hide show

app.py +115 -36

app.py CHANGED Viewed

@@ -45,6 +45,77 @@ def detect_speech_language(speech_file):
     _, probs = whisper_model.detect_language(mel)
     return max(probs, key=probs.get)
 @torch.no_grad()
 def get_prompt_text(speech_16k, language):
@@ -320,43 +391,51 @@ def maskgct_inference(
     rescale_cfg_s2a=0.75,
     device=torch.device("cuda:0"),
 ):
-    speech_16k = librosa.load(prompt_speech_path, sr=16000)[0]
-    speech = librosa.load(prompt_speech_path, sr=24000)[0]
-    prompt_language = detect_speech_language(prompt_speech_path)
-    full_prompt_text, short_prompt_text, shot_prompt_end_ts = get_prompt_text(prompt_speech_path,
-                                                                              prompt_language)
-    # use the first 4+ seconds wav as the prompt in case the prompt wav is too long
-    speech = speech[0: int(shot_prompt_end_ts * 24000)]
-    speech_16k = speech_16k[0: int(shot_prompt_end_ts*16000)]
-    target_language = detect_text_language(target_text)
-    combine_semantic_code, _ = text2semantic(
-        device,
-        speech_16k,
-        short_prompt_text,
-        prompt_language,
-        target_text,
-        target_language,
-        target_len,
-        n_timesteps,
-        cfg,
-        rescale_cfg,
-    )
-    acoustic_code = extract_acoustic_code(torch.tensor(speech).unsqueeze(0).to(device))
-    _, recovered_audio = semantic2acoustic(
-        device,
-        combine_semantic_code,
-        acoustic_code,
-        n_timesteps=n_timesteps_s2a,
-        cfg=cfg_s2a,
-        rescale_cfg=rescale_cfg_s2a,
-    )
-    return recovered_audio
-@spaces.GPU
 def inference(
     prompt_wav,
     target_text,
@@ -398,7 +477,7 @@ iface = gr.Interface(
     fn=inference,
     inputs=[
         gr.Audio(label="Upload Prompt Wav", type="filepath"),
-        gr.Textbox(label="Target Text"),
         gr.Number(
             label="Target Duration (in seconds), if the target duration is less than 0, the system will estimate a duration.", value=-1
         ),  # Removed 'optional=True'

     _, probs = whisper_model.detect_language(mel)
     return max(probs, key=probs.get)
def is_chinese(string):
    """
    Check whether the string contains any Chinese character.

    A character counts as Chinese when its code point lies in the CJK
    Unified Ideographs block (U+4E00 to U+9FFF, both inclusive). Characters
    outside that block, such as the full-width full stop "。", do not match.

    :param string: text to scan; an empty string yields False
    :return: bool
    """
    return any("\u4e00" <= ch <= "\u9fff" for ch in string)
def is_english(string):
    """
    Check whether the string contains any letter.

    Despite the name, the test is ``str.isalpha``, which is true for letters
    of every script (CJK ideographs included), not only A-Z / a-z. Code that
    needs to tell Chinese apart from other letters has to call
    ``is_chinese`` before this function.

    :param string: text to scan; an empty string yields False
    :return: bool
    """
    return any(ch.isalpha() for ch in string)
def preprocess(sentence):
    """
    Make the sentence end with a full-width terminator ("。", "！" or "？").

    Rules, applied to the last character only:

    * letter or digit (any script): "。" is appended, nothing is removed;
    * ASCII "!" / "?": replaced by the full-width "！" / "？";
    * already "！" or "？": returned unchanged;
    * anything else (comma, semicolon, space, ...): replaced by "。".

    :param sentence: one text segment; an empty string is returned as is
    :return: the sentence with a normalized ending
    """
    # Nothing to normalize; indexing [-1] on "" would raise IndexError.
    if not sentence:
        return sentence

    last = sentence[-1]

    # isalnum() covers letters of every script plus digits, so a trailing
    # number ("I have 5") keeps its last digit instead of losing it to "。".
    if last.isalnum():
        return sentence + "。"
    if last in ("？", "！"):
        return sentence
    if last == "!":
        return sentence[:-1] + "！"
    if last == "?":
        return sentence[:-1] + "？"
    return sentence[:-1] + "。"
def split_paragraph(text):
    """
    Split a paragraph into consecutive segments of bounded "spoken length".

    Every character adds a weight to the running length of the current
    segment: 1 for a Chinese character, 0.3 for any other letter and 0.6 for
    everything else (digits, punctuation, whitespace). A segment is closed
    right after the current character when, with the running length at
    least 5:

    * the character is a sentence terminator (";!?。！？；…"), or
    * the length exceeds 40 and the character is also a comma or a space, or
    * the length exceeds 60 and the character is also a closing
      quote/bracket, or
    * the length exceeds 80 (forced cut, regardless of the character).

    The segments are slices of ``text`` in order, so joining them gives back
    ``text`` unchanged.

    :param text: paragraph to split; an empty string yields an empty list
    :return: list of non-empty segment strings
    """
    terminators = ";!?。！？；…"
    soft_breaks = terminators + ", ，"
    loose_breaks = soft_breaks + "」）》”’』］)>\"']】 "

    min_segment_len = 5
    soft_break_after = 40
    loose_break_after = 60
    force_break_after = 80

    sentences = []
    seg_start = 0
    cur_length = 0.0

    for idx, char in enumerate(text):
        # is_chinese must be tested first: is_english is based on
        # str.isalpha, which is also true for CJK ideographs, so the other
        # order would weight every Chinese character as 0.3.
        if is_chinese(char):
            cur_length += 1
        elif is_english(char):
            cur_length += 0.3
        else:
            cur_length += 0.6

        # Too short to be worth cutting, even on a terminator.
        if cur_length < min_segment_len:
            continue

        do_split = (
            char in terminators
            or (cur_length > soft_break_after and char in soft_breaks)
            or (cur_length > loose_break_after and char in loose_breaks)
            or cur_length > force_break_after
        )
        if do_split:
            sentences.append(text[seg_start:idx + 1])
            seg_start = idx + 1
            cur_length = 0.0

    # Whatever is left after the last cut forms the final segment.
    if seg_start < len(text):
        sentences.append(text[seg_start:])
    return sentences
 @torch.no_grad()
 def get_prompt_text(speech_16k, language):
     rescale_cfg_s2a=0.75,
     device=torch.device("cuda:0"),
 ):
+    sentences = split_paragraph(target_text)
+    total_recovered_audio = None
+    print("split_paragraph: before:", target_text, "\nafter:", sentences)
+    for sentence in sentences:
+        target_text = preprocess(sentence)
+        speech_16k = librosa.load(prompt_speech_path, sr=16000)[0]
+        speech = librosa.load(prompt_speech_path, sr=24000)[0]
+        prompt_language = detect_speech_language(prompt_speech_path)
+        full_prompt_text, short_prompt_text, shot_prompt_end_ts = get_prompt_text(prompt_speech_path,
+                                                                                prompt_language)
+        # use the first 4+ seconds wav as the prompt in case the prompt wav is too long
+        speech = speech[0: int(shot_prompt_end_ts * 24000)]
+        speech_16k = speech_16k[0: int(shot_prompt_end_ts*16000)]
+        target_language = detect_text_language(target_text)
+        combine_semantic_code, _ = text2semantic(
+            device,
+            speech_16k,
+            short_prompt_text,
+            prompt_language,
+            target_text,
+            target_language,
+            target_len,
+            n_timesteps,
+            cfg,
+            rescale_cfg,
+        )
+        acoustic_code = extract_acoustic_code(torch.tensor(speech).unsqueeze(0).to(device))
+        _, recovered_audio = semantic2acoustic(
+            device,
+            combine_semantic_code,
+            acoustic_code,
+            n_timesteps=n_timesteps_s2a,
+            cfg=cfg_s2a,
+            rescale_cfg=rescale_cfg_s2a,
+        )
+        print("finish text:", target_text)
+        if total_recovered_audio is None:
+            total_recovered_audio = recovered_audio
+        else:
+            total_recovered_audio = np.concatenate([total_recovered_audio, recovered_audio])
+    return total_recovered_audio
+@spaces.GPU(duration=300)
 def inference(
     prompt_wav,
     target_text,
     fn=inference,
     inputs=[
         gr.Audio(label="Upload Prompt Wav", type="filepath"),
+        gr.Textbox(label="Target Text", max_length=1024),
         gr.Number(
             label="Target Duration (in seconds), if the target duration is less than 0, the system will estimate a duration.", value=-1
         ),  # Removed 'optional=True'