Spaces: Running on Zero

Commit: Start deployment (开始部署)

app.py CHANGED
@@ -30,6 +30,11 @@ except ImportError:
     print("torch_npu is not available. if you want to use npu, please install it.")
 
 
+import time
+import datetime
+import torch
+from common_utils.utils4infer import get_feat_from_wav_path, token_list2wav
+
 
 
 from huggingface_hub import hf_hub_download

@@ -51,7 +56,7 @@ cosyvoice_model_path="./CosyVoice-300M-25Hz"
 device = torch.device("cuda")
 print("Loading model A...")
 model_a, tokenizer_a = load_model_and_tokenizer(CHECKPOINT_PATH_A, CONFIG_PATH)
-model_a…
+model_a
 
 print("\nLoading model B...")
 if CHECKPOINT_PATH_B is not None:

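The new line 59 is now just the bare expression `model_a`, a no-op at run time; the tail of the old line is cut off by the diff viewer (marked `…` above), so what was removed is unrecoverable here. If the intent was to keep eval-mode setup on CPU while deferring CUDA to the GPU-scoped function, a one-line sketch of that (an assumption, not what the commit contains) would be:

model_a.eval()  # assumed intent: eval mode on CPU; .cuda() happens later inside the @spaces.GPU call
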
@@ -61,15 +66,15 @@ else:
     model_b, tokenizer_b = None, None
 
 loaded_models = {
-    NAME_A: {"model": …
+    NAME_A: {"model": model_b, "tokenizer": tokenizer_b},
     NAME_B: {"model": model_b, "tokenizer": tokenizer_b},
 } if model_b is not None else {
-    NAME_A: {"model": …
+    NAME_A: {"model": model_b, "tokenizer": tokenizer_b},
 }
 print("\nAll models loaded.")
 
-cosyvoice = CosyVoice(cosyvoice_model_path)
-cosyvoice.eval().cuda()
+# cosyvoice = CosyVoice(cosyvoice_model_path)
+# cosyvoice.eval().cuda()
 
 # Convert the image to Base64
 with open("./tts/assert/实验室.png", "rb") as image_file:

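Note that in both branches of the new `loaded_models` dict, NAME_A now points at `model_b`/`tokenizer_b`, so model A is never served; this looks accidental. A hedged sketch of the presumably intended mapping, reusing the names already defined in app.py:

# Presumed intent (an assumption, not what the commit contains):
# key each display name to its own model/tokenizer pair.
loaded_models = {
    NAME_A: {"model": model_a, "tokenizer": tokenizer_a},
    NAME_B: {"model": model_b, "tokenizer": tokenizer_b},
} if model_b is not None else {
    NAME_A: {"model": model_a, "tokenizer": tokenizer_a},
}
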
@@ -114,11 +119,6 @@ for item in prompt_audio_choices:
 
 logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s')
 
-import time
-import datetime
-import torch
-from common_utils.utils4infer import get_feat_from_wav_path, token_list2wav
-
 
 @spaces.GPU
 def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt, task_choice, prompt_speech_data):

@@ -135,13 +135,14 @@ def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt, task_choice,
     # Common initialization: model device setup
     start_time = time.time()
     res_text = None
+    model_a.eval().cuda()
 
     try:
         # 1. Handle the TTS task
         if input_prompt.endswith("_TTS"):
             text_for_tts = input_prompt.replace("_TTS", "")
             # T2S inference logic
-            res_tensor = …
+            res_tensor = model_a.generate_tts(device=device, text=text_for_tts)[0]
             res_token_list = res_tensor.tolist()
             res_text = res_token_list[:-1]
             print(f"T2S inference time: {time.time() - start_time:.2f} s")

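Adding `model_a.eval().cuda()` inside the function matches how ZeroGPU Spaces work: a GPU is attached only while a `@spaces.GPU`-decorated call runs, so this commit defers device movement to call time (and likewise pulls CosyVoice construction out of import scope). A minimal, self-contained sketch of that pattern, with a stand-in model rather than the app's (assumes the Hugging Face `spaces` package):

import spaces
import torch

model = torch.nn.Linear(4, 4)  # stand-in for model_a; stays on CPU at import time

@spaces.GPU  # ZeroGPU attaches a GPU only for the duration of this call
def infer(x: torch.Tensor) -> torch.Tensor:
    model.eval().cuda()  # safe here: CUDA is available inside the decorated call
    with torch.no_grad():
        return model(x.cuda()).cpu()
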
@@ -153,7 +154,7 @@ def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt, task_choice,
             feat, feat_lens = get_feat_from_wav_path(input_wav_path)
             print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
             if is_npu: torch_npu.npu.synchronize()
-            res_text = …
+            res_text = model_a.generate(
                 wavs=feat,
                 wavs_len=feat_lens,
                 prompt=prompt,

@@ -168,7 +169,7 @@ def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt, task_choice,
             # T2T inference logic
             print(f'Starting t2t inference, question_txt: {question_txt}')
             if is_npu: torch_npu.npu.synchronize()
-            res_text = …
+            res_text = model_a.generate_text2text(
                 device=device,
                 text=question_txt
             )[0]

@@ -183,7 +184,7 @@ def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt, task_choice,
             feat, feat_lens = get_feat_from_wav_path(input_wav_path)
             print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
             if is_npu: torch_npu.npu.synchronize()
-            output_text, text_res, speech_res = …
+            output_text, text_res, speech_res = model_a.generate_s2s_no_stream_with_repetition_penalty(
                 wavs=feat,
                 wavs_len=feat_lens,
             )

@@ -197,7 +198,7 @@ def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt, task_choice,
             feat, feat_lens = get_feat_from_wav_path(input_wav_path)
             print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
             if is_npu: torch_npu.npu.synchronize()
-            output_text, text_res, speech_res = …
+            output_text, text_res, speech_res = model_a.generate_s2s_no_stream_think_with_repetition_penalty(
                 wavs=feat,
                 wavs_len=feat_lens,
             )

@@ -211,7 +212,7 @@ def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt, task_choice,
             feat, feat_lens = get_feat_from_wav_path(input_wav_path)
             print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
             if is_npu: torch_npu.npu.synchronize()
-            res_text = …
+            res_text = model_a.generate4chat(
                 wavs=feat,
                 wavs_len=feat_lens,
                 cache_implementation="static"

@@ -225,7 +226,7 @@ def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt, task_choice,
             feat, feat_lens = get_feat_from_wav_path(input_wav_path)
             print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
             if is_npu: torch_npu.npu.synchronize()
-            res_text = …
+            res_text = model_a.generate4chat_think(
                 wavs=feat,
                 wavs_len=feat_lens,
                 cache_implementation="static"

@@ -239,7 +240,7 @@ def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt, task_choice,
             feat, feat_lens = get_feat_from_wav_path(input_wav_path)
             print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
             if is_npu: torch_npu.npu.synchronize()
-            res_text = …
+            res_text = model_a.generate(
                 wavs=feat,
                 wavs_len=feat_lens,
                 prompt=input_prompt,

@@ -260,20 +261,20 @@ def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt, task_choice,
     wav_path_output = input_wav_path
     if task_choice == "TTS任务" or "empathetic_s2s_dialogue" in task_choice:
         if isinstance(output_res, list): # TTS case
-            cosyvoice.eval()
-            time_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
-            wav_path = f"./tmp/{time_str}.wav"
-            wav_path_output = token_list2wav(output_res, prompt_speech_data, wav_path, cosyvoice)
+            # cosyvoice.eval()
+            # time_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+            # wav_path = f"./tmp/{time_str}.wav"
+            # wav_path_output = token_list2wav(output_res, prompt_speech_data, wav_path, cosyvoice)
             # wav_path_output = get_wav_from_token_list(output_res, prompt_speech_data)
             output_res = "Generated tokens: " + str(output_res)
         elif isinstance(output_res, str) and "|" in output_res: # S2S case
             try:
                 text_res, token_list_str = output_res.split("|")
                 token_list = json.loads(token_list_str)
-                cosyvoice.eval()
+                # cosyvoice.eval()
                 time_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
                 wav_path = f"./tmp/{time_str}.wav"
-                wav_path_output = token_list2wav(token_list, prompt_speech_data, wav_path, cosyvoice)
+                # wav_path_output = token_list2wav(token_list, prompt_speech_data, wav_path, cosyvoice)
                 # wav_path_output = get_wav_from_token_list(token_list, prompt_speech_data)
                 output_res = text_res
             except (ValueError, json.JSONDecodeError) as e:
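
With the CosyVoice lines commented out, the TTS branch now returns only the raw token list and the S2S branch leaves `wav_path_output` as the input path, so no audio is synthesized. If vocoding is re-enabled later, a lazy initializer would keep CosyVoice construction out of import scope; a sketch under the assumption that `CosyVoice` and `cosyvoice_model_path` are as defined in this file (this refactor is not part of the commit):

_cosyvoice = None

def get_cosyvoice():
    # Build the vocoder on first use, from inside a @spaces.GPU call,
    # so no CUDA work happens at import time.
    global _cosyvoice
    if _cosyvoice is None:
        _cosyvoice = CosyVoice(cosyvoice_model_path)
        _cosyvoice.eval().cuda()
    return _cosyvoice

Calling `token_list2wav(token_list, prompt_speech_data, wav_path, get_cosyvoice())` would then restore the commented-out synthesis path.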