Spaces:

ASLP-lab
/

OSUM-EChat

Running on Zero

App Files Community

xlgeng committed on Aug 21

Commit

aea4592

1 Parent(s): 13f013f

开始部署

Browse files

Files changed (1) hide show

app.py +36 -12

app.py CHANGED Viewed

@@ -4,6 +4,8 @@ import datetime
 import json
 import logging
 import os
 import spaces
 import gradio as gr
@@ -12,6 +14,7 @@ import time
 import traceback
 import torch
 from common_utils.utils4infer import get_feat_from_wav_path, load_model_and_tokenizer, token_list2wav
@@ -53,7 +56,7 @@ cosyvoice_model_path="./CosyVoice-300M-25Hz"
-device = torch.device("cuda")
 print("开始加载模型 A...")
 model_a, tokenizer_a = load_model_and_tokenizer(CHECKPOINT_PATH_A, CONFIG_PATH)
 model_a
@@ -131,6 +134,29 @@ def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt, task_choice,
     if input_wav_path is None and not input_prompt.endswith(("_TTS", "_T2T")):
         print("音频信息未输入，且不是T2S或T2T任务")
         return "错误：需要音频输入"
     # 通用初始化：模型设备设置
     start_time = time.time()
@@ -142,7 +168,7 @@ def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt, task_choice,
         if input_prompt.endswith("_TTS"):
             text_for_tts = input_prompt.replace("_TTS", "")
             # T2S推理逻辑
-            res_tensor = model_a.generate_tts(device=device, text=text_for_tts)[0]
             res_token_list = res_tensor.tolist()
             res_text = res_token_list[:-1]
             print(f"T2S 推理消耗时间: {time.time() - start_time:.2f} 秒")
@@ -151,16 +177,14 @@ def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt, task_choice,
         elif input_prompt.endswith("_self_prompt"):
             prompt = input_prompt.replace("_self_prompt", "")
             # S2T推理逻辑
-            feat, feat_lens = get_feat_from_wav_path(input_wav_path)
-            print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
-            if is_npu: torch_npu.npu.synchronize()
             res_text = model_a.generate(
                 wavs=feat,
                 wavs_len=feat_lens,
                 prompt=prompt,
                 cache_implementation="static"
             )[0]
-            if is_npu: torch_npu.npu.synchronize()
             print(f"S2T 推理消耗时间: {time.time() - start_time:.2f} 秒")
         # 3. 处理T2T任务
@@ -170,7 +194,7 @@ def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt, task_choice,
             print(f'开始t2t推理, question_txt: {question_txt}')
             if is_npu: torch_npu.npu.synchronize()
             res_text = model_a.generate_text2text(
-                device=device,
                 text=question_txt
             )[0]
             if is_npu: torch_npu.npu.synchronize()
@@ -181,7 +205,7 @@ def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt, task_choice,
                               "请推断对这段语音回答时的情感，标注情感类型，撰写流畅自然的聊天回复，并生成情感语音token。",
                               "s2s_no_think"]:
             # S2S推理逻辑
-            feat, feat_lens = get_feat_from_wav_path(input_wav_path)
             print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
             if is_npu: torch_npu.npu.synchronize()
             output_text, text_res, speech_res = model_a.generate_s2s_no_stream_with_repetition_penalty(
@@ -195,7 +219,7 @@ def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt, task_choice,
         # 5. 处理S2S有思考任务
         elif input_prompt == "THINK":
             # S2S带思考推理逻辑
-            feat, feat_lens = get_feat_from_wav_path(input_wav_path)
             print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
             if is_npu: torch_npu.npu.synchronize()
             output_text, text_res, speech_res = model_a.generate_s2s_no_stream_think_with_repetition_penalty(
@@ -209,7 +233,7 @@ def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt, task_choice,
         # 6. 处理S2T4Chat无思考任务
         elif input_prompt == "s2t_no_think":
             # S2T4Chat推理逻辑
-            feat, feat_lens = get_feat_from_wav_path(input_wav_path)
             print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
             if is_npu: torch_npu.npu.synchronize()
             res_text = model_a.generate4chat(
@@ -223,7 +247,7 @@ def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt, task_choice,
         # 7. 处理S2T4Chat有思考任务
         elif input_prompt == "s2t_think":
             # S2T4Chat带思考推理逻辑
-            feat, feat_lens = get_feat_from_wav_path(input_wav_path)
             print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
             if is_npu: torch_npu.npu.synchronize()
             res_text = model_a.generate4chat_think(
@@ -237,7 +261,7 @@ def true_decode_fuc(model, tokenizer, input_wav_path, input_prompt, task_choice,
         # 8. 处理默认S2T任务
         else:
             # 默认S2T推理逻辑
-            feat, feat_lens = get_feat_from_wav_path(input_wav_path)
             print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
             if is_npu: torch_npu.npu.synchronize()
             res_text = model_a.generate(

 import json
 import logging
 import os
+import librosa
 import spaces
 import gradio as gr
 import traceback
 import torch
+import torchaudio
 from common_utils.utils4infer import get_feat_from_wav_path, load_model_and_tokenizer, token_list2wav
 print("开始加载模型 A...")
 model_a, tokenizer_a = load_model_and_tokenizer(CHECKPOINT_PATH_A, CONFIG_PATH)
 model_a
     if input_wav_path is None and not input_prompt.endswith(("_TTS", "_T2T")):
         print("音频信息未输入，且不是T2S或T2T任务")
         return "错误：需要音频输入"
+    if input_wav_path is not None:
+        waveform, sample_rate = torchaudio.load(input_wav_path)
+        if waveform.shape[0] > 1:
+            waveform = torch.mean(waveform, dim=0, keepdim=True)
+        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
+        waveform = resampler(waveform)
+        waveform = waveform.squeeze(0)
+        window = torch.hann_window(400)
+        stft = torch.stft(waveform, 400, 160, window=window, return_complex=True)
+        magnitudes = stft[..., :-1].abs() ** 2
+        filters = torch.from_numpy(librosa.filters.mel(sr=sample_rate, n_fft=400, n_mels=80))
+        mel_spec = filters @ magnitudes
+        log_spec = torch.clamp(mel_spec, min=1e-10).log10()
+        log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
+        log_spec = (log_spec + 4.0) / 4.0
+        feat = log_spec.transpose(0, 1)
+        feat_lens = torch.tensor([feat.shape[0]], dtype=torch.int64).cuda()
+        feat = feat.unsqueeze(0).cuda()
+        feat = feat.to(torch.bfloat16)
+        print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
+    else:
+        feat = None
+        feat_lens = None
     # 通用初始化：模型设备设置
     start_time = time.time()
         if input_prompt.endswith("_TTS"):
             text_for_tts = input_prompt.replace("_TTS", "")
             # T2S推理逻辑
+            res_tensor = model_a.generate_tts(device=torch.device("cuda"), text=text_for_tts)[0]
             res_token_list = res_tensor.tolist()
             res_text = res_token_list[:-1]
             print(f"T2S 推理消耗时间: {time.time() - start_time:.2f} 秒")
         elif input_prompt.endswith("_self_prompt"):
             prompt = input_prompt.replace("_self_prompt", "")
             # S2T推理逻辑
+            # feat, feat_lens = get_feat_from_wav_path(input_wav_path)
+            # waveform, sample_rate = do_resample(input_wav_path)
             res_text = model_a.generate(
                 wavs=feat,
                 wavs_len=feat_lens,
                 prompt=prompt,
                 cache_implementation="static"
             )[0]
             print(f"S2T 推理消耗时间: {time.time() - start_time:.2f} 秒")
         # 3. 处理T2T任务
             print(f'开始t2t推理, question_txt: {question_txt}')
             if is_npu: torch_npu.npu.synchronize()
             res_text = model_a.generate_text2text(
+                device=torch.device("cuda"),
                 text=question_txt
             )[0]
             if is_npu: torch_npu.npu.synchronize()
                               "请推断对这段语音回答时的情感，标注情感类型，撰写流畅自然的聊天回复，并生成情感语音token。",
                               "s2s_no_think"]:
             # S2S推理逻辑
+            # feat, feat_lens = get_feat_from_wav_path(input_wav_path)
             print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
             if is_npu: torch_npu.npu.synchronize()
             output_text, text_res, speech_res = model_a.generate_s2s_no_stream_with_repetition_penalty(
         # 5. 处理S2S有思考任务
         elif input_prompt == "THINK":
             # S2S带思考推理逻辑
+            # feat, feat_lens = get_feat_from_wav_path(input_wav_path)
             print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
             if is_npu: torch_npu.npu.synchronize()
             output_text, text_res, speech_res = model_a.generate_s2s_no_stream_think_with_repetition_penalty(
         # 6. 处理S2T4Chat无思考任务
         elif input_prompt == "s2t_no_think":
             # S2T4Chat推理逻辑
+            # feat, feat_lens = get_feat_from_wav_path(input_wav_path)
             print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
             if is_npu: torch_npu.npu.synchronize()
             res_text = model_a.generate4chat(
         # 7. 处理S2T4Chat有思考任务
         elif input_prompt == "s2t_think":
             # S2T4Chat带思考推理逻辑
+            # feat, feat_lens = get_feat_from_wav_path(input_wav_path)
             print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
             if is_npu: torch_npu.npu.synchronize()
             res_text = model_a.generate4chat_think(
         # 8. 处理默认S2T任务
         else:
             # 默认S2T推理逻辑
+            # feat, feat_lens = get_feat_from_wav_path(input_wav_path)
             print(f'feat shape: {feat.shape}, feat_lens: {feat_lens}')
             if is_npu: torch_npu.npu.synchronize()
             res_text = model_a.generate(