Spaces:
Running
Running
| from typing import Union | |
| from fastapi import HTTPException | |
| from pydantic import BaseModel | |
| from modules.api import utils as api_utils | |
| from modules.api.Api import APIManager | |
| from modules.api.impl.handler.SSMLHandler import SSMLHandler | |
| from modules.api.impl.handler.TTSHandler import TTSHandler | |
| from modules.api.impl.model.audio_model import AdjustConfig, AudioFormat | |
| from modules.api.impl.model.chattts_model import ChatTTSConfig, InferConfig | |
| from modules.api.impl.model.enhancer_model import EnhancerConfig | |
| from modules.speaker import Speaker, speaker_mgr | |
class SynthesisInput(BaseModel):
    # Content to synthesize. The endpoint handler uses `text` when it is
    # non-empty and otherwise falls back to `ssml`.
    # (Plain comments on purpose: a class docstring on a pydantic model is
    # published as the schema description.)

    # Plain-text input; optional, defaults to None.
    text: Union[str, None] = None
    # SSML input; optional, defaults to None.
    ssml: Union[str, None] = None
class VoiceSelectionParams(BaseModel):
    # Voice selection and sampling parameters of a synthesis request.
    # (Plain comments on purpose: a class docstring on a pydantic model is
    # published as the schema description.)

    # Language tag; not forwarded to any handler by the endpoint.
    languageCode: str = "ZH-CN"
    # Speaker name; the endpoint answers 422 when speaker_mgr does not know it.
    name: str = "female2"
    # Style, resolved together with `name` by api_utils.calc_spk_style.
    style: str = ""
    # Sampling parameters handed to ChatTTSConfig.
    temperature: float = 0.3
    topP: float = 0.7
    topK: int = 20
    # Inference seed handed to InferConfig.
    seed: int = 42
    # End-of-sentence token handed to InferConfig.
    eos: str = "[uv_break]"
class AudioConfig(BaseModel):
    # Output encoding, post-processing and batching options of a request.
    # (Plain comments on purpose: a class docstring on a pydantic model is
    # published as the schema description.)

    # Encoding of the returned audio payload.
    audioEncoding: AudioFormat = AudioFormat.mp3
    # Post-processing values handed to AdjustConfig.
    speakingRate: float = 1
    pitch: float = 0
    volumeGainDb: float = 0
    # Accepted by the schema; the endpoint does not forward it to any config.
    sampleRateHertz: int = 24000
    # Batching / text-splitting values handed to InferConfig.
    batchSize: int = 4
    spliterThreshold: int = 100
class GoogleTextSynthesizeRequest(BaseModel):
    # Request body of the Google-compatible synthesize endpoint.
    # (Plain comments on purpose: a class docstring on a pydantic model is
    # published as the schema description.)

    # Required sections of the request.
    input: SynthesisInput
    voice: VoiceSelectionParams
    audioConfig: AudioConfig
    # Optional enhancer settings. The annotation is explicitly nullable so the
    # `None` default matches the declared type and an explicit JSON `null` is
    # accepted as well as an omitted field.
    enhancerConfig: Union[EnhancerConfig, None] = None
class GoogleTextSynthesizeResponse(BaseModel):
    # Response body of the synthesize endpoint.
    # (Plain comments on purpose: a class docstring on a pydantic model is
    # published as the schema description.)

    # The handler fills this with a "data:<mime>;base64,<payload>" string.
    audioContent: str
async def google_text_synthesize(request: GoogleTextSynthesizeRequest):
    """Synthesize speech for a Google-TTS-style request.

    Plain text (``input.text``) takes precedence; SSML (``input.ssml``) is
    only used when no text is given.

    Args:
        request: The parsed request body.

    Returns:
        A dict ``{"audioContent": "data:<mime>;base64,<payload>"}``.

    Raises:
        HTTPException: 422 when the voice name is not a known speaker or when
            neither text nor SSML is provided; 500 (with the original error
            chained as the cause) when synthesis fails for any other reason.
    """
    import logging

    synthesis_input = request.input
    voice = request.voice
    audio_config = request.audioConfig
    enhancer_config = request.enhancerConfig

    # TODO: voice.languageCode is not used yet; it should probably be passed
    # to the normalizer.
    # TODO: audioConfig.sampleRateHertz is accepted but not applied yet.

    # NOTE(review): the `or` fallbacks below also replace explicit falsy
    # values (seed 0 becomes 42, batchSize 0 becomes 1 although the schema
    # default is 4) - confirm this is intended.
    voice_name = voice.name
    infer_seed = voice.seed or 42
    eos = voice.eos or "[uv_break]"

    audio_format = audio_config.audioEncoding
    if not isinstance(audio_format, AudioFormat) and isinstance(audio_format, str):
        audio_format = AudioFormat(audio_format)

    speaking_rate = audio_config.speakingRate or 1
    pitch = audio_config.pitch or 0
    volume_gain_db = audio_config.volumeGainDb or 0
    batch_size = audio_config.batchSize or 1
    spliter_threshold = audio_config.spliterThreshold or 100

    params = api_utils.calc_spk_style(spk=voice.name, style=voice.style)
    speaker = params.get("spk")

    # calc_spk_style can also parse the seed form, but this endpoint only
    # supports speakers that exist in the speaker list.
    if speaker_mgr.get_speaker(voice_name) is None or not isinstance(
        speaker, Speaker
    ):
        raise HTTPException(
            status_code=422, detail="The specified voice name is not supported."
        )

    tts_config = ChatTTSConfig(
        style=params.get("style", ""),
        temperature=voice.temperature,
        top_k=voice.topK,
        top_p=voice.topP,
    )
    infer_config = InferConfig(
        batch_size=batch_size,
        spliter_threshold=spliter_threshold,
        eos=eos,
        seed=infer_seed,
    )
    adjust_config = AdjustConfig(
        speaking_rate=speaking_rate,
        pitch=pitch,
        volume_gain_db=volume_gain_db,
    )

    # mp3 is the one format whose MIME subtype differs from the enum value.
    if audio_format == AudioFormat.mp3:
        mime_type = "audio/mpeg"
    else:
        mime_type = f"audio/{audio_format.value}"

    try:
        if synthesis_input.text:
            handler = TTSHandler(
                text_content=synthesis_input.text,
                spk=speaker,
                tts_config=tts_config,
                infer_config=infer_config,
                adjust_config=adjust_config,
                enhancer_config=enhancer_config,
            )
        elif synthesis_input.ssml:
            # The SSML handler is not given the speaker or the TTS config.
            handler = SSMLHandler(
                ssml_content=synthesis_input.ssml,
                infer_config=infer_config,
                adjust_config=adjust_config,
                enhancer_config=enhancer_config,
            )
        else:
            raise HTTPException(
                status_code=422, detail="Invalid input text or ssml specified."
            )
        base64_string = handler.enqueue_to_base64(format=audio_format)
    except HTTPException as e:
        # Deliberate HTTP errors are logged and passed through unchanged.
        logging.exception(e)
        raise
    except Exception as e:
        # Anything else becomes a 500; keep the original error as the cause.
        logging.exception(e)
        raise HTTPException(status_code=500, detail=str(e)) from e

    return {"audioContent": f"data:{mime_type};base64,{base64_string}"}
def setup(app: APIManager):
    """Register ``google_text_synthesize`` on *app* at ``/v1/text:synthesize``.

    The route is registered through ``app.post`` with
    ``GoogleTextSynthesizeResponse`` as its response model.

    Args:
        app: The API manager the route is added to.
    """
    # Markdown description passed to the route. The voice.name line states
    # that the name must be an existing speaker, because the handler rejects
    # every name that is not in the speaker list (speaker seeds included).
    description = "\n".join(
        [
            "",
            "google api document: <br/>",
            "[https://cloud.google.com/text-to-speech/docs/reference/rest/v1/text/synthesize](https://cloud.google.com/text-to-speech/docs/reference/rest/v1/text/synthesize)",
            "- 多个属性在本系统中无用仅仅是为了兼容google api",
            "- voice 中的 topP, topK, temperature 为本系统中的参数",
            "- voice.name 即 speaker name (必须存在于 speakers 列表中,不支持 speaker seed)",
            "- voice.seed 为 infer seed (可在webui中测试具体作用)",
            "- 编码格式影响的是 audioContent 的二进制格式,所以所有format都是返回带有base64数据的json",
            "",
        ]
    )

    app.post(
        "/v1/text:synthesize",
        response_model=GoogleTextSynthesizeResponse,
        description=description,
    )(google_text_synthesize)