Spaces:
Running
Running
| import numpy as np | |
| from fastapi import HTTPException | |
| from modules.api.impl.handler.AudioHandler import AudioHandler | |
| from modules.api.impl.model.audio_model import AdjustConfig | |
| from modules.api.impl.model.chattts_model import InferConfig | |
| from modules.api.impl.model.enhancer_model import EnhancerConfig | |
| from modules.Enhancer.ResembleEnhance import apply_audio_enhance_full | |
| from modules.normalization import text_normalize | |
| from modules.ssml_parser.SSMLParser import create_ssml_parser | |
| from modules.SynthesizeSegments import SynthesizeSegments, combine_audio_segments | |
| from modules.utils import audio | |
class SSMLHandler(AudioHandler):
    """Audio handler that turns an SSML document into a single audio clip.

    ``enqueue`` parses the SSML into segments, normalizes each segment's
    text, synthesizes and joins the segments, then optionally runs the
    enhancer, applies the prosody adjustments and optionally normalizes the
    final signal.
    """

    def __init__(
        self,
        ssml_content: str,
        infer_config: InferConfig,
        adjust_config: AdjustConfig,
        enhancer_config: EnhancerConfig,
    ) -> None:
        """Type-check and store the request parameters, then run ``validate``.

        Args:
            ssml_content: Raw SSML markup to synthesize.
            infer_config: Supplies ``batch_size``, ``eos`` and
                ``spliter_threshold`` for segment synthesis.
            adjust_config: Supplies ``speed_rate``, ``pitch``,
                ``volume_gain_db``, ``normalize`` and ``headroom`` for
                post-processing.
            enhancer_config: Supplies ``enabled``, ``nfe``, ``solver``,
                ``lambd`` and ``tau`` for the optional enhancement pass.

        Raises:
            AssertionError: If an argument is not of the expected type.
        """
        assert isinstance(ssml_content, str), "ssml_content must be a string."
        assert isinstance(
            infer_config, InferConfig
        ), "infer_config must be an InferConfig object."
        assert isinstance(
            adjust_config, AdjustConfig
        ), "adjust_config must be an AdjustConfig object."
        assert isinstance(
            enhancer_config, EnhancerConfig
        ), "enhancer_config must be an EnhancerConfig object."

        self.ssml_content = ssml_content
        self.infer_config = infer_config
        # NOTE: the attribute name is misspelled ("adjest"); it is kept as-is
        # because code outside this class may already read it.
        self.adjest_config = adjust_config
        self.enhancer_config = enhancer_config

        self.validate()

    def validate(self):
        """Hook for parameter validation; currently performs no checks."""
        # TODO params checker
        pass

    def enqueue(self) -> tuple[np.ndarray, int]:
        """Synthesize the SSML content and return the processed audio.

        Returns:
            A ``(audio_data, sample_rate)`` tuple.

        Raises:
            HTTPException: With status 422 when parsing yields no segments.
        """
        infer_config = self.infer_config
        adjust_config = self.adjest_config
        enhancer_config = self.enhancer_config

        segments = create_ssml_parser().parse(self.ssml_content)

        # Fail fast before doing any per-segment work. The truthiness test
        # also rejects a missing (None) parse result with the same 422.
        if not segments:
            raise HTTPException(
                status_code=422, detail="The SSML text is empty or parsing failed."
            )

        for seg in segments:
            seg["text"] = text_normalize(seg["text"], is_end=True)

        synthesize = SynthesizeSegments(
            batch_size=infer_config.batch_size,
            eos=infer_config.eos,
            spliter_thr=infer_config.spliter_threshold,
        )
        audio_segments = synthesize.synthesize_segments(segments)
        combined_audio = combine_audio_segments(audio_segments)

        # NOTE: the helpers are unpacked in different orders. pydub_to_np and
        # apply_normalize are read as (sample_rate, audio_data), whereas
        # apply_audio_enhance_full is read as (audio_data, sample_rate).
        sample_rate, audio_data = audio.pydub_to_np(combined_audio)

        if enhancer_config.enabled:
            # The sample rate is re-read from the enhancer result and used
            # for every later processing step.
            audio_data, sample_rate = apply_audio_enhance_full(
                audio_data=audio_data,
                sr=sample_rate,
                nfe=enhancer_config.nfe,
                solver=enhancer_config.solver,
                lambd=enhancer_config.lambd,
                tau=enhancer_config.tau,
            )

        audio_data = audio.apply_prosody_to_audio_data(
            audio_data=audio_data,
            rate=adjust_config.speed_rate,
            pitch=adjust_config.pitch,
            volume=adjust_config.volume_gain_db,
            sr=sample_rate,
        )

        if adjust_config.normalize:
            sample_rate, audio_data = audio.apply_normalize(
                audio_data=audio_data, headroom=adjust_config.headroom, sr=sample_rate
            )

        return audio_data, sample_rate