Spaces:
Sleeping
Sleeping
| import numpy as np | |
def speak(text: str = Query(..., description="Text to convert to speech")):
    """Synthesize ``text`` to speech and return it as a WAV stream.

    The waveform produced by the model is peak-normalized, smoothed with a
    short moving average, and written to an in-memory WAV file with 32-bit
    float samples.

    Args:
        text: Text to convert to speech (required query parameter).

    Returns:
        A ``StreamingResponse`` with media type ``audio/wav`` wrapping the
        in-memory WAV data.
    """
    normalization_level = 0.1  # target peak amplitude; match your NORMALIZATION_LEVEL
    window_size = 3  # moving-average width in samples; like SMOOTHING_WINDOW
    # Rate written to the WAV header. NOTE(review): presumably the model's
    # native output rate -- confirm against the model/vocoder in use.
    sample_rate = 16000

    # Prepare input
    inputs = processor(text=text, return_tensors="pt")

    # Generate speech
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
    )
    audio = speech.numpy().astype(np.float32)

    # --- Normalize ---
    # np.max raises on an empty array, so treat "no samples" like silence.
    peak = np.max(np.abs(audio)) if audio.size else 0.0
    if peak > 0:
        audio = (audio / peak) * normalization_level

    # --- Smooth (moving average) ---
    # Skip when the clip is shorter than the window: the windowed result
    # would be empty and edge-padding an empty array raises ValueError.
    if window_size > 1 and audio.size >= window_size:
        # Accumulate in float64: a float32 running sum loses precision as the
        # clip gets longer, and that error would leak into every output sample.
        cumsum = np.cumsum(np.insert(audio, 0, 0), dtype=np.float64)
        smoothed = (cumsum[window_size:] - cumsum[:-window_size]) / window_size
        # Pad back to the original length (window_size - 1 samples were lost).
        pad_left = window_size // 2
        pad_right = window_size - 1 - pad_left
        audio = np.pad(smoothed, (pad_left, pad_right), mode="edge").astype(
            np.float32
        )

    # Write WAV as 32-bit float
    buf = io.BytesIO()
    sf.write(buf, audio, samplerate=sample_rate, format="WAV", subtype="FLOAT")
    buf.seek(0)
    return StreamingResponse(buf, media_type="audio/wav")