text2speech / app.py
anuj-exe's picture
Update app.py
55ee0c6 verified
raw
history blame
1.06 kB
import numpy as np
# Sample rate written into the WAV header.
# NOTE(review): presumably the vocoder's native output rate — confirm.
_SAMPLE_RATE_HZ = 16000


def _normalize_peak(audio: np.ndarray, level: float = 0.1) -> np.ndarray:
    """Scale *audio* so that its largest absolute sample equals *level*.

    Empty or all-zero input is returned unchanged, which avoids both the
    ``np.max`` error on an empty array and a division by zero.
    """
    if audio.size == 0:
        return audio
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = (audio / peak) * level
    return audio


def _moving_average(audio: np.ndarray, window_size: int = 3) -> np.ndarray:
    """Smooth *audio* with a moving average over *window_size* samples.

    The averaged region is edge-padded on both sides so the result has the
    same number of samples as the input. Input with fewer samples than the
    window (including empty input) is returned unchanged, because there is
    no complete window to average and edge-padding an empty array raises.
    """
    if window_size <= 1 or audio.size < window_size:
        return audio
    # Accumulate in float64: a float32 running sum over a long signal loses
    # precision, and subtracting two large prefixes amplifies that error.
    prefix = np.cumsum(np.insert(audio, 0, 0), dtype=np.float64)
    smoothed = (prefix[window_size:] - prefix[:-window_size]) / window_size
    # pad to original length
    left = window_size // 2
    right = window_size - 1 - left
    return np.pad(smoothed, (left, right), mode="edge").astype(np.float32)


@app.get("/speak")
def speak(text: str = Query(..., description="Text to convert to speech")):
    """Synthesize *text* and stream it back as a 32-bit float WAV file.

    The generated waveform is peak-normalized and lightly smoothed before
    being encoded.

    NOTE: this handler relies on ``app``, ``processor``, ``model``,
    ``speaker_embeddings``, ``vocoder``, ``io``, ``sf``, ``Query`` and
    ``StreamingResponse`` being defined at module level.

    Args:
        text: Text to convert to speech (required query parameter).

    Returns:
        A ``StreamingResponse`` with media type ``audio/wav``.
    """
    # Prepare input
    inputs = processor(text=text, return_tensors="pt")

    # Generate speech
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
    )
    # assumes `speech` is a CPU tensor that does not require grad — TODO confirm
    audio = speech.numpy().astype(np.float32)

    audio = _normalize_peak(audio)
    audio = _moving_average(audio)

    # Write WAV as 32-bit float
    buf = io.BytesIO()
    sf.write(buf, audio, samplerate=_SAMPLE_RATE_HZ, format="WAV", subtype="FLOAT")
    buf.seek(0)  # rewind so the response streams from the start of the file
    return StreamingResponse(buf, media_type="audio/wav")