# text2speech / app.py
# Page header captured along with the file: author avatar "anuj-exe's picture",
# latest commit "Update app.py" (6114331, verified).
from fastapi import FastAPI, Query
from fastapi.responses import StreamingResponse
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import torch
import io
import soundfile as sf
import requests
import numpy as np
# Hugging Face Hub checkpoint ids used by this service.
_TTS_CHECKPOINT = "microsoft/speecht5_tts"
_VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"

# ASGI application object; route decorators further down attach to it.
app = FastAPI(title="SpeechT5 TTS API")

# Everything below is instantiated a single time when the module is imported,
# so request handlers reuse the same processor, acoustic model and vocoder.
processor = SpeechT5Processor.from_pretrained(_TTS_CHECKPOINT)
model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT)
vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_CHECKPOINT)
# Function to load a speaker embedding from a URL
def load_speaker_embedding(url: str, timeout: float = 30.0) -> torch.Tensor:
    """Download a raw float32 speaker embedding and return it as a tensor.

    Args:
        url: Location of a ``.bin`` file whose bytes are read as consecutive
            float32 values.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        A float32 tensor of shape ``(1, N)``, where ``N`` is the number of
        float32 values contained in the downloaded payload.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
        ValueError: If the payload is empty or its size is not a whole number
            of 4-byte float32 values.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    payload = response.content
    if not payload or len(payload) % 4 != 0:
        raise ValueError(
            f"Speaker embedding at {url!r} has {len(payload)} bytes; "
            "expected a non-empty multiple of 4 (float32)."
        )

    # Load the .bin file as a float32 tensor. torch.frombuffer shares memory
    # with the object it is given, and `bytes` is immutable, so hand it a
    # private writable copy instead of the response body itself.
    embedding = torch.frombuffer(bytearray(payload), dtype=torch.float32)
    return embedding.unsqueeze(0)  # Add batch dimension
# Voice used for every request: the "US female 1" x-vector, fetched once at
# import time and kept at module level.
_DEFAULT_SPEAKER_URL = "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_slt_arctic-wav-arctic_a0001.bin"
speaker_embeddings = load_speaker_embedding(_DEFAULT_SPEAKER_URL)
def smooth_audio(audio: np.ndarray, window_size: int = 3) -> np.ndarray:
    """Smooth a signal with a moving average, keeping its length.

    Args:
        audio: Samples to smooth. Multi-dimensional input is flattened before
            averaging.
        window_size: Number of consecutive samples averaged per output value.
            Values below 2 disable smoothing.

    Returns:
        A 1-D array with as many elements as ``audio``. Positions where a full
        window does not fit are filled by repeating the nearest fully averaged
        value. Floating-point (and complex) input keeps its dtype; other
        dtypes come back as float64. When ``window_size < 2`` or the signal
        has fewer samples than ``window_size``, ``audio`` is returned
        unchanged.
    """
    if window_size < 2:
        return audio

    samples = np.asarray(audio).ravel()
    if samples.size < window_size:
        # No complete window exists, so there is nothing to average and
        # edge-padding an empty result would raise.
        return audio

    # Accumulate in double precision: subtracting two large single-precision
    # prefix sums loses most of the significant digits on long signals.
    acc_dtype = np.result_type(samples.dtype, np.float64)
    out_dtype = samples.dtype if np.issubdtype(samples.dtype, np.inexact) else acc_dtype

    # Prefix sums with a leading zero, so window sums are simple differences.
    cumsum = np.concatenate(([0], np.cumsum(samples, dtype=acc_dtype)))
    smoothed = (cumsum[window_size:] - cumsum[:-window_size]) / window_size

    # pad to original length
    pad_left = window_size // 2
    pad_right = window_size - 1 - pad_left
    smoothed = np.pad(smoothed, (pad_left, pad_right), mode="edge")
    return smoothed.astype(out_dtype, copy=False)
@app.get("/speak")
def speak(text: str = Query(..., description="Text to convert to speech")):
    """
    Convert text to speech using SpeechT5 + HiFi-GAN.
    Returns a WAV audio stream.
    """
    # NOTE: the docstring above is kept word for word because FastAPI exposes
    # a route's docstring as its description in the generated API docs.

    # Tokenize the request text into PyTorch tensors.
    encoded = processor(text=text, return_tensors="pt")
    token_ids = encoded["input_ids"]

    # Synthesize with the module-level speaker embedding and vocoder, then
    # move the result into a float32 NumPy array for post-processing.
    waveform = model.generate_speech(token_ids, speaker_embeddings, vocoder=vocoder)
    samples = waveform.numpy().astype(np.float32)

    # Rescale so the loudest sample has magnitude 0.1 (adjustable level).
    # An all-zero signal is left untouched to avoid dividing by zero.
    loudest = np.abs(samples).max()
    if loudest > 0:
        samples = (samples / loudest) * 0.1

    # Light 3-sample moving average over the normalized signal.
    samples = smooth_audio(samples, window_size=3)

    # Encode as a 32-bit float WAV at 16 kHz into an in-memory buffer and
    # rewind it so the response streams from the first byte.
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, samples, samplerate=16000, format="WAV", subtype="FLOAT")
    wav_buffer.seek(0)
    return StreamingResponse(wav_buffer, media_type="audio/wav")