# text2speech/otherApp.py: SpeechT5 text-to-speech FastAPI service
from fastapi import FastAPI, Query
from fastapi.responses import StreamingResponse
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import torch
import io
import soundfile as sf
import requests
app = FastAPI(title="SpeechT5 TTS API")
# Load models once at startup
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
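
# Optional sketch (assumption: a CUDA-capable GPU is available): the models could be
# moved to GPU for faster synthesis, provided the inputs and speaker embeddings used
# in /speak are moved to the same device, e.g.:
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   model.to(device); vocoder.to(device)
# This file keeps everything on CPU for simplicity.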

# Helper: download a speaker x-vector embedding (.bin of raw float32 values) from a URL
def load_speaker_embedding(url: str) -> torch.Tensor:
    response = requests.get(url)
    response.raise_for_status()
    # Interpret the raw .bin payload as a float32 tensor
    # (copy into a bytearray so torch.frombuffer gets a writable buffer)
    embedding = torch.frombuffer(bytearray(response.content), dtype=torch.float32)
    return embedding.unsqueeze(0)  # add batch dimension -> shape (1, 512)

# Default voice: a US female speaker (CMU ARCTIC "slt")
speaker_embeddings = load_speaker_embedding(
    "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_slt_arctic-wav-arctic_a0001.bin"
)
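
# Sketch (assumption: the same dataset layout holds for the other CMU ARCTIC speakers):
# any other per-utterance .bin from that repo should work as a drop-in replacement,
# e.g. a male voice such as:
# speaker_embeddings = load_speaker_embedding(
#     "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_bdl_arctic-wav-arctic_a0001.bin"
# )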
@app.get("/speak")
def speak(text: str = Query(..., description="Text to convert to speech")):
"""
Convert text to speech using SpeechT5 + HiFi-GAN.
Returns a WAV audio stream.
"""
# Prepare input
inputs = processor(text=text, return_tensors="pt")
# Generate speech
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
# Convert to bytes buffer
buf = io.BytesIO()
sf.write(buf, speech.numpy(), samplerate=16000, format="WAV")
buf.seek(0)
return StreamingResponse(buf, media_type="audio/wav")
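
# Minimal way to run the service locally (assumes the `uvicorn` package is installed;
# port 7860 follows the Hugging Face Spaces convention and is an assumption here):
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)

# Example request once the server is up (hypothetical local URL):
#   curl -G "http://localhost:7860/speak" --data-urlencode "text=Hello world" -o hello.wav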