text2speech / app.py
anuj-exe's picture
Create app.py
4f0a2be verified
raw
history blame
887 Bytes
import io

import soundfile as sf
import torch
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from transformers import (
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
)
app = FastAPI()

# Hugging Face Hub checkpoints: the text-to-spectrogram model and the
# HiFi-GAN vocoder that turns its spectrograms into audio samples.
TTS_CHECKPOINT = "microsoft/speecht5_tts"
VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"

# Sample rate (Hz) written into the WAV header.
SAMPLE_RATE = 16000

# Load processor, model and vocoder once at import time so that every
# request reuses the same instances instead of reloading weights.
processor = SpeechT5Processor.from_pretrained(TTS_CHECKPOINT)
model = SpeechT5ForTextToSpeech.from_pretrained(TTS_CHECKPOINT)
vocoder = SpeechT5HifiGan.from_pretrained(VOCODER_CHECKPOINT)

# Dummy speaker embedding (flat voice). You can later replace with real embeddings.
speaker_embeddings = torch.zeros((1, 512))


@app.get("/speak")
def speak(text: str):
    """Synthesize ``text`` to speech and return it as a WAV response.

    Args:
        text: The sentence to speak, taken from the ``text`` query parameter.

    Returns:
        A ``StreamingResponse`` with media type ``audio/wav`` whose body is
        the in-memory WAV encoding of the generated waveform.

    Raises:
        HTTPException: With status 400 when ``text`` is empty or contains
            only whitespace.
    """
    if not text.strip():
        raise HTTPException(status_code=400, detail="text must not be empty")

    inputs = processor(text=text, return_tensors="pt")
    # Clip to the model's text position limit; longer sequences would fail
    # inside the encoder instead of producing (truncated) audio.
    input_ids = inputs["input_ids"][..., : model.config.max_text_positions]

    # Without a vocoder, generate_speech returns a mel spectrogram rather
    # than audio samples, so the vocoder must be passed explicitly.
    speech = model.generate_speech(input_ids, speaker_embeddings, vocoder=vocoder)

    # Encode the waveform as WAV entirely in memory (no temp files).
    buf = io.BytesIO()
    sf.write(buf, speech.detach().cpu().numpy(), SAMPLE_RATE, format="WAV")
    buf.seek(0)  # rewind so the response streams from the first byte
    return StreamingResponse(buf, media_type="audio/wav")