File size: 1,654 Bytes
3075963
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import io

import requests
import soundfile as sf
import torch
from fastapi import FastAPI, HTTPException, Query
from fastapi.responses import StreamingResponse
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

app = FastAPI(title="SpeechT5 TTS API")

# Load models once at startup
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")


# Function to load a speaker embedding from a URL
def load_speaker_embedding(url: str) -> torch.Tensor:
    response = requests.get(url)
    response.raise_for_status()
    # Load the .bin file as a float32 tensor
    embedding = torch.frombuffer(response.content, dtype=torch.float32)
    return embedding.unsqueeze(0)  # Add batch dimension


# Example: load US female 1
speaker_embeddings = load_speaker_embedding(
    "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_slt_arctic-wav-arctic_a0001.bin"
)


@app.get("/speak")
def speak(text: str = Query(..., description="Text to convert to speech")):
    """
    Convert text to speech using SpeechT5 + HiFi-GAN.
    Returns a WAV audio stream.
    """
    # Prepare input
    inputs = processor(text=text, return_tensors="pt")

    # Generate speech
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # Convert to bytes buffer
    buf = io.BytesIO()
    sf.write(buf, speech.numpy(), samplerate=16000, format="WAV")
    buf.seek(0)

    return StreamingResponse(buf, media_type="audio/wav")