File size: 2,440 Bytes
6114331
 
 
 
 
 
 
55ee0c6
aead49d
6114331
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c567be8
55d983c
6114331
 
 
 
55d983c
 
245735d
55d983c
 
55ee0c6
 
 
 
 
6114331
55ee0c6
6114331
 
55ee0c6
6114331
e37d2df
55ee0c6
e37d2df
c567be8
e37d2df
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from fastapi import FastAPI, Query
from fastapi.responses import StreamingResponse
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import torch
import io
import soundfile as sf
import requests
import numpy as np

# Application object; the `/speak` route defined further down registers on it.
app = FastAPI(title="SpeechT5 TTS API")

# Load models once at startup (module import time) so that every request
# reuses the same processor, text-to-speech model and vocoder objects.
# NOTE(review): `from_pretrained` presumably downloads or reads a local cache
# of these checkpoints on first use — confirm the deployment environment
# provides network access or a pre-populated cache.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")


# Function to load a speaker embedding from a URL
def load_speaker_embedding(url: str, timeout: float = 30.0) -> torch.Tensor:
    """Download a raw float32 speaker embedding and return it as a tensor.

    The payload at ``url`` is interpreted as a flat sequence of 32-bit floats
    with no header.

    Args:
        url: Location of the ``.bin`` embedding file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        A float32 tensor of shape ``(1, n)`` where ``n`` is the number of
        floats in the payload (a leading batch dimension is added).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
        ValueError: If the payload is empty or not a whole number of
            float32 values.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # ``response.content`` is immutable ``bytes``. ``torch.frombuffer`` shares
    # memory with its argument and warns when that memory is read-only, so
    # copy into a writable ``bytearray`` that the tensor can safely own.
    raw = bytearray(response.content)
    if not raw or len(raw) % 4:
        raise ValueError(
            f"Speaker embedding payload has {len(raw)} bytes; expected a "
            "non-empty multiple of 4 (float32)"
        )

    # Load the .bin file as a float32 tensor
    embedding = torch.frombuffer(raw, dtype=torch.float32)
    return embedding.unsqueeze(0)  # Add batch dimension


# Example: load US female 1
# Loaded once at import time and passed to the model on every `/speak`
# request, so all responses are synthesized with this one embedding.
# NOTE(review): this is a network fetch during module import; if it fails,
# `load_speaker_embedding` raises and the module does not finish loading.
speaker_embeddings = load_speaker_embedding(
    "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_slt_arctic-wav-arctic_a0001.bin"
)


def smooth_audio(audio: np.ndarray, window_size: int = 3) -> np.ndarray:
    """Smooth a signal with a centered moving average.

    Each output sample is the mean of ``window_size`` consecutive input
    samples. The positions at both ends, where a full window does not fit,
    repeat the nearest fully averaged value so the output has the same
    length as the (flattened) input.

    Args:
        audio: Input samples. Multi-dimensional input is flattened.
        window_size: Number of samples per average. Values below 2 disable
            smoothing.

    Returns:
        The smoothed samples. Floating-point input keeps its dtype; integer
        input yields float64. The input is returned unchanged when
        ``window_size`` is below 2 or when there are fewer samples than
        ``window_size`` (no full window exists to average).
    """
    if window_size < 2:
        return audio

    samples = np.asarray(audio).ravel()
    if samples.size < window_size:
        # No complete window fits; edge-padding an empty result would raise.
        return audio

    # Accumulate in (at least) float64: a float32 running sum over a long
    # clip loses low-order bits, and the windowed difference below would
    # turn that loss into audible quantization error.
    acc_dtype = np.result_type(samples.dtype, np.float64)
    cumsum = np.cumsum(np.insert(samples, 0, 0), dtype=acc_dtype)
    smoothed = (cumsum[window_size:] - cumsum[:-window_size]) / window_size

    # pad to original length
    pad_left = window_size // 2
    pad_right = window_size - 1 - pad_left
    smoothed = np.pad(smoothed, (pad_left, pad_right), mode="edge")

    # Hand floating-point callers back the dtype they passed in.
    if np.issubdtype(samples.dtype, np.inexact):
        smoothed = smoothed.astype(samples.dtype, copy=False)
    return smoothed


@app.get("/speak")
def speak(text: str = Query(..., description="Text to convert to speech")):
    """
    Convert text to speech using SpeechT5 + HiFi-GAN.
    Returns a WAV audio stream.
    """
    # Tokenize the request text into PyTorch tensors.
    encoded = processor(text=text, return_tensors="pt")

    # Synthesize a waveform with the module-level speaker embedding, then
    # bring it into NumPy as float32 for the post-processing steps below.
    waveform = model.generate_speech(
        encoded["input_ids"],
        speaker_embeddings,
        vocoder=vocoder,
    )
    samples = waveform.numpy().astype(np.float32)

    # Peak-normalize to a fixed level; an all-zero clip is left untouched so
    # there is no division by zero.
    loudest = np.max(np.abs(samples))
    if loudest > 0:
        samples = (samples / loudest) * 0.1  # Adjustable normalization level

    # Light moving-average smoothing over three samples.
    samples = smooth_audio(samples, window_size=3)

    # Encode as a 16 kHz, 32-bit float WAV held entirely in memory.
    wav_bytes = io.BytesIO()
    sf.write(
        wav_bytes,
        samples,
        samplerate=16000,
        format="WAV",
        subtype="FLOAT",
    )
    wav_bytes.seek(0)  # rewind so the response starts at the first byte

    return StreamingResponse(wav_bytes, media_type="audio/wav")