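"""FastAPI service exposing Microsoft's SpeechT5 text-to-speech model.

GET /speak streams a WAV rendering of the given text, voiced with one of
eight x-vector speaker embeddings hosted on the Hugging Face Hub.
"""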
from fastapi import FastAPI, Query
from fastapi.responses import StreamingResponse
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import torch
import io
import soundfile as sf
import requests
import numpy as np
from functools import lru_cache

app = FastAPI(title="SpeechT5 TTS API")

# Adjustable parameters
NORMALIZATION_LEVEL = 0.1  # target peak amplitude after normalization (1.0 = full scale)
SMOOTHING_WINDOW = 7       # base moving-average window, in samples
BIT_DEPTH = "32f"          # "16" = 16-bit PCM WAV, "32f" = 32-bit float WAV

# Load models once at startup
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Map integer to speaker embedding URL
SPEAKER_EMBEDDINGS = {
    0: "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin",  # Normal
    1: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_slt_arctic-wav-arctic_a0001.bin",  # US female 1
    2: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_clb_arctic-wav-arctic_a0001.bin",  # US female 2
    3: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_bdl_arctic-wav-arctic_a0003.bin",  # US male 1
    4: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_rms_arctic-wav-arctic_a0003.bin",  # US male 2
    5: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_jmk_arctic-wav-arctic_a0002.bin",  # Canadian male
    6: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_awb_arctic-wav-arctic_b0002.bin",  # Scottish male
    7: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_ksp_arctic-wav-arctic_a0007.bin",  # Indian male
}


@lru_cache(maxsize=None)  # cache so each embedding is downloaded only once per process
def load_speaker_embedding(url: str) -> torch.Tensor:
    """Download a 512-dim speaker x-vector and return it as a (1, 512) tensor."""
    response = requests.get(url)
    response.raise_for_status()
    # Copy into a writable bytearray; torch.frombuffer warns on read-only bytes.
    embedding = torch.frombuffer(bytearray(response.content), dtype=torch.float32)
    return embedding.unsqueeze(0)


def smooth_audio(audio: np.ndarray, window_size: int) -> np.ndarray:
    """Centered moving average whose window shrinks at the clip edges."""
    half = window_size // 2
    if half == 0 or len(audio) <= 2 * half:
        return audio
    # Vectorized equivalent of averaging a (2 * half + 1)-sample window per sample,
    # dividing by the actual number of in-bounds samples near the edges.
    kernel = np.ones(2 * half + 1)
    sums = np.convolve(audio, kernel, mode="same")
    counts = np.convolve(np.ones(len(audio)), kernel, mode="same")
    return (sums / counts).astype(audio.dtype)


def apply_fade_out(audio: np.ndarray, fade_samples: int = 256) -> np.ndarray:
    """Linearly fade the last `fade_samples` samples to zero to avoid an end click."""
    if fade_samples <= 0:
        return audio  # guard: audio[-0:] selects the whole array, so 0 would fail
    fade = np.linspace(1.0, 0.0, fade_samples, dtype=audio.dtype)
    audio[-fade_samples:] *= fade
    return audio

@app.get("/speak")
def speak(
    text: str = Query(..., description="Text to convert to speech"),
    speaker: int = Query(1, ge=0, le=7, description="Speaker ID (0-7)")
):
    embedding_url = SPEAKER_EMBEDDINGS[speaker]
    speaker_embedding = load_speaker_embedding(embedding_url)

    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
    audio = speech.numpy().astype(np.float32)

    # --- Normalize ---
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = (audio / peak) * NORMALIZATION_LEVEL

    # --- Smooth audio ---
    audio = smooth_audio(audio, SMOOTHING_WINDOW * 2)  # doubled for a slightly larger window

    # --- Fade out to remove clicks at the end ---
    fade_samples = min(512, len(audio) // 10)  # 512 samples ≈ 32 ms at 16 kHz
    audio = apply_fade_out(audio, fade_samples)

    # --- Bit depth ---
    if BIT_DEPTH == "16":
        pcm = np.clip(np.round(audio * 32767), -32768, 32767).astype(np.int16)
    else:
        pcm = audio

    # --- Write WAV ---
    buf = io.BytesIO()
    subtype = "PCM_16" if BIT_DEPTH == "16" else "FLOAT"
    sf.write(buf, pcm, samplerate=16000, format="WAV", subtype=subtype)
    buf.seek(0)

    return StreamingResponse(buf, media_type="audio/wav")
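

if __name__ == "__main__":
    # Convenience launcher: a sketch assuming uvicorn is installed (how this file
    # is actually deployed is not shown here). Example request once running:
    #   curl "http://localhost:8000/speak?text=Hello%20world&speaker=3" -o hello.wav
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)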