from fastapi import FastAPI, Query
from fastapi.responses import StreamingResponse
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from functools import lru_cache
import torch
import io
import soundfile as sf
import requests
import numpy as np

app = FastAPI(title="SpeechT5 TTS API")

# Adjustable parameters
NORMALIZATION_LEVEL = 0.1  # peak amplitude after normalization (0.0-1.0)
SMOOTHING_WINDOW = 7       # base moving-average window, in samples
BIT_DEPTH = "32f"          # "16" for 16-bit PCM, anything else writes 32-bit float

# Load models once at startup
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Map speaker ID to x-vector speaker embedding URL
SPEAKER_EMBEDDINGS = {
    0: "https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin",  # Normal
    1: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_slt_arctic-wav-arctic_a0001.bin",  # US female 1
    2: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_clb_arctic-wav-arctic_a0001.bin",  # US female 2
    3: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_bdl_arctic-wav-arctic_a0003.bin",  # US male 1
    4: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_rms_arctic-wav-arctic_a0003.bin",  # US male 2
    5: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_jmk_arctic-wav-arctic_a0002.bin",  # Canadian male
    6: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_awb_arctic-wav-arctic_b0002.bin",  # Scottish male
    7: "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_ksp_arctic-wav-arctic_a0007.bin",  # Indian male
}


@lru_cache(maxsize=None)
def load_speaker_embedding(url: str) -> torch.Tensor:
    """Download a 512-dim x-vector and return it as a (1, 512) tensor.

    Cached so each embedding is fetched only once per process instead of
    on every request.
    """
    response = requests.get(url)
    response.raise_for_status()
    # Copy into a writable buffer; torch.frombuffer on the immutable
    # `bytes` object would emit a non-writable-tensor warning.
    data = np.frombuffer(response.content, dtype=np.float32).copy()
    return torch.from_numpy(data).unsqueeze(0)


def smooth_audio_nodejs(audio: np.ndarray, window_size: int) -> np.ndarray:
    """Centered moving-average filter (port of the Node.js smoothing routine)."""
    smoothed = np.copy(audio)
    half = window_size // 2
    for i in range(len(audio)):
        start = max(0, i - half)
        end = min(len(audio), i + half + 1)
        smoothed[i] = np.mean(audio[start:end])
    return smoothed


def apply_fade_out(audio: np.ndarray, fade_samples: int = 256) -> np.ndarray:
    """Apply a linear fade-out (in place) over the last `fade_samples` samples."""
    if fade_samples > 0:  # guard: very short clips can yield fade_samples == 0
        fade = np.linspace(1.0, 0.0, fade_samples, dtype=audio.dtype)
        audio[-fade_samples:] *= fade
    return audio


@app.get("/speak")
def speak(
    text: str = Query(..., description="Text to convert to speech"),
    speaker: int = Query(1, ge=0, le=7, description="Speaker ID (0-7)"),
):
    speaker_embedding = load_speaker_embedding(SPEAKER_EMBEDDINGS[speaker])

    inputs = processor(text=text, return_tensors="pt")
    with torch.no_grad():
        speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
    audio = speech.numpy().astype(np.float32)

    # --- Normalize to the configured peak level ---
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = (audio / peak) * NORMALIZATION_LEVEL

    # --- Smooth audio (slightly larger window than the base setting) ---
    audio = smooth_audio_nodejs(audio, SMOOTHING_WINDOW * 2)

    # --- Fade out to remove clicks at the end ---
    fade_samples = min(512, len(audio) // 10)  # 512 samples ~= 32 ms at 16 kHz
    audio = apply_fade_out(audio, fade_samples)

    # --- Bit depth ---
    if BIT_DEPTH == "16":
        pcm = np.clip(np.round(audio * 32767), -32768, 32767).astype(np.int16)
    else:
        pcm = audio

    # --- Write WAV ---
    buf = io.BytesIO()
    subtype = "PCM_16" if BIT_DEPTH == "16" else "FLOAT"
    sf.write(buf, pcm, samplerate=16000, format="WAV", subtype=subtype)
    buf.seek(0)
    return StreamingResponse(buf, media_type="audio/wav")
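

# Usage sketch: one way to run the service and exercise the /speak endpoint.
# Assumptions not in the original script: the file is saved as `main.py`, and
# `uvicorn` is installed; both names are illustrative.
#
#   uvicorn main:app --host 0.0.0.0 --port 8000
#
#   # Fetch speech for a phrase with the Scottish male voice (speaker 6):
#   curl -G "http://localhost:8000/speak" \
#        --data-urlencode "text=Hello from SpeechT5" \
#        --data-urlencode "speaker=6" \
#        -o hello.wav

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)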