Spaces:
Sleeping
Sleeping
File size: 2,440 Bytes
6114331 55ee0c6 aead49d 6114331 c567be8 55d983c 6114331 55d983c 245735d 55d983c 55ee0c6 6114331 55ee0c6 6114331 55ee0c6 6114331 e37d2df 55ee0c6 e37d2df c567be8 e37d2df |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
from fastapi import FastAPI, Query
from fastapi.responses import StreamingResponse
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import torch
import io
import soundfile as sf
import requests
import numpy as np
# Checkpoint identifiers: the processor and the acoustic model share one
# checkpoint, while the vocoder has its own.
_TTS_CHECKPOINT = "microsoft/speecht5_tts"
_VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"

app = FastAPI(title="SpeechT5 TTS API")

# Everything below is loaded once at import time so that each request reuses
# the same processor, model and vocoder instances instead of reloading them.
processor = SpeechT5Processor.from_pretrained(_TTS_CHECKPOINT)
model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT)
vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_CHECKPOINT)
def load_speaker_embedding(url: str, timeout: float = 30.0) -> torch.Tensor:
    """Download a raw float32 speaker embedding and return it with a batch axis.

    Args:
        url: Location of a ``.bin`` file whose bytes are read as a flat
            sequence of float32 values (in the machine's native byte order).
        timeout: Seconds ``requests`` may wait on the server before giving
            up, so a stalled connection cannot hang the caller indefinitely.

    Returns:
        A float32 tensor of shape ``(1, N)``, where ``N`` is the number of
        float32 values contained in the downloaded payload.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
        ValueError: If the payload is empty or its size is not a whole
            number of float32 values.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # torch.frombuffer shares memory with the object it is given. ``bytes`` is
    # immutable, which makes PyTorch warn and leaves writes undefined, so the
    # payload is copied into a writable bytearray first.
    payload = bytearray(response.content)
    if not payload or len(payload) % 4 != 0:
        raise ValueError(
            f"Speaker embedding at {url!r} has {len(payload)} bytes; "
            "expected a non-empty multiple of 4 (float32)."
        )

    embedding = torch.frombuffer(payload, dtype=torch.float32)
    return embedding.unsqueeze(0)  # Add batch dimension
# Voice used for every request: the CMU ARCTIC "slt" x-vector (US female 1),
# fetched once when the module is imported.
_SPEAKER_EMBEDDING_URL = (
    "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted"
    "/resolve/main/cmu_us_slt_arctic-wav-arctic_a0001.bin"
)
speaker_embeddings = load_speaker_embedding(_SPEAKER_EMBEDDING_URL)
def smooth_audio(audio: np.ndarray, window_size: int = 3) -> np.ndarray:
    """Smooth a 1-D signal with a moving average of ``window_size`` samples.

    The averaged signal is edge-padded back to the input length, so the
    first and last few samples repeat the nearest fully averaged value.

    Args:
        audio: Samples to smooth. Multi-dimensional input is flattened.
        window_size: Number of consecutive samples averaged together.

    Returns:
        The smoothed samples, same length as the flattened input. Floating
        point inputs keep their dtype; other dtypes yield float64. The input
        is returned unchanged when ``window_size < 2`` or when it holds fewer
        samples than one window (there is nothing to average in that case).
    """
    if window_size < 2:
        return audio

    # A leading zero lets every window sum be expressed as the difference of
    # two prefix sums.
    padded = np.insert(audio, 0, 0)
    if padded.size <= window_size:
        # Fewer samples than one window: no complete average exists, and
        # edge-padding an empty result would raise inside np.pad.
        return audio

    # Accumulate in float64. A float32 running sum loses precision as it
    # grows, which shows up as noise once neighbouring sums are subtracted.
    cumsum = np.cumsum(padded, dtype=np.float64)
    smoothed = (cumsum[window_size:] - cumsum[:-window_size]) / window_size

    # pad to original length
    pad_left = window_size // 2
    pad_right = window_size - 1 - pad_left
    smoothed = np.pad(smoothed, (pad_left, pad_right), mode="edge")

    # Hand back the dtype the caller would have received before the
    # accumulator was widened.
    out_dtype = padded.dtype if padded.dtype.kind == "f" else np.float64
    return smoothed.astype(out_dtype, copy=False)
@app.get("/speak")
def speak(text: str = Query(..., description="Text to convert to speech")):
    """
    Convert text to speech using SpeechT5 + HiFi-GAN.
    Returns a WAV audio stream.
    """
    # Tokenize the request text and synthesize a waveform with the shared
    # speaker embedding and vocoder.
    token_ids = processor(text=text, return_tensors="pt")["input_ids"]
    waveform = model.generate_speech(token_ids, speaker_embeddings, vocoder=vocoder)
    samples = waveform.numpy().astype(np.float32)

    # Scale so the loudest sample sits at 0.1; an all-zero signal is left
    # alone to avoid dividing by zero.
    loudest = np.abs(samples).max()
    if loudest > 0:
        samples = (samples / loudest) * 0.1  # Adjustable normalization level

    # Light 3-sample moving average.
    samples = smooth_audio(samples, window_size=3)

    # Encode as a 32-bit float WAV at 16 kHz in memory, then rewind so the
    # response streams from the first byte.
    wav_stream = io.BytesIO()
    sf.write(wav_stream, samples, samplerate=16000, format="WAV", subtype="FLOAT")
    wav_stream.seek(0)
    return StreamingResponse(wav_stream, media_type="audio/wav")
|