Spaces:
Sleeping
Sleeping
| from fastapi import FastAPI, Query | |
| from fastapi.responses import StreamingResponse | |
| from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan | |
| import torch | |
| import io | |
| import soundfile as sf | |
| import requests | |
| import numpy as np | |
# Hugging Face Hub checkpoint identifiers used below.
_TTS_CHECKPOINT = "microsoft/speecht5_tts"
_VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"

# The application object and the three SpeechT5 components are created once,
# at import time, and then shared by all requests.
app = FastAPI(title="SpeechT5 TTS API")
processor = SpeechT5Processor.from_pretrained(_TTS_CHECKPOINT)
model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT)
vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_CHECKPOINT)
def load_speaker_embedding(url: str, timeout: float = 30.0) -> torch.Tensor:
    """Download a raw float32 speaker embedding and return it as a tensor.

    The response body is interpreted as a flat, header-less sequence of
    32-bit floats.

    Args:
        url: Location of the ``.bin`` file to download.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the caller forever.

    Returns:
        A tensor of shape ``(1, N)``, where ``N`` is the number of floats in
        the downloaded payload.

    Raises:
        requests.RequestException: On a network failure, a timeout, or a
            non-success HTTP status (raised by ``raise_for_status``).
        ValueError: If the payload is empty or its size is not a whole number
            of float32 values.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    payload = response.content
    if not payload or len(payload) % 4 != 0:
        raise ValueError(
            f"Speaker embedding at {url!r} has {len(payload)} bytes; "
            "expected a non-empty multiple of 4 (float32)."
        )

    # torch.frombuffer shares memory with its source object. ``bytes`` is
    # read-only, so copy into a bytearray to give the tensor a writable
    # backing buffer of its own.
    embedding = torch.frombuffer(bytearray(payload), dtype=torch.float32)
    return embedding.unsqueeze(0)  # Add batch dimension
# Voice used for synthesis ("US female 1"); downloaded once at import time.
_SPEAKER_EMBEDDING_URL = "https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_slt_arctic-wav-arctic_a0001.bin"
speaker_embeddings = load_speaker_embedding(_SPEAKER_EMBEDDING_URL)
def smooth_audio(audio: np.ndarray, window_size: int = 3) -> np.ndarray:
    """Smooth a signal with a simple moving average.

    The average is computed over every full window, and the result is then
    padded back to the input length by repeating its first and last values.

    Args:
        audio: Samples to smooth (treated as a flat sequence).
        window_size: Number of samples averaged per output value.

    Returns:
        The smoothed signal, the same length as the input. Floating-point
        inputs keep their dtype. The input is returned unchanged when
        ``window_size`` is below 2 or when there are fewer samples than
        ``window_size`` (no full window exists).
    """
    samples = np.asarray(audio)
    if window_size < 2 or samples.size < window_size:
        return audio

    # Accumulate in float64: a running sum kept in float32 loses precision
    # as it grows, which shows up as noise in the window differences.
    running = np.concatenate(([0.0], np.cumsum(samples, dtype=np.float64)))
    smoothed = (running[window_size:] - running[:-window_size]) / window_size

    # Pad to original length.
    pad_left = window_size // 2
    pad_right = window_size - 1 - pad_left
    smoothed = np.pad(smoothed, (pad_left, pad_right), mode="edge")

    if np.issubdtype(samples.dtype, np.floating):
        smoothed = smoothed.astype(samples.dtype, copy=False)
    return smoothed
# NOTE(review): no route decorator is visible on this handler in the file as
# given, so the app would expose no endpoint. GET /speak is inferred from the
# function name and its Query parameter -- confirm the intended path/method.
@app.get("/speak")
def speak(text: str = Query(..., description="Text to convert to speech")):
    """Convert text to speech using SpeechT5 + HiFi-GAN.

    Args:
        text: Text to synthesize, supplied as a query parameter.

    Returns:
        A ``StreamingResponse`` with media type ``audio/wav`` carrying a
        32-bit float WAV written at a 16000 Hz sample rate.
    """
    # Prepare input
    inputs = processor(text=text, return_tensors="pt")

    # Generate speech
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
    )
    audio = speech.detach().cpu().numpy().astype(np.float32)

    # --- Normalize ---
    # Skipped for empty output (np.max would raise) and for all-zero output
    # (division by zero).
    if audio.size:
        peak = np.max(np.abs(audio))
        if peak > 0:
            audio = (audio / peak) * 0.1  # Adjustable normalization level

    # --- Smooth ---
    # Only smooth when at least one full window of samples exists.
    smoothing_window = 3
    if audio.size >= smoothing_window:
        audio = smooth_audio(audio, window_size=smoothing_window)

    # --- Write WAV as 32-bit float ---
    buf = io.BytesIO()
    sf.write(buf, audio, samplerate=16000, format="WAV", subtype="FLOAT")

    # Send the encoded file as a single chunk: iterating a BytesIO yields
    # newline-delimited pieces, which would split the binary payload into
    # many small fragments.
    payload = buf.getvalue()
    return StreamingResponse(iter((payload,)), media_type="audio/wav")