# Source: Hugging Face Space (page status when captured: Sleeping)
| from fastapi import FastAPI, Query | |
| from fastapi.responses import StreamingResponse | |
| from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan | |
| import torch | |
| import io | |
| import soundfile as sf | |
| import requests | |
# FastAPI application object for this service.
app = FastAPI(title="SpeechT5 TTS API")
# Load models once at startup. These are module-level statements, so they run
# when the module is imported rather than on every request.
# `processor` turns request text into model inputs, `model` generates the
# speech, and `vocoder` is handed to `model.generate_speech` (see `speak`).
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
def load_speaker_embedding(url: str, timeout: float = 30.0) -> torch.Tensor:
    """Download a raw float32 speaker embedding and return it with a batch axis.

    The response body is interpreted as a headerless, flat sequence of
    float32 values (``torch.frombuffer`` reads them in the machine's native
    byte order).

    Args:
        url: Location of the raw ``.bin`` embedding file.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests`` can block indefinitely, which would hang
            application startup.

    Returns:
        A float32 tensor of shape ``(1, N)``, where ``N`` is the payload size
        in bytes divided by 4.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status.
        ValueError: If the payload is empty or is not a whole number of
            float32 values.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    payload = response.content
    float32_size = 4  # bytes per float32 element
    if not payload or len(payload) % float32_size:
        raise ValueError(
            f"Speaker embedding at {url} has invalid size {len(payload)} bytes; "
            f"expected a non-empty multiple of {float32_size}"
        )

    # torch.frombuffer shares memory with its source and warns when that
    # source is read-only (as `bytes` is). Copy into a bytearray so the tensor
    # is backed by writable memory.
    embedding = torch.frombuffer(bytearray(payload), dtype=torch.float32)
    return embedding.unsqueeze(0)  # Add batch dimension
# Voice used by `speak`: "US female 1", taken from the extracted CMU ARCTIC
# x-vector dataset. Fetched a single time, when this module is imported.
speaker_embeddings = load_speaker_embedding(
    url="https://huggingface.co/datasets/Xenova/cmu-arctic-xvectors-extracted/resolve/main/cmu_us_slt_arctic-wav-arctic_a0001.bin",
)
# NOTE(review): the handler was not registered on `app`, so the service
# exposed no routes. The "/speak" path is taken from the function name;
# adjust it if existing clients expect a different URL.
@app.get("/speak")
def speak(text: str = Query(..., description="Text to convert to speech")):
    """Convert text to speech using SpeechT5 + HiFi-GAN.

    Args:
        text: Required query parameter holding the text to synthesize.

    Returns:
        A ``StreamingResponse`` with media type ``audio/wav`` whose body is
        the generated audio encoded as a WAV file.
    """
    # Prepare input: the processor output is indexed for "input_ids" below.
    inputs = processor(text=text, return_tensors="pt")

    # Generate speech with the module-level speaker embedding; the vocoder
    # is passed through to `generate_speech`.
    speech = model.generate_speech(
        inputs["input_ids"],
        speaker_embeddings,
        vocoder=vocoder,
    )

    # Encode into an in-memory WAV file; the header declares a 16 kHz rate.
    buf = io.BytesIO()
    sf.write(buf, speech.numpy(), samplerate=16000, format="WAV")
    buf.seek(0)  # rewind so the response streams from the first byte
    return StreamingResponse(buf, media_type="audio/wav")