# vibe-shopping: mcp_host/tts/hf_zero_gpu_tts.py

from typing import Generator, Iterator

import numpy as np
import torch
import spaces
from kokoro import KPipeline, KModel
from stream2sentence import generate_sentences

from mcp_host.tts.utils import KOKORO_TO_STD_LANG, VOICES

__all__ = ["stream_text_to_speech"]

device = 0 if torch.cuda.is_available() else "cpu"
model = KModel().to(device).eval()

# Create a pipeline for each language. Kokoro language codes:
# 🇺🇸 'a' => American English, 🇬🇧 'b' => British English
# 🇪🇸 'e' => Spanish es
# 🇫🇷 'f' => French fr-fr
# 🇮🇳 'h' => Hindi hi
# 🇮🇹 'i' => Italian it
# 🇯🇵 'j' => Japanese: pip install misaki[ja]
# 🇧🇷 'p' => Brazilian Portuguese pt-br
# 🇨🇳 'z' => Mandarin Chinese: pip install misaki[zh]
pipes = {
    lang_code: KPipeline(lang_code=lang_code, model=model, device=device)
    for lang_code in "abzefhip"
    # for lang_code in "abjzefhip"  # add 'j' (Japanese) once misaki[ja] is installed
}
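
# For reference, each pipeline can also be driven directly; it yields
# (graphemes, phonemes, audio) tuples, e.g. (illustrative only, not used as-is here):
#
#     for graphemes, phonemes, audio in pipes["a"]("Hello world", voice="af_heart"):
#         ...  # `audio` is a 24 kHz float torch tensor (see text_to_speech below)
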
# Preload voices into pipelines
for voice_code in VOICES.values():
# First letter of the voice code is the language code (kokoro format)
lang_code = voice_code[0]
if lang_code in pipes:
pipes[lang_code].load_voice(voice_code)
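# e.g. the default voice "af_heart" starts with 'a', so it is preloaded into the
# American English pipeline and the first request does not pay the loading cost.

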
def stream_text_to_speech(
text_stream: Iterator[str], voice: str | None = None
) -> Generator[tuple[int, np.ndarray], None, None]:
"""
Convert text to speech using the specified voice.
Args:
text (str): The text to convert to speech.
voice (str): The voice to use for the conversion. Default to af_heart
Returns:
np.ndarray: The audio as a NumPy array.
"""
voice = voice or "af_heart"
if voice not in VOICES.values():
raise ValueError(f"Voice '{voice}' is not available.")
kokoro_lang = voice[0]
standard_lang_code = KOKORO_TO_STD_LANG[kokoro_lang]
for text in generate_sentences(
text_stream, language=standard_lang_code, full_sentence_delimiters=".?!:\n…。"
):
text = text.strip()
print(f"Streaming audio for text: {text}")
for audio in text_to_speech(text, pipe_key=kokoro_lang, voice=voice):
yield 24000, audio


@spaces.GPU(duration=10)
def text_to_speech(
    text: str,
    pipe_key: str,
    voice: str | None = None,
) -> Generator[np.ndarray, None, None]:
    """Synthesize `text` with the given pipeline and voice on a ZeroGPU-allocated GPU."""
    for _graphemes, _phonemes, audio in pipes[pipe_key](text, voice=voice):
        yield audio.numpy()
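

if __name__ == "__main__":
    # Minimal local smoke test (a sketch, not part of the Space runtime): feed a
    # tiny text stream through stream_text_to_speech and write the result to a WAV
    # file with the standard-library `wave` module. Assumes the yielded chunks are
    # mono float PCM in [-1, 1] at 24 kHz, as produced above.
    import wave

    def _demo_stream():
        yield "Hello there. "
        yield "This is a quick Kokoro streaming test."

    chunks = [audio for _, audio in stream_text_to_speech(_demo_stream())]
    pcm16 = (np.clip(np.concatenate(chunks), -1.0, 1.0) * 32767).astype(np.int16)
    with wave.open("tts_demo.wav", "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(24000)
        wav_file.writeframes(pcm16.tobytes())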