from typing import Generator, Iterator

import numpy as np
import spaces
import torch
from kokoro import KModel, KPipeline
from stream2sentence import generate_sentences

from mcp_host.tts.utils import KOKORO_TO_STD_LANG, VOICES

__all__ = ["stream_text_to_speech"]

device = "cuda" if torch.cuda.is_available() else "cpu"
model = KModel().to(device).eval()

# Create a pipeline for each language. Kokoro language codes:
# šŸ‡ŗšŸ‡ø 'a' => American English, šŸ‡¬šŸ‡§ 'b' => British English
# šŸ‡ŖšŸ‡ø 'e' => Spanish es
# šŸ‡«šŸ‡· 'f' => French fr-fr
# šŸ‡®šŸ‡³ 'h' => Hindi hi
# šŸ‡®šŸ‡¹ 'i' => Italian it
# šŸ‡ÆšŸ‡µ 'j' => Japanese: pip install misaki[ja]
# šŸ‡§šŸ‡· 'p' => Brazilian Portuguese pt-br
# šŸ‡ØšŸ‡³ 'z' => Mandarin Chinese: pip install misaki[zh]
pipes = {
    lang_code: KPipeline(lang_code=lang_code, model=model, device=device)
    for lang_code in "abzefhip"
    # for lang_code in "abjzefhip"
}

# Preload voices into pipelines
for voice_code in VOICES.values():
    # First letter of the voice code is the language code (Kokoro format)
    lang_code = voice_code[0]
    if lang_code in pipes:
        pipes[lang_code].load_voice(voice_code)


def stream_text_to_speech(
    text_stream: Iterator[str], voice: str | None = None
) -> Generator[tuple[int, np.ndarray], None, None]:
    """
    Stream synthesized speech for an incoming text stream using the specified voice.

    Args:
        text_stream (Iterator[str]): Incoming text chunks to convert to speech.
        voice (str | None): The voice to use for the conversion. Defaults to "af_heart".

    Yields:
        tuple[int, np.ndarray]: The sample rate (24000 Hz) and an audio chunk as a NumPy array.
    """
    voice = voice or "af_heart"
    if voice not in VOICES.values():
        raise ValueError(f"Voice '{voice}' is not available.")

    kokoro_lang = voice[0]
    standard_lang_code = KOKORO_TO_STD_LANG[kokoro_lang]

    # Split the incoming stream into full sentences, then synthesize each one.
    for text in generate_sentences(
        text_stream, language=standard_lang_code, full_sentence_delimiters=".?!:\n…。"
    ):
        text = text.strip()
        print(f"Streaming audio for text: {text}")
        for audio in text_to_speech(text, pipe_key=kokoro_lang, voice=voice):
            yield 24000, audio


@spaces.GPU(duration=10)
def text_to_speech(
    text: str,
    pipe_key: str,
    voice: str | None = None,
) -> Generator[np.ndarray, None, None]:
    # Run the Kokoro pipeline for one sentence and yield its audio chunks as NumPy arrays.
    for _, __, audio in pipes[pipe_key](text, voice=voice):
        yield audio.numpy()
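

if __name__ == "__main__":
    # Minimal local sketch, illustrative only and not part of the module API:
    # feed a hard-coded sentence through the default voice and write the
    # concatenated audio to disk. The soundfile dependency and the output
    # filename are assumptions made solely for this demo.
    import soundfile as sf

    demo_stream = iter(["Hello from Kokoro. This sentence is synthesized as it streams."])
    chunks = [audio for _, audio in stream_text_to_speech(demo_stream)]
    if chunks:
        sf.write("kokoro_demo.wav", np.concatenate(chunks), 24000)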