import os
from typing import Generator, Iterator

import numpy as np
from fastrtc import KokoroTTSOptions, get_tts_model
from stream2sentence import generate_sentences

from mcp_host.tts.utils import KOKORO_TO_STD_LANG, VOICES

__all__ = ["stream_text_to_speech"]

# The Kokoro model is only loaded when the LOCALE_RUN environment variable is
# unset or empty. Otherwise ``model`` is bound to None so that the name always
# exists and callers get a descriptive error instead of a NameError.
if not os.getenv("LOCALE_RUN"):
    model = get_tts_model(model="kokoro")
else:
    model = None


def stream_text_to_speech(
    text_stream: Iterator[str], voice: str | None = None
) -> Generator[tuple[int, np.ndarray], None, None]:
    """
    Convert a stream of text to speech using the specified voice.

    The incoming text is regrouped into sentences with ``generate_sentences``
    and each sentence is synthesized with ``model.stream_tts_sync``.

    This is a generator function: nothing (including the voice validation)
    runs until the returned generator is first iterated.

    Args:
        text_stream (Iterator[str]): An iterator that yields text strings to
            convert to speech.
        voice (str | None): The voice to use for the conversion. Falls back to
            "af_heart" when None or empty.

    Yields:
        tuple[int, np.ndarray]: Audio chunks exactly as produced by
            ``model.stream_tts_sync`` (presumably ``(sample_rate, samples)``
            -- confirm against the fastrtc documentation).

    Raises:
        ValueError: If ``voice`` is not one of the values of ``VOICES``.
        RuntimeError: If the TTS model was not loaded because the LOCALE_RUN
            environment variable is set.
    """
    voice = voice or "af_heart"
    if voice not in VOICES.values():
        raise ValueError(f"Voice '{voice}' is not available.")

    # Checked after the voice validation so an invalid voice still reports
    # ValueError first, as it did before this guard existed.
    if model is None:
        raise RuntimeError(
            "TTS model is not loaded because the LOCALE_RUN environment "
            "variable is set."
        )

    # The first character of the voice name is the key used to look up the
    # language code; unknown keys fall back to "en".
    kokoro_lang = voice[0]
    standard_lang_code = KOKORO_TO_STD_LANG.get(kokoro_lang, "en")
    options = KokoroTTSOptions(voice=voice, lang=standard_lang_code)

    for text in generate_sentences(text_stream, language=standard_lang_code):
        yield from model.stream_tts_sync(text, options)