from typing import Generator, Iterator
import numpy as np
import torch
import spaces
from kokoro import KPipeline, KModel
from stream2sentence import generate_sentences
from mcp_host.tts.utils import KOKORO_TO_STD_LANG, VOICES
__all__ = ["stream_text_to_speech"]
device = 0 if torch.cuda.is_available() else "cpu"
model = KModel().to(device).eval()
# Create a pipeline for each language. Kokoro language codes:
# 🇺🇸 'a' => American English, 🇬🇧 'b' => British English
# 🇪🇸 'e' => Spanish es
# 🇫🇷 'f' => French fr-fr
# 🇮🇳 'h' => Hindi hi
# 🇮🇹 'i' => Italian it
# 🇯🇵 'j' => Japanese: pip install misaki[ja]
# 🇧🇷 'p' => Brazilian Portuguese pt-br
# 🇨🇳 'z' => Mandarin Chinese: pip install misaki[zh]
pipes = {
    lang_code: KPipeline(lang_code=lang_code, model=model, device=device)
    for lang_code in "abzefhip"
    # for lang_code in "abjzefhip"
}
# Preload voices into pipelines
for voice_code in VOICES.values():
    # First letter of the voice code is the language code (Kokoro format)
    lang_code = voice_code[0]
    if lang_code in pipes:
        pipes[lang_code].load_voice(voice_code)

def stream_text_to_speech(
    text_stream: Iterator[str], voice: str | None = None
) -> Generator[tuple[int, np.ndarray], None, None]:
    """
    Convert a stream of text to speech using the specified voice.

    Args:
        text_stream (Iterator[str]): An iterator of text chunks to convert to speech.
        voice (str | None): The voice to use for the conversion. Defaults to "af_heart".

    Yields:
        tuple[int, np.ndarray]: The sample rate (24000 Hz) and an audio chunk as a NumPy array.
    """
    voice = voice or "af_heart"
    if voice not in VOICES.values():
        raise ValueError(f"Voice '{voice}' is not available.")
    kokoro_lang = voice[0]
    standard_lang_code = KOKORO_TO_STD_LANG[kokoro_lang]
    for text in generate_sentences(
        text_stream, language=standard_lang_code, full_sentence_delimiters=".?!:\n…。"
    ):
        text = text.strip()
        print(f"Streaming audio for text: {text}")
        for audio in text_to_speech(text, pipe_key=kokoro_lang, voice=voice):
            yield 24000, audio

@spaces.GPU(duration=10)
def text_to_speech(
    text: str,
    pipe_key: str,
    voice: str | None = None,
):
    """Synthesize a single piece of text and yield audio chunks as NumPy arrays."""
    # The pipeline yields (graphemes, phonemes, audio) results; only the audio tensor is needed.
    for _, __, audio in pipes[pipe_key](text, voice=voice):
        yield audio.numpy()
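
# Illustrative usage sketch (an assumption for demonstration, not part of the module's
# public API): feed a small in-memory text stream through stream_text_to_speech and write
# the concatenated chunks to a WAV file. Assumes the `soundfile` package is installed;
# any writer that accepts 24 kHz mono audio would work equally well.
if __name__ == "__main__":
    import soundfile as sf

    def demo_stream() -> Iterator[str]:
        yield "Hello there. "
        yield "This sentence arrives in a later chunk."

    chunks = [audio for _, audio in stream_text_to_speech(demo_stream(), voice="af_heart")]
    sf.write("demo.wav", np.concatenate(chunks), 24000)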