from typing import Generator, Iterator

import numpy as np
import torch
import spaces
from kokoro import KPipeline, KModel
from stream2sentence import generate_sentences

from mcp_host.tts.utils import KOKORO_TO_STD_LANG, VOICES

__all__ = ["stream_text_to_speech"]


# Use CUDA device 0 when available, otherwise run on CPU
device = 0 if torch.cuda.is_available() else "cpu"
model = KModel().to(device).eval()

# Create a pipeline for each language. Kokoro language codes:
# 🇺🇸 'a' => American English, 🇬🇧 'b' => British English
# 🇪🇸 'e' => Spanish es
# 🇫🇷 'f' => French fr-fr
# 🇮🇳 'h' => Hindi hi
# 🇮🇹 'i' => Italian it
# 🇯🇵 'j' => Japanese: pip install misaki[ja]
# 🇧🇷 'p' => Brazilian Portuguese pt-br
# 🇨🇳 'z' => Mandarin Chinese: pip install misaki[zh]
pipes = {
    lang_code: KPipeline(lang_code=lang_code, model=model, device=device)
    for lang_code in "abzefhip"
    # for lang_code in "abjzefhip"
}

# Preload voices into pipelines
for voice_code in VOICES.values():
    # The first letter of a voice code is its Kokoro language code
    lang_code = voice_code[0]
    if lang_code in pipes:
        pipes[lang_code].load_voice(voice_code)


def stream_text_to_speech(
    text_stream: Iterator[str], voice: str | None = None
) -> Generator[tuple[int, np.ndarray], None, None]:
    """
    Convert text to speech using the specified voice.

    Args:
        text (str): The text to convert to speech.
        voice (str): The voice to use for the conversion. Default to af_heart

    Returns:
        np.ndarray: The audio as a NumPy array.
    """
    voice = voice or "af_heart"
    if voice not in VOICES.values():
        raise ValueError(f"Voice '{voice}' is not available.")

    kokoro_lang = voice[0]
    standard_lang_code = KOKORO_TO_STD_LANG[kokoro_lang]

    for text in generate_sentences(
        text_stream, language=standard_lang_code, full_sentence_delimiters=".?!:\n…。"
    ):
        text = text.strip()
        print(f"Streaming audio for text: {text}")
        for audio in text_to_speech(text, pipe_key=kokoro_lang, voice=voice):
            yield 24000, audio


@spaces.GPU(duration=10)
def text_to_speech(
    text: str,
    pipe_key: str,
    voice: str | None = None,
) -> Generator[np.ndarray, None, None]:
    """Synthesize a single text segment, yielding audio chunks as NumPy arrays."""
    # KPipeline yields (graphemes, phonemes, audio) results; only the audio is needed here.
    for _, _, audio in pipes[pipe_key](text, voice=voice):
        yield audio.numpy()