Spaces:

voicing-ai
/

ParlerVoice

Running on Zero

File size: 8,573 Bytes

import argparse
import glob
import os
import uuid
from typing import Optional, Tuple

import gradio as gr
import spaces
import torch

from parlervoice_infer.engine import ParlerVoiceInference
from parlervoice_infer.config import GenerationConfig
from parlervoice_infer.presets import PRESETS
from parlervoice_infer.constants import (
    GENDER_MAP,
    PITCH_BINS as pitch_mean_bins,
    RATE_BINS as speaker_rate_bins,
    MONOTONY_BINS as speech_monotony_bins,
    NOISE_BINS as noise_bins,
    REVERB_BINS as reverberation_bins,
)
from parlervoice_infer.description import build_advanced_description


# --- Global inference engine ---
# Process-wide inference engine; stays None until _ensure_infer() creates it on first use.
_INFER: Optional[ParlerVoiceInference] = None
# Checkpoint identifier handed to ParlerVoiceInference as `checkpoint_path`.
CHECKPOINT = "voicing-ai/ParlerVoice"
# Base model identifier handed to ParlerVoiceInference as `base_model_path`.
BASE_MODEL = "parler-tts/parler-tts-mini-v1.1"


# --- Load model (singleton) ---
def _ensure_infer(checkpoint: str, base_model: str) -> ParlerVoiceInference:
    """Return the shared inference engine, constructing it on the first call.

    Args:
        checkpoint: Passed to ``ParlerVoiceInference`` as ``checkpoint_path``.
        base_model: Passed to ``ParlerVoiceInference`` as ``base_model_path``.

    Returns:
        The module-level engine stored in ``_INFER``.

    Note:
        Both arguments are only used when the engine does not exist yet; once
        ``_INFER`` is set, later calls return it regardless of the arguments.
    """
    global _INFER
    # Fast path: the engine was already built by an earlier call.
    if _INFER is not None:
        return _INFER

    print("[INFO] Loading model...")
    _INFER = ParlerVoiceInference(
        checkpoint_path=checkpoint,
        base_model_path=base_model,
    )
    return _INFER


# --- Cleanup old outputs ---
def cleanup_outputs(max_files: int = 20, output_dir: str = "outputs") -> None:
    """Keep only the newest `max_files` WAV files in `output_dir`.

    The directory is created if it does not exist. Files are ranked by
    modification time and the oldest ones beyond `max_files` are deleted.
    Only ``*.wav`` files directly inside `output_dir` are considered.

    Args:
        max_files: Number of most recently modified WAV files to keep.
        output_dir: Directory to prune. Defaults to ``"outputs"``.

    Cleanup is best-effort: a file that cannot be inspected or removed never
    raises out of this function.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Escape the directory so glob metacharacters in its name are taken literally.
    pattern = os.path.join(glob.escape(output_dir), "*.wav")

    dated = []
    for path in glob.glob(pattern):
        try:
            dated.append((os.path.getmtime(path), path))
        except OSError:
            # The file vanished (or became unreadable) between glob() and
            # stat(); there is nothing left to prune for it.
            continue

    # Sort on mtime only so files with equal timestamps keep glob order.
    dated.sort(key=lambda item: item[0])

    excess = len(dated) - max_files
    if excess <= 0:
        return

    for _, path in dated[:excess]:
        try:
            os.remove(path)
        except FileNotFoundError:
            # Already removed by someone else, which is the desired end state.
            pass
        except OSError as exc:
            print(f"[WARN] Could not remove old output {path}: {exc}")


# --- Audio generation ---
@spaces.GPU(duration=120)
def generate_audio(
    prompt: str,
    speaker: str,
    tone: str,
    emotion: str,
    pitch: str,
    pace: str,
    monotony: str,
    noise: str,
    reverberation: str,
) -> Tuple[Optional[str], str]:
    """Synthesize `prompt` with the selected voice settings and save it as a WAV.

    The voice settings are turned into a text description with
    ``build_advanced_description`` and passed, together with the prompt, to the
    shared inference engine.

    Args:
        prompt: Text to be spoken.
        speaker: Speaker name.
        tone: Tone label.
        emotion: Emotion label.
        pitch: Pitch label.
        pace: Speaking-rate label.
        monotony: Speech-style label.
        noise: Noise-level label.
        reverberation: Reverberation label.

    Returns:
        A ``(path, status)`` pair. On success ``path`` is the WAV file path and
        ``status`` is ``"Success"``. On failure ``path`` is ``None`` (so the
        audio component is left empty) and ``status`` is ``"Error: ..."``.
        This function never raises; every failure is reported through
        ``status``.
    """
    # Reject blank input up front instead of sending it to the model.
    if not prompt or not prompt.strip():
        return None, "Error: Please enter some text to synthesize."

    try:
        infer = _ensure_infer(CHECKPOINT, BASE_MODEL)
        description = build_advanced_description(
            speaker=speaker,
            pace=pace,
            noise=noise,
            reverberation=reverberation,
            monotony=monotony,
            pitch=pitch,
            emotion=emotion,
            tone=tone,
            add_context=True,
        )
        cfg = GenerationConfig()

        os.makedirs("outputs", exist_ok=True)
        # Prune before writing so the file about to be created is never a candidate.
        cleanup_outputs(max_files=20)

        # One file per request: a name derived only from the PID would be
        # overwritten by every later request handled by the same process.
        out_path = os.path.join("outputs", f"parler_out_{uuid.uuid4().hex}.wav")

        print(f"[INFO] Generating audio to {out_path} ...")
        audio_array, saved = infer.generate_audio(
            prompt=prompt,
            description=description,
            config=cfg,
            output_path=out_path,
        )

        # Fallback: the engine returned samples but did not leave a file on disk.
        if not saved or not os.path.isfile(saved):
            import soundfile as sf
            if audio_array is None or len(audio_array) == 0:
                raise ValueError("generate_audio() did not return valid audio data.")
            # NOTE(review): 22050 is only used if the engine has no
            # `sampling_rate` attribute; confirm it matches the model's rate.
            sf.write(out_path, audio_array, getattr(infer, "sampling_rate", 22050))
            saved = out_path

        return saved, "Success"

    except Exception as e:
        # UI boundary: log the full traceback and surface a short message to
        # the status box rather than letting the exception propagate.
        import traceback
        print(traceback.format_exc())
        return None, f"Error: {e}"


# --- Gradio demo ---
def build_demo() -> gr.Blocks:
    """Build the Gradio Blocks UI for ParlerVoice.

    The layout contains a text prompt, a speaker selector, a preset selector,
    a group of voice-attribute dropdowns, a block of sample descriptions, and a
    button that calls ``generate_audio`` and shows the resulting audio file
    and status message.

    Returns:
        The assembled ``gr.Blocks`` app; it is not launched here.
    """
    speaker_names = sorted(GENDER_MAP.keys())
    preset_names = ["Custom"] + list(PRESETS.keys())
    # Preset keys a preset may drive, in the same order as the `outputs`
    # list wired to `preset_dropdown.change` below.
    preset_fields = ("tone", "emotion", "pitch", "pace", "monotony")

    with gr.Blocks() as demo:
        gr.Markdown("# ParlerVoice")

        prompt_input = gr.Textbox(label="Enter Text", placeholder="Type what the speaker says...")
        speaker_dropdown = gr.Dropdown(
            label="Select Speaker",
            choices=speaker_names,
            # An empty speaker table yields an empty dropdown instead of an IndexError.
            value=speaker_names[0] if speaker_names else None,
        )

        preset_dropdown = gr.Dropdown(
            label="Voice Preset",
            choices=preset_names,
            value="Custom",
            interactive=True,
        )

        with gr.Group():
            tone = gr.Dropdown(
                label="Tone",
                choices=[
                    "serious",
                    "dramatic",
                    "casual",
                    "professional",
                    "storytelling",
                    "narrative",
                    "emotional",
                    "energetic",
                    "loving"
                ],
                value="serious",
            )

            emotion = gr.Dropdown(
                label="Emotion",
                choices=[
                    "neutral",
                    "sad",
                    "happy",
                    "angry",
                    "excited",
                    "confused",
                    "loving",
                    "casual"
                ],
                value="neutral",
            )

            pitch = gr.Dropdown(label="Pitch", choices=pitch_mean_bins, value="moderate pitch")
            pace = gr.Dropdown(label="Pace", choices=speaker_rate_bins, value="moderate speed")
            monotony = gr.Dropdown(label="Speech Style", choices=speech_monotony_bins, value="expressive and animated")
            noise = gr.Dropdown(label="Noise", choices=noise_bins, value="very clear")
            reverberation = gr.Dropdown(label="Reverberation", choices=reverberation_bins, value="very close-sounding")

        gr.Markdown(
            """
**Sample Descriptions:**  
- Connor delivers a serious and professional message with a calm, even pace and a moderate pitch.  
- Madison delivers a sad and disappointed speech. Her voice is slightly high-pitched and sounds emotional.  
- Jackson delivers a narrative with a slightly dramatic tone and clean recording.
"""
        )

        def apply_preset(preset_name: str):
            """Return one update per preset-driven dropdown for `preset_name`.

            "Custom" or an unknown preset leaves every dropdown untouched. For
            a known preset, a dropdown is only changed when the preset gives a
            value for it; a missing (or None) entry leaves the current
            selection in place instead of clearing the dropdown.
            """
            if preset_name == "Custom" or preset_name not in PRESETS:
                return tuple(gr.update() for _ in preset_fields)
            preset = PRESETS[preset_name]
            return tuple(
                gr.update() if preset.get(field) is None else gr.update(value=preset[field])
                for field in preset_fields
            )

        preset_dropdown.change(
            fn=apply_preset,
            inputs=preset_dropdown,
            outputs=[tone, emotion, pitch, pace, monotony],
        )

        generate_btn = gr.Button("Generate Audio")
        audio_output = gr.Audio(type="filepath", label="Generated Audio")
        status_output = gr.Textbox(label="Status", interactive=False)

        # Input order must match the positional parameters of generate_audio().
        generate_btn.click(
            fn=generate_audio,
            inputs=[
                prompt_input,
                speaker_dropdown,
                tone,
                emotion,
                pitch,
                pace,
                monotony,
                noise,
                reverberation,
            ],
            outputs=[audio_output, status_output],
        )

    return demo


# --- Warmup logic ---
@spaces.GPU(duration=180)
def warmup_model():
    """Load the engine and synthesize five fixed sentences as a warmup pass.

    Intended to preload the model and CUDA before the first real request.
    Each sentence is attempted independently: a failure is printed as a
    warning and the loop moves on. Generated audio is discarded.
    """
    engine = _ensure_infer(CHECKPOINT, BASE_MODEL)
    warmup_cfg = GenerationConfig(max_length=256)

    # Warm up with whichever speaker comes first in GENDER_MAP.
    first_speaker = list(GENDER_MAP.keys())[0]

    # Fixed voice settings shared by every warmup sentence.
    voice_settings = {
        "pace": "moderate speed",
        "noise": "very clear",
        "reverberation": "very close-sounding",
        "monotony": "expressive and animated",
        "pitch": "moderate pitch",
        "emotion": "neutral",
        "tone": "serious",
        "add_context": False,
    }

    sentences = (
        "Hello there, this is a warmup test.",
        "The model is preparing to generate speech.",
        "Please wait a moment while we load everything.",
        "This is sentence number four for warmup.",
        "Warmup complete, ready to synthesize voice!",
    )

    for sentence in sentences:
        try:
            description = build_advanced_description(speaker=first_speaker, **voice_settings)
            engine.generate_audio(sentence, description, warmup_cfg)
        except Exception as err:
            print(f"[WARN] Warmup failed for '{sentence}': {err}")

    print("[INFO] Warmup completed ✅")


def _parse_args() -> argparse.Namespace:
    """Parse the command-line options of the ParlerVoice Gradio app.

    Returns:
        A namespace with ``server_name`` (default ``"0.0.0.0"``),
        ``server_port`` (int, default ``8000``) and ``share`` (``True`` only
        when ``--share`` is given).
    """
    parser = argparse.ArgumentParser(description="ParlerVoice Gradio App")

    # (flag, add_argument keyword arguments), registered in this order.
    option_table = (
        ("--server-name", {"default": "0.0.0.0"}),
        ("--server-port", {"type": int, "default": 8000}),
        ("--share", {"action": "store_true"}),
    )
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)

    return parser.parse_args()


def main() -> int:
    """Parse CLI options, warm up the model, then build and launch the demo.

    Returns:
        ``0`` once ``demo.launch()`` returns.
    """
    # Parse first so `--help` or an invalid flag exits immediately instead of
    # after the model has been loaded and warmed up.
    # NOTE(review): the parsed server name, port and share flag are not
    # forwarded to launch(); the app starts with Gradio's own defaults.
    # Confirm whether that is intentional before wiring them through.
    _parse_args()

    warmup_model()
    demo = build_demo()
    demo.launch()
    return 0


# Script entry point: run main() and use its return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())