"""Gradio demo app for ParlerVoice text-to-speech inference."""

import argparse
import glob
import os
from typing import Optional, Tuple

import gradio as gr
import spaces
import torch

from parlervoice_infer.engine import ParlerVoiceInference
from parlervoice_infer.config import GenerationConfig
from parlervoice_infer.presets import PRESETS
from parlervoice_infer.constants import (
    GENDER_MAP,
    PITCH_BINS as pitch_mean_bins,
    RATE_BINS as speaker_rate_bins,
    MONOTONY_BINS as speech_monotony_bins,
    NOISE_BINS as noise_bins,
    REVERB_BINS as reverberation_bins,
)
from parlervoice_infer.description import build_advanced_description

# --- Global inference engine ---
_INFER: Optional[ParlerVoiceInference] = None
CHECKPOINT = "voicing-ai/ParlerVoice"
BASE_MODEL = "parler-tts/parler-tts-mini-v1.1"


# --- Load model (singleton) ---
def _ensure_infer(checkpoint: str, base_model: str) -> ParlerVoiceInference:
    """Load the inference engine once and reuse it across requests."""
    global _INFER
    if _INFER is None:
        print("[INFO] Loading model...")
        _INFER = ParlerVoiceInference(checkpoint_path=checkpoint, base_model_path=base_model)
    return _INFER


# --- Cleanup old outputs ---
def cleanup_outputs(max_files: int = 20) -> None:
    """Keep only the latest `max_files` WAVs in the outputs/ directory."""
    os.makedirs("outputs", exist_ok=True)
    files = sorted(glob.glob("outputs/*.wav"), key=os.path.getmtime)
    if len(files) > max_files:
        old_files = files[: len(files) - max_files]
        for f in old_files:
            try:
                os.remove(f)
            except Exception:
                pass


# --- Audio generation ---
@spaces.GPU(duration=120)
def generate_audio(
    prompt: str,
    speaker: str,
    tone: str,
    emotion: str,
    pitch: str,
    pace: str,
    monotony: str,
    noise: str,
    reverberation: str,
) -> Tuple[str, str]:
    """Synthesize speech for `prompt` and return (audio_path, status_message)."""
    try:
        infer = _ensure_infer(CHECKPOINT, BASE_MODEL)
        # Build a natural-language voice description from the UI selections.
        description = build_advanced_description(
            speaker=speaker,
            pace=pace,
            noise=noise,
            reverberation=reverberation,
            monotony=monotony,
            pitch=pitch,
            emotion=emotion,
            tone=tone,
            add_context=True,
        )
        cfg = GenerationConfig()
        os.makedirs("outputs", exist_ok=True)
        out_path = os.path.join("outputs", f"parler_out_{os.getpid()}.wav")
        cleanup_outputs(max_files=20)

        print(f"[INFO] Generating audio to {out_path} ...")
        audio_array, saved = infer.generate_audio(
            prompt=prompt,
            description=description,
            config=cfg,
            output_path=out_path,
        )

        # Fallback: if the engine did not write a file, save the raw array ourselves.
        if not saved or not os.path.isfile(saved):
            import soundfile as sf

            if audio_array is None or len(audio_array) == 0:
                raise ValueError("generate_audio() did not return valid audio data.")
            sf.write(out_path, audio_array, getattr(infer, "sampling_rate", 22050))
            saved = out_path

        return saved, "Success"
    except Exception as e:
        import traceback

        print(traceback.format_exc())
        return "", f"Error: {e}"


# --- Gradio demo ---
def build_demo() -> gr.Blocks:
    SPEAKER_NAMES = sorted(GENDER_MAP.keys())
    preset_names = ["Custom"] + list(PRESETS.keys())

    with gr.Blocks() as demo:
        gr.Markdown("# ParlerVoice")

        prompt_input = gr.Textbox(label="Enter Text", placeholder="Type what the speaker says...")
        speaker_dropdown = gr.Dropdown(label="Select Speaker", choices=SPEAKER_NAMES, value=SPEAKER_NAMES[0])
        preset_dropdown = gr.Dropdown(
            label="Voice Preset",
            choices=preset_names,
            value="Custom",
            interactive=True,
        )

        with gr.Group():
            tone = gr.Dropdown(
                label="Tone",
                choices=[
                    "serious", "dramatic", "casual", "professional", "storytelling",
                    "narrative", "emotional", "energetic", "loving",
                ],
                value="serious",
            )
            emotion = gr.Dropdown(
                label="Emotion",
                choices=[
                    "neutral", "sad", "happy", "angry", "excited", "confused", "loving", "casual",
                ],
                value="neutral",
            )
            pitch = gr.Dropdown(label="Pitch", choices=pitch_mean_bins, value="moderate pitch")
            pace = gr.Dropdown(label="Pace", choices=speaker_rate_bins, value="moderate speed")
            monotony = gr.Dropdown(label="Speech Style", choices=speech_monotony_bins, value="expressive and animated")
            noise = gr.Dropdown(label="Noise", choices=noise_bins, value="very clear")
            reverberation = gr.Dropdown(label="Reverberation", choices=reverberation_bins, value="very close-sounding")

        gr.Markdown(
            """
            **Sample Descriptions:**
            - Connor delivers a serious and professional message with a calm, even pace and a moderate pitch.
            - Madison delivers a sad and disappointed speech. Her voice is slightly high-pitched and sounds emotional.
            - Jackson delivers a narrative with a slightly dramatic tone and clean recording.
            """
        )

        def apply_preset(preset_name: str):
            if preset_name == "Custom" or preset_name not in PRESETS:
                return gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
            preset = PRESETS[preset_name]
            return (
                gr.update(value=preset.get("tone")),
                gr.update(value=preset.get("emotion")),
                gr.update(value=preset.get("pitch")),
                gr.update(value=preset.get("pace")),
                gr.update(value=preset.get("monotony")),
            )

        preset_dropdown.change(
            fn=apply_preset,
            inputs=preset_dropdown,
            outputs=[tone, emotion, pitch, pace, monotony],
        )

        generate_btn = gr.Button("Generate Audio")
        audio_output = gr.Audio(type="filepath", label="Generated Audio")
        status_output = gr.Textbox(label="Status", interactive=False)

        generate_btn.click(
            fn=generate_audio,
            inputs=[
                prompt_input, speaker_dropdown, tone, emotion, pitch,
                pace, monotony, noise, reverberation,
            ],
            outputs=[audio_output, status_output],
        )

    return demo


# --- Warmup logic ---
@spaces.GPU(duration=180)
def warmup_model():
    """Run a few dummy sentences to preload model & CUDA."""
    infer = _ensure_infer(CHECKPOINT, BASE_MODEL)
    cfg = GenerationConfig(max_length=256)
    warmup_sentences = [
        "Hello there, this is a warmup test.",
        "The model is preparing to generate speech.",
        "Please wait a moment while we load everything.",
        "This is sentence number four for warmup.",
        "Warmup complete, ready to synthesize voice!",
    ]
    speaker = list(GENDER_MAP.keys())[0]
    for text in warmup_sentences:
        try:
            desc = build_advanced_description(
                speaker=speaker,
                pace="moderate speed",
                noise="very clear",
                reverberation="very close-sounding",
                monotony="expressive and animated",
                pitch="moderate pitch",
                emotion="neutral",
                tone="serious",
                add_context=False,
            )
            infer.generate_audio(text, desc, cfg)
        except Exception as e:
            print(f"[WARN] Warmup failed for '{text}': {e}")
    print("[INFO] Warmup completed ✅")


def _parse_args() -> argparse.Namespace:
    p = argparse.ArgumentParser(description="ParlerVoice Gradio App")
    p.add_argument("--server-name", default="0.0.0.0")
    p.add_argument("--server-port", type=int, default=8000)
    p.add_argument("--share", action="store_true")
    return p.parse_args()


def main() -> int:
    warmup_model()
    args = _parse_args()
    demo = build_demo()
    # demo.launch(server_name=args.server_name, server_port=args.server_port, share=args.share)
    demo.launch()
    return 0


if __name__ == "__main__":
    raise SystemExit(main())