"""ParlerVoice Gradio demo app (app.py)."""
import argparse
import os
import glob
from typing import Optional, Tuple
import gradio as gr
import torch
import spaces
from parlervoice_infer.engine import ParlerVoiceInference
from parlervoice_infer.config import GenerationConfig
from parlervoice_infer.presets import PRESETS
from parlervoice_infer.constants import (
    GENDER_MAP,
    PITCH_BINS as pitch_mean_bins,
    RATE_BINS as speaker_rate_bins,
    MONOTONY_BINS as speech_monotony_bins,
    NOISE_BINS as noise_bins,
    REVERB_BINS as reverberation_bins,
)
from parlervoice_infer.description import build_advanced_description
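# Note: build_advanced_description() presumably composes the speaker/tone/emotion/etc.
# values into a natural-language voice description, similar to the "Sample Descriptions"
# shown in the UI below; the exact wording is defined inside parlervoice_infer.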
# --- Global inference engine ---
_INFER: Optional[ParlerVoiceInference] = None
CHECKPOINT = "voicing-ai/ParlerVoice"
BASE_MODEL = "parler-tts/parler-tts-mini-v1.1"
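# CHECKPOINT is the fine-tuned ParlerVoice checkpoint and BASE_MODEL the Parler-TTS model
# it builds on; both are assumed to be Hugging Face Hub repo IDs resolved at load time.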
# --- Load model (singleton) ---
def _ensure_infer(checkpoint: str, base_model: str) -> ParlerVoiceInference:
    """Lazily create the inference engine once and reuse it for all requests."""
    global _INFER
    if _INFER is None:
        print("[INFO] Loading model...")
        _INFER = ParlerVoiceInference(checkpoint_path=checkpoint, base_model_path=base_model)
    return _INFER

# --- Cleanup old outputs ---
def cleanup_outputs(max_files=20):
    """Keep only the latest `max_files` WAV files in the outputs/ directory."""
    os.makedirs("outputs", exist_ok=True)
    files = sorted(glob.glob("outputs/*.wav"), key=os.path.getmtime)
    if len(files) > max_files:
        old_files = files[: len(files) - max_files]
        for f in old_files:
            try:
                os.remove(f)
            except OSError:
                # Ignore files that disappeared or cannot be deleted.
                pass

# --- Audio generation ---
@spaces.GPU(duration=120)
def generate_audio(
    prompt: str,
    speaker: str,
    tone: str,
    emotion: str,
    pitch: str,
    pace: str,
    monotony: str,
    noise: str,
    reverberation: str,
) -> Tuple[str, str]:
    """Synthesize speech for `prompt` with the selected speaker and style controls."""
    try:
        infer = _ensure_infer(CHECKPOINT, BASE_MODEL)
        description = build_advanced_description(
            speaker=speaker,
            pace=pace,
            noise=noise,
            reverberation=reverberation,
            monotony=monotony,
            pitch=pitch,
            emotion=emotion,
            tone=tone,
            add_context=True,
        )
        cfg = GenerationConfig()
        os.makedirs("outputs", exist_ok=True)
        out_path = os.path.join("outputs", f"parler_out_{os.getpid()}.wav")
        cleanup_outputs(max_files=20)
        print(f"[INFO] Generating audio to {out_path} ...")
        audio_array, saved = infer.generate_audio(
            prompt=prompt,
            description=description,
            config=cfg,
            output_path=out_path,
        )
        # Fall back to writing the raw array if the engine did not save a file itself.
        if not saved or not os.path.isfile(saved):
            import soundfile as sf

            if audio_array is None or len(audio_array) == 0:
                raise ValueError("generate_audio() did not return valid audio data.")
            sf.write(out_path, audio_array, getattr(infer, "sampling_rate", 22050))
            saved = out_path
        return saved, "Success"
    except Exception as e:
        import traceback

        print(traceback.format_exc())
        return "", f"Error: {e}"

# --- Gradio demo ---
def build_demo() -> gr.Blocks:
    SPEAKER_NAMES = sorted(GENDER_MAP.keys())
    preset_names = ["Custom"] + list(PRESETS.keys())

    with gr.Blocks() as demo:
        gr.Markdown("# ParlerVoice")
        prompt_input = gr.Textbox(label="Enter Text", placeholder="Type what the speaker says...")
        speaker_dropdown = gr.Dropdown(label="Select Speaker", choices=SPEAKER_NAMES, value=SPEAKER_NAMES[0])
        preset_dropdown = gr.Dropdown(
            label="Voice Preset",
            choices=preset_names,
            value="Custom",
            interactive=True,
        )
        with gr.Group():
            tone = gr.Dropdown(
                label="Tone",
                choices=[
                    "serious",
                    "dramatic",
                    "casual",
                    "professional",
                    "storytelling",
                    "narrative",
                    "emotional",
                    "energetic",
                    "loving",
                ],
                value="serious",
            )
            emotion = gr.Dropdown(
                label="Emotion",
                choices=[
                    "neutral",
                    "sad",
                    "happy",
                    "angry",
                    "excited",
                    "confused",
                    "loving",
                    "casual",
                ],
                value="neutral",
            )
            pitch = gr.Dropdown(label="Pitch", choices=pitch_mean_bins, value="moderate pitch")
            pace = gr.Dropdown(label="Pace", choices=speaker_rate_bins, value="moderate speed")
            monotony = gr.Dropdown(label="Speech Style", choices=speech_monotony_bins, value="expressive and animated")
            noise = gr.Dropdown(label="Noise", choices=noise_bins, value="very clear")
            reverberation = gr.Dropdown(label="Reverberation", choices=reverberation_bins, value="very close-sounding")

        gr.Markdown(
            """
            **Sample Descriptions:**
            - Connor delivers a serious and professional message with a calm, even pace and a moderate pitch.
            - Madison delivers a sad and disappointed speech. Her voice is slightly high-pitched and sounds emotional.
            - Jackson delivers a narrative with a slightly dramatic tone and clean recording.
            """
        )

        def apply_preset(preset_name: str):
            if preset_name == "Custom" or preset_name not in PRESETS:
                return gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
            preset = PRESETS[preset_name]
            return (
                gr.update(value=preset.get("tone")),
                gr.update(value=preset.get("emotion")),
                gr.update(value=preset.get("pitch")),
                gr.update(value=preset.get("pace")),
                gr.update(value=preset.get("monotony")),
            )

        preset_dropdown.change(
            fn=apply_preset,
            inputs=preset_dropdown,
            outputs=[tone, emotion, pitch, pace, monotony],
        )

        generate_btn = gr.Button("Generate Audio")
        audio_output = gr.Audio(type="filepath", label="Generated Audio")
        status_output = gr.Textbox(label="Status", interactive=False)

        generate_btn.click(
            fn=generate_audio,
            inputs=[
                prompt_input,
                speaker_dropdown,
                tone,
                emotion,
                pitch,
                pace,
                monotony,
                noise,
                reverberation,
            ],
            outputs=[audio_output, status_output],
        )

    return demo

# --- Warmup logic ---
@spaces.GPU(duration=180)
def warmup_model():
    """Run a few dummy sentences to preload model & CUDA."""
    infer = _ensure_infer(CHECKPOINT, BASE_MODEL)
    cfg = GenerationConfig(max_length=256)
    warmup_sentences = [
        "Hello there, this is a warmup test.",
        "The model is preparing to generate speech.",
        "Please wait a moment while we load everything.",
        "This is sentence number four for warmup.",
        "Warmup complete, ready to synthesize voice!",
    ]
    speaker = list(GENDER_MAP.keys())[0]
    for text in warmup_sentences:
        try:
            desc = build_advanced_description(
                speaker=speaker,
                pace="moderate speed",
                noise="very clear",
                reverberation="very close-sounding",
                monotony="expressive and animated",
                pitch="moderate pitch",
                emotion="neutral",
                tone="serious",
                add_context=False,
            )
            infer.generate_audio(text, desc, cfg)
        except Exception as e:
            print(f"[WARN] Warmup failed for '{text}': {e}")
    print("[INFO] Warmup completed ✅")

def _parse_args() -> argparse.Namespace:
    p = argparse.ArgumentParser(description="ParlerVoice Gradio App")
    p.add_argument("--server-name", default="0.0.0.0")
    p.add_argument("--server-port", type=int, default=8000)
    p.add_argument("--share", action="store_true")
    return p.parse_args()

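# Example local invocation (these flags are consumed by the commented-out launch() call in main()):
#   python app.py --server-name 0.0.0.0 --server-port 8000 --share
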
def main() -> int:
    # Parse CLI flags first so `--help` returns before the (slow) model warmup runs.
    args = _parse_args()
    warmup_model()
    demo = build_demo()
    # For a local run, launch with the parsed flags instead:
    # demo.launch(server_name=args.server_name, server_port=args.server_port, share=args.share)
    demo.launch()
    return 0


if __name__ == "__main__":
    raise SystemExit(main())