"""ParlerVoice Gradio demo app (app.py)."""
import argparse
import os
import glob
from typing import Optional, Tuple
import gradio as gr
import torch
import spaces
from parlervoice_infer.engine import ParlerVoiceInference
from parlervoice_infer.config import GenerationConfig
from parlervoice_infer.presets import PRESETS
from parlervoice_infer.constants import (
    GENDER_MAP,
    PITCH_BINS as pitch_mean_bins,
    RATE_BINS as speaker_rate_bins,
    MONOTONY_BINS as speech_monotony_bins,
    NOISE_BINS as noise_bins,
    REVERB_BINS as reverberation_bins,
)
from parlervoice_infer.description import build_advanced_description
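# Note: build_advanced_description() presumably composes the speaker/tone/emotion/etc.
# values into a natural-language voice description, similar to the "Sample Descriptions"
# shown in the UI below; the exact wording is defined inside parlervoice_infer.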
# --- Global inference engine ---
_INFER: Optional[ParlerVoiceInference] = None
CHECKPOINT = "voicing-ai/ParlerVoice"
BASE_MODEL = "parler-tts/parler-tts-mini-v1.1"
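# CHECKPOINT is the fine-tuned ParlerVoice checkpoint and BASE_MODEL the Parler-TTS model
# it builds on; both are assumed to be Hugging Face Hub repo IDs resolved at load time.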
# --- Load model (singleton) ---
def _ensure_infer(checkpoint: str, base_model: str) -> ParlerVoiceInference:
    """Lazily create the inference engine once and reuse it for all requests."""
    global _INFER
    if _INFER is None:
        print("[INFO] Loading model...")
        _INFER = ParlerVoiceInference(checkpoint_path=checkpoint, base_model_path=base_model)
    return _INFER

# --- Cleanup old outputs ---
def cleanup_outputs(max_files=20):
    """Keep only the latest `max_files` WAV files in the outputs/ directory."""
    os.makedirs("outputs", exist_ok=True)
    files = sorted(glob.glob("outputs/*.wav"), key=os.path.getmtime)
    if len(files) > max_files:
        old_files = files[: len(files) - max_files]
        for f in old_files:
            try:
                os.remove(f)
            except OSError:
                # Ignore files that disappeared or cannot be deleted.
                pass

# --- Audio generation ---
@spaces.GPU(duration=120)
def generate_audio(
    prompt: str,
    speaker: str,
    tone: str,
    emotion: str,
    pitch: str,
    pace: str,
    monotony: str,
    noise: str,
    reverberation: str,
) -> Tuple[str, str]:
    """Synthesize speech for `prompt` with the selected speaker and style controls."""
    try:
        infer = _ensure_infer(CHECKPOINT, BASE_MODEL)
        description = build_advanced_description(
            speaker=speaker,
            pace=pace,
            noise=noise,
            reverberation=reverberation,
            monotony=monotony,
            pitch=pitch,
            emotion=emotion,
            tone=tone,
            add_context=True,
        )
        cfg = GenerationConfig()
        os.makedirs("outputs", exist_ok=True)
        out_path = os.path.join("outputs", f"parler_out_{os.getpid()}.wav")
        cleanup_outputs(max_files=20)
        print(f"[INFO] Generating audio to {out_path} ...")
        audio_array, saved = infer.generate_audio(
            prompt=prompt,
            description=description,
            config=cfg,
            output_path=out_path,
        )
        # Fall back to writing the raw array if the engine did not save a file itself.
        if not saved or not os.path.isfile(saved):
            import soundfile as sf

            if audio_array is None or len(audio_array) == 0:
                raise ValueError("generate_audio() did not return valid audio data.")
            sf.write(out_path, audio_array, getattr(infer, "sampling_rate", 22050))
            saved = out_path
        return saved, "Success"
    except Exception as e:
        import traceback

        print(traceback.format_exc())
        return "", f"Error: {e}"

# --- Gradio demo ---
def build_demo() -> gr.Blocks:
    SPEAKER_NAMES = sorted(GENDER_MAP.keys())
    preset_names = ["Custom"] + list(PRESETS.keys())

    with gr.Blocks() as demo:
        gr.Markdown("# ParlerVoice")
        prompt_input = gr.Textbox(label="Enter Text", placeholder="Type what the speaker says...")
        speaker_dropdown = gr.Dropdown(label="Select Speaker", choices=SPEAKER_NAMES, value=SPEAKER_NAMES[0])
        preset_dropdown = gr.Dropdown(
            label="Voice Preset",
            choices=preset_names,
            value="Custom",
            interactive=True,
        )
        with gr.Group():
            tone = gr.Dropdown(
                label="Tone",
                choices=[
                    "serious",
                    "dramatic",
                    "casual",
                    "professional",
                    "storytelling",
                    "narrative",
                    "emotional",
                    "energetic",
                    "loving",
                ],
                value="serious",
            )
            emotion = gr.Dropdown(
                label="Emotion",
                choices=[
                    "neutral",
                    "sad",
                    "happy",
                    "angry",
                    "excited",
                    "confused",
                    "loving",
                    "casual",
                ],
                value="neutral",
            )
            pitch = gr.Dropdown(label="Pitch", choices=pitch_mean_bins, value="moderate pitch")
            pace = gr.Dropdown(label="Pace", choices=speaker_rate_bins, value="moderate speed")
            monotony = gr.Dropdown(label="Speech Style", choices=speech_monotony_bins, value="expressive and animated")
            noise = gr.Dropdown(label="Noise", choices=noise_bins, value="very clear")
            reverberation = gr.Dropdown(label="Reverberation", choices=reverberation_bins, value="very close-sounding")

        gr.Markdown(
            """
            **Sample Descriptions:**
            - Connor delivers a serious and professional message with a calm, even pace and a moderate pitch.
            - Madison delivers a sad and disappointed speech. Her voice is slightly high-pitched and sounds emotional.
            - Jackson delivers a narrative with a slightly dramatic tone and clean recording.
            """
        )

        def apply_preset(preset_name: str):
            if preset_name == "Custom" or preset_name not in PRESETS:
                return gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
            preset = PRESETS[preset_name]
            return (
                gr.update(value=preset.get("tone")),
                gr.update(value=preset.get("emotion")),
                gr.update(value=preset.get("pitch")),
                gr.update(value=preset.get("pace")),
                gr.update(value=preset.get("monotony")),
            )

        preset_dropdown.change(
            fn=apply_preset,
            inputs=preset_dropdown,
            outputs=[tone, emotion, pitch, pace, monotony],
        )

        generate_btn = gr.Button("Generate Audio")
        audio_output = gr.Audio(type="filepath", label="Generated Audio")
        status_output = gr.Textbox(label="Status", interactive=False)

        generate_btn.click(
            fn=generate_audio,
            inputs=[
                prompt_input,
                speaker_dropdown,
                tone,
                emotion,
                pitch,
                pace,
                monotony,
                noise,
                reverberation,
            ],
            outputs=[audio_output, status_output],
        )

    return demo

# --- Warmup logic ---
@spaces.GPU(duration=180)
def warmup_model():
    """Run a few dummy sentences to preload model & CUDA."""
    infer = _ensure_infer(CHECKPOINT, BASE_MODEL)
    cfg = GenerationConfig(max_length=256)
    warmup_sentences = [
        "Hello there, this is a warmup test.",
        "The model is preparing to generate speech.",
        "Please wait a moment while we load everything.",
        "This is sentence number four for warmup.",
        "Warmup complete, ready to synthesize voice!",
    ]
    speaker = list(GENDER_MAP.keys())[0]
    for text in warmup_sentences:
        try:
            desc = build_advanced_description(
                speaker=speaker,
                pace="moderate speed",
                noise="very clear",
                reverberation="very close-sounding",
                monotony="expressive and animated",
                pitch="moderate pitch",
                emotion="neutral",
                tone="serious",
                add_context=False,
            )
            infer.generate_audio(text, desc, cfg)
        except Exception as e:
            print(f"[WARN] Warmup failed for '{text}': {e}")
    print("[INFO] Warmup completed ✅")

def _parse_args() -> argparse.Namespace:
    p = argparse.ArgumentParser(description="ParlerVoice Gradio App")
    p.add_argument("--server-name", default="0.0.0.0")
    p.add_argument("--server-port", type=int, default=8000)
    p.add_argument("--share", action="store_true")
    return p.parse_args()

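# Example local invocation (these flags are consumed by the commented-out launch() call in main()):
#   python app.py --server-name 0.0.0.0 --server-port 8000 --share
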
def main() -> int:
    # Parse CLI flags first so `--help` returns before the (slow) model warmup runs.
    args = _parse_args()
    warmup_model()
    demo = build_demo()
    # For a local run, launch with the parsed flags instead:
    # demo.launch(server_name=args.server_name, server_port=args.server_port, share=args.share)
    demo.launch()
    return 0


if __name__ == "__main__":
    raise SystemExit(main())