# ParlerVoice — Gradio demo app (HuggingFace Spaces, ZeroGPU)
import argparse
import glob
import os
import uuid
from typing import Optional, Tuple

import gradio as gr
import spaces
import torch

from parlervoice_infer.config import GenerationConfig
from parlervoice_infer.constants import (
    GENDER_MAP,
    PITCH_BINS as pitch_mean_bins,
    RATE_BINS as speaker_rate_bins,
    MONOTONY_BINS as speech_monotony_bins,
    NOISE_BINS as noise_bins,
    REVERB_BINS as reverberation_bins,
)
from parlervoice_infer.description import build_advanced_description
from parlervoice_infer.engine import ParlerVoiceInference
from parlervoice_infer.presets import PRESETS
| # --- Global inference engine --- | |
| _INFER: ParlerVoiceInference = None | |
| CHECKPOINT = "voicing-ai/ParlerVoice" | |
| BASE_MODEL = "parler-tts/parler-tts-mini-v1.1" | |
# --- Load model (singleton) ---
def _ensure_infer(checkpoint: str, base_model: str) -> ParlerVoiceInference:
    """Return the process-wide ParlerVoiceInference, building it on first use."""
    global _INFER
    if _INFER is not None:
        return _INFER
    print("[INFO] Loading model...")
    _INFER = ParlerVoiceInference(checkpoint_path=checkpoint, base_model_path=base_model)
    return _INFER
# --- Cleanup old outputs ---
def cleanup_outputs(max_files: int = 20, directory: str = "outputs") -> None:
    """Keep only the latest `max_files` WAVs in the output directory.

    Args:
        max_files: Number of most-recently-modified .wav files to retain.
        directory: Directory holding generated WAVs (created if missing).
            Defaults to "outputs" for backward compatibility.
    """
    os.makedirs(directory, exist_ok=True)
    # Oldest files sort first; everything before the last `max_files` entries goes.
    files = sorted(glob.glob(os.path.join(directory, "*.wav")), key=os.path.getmtime)
    stale = files[:-max_files] if max_files > 0 else files
    for path in stale:
        try:
            os.remove(path)
        except OSError:
            # Best-effort cleanup: a file vanishing concurrently is not an error.
            pass
# --- Audio generation ---
def generate_audio(
    prompt: str,
    speaker: str,
    tone: str,
    emotion: str,
    pitch: str,
    pace: str,
    monotony: str,
    noise: str,
    reverberation: str,
) -> Tuple[str, str]:
    """Synthesize speech for `prompt` using the selected voice controls.

    Returns:
        (wav_path, status): `wav_path` is the generated file, or "" on failure;
        `status` is "Success" or an "Error: ..." message for the UI.
    """
    try:
        infer = _ensure_infer(CHECKPOINT, BASE_MODEL)
        description = build_advanced_description(
            speaker=speaker,
            pace=pace,
            noise=noise,
            reverberation=reverberation,
            monotony=monotony,
            pitch=pitch,
            emotion=emotion,
            tone=tone,
            add_context=True,
        )
        cfg = GenerationConfig()
        os.makedirs("outputs", exist_ok=True)
        # uuid4 gives each request its own file; the original os.getpid() is
        # constant for the whole server process, so every request clobbered
        # the same WAV (a race when two generations overlap).
        out_path = os.path.join("outputs", f"parler_out_{uuid.uuid4().hex}.wav")
        cleanup_outputs(max_files=20)
        print(f"[INFO] Generating audio to {out_path} ...")
        audio_array, saved = infer.generate_audio(
            prompt=prompt,
            description=description,
            config=cfg,
            output_path=out_path,
        )
        # Fallback: if the engine returned audio without writing the file,
        # persist it ourselves.
        if not saved or not os.path.isfile(saved):
            import soundfile as sf
            if audio_array is None or len(audio_array) == 0:
                raise ValueError("generate_audio() did not return valid audio data.")
            # assumes the engine exposes `sampling_rate`; 22050 is a fallback guess — TODO confirm
            sf.write(out_path, audio_array, getattr(infer, "sampling_rate", 22050))
            saved = out_path
        return saved, "Success"
    except Exception as e:
        # UI boundary: surface the error as a status string instead of raising.
        import traceback
        print(traceback.format_exc())
        return "", f"Error: {e}"
# --- Gradio demo ---
def build_demo() -> gr.Blocks:
    """Construct the Gradio Blocks UI; no model loading happens here."""
    SPEAKER_NAMES = sorted(GENDER_MAP.keys())
    # "Custom" is a sentinel meaning: leave the advanced dropdowns untouched.
    preset_names = ["Custom"] + list(PRESETS.keys())
    with gr.Blocks() as demo:
        gr.Markdown("# ParlerVoice")
        prompt_input = gr.Textbox(label="Enter Text", placeholder="Type what the speaker says...")
        speaker_dropdown = gr.Dropdown(label="Select Speaker", choices=SPEAKER_NAMES, value=SPEAKER_NAMES[0])
        preset_dropdown = gr.Dropdown(
            label="Voice Preset",
            choices=preset_names,
            value="Custom",
            interactive=True,
        )
        # Advanced voice controls; defaults describe a neutral, clean studio voice.
        with gr.Group():
            tone = gr.Dropdown(
                label="Tone",
                choices=[
                    "serious",
                    "dramatic",
                    "casual",
                    "professional",
                    "storytelling",
                    "narrative",
                    "emotional",
                    "energetic",
                    "loving"
                ],
                value="serious",
            )
            emotion = gr.Dropdown(
                label="Emotion",
                choices=[
                    "neutral",
                    "sad",
                    "happy",
                    "angry",
                    "excited",
                    "confused",
                    "loving",
                    "casual"
                ],
                value="neutral",
            )
            # Choice lists for these come from parlervoice_infer.constants bins.
            pitch = gr.Dropdown(label="Pitch", choices=pitch_mean_bins, value="moderate pitch")
            pace = gr.Dropdown(label="Pace", choices=speaker_rate_bins, value="moderate speed")
            monotony = gr.Dropdown(label="Speech Style", choices=speech_monotony_bins, value="expressive and animated")
            noise = gr.Dropdown(label="Noise", choices=noise_bins, value="very clear")
            reverberation = gr.Dropdown(label="Reverberation", choices=reverberation_bins, value="very close-sounding")
        gr.Markdown(
            """
            **Sample Descriptions:**
            - Connor delivers a serious and professional message with a calm, even pace and a moderate pitch.
            - Madison delivers a sad and disappointed speech. Her voice is slightly high-pitched and sounds emotional.
            - Jackson delivers a narrative with a slightly dramatic tone and clean recording.
            """
        )

        def apply_preset(preset_name: str):
            """Map a preset name to updates for (tone, emotion, pitch, pace, monotony)."""
            if preset_name == "Custom" or preset_name not in PRESETS:
                # No-op updates keep whatever the user currently has selected.
                return gr.update(), gr.update(), gr.update(), gr.update(), gr.update()
            preset = PRESETS[preset_name]
            # .get() yields None for keys a preset omits; Gradio treats that as "no value".
            return (
                gr.update(value=preset.get("tone")),
                gr.update(value=preset.get("emotion")),
                gr.update(value=preset.get("pitch")),
                gr.update(value=preset.get("pace")),
                gr.update(value=preset.get("monotony")),
            )

        preset_dropdown.change(
            fn=apply_preset,
            inputs=preset_dropdown,
            outputs=[tone, emotion, pitch, pace, monotony],
        )
        generate_btn = gr.Button("Generate Audio")
        audio_output = gr.Audio(type="filepath", label="Generated Audio")
        status_output = gr.Textbox(label="Status", interactive=False)
        # Input order must match generate_audio's parameter order exactly.
        generate_btn.click(
            fn=generate_audio,
            inputs=[
                prompt_input,
                speaker_dropdown,
                tone,
                emotion,
                pitch,
                pace,
                monotony,
                noise,
                reverberation,
            ],
            outputs=[audio_output, status_output],
        )
    return demo
# --- Warmup logic ---
def warmup_model() -> None:
    """Run a few dummy sentences to preload model & CUDA."""
    infer = _ensure_infer(CHECKPOINT, BASE_MODEL)
    cfg = GenerationConfig(max_length=256)
    warmup_sentences = [
        "Hello there, this is a warmup test.",
        "The model is preparing to generate speech.",
        "Please wait a moment while we load everything.",
        "This is sentence number four for warmup.",
        "Warmup complete, ready to synthesize voice!",
    ]
    speaker = list(GENDER_MAP.keys())[0]
    # The description is identical for every sentence — build it once instead
    # of once per iteration (it was loop-invariant in the original).
    desc = build_advanced_description(
        speaker=speaker,
        pace="moderate speed",
        noise="very clear",
        reverberation="very close-sounding",
        monotony="expressive and animated",
        pitch="moderate pitch",
        emotion="neutral",
        tone="serious",
        add_context=False,
    )
    for text in warmup_sentences:
        try:
            infer.generate_audio(text, desc, cfg)
        except Exception as e:
            # Warmup is best-effort; a single failure should not abort startup.
            print(f"[WARN] Warmup failed for '{text}': {e}")
    print("[INFO] Warmup completed ✅")
| def _parse_args() -> argparse.Namespace: | |
| p = argparse.ArgumentParser(description="ParlerVoice Gradio App") | |
| p.add_argument("--server-name", default="0.0.0.0") | |
| p.add_argument("--server-port", type=int, default=8000) | |
| p.add_argument("--share", action="store_true") | |
| return p.parse_args() | |
def main() -> int:
    """CLI entry point: parse args, warm up the model, serve the UI.

    Returns:
        Process exit code (0 on normal shutdown).
    """
    # Parse first so an invalid flag fails fast, before the slow model warmup.
    args = _parse_args()
    warmup_model()
    demo = build_demo()
    # Honor the CLI options; they were previously parsed but silently ignored
    # (the launch call that used them had been commented out).
    demo.launch(server_name=args.server_name, server_port=args.server_port, share=args.share)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())