File size: 7,360 Bytes
90cb481
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df1800e
90cb481
 
df1800e
0c44d05
70d93e2
df1800e
90cb481
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
"""
Hugging Face Space (Gradio) app for running ZipVoice inference by cloning
https://github.com/k2-fsa/ZipVoice and calling the inference script.

Files: this single file (app.py). Save it alongside a requirements.txt in the
Space that contains at least: gradio

Notes:
- This script attempts to `git clone` the ZipVoice repo on first run and install
  it (pip). On Spaces this can take time and may require a GPU-enabled runner
  (recommended). If your Space already has the ZipVoice code and deps installed,
  the startup will be faster.
- The app calls the repo's inference CLI: `python -m zipvoice.bin.infer_zipvoice`.
- You can change `REPO_URL` or the default model names if you have your own HF
  checkpoint or local weights.
- This is a practical example; depending on available hardware (CPU-only), you
  may need to use the `zipvoice_distill` or quantized models for speed.

Usage in Space:
- Create a new Space (Gradio, Python).
- Add this file as `app.py`.
- Add a small `requirements.txt` with: gradio
- Optionally add a `start.sh` or enable internet to let the app clone and
  install the ZipVoice repo on startup.

"""

import os
import subprocess
import shlex
import tempfile
import time
from pathlib import Path
import spaces
import gradio as gr

# NOTE(review): `spaces.GPU` appears to request GPU allocation for this call on
# Hugging Face Spaces (ZeroGPU) — confirm against the `spaces` package docs.
@spaces.GPU
def gpuCheck():
    """GPU smoke test: returns a fixed marker string when the decorated call runs."""
    return "GPU OK"

# CONFIG - change if needed
REPO_URL = "https://github.com/k2-fsa/ZipVoice.git"  # upstream repo cloned on first run
REPO_DIR = Path("/tmp/ZipVoice")  # clone destination (ephemeral across Space restarts)
PYTHON_CMD = "python3"  # interpreter used for pip installs and the inference CLI
DEFAULT_MODEL = "zipvoice"  # model name passed to --model-name
DEFAULT_DISTILL_MODEL = "zipvoice_distill"  # distilled variant (faster; see module docstring)


def run_cmd(cmd, cwd=None, env=None, timeout=1800):
    """Run a command and capture its output.

    The command string is tokenized with ``shlex.split`` and executed
    without a shell, so callers must pre-quote arguments (they use
    ``shlex.quote``).

    Args:
        cmd: Command line as a single string.
        cwd: Working directory for the child process, or None.
        env: Environment mapping for the child, or None to inherit.
        timeout: Seconds to wait before killing the process.

    Returns:
        Tuple ``(returncode, stdout, stderr)``. On timeout the return
        code is -1 and any partial output captured before the kill is
        preserved instead of being discarded.
    """
    try:
        proc = subprocess.run(
            shlex.split(cmd),
            cwd=cwd,
            env=env,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        return proc.returncode, proc.stdout, proc.stderr
    except subprocess.TimeoutExpired as e:
        # TimeoutExpired carries whatever the child wrote before it was
        # killed; surface it so setup/inference failures remain debuggable.
        # Defensively handle bytes in case the captured output was not decoded.
        out = e.stdout if e.stdout is not None else ""
        err = e.stderr if e.stderr is not None else ""
        if isinstance(out, bytes):
            out = out.decode(errors="replace")
        if isinstance(err, bytes):
            err = err.decode(errors="replace")
        return -1, out, f"Timeout: {e}\n{err}"


def ensure_zipvoice_installed():
    """Clone the ZipVoice repo and pip-install it if not already present.

    Returns:
        Tuple ``(ok, logs)``: ``ok`` is True when the repo is usable,
        ``logs`` is the accumulated setup output joined into one string.
    """
    log_lines = []

    def note(*chunks):
        # Collect log fragments in order; joined with newlines on return.
        log_lines.extend(chunks)

    def done(ok):
        return ok, "\n".join(log_lines)

    # Fast path: a previous run (or a pre-baked Space image) already has it.
    if REPO_DIR.exists():
        note(f"Found existing repo at {REPO_DIR}")
        return done(True)

    note(f"Cloning {REPO_URL} into {REPO_DIR} ...")
    code, out, err = run_cmd(f"git clone {REPO_URL} {REPO_DIR}")
    note(out, err)
    if code != 0:
        note("Failed to clone repository.")
        return done(False)

    # Installing requirements can be heavy (torch etc.); a failure here is
    # not fatal because the package install below may still succeed.
    req_txt = REPO_DIR / "requirements.txt"
    if req_txt.exists():
        note("Installing requirements.txt (this may take several minutes)...")
        code, out, err = run_cmd(f"{PYTHON_CMD} -m pip install -r {req_txt}")
        note(out, err)
        if code != 0:
            note("requirements install returned non-zero exit code, trying package install...")

    # Editable install picks up setup.py or pyproject from the clone.
    note("Attempting to install ZipVoice package (pip install -e .)")
    code, out, err = run_cmd(f"{PYTHON_CMD} -m pip install -e {REPO_DIR}")
    note(out, err)
    if code != 0:
        # The CLI may still run if deps happen to be available, but report
        # failure so the UI can warn the user.
        note("Editable install failed; try installing dependencies manually in the Space.")
        return done(False)

    note("ZipVoice installed successfully.")
    return done(True)


def infer_zipvoice(prompt_wav_path: str, prompt_text: str, text: str, model_name: str, num_steps: int = 6):
    """Run the ZipVoice inference CLI and return path to generated wav and logs.

    Returns:
        ``(wav_path, logs)`` on success, or ``(None, logs)`` when the CLI
        exits non-zero or produces no output file.
    """
    output_dir = Path("/tmp/zipvoice_out")
    output_dir.mkdir(parents=True, exist_ok=True)
    # Timestamped name so successive generations don't overwrite each other.
    result_wav = output_dir / f"result_{int(time.time())}.wav"

    # Every user-supplied value is shell-quoted because run_cmd re-splits
    # the command string with shlex.
    cmd = " ".join(
        [
            f"{PYTHON_CMD} -m zipvoice.bin.infer_zipvoice",
            "--model-name",
            shlex.quote(model_name),
            "--prompt-wav",
            shlex.quote(prompt_wav_path),
            "--prompt-text",
            shlex.quote(prompt_text),
            "--text",
            shlex.quote(text),
            "--res-wav-path",
            shlex.quote(str(result_wav)),
            "--num-steps",
            str(int(num_steps)),
        ]
    )

    rc, out, err = run_cmd(cmd, cwd=str(REPO_DIR), timeout=900)
    logs = f"RETURN_CODE={rc}\nSTDOUT:\n{out}\nSTDERR:\n{err}"

    # Only report success when the CLI both exited cleanly and actually
    # wrote the output file.
    if rc == 0 and result_wav.exists():
        return str(result_wav), logs
    return None, logs


# Build Gradio UI: left column collects inputs, right column shows results;
# a shared status box reports setup and generation state.
with gr.Blocks() as demo:
    gr.Markdown("# ZipVoice Hugging Face Space - Quick Runner")

    # Filled by the startup check on load and by generation outcomes later.
    status_box = gr.Textbox(label="Setup / Status logs", lines=8, interactive=False)

    with gr.Row():
        with gr.Column(scale=1):
            # Inputs: reference audio + its transcript, target text, model knobs.
            prompt_audio = gr.Audio(label="Prompt audio (wav) - short, clean, single speaker", type="filepath")
            prompt_text = gr.Textbox(label="Transcription of prompt audio", value="", placeholder="Type the transcription of the prompt wav here")
            text_to_speak = gr.Textbox(label="Text to synthesize", value="Hello, this is a test from ZipVoice.")
            model_choice = gr.Radio(choices=[DEFAULT_MODEL, DEFAULT_DISTILL_MODEL], value=DEFAULT_MODEL, label="Model")
            num_steps = gr.Slider(label="Number of sampling steps (lower = faster)", minimum=1, maximum=16, value=6, step=1)
            generate_btn = gr.Button("Generate")
        with gr.Column(scale=1):
            # Outputs: synthesized audio plus the raw CLI logs for debugging.
            output_audio = gr.Audio(label="Generated audio (result.wav)")
            logs_out = gr.Textbox(label="Inference logs", lines=12, interactive=False)

    def startup_check():
        """Run when the UI loads: clone/install ZipVoice and return a single
        status string for `status_box`."""
        ok, logs = ensure_zipvoice_installed()
        if not ok:
            msg = (
                "Warning: automatic install failed.\n"
                "Please preinstall model dependencies (torch, soundfile, etc.) or enable internet for this Space.\n"
                "Install logs:\n"
            )
            return msg + logs
        return "Setup complete. You can upload prompt audio and run inference.\n" + logs

    def on_generate(prompt_wav, p_text, text, model_name, n_steps):
        """Click handler: validate input, ensure ZipVoice is installed, then
        run inference.

        Returns a 3-tuple matching the click() outputs:
        (audio path or None, inference logs, short status message).
        """
        if not prompt_wav:
            return None, "", "Please upload a prompt WAV file."
        # ensure installed (cheap no-op once REPO_DIR exists)
        ok, logs = ensure_zipvoice_installed()
        if not ok:
            return None, "", "ZipVoice is not installed correctly. See logs:\n" + logs

        # Copy prompt wav into tmp file (sometimes gradio provides a temporary path already)
        tmp_prompt = Path(prompt_wav)
        if not tmp_prompt.exists():
            return None, "", f"Prompt file not found: {prompt_wav}"

        res_path, infer_logs = infer_zipvoice(str(tmp_prompt), p_text or "", text or "", model_name, int(n_steps))
        if res_path:
            return res_path, infer_logs, "Generation successful"
        else:
            return None, infer_logs, "Generation failed. See logs."

    # Wire events: setup check on page load; generation on button click.
    demo.load(startup_check, outputs=[status_box])
    generate_btn.click(on_generate, inputs=[prompt_audio, prompt_text, text_to_speak, model_choice, num_steps], outputs=[output_audio, logs_out, status_box])


if __name__ == "__main__":
    # Start the Gradio server when this file is executed as a script.
    demo.launch()