File size: 7,360 Bytes
90cb481
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
df1800e
90cb481
 
df1800e
0c44d05
70d93e2
df1800e
90cb481
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
"""
Hugging Face Space (Gradio) app for running ZipVoice inference by cloning
https://github.com/k2-fsa/ZipVoice and calling the inference script.

Files: this single file (app.py). Save it alongside a requirements.txt in the
Space that contains at least: gradio

Notes:
- This script attempts to `git clone` the ZipVoice repo on first run and install
  it (pip). On Spaces this can take time and may require a GPU-enabled runner
  (recommended). If your Space already has the ZipVoice code and deps installed,
  the startup will be faster.
- The app calls the repo's inference CLI: `python -m zipvoice.bin.infer_zipvoice`.
- You can change `REPO_URL` or the default model names if you have your own HF
  checkpoint or local weights.
- This is a practical example; depending on available hardware (CPU-only), you
  may need to use the `zipvoice_distill` or quantized models for speed.

Usage in Space:
- Create a new Space (Gradio, Python).
- Add this file as `app.py`.
- Add a small `requirements.txt` with: gradio
- Optionally add a `start.sh` or enable internet to let the app clone and
  install the ZipVoice repo on startup.

"""

import os
import subprocess
import shlex
import tempfile
import time
from pathlib import Path
import spaces
import gradio as gr

# NOTE(review): `spaces.GPU` appears to request GPU allocation for this call on
# Hugging Face Spaces (ZeroGPU) — confirm against the `spaces` package docs.
@spaces.GPU
def gpuCheck():
    """GPU smoke test: returns a fixed marker string when the decorated call runs."""
    return "GPU OK"

# CONFIG - change if needed
REPO_URL = "https://github.com/k2-fsa/ZipVoice.git"  # upstream repo cloned on first run
REPO_DIR = Path("/tmp/ZipVoice")  # clone destination (ephemeral across Space restarts)
PYTHON_CMD = "python3"  # interpreter used for pip installs and the inference CLI
DEFAULT_MODEL = "zipvoice"  # model name passed to --model-name
DEFAULT_DISTILL_MODEL = "zipvoice_distill"  # distilled variant (faster; see module docstring)


def run_cmd(cmd, cwd=None, env=None, timeout=1800):
    """Run a command and capture its output.

    The command string is tokenized with ``shlex.split`` and executed
    without a shell, so callers must pre-quote arguments (they use
    ``shlex.quote``).

    Args:
        cmd: Command line as a single string.
        cwd: Working directory for the child process, or None.
        env: Environment mapping for the child, or None to inherit.
        timeout: Seconds to wait before killing the process.

    Returns:
        Tuple ``(returncode, stdout, stderr)``. On timeout the return
        code is -1 and any partial output captured before the kill is
        preserved instead of being discarded.
    """
    try:
        proc = subprocess.run(
            shlex.split(cmd),
            cwd=cwd,
            env=env,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        return proc.returncode, proc.stdout, proc.stderr
    except subprocess.TimeoutExpired as e:
        # TimeoutExpired carries whatever the child wrote before it was
        # killed; surface it so setup/inference failures remain debuggable.
        # Defensively handle bytes in case the captured output was not decoded.
        out = e.stdout if e.stdout is not None else ""
        err = e.stderr if e.stderr is not None else ""
        if isinstance(out, bytes):
            out = out.decode(errors="replace")
        if isinstance(err, bytes):
            err = err.decode(errors="replace")
        return -1, out, f"Timeout: {e}\n{err}"


def ensure_zipvoice_installed():
    """Clone the ZipVoice repo and pip-install it if not already present.

    Returns:
        Tuple ``(ok, logs)``: ``ok`` is True when the repo is usable,
        ``logs`` is the accumulated setup output joined into one string.
    """
    log_lines = []

    def note(*chunks):
        # Collect log fragments in order; joined with newlines on return.
        log_lines.extend(chunks)

    def done(ok):
        return ok, "\n".join(log_lines)

    # Fast path: a previous run (or a pre-baked Space image) already has it.
    if REPO_DIR.exists():
        note(f"Found existing repo at {REPO_DIR}")
        return done(True)

    note(f"Cloning {REPO_URL} into {REPO_DIR} ...")
    code, out, err = run_cmd(f"git clone {REPO_URL} {REPO_DIR}")
    note(out, err)
    if code != 0:
        note("Failed to clone repository.")
        return done(False)

    # Installing requirements can be heavy (torch etc.); a failure here is
    # not fatal because the package install below may still succeed.
    req_txt = REPO_DIR / "requirements.txt"
    if req_txt.exists():
        note("Installing requirements.txt (this may take several minutes)...")
        code, out, err = run_cmd(f"{PYTHON_CMD} -m pip install -r {req_txt}")
        note(out, err)
        if code != 0:
            note("requirements install returned non-zero exit code, trying package install...")

    # Editable install picks up setup.py or pyproject from the clone.
    note("Attempting to install ZipVoice package (pip install -e .)")
    code, out, err = run_cmd(f"{PYTHON_CMD} -m pip install -e {REPO_DIR}")
    note(out, err)
    if code != 0:
        # The CLI may still run if deps happen to be available, but report
        # failure so the UI can warn the user.
        note("Editable install failed; try installing dependencies manually in the Space.")
        return done(False)

    note("ZipVoice installed successfully.")
    return done(True)


def infer_zipvoice(prompt_wav_path: str, prompt_text: str, text: str, model_name: str, num_steps: int = 6):
    """Run the ZipVoice inference CLI and return path to generated wav and logs.

    Returns:
        ``(wav_path, logs)`` on success, or ``(None, logs)`` when the CLI
        exits non-zero or produces no output file.
    """
    output_dir = Path("/tmp/zipvoice_out")
    output_dir.mkdir(parents=True, exist_ok=True)
    # Timestamped name so successive generations don't overwrite each other.
    result_wav = output_dir / f"result_{int(time.time())}.wav"

    # Every user-supplied value is shell-quoted because run_cmd re-splits
    # the command string with shlex.
    cmd = " ".join(
        [
            f"{PYTHON_CMD} -m zipvoice.bin.infer_zipvoice",
            "--model-name",
            shlex.quote(model_name),
            "--prompt-wav",
            shlex.quote(prompt_wav_path),
            "--prompt-text",
            shlex.quote(prompt_text),
            "--text",
            shlex.quote(text),
            "--res-wav-path",
            shlex.quote(str(result_wav)),
            "--num-steps",
            str(int(num_steps)),
        ]
    )

    rc, out, err = run_cmd(cmd, cwd=str(REPO_DIR), timeout=900)
    logs = f"RETURN_CODE={rc}\nSTDOUT:\n{out}\nSTDERR:\n{err}"

    # Only report success when the CLI both exited cleanly and actually
    # wrote the output file.
    if rc == 0 and result_wav.exists():
        return str(result_wav), logs
    return None, logs


# Build Gradio UI: left column collects inputs, right column shows results;
# a shared status box reports setup and generation state.
with gr.Blocks() as demo:
    gr.Markdown("# ZipVoice Hugging Face Space - Quick Runner")

    # Filled by the startup check on load and by generation outcomes later.
    status_box = gr.Textbox(label="Setup / Status logs", lines=8, interactive=False)

    with gr.Row():
        with gr.Column(scale=1):
            # Inputs: reference audio + its transcript, target text, model knobs.
            prompt_audio = gr.Audio(label="Prompt audio (wav) - short, clean, single speaker", type="filepath")
            prompt_text = gr.Textbox(label="Transcription of prompt audio", value="", placeholder="Type the transcription of the prompt wav here")
            text_to_speak = gr.Textbox(label="Text to synthesize", value="Hello, this is a test from ZipVoice.")
            model_choice = gr.Radio(choices=[DEFAULT_MODEL, DEFAULT_DISTILL_MODEL], value=DEFAULT_MODEL, label="Model")
            num_steps = gr.Slider(label="Number of sampling steps (lower = faster)", minimum=1, maximum=16, value=6, step=1)
            generate_btn = gr.Button("Generate")
        with gr.Column(scale=1):
            # Outputs: synthesized audio plus the raw CLI logs for debugging.
            output_audio = gr.Audio(label="Generated audio (result.wav)")
            logs_out = gr.Textbox(label="Inference logs", lines=12, interactive=False)

    def startup_check():
        """Run when the UI loads: clone/install ZipVoice and return a single
        status string for `status_box`."""
        ok, logs = ensure_zipvoice_installed()
        if not ok:
            msg = (
                "Warning: automatic install failed.\n"
                "Please preinstall model dependencies (torch, soundfile, etc.) or enable internet for this Space.\n"
                "Install logs:\n"
            )
            return msg + logs
        return "Setup complete. You can upload prompt audio and run inference.\n" + logs

    def on_generate(prompt_wav, p_text, text, model_name, n_steps):
        """Click handler: validate input, ensure ZipVoice is installed, then
        run inference.

        Returns a 3-tuple matching the click() outputs:
        (audio path or None, inference logs, short status message).
        """
        if not prompt_wav:
            return None, "", "Please upload a prompt WAV file."
        # ensure installed (cheap no-op once REPO_DIR exists)
        ok, logs = ensure_zipvoice_installed()
        if not ok:
            return None, "", "ZipVoice is not installed correctly. See logs:\n" + logs

        # Copy prompt wav into tmp file (sometimes gradio provides a temporary path already)
        tmp_prompt = Path(prompt_wav)
        if not tmp_prompt.exists():
            return None, "", f"Prompt file not found: {prompt_wav}"

        res_path, infer_logs = infer_zipvoice(str(tmp_prompt), p_text or "", text or "", model_name, int(n_steps))
        if res_path:
            return res_path, infer_logs, "Generation successful"
        else:
            return None, infer_logs, "Generation failed. See logs."

    # Wire events: setup check on page load; generation on button click.
    demo.load(startup_check, outputs=[status_box])
    generate_btn.click(on_generate, inputs=[prompt_audio, prompt_text, text_to_speak, model_choice, num_steps], outputs=[output_audio, logs_out, status_box])


if __name__ == "__main__":
    # Start the Gradio server when this file is executed as a script.
    demo.launch()