"""
Hugging Face Space (Gradio) app for running ZipVoice inference by cloning
https://github.com/k2-fsa/ZipVoice and calling the inference script.
Files: this single file (app.py). Save alongside a requirements.txt in the Space
that contains at least: gradio
Notes:
- This script attempts to `git clone` the ZipVoice repo on first run and install
it (pip). On Spaces this can take time and may require a GPU-enabled runner
(recommended). If your Space already has the ZipVoice code and deps installed,
the startup will be faster.
- The app calls the repo's inference CLI: `python -m zipvoice.bin.infer_zipvoice`.
- You can change `REPO_URL` or the default model names if you have your own HF
checkpoint or local weights.
- This is a practical example; depending on available hardware (CPU-only), you
may need to use the `zipvoice_distill` or quantized models for speed.
Usage in Space:
- Create a new Space (Gradio, Python).
- Add this file as `app.py`.
- Add a small `requirements.txt` with: gradio
- Optionally add a `start.sh` or enable internet to let the app clone and
install the ZipVoice repo on startup.
"""
import os
import subprocess
import shlex
import time
from pathlib import Path

import spaces
import gradio as gr
# On ZeroGPU Spaces, a GPU is attached only while functions decorated with
# @spaces.GPU run. This placeholder is not wired into the UI; it marks the
# Space as GPU-using and can be called to verify GPU access.
@spaces.GPU
def gpu_check():
    return "GPU OK"
# CONFIG - change if needed
REPO_URL = "https://github.com/k2-fsa/ZipVoice.git"
REPO_DIR = Path("/tmp/ZipVoice")
PYTHON_CMD = "python3"
DEFAULT_MODEL = "zipvoice"
DEFAULT_DISTILL_MODEL = "zipvoice_distill"
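
# Optional overrides via environment variables (a convenience sketch; the
# ZIPVOICE_* names below are this app's own convention, not part of ZipVoice).
REPO_URL = os.environ.get("ZIPVOICE_REPO_URL", REPO_URL)
REPO_DIR = Path(os.environ.get("ZIPVOICE_REPO_DIR", str(REPO_DIR)))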
def run_cmd(cmd, cwd=None, env=None, timeout=1800):
"""Run shell command, return (returncode, stdout, stderr)."""
try:
proc = subprocess.run(
shlex.split(cmd),
cwd=cwd,
env=env,
capture_output=True,
text=True,
timeout=timeout,
)
return proc.returncode, proc.stdout, proc.stderr
except subprocess.TimeoutExpired as e:
return -1, "", f"Timeout: {e}"
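
# Quick local smoke test for the helper, e.g.:
#   rc, out, err = run_cmd("git --version")
#   assert rc == 0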
def ensure_zipvoice_installed():
"""Clone repo and install if not present. Returns a tuple (ok, logs)."""
logs = []
if REPO_DIR.exists():
logs.append(f"Found existing repo at {REPO_DIR}")
return True, "\n".join(logs)
logs.append(f"Cloning {REPO_URL} into {REPO_DIR} ...")
code, out, err = run_cmd(f"git clone {REPO_URL} {REPO_DIR}")
logs.append(out)
logs.append(err)
if code != 0:
logs.append("Failed to clone repository.")
return False, "\n".join(logs)
    # Install requirements.txt (if present) and then the package itself in
    # editable mode. This can be heavy (torch etc.); if it fails, the user
    # can preinstall the dependencies in the Space.
req_txt = REPO_DIR / "requirements.txt"
if req_txt.exists():
logs.append("Installing requirements.txt (this may take several minutes)...")
code, out, err = run_cmd(f"{PYTHON_CMD} -m pip install -r {req_txt}")
logs.append(out)
logs.append(err)
if code != 0:
logs.append("requirements install returned non-zero exit code, trying package install...")
# Try to install the package (setup.py or pyproject)
logs.append("Attempting to install ZipVoice package (pip install -e .)")
code, out, err = run_cmd(f"{PYTHON_CMD} -m pip install -e {REPO_DIR}")
logs.append(out)
logs.append(err)
    if code != 0:
        logs.append("Editable install failed; try installing dependencies manually in the Space.")
        # Even if the install failed, the CLI may still run when the
        # requirements are available; return ok=False so the UI can warn
        # the user.
        return False, "\n".join(logs)
logs.append("ZipVoice installed successfully.")
return True, "\n".join(logs)
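
# Optional sanity check (a minimal sketch, not called by the UI): after a
# successful `pip install -e`, the `zipvoice` package should be importable.
def zipvoice_importable() -> bool:
    """Return True if the `zipvoice` package can be found on sys.path."""
    import importlib.util

    return importlib.util.find_spec("zipvoice") is not None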
def infer_zipvoice(prompt_wav_path: str, prompt_text: str, text: str, model_name: str, num_steps: int = 6):
"""Run the ZipVoice inference CLI and return path to generated wav and logs."""
out_dir = Path("/tmp/zipvoice_out")
out_dir.mkdir(parents=True, exist_ok=True)
res_wav = out_dir / f"result_{int(time.time())}.wav"
    # Build the CLI command. Arguments are shlex.quote()d here and
    # shlex.split() again inside run_cmd, so text containing spaces or
    # quotes survives the round trip.
cmd = (
f"{PYTHON_CMD} -m zipvoice.bin.infer_zipvoice"
f" --model-name {shlex.quote(model_name)}"
f" --prompt-wav {shlex.quote(prompt_wav_path)}"
f" --prompt-text {shlex.quote(prompt_text)}"
f" --text {shlex.quote(text)}"
f" --res-wav-path {shlex.quote(str(res_wav))}"
f" --num-steps {int(num_steps)}"
)
rc, out, err = run_cmd(cmd, cwd=str(REPO_DIR), timeout=900)
logs = f"RETURN_CODE={rc}\nSTDOUT:\n{out}\nSTDERR:\n{err}"
if rc == 0 and res_wav.exists():
return str(res_wav), logs
else:
return None, logs
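
# For reference, the command built above is equivalent to running the
# following inside the cloned repo (flags exactly as this app passes them):
#
#   python3 -m zipvoice.bin.infer_zipvoice \
#       --model-name zipvoice \
#       --prompt-wav /path/to/prompt.wav \
#       --prompt-text "transcription of the prompt" \
#       --text "text to synthesize" \
#       --res-wav-path /tmp/zipvoice_out/result_<timestamp>.wav \
#       --num-steps 6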
# Build Gradio UI
with gr.Blocks() as demo:
gr.Markdown("# ZipVoice Hugging Face Space - Quick Runner")
status_box = gr.Textbox(label="Setup / Status logs", lines=8, interactive=False)
with gr.Row():
with gr.Column(scale=1):
prompt_audio = gr.Audio(label="Prompt audio (wav) - short, clean, single speaker", type="filepath")
prompt_text = gr.Textbox(label="Transcription of prompt audio", value="", placeholder="Type the transcription of the prompt wav here")
text_to_speak = gr.Textbox(label="Text to synthesize", value="Hello, this is a test from ZipVoice.")
model_choice = gr.Radio(choices=[DEFAULT_MODEL, DEFAULT_DISTILL_MODEL], value=DEFAULT_MODEL, label="Model")
num_steps = gr.Slider(label="Number of sampling steps (lower = faster)", minimum=1, maximum=16, value=6, step=1)
generate_btn = gr.Button("Generate")
with gr.Column(scale=1):
output_audio = gr.Audio(label="Generated audio (result.wav)")
logs_out = gr.Textbox(label="Inference logs", lines=12, interactive=False)
def startup_check():
ok, logs = ensure_zipvoice_installed()
if not ok:
msg = (
"Warning: automatic install failed.\n"
"Please preinstall model dependencies (torch, soundfile, etc.) or enable internet for this Space.\n"
"Install logs:\n"
)
return msg + logs
return "Setup complete. You can upload prompt audio and run inference.\n" + logs
def on_generate(prompt_wav, p_text, text, model_name, n_steps):
if not prompt_wav:
return None, "", "Please upload a prompt WAV file."
# ensure installed
ok, logs = ensure_zipvoice_installed()
if not ok:
return None, "", "ZipVoice is not installed correctly. See logs:\n" + logs
        # Gradio's filepath Audio component already provides a temporary file
        # path; validate that it exists rather than copying it.
tmp_prompt = Path(prompt_wav)
if not tmp_prompt.exists():
return None, "", f"Prompt file not found: {prompt_wav}"
res_path, infer_logs = infer_zipvoice(str(tmp_prompt), p_text or "", text or "", model_name, int(n_steps))
if res_path:
return res_path, infer_logs, "Generation successful"
else:
return None, infer_logs, "Generation failed. See logs."
# Wire events
demo.load(startup_check, outputs=[status_box])
generate_btn.click(on_generate, inputs=[prompt_audio, prompt_text, text_to_speak, model_choice, num_steps], outputs=[output_audio, logs_out, status_box])
if __name__ == "__main__":
demo.launch()