Spaces:

USF00
/

Text_to_Video_Demo

Configuration error

App Files Files Community

Text_to_Video_Demo / preprocessing /extract_vocals.py

USF00

Initial commit

2b67076 30 days ago

raw

history blame contribute delete

2.28 kB

	from pathlib import Path
	import os, tempfile
	import numpy as np
	import soundfile as sf
	import librosa
	import torch
	import gc

	from audio_separator.separator import Separator

	def get_vocals(src_path: str, dst_path: str, min_seconds: float = 8) -> str:
	    """Extract the vocal stem of an audio file and save it alongside ``dst_path``.

	    If the source audio is shorter than ``min_seconds`` it is first padded
	    with trailing silence into a temporary WAV file, and the separator is run
	    on that file instead of the original.

	    While the function runs, torch's default device is forced to ``'cpu'``;
	    the previous default is restored before returning, whether the call
	    succeeds or fails.

	    Args:
	        src_path: Path of the audio file to process.
	        dst_path: Desired output path. Its parent directory is created if
	            missing, its stem is used as the output name, and its suffix
	            selects the output format ("wav" when there is no suffix).
	        min_seconds: Minimum clip length, in seconds, handed to the separator.

	    Returns:
	        The path of the vocals file as reported by the separator, joined onto
	        ``dst_path``'s parent directory when the reported path is relative.
	    """
	    default_device = torch.get_default_device()
	    torch.set_default_device('cpu')

	    temp_path = None
	    sep = None
	    try:
	        # Everything after the device switch lives inside the try so that any
	        # failure (bad path, unreadable audio, ...) still restores the device.
	        dst = Path(dst_path)
	        dst.parent.mkdir(parents=True, exist_ok=True)

	        # Quick duration check
	        duration = librosa.get_duration(path=src_path)

	        use_path = src_path
	        if duration < min_seconds:
	            # Load at the native sample rate (sr=None: no resampling) and pad
	            # in memory.
	            y, sr = librosa.load(src_path, sr=None, mono=False)
	            if y.ndim == 1:  # ensure shape (channels, samples)
	                y = y[np.newaxis, :]
	            # Round up so the padded clip is never shorter than min_seconds.
	            target_len = int(np.ceil(min_seconds * sr))
	            pad = max(0, target_len - y.shape[1])
	            if pad:
	                y = np.pad(y, ((0, 0), (0, pad)), mode="constant")

	            # Write a temp WAV for the separator
	            fd, temp_path = tempfile.mkstemp(suffix=".wav")
	            os.close(fd)
	            sf.write(temp_path, y.T, sr)  # soundfile expects (frames, channels)
	            use_path = temp_path

	        # Run separation: emit only the vocals, named after dst's stem.
	        # NOTE: model_file_dir is relative to the current working directory.
	        sep = Separator(
	            output_dir=str(dst.parent),
	            output_format=(dst.suffix.lstrip(".") or "wav"),
	            output_single_stem="Vocals",
	            model_file_dir="ckpts/roformer/"  # model_bs_roformer_ep_317_sdr_12.9755.ckpt
	        )
	        sep.load_model()  # no model filename passed: the separator's default is used
	        out_files = sep.separate(use_path, {"Vocals": dst.stem})

	        out = Path(out_files[0])
	        return str(out if out.is_absolute() else (dst.parent / out))
	    finally:
	        try:
	            if temp_path and os.path.exists(temp_path):
	                os.remove(temp_path)
	        finally:
	            # Drop the separator and collect garbage *before* emptying the
	            # CUDA cache; memory still referenced by live objects cannot be
	            # released by empty_cache().
	            sep = None
	            gc.collect()
	            torch.cuda.empty_cache()
	            torch.set_default_device(default_device)

	# Example:
	# final = get_vocals("in/clip.mp3", "out/vocals.wav")
	# print(final)