Spaces:
Running
Running
Update src/vc_infer_pipeline.py
Browse files- src/vc_infer_pipeline.py +32 -39
src/vc_infer_pipeline.py
CHANGED
|
@@ -1,13 +1,14 @@
|
|
|
|
|
| 1 |
import numpy as np, parselmouth, torch, pdb, sys, os
|
| 2 |
from time import time as ttime
|
| 3 |
import torch.nn.functional as F
|
| 4 |
import torchcrepe
|
| 5 |
-
from torch import Tensor
|
| 6 |
-
import scipy.signal as signal
|
| 7 |
-
import pyworld, os, traceback, faiss, librosa, torchcrepe
|
| 8 |
from scipy import signal
|
| 9 |
-
from
|
| 10 |
-
import
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 13 |
now_dir = os.path.join(BASE_DIR, 'src')
|
|
@@ -36,19 +37,20 @@ def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
|
|
| 36 |
|
| 37 |
|
| 38 |
def change_rms(data1, sr1, data2, sr2, rate):
|
| 39 |
-
rms1 = librosa.feature.rms(
|
| 40 |
-
y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2
|
| 41 |
-
)
|
| 42 |
rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
|
|
|
|
| 43 |
rms1 = torch.from_numpy(rms1)
|
| 44 |
rms1 = F.interpolate(
|
| 45 |
rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
|
| 46 |
).squeeze()
|
|
|
|
| 47 |
rms2 = torch.from_numpy(rms2)
|
| 48 |
rms2 = F.interpolate(
|
| 49 |
rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
|
| 50 |
).squeeze()
|
| 51 |
rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
|
|
|
|
| 52 |
data2 *= (
|
| 53 |
torch.pow(rms1, torch.tensor(1 - rate))
|
| 54 |
* torch.pow(rms2, torch.tensor(rate - 1))
|
|
@@ -78,9 +80,7 @@ class VC(object):
|
|
| 78 |
|
| 79 |
def get_optimal_torch_device(self, index: int = 0) -> torch.device:
|
| 80 |
if torch.cuda.is_available():
|
| 81 |
-
return torch.device(
|
| 82 |
-
f"cuda:{index % torch.cuda.device_count()}"
|
| 83 |
-
)
|
| 84 |
elif torch.backends.mps.is_available():
|
| 85 |
return torch.device("mps")
|
| 86 |
return torch.device("cpu")
|
|
@@ -94,9 +94,7 @@ class VC(object):
|
|
| 94 |
hop_length=160,
|
| 95 |
model="full",
|
| 96 |
):
|
| 97 |
-
x = x.astype(
|
| 98 |
-
np.float32
|
| 99 |
-
)
|
| 100 |
x /= np.quantile(np.abs(x), 0.999)
|
| 101 |
torch_device = self.get_optimal_torch_device()
|
| 102 |
audio = torch.from_numpy(x).to(torch_device, copy=True)
|
|
@@ -152,12 +150,6 @@ class VC(object):
|
|
| 152 |
f0 = f0[0].cpu().numpy()
|
| 153 |
return f0
|
| 154 |
|
| 155 |
-
def get_f0_pyin_computation(self, x, f0_min, f0_max):
|
| 156 |
-
y, sr = librosa.load("saudio/Sidney.wav", self.sr, mono=True)
|
| 157 |
-
f0, _, _ = librosa.pyin(y, sr=self.sr, fmin=f0_min, fmax=f0_max)
|
| 158 |
-
f0 = f0[1:]
|
| 159 |
-
return f0
|
| 160 |
-
|
| 161 |
def get_f0_hybrid_computation(
|
| 162 |
self,
|
| 163 |
methods_str,
|
|
@@ -180,8 +172,9 @@ class VC(object):
|
|
| 180 |
for method in methods:
|
| 181 |
f0 = None
|
| 182 |
if method == "crepe":
|
| 183 |
-
f0 = self.
|
| 184 |
-
|
|
|
|
| 185 |
elif method == "mangio-crepe":
|
| 186 |
f0 = self.get_f0_crepe_computation(
|
| 187 |
x, f0_min, f0_max, p_len, crepe_hop_length
|
|
@@ -228,11 +221,13 @@ class VC(object):
|
|
| 228 |
filter_radius,
|
| 229 |
crepe_hop_length,
|
| 230 |
inp_f0=None,
|
|
|
|
|
|
|
| 231 |
):
|
| 232 |
global input_audio_path2wav
|
| 233 |
time_step = self.window / self.sr * 1000
|
| 234 |
-
f0_min = 50
|
| 235 |
-
f0_max = 1100
|
| 236 |
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
|
| 237 |
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
|
| 238 |
if f0_method == "pm":
|
|
@@ -248,9 +243,7 @@ class VC(object):
|
|
| 248 |
)
|
| 249 |
pad_size = (p_len - len(f0) + 1) // 2
|
| 250 |
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
|
| 251 |
-
f0 = np.pad(
|
| 252 |
-
f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
|
| 253 |
-
)
|
| 254 |
|
| 255 |
elif f0_method == "harvest":
|
| 256 |
input_audio_path2wav[input_audio_path] = x.astype(np.double)
|
|
@@ -268,10 +261,10 @@ class VC(object):
|
|
| 268 |
)
|
| 269 |
f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
|
| 270 |
f0 = signal.medfilt(f0, 3)
|
| 271 |
-
|
| 272 |
-
elif f0_method == "crepe":
|
| 273 |
-
f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max)
|
| 274 |
|
|
|
|
|
|
|
|
|
|
| 275 |
elif f0_method == "mangio-crepe":
|
| 276 |
f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length)
|
| 277 |
|
|
@@ -476,17 +469,15 @@ class VC(object):
|
|
| 476 |
protect,
|
| 477 |
crepe_hop_length,
|
| 478 |
f0_file=None,
|
|
|
|
|
|
|
| 479 |
):
|
| 480 |
-
if (
|
| 481 |
-
file_index != ""
|
| 482 |
-
and os.path.exists(file_index) == True
|
| 483 |
-
and index_rate != 0
|
| 484 |
-
):
|
| 485 |
try:
|
| 486 |
index = faiss.read_index(file_index)
|
| 487 |
big_npy = index.reconstruct_n(0, index.ntotal)
|
| 488 |
-
except:
|
| 489 |
-
|
| 490 |
index = big_npy = None
|
| 491 |
else:
|
| 492 |
index = big_npy = None
|
|
@@ -521,8 +512,8 @@ class VC(object):
|
|
| 521 |
for line in lines:
|
| 522 |
inp_f0.append([float(i) for i in line.split(",")])
|
| 523 |
inp_f0 = np.array(inp_f0, dtype="float32")
|
| 524 |
-
except:
|
| 525 |
-
|
| 526 |
sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
|
| 527 |
pitch, pitchf = None, None
|
| 528 |
if if_f0 == 1:
|
|
@@ -535,6 +526,8 @@ class VC(object):
|
|
| 535 |
filter_radius,
|
| 536 |
crepe_hop_length,
|
| 537 |
inp_f0,
|
|
|
|
|
|
|
| 538 |
)
|
| 539 |
pitch = pitch[:p_len]
|
| 540 |
pitchf = pitchf[:p_len]
|
|
|
|
| 1 |
+
from functools import lru_cache
|
| 2 |
import numpy as np, parselmouth, torch, pdb, sys, os
|
| 3 |
from time import time as ttime
|
| 4 |
import torch.nn.functional as F
|
| 5 |
import torchcrepe
|
|
|
|
|
|
|
|
|
|
| 6 |
from scipy import signal
|
| 7 |
+
from torch import Tensor
|
| 8 |
+
import pyworld, os, faiss, librosa, torchcrepe
|
| 9 |
+
import random
|
| 10 |
+
import gc
|
| 11 |
+
import re
|
| 12 |
|
| 13 |
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 14 |
now_dir = os.path.join(BASE_DIR, 'src')
|
|
|
|
| 37 |
|
| 38 |
|
| 39 |
def change_rms(data1, sr1, data2, sr2, rate):
|
| 40 |
+
rms1 = librosa.feature.rms(y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2)
|
|
|
|
|
|
|
| 41 |
rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
|
| 42 |
+
|
| 43 |
rms1 = torch.from_numpy(rms1)
|
| 44 |
rms1 = F.interpolate(
|
| 45 |
rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
|
| 46 |
).squeeze()
|
| 47 |
+
|
| 48 |
rms2 = torch.from_numpy(rms2)
|
| 49 |
rms2 = F.interpolate(
|
| 50 |
rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
|
| 51 |
).squeeze()
|
| 52 |
rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
|
| 53 |
+
|
| 54 |
data2 *= (
|
| 55 |
torch.pow(rms1, torch.tensor(1 - rate))
|
| 56 |
* torch.pow(rms2, torch.tensor(rate - 1))
|
|
|
|
| 80 |
|
| 81 |
def get_optimal_torch_device(self, index: int = 0) -> torch.device:
    """Pick the torch device to run on: CUDA first, then MPS, then CPU.

    Args:
        index: Requested CUDA device ordinal. It is wrapped modulo the
            number of visible CUDA devices, so an out-of-range value still
            maps onto an existing device. Ignored for MPS and CPU.

    Returns:
        ``torch.device("cuda:<n>")`` when CUDA is available, otherwise
        ``torch.device("mps")`` when the MPS backend is available,
        otherwise ``torch.device("cpu")``.
    """
    if torch.cuda.is_available():
        return torch.device(f"cuda:{index % torch.cuda.device_count()}")

    # torch.backends.mps does not exist on older PyTorch builds; look it up
    # defensively so those builds fall through to CPU instead of raising
    # AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")

    return torch.device("cpu")
|
|
|
|
| 94 |
hop_length=160,
|
| 95 |
model="full",
|
| 96 |
):
|
| 97 |
+
x = x.astype(np.float32)
|
|
|
|
|
|
|
| 98 |
x /= np.quantile(np.abs(x), 0.999)
|
| 99 |
torch_device = self.get_optimal_torch_device()
|
| 100 |
audio = torch.from_numpy(x).to(torch_device, copy=True)
|
|
|
|
| 150 |
f0 = f0[0].cpu().numpy()
|
| 151 |
return f0
|
| 152 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
def get_f0_hybrid_computation(
|
| 154 |
self,
|
| 155 |
methods_str,
|
|
|
|
| 172 |
for method in methods:
|
| 173 |
f0 = None
|
| 174 |
if method == "crepe":
|
| 175 |
+
f0 = self.get_f0_crepe_computation(
|
| 176 |
+
x, f0_min, f0_max, p_len
|
| 177 |
+
)
|
| 178 |
elif method == "mangio-crepe":
|
| 179 |
f0 = self.get_f0_crepe_computation(
|
| 180 |
x, f0_min, f0_max, p_len, crepe_hop_length
|
|
|
|
| 221 |
filter_radius,
|
| 222 |
crepe_hop_length,
|
| 223 |
inp_f0=None,
|
| 224 |
+
f0_min=50,
|
| 225 |
+
f0_max=1100,
|
| 226 |
):
|
| 227 |
global input_audio_path2wav
|
| 228 |
time_step = self.window / self.sr * 1000
|
| 229 |
+
#f0_min = 50
|
| 230 |
+
#f0_max = 1100
|
| 231 |
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
|
| 232 |
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
|
| 233 |
if f0_method == "pm":
|
|
|
|
| 243 |
)
|
| 244 |
pad_size = (p_len - len(f0) + 1) // 2
|
| 245 |
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
|
| 246 |
+
f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
|
|
|
|
|
|
|
| 247 |
|
| 248 |
elif f0_method == "harvest":
|
| 249 |
input_audio_path2wav[input_audio_path] = x.astype(np.double)
|
|
|
|
| 261 |
)
|
| 262 |
f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
|
| 263 |
f0 = signal.medfilt(f0, 3)
|
|
|
|
|
|
|
|
|
|
| 264 |
|
| 265 |
+
elif f0_method == "crepe":
|
| 266 |
+
f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len)
|
| 267 |
+
|
| 268 |
elif f0_method == "mangio-crepe":
|
| 269 |
f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length)
|
| 270 |
|
|
|
|
| 469 |
protect,
|
| 470 |
crepe_hop_length,
|
| 471 |
f0_file=None,
|
| 472 |
+
f0_min=50,
|
| 473 |
+
f0_max=1100,
|
| 474 |
):
|
| 475 |
+
if file_index != "" and os.path.exists(file_index) == True and index_rate != 0:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 476 |
try:
|
| 477 |
index = faiss.read_index(file_index)
|
| 478 |
big_npy = index.reconstruct_n(0, index.ntotal)
|
| 479 |
+
except Exception as error:
|
| 480 |
+
print(error)
|
| 481 |
index = big_npy = None
|
| 482 |
else:
|
| 483 |
index = big_npy = None
|
|
|
|
| 512 |
for line in lines:
|
| 513 |
inp_f0.append([float(i) for i in line.split(",")])
|
| 514 |
inp_f0 = np.array(inp_f0, dtype="float32")
|
| 515 |
+
except Exception as error:
|
| 516 |
+
print(error)
|
| 517 |
sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
|
| 518 |
pitch, pitchf = None, None
|
| 519 |
if if_f0 == 1:
|
|
|
|
| 526 |
filter_radius,
|
| 527 |
crepe_hop_length,
|
| 528 |
inp_f0,
|
| 529 |
+
f0_min,
|
| 530 |
+
f0_max,
|
| 531 |
)
|
| 532 |
pitch = pitch[:p_len]
|
| 533 |
pitchf = pitchf[:p_len]
|