Spaces:
Sleeping
Sleeping
File size: 3,071 Bytes
3aa4060 b2b82a4 bd0143d b2b82a4 3aa4060 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
import sys,os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import librosa
import argparse
import numpy as np
import parselmouth
from pitch.rmvpe import RMVPE
import torch
import numpy as np
def compute_f0_rmvpe(path, model_path='rmvpe.pt'):
    """Estimate the F0 contour of an audio file with the RMVPE model.

    Args:
        path: Audio file to analyse; it is loaded through librosa at 16 kHz.
        model_path: Location of the RMVPE checkpoint. When no file exists at
            this path, ``rmvpe.pt`` is fetched from the Hugging Face Hub and
            the downloaded copy is loaded instead.

    Returns:
        The model's F0 output with every value repeated twice and 6 zero
        values padded on each side.
    """
    if not os.path.isfile(model_path):
        # Imported lazily so the other extractors in this module keep working
        # when huggingface_hub is not installed.
        from huggingface_hub import hf_hub_download

        # hf_hub_download returns the path of the cached file; it does not
        # write to `model_path`, so the returned path is what must be loaded.
        model_path = hf_hub_download(
            repo_id="lj1995/VoiceConversion" + "Web" + "UI",
            filename="rmvpe.pt",
        )

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = RMVPE(model_path, is_half=False, device=device)

    x, sr = librosa.load(path, sr=16000)
    assert sr == 16000
    f0 = model.infer_from_audio(x)

    # Repeat each value to reach a 10 ms resolution.
    # NOTE(review): this assumes the model emits one value per 20 ms; confirm
    # against pitch.rmvpe, otherwise the contour is stretched to twice its length.
    f0 = np.repeat(f0, 2)

    # Same 6-frame zero padding on both sides as compute_f0_mouth (1024 // 160).
    lpad = 6
    rpad = 6
    return np.pad(f0, (lpad, rpad), mode='constant')
def compute_f0_mouth(path):
    """Estimate the F0 contour of an audio file with parselmouth (Praat).

    The audio is loaded at 16 kHz and analysed with the autocorrelation pitch
    tracker using a 160-sample (10 ms) time step, a 30-1000 Hz search range
    and a voicing threshold of 0.5.

    Args:
        path: Audio file to analyse.

    Returns:
        The 'frequency' field of the selected pitch candidates, with
        1024 // 160 = 6 zero values padded on each side.
    """
    waveform, sr = librosa.load(path, sr=16000)
    assert sr == 16000

    edge_pad = 1024 // 160
    sound = parselmouth.Sound(waveform, sr)
    pitch_track = sound.to_pitch_ac(
        time_step=160 / sr,
        voicing_threshold=0.5,
        pitch_floor=30,
        pitch_ceiling=1000,
    )
    contour = pitch_track.selected_array['frequency']
    return np.pad(contour, [[edge_pad, edge_pad]], mode='constant')
def compute_f0_crepe(filename):
    """Estimate the F0 contour of an audio file with torchcrepe.

    The audio is loaded at 16 kHz, low-amplitude Gaussian noise is added, and
    the "full" CREPE model predicts pitch with a 320-sample (20 ms) hop. Each
    prediction is then repeated twice so that one value covers 160 samples
    (10 ms), and the result is smoothed with a 5-frame mean filter.

    Because of the random noise, repeated calls on the same file are not
    guaranteed to return identical values.

    Args:
        filename: Audio file to analyse.

    Returns:
        A torch tensor holding the smoothed pitch, with the leading batch
        dimension removed.
    """
    # Imported lazily so torchcrepe is only required when this extractor is used.
    import torchcrepe

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    audio, sr = librosa.load(filename, sr=16000)
    assert sr == 16000

    # Add a leading batch dimension: (samples,) -> (1, samples).
    audio = torch.tensor(np.copy(audio))[None]
    audio = audio + torch.randn_like(audio) * 0.001

    # Here we'll use a 20 millisecond hop length
    hop_length = 320
    fmin = 50
    fmax = 1000
    model = "full"
    batch_size = 512
    pitch = torchcrepe.predict(
        audio,
        sr,
        hop_length,
        fmin,
        fmax,
        model,
        batch_size=batch_size,
        device=device,
        return_periodicity=False,
    )

    # 320 -> 160 * 2. Done with torch rather than np.repeat: NumPy only handles
    # a tensor here through its array-wrapping fallback, which breaks for
    # tensors that do not live on the CPU.
    pitch = torch.repeat_interleave(pitch, 2, dim=-1)
    pitch = torchcrepe.filter.mean(pitch, 5)
    return pitch.squeeze(0)
def save_csv_pitch(pitch, path):
    """Write a pitch contour to a text file, one frame per line.

    Frame ``i`` is stamped with the time ``i * 10`` ms, and each line has the
    form ``"<minute>m <seconds>s <millisecond>,<pitch>"``: the millisecond
    field is right-aligned to width 3 and the pitch is truncated to an int.

    Args:
        pitch: Sequence of per-frame pitch values.
        path: Destination file; it is overwritten and encoded as UTF-8.
    """
    with open(path, "w", encoding='utf-8') as out:
        for frame, value in enumerate(pitch):
            elapsed_ms = frame * 10
            minute, within_minute = divmod(elapsed_ms, 60000)
            seconds = within_minute // 1000
            millisecond = elapsed_ms % 1000
            out.write(
                f"{minute}m {seconds}s {millisecond:3d},{int(value)}\n")
def load_csv_pitch(path):
    """Read back the pitch values from a file written by save_csv_pitch.

    Only the text after the last comma of each line is used; it is parsed
    as an integer.

    Args:
        path: UTF-8 text file with one ``"<time>,<pitch>"`` entry per line.

    Returns:
        A list of integer pitch values in file order.
    """
    with open(path, "r", encoding='utf-8') as pitch_file:
        return [int(row.strip().rsplit(",", 1)[-1]) for row in pitch_file]
if __name__ == "__main__":
    # Command-line entry point: extract F0 from a wav file with the
    # parselmouth extractor and write it out as a CSV-style text file.
    parser = argparse.ArgumentParser()
    # Both paths are required: without them None would be handed to the audio
    # loader / open() and fail with an error unrelated to the actual mistake.
    parser.add_argument("-w", "--wav", help="wav", dest="wav", required=True)
    parser.add_argument("-p", "--pit", help="pit", dest="pit", required=True)  # csv for excel
    args = parser.parse_args()
    print(args.wav)
    print(args.pit)

    pitch = compute_f0_mouth(args.wav)
    save_csv_pitch(pitch, args.pit)
|