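"""Extract an F0 (pitch) contour from a wav file.

Three backends are provided: RMVPE, Praat autocorrelation (via parselmouth),
and CREPE (via torchcrepe). All of them operate on 16 kHz audio and return a
pitch sequence at 10 ms resolution, which can be saved to or loaded from a
simple CSV file.
"""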
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import argparse

import librosa
import numpy as np
import parselmouth
import torch
from huggingface_hub import hf_hub_download

from pitch.rmvpe import RMVPE

def compute_f0_rmvpe(path, model_path='rmvpe.pt'):
    # Fetch the pretrained RMVPE weights if they are not available locally
    if not os.path.isfile(model_path):
        model_path = hf_hub_download(
            repo_id="lj1995/VoiceConversionWebUI", filename="rmvpe.pt")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = RMVPE(model_path, is_half=False, device=device)
    x, sr = librosa.load(path, sr=16000)
    assert sr == 16000
    f0 = model.infer_from_audio(x)
    # Adjust to 10 ms resolution by repeating (assuming 20 ms output)
    f0 = np.repeat(f0, 2)
    # Pad so the length matches the other compute_f0_* functions
    lpad = 6
    rpad = 6
    f0 = np.pad(f0, [lpad, rpad], mode='constant')
    return f0

def compute_f0_mouth(path):
    x, sr = librosa.load(path, sr=16000)
    assert sr == 16000
    # Praat drops frames near the signal edges; pad to restore the expected length
    lpad = 1024 // 160
    rpad = lpad
    # Praat autocorrelation pitch at a 160-sample (10 ms) hop
    f0 = parselmouth.Sound(x, sr).to_pitch_ac(
        time_step=160 / sr,
        voicing_threshold=0.5,
        pitch_floor=30,
        pitch_ceiling=1000).selected_array['frequency']
    f0 = np.pad(f0, [[lpad, rpad]], mode='constant')
    return f0


def compute_f0_crepe(filename):
    # torchcrepe is only needed for this backend, so import it locally
    import torchcrepe

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    audio, sr = librosa.load(filename, sr=16000)
    assert sr == 16000
    audio = torch.tensor(np.copy(audio))[None]
    # A tiny amount of noise keeps CREPE stable on perfectly silent frames
    audio = audio + torch.randn_like(audio) * 0.001
    # Here we'll use a 20 millisecond hop length
    hop_length = 320
    fmin = 50
    fmax = 1000
    model = "full"
    batch_size = 512
    pitch = torchcrepe.predict(
        audio,
        sr,
        hop_length,
        fmin,
        fmax,
        model,
        batch_size=batch_size,
        device=device,
        return_periodicity=False,
    )
    # Repeat each frame so the 20 ms hop becomes 10 ms (320 -> 160 * 2)
    pitch = torch.repeat_interleave(pitch, 2, dim=-1)
    pitch = torchcrepe.filter.mean(pitch, 5)
    pitch = pitch.squeeze(0).cpu().numpy()
    return pitch


def save_csv_pitch(pitch, path):
    with open(path, "w", encoding='utf-8') as pitch_file:
        for i in range(len(pitch)):
            # Each pitch frame covers 10 ms of audio
            t = i * 10
            minute = t // 60000
            seconds = (t - minute * 60000) // 1000
            millisecond = t % 1000
            print(
                f"{minute}m {seconds}s {millisecond:3d},{int(pitch[i])}", file=pitch_file)


def load_csv_pitch(path):
    pitch = []
    with open(path, "r", encoding='utf-8') as pitch_file:
        for line in pitch_file.readlines():
            pit = line.strip().split(",")[-1]
            pitch.append(int(pit))
    return pitch


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-w", "--wav", help="wav", dest="wav")
    parser.add_argument("-p", "--pit", help="pit", dest="pit")  # csv for excel
    args = parser.parse_args()
    print(args.wav)
    print(args.pit)

    pitch = compute_f0_mouth(args.wav)
    save_csv_pitch(pitch, args.pit)
    #tmp = load_csv_pitch(args.pit)
    #save_csv_pitch(tmp, "tmp.csv")
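
# Example invocation (a sketch; the wav/csv paths below are placeholders):
#
#   python path/to/this_script.py -w input.wav -p pitch.csv
#
# The other backends can be swapped in above if their dependencies are installed:
#
#   pitch = compute_f0_crepe(args.wav)   # needs torchcrepe
#   pitch = compute_f0_rmvpe(args.wav)   # needs pitch.rmvpe and huggingface_hub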