|
|
|
|
|
import numpy as np |
|
|
import librosa |
|
|
|
|
|
def calculate_msd(pred_audio, target_audio, sr=22050):
    """
    Mel Spectral Distance (MSD) between predicted and target audio.

    Both waveforms are trimmed to their common length along the last (time)
    axis before analysis, so signals of different durations produce mel
    spectrograms with the same number of frames instead of failing on the
    element-wise subtraction.

    Args:
        pred_audio: Predicted waveform samples, time on the last axis.
        target_audio: Reference waveform samples, time on the last axis.
        sr: Sampling rate in Hz handed to the mel-spectrogram computation.

    Returns:
        Mean squared difference between the two dB-scaled mel spectrograms.
    """
    pred_audio = np.asarray(pred_audio)
    target_audio = np.asarray(target_audio)

    # Compare only the overlapping portion; otherwise the frame counts differ
    # and the subtraction below cannot broadcast.
    n_samples = min(pred_audio.shape[-1], target_audio.shape[-1])
    pred_audio = pred_audio[..., :n_samples]
    target_audio = target_audio[..., :n_samples]

    pred_mel = librosa.feature.melspectrogram(y=pred_audio, sr=sr)
    target_mel = librosa.feature.melspectrogram(y=target_audio, sr=sr)

    # Each spectrogram is expressed in dB relative to its own peak
    # (ref=np.max), as in the original implementation.
    pred_db = librosa.power_to_db(pred_mel, ref=np.max)
    target_db = librosa.power_to_db(target_mel, ref=np.max)

    return np.mean((pred_db - target_db) ** 2)
|
|
|
|
|
|
|
|
def calculate_f0_correlation(pred_audio, target_audio, sr=22050):
    """
    Pitch correlation (F0 correlation) between predicted and target.

    F0 tracks are estimated with ``librosa.pyin`` (search range 50-500 Hz),
    trimmed to their common number of frames, and correlated over the frames
    where both tracks hold a non-NaN value.

    Args:
        pred_audio: Predicted waveform samples.
        target_audio: Reference waveform samples.
        sr: Sampling rate in Hz handed to the pitch tracker.

    Returns:
        Pearson correlation coefficient of the two F0 tracks as a ``float``.
        Returns 0.0 when the correlation is undefined: fewer than two frames
        are usable in both tracks, or either track is constant over them.
    """
    f0_pred, _, _ = librosa.pyin(pred_audio, fmin=50, fmax=500, sr=sr)
    f0_target, _, _ = librosa.pyin(target_audio, fmin=50, fmax=500, sr=sr)

    # Inputs of different duration yield different frame counts; keep only
    # the overlapping frames so the mask and indexing below line up.
    n_frames = min(f0_pred.shape[-1], f0_target.shape[-1])
    f0_pred = f0_pred[..., :n_frames]
    f0_target = f0_target[..., :n_frames]

    # Keep frames where both estimates are defined (non-NaN).
    mask = ~np.isnan(f0_pred) & ~np.isnan(f0_target)
    if np.sum(mask) < 2:
        # A correlation needs at least two paired observations.
        return 0.0

    # A constant track has zero variance, which makes the coefficient NaN;
    # silence the resulting warnings and fall back to 0.0 below.
    with np.errstate(divide="ignore", invalid="ignore"):
        corr = np.corrcoef(f0_pred[mask], f0_target[mask])[0, 1]

    if not np.isfinite(corr):
        return 0.0
    return float(corr)
|
|
|
|
|
|
|
|
def calculate_phoneme_accuracy(pred_phonemes, target_phonemes):
    """
    Position-wise phoneme accuracy.

    Predicted and target symbols are compared pairwise, in order; pairing
    stops at the end of the shorter sequence. The number of matching pairs
    is divided by the length of the target sequence.

    Args:
        pred_phonemes: Sequence of predicted phoneme symbols.
        target_phonemes: Sequence of reference phoneme symbols.

    Returns:
        Fraction of target positions matched by the prediction, or 0.0 when
        the target sequence is empty.
    """
    total = len(target_phonemes)
    if total == 0:
        return 0.0

    hits = 0
    for predicted, expected in zip(pred_phonemes, target_phonemes):
        hits += predicted == expected

    return hits / total
|
|
|
|
|
|
|
|
def calculate_spectral_convergence(pred_audio, target_audio, sr=22050):
    """
    Spectral convergence: how close the predicted spectrum is to the target.

    Computed as ``||target - pred||_F / (||target||_F + 1e-8)`` over the STFT
    magnitude spectrograms. Both waveforms are trimmed to their common length
    along the last (time) axis first, so inputs of different durations yield
    spectrograms of the same shape instead of failing on the subtraction.

    Args:
        pred_audio: Predicted waveform samples, time on the last axis.
        target_audio: Reference waveform samples, time on the last axis.
        sr: Accepted for signature parity with the other metrics; it is not
            used by the computation.

    Returns:
        Relative Frobenius-norm error of the predicted magnitude spectrogram;
        0 means the magnitudes are identical.
    """
    pred_audio = np.asarray(pred_audio)
    target_audio = np.asarray(target_audio)

    # Compare only the overlapping portion so both STFTs have equal shape.
    n_samples = min(pred_audio.shape[-1], target_audio.shape[-1])
    pred_spec = np.abs(librosa.stft(pred_audio[..., :n_samples]))
    target_spec = np.abs(librosa.stft(target_audio[..., :n_samples]))

    numerator = np.linalg.norm(target_spec - pred_spec, 'fro')
    denominator = np.linalg.norm(target_spec, 'fro')

    # The small epsilon keeps the ratio finite for an all-zero target.
    return numerator / (denominator + 1e-8)
|
|
|
|
|
|