# utils/metrics.py
import numpy as np
import librosa


def _trim_to_common_frames(first, second):
    """Trim two arrays along their last axis to the shorter frame count.

    Frame-based features computed from audio clips of different durations
    have different lengths on the time (last) axis; element-wise comparison
    requires them to match.
    """
    n_frames = min(first.shape[-1], second.shape[-1])
    return first[..., :n_frames], second[..., :n_frames]


def calculate_msd(pred_audio, target_audio, sr=22050):
    """Return the Mel Spectral Distance (MSD) between two audio signals.

    Both signals are converted to mel-spectrograms (librosa defaults), then
    to decibels, and the mean squared difference of the dB values is
    returned. Lower is better; identical inputs give 0.0.

    Args:
        pred_audio: Predicted waveform (1-D float array of samples).
        target_audio: Reference waveform (1-D float array of samples).
        sr: Sampling rate in Hz shared by both signals.

    Returns:
        Mean squared dB difference as a float. If the signals differ in
        duration, only the frames both have in common are compared.
    """
    # Convert to mel-spectrogram
    pred_mel = librosa.feature.melspectrogram(y=pred_audio, sr=sr)
    target_mel = librosa.feature.melspectrogram(y=target_audio, sr=sr)

    # Convert to dB. Each spectrogram is referenced to its own peak
    # (ref=np.max), so the metric compares spectral shape rather than
    # absolute loudness.
    pred_db = librosa.power_to_db(pred_mel, ref=np.max)
    target_db = librosa.power_to_db(target_mel, ref=np.max)

    # Clips of different lengths yield different frame counts; without
    # alignment the subtraction below raises a broadcasting error.
    pred_db, target_db = _trim_to_common_frames(pred_db, target_db)

    # Mean squared difference
    return float(np.mean((pred_db - target_db) ** 2))


def calculate_f0_correlation(pred_audio, target_audio, sr=22050):
    """Return the Pearson correlation of the F0 (pitch) contours.

    F0 is estimated with ``librosa.pyin`` searching between 50 Hz and
    500 Hz. Only frames where both signals are voiced (F0 is not NaN)
    take part in the correlation.

    Args:
        pred_audio: Predicted waveform (1-D float array of samples).
        target_audio: Reference waveform (1-D float array of samples).
        sr: Sampling rate in Hz shared by both signals.

    Returns:
        Correlation coefficient as a float in [-1, 1]. Returns 0.0 when the
        correlation is undefined: fewer than two jointly voiced frames, or
        a constant F0 contour in either signal.
    """
    f0_pred, _, _ = librosa.pyin(pred_audio, fmin=50, fmax=500, sr=sr)
    f0_target, _, _ = librosa.pyin(target_audio, fmin=50, fmax=500, sr=sr)

    # Align contour lengths so the joint mask below is well-defined even
    # when the two clips differ in duration.
    f0_pred, f0_target = _trim_to_common_frames(f0_pred, f0_target)

    # Remove NaNs (pyin marks unvoiced frames with NaN)
    voiced = ~np.isnan(f0_pred) & ~np.isnan(f0_target)
    pred_voiced = f0_pred[voiced]
    target_voiced = f0_target[voiced]

    # Pearson correlation needs at least two points...
    if pred_voiced.size < 2:
        return 0.0
    # ...and non-zero variance on both sides; otherwise np.corrcoef
    # divides by zero and returns NaN.
    if pred_voiced.max() == pred_voiced.min():
        return 0.0
    if target_voiced.max() == target_voiced.min():
        return 0.0

    return float(np.corrcoef(pred_voiced, target_voiced)[0, 1])


def calculate_phoneme_accuracy(pred_phonemes, target_phonemes):
    """Return the fraction of target phonemes predicted correctly.

    Sequences are compared position by position (no alignment or edit
    distance is performed). The denominator is the target length, so
    missing predictions count as errors and predictions beyond the end of
    the target are ignored.

    Args:
        pred_phonemes: Sequence of predicted phoneme symbols.
        target_phonemes: Sequence of reference phoneme symbols.

    Returns:
        Accuracy in [0, 1]; 0.0 when ``target_phonemes`` is empty.
    """
    n_target = len(target_phonemes)
    if n_target == 0:
        return 0.0
    correct = sum(p == t for p, t in zip(pred_phonemes, target_phonemes))
    return correct / n_target


def calculate_spectral_convergence(pred_audio, target_audio, sr=22050):
    """Return the spectral convergence between predicted and target audio.

    Computed as ``||T - P||_F / ||T||_F`` where ``T`` and ``P`` are the
    STFT magnitudes of the target and the prediction. Lower is better;
    identical inputs give 0.0.

    Args:
        pred_audio: Predicted waveform (1-D float array of samples).
        target_audio: Reference waveform (1-D float array of samples).
        sr: Unused by the computation; accepted so the signature matches
            the other audio metrics in this module.

    Returns:
        Spectral convergence as a float. If the signals differ in duration,
        only the frames both have in common are compared.
    """
    pred_spec = np.abs(librosa.stft(pred_audio))
    target_spec = np.abs(librosa.stft(target_audio))

    # Different clip lengths give different frame counts; align before
    # subtracting to avoid a shape-mismatch error.
    pred_spec, target_spec = _trim_to_common_frames(pred_spec, target_spec)

    numerator = np.linalg.norm(target_spec - pred_spec, 'fro')
    denominator = np.linalg.norm(target_spec, 'fro')
    # Small epsilon guards against division by zero for a silent target.
    return float(numerator / (denominator + 1e-8))