Spaces:
Running
Running
| import warnings | |
| import librosa | |
| import numpy as np | |
| import resampy | |
| import torch | |
| import crepe | |
| ############################################################################### | |
| # Constants | |
| ############################################################################### | |
| # Minimum decibel level | |
| MIN_DB = -100. | |
| # Reference decibel level | |
| REF_DB = 20. | |
| ############################################################################### | |
| # A-weighted loudness | |
| ############################################################################### | |
| def a_weighted(audio, sample_rate, hop_length=None, pad=True): | |
| """Retrieve the per-frame loudness""" | |
| # Save device | |
| device = audio.device | |
| # Default hop length of 10 ms | |
| hop_length = sample_rate // 100 if hop_length is None else hop_length | |
| # Convert to numpy | |
| audio = audio.detach().cpu().numpy().squeeze(0) | |
| # Resample | |
| if sample_rate != crepe.SAMPLE_RATE: | |
| audio = resampy.resample(audio, sample_rate, crepe.SAMPLE_RATE) | |
| hop_length = int(hop_length * crepe.SAMPLE_RATE / sample_rate) | |
| # Cache weights | |
| if not hasattr(a_weighted, 'weights'): | |
| a_weighted.weights = perceptual_weights() | |
| # Take stft | |
| stft = librosa.stft(audio, | |
| n_fft=crepe.WINDOW_SIZE, | |
| hop_length=hop_length, | |
| win_length=crepe.WINDOW_SIZE, | |
| center=pad, | |
| pad_mode='constant') | |
| # Compute magnitude on db scale | |
| db = librosa.amplitude_to_db(np.abs(stft)) | |
| # Apply A-weighting | |
| weighted = db + a_weighted.weights | |
| # Threshold | |
| weighted[weighted < MIN_DB] = MIN_DB | |
| # Average over weighted frequencies | |
| return torch.from_numpy(weighted.mean(axis=0)).float().to(device)[None] | |
| def perceptual_weights(): | |
| """A-weighted frequency-dependent perceptual loudness weights""" | |
| frequencies = librosa.fft_frequencies(sr=crepe.SAMPLE_RATE, | |
| n_fft=crepe.WINDOW_SIZE) | |
| # A warning is raised for nearly inaudible frequencies, but it ends up | |
| # defaulting to -100 db. That default is fine for our purposes. | |
| with warnings.catch_warnings(): | |
| warnings.simplefilter('ignore', RuntimeWarning) | |
| return librosa.A_weighting(frequencies)[:, None] - REF_DB | |