import warnings

import numpy as np
import resampy
import torch
import tqdm

import crepe


__all__ = ['CENTS_PER_BIN',
           'MAX_FMAX',
           'PITCH_BINS',
           'SAMPLE_RATE',
           'WINDOW_SIZE',
           'UNVOICED',
           'embed',
           'embed_from_file',
           'embed_from_file_to_file',
           'embed_from_files_to_files',
           'infer',
           'predict',
           'predict_from_file',
           'predict_from_file_to_file',
           'predict_from_files_to_files',
           'preprocess',
           'postprocess',
           'resample']


###############################################################################
# Constants
###############################################################################


CENTS_PER_BIN = 20  # cents
MAX_FMAX = 2006.  # hz
PITCH_BINS = 360
SAMPLE_RATE = 16000  # hz
WINDOW_SIZE = 1024  # samples
UNVOICED = np.nan


###############################################################################
# Crepe pitch prediction
###############################################################################


def predict(audio,
            sample_rate,
            hop_length=None,
            fmin=50.,
            fmax=MAX_FMAX,
            model='full',
            decoder=crepe.decode.viterbi,
            return_harmonicity=False,
            return_periodicity=False,
            batch_size=None,
            device='cpu',
            pad=True):
    """Performs pitch estimation

    Arguments
        audio (torch.tensor [shape=(1, time)])
            The audio signal
        sample_rate (int)
            The sampling rate in Hz
        hop_length (int)
            The hop_length in samples
        fmin (float)
            The minimum allowable frequency in Hz
        fmax (float)
            The maximum allowable frequency in Hz
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        decoder (function)
            The decoder to use. See decode.py for decoders.
        return_harmonicity (bool) [DEPRECATED]
            Whether to also return the network confidence
        return_periodicity (bool)
            Whether to also return the network confidence
        batch_size (int)
            The number of frames per batch
        device (string)
            The device used to run inference
        pad (bool)
            Whether to zero-pad the audio

    Returns
        pitch (torch.tensor [shape=(1, 1 + int(time // hop_length))])
        (Optional) periodicity (torch.tensor
            [shape=(1, 1 + int(time // hop_length))])
    """
    # Deprecate return_harmonicity
    if return_harmonicity:
        message = (
            'The crepe return_harmonicity argument is deprecated and '
            'will be removed in a future release. Please use '
            'return_periodicity. Rationale: if network confidence measured '
            'harmonics, the value would be low for non-harmonic, periodic '
            'sounds (e.g., sine waves). But this is not observed.')
        warnings.warn(message, DeprecationWarning)
        return_periodicity = return_harmonicity

    results = []

    # Postprocessing breaks gradients, so just don't compute them
    with torch.no_grad():

        # Preprocess audio
        generator = preprocess(audio,
                               sample_rate,
                               hop_length,
                               batch_size,
                               device,
                               pad)
        for frames in generator:

            # Infer independent probabilities for each pitch bin
            probabilities = infer(frames, model)

            # shape=(batch, 360, time / hop_length)
            probabilities = probabilities.reshape(
                audio.size(0), -1, PITCH_BINS).transpose(1, 2)

            # Convert probabilities to F0 and periodicity
            result = postprocess(probabilities,
                                 fmin,
                                 fmax,
                                 decoder,
                                 return_harmonicity,
                                 return_periodicity)

            # Place on same device as audio to allow very long inputs
            if isinstance(result, tuple):
                result = (result[0].to(audio.device),
                          result[1].to(audio.device))
            else:
                result = result.to(audio.device)

            results.append(result)

    # Split pitch and periodicity
    if return_periodicity:
        pitch, periodicity = zip(*results)
        return torch.cat(pitch, 1), torch.cat(periodicity, 1)

    # Concatenate
    return torch.cat(results, 1)
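

# A minimal usage sketch for predict() (the wav path is hypothetical; audio is
# assumed to be a mono tensor of shape (1, time) as returned by
# crepe.load.audio, and the argument values are illustrative):
#
#     audio, sample_rate = crepe.load.audio('speech.wav')
#     device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
#     pitch, periodicity = predict(audio,
#                                  sample_rate,
#                                  hop_length=sample_rate // 100,
#                                  fmin=50.,
#                                  fmax=550.,
#                                  return_periodicity=True,
#                                  batch_size=2048,
#                                  device=device)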


def predict_from_file(audio_file,
                      hop_length=None,
                      fmin=50.,
                      fmax=MAX_FMAX,
                      model='full',
                      decoder=crepe.decode.viterbi,
                      return_harmonicity=False,
                      return_periodicity=False,
                      batch_size=None,
                      device='cpu',
                      pad=True):
    """Performs pitch estimation from file on disk

    Arguments
        audio_file (string)
            The file to perform pitch tracking on
        hop_length (int)
            The hop_length in samples
        fmin (float)
            The minimum allowable frequency in Hz
        fmax (float)
            The maximum allowable frequency in Hz
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        decoder (function)
            The decoder to use. See decode.py for decoders.
        return_harmonicity (bool) [DEPRECATED]
            Whether to also return the network confidence
        return_periodicity (bool)
            Whether to also return the network confidence
        batch_size (int)
            The number of frames per batch
        device (string)
            The device used to run inference
        pad (bool)
            Whether to zero-pad the audio

    Returns
        pitch (torch.tensor [shape=(1, 1 + int(time // hop_length))])
        (Optional) periodicity (torch.tensor
            [shape=(1, 1 + int(time // hop_length))])
    """
    # Load audio
    audio, sample_rate = crepe.load.audio(audio_file)

    # Predict
    return predict(audio,
                   sample_rate,
                   hop_length,
                   fmin,
                   fmax,
                   model,
                   decoder,
                   return_harmonicity,
                   return_periodicity,
                   batch_size,
                   device,
                   pad)


def predict_from_file_to_file(audio_file,
                              output_pitch_file,
                              output_harmonicity_file=None,
                              output_periodicity_file=None,
                              hop_length=None,
                              fmin=50.,
                              fmax=MAX_FMAX,
                              model='full',
                              decoder=crepe.decode.viterbi,
                              batch_size=None,
                              device='cpu',
                              pad=True):
| """Performs pitch estimation from file on disk | |
| Arguments | |
| audio_file (string) | |
| The file to perform pitch tracking on | |
| output_pitch_file (string) | |
| The file to save predicted pitch | |
| output_harmonicity_file (string or None) [DEPRECATED] | |
| The file to save predicted harmonicity | |
| output_periodicity_file (string or None) | |
| The file to save predicted periodicity | |
| hop_length (int) | |
| The hop_length in samples | |
| fmin (float) | |
| The minimum allowable frequency in Hz | |
| fmax (float) | |
| The maximum allowable frequency in Hz | |
| model (string) | |
| The model capacity. One of 'full' or 'tiny'. | |
| decoder (function) | |
| The decoder to use. See decode.py for decoders. | |
| batch_size (int) | |
| The number of frames per batch | |
| device (string) | |
| The device used to run inference | |
| pad (bool) | |
| Whether to zero-pad the audio | |
| """ | |
    # Deprecate output_harmonicity_file
    if output_harmonicity_file is not None:
        message = (
            'The crepe output_harmonicity_file argument is deprecated and '
            'will be removed in a future release. Please use '
            'output_periodicity_file. Rationale: if network confidence measured '
            'harmonic content, the value would be low for non-harmonic, periodic '
            'sounds (e.g., sine waves). But this is not observed.')
        warnings.warn(message, DeprecationWarning)
        output_periodicity_file = output_harmonicity_file

    # Predict from file
    prediction = predict_from_file(audio_file,
                                   hop_length,
                                   fmin,
                                   fmax,
                                   model,
                                   decoder,
                                   False,
                                   output_periodicity_file is not None,
                                   batch_size,
                                   device,
                                   pad)

    # Save to disk
    if output_periodicity_file is not None:
        torch.save(prediction[0].detach(), output_pitch_file)
        torch.save(prediction[1].detach(), output_periodicity_file)
    else:
        torch.save(prediction.detach(), output_pitch_file)
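

# A minimal usage sketch for predict_from_file_to_file() (paths are
# hypothetical; the saved tensors can be read back with torch.load):
#
#     predict_from_file_to_file('speech.wav',
#                               'pitch.pt',
#                               output_periodicity_file='periodicity.pt')
#     pitch = torch.load('pitch.pt')
#     periodicity = torch.load('periodicity.pt')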


def predict_from_files_to_files(audio_files,
                                output_pitch_files,
                                output_harmonicity_files=None,
                                output_periodicity_files=None,
                                hop_length=None,
                                fmin=50.,
                                fmax=MAX_FMAX,
                                model='full',
                                decoder=crepe.decode.viterbi,
                                batch_size=None,
                                device='cpu',
                                pad=True):
    """Performs pitch estimation from files on disk without reloading model

    Arguments
        audio_files (list[string])
            The files to perform pitch tracking on
        output_pitch_files (list[string])
            The files to save predicted pitch
        output_harmonicity_files (list[string] or None) [DEPRECATED]
            The files to save predicted harmonicity
        output_periodicity_files (list[string] or None)
            The files to save predicted periodicity
        hop_length (int)
            The hop_length in samples
        fmin (float)
            The minimum allowable frequency in Hz
        fmax (float)
            The maximum allowable frequency in Hz
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        decoder (function)
            The decoder to use. See decode.py for decoders.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device used to run inference
        pad (bool)
            Whether to zero-pad the audio
    """
    # Deprecate output_harmonicity_files
    if output_harmonicity_files is not None:
        message = (
            'The crepe output_harmonicity_files argument is deprecated and '
            'will be removed in a future release. Please use '
            'output_periodicity_files. Rationale: if network confidence measured '
            'harmonic content, the value would be low for non-harmonic, periodic '
            'sounds (e.g., sine waves). But this is not observed.')
        warnings.warn(message, DeprecationWarning)
        output_periodicity_files = output_harmonicity_files

    if output_periodicity_files is None:
        output_periodicity_files = len(audio_files) * [None]

    # Setup iterator
    iterator = zip(audio_files, output_pitch_files, output_periodicity_files)
    iterator = tqdm.tqdm(iterator, desc='crepe', dynamic_ncols=True)
    for audio_file, output_pitch_file, output_periodicity_file in iterator:

        # Predict a file
        predict_from_file_to_file(audio_file,
                                  output_pitch_file,
                                  None,
                                  output_periodicity_file,
                                  hop_length,
                                  fmin,
                                  fmax,
                                  model,
                                  decoder,
                                  batch_size,
                                  device,
                                  pad)
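

# A minimal usage sketch for predict_from_files_to_files() (paths are
# hypothetical; the model is loaded once and reused across files):
#
#     audio_files = ['a.wav', 'b.wav']
#     pitch_files = ['a_pitch.pt', 'b_pitch.pt']
#     periodicity_files = ['a_periodicity.pt', 'b_periodicity.pt']
#     predict_from_files_to_files(audio_files,
#                                 pitch_files,
#                                 output_periodicity_files=periodicity_files,
#                                 batch_size=2048)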


###############################################################################
# Crepe pitch embedding
###############################################################################


def embed(audio,
          sample_rate,
          hop_length=None,
          model='full',
          batch_size=None,
          device='cpu',
          pad=True):
| """Embeds audio to the output of CREPE's fifth maxpool layer | |
| Arguments | |
| audio (torch.tensor [shape=(1, time)]) | |
| The audio signals | |
| sample_rate (int) | |
| The sampling rate in Hz | |
| hop_length (int) | |
| The hop_length in samples | |
| model (string) | |
| The model capacity. One of 'full' or 'tiny'. | |
| batch_size (int) | |
| The number of frames per batch | |
| device (string) | |
| The device to run inference on | |
| pad (bool) | |
| Whether to zero-pad the audio | |
| Returns | |
| embedding (torch.tensor [shape=(1, | |
| 1 + int(time // hop_length), 32, -1)]) | |
| """ | |
    results = []

    # Preprocess audio
    generator = preprocess(audio,
                           sample_rate,
                           hop_length,
                           batch_size,
                           device,
                           pad)
    for frames in generator:

        # Infer pitch embeddings
        embedding = infer(frames, model, embed=True)

        # shape=(batch, time / hop_length, 32, embedding_size)
        result = embedding.reshape(audio.size(0), frames.size(0), 32, -1)

        # Place on same device as audio. This allows for large inputs.
        results.append(result.to(audio.device))

    # Concatenate
    return torch.cat(results, 1)
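

# A minimal usage sketch for embed() (the wav path is hypothetical; the result
# has shape (1, frames, 32, embedding_size)):
#
#     audio, sample_rate = crepe.load.audio('speech.wav')
#     embedding = embed(audio,
#                       sample_rate,
#                       hop_length=sample_rate // 100,
#                       model='full')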


def embed_from_file(audio_file,
                    hop_length=None,
                    model='full',
                    batch_size=None,
                    device='cpu',
                    pad=True):
    """Embeds audio from disk to the output of CREPE's fifth maxpool layer

    Arguments
        audio_file (string)
            The wav file containing the audio to embed
        hop_length (int)
            The hop_length in samples
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio

    Returns
        embedding (torch.tensor [shape=(1,
            1 + int(time // hop_length), 32, -1)])
    """
    # Load audio
    audio, sample_rate = crepe.load.audio(audio_file)

    # Embed
    return embed(audio,
                 sample_rate,
                 hop_length,
                 model,
                 batch_size,
                 device,
                 pad)


def embed_from_file_to_file(audio_file,
                            output_file,
                            hop_length=None,
                            model='full',
                            batch_size=None,
                            device='cpu',
                            pad=True):
| """Embeds audio from disk and saves to disk | |
| Arguments | |
| audio_file (string) | |
| The wav file containing the audio to embed | |
| hop_length (int) | |
| The hop_length in samples | |
| output_file (string) | |
| The file to save the embedding | |
| model (string) | |
| The model capacity. One of 'full' or 'tiny'. | |
| batch_size (int) | |
| The number of frames per batch | |
| device (string) | |
| The device to run inference on | |
| pad (bool) | |
| Whether to zero-pad the audio | |
| """ | |
    # No use computing gradients if we're just saving to file
    with torch.no_grad():

        # Embed
        embedding = embed_from_file(audio_file,
                                    hop_length,
                                    model,
                                    batch_size,
                                    device,
                                    pad)

    # Save to disk
    torch.save(embedding.detach(), output_file)
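

# A minimal usage sketch for embed_from_file_to_file() (paths are
# hypothetical; the saved embedding can be read back with torch.load):
#
#     embed_from_file_to_file('speech.wav', 'embedding.pt')
#     embedding = torch.load('embedding.pt')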


def embed_from_files_to_files(audio_files,
                              output_files,
                              hop_length=None,
                              model='full',
                              batch_size=None,
                              device='cpu',
                              pad=True):
    """Embeds audio from disk and saves to disk without reloading model

    Arguments
        audio_files (list[string])
            The wav files containing the audio to embed
        output_files (list[string])
            The files to save the embeddings
        hop_length (int)
            The hop_length in samples
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio
    """
    # Setup iterator
    iterator = zip(audio_files, output_files)
    iterator = tqdm.tqdm(iterator, desc='crepe', dynamic_ncols=True)
    for audio_file, output_file in iterator:

        # Embed a file
        embed_from_file_to_file(audio_file,
                                output_file,
                                hop_length,
                                model,
                                batch_size,
                                device,
                                pad)


###############################################################################
# Components for step-by-step prediction
###############################################################################


def infer(frames, model='full', embed=False):
    """Forward pass through the model

    Arguments
        frames (torch.tensor [shape=(time / hop_length, 1024)])
            The network input
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        embed (bool)
            Whether to stop inference at the intermediate embedding layer

    Returns
        logits (torch.tensor [shape=(1 + int(time // hop_length), 360)]) OR
        embedding (torch.tensor [shape=(1 + int(time // hop_length),
            embedding_size)])
    """
    # Load the model if necessary
    if not hasattr(infer, 'model') or not hasattr(infer, 'capacity') or \
       (hasattr(infer, 'capacity') and infer.capacity != model):
        crepe.load.model(frames.device, model)

    # Move model to correct device (no-op if devices are the same)
    infer.model = infer.model.to(frames.device)

    # Apply model
    return infer.model(frames, embed=embed)
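

# infer() caches the loaded model as a function attribute, so repeated calls
# with the same capacity reuse the weights. A hedged sketch of warming the
# cache ahead of time, assuming crepe.load.model takes the same
# (device, capacity) arguments used above and sets infer.model:
#
#     crepe.load.model('cpu', 'tiny')
#     logits = infer(frames, 'tiny')  # frames from preprocess(); no reload here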


def postprocess(probabilities,
                fmin=0.,
                fmax=MAX_FMAX,
                decoder=crepe.decode.viterbi,
                return_harmonicity=False,
                return_periodicity=False):
| """Convert model output to F0 and periodicity | |
| Arguments | |
| probabilities (torch.tensor [shape=(1, 360, time / hop_length)]) | |
| The probabilities for each pitch bin inferred by the network | |
| fmin (float) | |
| The minimum allowable frequency in Hz | |
| fmax (float) | |
| The maximum allowable frequency in Hz | |
| viterbi (bool) | |
| Whether to use viterbi decoding | |
| return_harmonicity (bool) [DEPRECATED] | |
| Whether to also return the network confidence | |
| return_periodicity (bool) | |
| Whether to also return the network confidence | |
| Returns | |
| pitch (torch.tensor [shape=(1, 1 + int(time // hop_length))]) | |
| periodicity (torch.tensor [shape=(1, 1 + int(time // hop_length))]) | |
| """ | |
    # Sampling is non-differentiable, so remove from graph
    probabilities = probabilities.detach()

    # Convert frequency range to pitch bin range
    minidx = crepe.convert.frequency_to_bins(torch.tensor(fmin))
    maxidx = crepe.convert.frequency_to_bins(torch.tensor(fmax),
                                             torch.ceil)

    # Remove frequencies outside of allowable range
    probabilities[:, :minidx] = -float('inf')
    probabilities[:, maxidx:] = -float('inf')

    # Perform argmax or viterbi sampling
    bins, pitch = decoder(probabilities)

    # Deprecate return_harmonicity
    if return_harmonicity:
        message = (
            'The crepe return_harmonicity argument is deprecated and '
            'will be removed in a future release. Please use '
            'return_periodicity. Rationale: if network confidence measured '
            'harmonics, the value would be low for non-harmonic, periodic '
            'sounds (e.g., sine waves). But this is not observed.')
        warnings.warn(message, DeprecationWarning)
        return_periodicity = return_harmonicity

    if not return_periodicity:
        return pitch

    # Compute periodicity from probabilities and decoded pitch bins
    return pitch, periodicity(probabilities, bins)


def preprocess(audio,
               sample_rate,
               hop_length=None,
               batch_size=None,
               device='cpu',
               pad=True):
    """Convert audio to model input

    Arguments
        audio (torch.tensor [shape=(1, time)])
            The audio signal
        sample_rate (int)
            The sampling rate in Hz
        hop_length (int)
            The hop_length in samples
        batch_size (int)
            The number of frames per batch
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio

    Returns
        frames (torch.tensor [shape=(1 + int(time // hop_length), 1024)])
    """
    # Default hop length of 10 ms
    hop_length = sample_rate // 100 if hop_length is None else hop_length

    # Resample
    if sample_rate != SAMPLE_RATE:
        audio = resample(audio, sample_rate)
        hop_length = int(hop_length * SAMPLE_RATE / sample_rate)

    # Get total number of frames

    # Maybe pad
    if pad:
        total_frames = 1 + int(audio.size(1) // hop_length)
        audio = torch.nn.functional.pad(
            audio,
            (WINDOW_SIZE // 2, WINDOW_SIZE // 2))
    else:
        total_frames = 1 + int((audio.size(1) - WINDOW_SIZE) // hop_length)

    # Default to running all frames in a single batch
    batch_size = total_frames if batch_size is None else batch_size

    # Generate batches
    for i in range(0, total_frames, batch_size):

        # Batch indices
        start = max(0, i * hop_length)
        end = min(audio.size(1),
                  (i + batch_size - 1) * hop_length + WINDOW_SIZE)

        # Chunk
        frames = torch.nn.functional.unfold(
            audio[:, None, None, start:end],
            kernel_size=(1, WINDOW_SIZE),
            stride=(1, hop_length))
        # shape=(1 + int(time / hop_length), 1024)
        frames = frames.transpose(1, 2).reshape(-1, WINDOW_SIZE)

        # Place on device
        frames = frames.to(device)

        # Mean-center
        frames -= frames.mean(dim=1, keepdim=True)

        # Scale
        # Note: during silent frames, this produces very large values. But
        # this seems to be what the network expects.
        frames /= torch.max(torch.tensor(1e-10, device=frames.device),
                            frames.std(dim=1, keepdim=True))

        yield frames
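

# A minimal sketch of step-by-step prediction using the components above
# (argument values are illustrative; audio and sample_rate come from
# crepe.load.audio):
#
#     with torch.no_grad():
#         for frames in preprocess(audio, sample_rate, hop_length=160):
#             probabilities = infer(frames, 'full')
#             probabilities = probabilities.reshape(
#                 audio.size(0), -1, PITCH_BINS).transpose(1, 2)
#             pitch, confidence = postprocess(probabilities,
#                                             fmin=50.,
#                                             fmax=550.,
#                                             return_periodicity=True)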


###############################################################################
# Utilities
###############################################################################


def periodicity(probabilities, bins):
    """Computes the periodicity from the network output and pitch bins"""
    # shape=(batch * time / hop_length, 360)
    probs_stacked = probabilities.transpose(1, 2).reshape(-1, PITCH_BINS)

    # shape=(batch * time / hop_length, 1)
    bins_stacked = bins.reshape(-1, 1).to(torch.int64)

    # Use maximum logit over pitch bins as periodicity
    periodicity = probs_stacked.gather(1, bins_stacked)

    # shape=(batch, time / hop_length)
    return periodicity.reshape(probabilities.size(0), probabilities.size(2))


def resample(audio, sample_rate):
    """Resample audio"""
    # Store device for later placement
    device = audio.device

    # Convert to numpy
    audio = audio.detach().cpu().numpy().squeeze(0)

    # Resample
    # We have to use resampy if we want numbers to match Crepe
    audio = resampy.resample(audio, sample_rate, SAMPLE_RATE)

    # Convert to pytorch
    return torch.tensor(audio, device=device).unsqueeze(0)