Spaces:
Runtime error
Runtime error
| from huggingface_hub import from_pretrained_keras | |
| import numpy as np | |
| import tensorflow as tf | |
| from tensorflow.keras import layers | |
| import tensorflow_io as tfio | |
| import gradio as gr | |
| import librosa | |
| import librosa.display | |
| import matplotlib.pyplot as plt | |
class MelSpec(layers.Layer):
    """Keras layer that turns audio into a dB-scaled mel-spectrogram.

    Args:
        frame_length: STFT window length, in samples.
        frame_step: Hop between consecutive STFT frames, in samples.
        fft_length: FFT size forwarded to ``tf.signal.stft``. When ``None``,
            TensorFlow uses the smallest power of two enclosing
            ``frame_length``.
        sampling_rate: Sample rate used to build the mel filterbank.
        num_mel_channels: Number of mel bins in the output.
        freq_min: Lower edge of the mel filterbank, in Hz.
        freq_max: Upper edge of the mel filterbank, in Hz.
        **kwargs: Forwarded unchanged to ``layers.Layer``.
    """

    def __init__(
        self,
        frame_length=1024,
        frame_step=256,
        fft_length=None,
        sampling_rate=22050,
        num_mel_channels=80,
        freq_min=125,
        freq_max=7600,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.frame_length = frame_length
        self.frame_step = frame_step
        self.fft_length = fft_length
        self.sampling_rate = sampling_rate
        self.num_mel_channels = num_mel_channels
        self.freq_min = freq_min
        self.freq_max = freq_max

        # tf.signal.stft yields fft_length // 2 + 1 frequency bins, and with
        # fft_length=None it picks the smallest power of two enclosing
        # frame_length. The filterbank must have exactly that many rows or the
        # matmul in call() fails, so size it from the effective FFT length
        # rather than from frame_length (identical for the 1024 default).
        if fft_length is None:
            effective_fft_length = 1 << max(frame_length - 1, 0).bit_length()
        else:
            effective_fft_length = fft_length

        self.mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=self.num_mel_channels,
            num_spectrogram_bins=effective_fft_length // 2 + 1,
            sample_rate=self.sampling_rate,
            lower_edge_hertz=self.freq_min,
            upper_edge_hertz=self.freq_max,
        )

    def call(self, audio):
        """Return the dB-scaled mel-spectrogram of ``audio``.

        ``audio`` must carry a trailing axis of size 1, which is squeezed
        away before the STFT is taken.
        """
        stft = tf.signal.stft(
            tf.squeeze(audio, -1),
            self.frame_length,
            self.frame_step,
            self.fft_length,
            pad_end=True,
        )
        # Taking the magnitude of the STFT output
        magnitude = tf.abs(stft)
        # Project the power spectrum onto the mel filterbank, then convert it
        # to the dB scale.
        mel = tf.matmul(tf.square(magnitude), self.mel_filterbank)
        log_mel_spec = tfio.audio.dbscale(mel, top_db=80)
        return log_mel_spec

    def get_config(self):
        """Return the layer config, including every constructor argument."""
        config = super().get_config()
        config.update(
            {
                "frame_length": self.frame_length,
                "frame_step": self.frame_step,
                "fft_length": self.fft_length,
                "sampling_rate": self.sampling_rate,
                "num_mel_channels": self.num_mel_channels,
                "freq_min": self.freq_min,
                "freq_max": self.freq_max,
            }
        )
        return config
# Module-level model shared by every request; loaded once at start-up from the
# repo id below.
model = from_pretrained_keras(
    "keras-io/MelGAN-spectrogram-inversion"
)
def inference(audio, model):
    """Load an audio file and run it through ``model`` via a mel-spectrogram.

    Args:
        audio: Audio source handed straight to ``librosa.load``.
        model: Object exposing ``predict``; it receives the mel-spectrogram
            with a leading batch axis of size 1.

    Returns:
        A tuple ``(waveform, prediction, sample_rate)`` where ``waveform`` and
        ``sample_rate`` come from ``librosa.load`` and ``prediction`` is the
        model output with singleton axes squeezed out.
    """
    waveform, sample_rate = librosa.load(audio)
    # MelSpec.call squeezes the last axis, so append one before calling it.
    mel_spectrogram = MelSpec()(tf.expand_dims(waveform, axis=-1))
    batched_mel = tf.expand_dims(mel_spectrogram, axis=0)
    prediction = model.predict(batched_mel, batch_size=1, verbose=0)
    return waveform, prediction.squeeze(), sample_rate
def predict(audio):
    """Plot spectrograms of the input audio and of the model's output.

    Args:
        audio: Audio source forwarded to ``inference`` together with the
            module-level ``model``.

    Returns:
        The matplotlib figure holding two stacked spectrograms: the original
        signal on top and the synthesized signal below.
    """
    x, x_pred, sr = inference(audio, model)
    fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(10, 8), dpi=120)

    panels = (
        (x, 'Spectrogram of Original sample audio'),
        (x_pred, 'Spectrogram of synthesis sample audio '),
    )
    for ax, (signal, title) in zip(axes, panels):
        spectrogram_db = librosa.amplitude_to_db(
            np.abs(librosa.stft(signal)), ref=np.max
        )
        librosa.display.specshow(
            spectrogram_db, y_axis='linear', x_axis='time', sr=sr, ax=ax
        )
        ax.set(title=title)
        ax.label_outer()

    # Return the figure created above rather than plt.gcf(): pyplot's
    # "current figure" is process-global state, so another request could
    # change it before this function returns.
    return fig
# Gradio front end: one uploaded audio file in, one matplotlib plot out.
inputs = [gr.Audio(source="upload", label='Upload audio file', type="filepath")]
examples = ["sample_1.wav", "sample_2.wav"]

demo = gr.Interface(
    fn=predict,
    inputs=inputs,
    outputs=gr.Plot(),
    examples=examples,
    cache_examples=False,
    title="MelGAN-based spectrogram inversion",
    description="Inversion of audio from mel-spectrograms using the MelGAN architecture and feature matching",
    article="Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>. Based on the keras example from <a href=\"https://keras.io/examples/audio/melgan_spectrogram_inversion/\">Darshan Deshpande</a>",
)
demo.launch(debug=False, enable_queue=True)