Spaces:
Sleeping
Sleeping
| # utils.py | |
| import numpy as np | |
| import librosa | |
| from matplotlib import pyplot as plt | |
# Module-level audio defaults (sample rate and hop length).
# NOTE(review): none of the functions visible in this file read these names;
# they repeat the same literals as defaults in their own signatures.
| SR = 22050 | |
| HOP_LENGTH = 256 | |
def mel_to_audio(mel_db, sr=22050, n_fft=1024, hop_length=256, win_length=1024, n_iter=60):
    """Reconstruct a waveform from a dB-scaled mel spectrogram.

    The mel filterbank is approximately undone with its pseudo-inverse and
    the phase is then estimated with Griffin-Lim.

    Args:
        mel_db: Mel spectrogram in dB; the original comment describes it as
            (n_mels, T), as saved by the preprocessing step.
        sr: Sample rate used to build the mel filterbank.
        n_fft: FFT size used to build the mel filterbank.
        hop_length: Hop length forwarded to Griffin-Lim.
        win_length: Window length forwarded to Griffin-Lim.
        n_iter: Number of Griffin-Lim iterations.

    Returns:
        Whatever ``librosa.griffinlim`` returns for the estimated linear
        spectrogram (the reconstructed audio signal).
    """
    # Undo the dB scaling first.
    mel_linear = librosa.db_to_power(mel_db)

    # The filterbank must have as many bands as the input has rows.
    filterbank = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=mel_linear.shape[0])

    # The pseudo-inverse can produce negative values; clamp them to a small
    # positive floor before handing the estimate to Griffin-Lim.
    linear_estimate = np.dot(np.linalg.pinv(filterbank), mel_linear)
    linear_estimate = np.maximum(1e-10, linear_estimate)

    return librosa.griffinlim(
        linear_estimate,
        n_iter=n_iter,
        hop_length=hop_length,
        win_length=win_length,
    )
| import nltk | |
# Import-time setup for grapheme-to-phoneme conversion.
# Fetches the NLTK tagger resource whenever this module is imported
# (presumably required by g2p_en -- TODO confirm).
| nltk.download('averaged_perceptron_tagger_eng') | |
| from g2p_en import G2p | |
# Single shared converter instance; text_to_phonemes() calls it.
| g2p = G2p() | |
def text_to_phonemes(text):
    """Convert text into a space-separated string of phoneme tokens.

    The module-level ``g2p`` converter is called on ``text`` and its
    space/punctuation tokens are discarded.

    Args:
        text: Input text to convert.

    Returns:
        The remaining phoneme tokens joined by single spaces.
    """
    tokens = g2p(text)
    # g2p_en emits ARPAbet vowels with a trailing stress digit ("AH0", "OW1").
    # The previous ``isalpha()`` filter therefore discarded every vowel along
    # with the spaces and punctuation. Keep any alphanumeric token that
    # starts with a letter instead; " ", ",", "." etc. are still removed.
    phonemes = [tok for tok in tokens if tok.isalnum() and tok[0].isalpha()]
    return " ".join(phonemes)
| import librosa | |
| import numpy as np | |
| import os | |
def audio_to_mel(audio_path, save_dir="mels", sr=22050, n_fft=1024, hop_length=256, win_length=1024, n_mels=80):
    """Compute a dB-scaled mel spectrogram for an audio file and save it.

    Args:
        audio_path: Path of the audio file passed to ``librosa.load``.
        save_dir: Directory for the output file; created if missing.
        sr: Sample rate used for loading and for the mel filterbank.
        n_fft: FFT size for the STFT and the mel filterbank.
        hop_length: STFT hop length.
        win_length: STFT window length.
        n_mels: Number of mel bands.

    Returns:
        Path of the written file, ``<save_dir>/<audio basename>_mel.npy``.
    """
    waveform, _ = librosa.load(audio_path, sr=sr)

    # Magnitude (not squared) STFT is projected onto the mel filterbank.
    stft = librosa.stft(waveform, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
    magnitude = np.abs(stft)
    filterbank = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
    mel = filterbank.dot(magnitude)

    # dB scaling via power_to_db; mel_to_audio() undoes it with db_to_power.
    mel_db = librosa.power_to_db(mel)

    os.makedirs(save_dir, exist_ok=True)

    # Output name: input basename without its extension, plus "_mel.npy".
    stem, _ext = os.path.splitext(os.path.basename(audio_path))
    mel_path = os.path.join(save_dir, stem + "_mel.npy")
    np.save(mel_path, mel_db)
    return mel_path
def ctc_post_process(phonemes, blank="-"):
    """Collapse a raw CTC prediction into its final token sequence.

    Consecutive duplicates are merged first and blank tokens are then
    dropped. A blank between two identical tokens keeps them separate, so
    ``['a', '-', 'a']`` decodes to ``['a', 'a']``.

    Args:
        phonemes: Iterable of predicted tokens, one per frame.
        blank: Token that represents the CTC blank (default ``"-"``).

    Returns:
        A new list with repeats collapsed and blanks removed.
    """
    collapsed = []
    previous = None
    for token in phonemes:
        if token != previous and token != blank:
            collapsed.append(token)
        # Track every frame, blanks included: a blank must reset the
        # "previous" token so a genuine repeat after it is not swallowed.
        previous = token
    return collapsed
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import librosa.display | |
def mel_to_image(mel_path, sr=22050, hop_length=256, save_fig=True):
    """Render a saved mel spectrogram to a PNG file.

    Args:
        mel_path: Path (string) of the ``.npy`` array to load.
        sr: Sample rate forwarded to ``librosa.display.specshow``.
        hop_length: Hop length forwarded to ``specshow``.
        save_fig: Accepted for backward compatibility; it is not read and
            the image is always written.

    Returns:
        Path of the written PNG: ``mel_path`` with a trailing ``.npy``
        replaced by ``_mel.png``.
    """
    mel_db = np.load(mel_path)

    # Only the trailing extension is swapped; ``str.replace`` would also
    # rewrite any ".npy" occurring earlier in the path (e.g. a directory).
    suffix = '.npy'
    stem = mel_path[:-len(suffix)] if mel_path.endswith(suffix) else mel_path
    save_path = stem + '_mel.png'

    fig = plt.figure(figsize=(14, 6))
    try:
        librosa.display.specshow(mel_db, sr=sr, hop_length=hop_length, x_axis='time', y_axis='mel', cmap='magma')
        plt.title("Mel Spectrogram (dB)")
        plt.colorbar(format="%+2.0f dB")
        plt.savefig(save_path)
    finally:
        # pyplot keeps every figure alive until it is closed; without this
        # each call leaks one figure for the lifetime of the process.
        plt.close(fig)

    print(f"Saved mel spectrogram image at: {save_path}")
    return save_path
| # load reverse lexicon: phoneme_seq -> [words] | |
| import nltk | |
| from collections import defaultdict | |
# Import-time setup: fetch the 'cmudict' NLTK resource and load it as a dict.
# The loop below treats it as word -> list of pronunciations.
| nltk.download('cmudict') | |
| arpabet = nltk.corpus.cmudict.dict() | |
| # Build reverse lexicon | |
# reverse_lex maps tuple(pronunciation) -> list of words sharing it, in the
# order the words are encountered while iterating `arpabet`.
| reverse_lex = defaultdict(list) | |
| for word, pron_list in arpabet.items(): | |
| for pron in pron_list: | |
# Pronunciations are converted to tuples so they can be used as dict keys.
| reverse_lex[tuple(pron)].append(word) | |
def split_on_boundaries(phoneme_stream, boundary_token="<w>"):
    """Group a flat phoneme stream into per-word lists.

    Every occurrence of ``boundary_token`` ends the current word. Empty
    words (leading, trailing or repeated boundaries) are never emitted.

    Args:
        phoneme_stream: Iterable of phoneme tokens and boundary tokens.
        boundary_token: Token that marks a word boundary.

    Returns:
        A list of lists, one inner list of phonemes per word.
    """
    segments = [[]]
    for token in phoneme_stream:
        if token != boundary_token:
            segments[-1].append(token)
        elif segments[-1]:
            # Open a new word only when the current one has content.
            segments.append([])
    # A trailing boundary (or empty input) leaves an empty last segment.
    if not segments[-1]:
        segments.pop()
    return segments
def p2g_fallback(phoneme_word):
    """Produce a stand-in spelling for a phoneme sequence.

    Placeholder behaviour: the phoneme tokens are concatenated without a
    separator and the result is lowercased.

    Args:
        phoneme_word: Iterable of phoneme token strings for one word.

    Returns:
        The lowercased concatenation of the tokens.
    """
    spelling = "".join(phoneme_word)
    return spelling.lower()
def phonemes_to_text(phoneme_stream):
    """Decode a phoneme stream with word boundaries into a text string.

    The stream is split into words with ``split_on_boundaries``. Each word
    is looked up in ``reverse_lex``; the first listed candidate is used,
    and words with no entry fall back to ``p2g_fallback``.

    Args:
        phoneme_stream: Iterable of phoneme tokens and boundary tokens.

    Returns:
        The decoded words joined by single spaces.
    """
    decoded = []
    for segment in split_on_boundaries(phoneme_stream):
        # .get() is used so a miss does not insert a key into the defaultdict.
        matches = reverse_lex.get(tuple(segment))
        decoded.append(matches[0] if matches else p2g_fallback(segment))
    return " ".join(decoded)