# utils.py
#
# Shared helpers for the phoneme pipeline: audio <-> mel-spectrogram
# conversion, grapheme-to-phoneme (g2p), CTC post-processing, and
# phoneme-to-text lookup via a reverse CMUdict lexicon.

import os
from collections import defaultdict

import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import nltk
from g2p_en import G2p

SR = 22050
HOP_LENGTH = 256

# One-time resource setup (downloads are no-ops if already present).
nltk.download('averaged_perceptron_tagger_eng')  # POS tagger used by g2p_en
nltk.download('cmudict')

g2p = G2p()
arpabet = nltk.corpus.cmudict.dict()

# Build reverse lexicon: phoneme sequence (tuple) -> [candidate words].
reverse_lex = defaultdict(list)
for word, pron_list in arpabet.items():
    for pron in pron_list:
        reverse_lex[tuple(pron)].append(word)


def audio_to_mel(audio_path, save_dir="mels", sr=SR, n_fft=1024,
                 hop_length=HOP_LENGTH, win_length=1024, n_mels=80):
    """Compute a mel spectrogram in dB and save it as a .npy file.

    Returns the path of the saved file.
    """
    y, _ = librosa.load(audio_path, sr=sr)
    # Magnitude STFT.
    S = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                            win_length=win_length))
    # Project onto the mel filterbank, then convert to dB.
    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
    mel = np.dot(mel_basis, S)
    mel_db = librosa.power_to_db(mel)

    os.makedirs(save_dir, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(audio_path))[0]
    mel_path = os.path.join(save_dir, base_name + "_mel.npy")
    np.save(mel_path, mel_db)
    return mel_path


def mel_to_audio(mel_db, sr=SR, n_fft=1024, hop_length=HOP_LENGTH,
                 win_length=1024, n_iter=60):
    """Invert a (n_mels, T) mel spectrogram in dB (as saved by audio_to_mel)
    back to a waveform via a pseudo-inverse of the mel filterbank followed
    by Griffin-Lim.
    """
    S = librosa.db_to_power(mel_db)
    # Approximate mel -> linear inversion with the filterbank pseudo-inverse;
    # clamp at a small floor so Griffin-Lim sees non-negative magnitudes.
    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=S.shape[0])
    inv_mel = np.maximum(1e-10, np.linalg.pinv(mel_basis).dot(S))
    # Griffin-Lim phase reconstruction.
    return librosa.griffinlim(inv_mel, n_iter=n_iter,
                              hop_length=hop_length, win_length=win_length)


def mel_to_image(mel_path, sr=SR, hop_length=HOP_LENGTH, save_fig=True):
    """Render a saved mel spectrogram (.npy) as a PNG next to it."""
    mel_db = np.load(mel_path)
    plt.figure(figsize=(14, 6))
    librosa.display.specshow(mel_db, sr=sr, hop_length=hop_length,
                             x_axis='time', y_axis='mel', cmap='magma')
    plt.title("Mel Spectrogram (dB)")
    plt.colorbar(format="%+2.0f dB")
    save_path = mel_path.replace('.npy', '.png')
    if save_fig:
        plt.savefig(save_path)
        print(f"Saved mel spectrogram image at: {save_path}")
    # plt.show()
    plt.close()
    return save_path


def text_to_phonemes(text):
    """Convert text to a space-separated ARPAbet phoneme string via g2p_en."""
    ph = g2p(text)
    # Keep only phoneme tokens, dropping the space/punctuation tokens that
    # g2p_en emits between words. isalnum() (not isalpha()) is required here:
    # stressed vowels such as 'AH0' contain a digit and isalpha() would drop
    # them, which in turn would break the reverse-lexicon lookup below.
    ph = [p for p in ph if p.isalnum()]
    return " ".join(ph)


def ctc_post_process(phonemes):
    """Collapse repeats, then remove blanks ('-'), in a CTC output sequence.

    phonemes: list of predicted phoneme tokens.
    """
    new_seq = []
    prev = None
    for p in phonemes:
        if p != prev and p != "-":
            new_seq.append(p)
        # Track the previous raw token (including blanks) so that a phoneme
        # which legitimately repeats across a blank, e.g. [A, -, A], is kept.
        prev = p
    return new_seq


def split_on_boundaries(phoneme_stream, boundary_token=""):
    """Split a flat phoneme stream into per-word lists on a boundary token."""
    words = []
    current = []
    for phon in phoneme_stream:
        if phon == boundary_token:
            if current:
                words.append(current)
            current = []
        else:
            current.append(phon)
    if current:
        words.append(current)
    return words


def p2g_fallback(phoneme_word):
    """Placeholder pronunciation-to-spelling fallback for words missing from
    the lexicon: just concatenates the phoneme symbols."""
    return "".join(phoneme_word).lower()


def phonemes_to_text(phoneme_stream):
    """Map a phoneme stream back to words with the reverse CMUdict lexicon,
    falling back to p2g_fallback for out-of-lexicon pronunciations."""
    words = []
    for phoneme_word in split_on_boundaries(phoneme_stream):
        candidates = reverse_lex.get(tuple(phoneme_word), [])
        if candidates:
            words.append(candidates[0])
        else:
            words.append(p2g_fallback(phoneme_word))
    return " ".join(words)
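

# --- Usage sketch: text side ----------------------------------------------
# A minimal round trip through the text utilities. The raw CTC sequence and
# the hand-inserted word boundaries below are made up for illustration; in
# the real pipeline they would come from a model's output.
def _demo_text_round_trip():
    print("g2p:", text_to_phonemes("hello world"))

    # Hypothetical raw CTC output with blanks ('-') and repeats.
    raw_ctc = ["HH", "HH", "-", "AH0", "L", "L", "-", "L", "OW1"]
    print("decoded:", ctc_post_process(raw_ctc))
    # -> ['HH', 'AH0', 'L', 'L', 'OW1'] (the blank preserves the repeated L)

    # phonemes_to_text splits on the boundary token ("" by default).
    stream = ["HH", "AH0", "L", "OW1", "", "W", "ER1", "L", "D"]
    print("text:", phonemes_to_text(stream))  # e.g. "hello world"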
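

# --- Usage sketch: audio side ----------------------------------------------
if __name__ == "__main__":
    _demo_text_round_trip()

    # Smoke test for the audio utilities: wav -> mel (.npy) -> PNG, then a
    # Griffin-Lim reconstruction. "sample.wav" is a stand-in path; point it
    # at any real audio file. soundfile is assumed to be available (it is a
    # dependency of librosa) and is only used here to write the output.
    import soundfile as sf

    mel_path = audio_to_mel("sample.wav")
    mel_to_image(mel_path)
    reconstructed = mel_to_audio(np.load(mel_path))
    sf.write("sample_reconstructed.wav", reconstructed, SR)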