# audio_to_phonome / utils.py
# (repo page residue preserved as a comment: uploaded by "hash-map",
#  commit "Update utils.py", a32630e verified)
# utils.py
import numpy as np
import librosa
from matplotlib import pyplot as plt
# Global audio defaults: sample rate in Hz and STFT hop size in samples.
# NOTE(review): the functions below hard-code these same values as parameter
# defaults instead of referencing the constants — keep them in sync.
SR = 22050
HOP_LENGTH = 256
def mel_to_audio(mel_db, sr=22050, n_fft=1024, hop_length=256, win_length=1024, n_iter=60):
    """Approximately invert a dB-scaled mel spectrogram back to a waveform.

    `mel_db` is an (n_mels, T) array in dB, as saved by `audio_to_mel`.
    The mel filtering is undone with a pseudo-inverse of the mel filter bank
    and phase is estimated with Griffin-Lim, so the result is a rough, lossy
    reconstruction rather than a faithful one.
    """
    # Undo the dB scaling applied at preprocessing time.
    power_mel = librosa.db_to_power(mel_db)
    # Rebuild the same mel filter bank used for the forward transform.
    filter_bank = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=power_mel.shape[0])
    # Pseudo-inverse projection back to a linear-frequency spectrogram;
    # clamp to a small positive floor because pinv can produce negatives.
    linear_spec = np.maximum(1e-10, np.linalg.pinv(filter_bank).dot(power_mel))
    # Griffin-Lim iteratively estimates phase from the magnitude estimate.
    return librosa.griffinlim(linear_spec, n_iter=n_iter, hop_length=hop_length, win_length=win_length)
import nltk
# POS tagger model required internally by g2p_en; this downloads at import
# time and may hit the network on the first run.
nltk.download('averaged_perceptron_tagger_eng')
from g2p_en import G2p
# Module-level grapheme-to-phoneme converter, shared by text_to_phonemes.
g2p = G2p()
def text_to_phonemes(text):
    """Convert English text to a space-separated ARPAbet phoneme string.

    Parameters
    ----------
    text : str
        Input English text.

    Returns
    -------
    str
        Phoneme tokens joined by single spaces, e.g. "DH AH0 K AE1 T".
    """
    ph = g2p(text)
    # Keep only real phoneme tokens, dropping the ' ' separators and
    # punctuation tokens that g2p_en emits. ARPAbet vowels carry a trailing
    # stress digit ('AH0', 'EY1', ...), so the previous `p.isalpha()` filter
    # wrongly discarded every stressed vowel; `isalnum()` keeps them while
    # still rejecting spaces and punctuation.
    ph = [p for p in ph if p.isalnum()]
    return " ".join(ph)
import librosa
import numpy as np
import os
def audio_to_mel(audio_path, save_dir="mels", sr=22050, n_fft=1024, hop_length=256, win_length=1024, n_mels=80):
    """Compute a dB mel spectrogram for an audio file and save it as .npy.

    Returns the path of the saved file, "<save_dir>/<stem>_mel.npy".
    """
    # Load (and resample) the audio as mono.
    signal, _ = librosa.load(audio_path, sr=sr)
    # Magnitude spectrogram from the STFT.
    magnitude = np.abs(librosa.stft(signal, n_fft=n_fft, hop_length=hop_length, win_length=win_length))
    # Project onto the mel scale.
    filters = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
    # NOTE(review): `magnitude` is an amplitude (not power) spectrogram, so
    # power_to_db yields 10*log10 of amplitude here. This matches the inverse
    # in mel_to_audio, but confirm the scaling is intended.
    mel_db = librosa.power_to_db(filters @ magnitude)
    os.makedirs(save_dir, exist_ok=True)
    stem = os.path.splitext(os.path.basename(audio_path))[0]
    out_path = os.path.join(save_dir, stem + "_mel.npy")
    np.save(out_path, mel_db)
    return out_path
def ctc_post_process(phonemes, blank="-"):
    """Decode a raw CTC output sequence.

    Standard CTC decoding collapses consecutive duplicate tokens *first* and
    removes blank tokens *second*: the blank acts as a separator, so the raw
    sequence ["A", "-", "A"] must decode to ["A", "A"]. The previous
    implementation skipped blanks without updating its "previous token"
    state, which wrongly merged repeats across a blank (["A", "-", "A"] ->
    ["A"]).

    Parameters
    ----------
    phonemes : list of str
        Raw predicted tokens, possibly containing blanks.
    blank : str, optional
        The blank token used by the CTC model (default "-").

    Returns
    -------
    list of str
        Decoded phoneme sequence with repeats collapsed and blanks removed.
    """
    decoded = []
    prev = None
    for token in phonemes:
        # Collapse step: skip any token equal to the immediately preceding
        # raw token; then drop blanks.
        if token != prev and token != blank:
            decoded.append(token)
        prev = token  # track the raw previous token, including blanks
    return decoded
import numpy as np
import matplotlib.pyplot as plt
import librosa.display
def mel_to_image(mel_path, sr=22050, hop_length=256, save_fig=True):
    """Render a saved mel spectrogram (.npy, dB scale) as a PNG image.

    Parameters
    ----------
    mel_path : str
        Path to a .npy file containing an (n_mels, T) dB mel spectrogram.
    sr : int, optional
        Sample rate used for the axis labels.
    hop_length : int, optional
        STFT hop size used for the time axis.
    save_fig : bool, optional
        Currently unused; kept for interface compatibility.

    Returns
    -------
    str
        Path of the written PNG, "<mel_path without extension>_mel.png".
    """
    mel_db = np.load(mel_path)
    fig = plt.figure(figsize=(14, 6))
    librosa.display.specshow(mel_db, sr=sr, hop_length=hop_length, x_axis='time', y_axis='mel', cmap='magma')
    plt.title("Mel Spectrogram (dB)")
    plt.colorbar(format="%+2.0f dB")
    # splitext only strips the final extension, unlike str.replace which
    # would also match ".npy" occurring earlier in the path.
    root, _ = os.path.splitext(mel_path)
    save_path = root + '_mel.png'
    plt.savefig(save_path)
    # Close the figure so repeated calls don't accumulate open figures
    # (matplotlib keeps figures alive until explicitly closed).
    plt.close(fig)
    print(f"Saved mel spectrogram image at: {save_path}")
    return save_path
# load reverse lexicon: phoneme_seq -> [words]
import nltk
from collections import defaultdict
# Downloads the CMU Pronouncing Dictionary at import time (network on first run).
nltk.download('cmudict')
# word -> list of pronunciations; each pronunciation is a list of ARPAbet
# symbols, including stress digits (e.g. ['K', 'AE1', 'T']).
arpabet = nltk.corpus.cmudict.dict()
# Build reverse lexicon
# Maps a pronunciation tuple to every word sharing that pronunciation
# (homophones all land in the same list, in dictionary order).
reverse_lex = defaultdict(list)
for word, pron_list in arpabet.items():
    for pron in pron_list:
        reverse_lex[tuple(pron)].append(word)
def split_on_boundaries(phoneme_stream, boundary_token="<w>"):
    """Group a flat phoneme stream into per-word lists.

    Splits at each occurrence of `boundary_token`; empty groups produced by
    leading, trailing, or repeated boundary tokens are dropped.
    """
    words = []
    word = []
    for token in phoneme_stream:
        if token != boundary_token:
            word.append(token)
            continue
        # Hit a boundary: flush the accumulated word, if any.
        if word:
            words.append(word)
        word = []
    # Flush the final word (stream need not end with a boundary token).
    if word:
        words.append(word)
    return words
def p2g_fallback(phoneme_word):
    """Crude last-resort "spelling" for an unknown pronunciation:
    concatenate the raw phoneme symbols and lowercase the result."""
    joined = "".join(phoneme_word)
    return joined.lower()
def phonemes_to_text(phoneme_stream):
    """Best-effort reconstruction of text from a phoneme stream.

    The stream is split into words at "<w>" boundary tokens; each word's
    pronunciation is looked up in the reverse CMUdict lexicon (taking the
    first candidate when several words share a pronunciation), falling back
    to a naive concatenated spelling when the pronunciation is unknown.
    """
    out = []
    for word_phones in split_on_boundaries(phoneme_stream):
        # .get avoids inserting new keys into the defaultdict.
        matches = reverse_lex.get(tuple(word_phones), [])
        out.append(matches[0] if matches else p2g_fallback(word_phones))
    return " ".join(out)