# NOTE: removed non-Python page-scrape artifacts (hosting-UI header, commit
# hash, and line-number gutter) that preceded the module source.
# utils.py
import numpy as np
import librosa
from matplotlib import pyplot as plt

# Shared audio defaults. NOTE(review): several functions below duplicate these
# values as hard-coded keyword defaults (22050 / 256) instead of referencing
# the constants — keep them in sync.
SR = 22050         # sample rate in Hz
HOP_LENGTH = 256   # STFT hop size in samples
def mel_to_audio(mel_db, sr=22050, n_fft=1024, hop_length=256, win_length=1024, n_iter=60):
    """Reconstruct a waveform from a dB-scaled mel spectrogram.

    mel_db: (n_mels, T) array in dB, as saved by audio_to_mel.
    Returns a 1-D float waveform. The inversion is approximate: the mel
    projection is undone with a pseudo-inverse, then phase is estimated
    with Griffin-Lim over n_iter iterations.
    """
    mel_linear = librosa.db_to_power(mel_db)
    n_mels = mel_linear.shape[0]
    filterbank = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
    # Least-squares inverse of the mel filter bank; clamp so the estimated
    # linear spectrogram stays strictly positive for Griffin-Lim.
    linear_spec = np.linalg.pinv(filterbank) @ mel_linear
    linear_spec = np.maximum(1e-10, linear_spec)
    return librosa.griffinlim(
        linear_spec, n_iter=n_iter, hop_length=hop_length, win_length=win_length
    )
# Grapheme-to-phoneme setup: g2p_en requires NLTK's English POS tagger data.
import nltk
# NOTE(review): downloads at import time (network side effect on first run;
# a no-op once the data is cached) — consider guarding or lazy-initializing.
nltk.download('averaged_perceptron_tagger_eng')
from g2p_en import G2p
# Shared module-level converter instance, used by text_to_phonemes below.
g2p = G2p()
def text_to_phonemes(text):
    """Convert English text to a space-joined ARPABET phoneme string via g2p_en.

    Fix: the original kept only tokens where ``p.isalpha()`` was true, which
    also discarded stress-marked vowels such as 'AH0' (they contain a digit),
    silently deleting most vowel phonemes. Keep every token that begins with
    a letter and drop only the whitespace/punctuation tokens g2p_en emits
    between words.
    """
    tokens = g2p(text)
    phonemes = [t for t in tokens if t and t[0].isalpha()]
    return " ".join(phonemes)
import librosa
import numpy as np
import os
def audio_to_mel(audio_path, save_dir="mels", sr=22050, n_fft=1024, hop_length=256, win_length=1024, n_mels=80):
    """Compute a dB-scaled mel spectrogram for an audio file and save it as .npy.

    Returns the saved path: <save_dir>/<basename>_mel.npy.
    NOTE(review): the mel filter bank is applied to the STFT *magnitude* and
    then run through power_to_db; mel_to_audio inverts with db_to_power, so
    the round trip is self-consistent even though the naming suggests a
    power spectrogram — confirm before changing either side.
    """
    waveform, _ = librosa.load(audio_path, sr=sr)
    magnitude = np.abs(
        librosa.stft(waveform, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
    )
    filterbank = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
    mel_db = librosa.power_to_db(filterbank @ magnitude)
    os.makedirs(save_dir, exist_ok=True)
    stem = os.path.splitext(os.path.basename(audio_path))[0]
    out_path = os.path.join(save_dir, stem + "_mel.npy")
    np.save(out_path, mel_db)
    return out_path
def ctc_post_process(phonemes):
    """Standard CTC decoding: collapse adjacent repeats, then drop blanks ('-').

    phonemes: list of predicted phoneme tokens, where "-" is the CTC blank.
    Returns a new list; the input is not modified.

    Fix: the original skipped blanks *before* updating ``prev``, so a blank
    between two identical tokens ("A", "-", "A") wrongly collapsed them to
    ["A"]. In CTC, a blank separates repeats, so that sequence must decode
    to ["A", "A"].
    """
    decoded = []
    prev = None
    for token in phonemes:
        # Emit only when the token changes and is not the blank symbol.
        if token != prev and token != "-":
            decoded.append(token)
        prev = token  # track blanks too, so they break repeat runs
    return decoded
import numpy as np
import matplotlib.pyplot as plt
import librosa.display
def mel_to_image(mel_path, sr=22050, hop_length=256, save_fig=True):
# Load mel spectrogram from .npy
mel_db = np.load(mel_path)
# Create figure
plt.figure(figsize=(14, 6))
# Plot mel spectrogram
librosa.display.specshow(mel_db, sr=sr, hop_length=hop_length, x_axis='time', y_axis='mel', cmap='magma')
plt.title("Mel Spectrogram (dB)")
plt.colorbar(format="%+2.0f dB")
save_path = mel_path.replace('.npy', '_mel.png')
plt.savefig(save_path)
print(f"Saved mel spectrogram image at: {save_path}")
"""plt.show()"""
return save_path
# load reverse lexicon: phoneme_seq -> [words]
import nltk
from collections import defaultdict
# NOTE(review): downloads CMUdict at import time (network side effect on the
# first run; cached afterwards) — consider guarding or lazy-initializing.
nltk.download('cmudict')
# word -> list of ARPABET pronunciations (vowels carry stress digits, e.g. 'AH0').
arpabet = nltk.corpus.cmudict.dict()
# Build reverse lexicon
# Invert to pronunciation -> candidate words; a pronunciation shared by
# homophones maps to several words (phonemes_to_text picks the first).
reverse_lex = defaultdict(list)
for word, pron_list in arpabet.items():
    for pron in pron_list:
        reverse_lex[tuple(pron)].append(word)
def split_on_boundaries(phoneme_stream, boundary_token="<w>"):
    """Group a flat phoneme stream into per-word lists, splitting at boundary tokens.

    Consecutive, leading, or trailing boundary tokens produce no empty words;
    a trailing unterminated word is still emitted.
    """
    words = []
    buf = []
    for token in phoneme_stream:
        if token != boundary_token:
            buf.append(token)
        elif buf:
            words.append(buf)
            buf = []
    if buf:
        words.append(buf)
    return words
def p2g_fallback(phoneme_word):
    """Rough pronunciation-to-spelling fallback for words missing from the lexicon.

    Fix: strips ARPABET stress digits (e.g. 'AH0' -> 'ah') before joining, so
    the placeholder spelling no longer contains digits ('hhahlow' instead of
    'hhah0low1'). Still only a placeholder, not real grapheme conversion.
    """
    return "".join(p.rstrip("012") for p in phoneme_word).lower()
def phonemes_to_text(phoneme_stream):
    """Decode a boundary-delimited phoneme stream into text via the reverse lexicon.

    Each word's phoneme tuple is looked up in reverse_lex; the first candidate
    word wins, and unknown pronunciations fall back to p2g_fallback.
    """
    decoded = []
    for word_phones in split_on_boundaries(phoneme_stream):
        matches = reverse_lex.get(tuple(word_phones), [])
        decoded.append(matches[0] if matches else p2g_fallback(word_phones))
    return " ".join(decoded)