# audio_to_phonome / utils.py
# (repo page residue preserved as a comment: uploaded by "hash-map",
#  commit "Update utils.py", a32630e verified)
# utils.py
import numpy as np
import librosa
from matplotlib import pyplot as plt
# Global audio defaults: sample rate in Hz and STFT hop size in samples.
# NOTE(review): the functions below hard-code these same values as parameter
# defaults instead of referencing the constants — keep them in sync.
SR = 22050
HOP_LENGTH = 256
def mel_to_audio(mel_db, sr=22050, n_fft=1024, hop_length=256, win_length=1024, n_iter=60):
    """Approximately invert a dB-scaled mel spectrogram back to a waveform.

    `mel_db` is an (n_mels, T) array in dB, as saved by `audio_to_mel`.
    The mel filtering is undone with a pseudo-inverse of the mel filter bank
    and phase is estimated with Griffin-Lim, so the result is a rough, lossy
    reconstruction rather than a faithful one.
    """
    # Undo the dB scaling applied at preprocessing time.
    power_mel = librosa.db_to_power(mel_db)
    # Rebuild the same mel filter bank used for the forward transform.
    filter_bank = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=power_mel.shape[0])
    # Pseudo-inverse projection back to a linear-frequency spectrogram;
    # clamp to a small positive floor because pinv can produce negatives.
    linear_spec = np.maximum(1e-10, np.linalg.pinv(filter_bank).dot(power_mel))
    # Griffin-Lim iteratively estimates phase from the magnitude estimate.
    return librosa.griffinlim(linear_spec, n_iter=n_iter, hop_length=hop_length, win_length=win_length)
import nltk
# POS tagger model required internally by g2p_en; this downloads at import
# time and may hit the network on the first run.
nltk.download('averaged_perceptron_tagger_eng')
from g2p_en import G2p
# Module-level grapheme-to-phoneme converter, shared by text_to_phonemes.
g2p = G2p()
def text_to_phonemes(text):
    """Convert English text to a space-separated ARPAbet phoneme string.

    Parameters
    ----------
    text : str
        Input English text.

    Returns
    -------
    str
        Phoneme tokens joined by single spaces, e.g. "DH AH0 K AE1 T".
    """
    ph = g2p(text)
    # Keep only real phoneme tokens, dropping the ' ' separators and
    # punctuation tokens that g2p_en emits. ARPAbet vowels carry a trailing
    # stress digit ('AH0', 'EY1', ...), so the previous `p.isalpha()` filter
    # wrongly discarded every stressed vowel; `isalnum()` keeps them while
    # still rejecting spaces and punctuation.
    ph = [p for p in ph if p.isalnum()]
    return " ".join(ph)
import librosa
import numpy as np
import os
def audio_to_mel(audio_path, save_dir="mels", sr=22050, n_fft=1024, hop_length=256, win_length=1024, n_mels=80):
    """Compute a dB mel spectrogram for an audio file and save it as .npy.

    Returns the path of the saved file, "<save_dir>/<stem>_mel.npy".
    """
    # Load (and resample) the audio as mono.
    signal, _ = librosa.load(audio_path, sr=sr)
    # Magnitude spectrogram from the STFT.
    magnitude = np.abs(librosa.stft(signal, n_fft=n_fft, hop_length=hop_length, win_length=win_length))
    # Project onto the mel scale.
    filters = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
    # NOTE(review): `magnitude` is an amplitude (not power) spectrogram, so
    # power_to_db yields 10*log10 of amplitude here. This matches the inverse
    # in mel_to_audio, but confirm the scaling is intended.
    mel_db = librosa.power_to_db(filters @ magnitude)
    os.makedirs(save_dir, exist_ok=True)
    stem = os.path.splitext(os.path.basename(audio_path))[0]
    out_path = os.path.join(save_dir, stem + "_mel.npy")
    np.save(out_path, mel_db)
    return out_path
def ctc_post_process(phonemes, blank="-"):
    """Decode a raw CTC output sequence.

    Standard CTC decoding collapses consecutive duplicate tokens *first* and
    removes blank tokens *second*: the blank acts as a separator, so the raw
    sequence ["A", "-", "A"] must decode to ["A", "A"]. The previous
    implementation skipped blanks without updating its "previous token"
    state, which wrongly merged repeats across a blank (["A", "-", "A"] ->
    ["A"]).

    Parameters
    ----------
    phonemes : list of str
        Raw predicted tokens, possibly containing blanks.
    blank : str, optional
        The blank token used by the CTC model (default "-").

    Returns
    -------
    list of str
        Decoded phoneme sequence with repeats collapsed and blanks removed.
    """
    decoded = []
    prev = None
    for token in phonemes:
        # Collapse step: skip any token equal to the immediately preceding
        # raw token; then drop blanks.
        if token != prev and token != blank:
            decoded.append(token)
        prev = token  # track the raw previous token, including blanks
    return decoded
import numpy as np
import matplotlib.pyplot as plt
import librosa.display
def mel_to_image(mel_path, sr=22050, hop_length=256, save_fig=True):
    """Render a saved mel spectrogram (.npy, dB scale) as a PNG image.

    Parameters
    ----------
    mel_path : str
        Path to a .npy file containing an (n_mels, T) dB mel spectrogram.
    sr : int, optional
        Sample rate used for the axis labels.
    hop_length : int, optional
        STFT hop size used for the time axis.
    save_fig : bool, optional
        Currently unused; kept for interface compatibility.

    Returns
    -------
    str
        Path of the written PNG, "<mel_path without extension>_mel.png".
    """
    mel_db = np.load(mel_path)
    fig = plt.figure(figsize=(14, 6))
    librosa.display.specshow(mel_db, sr=sr, hop_length=hop_length, x_axis='time', y_axis='mel', cmap='magma')
    plt.title("Mel Spectrogram (dB)")
    plt.colorbar(format="%+2.0f dB")
    # splitext only strips the final extension, unlike str.replace which
    # would also match ".npy" occurring earlier in the path.
    root, _ = os.path.splitext(mel_path)
    save_path = root + '_mel.png'
    plt.savefig(save_path)
    # Close the figure so repeated calls don't accumulate open figures
    # (matplotlib keeps figures alive until explicitly closed).
    plt.close(fig)
    print(f"Saved mel spectrogram image at: {save_path}")
    return save_path
# load reverse lexicon: phoneme_seq -> [words]
import nltk
from collections import defaultdict
# Downloads the CMU Pronouncing Dictionary at import time (network on first run).
nltk.download('cmudict')
# word -> list of pronunciations; each pronunciation is a list of ARPAbet
# symbols, including stress digits (e.g. ['K', 'AE1', 'T']).
arpabet = nltk.corpus.cmudict.dict()
# Build reverse lexicon
# Maps a pronunciation tuple to every word sharing that pronunciation
# (homophones all land in the same list, in dictionary order).
reverse_lex = defaultdict(list)
for word, pron_list in arpabet.items():
    for pron in pron_list:
        reverse_lex[tuple(pron)].append(word)
def split_on_boundaries(phoneme_stream, boundary_token="<w>"):
    """Group a flat phoneme stream into per-word lists.

    Splits at each occurrence of `boundary_token`; empty groups produced by
    leading, trailing, or repeated boundary tokens are dropped.
    """
    words = []
    word = []
    for token in phoneme_stream:
        if token != boundary_token:
            word.append(token)
            continue
        # Hit a boundary: flush the accumulated word, if any.
        if word:
            words.append(word)
        word = []
    # Flush the final word (stream need not end with a boundary token).
    if word:
        words.append(word)
    return words
def p2g_fallback(phoneme_word):
    """Crude last-resort "spelling" for an unknown pronunciation:
    concatenate the raw phoneme symbols and lowercase the result."""
    joined = "".join(phoneme_word)
    return joined.lower()
def phonemes_to_text(phoneme_stream):
    """Best-effort reconstruction of text from a phoneme stream.

    The stream is split into words at "<w>" boundary tokens; each word's
    pronunciation is looked up in the reverse CMUdict lexicon (taking the
    first candidate when several words share a pronunciation), falling back
    to a naive concatenated spelling when the pronunciation is unknown.
    """
    out = []
    for word_phones in split_on_boundaries(phoneme_stream):
        # .get avoids inserting new keys into the defaultdict.
        matches = reverse_lex.get(tuple(word_phones), [])
        out.append(matches[0] if matches else p2g_fallback(word_phones))
    return " ".join(out)