# utils.py
import numpy as np
import librosa
from matplotlib import pyplot as plt
SR = 22050
HOP_LENGTH = 256

def mel_to_audio(mel_db, sr=SR, n_fft=1024, hop_length=HOP_LENGTH, win_length=1024, n_iter=60):
    """Invert a dB mel spectrogram of shape (n_mels, T), as saved by preprocess, back to audio."""
    S = librosa.db_to_power(mel_db)
    # Invert mel to linear spectrogram via the pseudo-inverse of the mel filter bank.
    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=S.shape[0])
    inv_mel = np.maximum(1e-10, np.linalg.pinv(mel_basis).dot(S))
    # Griffin-Lim recovers phase from the magnitude spectrogram.
    audio = librosa.griffinlim(inv_mel, n_iter=n_iter, hop_length=hop_length, win_length=win_length)
    return audio
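# The pseudo-inverse step above can be sanity-checked with plain NumPy and no
# audio at all; the matrices below are random stand-ins for the mel filter
# bank and spectrogram, not real data.

```python
import numpy as np

rng = np.random.default_rng(0)
mel_basis = rng.random((80, 513))   # stand-in for librosa.filters.mel output
S_linear = rng.random((513, 10))    # stand-in linear magnitude spectrogram
S_mel = mel_basis @ S_linear        # forward mel projection: (80, 10)

# Approximate inversion via the Moore-Penrose pseudo-inverse, floored at
# 1e-10 exactly as mel_to_audio does.
S_back = np.maximum(1e-10, np.linalg.pinv(mel_basis) @ S_mel)
print(S_back.shape)  # (513, 10): back in linear-frequency bins
```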
import nltk
nltk.download('averaged_perceptron_tagger_eng')
from g2p_en import G2p
g2p = G2p()

def text_to_phonemes(text):
    ph = g2p(text)
    # Keep real phoneme tokens and drop the space/punctuation tokens that
    # g2p_en emits. Stressed vowels such as 'AH0' contain a digit, so a
    # plain p.isalpha() test would wrongly discard them; instead keep any
    # token containing at least one alphabetic character.
    ph = [p for p in ph if any(c.isalpha() for c in p)]
    return " ".join(ph)
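# A hand-written stand-in for g2p_en-style output (phonemes interleaved with
# space and punctuation tokens). Note that stressed vowels such as 'AH0'
# contain a digit, so a token filter must not require the whole token to be
# alphabetic.

```python
tokens = ['HH', 'AH0', 'L', 'OW1', ',', ' ', 'W', 'ER1', 'L', 'D']
phonemes = [t for t in tokens if any(c.isalpha() for c in t)]
print(" ".join(phonemes))  # HH AH0 L OW1 W ER1 L D
```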

import os

def audio_to_mel(audio_path, save_dir="mels", sr=22050, n_fft=1024, hop_length=256, win_length=1024, n_mels=80):
    # Load audio
    y, _ = librosa.load(audio_path, sr=sr)
    
    # Compute STFT magnitude
    S = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length, win_length=win_length))
    
    # Convert to mel spectrogram
    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
    mel = np.dot(mel_basis, S)
    
    # Convert to dB
    mel_db = librosa.power_to_db(mel)
    
    # Make sure save directory exists
    os.makedirs(save_dir, exist_ok=True)
    
    # Save mel as .npy file
    base_name = os.path.splitext(os.path.basename(audio_path))[0]
    mel_path = os.path.join(save_dir, base_name + "_mel.npy")
    np.save(mel_path, mel_db)
    
    return mel_path
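# Shape walk-through of the STFT -> mel -> dB pipeline above, using random
# stand-in arrays so no audio file is needed. The dB step mirrors
# librosa.power_to_db with ref=1.0, ignoring its top_db clipping.

```python
import numpy as np

rng = np.random.default_rng(1)
n_fft, n_mels, frames = 1024, 80, 100
S = rng.random((n_fft // 2 + 1, frames))          # stand-in |STFT|: (513, 100)
mel_basis = rng.random((n_mels, n_fft // 2 + 1))  # stand-in mel filter bank: (80, 513)
mel = mel_basis @ S                               # mel spectrogram: (80, 100)
mel_db = 10.0 * np.log10(np.maximum(mel, 1e-10))  # power -> dB
print(mel_db.shape)  # (80, 100)
```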


def ctc_post_process(phonemes):
    """
    Collapse repeated tokens, then remove blanks ('-') in CTC output.
    phonemes: list of predicted phoneme tokens.
    """
    new_seq = []
    prev = None
    for p in phonemes:
        if p != prev and p != "-":
            new_seq.append(p)
        # Update prev on blanks too: in 'X - X' the blank separates two
        # genuine X's, so both must survive the collapse.
        prev = p
    return new_seq
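# For reference, the standard CTC decode rule (merge runs of identical
# tokens first, then drop blanks) can be written with itertools.groupby;
# 'HH - HH' decodes to two HH's because the blank separates the runs.

```python
from itertools import groupby

def ctc_greedy_collapse(tokens, blank="-"):
    # Merge runs of identical tokens, then drop the blank symbol.
    return [t for t, _ in groupby(tokens) if t != blank]

print(ctc_greedy_collapse(["HH", "HH", "-", "HH", "AH0", "AH0"]))
# ['HH', 'HH', 'AH0']
```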


import librosa.display

def mel_to_image(mel_path, sr=SR, hop_length=HOP_LENGTH, save_fig=True):
    # Load mel spectrogram from .npy
    mel_db = np.load(mel_path)

    plt.figure(figsize=(14, 6))
    librosa.display.specshow(mel_db, sr=sr, hop_length=hop_length,
                             x_axis='time', y_axis='mel', cmap='magma')
    plt.title("Mel Spectrogram (dB)")
    plt.colorbar(format="%+2.0f dB")

    save_path = None
    if save_fig:
        # '..._mel.npy' -> '..._mel.png' (avoid doubling the '_mel' suffix).
        save_path = mel_path.replace('.npy', '.png')
        plt.savefig(save_path)
        print(f"Saved mel spectrogram image at: {save_path}")
    plt.close()
    return save_path
# Load reverse lexicon: phoneme sequence -> [words]
import nltk
from collections import defaultdict

nltk.download('cmudict')
arpabet = nltk.corpus.cmudict.dict()

# Build reverse lexicon. Keys keep CMUdict stress digits (e.g. 'AH0'),
# which matches the stressed phonemes g2p_en produces.
reverse_lex = defaultdict(list)
for word, pron_list in arpabet.items():
    for pron in pron_list:
        reverse_lex[tuple(pron)].append(word)

def split_on_boundaries(phoneme_stream, boundary_token="<w>"):
    """Split on a special token representing word boundaries."""
    words = []
    current = []
    for phon in phoneme_stream:
        if phon == boundary_token:
            if current:
                words.append(current)
                current = []
        else:
            current.append(phon)
    if current:
        words.append(current)
    return words
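# For streams with no leading, trailing, or doubled boundary tokens, the
# same split can be done with plain string operations; the token list is a
# hand-written example.

```python
stream = ["HH", "AH0", "L", "OW1", "<w>", "W", "ER1", "L", "D"]
words = [w.split() for w in " ".join(stream).split(" <w> ")]
print(words)  # [['HH', 'AH0', 'L', 'OW1'], ['W', 'ER1', 'L', 'D']]
```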

def p2g_fallback(phoneme_word):
    # Crude placeholder for pronunciation-to-spelling: strip stress digits
    # and join, e.g. ['HH', 'AH0'] -> 'hhah'.
    return "".join(p.rstrip("012") for p in phoneme_word).lower()

def phonemes_to_text(phoneme_stream):
    words = []
    for phoneme_word in split_on_boundaries(phoneme_stream):
        candidates = reverse_lex.get(tuple(phoneme_word), [])
        if candidates:
            # Homophones are ambiguous here; without a language model we
            # simply take the first CMUdict candidate.
            words.append(candidates[0])
        else:
            words.append(p2g_fallback(phoneme_word))
    return " ".join(words)
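# End-to-end sketch of the reverse lookup using a two-word toy lexicon in
# place of CMUdict, so it runs without the nltk download; the pronunciations
# are the usual CMUdict entries for these two words.

```python
from collections import defaultdict

toy_dict = {
    "hello": [["HH", "AH0", "L", "OW1"]],
    "world": [["W", "ER1", "L", "D"]],
}
rev = defaultdict(list)
for word, prons in toy_dict.items():
    for pron in prons:
        rev[tuple(pron)].append(word)

stream = ["HH", "AH0", "L", "OW1", "<w>", "W", "ER1", "L", "D"]
# Split on the boundary token (a trailing sentinel flushes the last word),
# then look each phoneme word up, falling back to '?' when unknown.
words, current = [], []
for p in stream + ["<w>"]:
    if p == "<w>":
        if current:
            words.append(rev.get(tuple(current), ["?"])[0])
            current = []
    else:
        current.append(p)
print(" ".join(words))  # hello world
```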