hash-map committed on
Commit 4ffa9fc · verified · 1 Parent(s): 429e785

Upload 6 files

Files changed (6)
  1. infer.py +153 -0
  2. last_checkpoint.pt +3 -0
  3. model.py +83 -0
  4. phoneme_to_id.json +79 -0
  5. requirements.txt +11 -0
  6. utils.py +133 -0
infer.py ADDED
@@ -0,0 +1,153 @@
+ import gradio as gr
+ import torch
+ import json
+ import numpy as np
+ import os
+ from datetime import datetime
+ from model import Image2Phoneme
+ from utils import ctc_post_process, audio_to_mel, mel_to_image, text_to_phonemes
+ import soundfile as sf
+ import shutil
+ import pronouncing
+ import time
+
+ # Configuration
+ DEVICE = torch.device("cpu")
+ PHMAP = "phoneme_to_id.json"
+ AUDIO_DIR = "audio_inputs"
+
+ # Ensure audio directory exists
+ os.makedirs(AUDIO_DIR, exist_ok=True)
+
+ # Load phoneme vocabulary
+ try:
+     vocab = json.load(open(PHMAP, "r"))
+     id_to_ph = {v: k for k, v in vocab.items()}
+ except FileNotFoundError:
+     raise FileNotFoundError(f"Phoneme mapping file not found at {PHMAP}")
+
+ # Build model
+ vocab_size = max(vocab.values()) + 1
+ model = Image2Phoneme(vocab_size=vocab_size).to(DEVICE)
+ try:
+     ckpt = torch.load("last_checkpoint.pt", map_location=DEVICE, weights_only=True)
+     model.load_state_dict(ckpt["model_state_dict"])
+     model.eval()
+ except FileNotFoundError:
+     raise FileNotFoundError("Checkpoint file not found at last_checkpoint.pt")
+
+ def process_audio(audio_input):
+     """Process audio to predict phonemes and display the mel spectrogram."""
+     try:
+         print(f"Received audio_input before processing: {audio_input}")
+         # Generate unique filename based on timestamp
+         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+         audio_path = os.path.join(AUDIO_DIR, f"input_{timestamp}.wav")
+
+         # Handle audio input
+         if audio_input is None:
+             print("Audio input is None after stopping recording")
+             return {"error": "No audio input provided"}, None, None, None
+
+         if isinstance(audio_input, str):
+             # File upload: copy the uploaded file to audio_inputs/
+             print(f"Processing uploaded file: {audio_input}")
+             if not os.path.exists(audio_input):
+                 return {"error": f"Uploaded file not found: {audio_input}"}, None, None, None
+             if audio_input.endswith(".mp3"):
+                 print("Converting .mp3 to .wav")
+                 from pydub import AudioSegment
+                 audio = AudioSegment.from_mp3(audio_input)
+                 audio_path = audio_path.replace(".wav", "_converted.wav")
+                 audio.export(audio_path, format="wav")
+                 print(f"Converted file saved to: {audio_path}")
+             else:
+                 shutil.copy(audio_input, audio_path)
+                 print(f"Copied file to: {audio_path}")
+         else:
+             # Microphone input: (sample_rate, audio_data)
+             print("Processing microphone input")
+             sample_rate, audio_data = audio_input
+             print(f"Sample rate: {sample_rate}, Audio data shape: {audio_data.shape if hasattr(audio_data, 'shape') else 'None'}")
+             if audio_data is None or len(audio_data) == 0:
+                 print("Microphone audio data is empty or invalid")
+                 return {"error": "Microphone input data is empty or invalid"}, None, None, None
+             # Add a small delay to ensure audio data is fully captured
+             time.sleep(1)
+             sf.write(audio_path, audio_data, sample_rate)
+             print(f"Saved microphone audio to: {audio_path}")
+             # Verify the file exists
+             if not os.path.exists(audio_path):
+                 print(f"Failed to save audio file at: {audio_path}")
+                 return {"error": "Failed to save recorded audio file"}, None, None, None
+
+         # Process audio to mel spectrogram
+         mel_path = audio_to_mel(audio_path)
+         print(f"Generated mel spectrogram: {mel_path}")
+         if not os.path.exists(mel_path):
+             return {"error": f"Mel spectrogram file not found: {mel_path}"}, None, None, None
+
+         mel_image_path = mel_to_image(mel_path)
+         print(f"Generated mel spectrogram image: {mel_image_path}")
+         if not os.path.exists(mel_image_path):
+             return {"error": f"Mel spectrogram image not found: {mel_image_path}"}, None, None, None
+
+         # Load mel spectrogram
+         mel = np.load(mel_path)  # shape (n_mels, T)
+         print(f"Loaded mel spectrogram shape: {mel.shape}")
+         mel_tensor = torch.tensor(mel).unsqueeze(0).to(DEVICE)  # add batch dim
+         mel_lens = torch.tensor([mel.shape[1]]).to(DEVICE)
+
+         # Predict phonemes
+         with torch.no_grad():
+             ph_pred = model(mel_tensor)  # shape (B, seq_len, vocab_size)
+         ph_ids = ph_pred.argmax(-1)[0].cpu().numpy()  # pick first batch
+         print(f"Predicted phoneme IDs: {ph_ids}")
+
+         # Convert IDs to phonemes
+         ph_seq = [id_to_ph[i] for i in ph_ids if i > 0]
+         print(f"Raw phonemes: {ph_seq}")
+
+         # Post-process phonemes
+         post_processed = ctc_post_process(ph_seq)
+         print(f"Post-processed phonemes: {post_processed}")
+
+         # Return results
+         return {
+             "audio_path": audio_path,
+             "phonemes": " ".join(ph_seq),
+             "post_processed_phonemes": " ".join(post_processed)
+         }, mel_image_path, " ".join(ph_seq), " ".join(post_processed)
+     except Exception as e:
+         print(f"Error in process_audio: {str(e)}")
+         return {"error": f"Processing failed: {str(e)}"}, None, None, None
+
+ # Gradio interface
+ with gr.Blocks() as iface:
+     gr.Markdown("# Speech to Phonemes Converter")
+     gr.Markdown("Record or upload audio to predict phonemes and display the mel spectrogram. Paste input text if available.")
+
+     audio_input = gr.Audio(sources=["upload"], type="filepath", label="Upload Audio (.wav or .mp3)", interactive=True)
+     text_input = gr.Textbox(label="Enter Text", placeholder="Type a sentence to convert to phonemes")
+     process_button = gr.Button("Process")
+
+     audio_output = gr.JSON(label="Audio Processing Results (Audio Path, Phonemes, Post-Processed Phonemes)")
+     mel_image = gr.Image(label="Mel Spectrogram", type="filepath")
+     raw_phonemes = gr.Textbox(label="Raw Phonemes")
+     post_processed_phonemes = gr.Textbox(label="Post-Processed Phonemes")
+     text_output = gr.JSON(label="Text-to-Phoneme Results")
+
+     def process(audio_input, text_input):
+         print(f"Processing inputs - Audio: {audio_input}, Text: {text_input}")
+         audio_result, mel_image_path, raw_ph, post_ph = process_audio(audio_input) if audio_input else ({}, None, None, None)
+         text_result = text_to_phonemes(text_input) if text_input else {}
+         return audio_result, mel_image_path, raw_ph, post_ph, text_result
+
+     process_button.click(
+         fn=process,
+         inputs=[audio_input, text_input],
+         outputs=[audio_output, mel_image, raw_phonemes, post_processed_phonemes, text_output]
+     )
+
+ if __name__ == "__main__":
+     iface.launch(debug=True)
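
A quick way to exercise this inference path without launching the Gradio UI is to call process_audio directly. The sketch below is illustrative only and is not part of the commit: it assumes it is run from the repository root (so phoneme_to_id.json and last_checkpoint.pt resolve) and that a recording exists at the hypothetical path sample.wav.

    # sketch only: sample.wav is a hypothetical local recording
    from infer import process_audio

    result, mel_image_path, raw_ph, post_ph = process_audio("sample.wav")
    print(result)           # dict with audio_path, phonemes, post_processed_phonemes
    print(mel_image_path)   # path of the saved mel-spectrogram .png under mels/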
last_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:94001abf3c674de3828f3aaf00ffea7964c8a85ee1942988463d1244cc33e978
+ size 13410944
model.py ADDED
@@ -0,0 +1,83 @@
+ # model_img2ph.py
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+
+ class CNNEncoder(nn.Module):
+     def __init__(self, in_channels=1, hidden_dim=256, dropout=0.2):
+         super().__init__()
+         # Convolutions mostly reduce frequency dimension, not time
+         self.conv = nn.Sequential(
+             nn.Conv2d(in_channels, 64, kernel_size=3, stride=(2,1), padding=1),
+             nn.BatchNorm2d(64),
+             nn.ReLU(),
+             nn.Dropout(dropout),
+
+             nn.Conv2d(64, 128, kernel_size=3, stride=(2,1), padding=1),
+             nn.BatchNorm2d(128),
+             nn.ReLU(),
+             nn.Dropout(dropout),
+
+             nn.Conv2d(128, 256, kernel_size=3, stride=(2,1), padding=1),
+             nn.BatchNorm2d(256),
+             nn.ReLU(),
+             nn.Dropout(dropout),
+
+             nn.Conv2d(256, hidden_dim, kernel_size=3, stride=(2,1), padding=1),
+             nn.BatchNorm2d(hidden_dim),
+             nn.ReLU(),
+             nn.Dropout(dropout),
+         )
+
+     def forward(self, x):
+         # x: (B, n_mels, T)
+         x = x.unsqueeze(1)  # (B, 1, n_mels, T)
+         feat = self.conv(x)  # (B, C, H', T)
+         B, C, H, T = feat.size()
+         # collapse frequency into features, keep time intact
+         feat = feat.permute(0, 3, 1, 2).contiguous()  # (B, T, C, H)
+         feat = feat.view(B, T, C * H)  # (B, T, features)
+         return feat
+
+
+ class PhonemeDecoder(nn.Module):
+     def __init__(self, vocab_size, enc_dim=128*5, rnn_hidden=128, num_layers=2, dropout=0.3):
+         super().__init__()
+         self.rnn = nn.GRU(
+             enc_dim, rnn_hidden,
+             num_layers=num_layers,
+             batch_first=True,
+             dropout=dropout,
+             bidirectional=False  # Changed to unidirectional
+         )
+         self.proj = nn.Linear(rnn_hidden, 256)  # Single projection layer
+         self.norm = nn.LayerNorm(256)  # Added LayerNorm
+         self.fc_out = nn.Linear(256, vocab_size)
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, enc_out):
+         rnn_out, _ = self.rnn(enc_out)  # (B, T, rnn_hidden)
+         dense_out = self.proj(rnn_out)  # (B, T, 256)
+         dense_out = self.norm(dense_out)  # Normalize
+         dense_out = F.relu(dense_out)  # Activation
+         dense_out = self.dropout(dense_out)  # Dropout after activation
+         logits = self.fc_out(dense_out)  # (B, T, vocab_size)
+         return logits
+
+
+
+
+ class Image2Phoneme(nn.Module):
+     def __init__(self, vocab_size, in_channels=1, enc_hidden=128, rnn_hidden=128):
+         super().__init__()
+         self.encoder = CNNEncoder(in_channels=in_channels, hidden_dim=enc_hidden)
+         # enc_dim = enc_hidden * H'; after the convs H' = 5 (input mel = 80, stride (2,1) applied 4 times -> 80/16 = 5)
+         enc_dim = enc_hidden * 5
+         self.decoder = PhonemeDecoder(vocab_size, enc_dim=enc_dim, rnn_hidden=rnn_hidden)
+
+     def forward(self, mels):
+         # mels: (B, n_mels, T)
+         enc_out = self.encoder(mels)  # (B, T, enc_dim)
+         logits = self.decoder(enc_out)  # (B, T, vocab_size)
+         return logits
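
The enc_dim comment above can be sanity-checked with a dummy forward pass. The sketch below is only an illustrative shape check, not part of the commit; it assumes the default constructor arguments and a vocabulary of 77 ids (the maximum id in phoneme_to_id.json plus one).

    import torch
    from model import Image2Phoneme

    model = Image2Phoneme(vocab_size=77)   # 77 = max id in phoneme_to_id.json + 1
    mels = torch.randn(2, 80, 200)         # (B, n_mels, T): 80 mel bins, 200 frames
    logits = model(mels)
    print(logits.shape)                    # expected torch.Size([2, 200, 77]): time preserved, 80 -> 5 in frequency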
phoneme_to_id.json ADDED
@@ -0,0 +1,79 @@
+ {
+ "!": 1,
+ "'": 2,
+ ",": 3,
+ "-": 4,
+ ".": 5,
+ "..": 6,
+ "?": 7,
+ "AA0": 8,
+ "AA1": 9,
+ "AA2": 10,
+ "AE0": 11,
+ "AE1": 12,
+ "AE2": 13,
+ "AH0": 14,
+ "AH1": 15,
+ "AH2": 16,
+ "AO0": 17,
+ "AO1": 18,
+ "AO2": 19,
+ "AW0": 20,
+ "AW1": 21,
+ "AW2": 22,
+ "AY0": 23,
+ "AY1": 24,
+ "AY2": 25,
+ "B": 26,
+ "CH": 27,
+ "D": 28,
+ "DH": 29,
+ "EH0": 30,
+ "EH1": 31,
+ "EH2": 32,
+ "ER0": 33,
+ "ER1": 34,
+ "ER2": 35,
+ "EY0": 36,
+ "EY1": 37,
+ "EY2": 38,
+ "F": 39,
+ "G": 40,
+ "HH": 41,
+ "IH0": 42,
+ "IH1": 43,
+ "IH2": 44,
+ "IY0": 45,
+ "IY1": 46,
+ "IY2": 47,
+ "JH": 48,
+ "K": 49,
+ "L": 50,
+ "M": 51,
+ "N": 52,
+ "NG": 53,
+ "OW0": 54,
+ "OW1": 55,
+ "OW2": 56,
+ "OY0": 57,
+ "OY1": 58,
+ "OY2": 59,
+ "P": 60,
+ "R": 61,
+ "S": 62,
+ "SH": 63,
+ "T": 64,
+ "TH": 65,
+ "UH0": 66,
+ "UH1": 67,
+ "UH2": 68,
+ "UW0": 69,
+ "UW1": 70,
+ "UW2": 71,
+ "V": 72,
+ "W": 73,
+ "Y": 74,
+ "Z": 75,
+ "ZH": 76,
+ "<PAD>": 0
+ }
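
The mapping reserves id 0 for <PAD> and assigns ids 1-76 to punctuation marks and stress-marked ARPAbet phonemes, 77 ids in total. Inverting it for decoding (as infer.py does) is a one-liner; a small illustrative snippet, assuming the file sits in the working directory:

    import json

    with open("phoneme_to_id.json") as f:
        vocab = json.load(f)

    id_to_ph = {v: k for k, v in vocab.items()}
    print(max(vocab.values()) + 1)   # 77, used as the model's vocab_size
    print(id_to_ph[9])               # "AA1"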
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ torch>=2.0
+ torchaudio
+ librosa
+ numpy
+ pandas
+ g2p-en
+ soundfile
+ tqdm
+ nltk
+ pronouncing
+ gradio
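
Installing with pip install -r requirements.txt should cover most of the code above; note, though, that matplotlib (imported by utils.py) and pydub (imported by infer.py for .mp3 conversion) are not listed here and may need to be installed separately.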
utils.py ADDED
@@ -0,0 +1,133 @@
+ # utils.py
+ import numpy as np
+ import librosa
+ from matplotlib import pyplot as plt
+ SR = 22050
+ HOP_LENGTH = 256
+ def mel_to_audio(mel_db, sr=22050, n_fft=1024, hop_length=256, win_length=1024, n_iter=60):
+     # mel_db: (n_mels, T) in dB (as saved by preprocessing)
+     S = librosa.db_to_power(mel_db)
+     # invert mel to linear spectrogram
+     mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=S.shape[0])
+     # Approximate inverse using the pseudo-inverse
+     inv_mel = np.maximum(1e-10, np.linalg.pinv(mel_basis).dot(S))
+     # Griffin-Lim
+     audio = librosa.griffinlim(inv_mel, n_iter=n_iter, hop_length=hop_length, win_length=win_length)
+     return audio
+
+ from g2p_en import G2p
+ g2p = G2p()
+
+ def text_to_phonemes(text):
+     ph = g2p(text)
+     # Remove spaces/punct tokens produced by g2p_en (keep stress-marked phonemes like "AH0")
+     ph = [p for p in ph if p and p[0].isalpha()]
+     return " ".join(ph)
+
+ import librosa
+ import numpy as np
+ import os
+
+ def audio_to_mel(audio_path, save_dir="mels", sr=22050, n_fft=1024, hop_length=256, win_length=1024, n_mels=80):
+     # Load audio
+     y, _ = librosa.load(audio_path, sr=sr)
+
+     # Compute STFT magnitude
+     S = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length, win_length=win_length))
+
+     # Convert to mel spectrogram
+     mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
+     mel = np.dot(mel_basis, S)
+
+     # Convert to dB
+     mel_db = librosa.power_to_db(mel)
+
+     # Make sure save directory exists
+     os.makedirs(save_dir, exist_ok=True)
+
+     # Save mel as .npy file
+     base_name = os.path.splitext(os.path.basename(audio_path))[0]
+     mel_path = os.path.join(save_dir, base_name + "_mel.npy")
+     np.save(mel_path, mel_db)
+
+     return mel_path
+
+
+ def ctc_post_process(phonemes):
+     """
+     Collapse repeats + remove blanks ('-') in CTC output.
+     phonemes: list of predicted phoneme tokens
+     """
+     new_seq = []
+     prev = None
+     for p in phonemes:
+         if p == "-" or p == prev:
+             continue
+         new_seq.append(p)
+         prev = p
+     return new_seq
+
+
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import librosa.display
+
+ def mel_to_image(mel_path, sr=22050, hop_length=256, save_fig=True):
+     # Load mel spectrogram from .npy
+     mel_db = np.load(mel_path)
+
+     # Create figure
+     plt.figure(figsize=(14, 6))
+
+     # Plot mel spectrogram
+     librosa.display.specshow(mel_db, sr=sr, hop_length=hop_length, x_axis='time', y_axis='mel', cmap='magma')
+     plt.title("Mel Spectrogram (dB)")
+     plt.colorbar(format="%+2.0f dB")
+
+     save_path = mel_path.replace('.npy', '_mel.png')
+     plt.savefig(save_path)
+     print(f"Saved mel spectrogram image at: {save_path}")
+
+     # plt.show()
+     return save_path
+ # load reverse lexicon: phoneme_seq -> [words]
+ import nltk
+ from collections import defaultdict
+
+ nltk.download('cmudict')
+ arpabet = nltk.corpus.cmudict.dict()
+
+ # Build reverse lexicon
+ reverse_lex = defaultdict(list)
+ for word, pron_list in arpabet.items():
+     for pron in pron_list:
+         reverse_lex[tuple(pron)].append(word)
+
+ def split_on_boundaries(phoneme_stream, boundary_token="<w>"):
+     """Split on a special token representing word boundaries."""
+     words = []
+     current = []
+     for phon in phoneme_stream:
+         if phon == boundary_token:
+             if current:
+                 words.append(current)
+             current = []
+         else:
+             current.append(phon)
+     if current:
+         words.append(current)
+     return words
+
+ def p2g_fallback(phoneme_word):
+     # Placeholder for fallback pronunciation-to-spelling
+     return "".join(phoneme_word).lower()
+
+ def phonemes_to_text(phoneme_stream):
+     words = []
+     for phoneme_word in split_on_boundaries(phoneme_stream):
+         candidates = reverse_lex.get(tuple(phoneme_word), [])
+         if candidates:
+             words.append(candidates[0])
+         else:
+             words.append(p2g_fallback(phoneme_word))
+     return " ".join(words)
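
To illustrate how ctc_post_process and phonemes_to_text compose, here is a small hypothetical example, not part of the commit. It assumes '-' is the CTC blank and '<w>' marks word boundaries, as the helpers above expect; the exact word returned for a pronunciation depends on which cmudict entry was inserted into the reverse lexicon first.

    from utils import ctc_post_process, phonemes_to_text

    # hypothetical frame-level decode with repeats and blanks
    decoded = ["HH", "HH", "-", "AH0", "L", "L", "OW1", "<w>", "W", "ER1", "-", "L", "D"]

    collapsed = ctc_post_process(decoded)
    print(collapsed)   # ['HH', 'AH0', 'L', 'OW1', '<w>', 'W', 'ER1', 'L', 'D']

    # looks up each word's phoneme tuple in the reverse cmudict lexicon
    print(phonemes_to_text(collapsed))   # e.g. "hello world" (homophones may differ)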