WJ88 committed
Commit 5ab166b · verified · 1 Parent(s): 4318ae4

pure ai refactoring

Files changed (1):
  1. app.py +217 -101
app.py CHANGED
@@ -1,118 +1,234 @@
-import gradio as gr, numpy as np, torch, torchaudio, copy
+
+"""Refactored Gradio app for streaming ASR with NVIDIA NeMo Parakeet-TDT-0.6B-v3.
+
+Functionality preserved. Structure simplified and documented.
+- Buffered streaming on CPU by default (configurable device).
+- Monophonic conversion and resampling to model sample rate.
+- Greedy batched RNNT decoding with label-looping.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Optional, Tuple
+
+import copy
+import numpy as np
+import torch
+import torchaudio
+import gradio as gr
+
 import nemo.collections.asr as nemo_asr
 from omegaconf import OmegaConf
-from dataclasses import dataclass
 from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecodingConfig
-from nemo.collections.asr.parts.utils.rnnt_utils import BatchedHyps, batched_hyps_to_hypotheses
+from nemo.collections.asr.parts.utils.rnnt_utils import batched_hyps_to_hypotheses
 from nemo.collections.asr.parts.utils.streaming_utils import ContextSize, StreamingBatchedAudioBuffer
 
-def _div(a, b): return (a // b) * b
 
+# ----------------------------
+# Config
+# ----------------------------
 @dataclass
-class Cfg:
-    name: str = "nvidia/parakeet-tdt-0.6b-v3"
+class AppConfig:
+    model_name: str = "nvidia/parakeet-tdt-0.6b-v3"
     left_s: float = 10.0
     chunk_s: float = 2.0
     right_s: float = 2.0
-    max_s: float = 40.0
-    batch: int = 1
-    device: str = "cpu"
-
-cfg = Cfg()
-
-m = nemo_asr.models.EncDecRNNTModel.from_pretrained(cfg.name).to(cfg.device).eval()
-for p in m.parameters(): p.requires_grad_(False)
-dec = RNNTDecodingConfig(strategy="greedy_batch", fused_batch_size=-1, compute_timestamps=False)
-dec.greedy.loop_labels = True
-m.change_decoding_strategy(dec)
-dc = m.decoding.decoding.decoding_computer
-
-mc = copy.deepcopy(m.cfg)
-OmegaConf.set_struct(mc.preprocessor, False)
-mc.preprocessor.dither = 0.0
-mc.preprocessor.pad_to = 0
-OmegaConf.set_struct(mc.preprocessor, True)
-
-sr = mc.preprocessor.sample_rate
-ws = mc.preprocessor.window_stride
-fps = 1.0 / ws
-sub = m.encoder.subsampling_factor
-feat_f2a = _div(int(sr * ws), sub)
-enc_f2a = feat_f2a * sub
-
-ctx_enc = ContextSize(
-    left=int(cfg.left_s * fps / sub),
-    chunk=int(cfg.chunk_s * fps / sub),
-    right=int(cfg.right_s * fps / sub),
-)
-ctx_samp = ContextSize(
-    left=ctx_enc.left * sub * feat_f2a,
-    chunk=ctx_enc.chunk * sub * feat_f2a,
-    right=ctx_enc.right * sub * feat_f2a,
-)
-
-max_samples = int(cfg.max_s * sr)
-
-def _mono(x):
-    x = np.asarray(x)
-    if x.ndim == 2:
-        if x.shape[1] == 2: x = x.mean(axis=1)
-        else: x = x.mean(axis=-1)
-    return x.astype(np.float32)
-
-def _resample(x, in_sr):
-    if in_sr == sr: return x
-    return torchaudio.functional.resample(torch.from_numpy(x), in_sr, sr).numpy().astype(np.float32)
-
-def _decode(a_np):
-    with torch.inference_mode():
-        a = torch.from_numpy(a_np).unsqueeze(0).to(torch.float32).to(cfg.device)
-        L = torch.tensor([a.shape[1]], dtype=torch.long, device=cfg.device)
-        cur = None
-        st = None
+    max_buffer_s: float = 40.0
+    batch_size: int = 1
+    device: str = "cpu"  # "cuda" to force GPU if available
+
+
+# ----------------------------
+# Utility
+# ----------------------------
+def _floor_multiple(a: int, b: int) -> int:
+    """Largest multiple of b not exceeding a."""
+    return (a // b) * b
+
+
+# ----------------------------
+# ASR Engine
+# ----------------------------
+class ParakeetStreamer:
+    """Encapsulates model, preprocessor settings, and decoding."""
+
+    def __init__(self, cfg: AppConfig) -> None:
+        self.cfg = cfg
+
+        # Load model
+        self.model = (
+            nemo_asr.models.EncDecRNNTModel.from_pretrained(cfg.model_name)
+            .to(cfg.device)
+            .eval()
+        )
+        for p in self.model.parameters():
+            p.requires_grad_(False)
+
+        # Decoding strategy: greedy-batch with label-looping for batched efficiency
+        dec_cfg = RNNTDecodingConfig(
+            strategy="greedy_batch", fused_batch_size=-1, compute_timestamps=False
+        )
+        dec_cfg.greedy.loop_labels = True
+        self.model.change_decoding_strategy(dec_cfg)
+        self._decoding_computer = self.model.decoding.decoding.decoding_computer
+
+        # Clone and tweak preprocessor to avoid dither and padding during inference
+        mcfg = copy.deepcopy(self.model.cfg)
+        OmegaConf.set_struct(mcfg.preprocessor, False)
+        mcfg.preprocessor.dither = 0.0
+        mcfg.preprocessor.pad_to = 0
+        OmegaConf.set_struct(mcfg.preprocessor, True)
+
+        # Derived constants
+        self.sample_rate: int = int(mcfg.preprocessor.sample_rate)
+        window_stride: float = float(mcfg.preprocessor.window_stride)
+        self.frames_per_second: float = 1.0 / window_stride
+        self.subsampling: int = int(self.model.encoder.subsampling_factor)
+
+        # Feature->audio and encoder->audio subsampling alignment
+        feat_f2a = _floor_multiple(int(self.sample_rate * window_stride), self.subsampling)
+        self.enc_f2a = feat_f2a * self.subsampling
+
+        # Context sizes
+        self.ctx_enc = ContextSize(
+            left=int(cfg.left_s * self.frames_per_second / self.subsampling),
+            chunk=int(cfg.chunk_s * self.frames_per_second / self.subsampling),
+            right=int(cfg.right_s * self.frames_per_second / self.subsampling),
+        )
+        self.ctx_samp = ContextSize(
+            left=self.ctx_enc.left * self.subsampling * feat_f2a,
+            chunk=self.ctx_enc.chunk * self.subsampling * feat_f2a,
+            right=self.ctx_enc.right * self.subsampling * feat_f2a,
+        )
+
+        self.max_samples = int(cfg.max_buffer_s * self.sample_rate)
+
+    # -------- audio helpers --------
+    @staticmethod
+    def _to_mono(x: np.ndarray) -> np.ndarray:
+        """Ensure mono float32 array."""
+        x = np.asarray(x)
+        if x.ndim == 2:
+            # Handle shape (samples, channels) or (channels, samples)
+            x = x.mean(axis=1) if x.shape[1] == 2 else x.mean(axis=-1)
+        return x.astype(np.float32, copy=False)
+
+    def _resample_if_needed(self, x: np.ndarray, in_sr: int) -> np.ndarray:
+        """Resample to model sample rate if required."""
+        if int(in_sr) == self.sample_rate:
+            return x
+        y = torchaudio.functional.resample(
+            torch.from_numpy(x), in_sr, self.sample_rate
+        )
+        return y.numpy().astype(np.float32, copy=False)
+
+    # -------- core decoding --------
+    @torch.inference_mode()
+    def _decode_buffer(self, audio_np: np.ndarray) -> str:
+        """Run buffered streaming decoding over the entire audio buffer."""
+        if audio_np.size == 0:
+            return ""
+
+        a = torch.from_numpy(audio_np).unsqueeze(0).to(torch.float32).to(self.cfg.device)
+        total_len = torch.tensor([a.shape[1]], dtype=torch.long, device=self.cfg.device)
+
+        cur_hyps = None
+        prev_state = None
+
         l = 0
-        r = min(ctx_samp.chunk + ctx_samp.right, a.shape[1])
-        buf = StreamingBatchedAudioBuffer(batch_size=cfg.batch, context_samples=ctx_samp, dtype=a.dtype, device=cfg.device)
-        rest = L.clone()
+        r = min(self.ctx_samp.chunk + self.ctx_samp.right, a.shape[1])
+
+        buf = StreamingBatchedAudioBuffer(
+            batch_size=self.cfg.batch_size,
+            context_samples=self.ctx_samp,
+            dtype=a.dtype,
+            device=self.cfg.device,
+        )
+
+        remaining = total_len.clone()
+
         while l < a.shape[1]:
-            clen = min(r, a.shape[1]) - l
-            last = r >= a.shape[1]
-            last_b = torch.tensor([clen >= rest[0]], dtype=torch.bool, device=cfg.device)
-            clen_b = torch.where(last_b, rest, torch.full_like(rest, fill_value=clen))
-            buf.add_audio_batch_(a[:, l:r], audio_lengths=clen_b, is_last_chunk=last, is_last_chunk_batch=last_b)
-            enc, _ = m(input_signal=buf.samples, input_signal_length=buf.context_size_batch.total())
-            enc = enc.transpose(1, 2)
-            enc_ctx = buf.context_size.subsample(factor=enc_f2a)
-            enc_ctx_b = buf.context_size_batch.subsample(factor=enc_f2a)
-            enc = enc[:, enc_ctx.left:]
-            hyps, _, st = dc(x=enc, out_len=enc_ctx_b.chunk, prev_batched_state=st)
-            if cur is None: cur = hyps
-            else: cur.merge_(hyps)
-            rest -= clen_b
+            clen = int(min(r, a.shape[1]) - l)
+            is_last = r >= a.shape[1]
+
+            is_last_b = torch.tensor([clen >= remaining[0]], dtype=torch.bool, device=self.cfg.device)
+            clen_b = torch.where(is_last_b, remaining, torch.full_like(remaining, fill_value=clen))
+
+            buf.add_audio_batch_(
+                a[:, l:r], audio_lengths=clen_b, is_last_chunk=is_last, is_last_chunk_batch=is_last_b
+            )
+
+            enc, _ = self.model(input_signal=buf.samples, input_signal_length=buf.context_size_batch.total())
+            enc = enc.transpose(1, 2)  # [B, T, C]
+
+            enc_ctx = buf.context_size.subsample(factor=self.enc_f2a)
+            enc_ctx_b = buf.context_size_batch.subsample(factor=self.enc_f2a)
+
+            enc = enc[:, enc_ctx.left:]  # drop left context before decoding
+
+            hyps, _, prev_state = self._decoding_computer(
+                x=enc, out_len=enc_ctx_b.chunk, prev_batched_state=prev_state
+            )
+
+            if cur_hyps is None:
+                cur_hyps = hyps
+            else:
+                cur_hyps.merge_(hyps)
+
+            remaining -= clen_b
             l = r
-            r = min(r + ctx_samp.chunk, a.shape[1])
-        outs = batched_hyps_to_hypotheses(cur, None, batch_size=cfg.batch) if cur is not None else []
-        for h in outs: h.text = m.tokenizer.ids_to_text(h.y_sequence.tolist())
+            r = min(r + self.ctx_samp.chunk, a.shape[1])
+
+        outs = batched_hyps_to_hypotheses(cur_hyps, None, batch_size=self.cfg.batch_size) if cur_hyps is not None else []
+        for h in outs:
+            h.text = self.model.tokenizer.ids_to_text(h.y_sequence.tolist())
+
         return outs[0].text if outs else ""
 
-def transcribe(stream, new_chunk):
-    if new_chunk is None: return stream, ""
-    in_sr, data = new_chunk
-    y = _mono(data)
-    y = _resample(y, int(in_sr))
-    a = y if stream is None or len(stream) == 0 else np.concatenate([stream, y])
-    if len(a) > max_samples: a = a[-max_samples:]
-    text = _decode(a) if a.size else ""
-    return a, text
-
-demo = gr.Interface(
-    fn=transcribe,
-    inputs=[gr.State(), gr.Audio(sources=["microphone"], type="numpy", streaming=True, label="Mic")],
-    outputs=[gr.State(), gr.Textbox(label="Transcript", lines=3)],
-    title="Parakeet-TDT-0.6B-v3 — CPU streaming",
-    description="Multilingual buffered streaming (10-2-2) in memory",
-    live=True,
-)
+    # -------- public API for Gradio --------
+    def transcribe(self, stream: Optional[np.ndarray], new_chunk: Optional[Tuple[int, np.ndarray]]):
+        """Gradio callback. Maintains rolling buffer in `stream` and returns transcript.
+
+        Args:
+            stream: rolling buffer carried in gr.State()
+            new_chunk: tuple (sample_rate, np.ndarray) provided by gr.Audio with type='numpy'
+        """
+        if new_chunk is None:
+            return stream, ""
+
+        in_sr, data = new_chunk
+        y = self._to_mono(data)
+        y = self._resample_if_needed(y, int(in_sr))
+
+        if stream is None or len(stream) == 0:
+            a = y
+        else:
+            a = np.concatenate([stream, y])
+
+        if a.size > self.max_samples:
+            a = a[-self.max_samples:]
+
+        text = self._decode_buffer(a) if a.size else ""
+        return a, text
+
+
+# ----------------------------
+# UI
+# ----------------------------
+def build_demo(cfg: Optional[AppConfig] = None) -> gr.Interface:
+    cfg = cfg or AppConfig()
+    engine = ParakeetStreamer(cfg)
+
+    return gr.Interface(
+        fn=engine.transcribe,
+        inputs=[gr.State(), gr.Audio(sources=["microphone"], type="numpy", streaming=True, label="Mic")],
+        outputs=[gr.State(), gr.Textbox(label="Transcript", lines=3)],
+        title="Parakeet-TDT-0.6B-v3 — CPU streaming",
+        description="Multilingual buffered streaming (10-2-2) in memory",
+        live=True,
+    )
+
 
 if __name__ == "__main__":
-    demo.launch()
+    demo = build_demo()
+    demo.launch()
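
The 10-2-2 context arithmetic in ParakeetStreamer.__init__ is easier to sanity-check with concrete numbers. A minimal sketch, assuming the usual Parakeet front end (16 kHz sample rate, 10 ms window stride, 8x encoder subsampling); the script reads these values from the model config at runtime, so they are illustrative assumptions, not guarantees:

sample_rate = 16_000   # assumed preprocessor.sample_rate
window_stride = 0.01   # assumed preprocessor.window_stride (10 ms)
subsampling = 8        # assumed encoder.subsampling_factor

fps = 1.0 / window_stride                                                   # 100 feature frames per second
feat_f2a = (int(sample_rate * window_stride) // subsampling) * subsampling  # 160 samples per feature frame
enc_f2a = feat_f2a * subsampling                                            # 1280 samples per encoder frame

# Encoder-frame context for left=10 s, chunk=2 s, right=2 s:
ctx_enc = {k: int(s * fps / subsampling) for k, s in
           {"left": 10.0, "chunk": 2.0, "right": 2.0}.items()}
assert ctx_enc == {"left": 125, "chunk": 25, "right": 25}

# Mapped back to raw audio samples: exactly 10 s / 2 s / 2 s at 16 kHz.
ctx_samp = {k: v * subsampling * feat_f2a for k, v in ctx_enc.items()}
assert ctx_samp == {"left": 160_000, "chunk": 32_000, "right": 32_000}

The floor-to-multiple step keeps every context boundary aligned to whole encoder frames, which is why the sample-level sizes land exactly on the configured durations.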
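
The l/r pointer walk in _decode_buffer is unchanged in behavior from the old _decode: the first window spans chunk+right samples so the first decoded chunk already sees its right context, and each later window advances l to the previous r while left context is carried inside StreamingBatchedAudioBuffer itself. A pure-Python sketch of just the pointer arithmetic, with hypothetical sizes:

def chunk_windows(total: int, chunk: int, right: int):
    """Yield the (l, r) sample windows the decode loop visits."""
    l, r = 0, min(chunk + right, total)
    while l < total:
        yield l, r
        l, r = r, min(r + chunk, total)

# With the 2 s chunk and 2 s right context at 16 kHz (32000 samples each):
print(list(chunk_windows(total=100_000, chunk=32_000, right=32_000)))
# [(0, 64000), (64000, 96000), (96000, 100000)]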
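
Because the refactor wraps everything in a class, the Gradio callback can also be exercised offline. A minimal sketch, assuming this module is saved as app.py and using soundfile (not a dependency of the app itself) to load a hypothetical sample.wav:

import soundfile as sf  # assumption: any loader returning (float32 samples, sr) works

from app import AppConfig, ParakeetStreamer

engine = ParakeetStreamer(AppConfig())
audio, sr = sf.read("sample.wav", dtype="float32")

stream, text = None, ""
step = sr  # feed 1 s chunks to mimic gr.Audio streaming callbacks
for off in range(0, len(audio), step):
    stream, text = engine.transcribe(stream, (sr, audio[off:off + step]))
print(text)

Note that the rolling buffer is capped at max_buffer_s (40 s), so each callback re-decodes at most the last 40 s of audio and earlier words eventually scroll out of the transcript.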