Spaces:

intelli-zen
/

voice_activity_detection

Runtime error

App Files Files Community

qgyd2021 committed on Jan 29, 2024

Commit

fdbda89

1 Parent(s): acb6654

update

Browse files

Files changed (12) hide show

.gitattributes +3 -0
.gitignore +15 -0
Dockerfile +32 -0
README.md +4 -7
examples/webrtcvad/vad.py +173 -0
main.py +135 -0
project_settings.py +16 -0
requirements.txt +6 -0
toolbox/__init__.py +6 -0
toolbox/webrtcvad/__init__.py +6 -0
toolbox/webrtcvad/vad.py +233 -0
webrtcvad_examples.json +8 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.xlsx filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.xlsx filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,15 @@

+.git/
+.idea/
+data/
+pretrained_models/
+temp/
+**/cache/
+**/__pycache__/
+**/*.env
+**/*.mp3
+**/*.png
+**/*.xlsx

Dockerfile ADDED Viewed

	@@ -0,0 +1,32 @@

+# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+# you will also find guides on how best to write your Dockerfile
+FROM python:3.8
+WORKDIR /code
+# Install system packages while still root: apt-get cannot run as the
+# unprivileged user, and the package index must be refreshed before installing.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends git \
+    && rm -rf /var/lib/apt/lists/*
+COPY ./requirements.txt /code/requirements.txt
+RUN pip install --upgrade pip
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+# Set up a new user named "user" with user ID 1000
+RUN useradd -m -u 1000 user
+# Switch to the "user" user
+USER user
+# Set home to the user's home directory
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+# Make Gradio listen on all interfaces so the app is reachable from outside the container
+ENV GRADIO_SERVER_NAME=0.0.0.0
+# Set the working directory to the user's home directory
+WORKDIR $HOME/app
+# Copy the current directory contents into the container at $HOME/app setting the owner to the user
+COPY --chown=user . $HOME/app
+CMD ["python", "main.py"]

README.md CHANGED Viewed

@@ -1,13 +1,10 @@
 ---
 title: Voice Activity Detection
-emoji: 🐢
-colorFrom: yellow
-colorTo: indigo
-sdk: gradio
-sdk_version: 4.16.0
-app_file: app.py
 pinned: false
-license: apache-2.0
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Voice Activity Detection
+emoji: 🌍
+colorFrom: purple
+colorTo: gray
+sdk: docker
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

examples/webrtcvad/vad.py ADDED Viewed

	@@ -0,0 +1,173 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import argparse
+import collections
+import contextlib
+import matplotlib.pyplot as plt
+import numpy as np
+from scipy.io import wavfile
+import wave
+import webrtcvad
+from project_settings import project_path
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--wav_file",
+        default=(project_path / "data/3300999628164249998.wav").as_posix(),
+        type=str,
+    )
+    parser.add_argument(
+        "--agg",
+        default=3,
+        type=int,
+        help="The level of aggressiveness of the VAD: [0-3]'"
+    )
+    parser.add_argument(
+        "--frame_duration_ms",
+        default=30,
+        type=int,
+    )
+    parser.add_argument(
+        "--silence_duration_threshold",
+        default=0.3,
+        type=float,
+        help="minimum silence duration, in seconds."
+    )
+    args = parser.parse_args()
+    return args
+def read_wave(path):
+    with contextlib.closing(wave.open(path, 'rb')) as wf:
+        num_channels = wf.getnchannels()
+        assert num_channels == 1
+        sample_width = wf.getsampwidth()
+        assert sample_width == 2
+        sample_rate = wf.getframerate()
+        assert sample_rate in (8000, 16000, 32000, 48000)
+        pcm_data = wf.readframes(wf.getnframes())
+        return pcm_data, sample_rate
+class Frame(object):
+    def __init__(self, audio_bytes, timestamp, duration):
+        self.audio_bytes = audio_bytes
+        self.timestamp = timestamp
+        self.duration = duration
+def frame_generator(frame_duration_ms, audio, sample_rate):
+    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
+    offset = 0
+    timestamp = 0.0
+    duration = (float(n) / sample_rate) / 2.0
+    while offset + n < len(audio):
+        yield Frame(audio[offset:offset + n], timestamp, duration)
+        timestamp += duration
+        offset += n
+def vad_collector(sample_rate, frame_duration_ms,
+                  padding_duration_ms, vad, frames):
+    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
+    ring_buffer = collections.deque(maxlen=num_padding_frames)
+    triggered = False
+    voiced_frames = []
+    for frame in frames:
+        is_speech = vad.is_speech(frame.audio_bytes, sample_rate)
+        if not triggered:
+            ring_buffer.append((frame, is_speech))
+            num_voiced = len([f for f, speech in ring_buffer if speech])
+            if num_voiced > 0.9 * ring_buffer.maxlen:
+                triggered = True
+                for f, _ in ring_buffer:
+                    voiced_frames.append(f)
+                ring_buffer.clear()
+        else:
+            voiced_frames.append(frame)
+            ring_buffer.append((frame, is_speech))
+            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
+            if num_unvoiced > 0.9 * ring_buffer.maxlen:
+                triggered = False
+                yield [b''.join([f.audio_bytes for f in voiced_frames]),
+                       voiced_frames[0].timestamp, voiced_frames[-1].timestamp]
+                ring_buffer.clear()
+                voiced_frames = []
+    if voiced_frames:
+        yield [b''.join([f.audio_bytes for f in voiced_frames]),
+               voiced_frames[0].timestamp, voiced_frames[-1].timestamp]
+def main():
+    args = get_args()
+    vad = webrtcvad.Vad(mode=args.agg)
+    audio_pcm_data, sample_rate = read_wave(args.wav_file)
+    _, audio_data = wavfile.read(args.wav_file)
+    # audio_data_ = bytes(audio_data)
+    frames = frame_generator(
+        frame_duration_ms=args.frame_duration_ms,
+        audio=audio_pcm_data, sample_rate=sample_rate
+    )
+    frames = list(frames)
+    segments = vad_collector(sample_rate, args.frame_duration_ms, 300, vad, frames)
+    segments = list(segments)
+    vad_segments = list()
+    timestamp_start = 0.0
+    timestamp_end = 0.0
+    last_i = len(segments) - 1
+    for i, segment in enumerate(segments):
+        start = round(segment[1], 4)
+        end = round(segment[2], 4)
+        flag_first = i == 0
+        flag_last = i == last_i
+        if flag_first:
+            timestamp_start = start
+            timestamp_end = end
+            continue
+        if timestamp_start:
+            sil_duration = start - timestamp_end
+            if sil_duration > args.silence_duration_threshold:
+                vad_segments.append([timestamp_start, timestamp_end])
+                timestamp_start = start
+                timestamp_end = end
+                if flag_last:
+                    vad_segments.append([timestamp_start, timestamp_end])
+            else:
+                timestamp_end = end
+    print(vad_segments)
+    time = np.arange(0, len(audio_data)) / sample_rate
+    plt.figure(figsize=(12, 5))
+    plt.plot(time, audio_data / 32768, color='b')
+    for start, end in vad_segments:
+        plt.axvline(x=start, ymin=0.25, ymax=0.75, color='g', linestyle='--', label='开始端点')  # 标记开始端点
+        plt.axvline(x=end, ymin=0.25, ymax=0.75, color='r', linestyle='--', label='结束端点')  # 标记结束端点
+    plt.show()
+    return
+if __name__ == '__main__':
+    main()

main.py ADDED Viewed

	@@ -0,0 +1,135 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import argparse
+import json
+import platform
+from typing import Tuple
+import gradio as gr
+import matplotlib.pyplot as plt
+import numpy as np
+from PIL import Image
+from project_settings import project_path, temp_directory
+from toolbox.webrtcvad.vad import WebRTCVad
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--webrtcvad_examples_file",
+        default=(project_path / "webrtcvad_examples.json").as_posix(),
+        type=str
+    )
+    args = parser.parse_args()
+    return args
+webrtcvad: WebRTCVad = None
+def click_webrtcvad_button(audio: Tuple[int, np.ndarray],
+                           agg: int = 3,
+                           frame_duration_ms: int = 30,
+                           padding_duration_ms: int = 300,
+                           silence_duration_threshold: float = 0.3,
+                           ):
+    global webrtcvad
+    sample_rate, signal = audio
+    webrtcvad = WebRTCVad(agg=int(agg),
+                          frame_duration_ms=frame_duration_ms,
+                          padding_duration_ms=padding_duration_ms,
+                          silence_duration_threshold=silence_duration_threshold,
+                          sample_rate=sample_rate,
+                          )
+    vad_segments = list()
+    segments = webrtcvad.vad(signal)
+    vad_segments += segments
+    segments = webrtcvad.last_vad_segments()
+    vad_segments += segments
+    time = np.arange(0, len(signal)) / sample_rate
+    plt.figure(figsize=(12, 5))
+    plt.plot(time, signal / 32768, color='b')
+    for start, end in vad_segments:
+        plt.axvline(x=start, ymin=0.25, ymax=0.75, color='g', linestyle='--', label='开始端点')  # 标记开始端点
+        plt.axvline(x=end, ymin=0.25, ymax=0.75, color='r', linestyle='--', label='结束端点')  # 标记结束端点
+    temp_image_file = temp_directory / "temp.jpg"
+    plt.savefig(temp_image_file)
+    image = Image.open(open(temp_image_file, "rb"))
+    return image, vad_segments
+def main():
+    args = get_args()
+    brief_description = """
+    ## Voice Activity Detection
+    """
+    # examples
+    with open(args.webrtcvad_examples_file, "r", encoding="utf-8") as f:
+        webrtcvad_examples = json.load(f)
+    # ui
+    with gr.Blocks() as blocks:
+        gr.Markdown(value=brief_description)
+        with gr.Row():
+            with gr.Column(scale=5):
+                with gr.Tabs():
+                    with gr.TabItem("webrtcvad"):
+                        gr.Markdown(value="")
+                        with gr.Row():
+                            with gr.Column(scale=1):
+                                webrtcvad_wav = gr.Audio(label="wav")
+                                with gr.Row():
+                                    webrtcvad_agg = gr.Dropdown(choices=[1, 2, 3], value=3, label="agg")
+                                    webrtcvad_frame_duration_ms = gr.Slider(minimum=0, maximum=100, value=30, label="frame_duration_ms")
+                                with gr.Row():
+                                    webrtcvad_padding_duration_ms = gr.Slider(minimum=0, maximum=1000, value=300, label="padding_duration_ms")
+                                    webrtcvad_silence_duration_threshold = gr.Slider(minimum=0, maximum=1.0, value=0.3, step=0.1, label="silence_duration_threshold")
+                                webrtcvad_button = gr.Button("retrieval", variant="primary")
+                            with gr.Column(scale=1):
+                                webrtcvad_image = gr.Image(label="image", height=300, width=720, show_label=False)
+                                webrtcvad_end_points = gr.TextArea(label="end_points", max_lines=35)
+                        gr.Examples(
+                            examples=webrtcvad_examples,
+                            inputs=[
+                                webrtcvad_wav, webrtcvad_agg, webrtcvad_frame_duration_ms,
+                                webrtcvad_padding_duration_ms, webrtcvad_silence_duration_threshold
+                            ],
+                            outputs=[webrtcvad_image, webrtcvad_end_points],
+                            fn=click_webrtcvad_button
+                        )
+                        # click event
+                        webrtcvad_button.click(
+                            click_webrtcvad_button,
+                            inputs=[
+                                webrtcvad_wav, webrtcvad_agg, webrtcvad_frame_duration_ms,
+                                webrtcvad_padding_duration_ms, webrtcvad_silence_duration_threshold
+                            ],
+                            outputs=[webrtcvad_image, webrtcvad_end_points],
+                        )
+    blocks.queue().launch(
+        share=False if platform.system() == "Windows" else False
+    )
+    return
+if __name__ == "__main__":
+    main()

project_settings.py ADDED Viewed

	@@ -0,0 +1,16 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import os
+from pathlib import Path
+project_path = os.path.abspath(os.path.dirname(__file__))
+project_path = Path(project_path)
+temp_directory = project_path / "temp"
+temp_directory.mkdir(exist_ok=True)
+if __name__ == '__main__':
+    pass

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+gradio==4.1.2
+webrtcvad==2.0.10
+# `wave` is a standard-library module and needs no pip install.
+# numpy is imported directly by the code, so it is pinned explicitly
+# (1.24.x still supports the Python 3.8 base image).
+numpy==1.24.4
+matplotlib==3.7.4
+scipy==1.10.1
+pillow==10.2.0

toolbox/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# Placeholder entry point: running this file directly does nothing.
+if __name__ == '__main__':
+    pass

toolbox/webrtcvad/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# Placeholder entry point: running this file directly does nothing.
+if __name__ == '__main__':
+    pass

toolbox/webrtcvad/vad.py ADDED Viewed

	@@ -0,0 +1,233 @@

+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import argparse
+import collections
+from typing import List
+import matplotlib.pyplot as plt
+import numpy as np
+from scipy.io import wavfile
+import webrtcvad
+from project_settings import project_path
+class Frame(object):
+    def __init__(self, signal: np.ndarray, timestamp, duration):
+        self.signal = signal
+        self.timestamp = timestamp
+        self.duration = duration
+class WebRTCVad(object):
+    def __init__(self,
+                 agg: int = 3,
+                 frame_duration_ms: int = 30,
+                 padding_duration_ms: int = 300,
+                 silence_duration_threshold: float = 0.3,
+                 sample_rate: int = 8000
+                 ):
+        self.agg = agg
+        self.frame_duration_ms = frame_duration_ms
+        self.padding_duration_ms = padding_duration_ms
+        self.silence_duration_threshold = silence_duration_threshold
+        self.sample_rate = sample_rate
+        self._vad = webrtcvad.Vad(mode=agg)
+        # frames
+        self.frame_length = int(sample_rate * (frame_duration_ms / 1000.0))
+        self.frame_timestamp = 0.0
+        self.signal_cache = None
+        # segments
+        self.num_padding_frames = int(padding_duration_ms / frame_duration_ms)
+        self.ring_buffer = collections.deque(maxlen=self.num_padding_frames)
+        self.triggered = False
+        self.voiced_frames: List[Frame] = list()
+        self.segments = list()
+        # vad segments
+        self.is_first_segment = True
+        self.timestamp_start = 0.0
+        self.timestamp_end = 0.0
+    def signal_to_frames(self, signal: np.ndarray):
+        frames = list()
+        l = len(signal)
+        duration = (float(self.frame_length) / self.sample_rate)
+        for offset in range(0, l, self.frame_length):
+            sub_signal = signal[offset:offset+self.frame_length]
+            frame = Frame(sub_signal, self.frame_timestamp, duration)
+            self.frame_timestamp += duration
+            frames.append(frame)
+        return frames
+    def segments_generator(self, signal: np.ndarray):
+        # signal rounding
+        if self.signal_cache is not None:
+            signal = np.concatenate([self.signal_cache, signal])
+        rest = len(signal) % self.frame_length
+        if rest == 0:
+            self.signal_cache = None
+            signal_ = signal
+        else:
+            self.signal_cache = signal[-rest:]
+            signal_ = signal[:-rest]
+        # frames
+        frames = self.signal_to_frames(signal_)
+        for frame in frames:
+            audio_bytes = bytes(frame.signal)
+            is_speech = self._vad.is_speech(audio_bytes, self.sample_rate)
+            if not self.triggered:
+                self.ring_buffer.append((frame, is_speech))
+                num_voiced = len([f for f, speech in self.ring_buffer if speech])
+                if num_voiced > 0.9 * self.ring_buffer.maxlen:
+                    self.triggered = True
+                    for f, _ in self.ring_buffer:
+                        self.voiced_frames.append(f)
+                    self.ring_buffer.clear()
+            else:
+                self.voiced_frames.append(frame)
+                self.ring_buffer.append((frame, is_speech))
+                num_unvoiced = len([f for f, speech in self.ring_buffer if not speech])
+                if num_unvoiced > 0.9 * self.ring_buffer.maxlen:
+                    self.triggered = False
+                    segment = [
+                        np.concatenate([f.signal for f in self.voiced_frames]),
+                        self.voiced_frames[0].timestamp,
+                        self.voiced_frames[-1].timestamp
+                    ]
+                    yield segment
+                    self.ring_buffer.clear()
+                    self.voiced_frames = []
+    def vad_segments_generator(self, segments_generator):
+        segments = list(segments_generator)
+        for i, segment in enumerate(segments):
+            start = round(segment[1], 4)
+            end = round(segment[2], 4)
+            if self.is_first_segment:
+                self.timestamp_start = start
+                self.timestamp_end = end
+                self.is_first_segment = False
+                continue
+            if self.timestamp_start:
+                sil_duration = start - self.timestamp_end
+                if sil_duration > self.silence_duration_threshold:
+                    vad_segment = [self.timestamp_start, self.timestamp_end]
+                    yield vad_segment
+                    self.timestamp_start = start
+                    self.timestamp_end = end
+                else:
+                    self.timestamp_end = end
+    def vad(self, signal: np.ndarray) -> List[list]:
+        segments = self.segments_generator(signal)
+        vad_segments = self.vad_segments_generator(segments)
+        vad_segments = list(vad_segments)
+        return vad_segments
+    def last_vad_segments(self) -> List[list]:
+        # last segments
+        if len(self.voiced_frames) == 0:
+            segments = []
+        else:
+            segment = [
+                np.concatenate([f.signal for f in self.voiced_frames]),
+                self.voiced_frames[0].timestamp,
+                self.voiced_frames[-1].timestamp
+            ]
+            segments = [segment]
+        # last vad segments
+        vad_segments = self.vad_segments_generator(segments)
+        vad_segments = list(vad_segments)
+        vad_segments = vad_segments + [[self.timestamp_start, self.timestamp_end]]
+        return vad_segments
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--wav_file",
+        default=(project_path / "data/3300999628164249998.wav").as_posix(),
+        type=str,
+    )
+    parser.add_argument(
+        "--agg",
+        default=3,
+        type=int,
+        help="The level of aggressiveness of the VAD: [0-3]'"
+    )
+    parser.add_argument(
+        "--frame_duration_ms",
+        default=30,
+        type=int,
+    )
+    parser.add_argument(
+        "--silence_duration_threshold",
+        default=0.3,
+        type=float,
+        help="minimum silence duration, in seconds."
+    )
+    args = parser.parse_args()
+    return args
+SAMPLE_RATE = 8000
+def main():
+    args = get_args()
+    w_vad = WebRTCVad(sample_rate=SAMPLE_RATE)
+    sample_rate, signal = wavfile.read(args.wav_file)
+    if SAMPLE_RATE != sample_rate:
+        raise AssertionError
+    vad_segments = list()
+    segments = w_vad.vad(signal)
+    vad_segments += segments
+    for segment in segments:
+        print(segment)
+    # last vad segment
+    segments = w_vad.last_vad_segments()
+    vad_segments += segments
+    for segment in segments:
+        print(segment)
+    # plot
+    time = np.arange(0, len(signal)) / sample_rate
+    plt.figure(figsize=(12, 5))
+    plt.plot(time, signal / 32768, color='b')
+    for start, end in vad_segments:
+        plt.axvline(x=start, ymin=0.25, ymax=0.75, color='g', linestyle='--', label='开始端点')  # 标记开始端点
+        plt.axvline(x=end, ymin=0.25, ymax=0.75, color='r', linestyle='--', label='结束端点')  # 标记结束端点
+    plt.show()
+    return
+if __name__ == '__main__':
+    main()

webrtcvad_examples.json ADDED Viewed

	@@ -0,0 +1,8 @@

+[
+    [
+        "data/early_media/3300999628164249998.wav"
+    ],
+    [
+        "data/early_media/3300999628164852605.wav"
+    ]
+]