Spaces:

WillHeld
/

diva-audio-chat

Paused

App Files Community

Helw150 committed on Oct 11, 2024

Commit

94540c3

1 Parent(s): ec083e1

Restructure

Browse files

Files changed (1) hide show

app.py +18 -18

app.py CHANGED Viewed

@@ -17,11 +17,12 @@ import tempfile
 from utils.vad import VadOptions, collect_chunks, get_speech_timestamps
-diva_model = AutoModel.from_pretrained(
-    "WillHeld/DiVA-llama-3-v0-8b", trust_remote_code=True
-)
-resampler = Audio(sampling_rate=16_000)
 @spaces.GPU(duration=20)
@@ -44,10 +45,12 @@ def diva_audio(audio_input, do_sample=False, temperature=0.001, prev_outs=None):
     )
-def run_vad(ori_audio, sr):
     _st = time.time()
     try:
         audio = ori_audio
         audio = audio.astype(np.float32) / 32768.0
         sampling_rate = 16000
         if sr != sampling_rate:
@@ -76,7 +79,7 @@ def run_vad(ori_audio, sr):
 def warm_up():
     frames = np.ones(2048)  # 1024 frames of 2 bytes each
-    dur, frames, tcost = run_vad(frames, 16000)
     print(f"warm up done, time_cost: {tcost:.3f} s")
@@ -97,19 +100,19 @@ class AppState:
 def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
     """Take in the stream, determine if a pause happened"""
-    temp_audio = audio
-    dur_vad, _, time_vad = run_vad(temp_audio, sampling_rate)
     duration = len(audio) / sampling_rate
-    if dur_vad > 0.5 and not state.started_talking:
         print("started talking")
         state.started_talking = True
         return False
     print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
-    return (duration - dur_vad) > 1
 def process_audio(audio: tuple, state: AppState):
@@ -180,12 +183,9 @@ theme = gr.themes.Soft(
 with gr.Blocks(theme=theme) as demo:
     with gr.Row():
-        with gr.Column():
-            input_audio = gr.Audio(
-                label="Input Audio", sources="microphone", type="numpy"
-            )
-        with gr.Column():
-            chatbot = gr.Chatbot(label="Conversation", type="messages")
     state = gr.State(value=AppState())
     stream = input_audio.stream(
@@ -206,5 +206,5 @@ with gr.Blocks(theme=theme) as demo:
         cancels=[respond, stream],
     )
-demo.launch()

 from utils.vad import VadOptions, collect_chunks, get_speech_timestamps
+if gr.NO_RELOAD:
+    diva_model = AutoModel.from_pretrained(
+        "WillHeld/DiVA-llama-3-v0-8b", trust_remote_code=True
+    )
+    resampler = Audio(sampling_rate=16_000)
 @spaces.GPU(duration=20)
     )
+def run_vad(ori_audio, sr, duration):
     _st = time.time()
     try:
         audio = ori_audio
+        if duration < 1:
+            return -1, ori_audio, round(time.time() - _st, 4)
         audio = audio.astype(np.float32) / 32768.0
         sampling_rate = 16000
         if sr != sampling_rate:
 def warm_up():
     frames = np.ones(2048)  # 1024 frames of 2 bytes each
+    dur, frames, tcost = run_vad(frames, 16000, 10)
     print(f"warm up done, time_cost: {tcost:.3f} s")
 def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
     """Take in the stream, determine if a pause happened"""
+    temp_audio = audio[-2 * sampling_rate :]
     duration = len(audio) / sampling_rate
+    dur_vad, _, time_vad = run_vad(temp_audio, sampling_rate, duration)
+    if dur_vad > 0.25 and not state.started_talking:
         print("started talking")
         state.started_talking = True
         return False
     print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
+    return dur_vad < 0.5
 def process_audio(audio: tuple, state: AppState):
 with gr.Blocks(theme=theme) as demo:
     with gr.Row():
+        input_audio = gr.Audio(label="Input Audio", sources="microphone", type="numpy")
+    with gr.Row():
+        chatbot = gr.Chatbot(label="Conversation", type="messages")
     state = gr.State(value=AppState())
     stream = input_audio.stream(
         cancels=[respond, stream],
     )
+if __name__ == "__main__":
+    demo.launch()