File size: 2,590 Bytes
693498e
3b6a091
 
 
 
d7a74e6
693498e
3b6a091
 
693498e
d7a74e6
3b6a091
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f7ad1a2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import gradio as gr
import torch
from api import FlexSED
import tempfile
import os
import spaces

# Load the FlexSED model exactly once at process startup so every request
# reuses the same weights; prefer CUDA when available, else fall back to CPU.
flexsed = FlexSED(device="cuda" if torch.cuda.is_available() else "cpu")

@spaces.GPU  # 🚀 ensures this function runs on the GPU in Hugging Face Spaces
def run_flexsed(audio_file, event_list):
    """
    Run FlexSED inference on an uploaded audio clip and return a prediction plot.

    Args:
        audio_file: Filesystem path to the input audio file, or None/empty
            when nothing was uploaded.
        event_list: Semicolon-separated event prompts, e.g.
            "Male speech; Door; Dog". Semicolons — not commas — delimit
            events, because a single event name may itself contain a comma
            (e.g. "Gunshot, gunfire" in the bundled examples).

    Returns:
        Path to the generated PNG plot, or None when either input is
        missing or the prompt string contains no usable events.
    """
    # Guard both inputs: a cleared textbox yields None, and calling
    # .split on it would raise AttributeError.
    if not audio_file or not event_list:
        return None

    # Split on semicolons ONLY; commas are legal inside one event name.
    events = [e.strip() for e in event_list.split(";") if e.strip()]
    if not events:
        return None

    # Run inference over the requested event prompts.
    preds = flexsed.run_inference(audio_file, events)

    # Write the visualization next to the system temp dir; to_multi_plot is
    # given the stem and (per the original path construction) is assumed to
    # emit f"{output_fname}.png" — TODO confirm against the FlexSED API.
    output_fname = os.path.join(tempfile.gettempdir(), "flexsed_output")
    flexsed.to_multi_plot(preds, events, fname=output_fname)
    return f"{output_fname}.png"


# App layout
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as app:
    # Header
    gr.Markdown("""
    ## 🎧 FlexSED: A Flexible Open-Vocabulary Sound Event Detection System

    πŸ‘‹ Welcome to the **FlexSED live demo** β€” explore **prompt-guided sound event detection** in real audio clips.

    πŸ”— Learn more on the [FlexSED GitHub Repository](https://github.com/JHU-LCAP/FlexSED)
    """)

    gr.Markdown("### πŸ” Upload or choose an example below to detect sound events:")

    with gr.Row():
        # Left column: Inputs
        with gr.Column(scale=1):
            audio_input = gr.Audio(type="filepath", label="🎡 Upload Audio (.wav)")
            text_input = gr.Textbox(label="Event list (semicolon-separated)", value="Male speech; Door; Dog; Laughter")

            with gr.Row():
                detect_btn = gr.Button("🎯 Detect", variant="primary")
                clear_btn = gr.Button("🧹 Clear")

        # Right column: Output
        with gr.Column(scale=1):
            image_output = gr.Image(label="Prediction Plot", show_label=True, elem_id="output-image")
            gr.Examples(
                examples=[
                    ["example.wav", "Male speech; Door; Dog; Laughter"],
                    ["example2.wav", "Male speech; Bee; Gunshot, gunfire"],
                ],
                inputs=[audio_input, text_input],
                label="Example Audios"
            )

    # Function bindings
    detect_btn.click(run_flexsed, inputs=[audio_input, text_input], outputs=image_output)
    clear_btn.click(lambda: (None, "Male speech; Door; Dog; Laughter"), outputs=[audio_input, text_input])


# Start the Gradio server only when run as a script (not on import).
if __name__ == "__main__":
    app.launch()