|
|
import gradio as gr |
|
|
import torch |
|
|
from api import FlexSED |
|
|
import tempfile |
|
|
import os |
|
|
import spaces |
|
|
|
|
|
|
|
|
# Load the FlexSED model once at import time (shared by all requests);
# prefer GPU when available, otherwise fall back to CPU.
flexsed = FlexSED(device="cuda" if torch.cuda.is_available() else "cpu")
|
|
|
|
|
@spaces.GPU
def run_flexsed(audio_file, event_list):
    """Run FlexSED inference and return the path to the prediction plot.

    Args:
        audio_file: Filesystem path to the uploaded audio clip (Gradio
            ``Audio(type="filepath")``), or None/"" when nothing was uploaded.
        event_list: Semicolon-separated event names, e.g.
            "Male speech; Door; Dog".

    Returns:
        Path to the rendered prediction plot (PNG), or ``None`` when the
        audio input or the parsed event list is empty.
    """
    if not audio_file:
        return None

    # Split on ";" and drop empty entries so stray/trailing separators
    # (e.g. "Dog;;Door;") are tolerated.
    events = [e.strip() for e in event_list.split(";") if e.strip()]
    if not events:
        return None

    preds = flexsed.run_inference(audio_file, events)

    # Write the plot into a fresh per-call temp directory. The previous
    # fixed filename in the shared tempdir made concurrent requests
    # overwrite each other's output and could serve another user's plot.
    out_dir = tempfile.mkdtemp(prefix="flexsed_")
    output_fname = os.path.join(out_dir, "flexsed_output")
    flexsed.to_multi_plot(preds, events, fname=output_fname)  # saves f"{fname}.png"

    return f"{output_fname}.png"
|
|
|
|
|
|
|
|
|
|
|
# Build the demo UI: inputs (audio + event prompt) on the left,
# prediction plot and example clips on the right.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as app:

    # Header / intro text.
    gr.Markdown("""
## π§ FlexSED: A Flexible Open-Vocabulary Sound Event Detection System

π Welcome to the **FlexSED live demo** β explore **prompt-guided sound event detection** in real audio clips.

π Learn more on the [FlexSED GitHub Repository](https://github.com/JHU-LCAP/FlexSED)
""")

    gr.Markdown("### π Upload or choose an example below to detect sound events:")

    with gr.Row():

        # Left column: user inputs and action buttons.
        with gr.Column(scale=1):
            audio_in = gr.Audio(type="filepath", label="π΅ Upload Audio (.wav)")
            events_box = gr.Textbox(label="Event list (semicolon-separated)", value="Male speech; Door; Dog; Laughter")

            with gr.Row():
                btn_detect = gr.Button("π― Detect", variant="primary")
                btn_clear = gr.Button("π§Ή Clear")

        # Right column: model output plus clickable example inputs.
        with gr.Column(scale=1):
            plot_out = gr.Image(label="Prediction Plot", show_label=True, elem_id="output-image")
            gr.Examples(
                examples=[
                    ["example.wav", "Male speech; Door; Dog; Laughter"],
                    ["example2.wav", "Male speech; Bee; Gunshot, gunfire"],
                ],
                inputs=[audio_in, events_box],
                label="Example Audios",
            )

    # Wire up the buttons: detect runs inference; clear resets both inputs.
    btn_detect.click(run_flexsed, inputs=[audio_in, events_box], outputs=plot_out)
    btn_clear.click(lambda: (None, "Male speech; Door; Dog; Laughter"), outputs=[audio_in, events_box])
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Start the Gradio server only when executed as a script (not on import).
    app.launch()
|
|
|