import gradio as gr
import torch
from api import FlexSED
import tempfile
import os
import spaces
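# `spaces` provides the @spaces.GPU decorator used on Hugging Face Spaces
# (ZeroGPU hardware) to allocate a GPU for the decorated function.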
# Load model once on startup
flexsed = FlexSED(device="cuda" if torch.cuda.is_available() else "cpu")
@spaces.GPU  # ensures this function runs on the GPU in Hugging Face Spaces
def run_flexsed(audio_file, event_list):
"""
Run inference using FlexSED and return prediction plot.
"""
if not audio_file:
return None
# Split events by semicolon or comma
events = [e.strip() for e in event_list.split(";") if e.strip()]
if not events:
return None
# Run inference
preds = flexsed.run_inference(audio_file, events)
# Generate visualization
output_fname = os.path.join(tempfile.gettempdir(), "flexsed_output")
flexsed.to_multi_plot(preds, events, fname=output_fname)
plot_path = f"{output_fname}.png"
return plot_path
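# Illustrative sketch (not executed) of calling the same FlexSED API directly,
# assuming the "example.wav" file from the Examples section below is present:
#
#   preds = flexsed.run_inference("example.wav", ["Dog", "Laughter"])
#   flexsed.to_multi_plot(preds, ["Dog", "Laughter"], fname="demo_out")  # writes demo_out.png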
# App layout
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as app:
    # Header
    gr.Markdown("""
    ## 🎧 FlexSED: A Flexible Open-Vocabulary Sound Event Detection System

    Welcome to the **FlexSED live demo** – explore **prompt-guided sound event detection** in real audio clips.

    Learn more on the [FlexSED GitHub Repository](https://github.com/JHU-LCAP/FlexSED)
    """)

    gr.Markdown("### Upload or choose an example below to detect sound events:")
    with gr.Row():
        # Left column: inputs
        with gr.Column(scale=1):
            audio_input = gr.Audio(type="filepath", label="🎵 Upload Audio (.wav)")
            text_input = gr.Textbox(
                label="Event list (semicolon-separated)",
                value="Male speech; Door; Dog; Laughter",
            )
            with gr.Row():
                detect_btn = gr.Button("🎯 Detect", variant="primary")
                clear_btn = gr.Button("🧹 Clear")

        # Right column: output
        with gr.Column(scale=1):
            image_output = gr.Image(label="Prediction Plot", show_label=True, elem_id="output-image")

    gr.Examples(
        examples=[
            ["example.wav", "Male speech; Door; Dog; Laughter"],
            ["example2.wav", "Male speech; Bee; Gunshot, gunfire"],
        ],
        inputs=[audio_input, text_input],
        label="Example Audios",
    )

    # Function bindings
    detect_btn.click(run_flexsed, inputs=[audio_input, text_input], outputs=image_output)
    clear_btn.click(
        lambda: (None, "Male speech; Door; Dog; Laughter"),
        outputs=[audio_input, text_input],
    )
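# Optional hardening (assumption, not in the original app): Gradio's standard
# app.queue() can be called before launch to serialize requests on shared GPUs.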
if __name__ == "__main__":
    app.launch()