import os
import tempfile

import gradio as gr
import torch
import spaces

from api import FlexSED

# Load the model once at startup
flexsed = FlexSED(device="cuda" if torch.cuda.is_available() else "cpu")


@spaces.GPU  # 🚀 ensures this function runs on a GPU in Hugging Face Spaces
def run_flexsed(audio_file, event_list):
    """Run FlexSED inference and return the path to the prediction plot."""
    if not audio_file:
        return None

    # Split the prompt into individual events (semicolon-separated)
    events = [e.strip() for e in event_list.split(";") if e.strip()]
    if not events:
        return None

    # Run inference
    preds = flexsed.run_inference(audio_file, events)

    # Generate the visualization and return the rendered plot
    output_fname = os.path.join(tempfile.gettempdir(), "flexsed_output")
    flexsed.to_multi_plot(preds, events, fname=output_fname)
    return f"{output_fname}.png"


# App layout
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as app:
    # Header
    gr.Markdown("""
    ## 🎧 FlexSED: A Flexible Open-Vocabulary Sound Event Detection System

    👋 Welcome to the **FlexSED live demo**: explore **prompt-guided sound event detection** in real audio clips.

    🔗 Learn more at the [FlexSED GitHub Repository](https://github.com/JHU-LCAP/FlexSED)
    """)

    gr.Markdown("### 🔍 Upload or choose an example below to detect sound events:")

    with gr.Row():
        # Left column: inputs
        with gr.Column(scale=1):
            audio_input = gr.Audio(type="filepath", label="🎵 Upload Audio (.wav)")
            text_input = gr.Textbox(
                label="Event list (semicolon-separated)",
                value="Male speech; Door; Dog; Laughter",
            )
            with gr.Row():
                detect_btn = gr.Button("🎯 Detect", variant="primary")
                clear_btn = gr.Button("🧹 Clear")

        # Right column: output
        with gr.Column(scale=1):
            image_output = gr.Image(label="Prediction Plot", show_label=True, elem_id="output-image")

    gr.Examples(
        examples=[
            ["example.wav", "Male speech; Door; Dog; Laughter"],
            ["example2.wav", "Male speech; Bee; Gunshot, gunfire"],
        ],
        inputs=[audio_input, text_input],
        label="Example Audios",
    )

    # Function bindings
    detect_btn.click(run_flexsed, inputs=[audio_input, text_input], outputs=image_output)
    clear_btn.click(
        lambda: (None, "Male speech; Door; Dog; Laughter"),
        outputs=[audio_input, text_input],
    )


if __name__ == "__main__":
    app.launch()
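
# --- Usage note (illustrative sketch, not part of the app) -----------------
# The same FlexSED API driven by the UI above can also be called directly.
# This sketch assumes only the calls already made in this file
# (FlexSED(device=...), run_inference(audio, events), to_multi_plot(preds,
# events, fname=...)); "clip.wav" is a placeholder path, not a bundled file.
#
#   from api import FlexSED
#
#   model = FlexSED(device="cpu")                      # CPU works for a quick local test
#   events = ["Male speech", "Dog", "Laughter"]        # open-vocabulary event prompts
#   preds = model.run_inference("clip.wav", events)    # predictions for each prompted event
#   model.to_multi_plot(preds, events, fname="out")    # writes the plot to out.png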