import gradio as gr
import torch
import llava
import os
import spaces
from huggingface_hub import snapshot_download

# ---------------------------------
# SINGLE-TURN MODEL SETUP
# ---------------------------------

MODEL_BASE_SINGLE = snapshot_download(repo_id="nvidia/audio-flamingo-3")
MODEL_BASE_THINK = os.path.join(MODEL_BASE_SINGLE, 'stage35')
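# NOTE: 'stage35' appears to be the chain-of-thought ('think') checkpoint bundled with the snapshot; it is not used in this single-turn demo.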

model_single = llava.load(MODEL_BASE_SINGLE, model_base=None)
model_single = model_single.to("cuda")

generation_config_single = model_single.default_generation_config
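# Decoding settings could be tweaked here if needed. A hedged example, assuming
# default_generation_config is a transformers-style GenerationConfig:
# generation_config_single.max_new_tokens = 512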

# ---------------------------------
# SINGLE-TURN INFERENCE FUNCTION
# ---------------------------------
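# @spaces.GPU requests a GPU for the duration of each call when running on Hugging Face (ZeroGPU) Spaces.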
@spaces.GPU
def single_turn_infer(audio_file, prompt_text):
    try:
        sound = llava.Sound(audio_file)
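        # '<sound>' marks where the audio features are injected into the text prompt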
        full_prompt = f"<sound>\n{prompt_text}"
        response = model_single.generate_content([sound, full_prompt], generation_config=generation_config_single)
        return response
    except Exception as e:
        return f"❌ Error: {str(e)}"

# ---------------------------------
# INTERFACE
# ---------------------------------
with gr.Blocks(css="""
.gradio-container { 
    max-width: 100% !important; 
    width: 100% !important;
    margin: 0 !important; 
    padding: 0 !important;
}
#component-0, .gr-block.gr-box { 
    width: 100% !important; 
}
.gr-block.gr-box, .gr-column, .gr-row {
    padding: 0 !important;
    margin: 0 !important;
}
""") as demo:

    with gr.Column():
        gr.HTML("""
<div align="center">
  <img src="https://raw.githubusercontent.com/NVIDIA/audio-flamingo/audio_flamingo_3/static/logo-no-bg.png" alt="Audio Flamingo 3 Logo" width="110" style="margin-bottom: 8px;">
  <h2><strong>Audio Understanding Demo (WeaveMuse Edition)</strong></h2>
  <p><em>Powered by NVIDIA’s open Audio Flamingo 3 model</em></p>
</div>

<div align="center" style="margin-top: 10px;">
  <a href="https://huggingface.co/nvidia/audio-flamingo-3">
    <img src="https://img.shields.io/badge/🤗-Model:_audio--flamingo--3-ED5A22.svg">
  </a>
  <a href="https://github.com/NVIDIA/audio-flamingo">
    <img src="https://img.shields.io/badge/GitHub-NVIDIA%2Faudio--flamingo-9C276A.svg">
  </a>
</div>
<div align="center" style="margin-top: 8px;">
  <p><small>This simple interface demonstrates audio–language reasoning and transcription using open models from NVIDIA. Integrated for use in the <strong>WeaveMuse</strong> framework.</small></p>
</div>
""")

    with gr.Tabs():
        # ---------------- SINGLE-TURN ----------------
        with gr.Tab("🎧 Audio Inference"):
            with gr.Row():
                with gr.Column():
                    audio_input_single = gr.Audio(type="filepath", label="Upload Audio Clip")
                    prompt_input_single = gr.Textbox(label="Prompt", placeholder="Ask about the audio (e.g. 'Describe the sound' or 'Transcribe any speech')", lines=6)
                    btn_single = gr.Button("Generate Response")

                    gr.Examples(
                        examples=[
                            ["static/emergent/audio1.wav", "What is happening in this audio?"],
                            ["static/audio/audio2.wav", "Describe the sounds you hear."],
                            ["static/speech/audio3.wav", "Transcribe the spoken words."],
                        ],
                        inputs=[audio_input_single, prompt_input_single],
                        label="🧪 Example Prompts"
                    )

                with gr.Column():
                    output_single = gr.Textbox(label="Model Response", lines=15)

            btn_single.click(fn=single_turn_infer, inputs=[audio_input_single, prompt_input_single], outputs=output_single)

        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
### 🎶 Overview

This demo showcases a **simple single-turn audio–language interface** built for integration with the **WeaveMuse** framework.  
It uses the open-source **NVIDIA Audio Flamingo 3** model for audio understanding, transcription, and sound reasoning.

You can upload an audio file and ask natural-language questions such as:
- “What kind of sound is this?”
- “Describe the scene.”
- “Transcribe any speech.”

**Acknowledgment:**  
Model and research credit to **NVIDIA** for developing the open **Audio Flamingo 3** model and the datasets used to train it.  
This interface is a simplified demonstration of how such models can be integrated into broader creative AI systems like WeaveMuse.

**Tech stack:** Gradio + PyTorch + llava + WeaveMuse Integration
""")

    gr.Markdown("© 2025 WeaveMuse | Model © NVIDIA — Audio Flamingo 3 | Built with ❤️ using Gradio & PyTorch")

# -----------------------
# Launch App
# -----------------------
if __name__ == "__main__":
    demo.launch(share=True)