import gradio as gr
import torch
import llava
import os
import spaces
from huggingface_hub import snapshot_download

# ---------------------------------
# SINGLE-TURN MODEL SETUP
# ---------------------------------

MODEL_BASE_SINGLE = snapshot_download(repo_id="nvidia/audio-flamingo-3")
MODEL_BASE_THINK = os.path.join(MODEL_BASE_SINGLE, 'stage35')
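# NOTE: 'stage35' appears to be the chain-of-thought ('think') checkpoint bundled with the snapshot; it is not used in this single-turn demo.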

model_single = llava.load(MODEL_BASE_SINGLE, model_base=None)
model_single = model_single.to("cuda")

generation_config_single = model_single.default_generation_config
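# Decoding settings could be tweaked here if needed. A hedged example, assuming
# default_generation_config is a transformers-style GenerationConfig:
# generation_config_single.max_new_tokens = 512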

# ---------------------------------
# SINGLE-TURN INFERENCE FUNCTION
# ---------------------------------
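# @spaces.GPU requests a GPU for the duration of each call when running on Hugging Face (ZeroGPU) Spaces.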
@spaces.GPU
def single_turn_infer(audio_file, prompt_text):
    try:
        sound = llava.Sound(audio_file)
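        # '<sound>' marks where the audio features are injected into the text prompt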
        full_prompt = f"<sound>\n{prompt_text}"
        response = model_single.generate_content([sound, full_prompt], generation_config=generation_config_single)
        return response
    except Exception as e:
        return f"❌ Error: {str(e)}"

# ---------------------------------
# INTERFACE
# ---------------------------------
with gr.Blocks(css="""
.gradio-container { 
    max-width: 100% !important; 
    width: 100% !important;
    margin: 0 !important; 
    padding: 0 !important;
}
#component-0, .gr-block.gr-box { 
    width: 100% !important; 
}
.gr-block.gr-box, .gr-column, .gr-row {
    padding: 0 !important;
    margin: 0 !important;
}
""") as demo:

    with gr.Column():
        gr.HTML("""
<div align="center">
  <img src="https://raw.githubusercontent.com/NVIDIA/audio-flamingo/audio_flamingo_3/static/logo-no-bg.png" alt="Audio Flamingo 3 Logo" width="110" style="margin-bottom: 8px;">
  <h2><strong>Audio Understanding Demo (WeaveMuse Edition)</strong></h2>
  <p><em>Powered by NVIDIA’s open Audio Flamingo 3 model</em></p>
</div>

<div align="center" style="margin-top: 10px;">
  <a href="https://huggingface.co/nvidia/audio-flamingo-3">
    <img src="https://img.shields.io/badge/🤗-Model:_audio--flamingo--3-ED5A22.svg">
  </a>
  <a href="https://github.com/NVIDIA/audio-flamingo">
    <img src="https://img.shields.io/badge/GitHub-NVIDIA%2Faudio--flamingo-9C276A.svg">
  </a>
</div>
<div align="center" style="margin-top: 8px;">
  <p><small>This simple interface demonstrates audio–language reasoning and transcription using open models from NVIDIA. Integrated for use in the <strong>WeaveMuse</strong> framework.</small></p>
</div>
""")

    with gr.Tabs():
        # ---------------- SINGLE-TURN ----------------
        with gr.Tab("🎧 Audio Inference"):
            with gr.Row():
                with gr.Column():
                    audio_input_single = gr.Audio(type="filepath", label="Upload Audio Clip")
                    prompt_input_single = gr.Textbox(label="Prompt", placeholder="Ask about the audio (e.g. 'Describe the sound' or 'Transcribe any speech')", lines=6)
                    btn_single = gr.Button("Generate Response")

                    gr.Examples(
                        examples=[
                            ["static/emergent/audio1.wav", "What is happening in this audio?"],
                            ["static/audio/audio2.wav", "Describe the sounds you hear."],
                            ["static/speech/audio3.wav", "Transcribe the spoken words."],
                        ],
                        inputs=[audio_input_single, prompt_input_single],
                        label="🧪 Example Prompts"
                    )

                with gr.Column():
                    output_single = gr.Textbox(label="Model Response", lines=15)

            btn_single.click(fn=single_turn_infer, inputs=[audio_input_single, prompt_input_single], outputs=output_single)

        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
### 🎶 Overview

This demo showcases a **simple single-turn audio–language interface** built for integration with the **WeaveMuse** framework.  
It uses the open-source **NVIDIA Audio Flamingo 3** model for audio understanding, transcription, and sound reasoning.

You can upload an audio file and ask natural-language questions such as:
- “What kind of sound is this?”
- “Describe the scene.”
- “Transcribe any speech.”

**Acknowledgment:**  
Model and research credit to **NVIDIA** for developing the open **Audio Flamingo 3** model and the datasets used to train it.  
This interface is a simplified demonstration of how such models can be integrated into broader creative AI systems like WeaveMuse.

**Tech stack:** Gradio + PyTorch + llava + WeaveMuse Integration
""")

    gr.Markdown("© 2025 WeaveMuse | Model © NVIDIA — Audio Flamingo 3 | Built with ❤️ using Gradio & PyTorch")

# -----------------------
# Launch App
# -----------------------
if __name__ == "__main__":
    demo.launch(share=True)