import gradio as gr
import torch
import llava
import os
import spaces
from huggingface_hub import snapshot_download
import copy
# ---------------------------------
# SINGLE-TURN MODEL SETUP
# ---------------------------------
MODEL_BASE_SINGLE = snapshot_download(repo_id="nvidia/audio-flamingo-3")
MODEL_BASE_THINK = os.path.join(MODEL_BASE_SINGLE, 'stage35')
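# Note: MODEL_BASE_THINK (the `stage35` sub-checkpoint inside the downloaded snapshot) is
# defined but never used by this single-turn app; in the upstream Audio Flamingo 3 demo this
# extra checkpoint appears to back the long-form "thinking" mode.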
model_single = llava.load(MODEL_BASE_SINGLE, model_base=None)
model_single = model_single.to("cuda")
generation_config_single = model_single.default_generation_config
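# The defaults shipped with the checkpoint are used as-is. Sampling settings (e.g. output
# length or temperature) can typically be tweaked on this config object before calling
# generate_content; the exact attribute names depend on the installed llava/NVILA build,
# so verify them locally before relying on any override.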
# ---------------------------------
# SINGLE-TURN INFERENCE FUNCTION
# ---------------------------------
@spaces.GPU
def single_turn_infer(audio_file, prompt_text):
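    """Run one round of audio question answering.

    Wraps the uploaded clip as a llava.Sound, prepends the <sound> token to the user's
    prompt, and returns the model's text response (or an error string on failure).
    """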
    try:
        sound = llava.Sound(audio_file)
        full_prompt = f"<sound>\n{prompt_text}"
        response = model_single.generate_content([sound, full_prompt], generation_config=generation_config_single)
        return response
    except Exception as e:
        return f"❌ Error: {str(e)}"
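# Minimal usage sketch (outside Gradio), assuming one of the bundled example clips exists:
#   print(single_turn_infer("static/audio/audio2.wav", "Describe the sounds you hear."))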
# ---------------------------------
# INTERFACE
# ---------------------------------
with gr.Blocks(css="""
.gradio-container {
    max-width: 100% !important;
    width: 100% !important;
    margin: 0 !important;
    padding: 0 !important;
}
#component-0, .gr-block.gr-box {
    width: 100% !important;
}
.gr-block.gr-box, .gr-column, .gr-row {
    padding: 0 !important;
    margin: 0 !important;
}
""") as demo:
    with gr.Column():
        gr.HTML("""
        <div align="center">
            <img src="https://raw.githubusercontent.com/NVIDIA/audio-flamingo/audio_flamingo_3/static/logo-no-bg.png" alt="Audio Flamingo 3 Logo" width="110" style="margin-bottom: 8px;">
            <h2><strong>Audio Understanding Demo (WeaveMuse Edition)</strong></h2>
            <p><em>Powered by NVIDIA’s open Audio Flamingo 3 model</em></p>
        </div>
        <div align="center" style="margin-top: 10px;">
            <a href="https://huggingface.co/nvidia/audio-flamingo-3">
                <img src="https://img.shields.io/badge/🤗-Model:_audio--flamingo--3-ED5A22.svg">
            </a>
            <a href="https://github.com/NVIDIA/audio-flamingo">
                <img src="https://img.shields.io/badge/GitHub-NVIDIA%2Faudio--flamingo-9C276A.svg">
            </a>
        </div>
        <div align="center" style="margin-top: 8px;">
            <p><small>This simple interface demonstrates audio–language reasoning and transcription using open models from NVIDIA. Integrated for use in the <strong>WeaveMuse</strong> framework.</small></p>
        </div>
        """)
    with gr.Tabs():
        # ---------------- SINGLE-TURN ----------------
        with gr.Tab("🎧 Audio Inference"):
            with gr.Row():
                with gr.Column():
                    audio_input_single = gr.Audio(type="filepath", label="Upload Audio Clip")
                    prompt_input_single = gr.Textbox(label="Prompt", placeholder="Ask about the audio (e.g. 'Describe the sound' or 'Transcribe any speech')", lines=6)
                    btn_single = gr.Button("Generate Response")
                    gr.Examples(
                        examples=[
                            ["static/emergent/audio1.wav", "What is happening in this audio?"],
                            ["static/audio/audio2.wav", "Describe the sounds you hear."],
                            ["static/speech/audio3.wav", "Transcribe the spoken words."],
                        ],
                        inputs=[audio_input_single, prompt_input_single],
                        label="🧪 Example Prompts"
                    )
                with gr.Column():
                    output_single = gr.Textbox(label="Model Response", lines=15)
            btn_single.click(fn=single_turn_infer, inputs=[audio_input_single, prompt_input_single], outputs=output_single)
        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
### 🎶 Overview

This demo showcases a **simple single-turn audio–language interface** built for integration with the **WeaveMuse** framework.
It uses the open-source **NVIDIA Audio Flamingo 3** model for audio understanding, transcription, and sound reasoning.

You can upload an audio file and ask natural-language questions such as:
- “What kind of sound is this?”
- “Describe the scene.”
- “Transcribe any speech.”

**Acknowledgment:**
Model and research credit to **NVIDIA** for developing the open **Audio Flamingo 3** model and the datasets used to train it.
This interface is a simplified demonstration of how such models can be integrated into broader creative AI systems like WeaveMuse.

**Tech stack:** Gradio + PyTorch + llava + WeaveMuse Integration
""")

    gr.Markdown("© 2025 WeaveMuse | Model © NVIDIA — Audio Flamingo 3 | Built with ❤️ using Gradio & PyTorch")
# -----------------------
# Launch App
# -----------------------
if __name__ == "__main__":
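    # share=True only matters when running this script locally; on Hugging Face Spaces the
    # app is already served publicly and Gradio skips creating a share tunnel.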
    demo.launch(share=True)