import gradio as gr
import torch
import llava
import os
import spaces
from huggingface_hub import snapshot_download
# ---------------------------------
# SINGLE-TURN MODEL SETUP
# ---------------------------------
# Download the Audio Flamingo 3 checkpoint from the Hugging Face Hub
# (cached locally after the first call).
MODEL_BASE_SINGLE = snapshot_download(repo_id="nvidia/audio-flamingo-3")
# Path to the stage-3.5 ("thinking") weights; unused in this single-turn demo
# but kept for reference.
MODEL_BASE_THINK = os.path.join(MODEL_BASE_SINGLE, "stage35")

model_single = llava.load(MODEL_BASE_SINGLE, model_base=None)
model_single = model_single.to("cuda")
generation_config_single = model_single.default_generation_config
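# Optional decoding tweak (a sketch, not part of the original demo:
# `max_new_tokens` is a standard transformers GenerationConfig field, but
# verify it against the object returned by the installed llava package):
#   generation_config_single.max_new_tokens = 512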
# ---------------------------------
# SINGLE-TURN INFERENCE FUNCTION
# ---------------------------------
@spaces.GPU
def single_turn_infer(audio_file, prompt_text):
    if audio_file is None:
        return "⚠️ Please upload an audio clip first."
    try:
        sound = llava.Sound(audio_file)
        # The <sound> placeholder marks where the audio embedding is injected
        # into the prompt.
        full_prompt = f"<sound>\n{prompt_text}"
        response = model_single.generate_content([sound, full_prompt], generation_config=generation_config_single)
        return response
    except Exception as e:
        return f"❌ Error: {str(e)}"
# ---------------------------------
# INTERFACE
# ---------------------------------
with gr.Blocks(css="""
.gradio-container {
    max-width: 100% !important;
    width: 100% !important;
    margin: 0 !important;
    padding: 0 !important;
}
#component-0, .gr-block.gr-box {
    width: 100% !important;
}
.gr-block.gr-box, .gr-column, .gr-row {
    padding: 0 !important;
    margin: 0 !important;
}
""") as demo:
    with gr.Column():
        gr.HTML("""
        <div align="center">
          <img src="https://raw.githubusercontent.com/NVIDIA/audio-flamingo/audio_flamingo_3/static/logo-no-bg.png" alt="Audio Flamingo 3 Logo" width="110" style="margin-bottom: 8px;">
          <h2><strong>Audio Understanding Demo (WeaveMuse Edition)</strong></h2>
          <p><em>Powered by NVIDIA’s open Audio Flamingo 3 model</em></p>
        </div>
        <div align="center" style="margin-top: 10px;">
          <a href="https://huggingface.co/nvidia/audio-flamingo-3">
            <img src="https://img.shields.io/badge/🤗-Model:_audio--flamingo--3-ED5A22.svg">
          </a>
          <a href="https://github.com/NVIDIA/audio-flamingo">
            <img src="https://img.shields.io/badge/GitHub-NVIDIA%2Faudio--flamingo-9C276A.svg">
          </a>
        </div>
        <div align="center" style="margin-top: 8px;">
          <p><small>This simple interface demonstrates audio–language reasoning and transcription using open models from NVIDIA. Integrated for use in the <strong>WeaveMuse</strong> framework.</small></p>
        </div>
        """)
    with gr.Tabs():
        # ---------------- SINGLE-TURN ----------------
        with gr.Tab("🎧 Audio Inference"):
            with gr.Row():
                with gr.Column():
                    audio_input_single = gr.Audio(type="filepath", label="Upload Audio Clip")
                    prompt_input_single = gr.Textbox(label="Prompt", placeholder="Ask about the audio (e.g. 'Describe the sound' or 'Transcribe any speech')", lines=6)
                    btn_single = gr.Button("Generate Response")
                    gr.Examples(
                        examples=[
                            ["static/emergent/audio1.wav", "What is happening in this audio?"],
                            ["static/audio/audio2.wav", "Describe the sounds you hear."],
                            ["static/speech/audio3.wav", "Transcribe the spoken words."],
                        ],
                        inputs=[audio_input_single, prompt_input_single],
                        label="🧪 Example Prompts"
                    )
                with gr.Column():
                    output_single = gr.Textbox(label="Model Response", lines=15)
            btn_single.click(fn=single_turn_infer, inputs=[audio_input_single, prompt_input_single], outputs=output_single)
with gr.Tab("ℹ️ About"):
gr.Markdown("""
### 🎶 Overview
This demo showcases a **simple single-turn audio–language interface** built for integration with the **WeaveMuse** framework.
It uses the open-source **NVIDIA Audio Flamingo 3** model for audio understanding, transcription, and sound reasoning.
You can upload an audio file and ask natural-language questions such as:
- “What kind of sound is this?”
- “Describe the scene.”
- “Transcribe any speech.”
**Acknowledgment:**
Model and research credit to **NVIDIA**, for the development of the open **Audio Flamingo 3** model and datasets used for training.
This interface is a simplified demonstration of how such models can be integrated into broader creative AI systems like WeaveMuse.
**Tech stack:** Gradio + PyTorch + llava + WeaveMuse Integration
""")
gr.Markdown("© 2025 WeaveMuse | Model © NVIDIA — Audio Flamingo 3 | Built with ❤️ using Gradio & PyTorch")
# -----------------------
# Launch App
# -----------------------
if __name__ == "__main__":
    # share=True creates a public tunnel link when run locally; Hugging Face
    # Spaces ignores it and serves the app directly.
    demo.launch(share=True)