import os

import gradio as gr
import torch
import llava
import spaces
from huggingface_hub import snapshot_download
# ---------------------------------
# SINGLE-TURN MODEL SETUP
# ---------------------------------
MODEL_BASE_SINGLE = snapshot_download(repo_id="nvidia/audio-flamingo-3")
# Path to the "thinking"-stage checkpoint inside the snapshot; not used by
# this single-turn demo but kept for reference.
MODEL_BASE_THINK = os.path.join(MODEL_BASE_SINGLE, 'stage35')

model_single = llava.load(MODEL_BASE_SINGLE, model_base=None)
model_single = model_single.to("cuda")
generation_config_single = model_single.default_generation_config
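# Optional: tighten decoding before serving. The attribute below assumes the
# default generation config follows transformers-style GenerationConfig
# naming (an assumption about the llava wrapper, not confirmed here);
# uncomment and adjust as needed.
# generation_config_single.max_new_tokens = 512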
# ---------------------------------
# SINGLE-TURN INFERENCE FUNCTION
# ---------------------------------
@spaces.GPU  # ZeroGPU: allocate a GPU for the duration of each call
def single_turn_infer(audio_file, prompt_text):
    try:
        sound = llava.Sound(audio_file)
        full_prompt = f"<sound>\n{prompt_text}"
        response = model_single.generate_content(
            [sound, full_prompt], generation_config=generation_config_single
        )
        return response
    except Exception as e:
        return f"❌ Error: {str(e)}"
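# Minimal smoke test, kept commented out so the Space boots straight into the
# UI. The sample path is the first gr.Examples entry below and is assumed to
# exist in the repo.
# print(single_turn_infer("static/emergent/audio1.wav", "What is happening in this audio?"))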
# ---------------------------------
# INTERFACE
# ---------------------------------
with gr.Blocks(css="""
.gradio-container {
    max-width: 100% !important;
    width: 100% !important;
    margin: 0 !important;
    padding: 0 !important;
}
#component-0, .gr-block.gr-box {
    width: 100% !important;
}
.gr-block.gr-box, .gr-column, .gr-row {
    padding: 0 !important;
    margin: 0 !important;
}
""") as demo:
    with gr.Column():
        gr.HTML("""
        <div align="center">
          <img src="https://raw.githubusercontent.com/NVIDIA/audio-flamingo/audio_flamingo_3/static/logo-no-bg.png" alt="Audio Flamingo 3 Logo" width="110" style="margin-bottom: 8px;">
          <h2><strong>Audio Understanding Demo (WeaveMuse Edition)</strong></h2>
          <p><em>Powered by NVIDIA’s open Audio Flamingo 3 model</em></p>
        </div>
        <div align="center" style="margin-top: 10px;">
          <a href="https://huggingface.co/nvidia/audio-flamingo-3">
            <img src="https://img.shields.io/badge/🤗-Model:_audio--flamingo--3-ED5A22.svg">
          </a>
          <a href="https://github.com/NVIDIA/audio-flamingo">
            <img src="https://img.shields.io/badge/GitHub-NVIDIA%2Faudio--flamingo-9C276A.svg">
          </a>
        </div>
        <div align="center" style="margin-top: 8px;">
          <p><small>This simple interface demonstrates audio–language reasoning and transcription using open models from NVIDIA. Integrated for use in the <strong>WeaveMuse</strong> framework.</small></p>
        </div>
        """)
        with gr.Tabs():
            # ---------------- SINGLE-TURN ----------------
            with gr.Tab("🎧 Audio Inference"):
                with gr.Row():
                    with gr.Column():
                        audio_input_single = gr.Audio(type="filepath", label="Upload Audio Clip")
                        prompt_input_single = gr.Textbox(
                            label="Prompt",
                            placeholder="Ask about the audio (e.g. 'Describe the sound' or 'Transcribe any speech')",
                            lines=6,
                        )
                        btn_single = gr.Button("Generate Response")
                        gr.Examples(
                            examples=[
                                ["static/emergent/audio1.wav", "What is happening in this audio?"],
                                ["static/audio/audio2.wav", "Describe the sounds you hear."],
                                ["static/speech/audio3.wav", "Transcribe the spoken words."],
                            ],
                            inputs=[audio_input_single, prompt_input_single],
                            label="🧪 Example Prompts",
                        )
                    with gr.Column():
                        output_single = gr.Textbox(label="Model Response", lines=15)
                btn_single.click(fn=single_turn_infer, inputs=[audio_input_single, prompt_input_single], outputs=output_single)
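                # Optional: let Enter in the prompt box trigger the same
                # handler; gr.Textbox exposes a .submit event mirroring
                # Button.click. Left commented out to keep behavior unchanged.
                # prompt_input_single.submit(fn=single_turn_infer, inputs=[audio_input_single, prompt_input_single], outputs=output_single)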
            with gr.Tab("ℹ️ About"):
                gr.Markdown("""
### 🎶 Overview

This demo showcases a **simple single-turn audio–language interface** built for integration with the **WeaveMuse** framework.
It uses the open-source **NVIDIA Audio Flamingo 3** model for audio understanding, transcription, and sound reasoning.

You can upload an audio file and ask natural-language questions such as:

- “What kind of sound is this?”
- “Describe the scene.”
- “Transcribe any speech.”

**Acknowledgment:**
Model and research credit to **NVIDIA** for developing the open **Audio Flamingo 3** model and the datasets used to train it.
This interface is a simplified demonstration of how such models can be integrated into broader creative AI systems like WeaveMuse.

**Tech stack:** Gradio + PyTorch + llava + WeaveMuse integration
""")

        gr.Markdown("© 2025 WeaveMuse | Model © NVIDIA — Audio Flamingo 3 | Built with ❤️ using Gradio & PyTorch")
# -----------------------
# Launch App
# -----------------------
if __name__ == "__main__":
    # share=True only affects local runs; on Hugging Face Spaces, Gradio
    # ignores it and serves the app directly.
    demo.launch(share=True)