import os

import gradio as gr
import torch
import llava
import spaces
from huggingface_hub import snapshot_download
# ---------------------------------
# SINGLE-TURN MODEL SETUP
# ---------------------------------
MODEL_BASE_SINGLE = snapshot_download(repo_id="nvidia/audio-flamingo-3")
# Path to the "thinking"-stage checkpoint inside the snapshot; not used by
# this single-turn demo but kept for reference.
MODEL_BASE_THINK = os.path.join(MODEL_BASE_SINGLE, 'stage35')

model_single = llava.load(MODEL_BASE_SINGLE, model_base=None)
model_single = model_single.to("cuda")
generation_config_single = model_single.default_generation_config
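# Optional: tighten decoding before serving. The attribute below assumes the
# default generation config follows transformers-style GenerationConfig
# naming (an assumption about the llava wrapper, not confirmed here);
# uncomment and adjust as needed.
# generation_config_single.max_new_tokens = 512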
# ---------------------------------
# SINGLE-TURN INFERENCE FUNCTION
# ---------------------------------
@spaces.GPU  # ZeroGPU: allocate a GPU for the duration of each call
def single_turn_infer(audio_file, prompt_text):
    try:
        sound = llava.Sound(audio_file)
        full_prompt = f"<sound>\n{prompt_text}"
        response = model_single.generate_content(
            [sound, full_prompt], generation_config=generation_config_single
        )
        return response
    except Exception as e:
        return f"❌ Error: {str(e)}"
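# Minimal smoke test, kept commented out so the Space boots straight into the
# UI. The sample path is the first gr.Examples entry below and is assumed to
# exist in the repo.
# print(single_turn_infer("static/emergent/audio1.wav", "What is happening in this audio?"))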
# ---------------------------------
# INTERFACE
# ---------------------------------
with gr.Blocks(css="""
.gradio-container {
    max-width: 100% !important;
    width: 100% !important;
    margin: 0 !important;
    padding: 0 !important;
}
#component-0, .gr-block.gr-box {
    width: 100% !important;
}
.gr-block.gr-box, .gr-column, .gr-row {
    padding: 0 !important;
    margin: 0 !important;
}
""") as demo:
    with gr.Column():
        gr.HTML("""
        <div align="center">
          <img src="https://raw.githubusercontent.com/NVIDIA/audio-flamingo/audio_flamingo_3/static/logo-no-bg.png" alt="Audio Flamingo 3 Logo" width="110" style="margin-bottom: 8px;">
          <h2><strong>Audio Understanding Demo (WeaveMuse Edition)</strong></h2>
          <p><em>Powered by NVIDIA’s open Audio Flamingo 3 model</em></p>
        </div>
        <div align="center" style="margin-top: 10px;">
          <a href="https://huggingface.co/nvidia/audio-flamingo-3">
            <img src="https://img.shields.io/badge/🤗-Model:_audio--flamingo--3-ED5A22.svg">
          </a>
          <a href="https://github.com/NVIDIA/audio-flamingo">
            <img src="https://img.shields.io/badge/GitHub-NVIDIA%2Faudio--flamingo-9C276A.svg">
          </a>
        </div>
        <div align="center" style="margin-top: 8px;">
          <p><small>This simple interface demonstrates audio–language reasoning and transcription using open models from NVIDIA. Integrated for use in the <strong>WeaveMuse</strong> framework.</small></p>
        </div>
        """)
        with gr.Tabs():
            # ---------------- SINGLE-TURN ----------------
            with gr.Tab("🎧 Audio Inference"):
                with gr.Row():
                    with gr.Column():
                        audio_input_single = gr.Audio(type="filepath", label="Upload Audio Clip")
                        prompt_input_single = gr.Textbox(
                            label="Prompt",
                            placeholder="Ask about the audio (e.g. 'Describe the sound' or 'Transcribe any speech')",
                            lines=6,
                        )
                        btn_single = gr.Button("Generate Response")
                        gr.Examples(
                            examples=[
                                ["static/emergent/audio1.wav", "What is happening in this audio?"],
                                ["static/audio/audio2.wav", "Describe the sounds you hear."],
                                ["static/speech/audio3.wav", "Transcribe the spoken words."],
                            ],
                            inputs=[audio_input_single, prompt_input_single],
                            label="🧪 Example Prompts",
                        )
                    with gr.Column():
                        output_single = gr.Textbox(label="Model Response", lines=15)
                btn_single.click(fn=single_turn_infer, inputs=[audio_input_single, prompt_input_single], outputs=output_single)
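                # Optional: let Enter in the prompt box trigger the same
                # handler; gr.Textbox exposes a .submit event mirroring
                # Button.click. Left commented out to keep behavior unchanged.
                # prompt_input_single.submit(fn=single_turn_infer, inputs=[audio_input_single, prompt_input_single], outputs=output_single)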
            with gr.Tab("ℹ️ About"):
                gr.Markdown("""
### 🎶 Overview

This demo showcases a **simple single-turn audio–language interface** built for integration with the **WeaveMuse** framework.
It uses the open-source **NVIDIA Audio Flamingo 3** model for audio understanding, transcription, and sound reasoning.

You can upload an audio file and ask natural-language questions such as:

- “What kind of sound is this?”
- “Describe the scene.”
- “Transcribe any speech.”

**Acknowledgment:**
Model and research credit to **NVIDIA** for developing the open **Audio Flamingo 3** model and the datasets used to train it.
This interface is a simplified demonstration of how such models can be integrated into broader creative AI systems like WeaveMuse.

**Tech stack:** Gradio + PyTorch + llava + WeaveMuse integration
""")

        gr.Markdown("© 2025 WeaveMuse | Model © NVIDIA — Audio Flamingo 3 | Built with ❤️ using Gradio & PyTorch")
# -----------------------
# Launch App
# -----------------------
if __name__ == "__main__":
    # share=True only affects local runs; on Hugging Face Spaces, Gradio
    # ignores it and serves the app directly.
    demo.launch(share=True)