Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -22,12 +22,12 @@ model_think = PeftModel.from_pretrained(
     torch_dtype=torch.float16,
 )
 
-# ---------------------------------
-# MULTI-TURN MODEL SETUP
-# ---------------------------------
-MODEL_BASE_MULTI = snapshot_download(repo_id="nvidia/audio-flamingo-3-chat")
-model_multi = llava.load(MODEL_BASE_MULTI, model_base=None, devices=[0])
-generation_config_multi = model_multi.default_generation_config
+# # ---------------------------------
+# # MULTI-TURN MODEL SETUP
+# # ---------------------------------
+# MODEL_BASE_MULTI = snapshot_download(repo_id="nvidia/audio-flamingo-3-chat")
+# model_multi = llava.load(MODEL_BASE_MULTI, model_base=None, devices=[0])
+# generation_config_multi = model_multi.default_generation_config
 
 
 # ---------------------------------
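Note: with the setup block above commented out, `model_multi` and `generation_config_multi` are no longer defined at import time, so every remaining reference to them must be disabled as well; the following hunks do exactly that for `speech_prompt_infer`, `multi_turn_chat`, and the corresponding UI wiring.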
@@ -42,14 +42,14 @@ def single_turn_infer(audio_file, prompt_text):
     except Exception as e:
         return f"❌ Error: {str(e)}"
 
-def speech_prompt_infer(audio_prompt_file):
-    try:
-        sound = llava.Sound(audio_prompt_file)
-        full_prompt = "<sound>"
-        response = model_multi.generate_content([sound, full_prompt], generation_config=generation_config_single)
-        return response
-    except Exception as e:
-        return f"❌ Error: {str(e)}"
+# def speech_prompt_infer(audio_prompt_file):
+#     try:
+#         sound = llava.Sound(audio_prompt_file)
+#         full_prompt = "<sound>"
+#         response = model_multi.generate_content([sound, full_prompt], generation_config=generation_config_single)
+#         return response
+#     except Exception as e:
+#         return f"❌ Error: {str(e)}"
 
 def think_infer(audio_file, prompt_text):
     try:
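For orientation: the hunk header above references `single_turn_infer`, which this commit leaves active but whose body the diff does not show. A minimal sketch of what it plausibly looks like, following the same llava calling pattern as the commented-out `speech_prompt_infer` (the `model_single` name is an assumption; `generation_config_single` is the config referenced above):

# Sketch only: inferred from the calling pattern in this diff, not copied from app.py.
def single_turn_infer(audio_file, prompt_text):
    try:
        sound = llava.Sound(audio_file)          # wrap the uploaded audio file
        full_prompt = f"<sound>\n{prompt_text}"  # same <sound> token convention as above
        return model_single.generate_content(    # model_single: assumed global, like model_think
            [sound, full_prompt],
            generation_config=generation_config_single,
        )
    except Exception as e:
        return f"❌ Error: {str(e)}"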
@@ -63,24 +63,24 @@ def think_infer(audio_file, prompt_text):
 # ---------------------------------
 # MULTI-TURN INFERENCE FUNCTION
 # ---------------------------------
-def multi_turn_chat(user_input, audio_file, history, current_audio):
-    try:
-        if audio_file is not None:
-            current_audio = audio_file  # Update state if a new file is uploaded
+# def multi_turn_chat(user_input, audio_file, history, current_audio):
+#     try:
+#         if audio_file is not None:
+#             current_audio = audio_file  # Update state if a new file is uploaded
 
-        if current_audio is None:
-            return history + [("System", "❌ Please upload an audio file before chatting.")], history, current_audio
+#         if current_audio is None:
+#             return history + [("System", "❌ Please upload an audio file before chatting.")], history, current_audio
 
-        sound = llava.Sound(current_audio)
-        prompt = f"<sound>\n{user_input}"
+#         sound = llava.Sound(current_audio)
+#         prompt = f"<sound>\n{user_input}"
 
-        response = model_multi.generate_content([sound, prompt], generation_config=generation_config_multi)
+#         response = model_multi.generate_content([sound, prompt], generation_config=generation_config_multi)
 
-        history.append((user_input, response))
-        return history, history, current_audio
-    except Exception as e:
-        history.append((user_input, f"❌ Error: {str(e)}"))
-        return history, history, current_audio
+#         history.append((user_input, response))
+#         return history, history, current_audio
+#     except Exception as e:
+#         history.append((user_input, f"❌ Error: {str(e)}"))
+#         return history, history, current_audio
 # ---------------------------------
 # INTERFACE
 # ---------------------------------
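Reading the commented-out handler: it returns `history` twice (once to render in the `chatbot` component, once to persist in `history_state`) and threads `current_audio` through so follow-up turns reuse the last uploaded file. The matching `btn_multi.click(...)` wiring, disabled in the next hunk, maps these three outputs onto `[chatbot, history_state, current_audio_state]`.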
@@ -193,48 +193,51 @@ with gr.Blocks(css="""
         btn_think.click(fn=think_infer, inputs=[audio_input_think, prompt_input_think], outputs=output_think)
     # ---------------- MULTI-TURN CHAT ----------------
     with gr.Tab("💬 Multi-Turn Chat"):
-        chatbot = gr.Chatbot(label="Audio Chatbot")
-        audio_input_multi = gr.Audio(type="filepath", label="Upload or Replace Audio Context")
-        user_input_multi = gr.Textbox(label="Your message", placeholder="Ask a question about the audio...", lines=8)
-        btn_multi = gr.Button("Send")
-        history_state = gr.State([])  # Chat history
-        current_audio_state = gr.State(None)  # Most recent audio file path
-
-        btn_multi.click(
-            fn=multi_turn_chat,
-            inputs=[user_input_multi, audio_input_multi, history_state, current_audio_state],
-            outputs=[chatbot, history_state, current_audio_state]
-        )
-        gr.Examples(
-            examples=[
-                ["static/chat/audio1.mp3", "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?"],
-                ["static/chat/audio2.mp3", "Switching gears, this one is super energetic and synthetic. If I wanted to remix the calming folk piece into something closer to this, what would you suggest?"],
-            ],
-            inputs=[audio_input_multi, user_input_multi],
-            label="🧪 Try Examples"
-        )
+        # chatbot = gr.Chatbot(label="Audio Chatbot")
+        # audio_input_multi = gr.Audio(type="filepath", label="Upload or Replace Audio Context")
+        # user_input_multi = gr.Textbox(label="Your message", placeholder="Ask a question about the audio...", lines=8)
+        # btn_multi = gr.Button("Send")
+        # history_state = gr.State([])  # Chat history
+        # current_audio_state = gr.State(None)  # Most recent audio file path
+
+        # btn_multi.click(
+        #     fn=multi_turn_chat,
+        #     inputs=[user_input_multi, audio_input_multi, history_state, current_audio_state],
+        #     outputs=[chatbot, history_state, current_audio_state]
+        # )
+        # gr.Examples(
+        #     examples=[
+        #         ["static/chat/audio1.mp3", "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?"],
+        #         ["static/chat/audio2.mp3", "Switching gears, this one is super energetic and synthetic. If I wanted to remix the calming folk piece into something closer to this, what would you suggest?"],
+        #     ],
+        #     inputs=[audio_input_multi, user_input_multi],
+        #     label="🧪 Try Examples"
+        # )
+        # Add the link to another Gradio demo here
+        gr.Markdown("👉 [Check out our other Gradio demo here](https://huggingface.co/spaces/nvidia/audio-flamingo-3-chat)")
 
     with gr.Tab("🗣️ Speech Prompt"):
-        gr.Markdown("Use your **voice** to talk to the model.")
-
-        with gr.Row():
-            with gr.Column():
-                speech_input = gr.Audio(type="filepath", label="Speak or Upload Audio")
-                btn_speech = gr.Button("Submit")
-                gr.Examples(
-                    examples=[
-                        ["static/voice/voice_0.mp3"],
-                        ["static/voice/voice_1.mp3"],
-                        ["static/voice/voice_2.mp3"],
-                    ],
-                    inputs=speech_input,
-                    label="🧪 Try Examples"
-                )
-            with gr.Column():
-                response_box = gr.Textbox(label="Model Response", lines=15)
-
-        btn_speech.click(fn=speech_prompt_infer, inputs=speech_input, outputs=response_box)
-
+        # gr.Markdown("Use your **voice** to talk to the model.")
+
+        # with gr.Row():
+        #     with gr.Column():
+        #         speech_input = gr.Audio(type="filepath", label="Speak or Upload Audio")
+        #         btn_speech = gr.Button("Submit")
+        #         gr.Examples(
+        #             examples=[
+        #                 ["static/voice/voice_0.mp3"],
+        #                 ["static/voice/voice_1.mp3"],
+        #                 ["static/voice/voice_2.mp3"],
+        #             ],
+        #             inputs=speech_input,
+        #             label="🧪 Try Examples"
+        #         )
+        #     with gr.Column():
+        #         response_box = gr.Textbox(label="Model Response", lines=15)
+
+        # btn_speech.click(fn=speech_prompt_infer, inputs=speech_input, outputs=response_box)
+        # Add the link to another Gradio demo here
+        gr.Markdown("👉 [Check out our other Gradio demo here](https://huggingface.co/spaces/nvidia/audio-flamingo-3-chat)")
 
     # ---------------- ABOUT ----------------
     with gr.Tab("📖 About"):
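After this commit, both the Multi-Turn Chat and Speech Prompt tabs collapse to a pointer at the dedicated chat Space. A self-contained sketch of the resulting pattern (tab labels and URL taken from the diff; the rest is illustrative):

import gradio as gr

# Link text and target copied from the gr.Markdown(...) lines added in this commit.
LINK = "👉 [Check out our other Gradio demo here](https://huggingface.co/spaces/nvidia/audio-flamingo-3-chat)"

with gr.Blocks() as demo:
    with gr.Tab("💬 Multi-Turn Chat"):
        gr.Markdown(LINK)  # interactive chat widgets are commented out in this commit
    with gr.Tab("🗣️ Speech Prompt"):
        gr.Markdown(LINK)  # speech-prompt UI likewise disabled

demo.launch()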