Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -22,12 +22,12 @@ model_think = PeftModel.from_pretrained(
     torch_dtype=torch.float16,
 )
 
-# ---------------------------------
-# MULTI-TURN MODEL SETUP
-# ---------------------------------
-MODEL_BASE_MULTI = snapshot_download(repo_id="nvidia/audio-flamingo-3-chat")
-model_multi = llava.load(MODEL_BASE_MULTI, model_base=None, devices=[0])
-generation_config_multi = model_multi.default_generation_config
+# # ---------------------------------
+# # MULTI-TURN MODEL SETUP
+# # ---------------------------------
+# MODEL_BASE_MULTI = snapshot_download(repo_id="nvidia/audio-flamingo-3-chat")
+# model_multi = llava.load(MODEL_BASE_MULTI, model_base=None, devices=[0])
+# generation_config_multi = model_multi.default_generation_config
 
 
 # ---------------------------------
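Note: with the setup block above commented out, `model_multi` and `generation_config_multi` are no longer defined at import time, so every remaining reference to them must be disabled as well; the following hunks do exactly that for `speech_prompt_infer`, `multi_turn_chat`, and the corresponding UI wiring.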
@@ -42,14 +42,14 @@ def single_turn_infer(audio_file, prompt_text):
     except Exception as e:
         return f"❌ Error: {str(e)}"
 
-def speech_prompt_infer(audio_prompt_file):
-    try:
-        sound = llava.Sound(audio_prompt_file)
-        full_prompt = "<sound>"
-        response = model_multi.generate_content([sound, full_prompt], generation_config=generation_config_single)
-        return response
-    except Exception as e:
-        return f"❌ Error: {str(e)}"
+# def speech_prompt_infer(audio_prompt_file):
+#     try:
+#         sound = llava.Sound(audio_prompt_file)
+#         full_prompt = "<sound>"
+#         response = model_multi.generate_content([sound, full_prompt], generation_config=generation_config_single)
+#         return response
+#     except Exception as e:
+#         return f"❌ Error: {str(e)}"
 
 def think_infer(audio_file, prompt_text):
     try:
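For orientation: the hunk header above references `single_turn_infer`, which this commit leaves active but whose body the diff does not show. A minimal sketch of what it plausibly looks like, following the same llava calling pattern as the commented-out `speech_prompt_infer` (the `model_single` name is an assumption; `generation_config_single` is the config referenced above):

# Sketch only: inferred from the calling pattern in this diff, not copied from app.py.
def single_turn_infer(audio_file, prompt_text):
    try:
        sound = llava.Sound(audio_file)          # wrap the uploaded audio file
        full_prompt = f"<sound>\n{prompt_text}"  # same <sound> token convention as above
        return model_single.generate_content(    # model_single: assumed global, like model_think
            [sound, full_prompt],
            generation_config=generation_config_single,
        )
    except Exception as e:
        return f"❌ Error: {str(e)}"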
@@ -63,24 +63,24 @@ def think_infer(audio_file, prompt_text):
 # ---------------------------------
 # MULTI-TURN INFERENCE FUNCTION
 # ---------------------------------
-def multi_turn_chat(user_input, audio_file, history, current_audio):
-    try:
-        if audio_file is not None:
-            current_audio = audio_file  # Update state if a new file is uploaded
+# def multi_turn_chat(user_input, audio_file, history, current_audio):
+#     try:
+#         if audio_file is not None:
+#             current_audio = audio_file  # Update state if a new file is uploaded
 
-        if current_audio is None:
-            return history + [("System", "❌ Please upload an audio file before chatting.")], history, current_audio
+#         if current_audio is None:
+#             return history + [("System", "❌ Please upload an audio file before chatting.")], history, current_audio
 
-        sound = llava.Sound(current_audio)
-        prompt = f"<sound>\n{user_input}"
+#         sound = llava.Sound(current_audio)
+#         prompt = f"<sound>\n{user_input}"
 
-        response = model_multi.generate_content([sound, prompt], generation_config=generation_config_multi)
+#         response = model_multi.generate_content([sound, prompt], generation_config=generation_config_multi)
 
-        history.append((user_input, response))
-        return history, history, current_audio
-    except Exception as e:
-        history.append((user_input, f"❌ Error: {str(e)}"))
-        return history, history, current_audio
+#         history.append((user_input, response))
+#         return history, history, current_audio
+#     except Exception as e:
+#         history.append((user_input, f"❌ Error: {str(e)}"))
+#         return history, history, current_audio
 # ---------------------------------
 # INTERFACE
 # ---------------------------------
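Reading the commented-out handler: it returns `history` twice (once to render in the `chatbot` component, once to persist in `history_state`) and threads `current_audio` through so follow-up turns reuse the last uploaded file. The matching `btn_multi.click(...)` wiring, disabled in the next hunk, maps these three outputs onto `[chatbot, history_state, current_audio_state]`.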
@@ -193,48 +193,51 @@ with gr.Blocks(css="""
         btn_think.click(fn=think_infer, inputs=[audio_input_think, prompt_input_think], outputs=output_think)
     # ---------------- MULTI-TURN CHAT ----------------
     with gr.Tab("💬 Multi-Turn Chat"):
-        chatbot = gr.Chatbot(label="Audio Chatbot")
-        audio_input_multi = gr.Audio(type="filepath", label="Upload or Replace Audio Context")
-        user_input_multi = gr.Textbox(label="Your message", placeholder="Ask a question about the audio...", lines=8)
-        btn_multi = gr.Button("Send")
-        history_state = gr.State([])  # Chat history
-        current_audio_state = gr.State(None)  # Most recent audio file path
-
-        btn_multi.click(
-            fn=multi_turn_chat,
-            inputs=[user_input_multi, audio_input_multi, history_state, current_audio_state],
-            outputs=[chatbot, history_state, current_audio_state]
-        )
-        gr.Examples(
-            examples=[
-                ["static/chat/audio1.mp3", "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?"],
-                ["static/chat/audio2.mp3", "Switching gears, this one is super energetic and synthetic. If I wanted to remix the calming folk piece into something closer to this, what would you suggest?"],
-            ],
-            inputs=[audio_input_multi, user_input_multi],
-            label="🧪 Try Examples"
-        )
+        # chatbot = gr.Chatbot(label="Audio Chatbot")
+        # audio_input_multi = gr.Audio(type="filepath", label="Upload or Replace Audio Context")
+        # user_input_multi = gr.Textbox(label="Your message", placeholder="Ask a question about the audio...", lines=8)
+        # btn_multi = gr.Button("Send")
+        # history_state = gr.State([])  # Chat history
+        # current_audio_state = gr.State(None)  # Most recent audio file path
+
+        # btn_multi.click(
+        #     fn=multi_turn_chat,
+        #     inputs=[user_input_multi, audio_input_multi, history_state, current_audio_state],
+        #     outputs=[chatbot, history_state, current_audio_state]
+        # )
+        # gr.Examples(
+        #     examples=[
+        #         ["static/chat/audio1.mp3", "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?"],
+        #         ["static/chat/audio2.mp3", "Switching gears, this one is super energetic and synthetic. If I wanted to remix the calming folk piece into something closer to this, what would you suggest?"],
+        #     ],
+        #     inputs=[audio_input_multi, user_input_multi],
+        #     label="🧪 Try Examples"
+        # )
+        # Add the link to another Gradio demo here
+        gr.Markdown("👉 [Check out our other Gradio demo here](https://huggingface.co/spaces/nvidia/audio-flamingo-3-chat)")
 
     with gr.Tab("🗣️ Speech Prompt"):
-        gr.Markdown("Use your **voice** to talk to the model.")
-
-        with gr.Row():
-            with gr.Column():
-                speech_input = gr.Audio(type="filepath", label="Speak or Upload Audio")
-                btn_speech = gr.Button("Submit")
-                gr.Examples(
-                    examples=[
-                        ["static/voice/voice_0.mp3"],
-                        ["static/voice/voice_1.mp3"],
-                        ["static/voice/voice_2.mp3"],
-                    ],
-                    inputs=speech_input,
-                    label="🧪 Try Examples"
-                )
-            with gr.Column():
-                response_box = gr.Textbox(label="Model Response", lines=15)
-
-        btn_speech.click(fn=speech_prompt_infer, inputs=speech_input, outputs=response_box)
-
+        # gr.Markdown("Use your **voice** to talk to the model.")
+
+        # with gr.Row():
+        #     with gr.Column():
+        #         speech_input = gr.Audio(type="filepath", label="Speak or Upload Audio")
+        #         btn_speech = gr.Button("Submit")
+        #         gr.Examples(
+        #             examples=[
+        #                 ["static/voice/voice_0.mp3"],
+        #                 ["static/voice/voice_1.mp3"],
+        #                 ["static/voice/voice_2.mp3"],
+        #             ],
+        #             inputs=speech_input,
+        #             label="🧪 Try Examples"
+        #         )
+        #     with gr.Column():
+        #         response_box = gr.Textbox(label="Model Response", lines=15)
+
+        # btn_speech.click(fn=speech_prompt_infer, inputs=speech_input, outputs=response_box)
+        # Add the link to another Gradio demo here
+        gr.Markdown("👉 [Check out our other Gradio demo here](https://huggingface.co/spaces/nvidia/audio-flamingo-3-chat)")
 
     # ---------------- ABOUT ----------------
     with gr.Tab("📖 About"):
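After this commit, both the Multi-Turn Chat and Speech Prompt tabs collapse to a pointer at the dedicated chat Space. A self-contained sketch of the resulting pattern (tab labels and URL taken from the diff; the rest is illustrative):

import gradio as gr

# Link text and target copied from the gr.Markdown(...) lines added in this commit.
LINK = "👉 [Check out our other Gradio demo here](https://huggingface.co/spaces/nvidia/audio-flamingo-3-chat)"

with gr.Blocks() as demo:
    with gr.Tab("💬 Multi-Turn Chat"):
        gr.Markdown(LINK)  # interactive chat widgets are commented out in this commit
    with gr.Tab("🗣️ Speech Prompt"):
        gr.Markdown(LINK)  # speech-prompt UI likewise disabled

demo.launch()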