Commit ca44264 · Parent(s): 0926815
Fix issues and restore functionality
Co-authored-by: Amp <amp@ampcode.com>
Amp-Thread-ID: https://ampcode.com/threads/T-155d539e-4d6e-4832-a5db-37e81c48dc9a
- app.py +73 -83
- requirements.txt +1 -0
- windows_cuda_fix.md +61 -0
    	
app.py CHANGED

```diff
@@ -4,7 +4,7 @@ import llava
 from peft import PeftModel
 import os
 from huggingface_hub import snapshot_download
-
+
 
 # ---------------------------------
 # SINGLE-TURN MODEL SETUP
@@ -14,11 +14,10 @@ MODEL_BASE_SINGLE = snapshot_download(repo_id="nvidia/audio-flamingo-3")
 MODEL_BASE_THINK = os.path.join(MODEL_BASE_SINGLE, 'stage35')
 
 model_single = llava.load(MODEL_BASE_SINGLE, model_base=None, load_8bit=True)
-model_single_copy = copy.deepcopy(model_single)
 
 # Move the model to GPU
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model_single = model_single.
+model_single = model_single.to(device)
 
 generation_config_single = model_single.default_generation_config
 
@@ -30,12 +29,14 @@ model_think = PeftModel.from_pretrained(
 )
 model_think.to(device)
 
-#
-#
-#
-
-
-
+# ---------------------------------
+# MULTI-TURN MODEL SETUP
+# ---------------------------------
+MODEL_BASE_MULTI = snapshot_download(repo_id="nvidia/audio-flamingo-3-chat")
+model_multi = llava.load(MODEL_BASE_MULTI, model_base=None, load_8bit=True)
+model_multi = model_multi.to(device)
+generation_config_multi = model_multi.default_generation_config
+
 
 
 # ---------------------------------
@@ -45,7 +46,7 @@ def single_turn_infer(audio_file, prompt_text):
     try:
         sound = llava.Sound(audio_file)
         full_prompt = f"<sound>\n{prompt_text}"
-        response = 
+        response = model_single.generate_content([sound, full_prompt], generation_config=generation_config_single)
         return response
     except Exception as e:
         return f"❌ Error: {str(e)}"
@@ -53,24 +54,24 @@ def single_turn_infer(audio_file, prompt_text):
 # ---------------------------------
 # MULTI-TURN INFERENCE FUNCTION
 # ---------------------------------
-
-
-
-
+def multi_turn_chat(user_input, audio_file, history, current_audio):
+    try:
+        if audio_file is not None:
+            current_audio = audio_file  # Update state if a new file is uploaded
 
-
-
+        if current_audio is None:
+            return history + [("System", "❌ Please upload an audio file before chatting.")], history, current_audio
 
-
-
+        sound = llava.Sound(current_audio)
+        prompt = f"<sound>\n{user_input}"
 
-
+        response = model_multi.generate_content([sound, prompt], generation_config=generation_config_multi)
 
-
-
-
-
-
+        history.append((user_input, response))
+        return history, history, current_audio
+    except Exception as e:
+        history.append((user_input, f"❌ Error: {str(e)}"))
+        return history, history, current_audio
 
 def think_infer(audio_file, prompt_text):
     try:
@@ -82,26 +83,17 @@ def think_infer(audio_file, prompt_text):
         return f"❌ Error: {str(e)}"
 
 # ---------------------------------
-#
+# SPEECH PROMPT INFERENCE FUNCTION
 # ---------------------------------
-
-
-
-#
-
-
-
-
-#         sound = llava.Sound(current_audio)
-#         prompt = f"<sound>\n{user_input}"
-
-#         response = model_multi.generate_content([sound, prompt], generation_config=generation_config_multi)
+def speech_prompt_infer(speech_input):
+    try:
+        sound = llava.Sound(speech_input)
+        # For speech prompts, we use the audio itself as both the context and the prompt
+        response = model_multi.generate_content([sound, "<sound>"], generation_config=generation_config_multi)
+        return response
+    except Exception as e:
+        return f"❌ Error: {str(e)}"
 
-#         history.append((user_input, response))
-#         return history, history, current_audio
-#     except Exception as e:
-#         history.append((user_input, f"❌ Error: {str(e)}"))
-#         return history, history, current_audio
 # ---------------------------------
 # INTERFACE
 # ---------------------------------
@@ -216,50 +208,48 @@ with gr.Blocks(css="""
             btn_think.click(fn=think_infer, inputs=[audio_input_think, prompt_input_think], outputs=output_think)
         # ---------------- MULTI-TURN CHAT ----------------
         with gr.Tab("💬 Multi-Turn Chat"):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            # Add the link to another Gradio demo here
+            chatbot = gr.Chatbot(label="Audio Chatbot")
+            audio_input_multi = gr.Audio(type="filepath", label="Upload or Replace Audio Context")
+            user_input_multi = gr.Textbox(label="Your message", placeholder="Ask a question about the audio...", lines=8)
+            btn_multi = gr.Button("Send")
+            history_state = gr.State([])           # Chat history
+            current_audio_state = gr.State(None)   # Most recent audio file path
+
+            btn_multi.click(
+                fn=multi_turn_chat,
+                inputs=[user_input_multi, audio_input_multi, history_state, current_audio_state],
+                outputs=[chatbot, history_state, current_audio_state]
+            )
+            gr.Examples(
+                examples=[
+                    ["static/chat/audio1.mp3", "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?"],
+                    ["static/chat/audio2.mp3", "Switching gears, this one is super energetic and synthetic. If I wanted to remix the calming folk piece into something closer to this, what would you suggest?"],
+                ],
+                inputs=[audio_input_multi, user_input_multi],
+                label="🧪 Try Examples"
+            )
             gr.Markdown("🔗 [Check out our other Gradio demo here](https://huggingface.co/spaces/nvidia/audio-flamingo-3-chat)")
 
         with gr.Tab("🗣️ Speech Prompt"):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            # Add the link to another Gradio demo here
+            gr.Markdown("Use your **voice** to talk to the model.")
+
+            with gr.Row():
+                with gr.Column():
+                    speech_input = gr.Audio(type="filepath", label="Speak or Upload Audio")
+                    btn_speech = gr.Button("Submit")
+                    gr.Examples(
+                        examples=[
+                            ["static/voice/voice_0.mp3"],
+                            ["static/voice/voice_1.mp3"],
+                            ["static/voice/voice_2.mp3"],
+                        ],
+                        inputs=speech_input,
+                        label="🧪 Try Examples"
+                    )
+                with gr.Column():
+                    response_box = gr.Textbox(label="Model Response", lines=15)
+
+            btn_speech.click(fn=speech_prompt_infer, inputs=speech_input, outputs=response_box)
             gr.Markdown("🔗 [Check out our other Gradio demo here](https://huggingface.co/spaces/nvidia/audio-flamingo-3-chat)")
 
         # ---------------- ABOUT ----------------
```
    	
requirements.txt CHANGED

```diff
@@ -4,6 +4,7 @@ hydra-core
 loguru
 Pillow
 pydub
+gradio
 
 
 # Transformers and training utilities
```
    	
windows_cuda_fix.md ADDED (+61 lines)

# Windows CUDA Linking Issues - Troubleshooting Guide

## Issues Identified

### 1. Fixed: torch.cuda.amp Deprecation Warnings ✅
- **Issue**: `torch.cuda.amp.custom_fwd` and `torch.cuda.amp.custom_bwd` deprecation warnings
- **Fix**: Updated `llava/model/qlinear_te.py` to use the `device_type='cuda'` parameter (see the sketch below)
- **Lines changed**: 101 and 153
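
For reference, PyTorch 2.4+ relocates these decorators from `torch.cuda.amp` to `torch.amp` with a required `device_type` argument. A minimal sketch of the before/after pattern, using an illustrative autograd `Function` rather than the actual `qlinear_te.py` code:

```python
import torch

# Illustrative stand-in for the patched autograd Function in qlinear_te.py.
class QLinearFn(torch.autograd.Function):
    @staticmethod
    @torch.amp.custom_fwd(device_type="cuda")   # was: @torch.cuda.amp.custom_fwd (deprecated)
    def forward(ctx, x, weight):
        ctx.save_for_backward(x, weight)
        return x @ weight.t()

    @staticmethod
    @torch.amp.custom_bwd(device_type="cuda")   # was: @torch.cuda.amp.custom_bwd (deprecated)
    def backward(ctx, grad_out):
        x, weight = ctx.saved_tensors
        # Gradients w.r.t. x and weight for y = x @ weight.T
        return grad_out @ weight, grad_out.t() @ x
```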

### 2. Windows CUDA Linking Errors
- **Error**: `LINK : fatal error LNK1181: Eingabedatei "aio.lib" kann nicht geöffnet werden.` (input file "aio.lib" cannot be opened)
- **Error**: `LINK : fatal error LNK1181: Eingabedatei "cufile.lib" kann nicht geöffnet werden.` (input file "cufile.lib" cannot be opened)

## Root Causes and Solutions

### aio.lib Error
- **Cause**: `aio.lib` (Asynchronous I/O) is POSIX-specific and not available on Windows
- **Solution**: This library should not be linked on Windows builds
- **Action**: The CUDA extension build system should exclude it on Windows

### cufile.lib Error
- **Cause**: Missing NVIDIA GPUDirect Storage (GDS) library or incorrect linking
- **Solutions**:
  1. **Install the NVIDIA CUDA Toolkit** with GPUDirect Storage components
  2. **Verify CUDA_PATH** environment variable points to the correct CUDA installation
  3. **Check library paths** in `%CUDA_PATH%\lib\x64\`
  4. **Use dynamic linking** instead of static linking for cuFile on Windows

## Recommended Actions

### 1. Verify CUDA Installation
```cmd
echo %CUDA_PATH%
dir "%CUDA_PATH%\lib\x64\cufile*"
nvcc --version
```

### 2. Check PyTorch CUDA Compatibility
```python
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
print(f"CUDA available: {torch.cuda.is_available()}")
```

### 3. Update Build Configuration
The `setup.py` in `llava/model/coat/optimizer/kernels/` may need Windows-specific modifications (a sketch follows this list):
- Exclude `aio.lib` on Windows
- Ensure proper cuFile library linking
- Add Windows-specific compiler flags if needed
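
As a hedged illustration of the first two points, a platform guard in that `setup.py` could look like the following. The extension name, source list, and the assumption that the build passes a `libraries` list to `CUDAExtension` are all illustrative, not the actual contents of the file:

```python
import sys
from torch.utils.cpp_extension import CUDAExtension

# libaio is POSIX-only, so it must not be passed to the MSVC linker on Windows.
libraries = ["cufile"]
if sys.platform != "win32":
    libraries.append("aio")

ext = CUDAExtension(
    name="fp8_optimizer_kernels",          # illustrative extension name
    sources=["fp8_optimizer_kernels.cu"],  # illustrative source file
    libraries=libraries,
)
```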

### 4. PEFT Version Warning
- **Warning**: PEFT configuration compatibility issue
- **Solution**: Update the PEFT library: `pip install -U peft`

## Status
- ✅ **Fixed**: torch.cuda.amp deprecation warnings
- ⚠️ **Needs attention**: Windows CUDA library linking
- ⚠️ **Recommended**: Update the PEFT library

The application appears to be running despite the linking warnings, suggesting the core functionality is working but with potential performance or stability impacts.