Commit ca44264 · Parent(s): 0926815
Fix issues and restore functionality
Co-authored-by: Amp <amp@ampcode.com>
Amp-Thread-ID: https://ampcode.com/threads/T-155d539e-4d6e-4832-a5db-37e81c48dc9a
- app.py +73 -83
- requirements.txt +1 -0
- windows_cuda_fix.md +61 -0
    	
app.py CHANGED

```diff
@@ -4,7 +4,7 @@ import llava
 from peft import PeftModel
 import os
 from huggingface_hub import snapshot_download
-
+
 
 # ---------------------------------
 # SINGLE-TURN MODEL SETUP
@@ -14,11 +14,10 @@ MODEL_BASE_SINGLE = snapshot_download(repo_id="nvidia/audio-flamingo-3")
 MODEL_BASE_THINK = os.path.join(MODEL_BASE_SINGLE, 'stage35')
 
 model_single = llava.load(MODEL_BASE_SINGLE, model_base=None, load_8bit=True)
-model_single_copy = copy.deepcopy(model_single)
 
 # Move the model to GPU
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model_single = model_single.
+model_single = model_single.to(device)
 
 generation_config_single = model_single.default_generation_config
 
@@ -30,12 +29,14 @@ model_think = PeftModel.from_pretrained(
 )
 model_think.to(device)
 
-#
-#
-#
-
-
-
+# ---------------------------------
+# MULTI-TURN MODEL SETUP
+# ---------------------------------
+MODEL_BASE_MULTI = snapshot_download(repo_id="nvidia/audio-flamingo-3-chat")
+model_multi = llava.load(MODEL_BASE_MULTI, model_base=None, load_8bit=True)
+model_multi = model_multi.to(device)
+generation_config_multi = model_multi.default_generation_config
+
 
 
 # ---------------------------------
@@ -45,7 +46,7 @@ def single_turn_infer(audio_file, prompt_text):
     try:
         sound = llava.Sound(audio_file)
         full_prompt = f"<sound>\n{prompt_text}"
-        response = 
+        response = model_single.generate_content([sound, full_prompt], generation_config=generation_config_single)
         return response
     except Exception as e:
         return f"❌ Error: {str(e)}"
@@ -53,24 +54,24 @@ def single_turn_infer(audio_file, prompt_text):
 # ---------------------------------
 # MULTI-TURN INFERENCE FUNCTION
 # ---------------------------------
-
-
-
-
+def multi_turn_chat(user_input, audio_file, history, current_audio):
+    try:
+        if audio_file is not None:
+            current_audio = audio_file  # Update state if a new file is uploaded
 
-
-
+        if current_audio is None:
+            return history + [("System", "❌ Please upload an audio file before chatting.")], history, current_audio
 
-
-
+        sound = llava.Sound(current_audio)
+        prompt = f"<sound>\n{user_input}"
 
-
+        response = model_multi.generate_content([sound, prompt], generation_config=generation_config_multi)
 
-
-
-
-
-
+        history.append((user_input, response))
+        return history, history, current_audio
+    except Exception as e:
+        history.append((user_input, f"❌ Error: {str(e)}"))
+        return history, history, current_audio
 
 def think_infer(audio_file, prompt_text):
     try:
@@ -82,26 +83,17 @@ def think_infer(audio_file, prompt_text):
         return f"❌ Error: {str(e)}"
 
 # ---------------------------------
-#
+# SPEECH PROMPT INFERENCE FUNCTION
 # ---------------------------------
-
-
-
-#
-
-
-
-
-#         sound = llava.Sound(current_audio)
-#         prompt = f"<sound>\n{user_input}"
-
-#         response = model_multi.generate_content([sound, prompt], generation_config=generation_config_multi)
+def speech_prompt_infer(speech_input):
+    try:
+        sound = llava.Sound(speech_input)
+        # For speech prompts, we use the audio itself as both the context and the prompt
+        response = model_multi.generate_content([sound, "<sound>"], generation_config=generation_config_multi)
+        return response
+    except Exception as e:
+        return f"❌ Error: {str(e)}"
 
-#         history.append((user_input, response))
-#         return history, history, current_audio
-#     except Exception as e:
-#         history.append((user_input, f"❌ Error: {str(e)}"))
-#         return history, history, current_audio
 # ---------------------------------
 # INTERFACE
 # ---------------------------------
@@ -216,50 +208,48 @@ with gr.Blocks(css="""
             btn_think.click(fn=think_infer, inputs=[audio_input_think, prompt_input_think], outputs=output_think)
         # ---------------- MULTI-TURN CHAT ----------------
         with gr.Tab("💬 Multi-Turn Chat"):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            # Add the link to another Gradio demo here
+            chatbot = gr.Chatbot(label="Audio Chatbot")
+            audio_input_multi = gr.Audio(type="filepath", label="Upload or Replace Audio Context")
+            user_input_multi = gr.Textbox(label="Your message", placeholder="Ask a question about the audio...", lines=8)
+            btn_multi = gr.Button("Send")
+            history_state = gr.State([])           # Chat history
+            current_audio_state = gr.State(None)   # Most recent audio file path
+
+            btn_multi.click(
+                fn=multi_turn_chat,
+                inputs=[user_input_multi, audio_input_multi, history_state, current_audio_state],
+                outputs=[chatbot, history_state, current_audio_state]
+            )
+            gr.Examples(
+                examples=[
+                    ["static/chat/audio1.mp3", "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?"],
+                    ["static/chat/audio2.mp3", "Switching gears, this one is super energetic and synthetic. If I wanted to remix the calming folk piece into something closer to this, what would you suggest?"],
+                ],
+                inputs=[audio_input_multi, user_input_multi],
+                label="🧪 Try Examples"
+            )
             gr.Markdown("🔗 [Check out our other Gradio demo here](https://huggingface.co/spaces/nvidia/audio-flamingo-3-chat)")
 
         with gr.Tab("🗣️ Speech Prompt"):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            # Add the link to another Gradio demo here
+            gr.Markdown("Use your **voice** to talk to the model.")
+
+            with gr.Row():
+                with gr.Column():
+                    speech_input = gr.Audio(type="filepath", label="Speak or Upload Audio")
+                    btn_speech = gr.Button("Submit")
+                    gr.Examples(
+                        examples=[
+                            ["static/voice/voice_0.mp3"],
+                            ["static/voice/voice_1.mp3"],
+                            ["static/voice/voice_2.mp3"],
+                        ],
+                        inputs=speech_input,
+                        label="🧪 Try Examples"
+                    )
+                with gr.Column():
+                    response_box = gr.Textbox(label="Model Response", lines=15)
+
+            btn_speech.click(fn=speech_prompt_infer, inputs=speech_input, outputs=response_box)
             gr.Markdown("🔗 [Check out our other Gradio demo here](https://huggingface.co/spaces/nvidia/audio-flamingo-3-chat)")
 
         # ---------------- ABOUT ----------------
```
    	
requirements.txt CHANGED

```diff
@@ -4,6 +4,7 @@ hydra-core
 loguru
 Pillow
 pydub
+gradio
 
 
 # Transformers and training utilities
```
    	
windows_cuda_fix.md ADDED (+61 lines)

# Windows CUDA Linking Issues - Troubleshooting Guide

## Issues Identified

### 1. Fixed: torch.cuda.amp Deprecation Warnings ✅
- **Issue**: `torch.cuda.amp.custom_fwd` and `torch.cuda.amp.custom_bwd` deprecation warnings
- **Fix**: Updated `llava/model/qlinear_te.py` to use the `device_type='cuda'` parameter (see the sketch below)
- **Lines changed**: 101 and 153
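
For reference, PyTorch 2.4+ relocates these decorators from `torch.cuda.amp` to `torch.amp` with a required `device_type` argument. A minimal sketch of the before/after pattern, using an illustrative autograd `Function` rather than the actual `qlinear_te.py` code:

```python
import torch

# Illustrative stand-in for the patched autograd Function in qlinear_te.py.
class QLinearFn(torch.autograd.Function):
    @staticmethod
    @torch.amp.custom_fwd(device_type="cuda")   # was: @torch.cuda.amp.custom_fwd (deprecated)
    def forward(ctx, x, weight):
        ctx.save_for_backward(x, weight)
        return x @ weight.t()

    @staticmethod
    @torch.amp.custom_bwd(device_type="cuda")   # was: @torch.cuda.amp.custom_bwd (deprecated)
    def backward(ctx, grad_out):
        x, weight = ctx.saved_tensors
        # Gradients w.r.t. x and weight for y = x @ weight.T
        return grad_out @ weight, grad_out.t() @ x
```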

### 2. Windows CUDA Linking Errors
- **Error**: `LINK : fatal error LNK1181: Eingabedatei "aio.lib" kann nicht geöffnet werden.` (input file "aio.lib" cannot be opened)
- **Error**: `LINK : fatal error LNK1181: Eingabedatei "cufile.lib" kann nicht geöffnet werden.` (input file "cufile.lib" cannot be opened)

## Root Causes and Solutions

### aio.lib Error
- **Cause**: `aio.lib` (Asynchronous I/O) is POSIX-specific and not available on Windows
- **Solution**: This library should not be linked on Windows builds
- **Action**: The CUDA extension build system should exclude it on Windows

### cufile.lib Error
- **Cause**: Missing NVIDIA GPUDirect Storage (GDS) library or incorrect linking
- **Solutions**:
  1. **Install the NVIDIA CUDA Toolkit** with GPUDirect Storage components
  2. **Verify CUDA_PATH** environment variable points to the correct CUDA installation
  3. **Check library paths** in `%CUDA_PATH%\lib\x64\`
  4. **Use dynamic linking** instead of static linking for cuFile on Windows

## Recommended Actions

### 1. Verify CUDA Installation
```cmd
echo %CUDA_PATH%
dir "%CUDA_PATH%\lib\x64\cufile*"
nvcc --version
```

### 2. Check PyTorch CUDA Compatibility
```python
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
print(f"CUDA available: {torch.cuda.is_available()}")
```

### 3. Update Build Configuration
The `setup.py` in `llava/model/coat/optimizer/kernels/` may need Windows-specific modifications (a sketch follows this list):
- Exclude `aio.lib` on Windows
- Ensure proper cuFile library linking
- Add Windows-specific compiler flags if needed
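
As a hedged illustration of the first two points, a platform guard in that `setup.py` could look like the following. The extension name, source list, and the assumption that the build passes a `libraries` list to `CUDAExtension` are all illustrative, not the actual contents of the file:

```python
import sys
from torch.utils.cpp_extension import CUDAExtension

# libaio is POSIX-only, so it must not be passed to the MSVC linker on Windows.
libraries = ["cufile"]
if sys.platform != "win32":
    libraries.append("aio")

ext = CUDAExtension(
    name="fp8_optimizer_kernels",          # illustrative extension name
    sources=["fp8_optimizer_kernels.cu"],  # illustrative source file
    libraries=libraries,
)
```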

### 4. PEFT Version Warning
- **Warning**: PEFT configuration compatibility issue
- **Solution**: Update the PEFT library: `pip install -U peft`

## Status
- ✅ **Fixed**: torch.cuda.amp deprecation warnings
- ⚠️ **Needs attention**: Windows CUDA library linking
- ⚠️ **Recommended**: Update the PEFT library

The application appears to be running despite the linking warnings, suggesting the core functionality is working but with potential performance or stability impacts.