Commit 4361fd1 · Parent(s): ba43302
John Ho committed on

updated app to load multiple models

Files changed:
- README.md +3 -3
- app.py +39 -13
- requirements.txt +2 -2
README.md CHANGED

@@ -1,13 +1,13 @@
 ---
-title:
+title: Video Captioning
 emoji: 📸
 colorFrom: blue
 colorTo: yellow
 sdk: gradio
 sdk_version: 5.32.0
-app_file: app.py
+app_file: app.py
 pinned: false
-short_description:
+short_description: Using VLMs for video captioning
 ---

 # The HuggingFace Space Template
app.py CHANGED

@@ -1,8 +1,8 @@
-from statistics import quantiles
 import spaces, ffmpeg, os, sys, torch
 import gradio as gr
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
+    AutoModelForImageTextToText,
     AutoProcessor,
     BitsAndBytesConfig,
 )
@@ -85,8 +85,7 @@ def load_model(
         )
     )
     # Set model to evaluation mode for inference (disables dropout, etc.)
-    model.eval()
-    return model
+    return model.eval()


 def load_processor(model_name="Qwen/Qwen2.5-VL-7B-Instruct"):
@@ -98,23 +97,49 @@ def load_processor(model_name="Qwen/Qwen2.5-VL-7B-Instruct"):
     )


-
-
+logger.debug("Loading Models and Processors...")
+MODEL_ZOO = {
+    "qwen2.5-vl-7b-cam-motion-preview": load_model(
+        model_name="chancharikm/qwen2.5-vl-7b-cam-motion-preview",
+        use_flash_attention=False,
+        apply_quantization=False,
+    ),
+    "qwen2.5-vl-7b-instruct": load_model(
+        model_name="Qwen/Qwen2.5-VL-7B-Instruct",
+        use_flash_attention=False,
+        apply_quantization=False,
+    ),
+    "qwen2.5-vl-3b-instruct": load_model(
+        model_name="Qwen/Qwen2.5-VL-3B-Instruct",
+        use_flash_attention=False,
+        apply_quantization=False,
+    ),
+}
+
+PROCESSORS = {
+    "qwen2.5-vl-7b-cam-motion-preview": load_processor("Qwen/Qwen2.5-VL-7B-Instruct"),
+    "qwen2.5-vl-7b-instruct": load_processor("Qwen/Qwen2.5-VL-7B-Instruct"),
+    "qwen2.5-vl-3b-instruct": load_processor("Qwen/Qwen2.5-VL-3B-Instruct"),
+}
+logger.debug("Models and Processors Loaded!")


 @spaces.GPU(duration=120)
 def inference(
     video_path: str,
     prompt: str = "Describe the camera motion in this video.",
-    use_flash_attention: bool = True,
-    apply_quantization: bool = True,
+    model_name: str = "qwen2.5-vl-7b-instruct",
+    # use_flash_attention: bool = True,
+    # apply_quantization: bool = True,
 ):
     # default processor
     # processor, model = PROCESSOR, MODEL
-    processor = load_processor()
-    model = load_model(
-        use_flash_attention=use_flash_attention, apply_quantization=apply_quantization
-    )
+    # processor = load_processor()
+    # model = load_model(
+    #     use_flash_attention=use_flash_attention, apply_quantization=apply_quantization
+    # )
+    model = MODEL_ZOO[model_name]
+    processor = PROCESSORS[model_name]

     # The model is trained on 8.0 FPS which we recommend for optimal inference
     fps = get_fps_ffmpeg(video_path)
@@ -173,8 +198,9 @@ demo = gr.Interface(
     inputs=[
         gr.Video(label="Input Video"),
         gr.Textbox(label="Prompt", value="Describe the camera motion in this video."),
-        gr.Checkbox(label="Use Flash Attention", value=False),
-        gr.Checkbox(label="Apply Quantization", value=True),
+        gr.Dropdown(label="Model", choices=list(MODEL_ZOO.keys())),
+        # gr.Checkbox(label="Use Flash Attention", value=False),
+        # gr.Checkbox(label="Apply Quantization", value=True),
     ],
     outputs=gr.JSON(label="Output JSON"),
     title="",
    	
requirements.txt CHANGED

@@ -1,10 +1,10 @@
 torch
 torchvision
 #transformers==4.51.3
-transformers
+transformers
 accelerate
 qwen-vl-utils
 ffmpeg-python
 loguru
 bitsandbytes
-scipy
+scipy
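For reference, a hypothetical direct call to the updated inference() entry point, mirroring what the new Gradio dropdown does. The model_name values are the MODEL_ZOO keys from app.py; the video path is a placeholder.

# Hypothetical local invocation; "sample.mp4" is a placeholder path.
result = inference(
    video_path="sample.mp4",
    prompt="Describe the camera motion in this video.",
    model_name="qwen2.5-vl-3b-instruct",  # any key of MODEL_ZOO / PROCESSORS
)
print(result)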
