shuai bai committed: Update README.md

README.md CHANGED
````diff
@@ -147,25 +147,25 @@ from qwen_vl_utils import process_vision_info
 
 # default: Load the model on the available device(s)
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    "Qwen/Qwen2.5-VL-
+    "Qwen/Qwen2.5-VL-72B-Instruct", torch_dtype="auto", device_map="auto"
 )
 
 # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
 # model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-#     "Qwen/Qwen2.5-VL-
+#     "Qwen/Qwen2.5-VL-72B-Instruct",
 #     torch_dtype=torch.bfloat16,
 #     attn_implementation="flash_attention_2",
 #     device_map="auto",
 # )
 
 # default processor
-processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-
+processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-72B-Instruct")
 
 # The default range for the number of visual tokens per image in the model is 4-16384.
 # You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
 # min_pixels = 256*28*28
 # max_pixels = 1280*28*28
-# processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-
+# processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-72B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
 
 messages = [
     {
````
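The hunk above cuts off just as `messages` is being defined. For orientation, here is a minimal sketch of how this README example typically continues once the model and processor shown above are loaded; the image URL and the prompt below are placeholders, not part of this commit:

```python
# Minimal usage sketch (not part of this commit); image URL and prompt
# are placeholders.
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-72B-Instruct", torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-72B-Instruct")

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://example.com/demo.jpeg"},  # placeholder
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Render the chat template to a prompt string and gather the vision inputs.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to(model.device)

# Generate, then drop the prompt tokens before decoding.
generated_ids = model.generate(**inputs, max_new_tokens=128)
trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)]
print(processor.batch_decode(trimmed, skip_special_tokens=True))
```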
````diff
@@ -434,7 +434,7 @@ The model supports a wide range of resolution inputs. By default, it uses the na
 min_pixels = 256 * 28 * 28
 max_pixels = 1280 * 28 * 28
 processor = AutoProcessor.from_pretrained(
-    "Qwen/Qwen2.5-VL-
+    "Qwen/Qwen2.5-VL-72B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels
 )
 ```
 
````
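The `min_pixels` / `max_pixels` values in this hunk are plain pixel budgets: the comments in the first hunk equate one visual token with a 28x28-pixel patch, so a 256-1280 token range maps to exactly these bounds. A quick sanity check of that arithmetic (the names below are illustrative only):

```python
# Sanity check of the pixel budgets above (illustrative names only).
PATCH = 28  # pixels per side of one visual-token patch

min_tokens, max_tokens = 256, 1280
min_pixels = min_tokens * PATCH * PATCH  # 200_704
max_pixels = max_tokens * PATCH * PATCH  # 1_003_520

assert (min_pixels, max_pixels) == (256 * 28 * 28, 1280 * 28 * 28)
print(f"{min_tokens}-{max_tokens} tokens -> {min_pixels}-{max_pixels} pixels")
```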