Spaces · Running on Zero
Merve Noyan committed

Commit e352103 · Parent(s): e716569

initial commit
Browse files
- app.py +164 -0
- example_images/art_critic.png +0 -0
- example_images/dummy_pdf.png +0 -0
- example_images/s2w_example.png +0 -0
- example_images/travel_tips.jpg +0 -0
- requirements.txt +6 -0
    	
        app.py
    ADDED
    
import gradio as gr
from transformers import AutoProcessor, Idefics3ForConditionalGeneration
import re
import time
from PIL import Image
import torch
import spaces
import subprocess

# Install flash-attn at runtime, skipping its CUDA build step.
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)


processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics3-8b-new")

model = Idefics3ForConditionalGeneration.from_pretrained("HuggingFaceM4/idefics3-8b-new",
        torch_dtype=torch.bfloat16,
        #_attn_implementation="flash_attention_2",
        trust_remote_code=True).to("cuda")

# Token ids the model must never emit: the image placeholder tokens.
BAD_WORDS_IDS = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
EOS_WORDS_IDS = [processor.tokenizer.eos_token_id]

#@spaces.GPU
def model_inference(
    images, text, decoding_strategy, temperature, max_new_tokens,
    repetition_penalty, top_p
):
    if text == "" and not images:
        raise gr.Error("Please input a query and optionally image(s).")

    if text == "" and images:
        raise gr.Error("Please input a text query along with the image(s).")

    if isinstance(images, Image.Image):
        images = [images]

    if isinstance(text, str):
        # Prepend the image placeholder so the processor interleaves the image with the prompt.
        text = "<image>" + text
        text = [text]

    inputs = processor(text=text, images=images, padding=True, return_tensors="pt").to("cuda")
    print("inputs", inputs)

    assert decoding_strategy in [
        "Greedy",
        "Top P Sampling",
    ]
    if decoding_strategy == "Greedy":
        do_sample = False
    elif decoding_strategy == "Top P Sampling":
        do_sample = True

    # Generate
    generated_ids = model.generate(**inputs, bad_words_ids=BAD_WORDS_IDS, max_new_tokens=max_new_tokens,
                                   temperature=temperature, do_sample=do_sample, repetition_penalty=repetition_penalty,
                                   top_p=top_p)
    generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
    #generated_texts = processor.batch_decode(generated_ids[:, generation_args["input_ids"].size(1):], skip_special_tokens=True)
    print("INPUT:", text, "|OUTPUT:", generated_texts)
    return generated_texts[0]


with gr.Blocks(fill_height=True) as demo:
    gr.Markdown("## IDEFICS2Llama 🐶")
    gr.Markdown("Play with [IDEFICS2Llama](https://huggingface.co/HuggingFaceM4/idefics2-8b) in this demo. To get started, upload an image and text or try one of the examples.")
    gr.Markdown("**Important note**: This model is not made for chatting; the chatty IDEFICS2 will be released in the upcoming days. **This model is very strong at various tasks, including visual question answering, document retrieval and more, as you can see through the examples.**")
    gr.Markdown("Learn more about IDEFICS2 in this [blog post](https://huggingface.co/blog/idefics2).")


    with gr.Column():
        image_input = gr.Image(label="Upload your Image", type="pil")
        query_input = gr.Textbox(label="Prompt")
        submit_btn = gr.Button("Submit")
        output = gr.Textbox(label="Output")

    with gr.Accordion(label="Example Inputs and Advanced Generation Parameters"):
        examples = [["example_images/travel_tips.jpg", "I want to go somewhere similar to the one in the photo. Give me destinations and travel tips.", "Greedy", 0.4, 512, 1.2, 0.8],
                    ["example_images/dummy_pdf.png", "What percentage is the order status?", "Greedy", 0.4, 512, 1.2, 0.8],
                    ["example_images/art_critic.png", "As an art critic AI assistant, could you describe this painting in detail and write a thorough critique?", "Greedy", 0.4, 512, 1.2, 0.8],
                    ["example_images/s2w_example.png", "What is this UI about?", "Greedy", 0.4, 512, 1.2, 0.8]]

        # Hyper-parameters for generation
        max_new_tokens = gr.Slider(
            minimum=8,
            maximum=1024,
            value=512,
            step=1,
            interactive=True,
            label="Maximum number of new tokens to generate",
        )
        repetition_penalty = gr.Slider(
            minimum=0.01,
            maximum=5.0,
            value=1.2,
            step=0.01,
            interactive=True,
            label="Repetition penalty",
            info="1.0 is equivalent to no penalty",
        )
        temperature = gr.Slider(
            minimum=0.0,
            maximum=5.0,
            value=0.4,
            step=0.1,
            interactive=True,
            label="Sampling temperature",
            info="Higher values will produce more diverse outputs.",
        )
        top_p = gr.Slider(
            minimum=0.01,
            maximum=0.99,
            value=0.8,
            step=0.01,
            interactive=True,
            label="Top P",
            info="Higher values are equivalent to sampling more low-probability tokens.",
        )
        decoding_strategy = gr.Radio(
            [
                "Greedy",
                "Top P Sampling",
            ],
            value="Greedy",
            label="Decoding strategy",
            interactive=True,
            info="Choose between greedy decoding and nucleus (top-p) sampling.",
        )
        # Show the sampling-related sliders only when a sampling strategy is selected.
        decoding_strategy.change(
            fn=lambda selection: gr.Slider(
                visible=(
                    selection in ["contrastive_sampling", "beam_sampling", "Top P Sampling", "sampling_top_k"]
                )
            ),
            inputs=decoding_strategy,
            outputs=temperature,
        )

        decoding_strategy.change(
            fn=lambda selection: gr.Slider(
                visible=(
                    selection in ["contrastive_sampling", "beam_sampling", "Top P Sampling", "sampling_top_k"]
                )
            ),
            inputs=decoding_strategy,
            outputs=repetition_penalty,
        )
        decoding_strategy.change(
            fn=lambda selection: gr.Slider(visible=(selection in ["Top P Sampling"])),
            inputs=decoding_strategy,
            outputs=top_p,
        )
        gr.Examples(
            examples=examples,
            inputs=[image_input, query_input, decoding_strategy, temperature,
                    max_new_tokens, repetition_penalty, top_p],
            outputs=output,
            fn=model_inference,
        )

        submit_btn.click(model_inference,
                         inputs=[image_input, query_input, decoding_strategy, temperature,
                                 max_new_tokens, repetition_penalty, top_p],
                         outputs=output)


demo.launch(debug=True)
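For a quick sanity check of this commit, here is a minimal sketch of calling the inference function directly, without launching the Gradio UI. It assumes app.py's globals (processor, model) are already loaded on a CUDA machine; the image path is one of the example images added in this commit, and the prompt is illustrative.

# Smoke-test model_inference outside the UI (a sketch, not part of the commit).
from PIL import Image

img = Image.open("example_images/travel_tips.jpg")
answer = model_inference(
    images=img,
    text="Describe this image.",
    decoding_strategy="Greedy",
    temperature=0.4,          # only used when sampling is enabled
    max_new_tokens=512,
    repetition_penalty=1.2,
    top_p=0.8,                # only used when sampling is enabled
)
print(answer)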
    	
        example_images/art_critic.png
    ADDED
    
    	
        example_images/dummy_pdf.png
    ADDED
    
    	
        example_images/s2w_example.png
    ADDED
    
    	
        example_images/travel_tips.jpg
    ADDED
    
    	
        requirements.txt
    ADDED
    
torch
accelerate
huggingface_hub
gradio
git+https://github.com/andimarafioti/transformers.git@idefics3
spaces
 
			
