Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	Commit 
							
							·
						
						f6d4208
	
1
								Parent(s):
							
							8d1279d
								
add explanation
Browse files
- .DS_Store +0 -0
- VAR_explained.png +0 -0
- app.py +102 -27
    	
        .DS_Store
    ADDED
    
    | Binary file (6.15 kB). View file | 
|  | 
    	
        VAR_explained.png
    ADDED
    
    |   | 
    	
        app.py
    CHANGED
    
    | @@ -35,9 +35,9 @@ class SimpleAdapter(nn.Module): | |
| 35 | 
             
                    x = self.norm2(x)
         | 
| 36 | 
             
                    return x
         | 
| 37 |  | 
| 38 | 
            -
            class  | 
| 39 | 
             
                def __init__(self, pl_checkpoint=None, start_class_id=578, hugging_face_token=None, siglip_model='google/siglip-base-patch16-224', device="cpu", MODEL_DEPTH=16):
         | 
| 40 | 
            -
                    super( | 
| 41 | 
             
                    self.device = device
         | 
| 42 | 
             
                    self.class_id = start_class_id
         | 
| 43 | 
             
                    # Define layers
         | 
| @@ -117,12 +117,10 @@ if __name__ == '__main__': | |
| 117 | 
             
                # Initialize the model
         | 
| 118 | 
             
                checkpoint = 'VARtext_v1.pth'  # Replace with your actual checkpoint path
         | 
| 119 | 
             
                device = 'cpu' if not torch.cuda.is_available() else 'cuda'
         | 
| 120 | 
            -
                 | 
| 121 | 
            -
                model = | 
| 122 | 
            -
                model.load_state_dict(state_dict)
         | 
| 123 | 
             
                model.to(device)
         | 
| 124 |  | 
| 125 | 
            -
             | 
| 126 | 
             
                def generate_image_gradio(text, beta=1.0, seed=None, more_smooth=False, top_k=0, top_p=0.9):
         | 
| 127 | 
             
                    print(f"Generating image for text: {text}\n"
         | 
| 128 | 
             
                          f"beta: {beta}\n"
         | 
| @@ -133,34 +131,111 @@ if __name__ == '__main__': | |
| 133 | 
             
                    image = model.generate_image(text, beta=beta, seed=seed, more_smooth=more_smooth, top_k=int(top_k), top_p=top_p)
         | 
| 134 | 
             
                    return image
         | 
| 135 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 136 |  | 
| 137 | 
            -
                with gr.Blocks() as demo:
         | 
| 138 | 
            -
                    gr.Markdown("# PopYou2-VAR")
         | 
| 139 | 
             
                    with gr.Tab("Generate Image"):
         | 
| 140 | 
            -
                         | 
| 141 | 
            -
             | 
| 142 | 
            -
             | 
| 143 | 
            -
             | 
| 144 | 
            -
             | 
| 145 | 
            -
             | 
| 146 | 
            -
             | 
| 147 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
| 148 | 
             
                        generate_button.click(
         | 
| 149 | 
             
                            generate_image_gradio,
         | 
| 150 | 
             
                            inputs=[text_input, beta_input, seed_input, more_smooth_input, top_k_input, top_p_input],
         | 
| 151 | 
             
                            outputs=image_output
         | 
| 152 | 
             
                        )
         | 
| 153 |  | 
| 154 | 
            -
                    gr.Markdown(" | 
| 155 | 
            -
                    with gr.Row():
         | 
| 156 | 
            -
                        example1_text = gr.Textbox(label="Example 1", value="a funko pop figure of a yellow robot tom cruise with headphones on a white background", interactive=False)
         | 
| 157 | 
            -
                        example1_image = gr.Image(label="Generated Image 1", value="examples/tom_cruise_robot.png")  # Replace with the actual path
         | 
| 158 | 
            -
                    with gr.Row():
         | 
| 159 | 
            -
                        example2_text = gr.Textbox(label="Example 2", value="a funko pop figure of a alien Scarlett Johansson holding a shield on a white background", interactive=False)
         | 
| 160 | 
            -
                        example2_image = gr.Image(label="Generated Image 2", value="examples/alien_Scarlett_Johansson.png")  # Replace with the actual path
         | 
| 161 | 
            -
                    with gr.Row():
         | 
| 162 | 
            -
                        example3_text = gr.Textbox(label="Example 3", value="a funko pop figure of a woman with a hat and a pink long hair and blue dress on a white background", interactive=False)
         | 
| 163 | 
            -
                        example3_image = gr.Image(label="Generated Image 3", value="examples/woman_pink.png")  # Replace with the actual path
         | 
| 164 |  | 
| 165 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 166 |  | 
|  | 
|  | |
| 35 | 
             
                    x = self.norm2(x)
         | 
| 36 | 
             
                    return x
         | 
| 37 |  | 
| 38 | 
            +
            class InferenceTextVAR(nn.Module):
         | 
| 39 | 
             
                def __init__(self, pl_checkpoint=None, start_class_id=578, hugging_face_token=None, siglip_model='google/siglip-base-patch16-224', device="cpu", MODEL_DEPTH=16):
         | 
| 40 | 
            +
                    super(InferenceTextVAR, self).__init__()
         | 
| 41 | 
             
                    self.device = device
         | 
| 42 | 
             
                    self.class_id = start_class_id
         | 
| 43 | 
             
                    # Define layers
         | 
|  | |
| 117 | 
             
                # Initialize the model
         | 
| 118 | 
             
                checkpoint = 'VARtext_v1.pth'  # Replace with your actual checkpoint path
         | 
| 119 | 
             
                device = 'cpu' if not torch.cuda.is_available() else 'cuda'
         | 
| 120 | 
            +
                model = InferenceTextVAR(device=device)
         | 
| 121 | 
            +
                model.load_state_dict(torch.load(checkpoint, map_location=device))
         | 
|  | |
| 122 | 
             
                model.to(device)
         | 
| 123 |  | 
|  | |
| 124 | 
             
                def generate_image_gradio(text, beta=1.0, seed=None, more_smooth=False, top_k=0, top_p=0.9):
         | 
| 125 | 
             
                    print(f"Generating image for text: {text}\n"
         | 
| 126 | 
             
                          f"beta: {beta}\n"
         | 
|  | |
| 131 | 
             
                    image = model.generate_image(text, beta=beta, seed=seed, more_smooth=more_smooth, top_k=int(top_k), top_p=top_p)
         | 
| 132 | 
             
                    return image
         | 
| 133 |  | 
| 134 | 
            +
                with gr.Blocks(css="""
         | 
| 135 | 
            +
                .project-item {margin-bottom: 30px;}
         | 
| 136 | 
            +
                .project-tags .tag {display: inline-block; background-color: #e0e0e0; padding: 5px 10px; margin-right: 5px; border-radius: 5px;}
         | 
| 137 | 
            +
                .project-description {margin-top: 20px;}
         | 
| 138 | 
            +
                .github-button, .huggingface-button, .wandb-button {
         | 
| 139 | 
            +
                    display: inline-block; margin-left: 10px; text-decoration: none; font-size: 14px;
         | 
| 140 | 
            +
                    padding: 5px 10px; background-color: #f0f0f0; border-radius: 5px; color: black;
         | 
| 141 | 
            +
                }
         | 
| 142 | 
            +
                .project-content {display: flex; flex-direction: row;}
         | 
| 143 | 
            +
                .project-description {flex: 2; padding-right: 20px;}
         | 
| 144 | 
            +
                .project-options-image {flex: 1;}
         | 
| 145 | 
            +
                .funko-image {width: 100%; max-width: 300px;}
         | 
| 146 | 
            +
                """) as demo:
         | 
| 147 | 
            +
                    gr.Markdown("""
         | 
| 148 | 
            +
                    # PopYou2 - VAR Text
         | 
| 149 | 
            +
             | 
| 150 | 
            +
                    <!-- Project Links -->
         | 
| 151 | 
            +
                    [](https://github.com/amit154154/VAR_clip)
         | 
| 152 | 
            +
                    [](https://api.wandb.ai/links/amit154154/cqccmfsl)
         | 
| 153 | 
            +
             | 
| 154 | 
            +
                    **Tags:** Image Generation, GAN
         | 
| 155 | 
            +
             | 
| 156 | 
            +
                    ## Project Explanation
         | 
| 157 | 
            +
             | 
| 158 | 
            +
                    - **Dataset Generation:** Generated a comprehensive dataset of approximately 100,000 Funko Pop! images with detailed prompts using [SDXL Turbo](https://huggingface.co/stabilityai/sdxl-turbo) for high-quality data creation.
         | 
| 159 | 
            +
                    - **Model Fine-tuning:** Fine-tuned the [Visual AutoRegressive (VAR)](https://arxiv.org/abs/2404.02905) model, pretrained on ImageNet, to adapt it for Funko Pop! generation by injecting a custom embedding representing the "doll" class.
         | 
| 160 | 
            +
                    - **Adapter Training:** Trained an adapter with the frozen [SigLIP image encoder](https://github.com/FoundationVision/VAR) and a lightweight LoRA module to map image embeddings to text representation in a large language model.
         | 
| 161 | 
            +
                    - **Text-to-Image Generation:** Enabled text-to-image generation by replacing the SigLIP image encoder with its text encoder, retaining frozen components such as the VAE and generator for efficiency and quality.
         | 
| 162 | 
            +
                    
         | 
| 163 | 
            +
                    
         | 
| 164 | 
            +
             | 
| 165 | 
            +
             | 
| 166 | 
            +
                    ## Generate Your Own Funko Pop!
         | 
| 167 | 
            +
                    """)
         | 
| 168 |  | 
|  | |
|  | |
| 169 | 
             
                    with gr.Tab("Generate Image"):
         | 
| 170 | 
            +
                        with gr.Row():
         | 
| 171 | 
            +
                            with gr.Column(scale=1):
         | 
| 172 | 
            +
                                text_input = gr.Textbox(label="Input Text", placeholder="Enter a description for your Funko Pop!")
         | 
| 173 | 
            +
                                beta_input = gr.Slider(label="Beta", minimum=0.0, maximum=2.5, step=0.05, value=1.0)
         | 
| 174 | 
            +
                                seed_input = gr.Number(label="Seed", value=None)
         | 
| 175 | 
            +
                                more_smooth_input = gr.Checkbox(label="More Smooth", value=False)
         | 
| 176 | 
            +
                                top_k_input = gr.Number(label="Top K", value=0)
         | 
| 177 | 
            +
                                top_p_input = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, step=0.01, value=0.5)
         | 
| 178 | 
            +
                                generate_button = gr.Button("Generate Image")
         | 
| 179 | 
            +
                            with gr.Column(scale=1):
         | 
| 180 | 
            +
                                image_output = gr.Image(label="Generated Image")
         | 
| 181 | 
            +
             | 
| 182 | 
             
                        generate_button.click(
         | 
| 183 | 
             
                            generate_image_gradio,
         | 
| 184 | 
             
                            inputs=[text_input, beta_input, seed_input, more_smooth_input, top_k_input, top_p_input],
         | 
| 185 | 
             
                            outputs=image_output
         | 
| 186 | 
             
                        )
         | 
| 187 |  | 
| 188 | 
            +
                    gr.Markdown("## Examples")
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 189 |  | 
| 190 | 
            +
                    with gr.Row():
         | 
| 191 | 
            +
                        with gr.Column():
         | 
| 192 | 
            +
                            gr.Markdown("### Example 1")
         | 
| 193 | 
            +
                            gr.Markdown("A Funko Pop figure of a yellow robot Tom Cruise with headphones on a white background")
         | 
| 194 | 
            +
                            example1_image = gr.Image(value="examples/tom_cruise_robot.png")  # Replace with the actual path
         | 
| 195 | 
            +
             | 
| 196 | 
            +
                        with gr.Column():
         | 
| 197 | 
            +
                            gr.Markdown("### Example 2")
         | 
| 198 | 
            +
                            gr.Markdown("A Funko Pop figure of an alien Scarlett Johansson holding a shield on a white background")
         | 
| 199 | 
            +
                            example2_image = gr.Image(value="examples/alien_Scarlett_Johansson.png")  # Replace with the actual path
         | 
| 200 | 
            +
             | 
| 201 | 
            +
                        with gr.Column():
         | 
| 202 | 
            +
                            gr.Markdown("### Example 3")
         | 
| 203 | 
            +
                            gr.Markdown("A Funko Pop figure of a woman with a hat and pink long hair and blue dress on a white background")
         | 
| 204 | 
            +
                            example3_image = gr.Image(value="examples/woman_pink.png")  # Replace with the actual path
         | 
| 205 | 
            +
             | 
| 206 | 
            +
                    gr.Markdown("""
         | 
| 207 | 
            +
                    ## Customize Your Funko Pop!
         | 
| 208 | 
            +
             | 
| 209 | 
            +
                    Build your own Funko Pop! by selecting options below and clicking "Generate Custom Funko Pop!".
         | 
| 210 | 
            +
             | 
| 211 | 
            +
                    """)
         | 
| 212 | 
            +
             | 
| 213 | 
            +
                    def update_custom_image(famous_name, character, action):
         | 
| 214 | 
            +
                        # Build the prompt based on the selections
         | 
| 215 | 
            +
                        parts = []
         | 
| 216 | 
            +
                        if famous_name != "None":
         | 
| 217 | 
            +
                            parts.append(f"a Funko Pop figure of {famous_name}")
         | 
| 218 | 
            +
                        else:
         | 
| 219 | 
            +
                            parts.append("a Funko Pop figure")
         | 
| 220 | 
            +
                        if character != "None":
         | 
| 221 | 
            +
                            parts.append(f"styled as a {character}")
         | 
| 222 | 
            +
                        if action != "None":
         | 
| 223 | 
            +
                            parts.append(f"performing {action}")
         | 
| 224 | 
            +
                        parts.append("on a white background")
         | 
| 225 | 
            +
                        prompt = ", ".join(parts)
         | 
| 226 | 
            +
                        image = model.generate_image(prompt)
         | 
| 227 | 
            +
                        return image
         | 
| 228 | 
            +
             | 
| 229 | 
            +
                    famous_name_input = gr.Dropdown(choices=["None", "Donald Trump", "Johnny Depp", "Oprah Winfrey"], label="Famous Name", value="None")
         | 
| 230 | 
            +
                    character_input = gr.Dropdown(choices=["None", "Alien", "Robot"], label="Character", value="None")
         | 
| 231 | 
            +
                    action_input = gr.Dropdown(choices=["None", "Playing the Guitar", "Holding the Sword"], label="Action", value="None")
         | 
| 232 | 
            +
                    custom_generate_button = gr.Button("Generate Custom Funko Pop!")
         | 
| 233 | 
            +
                    custom_image_output = gr.Image(label="Custom Funko Pop!")
         | 
| 234 | 
            +
             | 
| 235 | 
            +
                    custom_generate_button.click(
         | 
| 236 | 
            +
                        update_custom_image,
         | 
| 237 | 
            +
                        inputs=[famous_name_input, character_input, action_input],
         | 
| 238 | 
            +
                        outputs=custom_image_output
         | 
| 239 | 
            +
                    )
         | 
| 240 |  | 
| 241 | 
            +
                demo.launch()
         | 
