import gradio as gr
import clip
from model import ClipCaptionModel
from transformers import GPT2Tokenizer
import numpy as np
import torch
from PIL import Image
from predict import generate2, generate_beam
from huggingface_hub import hf_hub_download
CPU = torch.device('cpu')
device = "cpu"

clip_model, preprocess = clip.load("RN50x4", device=device, jit=False)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
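# One CapDec checkpoint per noise-injection level from the paper
# (repo names follow the pattern johko/capdec_<level>, each hosting a single model.pt).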
model_0 = hf_hub_download('johko/capdec_0', 'model.pt')
model_001 = hf_hub_download('johko/capdec_001', 'model.pt')
model_005 = hf_hub_download('johko/capdec_005', 'model.pt')
model_015 = hf_hub_download('johko/capdec_015', 'model.pt')
model_025 = hf_hub_download('johko/capdec_025', 'model.pt')
model_05 = hf_hub_download('johko/capdec_05', 'model.pt')
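# hf_hub_download caches files in the local Hugging Face cache, so repeated calls
# (and Space restarts on the same machine) reuse the already-downloaded weights.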
def load_noise_level_model(noise_level):
    """Load the CapDec captioning model trained with the selected noise level."""
    if noise_level == "0.0":
        model_path = model_0
    elif noise_level == "0.001":
        model_path = model_001
    elif noise_level == "0.005":
        model_path = model_005
    elif noise_level == "0.015":
        model_path = model_015
    elif noise_level == "0.025":
        model_path = model_025
    elif noise_level == "0.05":
        model_path = model_05
    else:
        raise ValueError("Unknown Noise Level")

    model = ClipCaptionModel()
    model.load_state_dict(torch.load(model_path, map_location=CPU))
    model = model.eval()
    model = model.to(device)
    return model
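# Note: load_noise_level_model() re-instantiates the model and reloads the checkpoint on
# every request. If that becomes a bottleneck, the loader could be memoized; a minimal
# sketch (not part of the original app) using functools.lru_cache:
#
#     from functools import lru_cache
#
#     @lru_cache(maxsize=None)
#     def load_noise_level_model_cached(noise_level: str):
#         return load_noise_level_model(noise_level)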
def infer(input_image: np.ndarray, noise_level: str):
    """Caption the image with the CapDec model trained at the selected noise level."""
    use_beam_search = True
    model = load_noise_level_model(noise_level)

    pil_image = Image.fromarray(input_image)
    image = preprocess(pil_image).unsqueeze(0).to(device)

    with torch.no_grad():
        # Encode the image with CLIP and project it into a GPT-2 prefix of 40 token embeddings.
        prefix = clip_model.encode_image(image).to(device, dtype=torch.float32)
        prefix_embed = model.clip_project(prefix).reshape(1, 40, -1)
        if use_beam_search:
            generated_text_prefix = generate_beam(model, tokenizer, embed=prefix_embed)[0]
        else:
            generated_text_prefix = generate2(model, tokenizer, embed=prefix_embed)

    return input_image, generated_text_prefix
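# Quick sanity check outside the Gradio UI (a sketch; assumes the bundled example image
# examples/flickr_ex2.jpg is present, as referenced in the examples list below):
#
#     img = np.array(Image.open("examples/flickr_ex2.jpg").convert("RGB"))
#     _, caption = infer(img, "0.015")
#     print(caption)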
description = """This space is a demo for the paper [*Text-Only Training for Image Captioning using Noise-Injected CLIP*](https://arxiv.org/pdf/2211.00575.pdf)
by David Nukrai, Ron Mokady and Amir Globerson.

The paper shows how to train an image captioning model from text only, by injecting noise into the CLIP embeddings at different noise levels,
which you can also experiment with in this demo. The generated caption will change depending on the noise level you choose."""
dropdown = gr.components.Dropdown(["0.0", "0.001", "0.005", "0.015", "0.025", "0.05"], value="0.015", label="Noise Level")
input_image = gr.components.Image(label="Input Image")
output_image = gr.components.Image(label="Image")
output_text = gr.components.Textbox(label="Generated Caption")
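# infer() returns the input image unchanged alongside the caption, so the image is
# displayed next to the generated text in the interface.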
iface = gr.Interface(
    title="CapDec Image Captioning",
    description=description,
    fn=infer,
    inputs=[input_image, dropdown],
    outputs=[output_image, output_text],
    examples=[["examples/flickr_ex2.jpg", "0.015"], ["examples/web_ex3.jpeg", "0.015"]])

iface.launch()