Spaces:
Running
on
Zero
Running
on
Zero
| # coding=utf-8 | |
| # Copyright 2023 HuggingFace Inc. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| import gc | |
| import random | |
| import unittest | |
| import numpy as np | |
| import torch | |
| from transformers import XLMRobertaTokenizer | |
| from diffusers import ( | |
| AltDiffusionImg2ImgPipeline, | |
| AutoencoderKL, | |
| PNDMScheduler, | |
| UNet2DConditionModel, | |
| ) | |
| from diffusers.image_processor import VaeImageProcessor | |
| from diffusers.pipelines.alt_diffusion.modeling_roberta_series import ( | |
| RobertaSeriesConfig, | |
| RobertaSeriesModelWithTransformation, | |
| ) | |
| from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device | |
| from diffusers.utils.testing_utils import require_torch_gpu | |
# Disable TF32 for CUDA matmuls for the whole test module.
# NOTE(review): presumably so the hard-coded reference slices below are compared
# against full-precision fp32 results on Ampere+ GPUs — confirm.
torch.backends.cuda.matmul.allow_tf32 = False
class AltDiffusionImg2ImgPipelineFastTests(unittest.TestCase):
    """Tests of ``AltDiffusionImg2ImgPipeline`` assembled from tiny dummy components.

    The ``dummy_*`` members are properties: every test reads them as plain
    attributes (e.g. ``self.dummy_image.to(device)``), and each access builds
    a fresh, seeded object.
    """

    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    @property
    def dummy_image(self):
        """Build a seeded float tensor for shape (1, 3, 32, 32) and move it to ``torch_device``."""
        batch_size = 1
        num_channels = 3
        sizes = (32, 32)

        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
        return image

    @property
    def dummy_cond_unet(self):
        """Build a small ``UNet2DConditionModel`` after seeding torch with 0."""
        torch.manual_seed(0)
        model = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        return model

    @property
    def dummy_vae(self):
        """Build a small ``AutoencoderKL`` after seeding torch with 0."""
        torch.manual_seed(0)
        model = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        return model

    @property
    def dummy_text_encoder(self):
        """Build a small ``RobertaSeriesModelWithTransformation`` after seeding torch with 0."""
        torch.manual_seed(0)
        config = RobertaSeriesConfig(
            hidden_size=32,
            project_dim=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=5006,
        )
        return RobertaSeriesModelWithTransformation(config)

    @property
    def dummy_extractor(self):
        """Return a stub feature extractor.

        The returned callable ignores its arguments and yields an object that
        exposes an empty ``pixel_values`` tensor and a chainable ``to(device)``.
        """

        def extract(*args, **kwargs):
            class Out:
                def __init__(self):
                    self.pixel_values = torch.ones([0])

                def to(self, device):
                    # Tensor.to() is not in-place: keep the moved tensor,
                    # otherwise this method silently does nothing.
                    self.pixel_values = self.pixel_values.to(device)
                    return self

            return Out()

        return extract

    def test_stable_diffusion_img2img_default_case(self):
        """Run two inference steps on CPU and compare against a reference slice.

        Both the dict-style output (``.images``) and the tuple output
        (``return_dict=False``) are checked against the same expected values.
        """
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        # make sure here that pndm scheduler skips prk
        scheduler = PNDMScheduler(skip_prk_steps=True)

        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta")
        tokenizer.model_max_length = 77

        init_image = self.dummy_image.to(device)

        alt_pipe = AltDiffusionImg2ImgPipeline(
            unet=unet,
            scheduler=scheduler,
            vae=vae,
            text_encoder=bert,
            tokenizer=tokenizer,
            safety_checker=None,
            feature_extractor=self.dummy_extractor,
        )
        alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor, do_normalize=False)
        alt_pipe = alt_pipe.to(device)
        alt_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=device).manual_seed(0)
        output = alt_pipe(
            [prompt],
            generator=generator,
            guidance_scale=6.0,
            num_inference_steps=2,
            output_type="np",
            image=init_image,
        )

        image = output.images

        # Re-seed so the tuple-returning call sees the same noise.
        generator = torch.Generator(device=device).manual_seed(0)
        image_from_tuple = alt_pipe(
            [prompt],
            generator=generator,
            guidance_scale=6.0,
            num_inference_steps=2,
            output_type="np",
            image=init_image,
            return_dict=False,
        )[0]

        # Bottom-right 3x3 patch of the last channel.
        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.4115, 0.3870, 0.4089, 0.4807, 0.4668, 0.4144, 0.4151, 0.4721, 0.4569])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 5e-3

    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
    def test_stable_diffusion_img2img_fp16(self):
        """Test that stable diffusion img2img works with fp16"""
        unet = self.dummy_cond_unet
        # make sure here that pndm scheduler skips prk
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta")
        tokenizer.model_max_length = 77

        init_image = self.dummy_image.to(torch_device)

        # put models in fp16
        unet = unet.half()
        vae = vae.half()
        bert = bert.half()

        alt_pipe = AltDiffusionImg2ImgPipeline(
            unet=unet,
            scheduler=scheduler,
            vae=vae,
            text_encoder=bert,
            tokenizer=tokenizer,
            safety_checker=None,
            feature_extractor=self.dummy_extractor,
        )
        alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor, do_normalize=False)
        alt_pipe = alt_pipe.to(torch_device)
        alt_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.manual_seed(0)
        image = alt_pipe(
            [prompt],
            generator=generator,
            num_inference_steps=2,
            output_type="np",
            image=init_image,
        ).images

        # Only the output shape is checked here, not the pixel values.
        assert image.shape == (1, 32, 32, 3)

    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
    def test_stable_diffusion_img2img_pipeline_multiple_of_8(self):
        """Run the pretrained pipeline on a 760x504 input and check a reference slice."""
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/img2img/sketch-mountains-input.jpg"
        )
        # resize to resolution that is divisible by 8 but not 16 or 32
        init_image = init_image.resize((760, 504))

        model_id = "BAAI/AltDiffusion"
        pipe = AltDiffusionImg2ImgPipeline.from_pretrained(
            model_id,
            safety_checker=None,
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "A fantasy landscape, trending on artstation"

        generator = torch.manual_seed(0)
        output = pipe(
            prompt=prompt,
            image=init_image,
            strength=0.75,
            guidance_scale=7.5,
            generator=generator,
            output_type="np",
        )
        image = output.images[0]

        # 3x3 patch of the last channel taken from the middle of the image.
        image_slice = image[255:258, 383:386, -1]

        assert image.shape == (504, 760, 3)
        expected_slice = np.array([0.9358, 0.9397, 0.9599, 0.9901, 1.0000, 1.0000, 0.9882, 1.0000, 1.0000])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
@slow
@require_torch_gpu
class AltDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
    """Integration test running the pretrained ``BAAI/AltDiffusion`` checkpoint.

    Marked ``@slow`` and ``@require_torch_gpu`` because it loads the full
    checkpoint and fetches reference data over the network.
    """

    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def test_stable_diffusion_img2img_pipeline_default(self):
        """Generate a 768x512 image from a sketch and compare it with a stored reference array."""
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/img2img/sketch-mountains-input.jpg"
        )
        init_image = init_image.resize((768, 512))
        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/fantasy_landscape_alt.npy"
        )

        model_id = "BAAI/AltDiffusion"
        pipe = AltDiffusionImg2ImgPipeline.from_pretrained(
            model_id,
            safety_checker=None,
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "A fantasy landscape, trending on artstation"

        generator = torch.manual_seed(0)
        output = pipe(
            prompt=prompt,
            image=init_image,
            strength=0.75,
            guidance_scale=7.5,
            generator=generator,
            output_type="np",
        )
        image = output.images[0]

        assert image.shape == (512, 768, 3)
        # img2img is flaky across GPUs even in fp32.
        # NOTE(review): the earlier comment said "MAE", but this check bounds the
        # *maximum* absolute error, not the mean — confirm which metric is intended.
        assert np.abs(expected_image - image).max() < 1e-3