Spaces:

buxiangzhiren
/

GeoRemover

Running on Zero

File size: 4,554 Bytes

5a0778e

import gc
import unittest

import numpy as np
import torch
from transformers import AutoTokenizer

from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, OmniGenPipeline, OmniGenTransformer2DModel
from diffusers.utils.testing_utils import (
    numpy_cosine_similarity_distance,
    require_torch_gpu,
    slow,
    torch_device,
)

from ..test_pipelines_common import PipelineTesterMixin


class OmniGenPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
    pipeline_class = OmniGenPipeline
    params = frozenset(["prompt", "guidance_scale"])
    batch_params = frozenset(["prompt"])

    test_layerwise_casting = True

    def get_dummy_components(self):
        torch.manual_seed(0)

        transformer = OmniGenTransformer2DModel(
            hidden_size=16,
            num_attention_heads=4,
            num_key_value_heads=4,
            intermediate_size=32,
            num_layers=1,
            in_channels=4,
            time_step_dim=4,
            rope_scaling={"long_factor": list(range(1, 3)), "short_factor": list(range(1, 3))},
        )

        torch.manual_seed(0)
        vae = AutoencoderKL(
            sample_size=32,
            in_channels=3,
            out_channels=3,
            block_out_channels=(4, 4, 4, 4),
            layers_per_block=1,
            latent_channels=4,
            norm_num_groups=1,
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"],
        )

        scheduler = FlowMatchEulerDiscreteScheduler(invert_sigmas=True, num_train_timesteps=1)
        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")

        components = {
            "transformer": transformer,
            "vae": vae,
            "scheduler": scheduler,
            "tokenizer": tokenizer,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device="cpu").manual_seed(seed)

        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 1,
            "guidance_scale": 3.0,
            "output_type": "np",
            "height": 16,
            "width": 16,
        }
        return inputs

    def test_inference(self):
        pipe = self.pipeline_class(**self.get_dummy_components()).to(torch_device)

        inputs = self.get_dummy_inputs(torch_device)
        generated_image = pipe(**inputs).images[0]

        self.assertEqual(generated_image.shape, (16, 16, 3))


@slow
@require_torch_gpu
class OmniGenPipelineSlowTests(unittest.TestCase):
    pipeline_class = OmniGenPipeline
    repo_id = "shitao/OmniGen-v1-diffusers"

    def setUp(self):
        super().setUp()
        gc.collect()
        torch.cuda.empty_cache()

    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device="cpu").manual_seed(seed)

        return {
            "prompt": "A photo of a cat",
            "num_inference_steps": 2,
            "guidance_scale": 2.5,
            "output_type": "np",
            "generator": generator,
        }

    def test_omnigen_inference(self):
        pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.bfloat16)
        pipe.enable_model_cpu_offload()

        inputs = self.get_inputs(torch_device)

        image = pipe(**inputs).images[0]
        image_slice = image[0, :10, :10]

        expected_slice = np.array(
            [
                [0.1783447, 0.16772744, 0.14339337],
                [0.17066911, 0.15521264, 0.13757327],
                [0.17072496, 0.15531206, 0.13524258],
                [0.16746324, 0.1564025, 0.13794944],
                [0.16490817, 0.15258026, 0.13697758],
                [0.16971767, 0.15826806, 0.13928896],
                [0.16782972, 0.15547255, 0.13783783],
                [0.16464645, 0.15281534, 0.13522372],
                [0.16535294, 0.15301755, 0.13526791],
                [0.16365296, 0.15092957, 0.13443318],
            ],
            dtype=np.float32,
        )

        max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), image_slice.flatten())

        assert max_diff < 1e-4