UDiffText

Build error

App Files Files Community

ZYMPKU committed on Dec 11, 2023

Commit

251f521

1 Parent(s): a34ca7e

v 2.0

Browse files

Files changed (23) hide show

.gitignore +1 -2
app.py +14 -26
checkpoints/{st-step=100000+la-step=100000-simp.ckpt → st-step=100000+la-step=100000-v2.ckpt} +2 -2
configs/demo.yaml +2 -3
configs/test/textdesign_sd_2.yaml +17 -23
sgm/models/diffusion.py +1 -0
sgm/modules/__init__.py +1 -1
sgm/modules/attention.py +62 -622
sgm/modules/diffusionmodules/__init__.py +1 -1
sgm/modules/diffusionmodules/guiders.py +1 -29
sgm/modules/diffusionmodules/loss.py +59 -9
sgm/modules/diffusionmodules/openaimodel.py +193 -1638
sgm/modules/diffusionmodules/sampling.py +1 -183
sgm/modules/diffusionmodules/sampling_utils.py +1 -4
sgm/modules/diffusionmodules/wrappers.py +2 -2
sgm/modules/encoders/modules.py +43 -50
temp/attn_map/attn_map_3.png +0 -0
temp/attn_map/attn_map_4.png +0 -0
temp/attn_map/attn_map_5.png +0 -0
temp/seg_map/seg_3.npy +3 -0
temp/seg_map/seg_4.npy +3 -0
temp/seg_map/seg_5.npy +3 -0
util.py +25 -84

.gitignore CHANGED Viewed

	@@ -1,2 +1 @@
1	- **/__pycache__
2	- process.ipynb


1	+ **/__pycache__

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ from omegaconf import OmegaConf
 from contextlib import nullcontext
 from pytorch_lightning import seed_everything
 from os.path import join as ospj
 from util import *
@@ -18,30 +18,17 @@ def predict(cfgs, model, sampler, batch):
     with context():
-        batch, batch_uc_1, batch_uc_2 = prepare_batch(cfgs, batch)
-        if cfgs.dual_conditioner:
-            c, uc_1, uc_2 = model.conditioner.get_unconditional_conditioning(
-                batch,
-                batch_uc_1=batch_uc_1,
-                batch_uc_2=batch_uc_2,
-                force_uc_zero_embeddings=cfgs.force_uc_zero_embeddings,
-            )
-        else:
-            c, uc_1 = model.conditioner.get_unconditional_conditioning(
-                batch,
-                batch_uc=batch_uc_1,
-                force_uc_zero_embeddings=cfgs.force_uc_zero_embeddings,
-            )
-        if cfgs.dual_conditioner:
-            x = sampler.get_init_noise(cfgs, model, cond=c, batch=batch, uc_1=uc_1, uc_2=uc_2)
-            samples_z = sampler(model, x, cond=c, batch=batch, uc_1=uc_1, uc_2=uc_2, init_step=0,
-                                aae_enabled = cfgs.aae_enabled, detailed = cfgs.detailed)
-        else:
-            x = sampler.get_init_noise(cfgs, model, cond=c, batch=batch, uc=uc_1)
-            samples_z = sampler(model, x, cond=c, batch=batch, uc=uc_1, init_step=0,
-                                aae_enabled = cfgs.aae_enabled, detailed = cfgs.detailed)
         samples_x = model.decode_first_stage(samples_z)
         samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
@@ -131,6 +118,7 @@ def demo_predict(input_blk, text, num_samples, steps, scale, seed, show_detail):
 if __name__ == "__main__":
     os.makedirs("./temp/attn_map", exist_ok=True)
     os.makedirs("./temp/seg_map", exist_ok=True)
@@ -151,7 +139,7 @@ if __name__ == "__main__":
                     UDiffText: A Unified Framework for High-quality Text Synthesis in Arbitrary Images via Character-aware Diffusion Models
                 </h1>
                 <ul style="text-align: center; margin: 0.5rem;">
-                    <li style="display: inline-block; margin:auto;"><a href='https://arxiv.org/pdf/******'><img src='https://img.shields.io/badge/Arxiv-******-DF826C'></a></li>
                     <li style="display: inline-block; margin:auto;"><a href='https://github.com/ZYM-PKU/UDiffText'><img src='https://img.shields.io/badge/Code-UDiffText-D0F288'></a></li>
                     <li style="display: inline-block; margin:auto;"><a href='https://udifftext.github.io'><img src='https://img.shields.io/badge/Project-UDiffText-8ADAB2'></a></li>
                 </ul>
@@ -177,7 +165,7 @@ if __name__ == "__main__":
                     steps = gr.Slider(label="Steps", info ="denoising sampling steps", minimum=1, maximum=200, value=50, step=1)
                     scale = gr.Slider(label="Guidance Scale", info="the scale of classifier-free guidance (CFG)", minimum=0.0, maximum=10.0, value=4.0, step=0.1)
                     seed = gr.Slider(label="Seed", info="random seed for noise initialization", minimum=0, maximum=2147483647, step=1, randomize=True)
-                    show_detail = gr.Checkbox(label="Show Detail", info="show the additional visualization results", value=True)
             with gr.Column():

 from contextlib import nullcontext
 from pytorch_lightning import seed_everything
 from os.path import join as ospj
 from util import *
     with context():
+        batch, batch_uc_1 = prepare_batch(cfgs, batch)
+        c, uc_1 = model.conditioner.get_unconditional_conditioning(
+            batch,
+            batch_uc=batch_uc_1,
+            force_uc_zero_embeddings=cfgs.force_uc_zero_embeddings,
+        )
+        x = sampler.get_init_noise(cfgs, model, cond=c, batch=batch, uc=uc_1)
+        samples_z = sampler(model, x, cond=c, batch=batch, uc=uc_1, init_step=0,
+                            aae_enabled = cfgs.aae_enabled, detailed = cfgs.detailed)
         samples_x = model.decode_first_stage(samples_z)
         samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
 if __name__ == "__main__":
+    os.makedirs("./temp", exist_ok=True)
     os.makedirs("./temp/attn_map", exist_ok=True)
     os.makedirs("./temp/seg_map", exist_ok=True)
                     UDiffText: A Unified Framework for High-quality Text Synthesis in Arbitrary Images via Character-aware Diffusion Models
                 </h1>
                 <ul style="text-align: center; margin: 0.5rem;">
+                    <li style="display: inline-block; margin:auto;"><a href='https://arxiv.org/abs/2312.04884'><img src='https://img.shields.io/badge/Arxiv-2312.04884-DF826C'></a></li>
                     <li style="display: inline-block; margin:auto;"><a href='https://github.com/ZYM-PKU/UDiffText'><img src='https://img.shields.io/badge/Code-UDiffText-D0F288'></a></li>
                     <li style="display: inline-block; margin:auto;"><a href='https://udifftext.github.io'><img src='https://img.shields.io/badge/Project-UDiffText-8ADAB2'></a></li>
                 </ul>
                     steps = gr.Slider(label="Steps", info ="denoising sampling steps", minimum=1, maximum=200, value=50, step=1)
                     scale = gr.Slider(label="Guidance Scale", info="the scale of classifier-free guidance (CFG)", minimum=0.0, maximum=10.0, value=4.0, step=0.1)
                     seed = gr.Slider(label="Seed", info="random seed for noise initialization", minimum=0, maximum=2147483647, step=1, randomize=True)
+                    show_detail = gr.Checkbox(label="Show Detail", info="show the additional visualization results", value=False)
             with gr.Column():

checkpoints/{st-step=100000+la-step=100000-simp.ckpt → st-step=100000+la-step=100000-v2.ckpt} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:968397df8910f3324d94ce3df7e9d70f1bf2415a46d22edef1a510885ee0648e
-size 2558065830

 version https://git-lfs.github.com/spec/v1
+oid sha256:b87a307ed6e240208b415166e88c0f3e6467ec9330836d70c6d662f423bfbc15
+size 4173692086

configs/demo.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 type: "demo"
 # path
-load_ckpt_path: "./checkpoints/st-step=100000+la-step=100000-simp.ckpt"
 model_cfg_path: "./configs/test/textdesign_sd_2.yaml"
 # param
@@ -16,8 +16,7 @@ scale: [4.0, 0.0] # content scale, style scale
 noise_iters: 10
 force_uc_zero_embeddings: ["ref", "label"]
 aae_enabled: False
-detailed: True
-dual_conditioner: False
 # runtime
 steps: 50

 type: "demo"
 # path
+load_ckpt_path: "./checkpoints/st-step=100000+la-step=100000-v2.ckpt"
 model_cfg_path: "./configs/test/textdesign_sd_2.yaml"
 # param
 noise_iters: 10
 force_uc_zero_embeddings: ["ref", "label"]
 aae_enabled: False
+detailed: False
 # runtime
 steps: 50

configs/test/textdesign_sd_2.yaml CHANGED Viewed

@@ -1,6 +1,8 @@
 model:
   target: sgm.models.diffusion.DiffusionEngine
   params:
     input_key: image
     scale_factor: 0.18215
     disable_first_stage_autocast: True
@@ -18,54 +20,45 @@ model:
           target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
     network_config:
-      target: sgm.modules.diffusionmodules.openaimodel.UNetAddModel
       params:
-        use_checkpoint: False
         in_channels: 9
         out_channels: 4
         ctrl_channels: 0
         model_channels: 320
         attention_resolutions: [4, 2, 1]
-        attn_type: add_attn
-        attn_layers:
-          - output_blocks.6.1
         num_res_blocks: 2
         channel_mult: [1, 2, 4, 4]
         num_head_channels: 64
-        use_spatial_transformer: True
         use_linear_in_transformer: True
         transformer_depth: 1
-        context_dim: 0
-        add_context_dim: 2048
-        legacy: False
     conditioner_config:
       target: sgm.modules.GeneralConditioner
       params:
         emb_models:
-          # crossattn cond
-          # - is_trainable: False
-          #   input_key: txt
-          #   target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder
-          #   params:
-          #     arch: ViT-H-14
-          #     version: ./checkpoints/encoders/OpenCLIP/ViT-H-14/open_clip_pytorch_model.bin
-          #     layer: penultimate
-          # add crossattn cond
           - is_trainable: False
             input_key: label
             target: sgm.modules.encoders.modules.LabelEncoder
             params:
-              is_add_embedder: True
               max_len: 12
               emb_dim: 2048
               n_heads: 8
               n_trans_layers: 12
-              ckpt_path: ./checkpoints/encoders/LabelEncoder/epoch=19-step=7820.ckpt # ./checkpoints/encoders/LabelEncoder/epoch=19-step=7820.ckpt
           # concat cond
           - is_trainable: False
             input_key: mask
-            target: sgm.modules.encoders.modules.IdentityEncoder
           - is_trainable: False
             input_key: masked
             target: sgm.modules.encoders.modules.LatentEncoder
@@ -95,6 +88,7 @@ model:
     first_stage_config:
       target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
       params:
         embed_dim: 4
         monitor: val/rec_loss
         ddconfig:
@@ -117,9 +111,9 @@ model:
       params:
         seq_len: 12
         kernel_size: 3
-        gaussian_sigma: 0.5
         min_attn_size: 16
-        lambda_local_loss: 0.02
         lambda_ocr_loss: 0.001
         ocr_enabled: False

 model:
   target: sgm.models.diffusion.DiffusionEngine
   params:
+    opt_keys:
+      - t_attn
     input_key: image
     scale_factor: 0.18215
     disable_first_stage_autocast: True
           target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
     network_config:
+      target: sgm.modules.diffusionmodules.openaimodel.UnifiedUNetModel
       params:
         in_channels: 9
         out_channels: 4
         ctrl_channels: 0
         model_channels: 320
         attention_resolutions: [4, 2, 1]
+        save_attn_type: [t_attn]
+        save_attn_layers: [output_blocks.6.1]
         num_res_blocks: 2
         channel_mult: [1, 2, 4, 4]
         num_head_channels: 64
         use_linear_in_transformer: True
         transformer_depth: 1
+        t_context_dim: 2048
     conditioner_config:
       target: sgm.modules.GeneralConditioner
       params:
         emb_models:
+          # textual crossattn cond
           - is_trainable: False
+            emb_key: t_crossattn
+            ucg_rate: 0.1
             input_key: label
             target: sgm.modules.encoders.modules.LabelEncoder
             params:
               max_len: 12
               emb_dim: 2048
               n_heads: 8
               n_trans_layers: 12
+              ckpt_path: ./checkpoints/encoders/LabelEncoder/epoch=19-step=7820.ckpt
           # concat cond
           - is_trainable: False
             input_key: mask
+            target: sgm.modules.encoders.modules.SpatialRescaler
+            params:
+              in_channels: 1
+              multiplier: 0.125
           - is_trainable: False
             input_key: masked
             target: sgm.modules.encoders.modules.LatentEncoder
     first_stage_config:
       target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
       params:
+        ckpt_path: ./checkpoints/AEs/AE_inpainting_2.safetensors
         embed_dim: 4
         monitor: val/rec_loss
         ddconfig:
       params:
         seq_len: 12
         kernel_size: 3
+        gaussian_sigma: 1.0
         min_attn_size: 16
+        lambda_local_loss: 0.01
         lambda_ocr_loss: 0.001
         ocr_enabled: False

sgm/models/diffusion.py CHANGED Viewed

@@ -5,6 +5,7 @@ import pytorch_lightning as pl
 import torch
 from omegaconf import ListConfig, OmegaConf
 from safetensors.torch import load_file as load_safetensors
 from ..modules import UNCONDITIONAL_CONFIG
 from ..modules.diffusionmodules.wrappers import OPENAIUNETWRAPPER

 import torch
 from omegaconf import ListConfig, OmegaConf
 from safetensors.torch import load_file as load_safetensors
+from torch.optim.lr_scheduler import LambdaLR
 from ..modules import UNCONDITIONAL_CONFIG
 from ..modules.diffusionmodules.wrappers import OPENAIUNETWRAPPER

sgm/modules/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from .encoders.modules import GeneralConditioner, DualConditioner
 UNCONDITIONAL_CONFIG = {
     "target": "sgm.modules.GeneralConditioner",

+from .encoders.modules import GeneralConditioner
 UNCONDITIONAL_CONFIG = {
     "target": "sgm.modules.GeneralConditioner",

sgm/modules/attention.py CHANGED Viewed

@@ -5,53 +5,15 @@ from typing import Any, Optional
 import torch
 import torch.nn.functional as F
 from einops import rearrange, repeat
-from packaging import version
 from torch import nn, einsum
-if version.parse(torch.__version__) >= version.parse("2.0.0"):
-    SDP_IS_AVAILABLE = True
-    from torch.backends.cuda import SDPBackend, sdp_kernel
-    BACKEND_MAP = {
-        SDPBackend.MATH: {
-            "enable_math": True,
-            "enable_flash": False,
-            "enable_mem_efficient": False,
-        },
-        SDPBackend.FLASH_ATTENTION: {
-            "enable_math": False,
-            "enable_flash": True,
-            "enable_mem_efficient": False,
-        },
-        SDPBackend.EFFICIENT_ATTENTION: {
-            "enable_math": False,
-            "enable_flash": False,
-            "enable_mem_efficient": True,
-        },
-        None: {"enable_math": True, "enable_flash": True, "enable_mem_efficient": True},
-    }
-else:
-    from contextlib import nullcontext
-    SDP_IS_AVAILABLE = False
-    sdp_kernel = nullcontext
-    BACKEND_MAP = {}
-    print(
-        f"No SDP backend available, likely because you are running in pytorch versions < 2.0. In fact, "
-        f"you are using PyTorch {torch.__version__}. You might want to consider upgrading."
-    )
 try:
     import xformers
     import xformers.ops
     XFORMERS_IS_AVAILABLE = True
 except:
     XFORMERS_IS_AVAILABLE = False
-    print("no module 'xformers'. Processing without...")
-from .diffusionmodules.util import checkpoint
 def exists(val):
@@ -146,51 +108,6 @@ class LinearAttention(nn.Module):
         return self.to_out(out)
-class SpatialSelfAttention(nn.Module):
-    def __init__(self, in_channels):
-        super().__init__()
-        self.in_channels = in_channels
-        self.norm = Normalize(in_channels)
-        self.q = torch.nn.Conv2d(
-            in_channels, in_channels, kernel_size=1, stride=1, padding=0
-        )
-        self.k = torch.nn.Conv2d(
-            in_channels, in_channels, kernel_size=1, stride=1, padding=0
-        )
-        self.v = torch.nn.Conv2d(
-            in_channels, in_channels, kernel_size=1, stride=1, padding=0
-        )
-        self.proj_out = torch.nn.Conv2d(
-            in_channels, in_channels, kernel_size=1, stride=1, padding=0
-        )
-    def forward(self, x):
-        h_ = x
-        h_ = self.norm(h_)
-        q = self.q(h_)
-        k = self.k(h_)
-        v = self.v(h_)
-        # compute attention
-        b, c, h, w = q.shape
-        q = rearrange(q, "b c h w -> b (h w) c")
-        k = rearrange(k, "b c h w -> b c (h w)")
-        w_ = torch.einsum("bij,bjk->bik", q, k)
-        w_ = w_ * (int(c) ** (-0.5))
-        w_ = torch.nn.functional.softmax(w_, dim=2)
-        # attend to values
-        v = rearrange(v, "b c h w -> b c (h w)")
-        w_ = rearrange(w_, "b i j -> b j i")
-        h_ = torch.einsum("bij,bjk->bik", v, w_)
-        h_ = rearrange(h_, "b c (h w) -> b c h w", h=h)
-        h_ = self.proj_out(h_)
-        return x + h_
 class CrossAttention(nn.Module):
     def __init__(
         self,
@@ -198,8 +115,7 @@ class CrossAttention(nn.Module):
         context_dim=None,
         heads=8,
         dim_head=64,
-        dropout=0.0,
-        backend=None,
     ):
         super().__init__()
         inner_dim = dim_head * heads
@@ -212,60 +128,38 @@ class CrossAttention(nn.Module):
         self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
         self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
-        self.to_out = zero_module(nn.Sequential(
-            nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
-        ))
-        self.backend = backend
         self.attn_map_cache = None
     def forward(
         self,
         x,
-        context=None,
-        mask=None,
-        additional_tokens=None,
-        n_times_crossframe_attn_in_self=0,
     ):
         h = self.heads
-        if additional_tokens is not None:
-            # get the number of masked tokens at the beginning of the output sequence
-            n_tokens_to_mask = additional_tokens.shape[1]
-            # add additional token
-            x = torch.cat([additional_tokens, x], dim=1)
         q = self.to_q(x)
         context = default(context, x)
         k = self.to_k(context)
         v = self.to_v(context)
-        if n_times_crossframe_attn_in_self:
-            # reprogramming cross-frame attention as in https://arxiv.org/abs/2303.13439
-            assert x.shape[0] % n_times_crossframe_attn_in_self == 0
-            n_cp = x.shape[0] // n_times_crossframe_attn_in_self
-            k = repeat(
-                k[::n_times_crossframe_attn_in_self], "b ... -> (b n) ...", n=n_cp
-            )
-            v = repeat(
-                v[::n_times_crossframe_attn_in_self], "b ... -> (b n) ...", n=n_cp
-            )
         q, k, v = map(lambda t: rearrange(t, "b n (h d) -> (b h) n d", h=h), (q, k, v))
         ## old
         sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
         del q, k
-        if exists(mask):
-            mask = rearrange(mask, 'b ... -> b (...)')
-            max_neg_value = -torch.finfo(sim.dtype).max
-            mask = repeat(mask, 'b j -> (b h) () j', h=h)
-            sim.masked_fill_(~mask, max_neg_value)
         # attention, what we cannot get enough of
-        sim = sim.softmax(dim=-1)
         # save attn_map
         if self.attn_map_cache is not None:
@@ -276,20 +170,7 @@ class CrossAttention(nn.Module):
         out = einsum('b i j, b j d -> b i d', sim, v)
         out = rearrange(out, "(b h) n d -> b n (h d)", h=h)
-        ## new
-        # with sdp_kernel(**BACKEND_MAP[self.backend]):
-        #     # print("dispatching into backend", self.backend, "q/k/v shape: ", q.shape, k.shape, v.shape)
-        #     out = F.scaled_dot_product_attention(
-        #         q, k, v, attn_mask=mask
-        #     )  # scale is dim_head ** -0.5 per default
-        # del q, k, v
-        # out = rearrange(out, "b h n d -> b n (h d)", h=h)
-        if additional_tokens is not None:
-            # remove additional token
-            out = out[:, n_tokens_to_mask:]
         return self.to_out(out)
@@ -382,10 +263,6 @@ class MemoryEfficientCrossAttention(nn.Module):
 class BasicTransformerBlock(nn.Module):
-    ATTENTION_MODES = {
-        "softmax": CrossAttention,  # vanilla attention
-        "softmax-xformers": MemoryEfficientCrossAttention,  # ampere
-    }
     def __init__(
         self,
@@ -393,169 +270,78 @@ class BasicTransformerBlock(nn.Module):
         n_heads,
         d_head,
         dropout=0.0,
-        context_dim=None,
-        add_context_dim=None,
-        gated_ff=True,
-        checkpoint=True,
-        disable_self_attn=False,
-        attn_mode="softmax",
-        sdp_backend=None,
     ):
         super().__init__()
-        assert attn_mode in self.ATTENTION_MODES
-        if attn_mode != "softmax" and not XFORMERS_IS_AVAILABLE:
-            print(
-                f"Attention mode '{attn_mode}' is not available. Falling back to native attention. "
-                f"This is not a problem in Pytorch >= 2.0. FYI, you are running with PyTorch version {torch.__version__}"
-            )
-            attn_mode = "softmax"
-        elif attn_mode == "softmax" and not SDP_IS_AVAILABLE:
-            print(
-                "We do not support vanilla attention anymore, as it is too expensive. Sorry."
-            )
-            if not XFORMERS_IS_AVAILABLE:
-                assert (
-                    False
-                ), "Please install xformers via e.g. 'pip install xformers==0.0.16'"
-            else:
-                print("Falling back to xformers efficient attention.")
-                attn_mode = "softmax-xformers"
-        attn_cls = self.ATTENTION_MODES[attn_mode]
-        if version.parse(torch.__version__) >= version.parse("2.0.0"):
-            assert sdp_backend is None or isinstance(sdp_backend, SDPBackend)
-        else:
-            assert sdp_backend is None
-        self.disable_self_attn = disable_self_attn
-        self.attn1 = attn_cls(
             query_dim=dim,
             heads=n_heads,
             dim_head=d_head,
             dropout=dropout,
-            context_dim=context_dim if self.disable_self_attn else None,
-            backend=sdp_backend,
-        )  # is a self-attention if not self.disable_self_attn
-        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
-        if context_dim is not None and context_dim > 0:
-            self.attn2 = attn_cls(
                 query_dim=dim,
-                context_dim=context_dim,
                 heads=n_heads,
                 dim_head=d_head,
-                dropout=dropout,
-                backend=sdp_backend,
-            )  # is self-attn if context is none
-        if add_context_dim is not None and add_context_dim > 0:
-            self.add_attn = attn_cls(
                 query_dim=dim,
-                context_dim=add_context_dim,
                 heads=n_heads,
                 dim_head=d_head,
-                dropout=dropout,
-                backend=sdp_backend,
-            )  # is self-attn if context is none
-            self.add_norm = nn.LayerNorm(dim)
-        self.norm1 = nn.LayerNorm(dim)
-        self.norm2 = nn.LayerNorm(dim)
-        self.norm3 = nn.LayerNorm(dim)
-        self.checkpoint = checkpoint
-    def forward(
-        self, x, context=None, add_context=None, additional_tokens=None, n_times_crossframe_attn_in_self=0
-    ):
-        kwargs = {"x": x}
-        if context is not None:
-            kwargs.update({"context": context})
-        if additional_tokens is not None:
-            kwargs.update({"additional_tokens": additional_tokens})
-        if n_times_crossframe_attn_in_self:
-            kwargs.update(
-                {"n_times_crossframe_attn_in_self": n_times_crossframe_attn_in_self}
             )
-        return checkpoint(
-            self._forward, (x, context, add_context), self.parameters(), self.checkpoint
-        )
-    def _forward(
-        self, x, context=None, add_context=None, additional_tokens=None, n_times_crossframe_attn_in_self=0
-    ):
         x = (
             self.attn1(
                 self.norm1(x),
-                context=context if self.disable_self_attn else None,
-                additional_tokens=additional_tokens,
-                n_times_crossframe_attn_in_self=n_times_crossframe_attn_in_self
-                if not self.disable_self_attn
-                else 0,
             )
             + x
         )
-        if hasattr(self, "attn2"):
             x = (
-                self.attn2(
-                    self.norm2(x), context=context, additional_tokens=additional_tokens
                 )
                 + x
             )
-        if hasattr(self, "add_attn"):
             x = (
-                self.add_attn(
-                    self.add_norm(x), context=add_context, additional_tokens=additional_tokens
                 )
                 + x
             )
-        x = self.ff(self.norm3(x)) + x
-        return x
-class BasicTransformerSingleLayerBlock(nn.Module):
-    ATTENTION_MODES = {
-        "softmax": CrossAttention,  # vanilla attention
-        "softmax-xformers": MemoryEfficientCrossAttention  # on the A100s not quite as fast as the above version
-        # (todo might depend on head_dim, check, falls back to semi-optimized kernels for dim!=[16,32,64,128])
-    }
-    def __init__(
-        self,
-        dim,
-        n_heads,
-        d_head,
-        dropout=0.0,
-        context_dim=None,
-        gated_ff=True,
-        checkpoint=True,
-        attn_mode="softmax",
-    ):
-        super().__init__()
-        assert attn_mode in self.ATTENTION_MODES
-        attn_cls = self.ATTENTION_MODES[attn_mode]
-        self.attn1 = attn_cls(
-            query_dim=dim,
-            heads=n_heads,
-            dim_head=d_head,
-            dropout=dropout,
-            context_dim=context_dim,
-        )
-        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
-        self.norm1 = nn.LayerNorm(dim)
-        self.norm2 = nn.LayerNorm(dim)
-        self.checkpoint = checkpoint
-    def forward(self, x, context=None):
-        return checkpoint(
-            self._forward, (x, context), self.parameters(), self.checkpoint
-        )
-    def _forward(self, x, context=None):
-        x = self.attn1(self.norm1(x), context=context) + x
-        x = self.ff(self.norm2(x)) + x
         return x
-class  SpatialTransformer(nn.Module):
     """
     Transformer block for image-like data.
     First, project the input (aka embedding)
@@ -572,36 +358,12 @@ class  SpatialTransformer(nn.Module):
         d_head,
         depth=1,
         dropout=0.0,
-        context_dim=None,
-        add_context_dim=None,
-        disable_self_attn=False,
-        use_linear=False,
-        attn_type="softmax",
-        use_checkpoint=True,
-        # sdp_backend=SDPBackend.FLASH_ATTENTION
-        sdp_backend=None,
     ):
         super().__init__()
-        # print(
-        #     f"constructing {self.__class__.__name__} of depth {depth} w/ {in_channels} channels and {n_heads} heads"
-        # )
-        from omegaconf import ListConfig
-        if exists(context_dim) and not isinstance(context_dim, (list, ListConfig)):
-            context_dim = [context_dim]
-        if exists(context_dim) and isinstance(context_dim, list):
-            if depth != len(context_dim):
-                # print(
-                #     f"WARNING: {self.__class__.__name__}: Found context dims {context_dim} of depth {len(context_dim)}, "
-                #     f"which does not match the specified 'depth' of {depth}. Setting context_dim to {depth * [context_dim[0]]} now."
-                # )
-                # depth does not match context dims.
-                assert all(
-                    map(lambda x: x == context_dim[0], context_dim)
-                ), "need homogenous context_dim to match depth automatically"
-                context_dim = depth * [context_dim[0]]
-        elif context_dim is None:
-            context_dim = [None] * depth
         self.in_channels = in_channels
         inner_dim = n_heads * d_head
         self.norm = Normalize(in_channels)
@@ -619,12 +381,8 @@ class  SpatialTransformer(nn.Module):
                     n_heads,
                     d_head,
                     dropout=dropout,
-                    context_dim=context_dim[d],
-                    add_context_dim=add_context_dim,
-                    disable_self_attn=disable_self_attn,
-                    attn_mode=attn_type,
-                    checkpoint=use_checkpoint,
-                    sdp_backend=sdp_backend,
                 )
                 for d in range(depth)
             ]
@@ -634,14 +392,11 @@ class  SpatialTransformer(nn.Module):
                 nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
             )
         else:
-            # self.proj_out = zero_module(nn.Linear(in_channels, inner_dim))
             self.proj_out = zero_module(nn.Linear(inner_dim, in_channels))
         self.use_linear = use_linear
-    def forward(self, x, context=None, add_context=None):
-        # note: if no context is given, cross-attention defaults to self-attention
-        if not isinstance(context, list):
-            context = [context]
         b, c, h, w = x.shape
         x_in = x
         x = self.norm(x)
@@ -651,326 +406,11 @@ class  SpatialTransformer(nn.Module):
         if self.use_linear:
             x = self.proj_in(x)
         for i, block in enumerate(self.transformer_blocks):
-            if i > 0 and len(context) == 1:
-                i = 0  # use same context for each block
-            x = block(x, context=context[i], add_context=add_context)
         if self.use_linear:
             x = self.proj_out(x)
         x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w).contiguous()
         if not self.use_linear:
             x = self.proj_out(x)
-        return x + x_in
-def benchmark_attn():
-    # Lets define a helpful benchmarking function:
-    # https://pytorch.org/tutorials/intermediate/scaled_dot_product_attention_tutorial.html
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    import torch.nn.functional as F
-    import torch.utils.benchmark as benchmark
-    def benchmark_torch_function_in_microseconds(f, *args, **kwargs):
-        t0 = benchmark.Timer(
-            stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f}
-        )
-        return t0.blocked_autorange().mean * 1e6
-    # Lets define the hyper-parameters of our input
-    batch_size = 32
-    max_sequence_len = 1024
-    num_heads = 32
-    embed_dimension = 32
-    dtype = torch.float16
-    query = torch.rand(
-        batch_size,
-        num_heads,
-        max_sequence_len,
-        embed_dimension,
-        device=device,
-        dtype=dtype,
-    )
-    key = torch.rand(
-        batch_size,
-        num_heads,
-        max_sequence_len,
-        embed_dimension,
-        device=device,
-        dtype=dtype,
-    )
-    value = torch.rand(
-        batch_size,
-        num_heads,
-        max_sequence_len,
-        embed_dimension,
-        device=device,
-        dtype=dtype,
-    )
-    print(f"q/k/v shape:", query.shape, key.shape, value.shape)
-    # Lets explore the speed of each of the 3 implementations
-    from torch.backends.cuda import SDPBackend, sdp_kernel
-    # Helpful arguments mapper
-    backend_map = {
-        SDPBackend.MATH: {
-            "enable_math": True,
-            "enable_flash": False,
-            "enable_mem_efficient": False,
-        },
-        SDPBackend.FLASH_ATTENTION: {
-            "enable_math": False,
-            "enable_flash": True,
-            "enable_mem_efficient": False,
-        },
-        SDPBackend.EFFICIENT_ATTENTION: {
-            "enable_math": False,
-            "enable_flash": False,
-            "enable_mem_efficient": True,
-        },
-    }
-    from torch.profiler import ProfilerActivity, profile, record_function
-    activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
-    print(
-        f"The default implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds"
-    )
-    with profile(
-        activities=activities, record_shapes=False, profile_memory=True
-    ) as prof:
-        with record_function("Default detailed stats"):
-            for _ in range(25):
-                o = F.scaled_dot_product_attention(query, key, value)
-    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
-    print(
-        f"The math implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds"
-    )
-    with sdp_kernel(**backend_map[SDPBackend.MATH]):
-        with profile(
-            activities=activities, record_shapes=False, profile_memory=True
-        ) as prof:
-            with record_function("Math implmentation stats"):
-                for _ in range(25):
-                    o = F.scaled_dot_product_attention(query, key, value)
-        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
-    with sdp_kernel(**backend_map[SDPBackend.FLASH_ATTENTION]):
-        try:
-            print(
-                f"The flash attention implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds"
-            )
-        except RuntimeError:
-            print("FlashAttention is not supported. See warnings for reasons.")
-        with profile(
-            activities=activities, record_shapes=False, profile_memory=True
-        ) as prof:
-            with record_function("FlashAttention stats"):
-                for _ in range(25):
-                    o = F.scaled_dot_product_attention(query, key, value)
-        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
-    with sdp_kernel(**backend_map[SDPBackend.EFFICIENT_ATTENTION]):
-        try:
-            print(
-                f"The memory efficient implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds"
-            )
-        except RuntimeError:
-            print("EfficientAttention is not supported. See warnings for reasons.")
-        with profile(
-            activities=activities, record_shapes=False, profile_memory=True
-        ) as prof:
-            with record_function("EfficientAttention stats"):
-                for _ in range(25):
-                    o = F.scaled_dot_product_attention(query, key, value)
-        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
-def run_model(model, x, context):
-    return model(x, context)
-def benchmark_transformer_blocks():
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    import torch.utils.benchmark as benchmark
-    def benchmark_torch_function_in_microseconds(f, *args, **kwargs):
-        t0 = benchmark.Timer(
-            stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f}
-        )
-        return t0.blocked_autorange().mean * 1e6
-    checkpoint = True
-    compile = False
-    batch_size = 32
-    h, w = 64, 64
-    context_len = 77
-    embed_dimension = 1024
-    context_dim = 1024
-    d_head = 64
-    transformer_depth = 4
-    n_heads = embed_dimension // d_head
-    dtype = torch.float16
-    model_native = SpatialTransformer(
-        embed_dimension,
-        n_heads,
-        d_head,
-        context_dim=context_dim,
-        use_linear=True,
-        use_checkpoint=checkpoint,
-        attn_type="softmax",
-        depth=transformer_depth,
-        sdp_backend=SDPBackend.FLASH_ATTENTION,
-    ).to(device)
-    model_efficient_attn = SpatialTransformer(
-        embed_dimension,
-        n_heads,
-        d_head,
-        context_dim=context_dim,
-        use_linear=True,
-        depth=transformer_depth,
-        use_checkpoint=checkpoint,
-        attn_type="softmax-xformers",
-    ).to(device)
-    if not checkpoint and compile:
-        print("compiling models")
-        model_native = torch.compile(model_native)
-        model_efficient_attn = torch.compile(model_efficient_attn)
-    x = torch.rand(batch_size, embed_dimension, h, w, device=device, dtype=dtype)
-    c = torch.rand(batch_size, context_len, context_dim, device=device, dtype=dtype)
-    from torch.profiler import ProfilerActivity, profile, record_function
-    activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
-    with torch.autocast("cuda"):
-        print(
-            f"The native model runs in {benchmark_torch_function_in_microseconds(model_native.forward, x, c):.3f} microseconds"
-        )
-        print(
-            f"The efficientattn model runs in {benchmark_torch_function_in_microseconds(model_efficient_attn.forward, x, c):.3f} microseconds"
-        )
-        print(75 * "+")
-        print("NATIVE")
-        print(75 * "+")
-        torch.cuda.reset_peak_memory_stats()
-        with profile(
-            activities=activities, record_shapes=False, profile_memory=True
-        ) as prof:
-            with record_function("NativeAttention stats"):
-                for _ in range(25):
-                    model_native(x, c)
-        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
-        print(torch.cuda.max_memory_allocated() * 1e-9, "GB used by native block")
-        print(75 * "+")
-        print("Xformers")
-        print(75 * "+")
-        torch.cuda.reset_peak_memory_stats()
-        with profile(
-            activities=activities, record_shapes=False, profile_memory=True
-        ) as prof:
-            with record_function("xformers stats"):
-                for _ in range(25):
-                    model_efficient_attn(x, c)
-        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
-        print(torch.cuda.max_memory_allocated() * 1e-9, "GB used by xformers block")
-def test01():
-    # conv1x1 vs linear
-    from ..util import count_params
-    conv = nn.Conv2d(3, 32, kernel_size=1).cuda()
-    print(count_params(conv))
-    linear = torch.nn.Linear(3, 32).cuda()
-    print(count_params(linear))
-    print(conv.weight.shape)
-    # use same initialization
-    linear.weight = torch.nn.Parameter(conv.weight.squeeze(-1).squeeze(-1))
-    linear.bias = torch.nn.Parameter(conv.bias)
-    print(linear.weight.shape)
-    x = torch.randn(11, 3, 64, 64).cuda()
-    xr = rearrange(x, "b c h w -> b (h w) c").contiguous()
-    print(xr.shape)
-    out_linear = linear(xr)
-    print(out_linear.mean(), out_linear.shape)
-    out_conv = conv(x)
-    print(out_conv.mean(), out_conv.shape)
-    print("done with test01.\n")
-def test02():
-    # try cosine flash attention
-    import time
-    torch.backends.cuda.matmul.allow_tf32 = True
-    torch.backends.cudnn.allow_tf32 = True
-    torch.backends.cudnn.benchmark = True
-    print("testing cosine flash attention...")
-    DIM = 1024
-    SEQLEN = 4096
-    BS = 16
-    print(" softmax (vanilla) first...")
-    model = BasicTransformerBlock(
-        dim=DIM,
-        n_heads=16,
-        d_head=64,
-        dropout=0.0,
-        context_dim=None,
-        attn_mode="softmax",
-    ).cuda()
-    try:
-        x = torch.randn(BS, SEQLEN, DIM).cuda()
-        tic = time.time()
-        y = model(x)
-        toc = time.time()
-        print(y.shape, toc - tic)
-    except RuntimeError as e:
-        # likely oom
-        print(str(e))
-    print("\n now flash-cosine...")
-    model = BasicTransformerBlock(
-        dim=DIM,
-        n_heads=16,
-        d_head=64,
-        dropout=0.0,
-        context_dim=None,
-        attn_mode="flash-cosine",
-    ).cuda()
-    x = torch.randn(BS, SEQLEN, DIM).cuda()
-    tic = time.time()
-    y = model(x)
-    toc = time.time()
-    print(y.shape, toc - tic)
-    print("done with test02.\n")
-if __name__ == "__main__":
-    # test01()
-    # test02()
-    # test03()
-    # benchmark_attn()
-    benchmark_transformer_blocks()
-    print("done.")

 import torch
 import torch.nn.functional as F
 from einops import rearrange, repeat
 from torch import nn, einsum
 try:
     import xformers
     import xformers.ops
     XFORMERS_IS_AVAILABLE = True
 except:
     XFORMERS_IS_AVAILABLE = False
+    print("No module 'xformers'.")
 def exists(val):
         return self.to_out(out)
 class CrossAttention(nn.Module):
     def __init__(
         self,
         context_dim=None,
         heads=8,
         dim_head=64,
+        dropout=0.0
     ):
         super().__init__()
         inner_dim = dim_head * heads
         self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
         self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
+        self.to_out = zero_module(
+            nn.Sequential(
+                nn.Linear(inner_dim, query_dim),
+                nn.Dropout(dropout)
+            )
+        )
         self.attn_map_cache = None
     def forward(
         self,
         x,
+        context=None
     ):
         h = self.heads
         q = self.to_q(x)
         context = default(context, x)
         k = self.to_k(context)
         v = self.to_v(context)
         q, k, v = map(lambda t: rearrange(t, "b n (h d) -> (b h) n d", h=h), (q, k, v))
         ## old
         sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
         del q, k
         # attention, what we cannot get enough of
+        if sim.shape[-1] > 1:
+            sim = sim.softmax(dim=-1) # softmax on token dim
+        else:
+            sim = sim.sigmoid() # sigmoid on pixel dim
         # save attn_map
         if self.attn_map_cache is not None:
         out = einsum('b i j, b j d -> b i d', sim, v)
         out = rearrange(out, "(b h) n d -> b n (h d)", h=h)
         return self.to_out(out)
 class BasicTransformerBlock(nn.Module):
     def __init__(
         self,
         n_heads,
         d_head,
         dropout=0.0,
+        t_context_dim=None,
+        v_context_dim=None,
+        gated_ff=True
     ):
         super().__init__()
+        # self-attention
+        self.attn1 = MemoryEfficientCrossAttention(
             query_dim=dim,
             heads=n_heads,
             dim_head=d_head,
             dropout=dropout,
+            context_dim=None
+        )
+        # textual cross-attention
+        if t_context_dim is not None and t_context_dim > 0:
+            self.t_attn = CrossAttention(
                 query_dim=dim,
+                context_dim=t_context_dim,
                 heads=n_heads,
                 dim_head=d_head,
+                dropout=dropout
+            )
+            self.t_norm = nn.LayerNorm(dim)
+        # visual cross-attention
+        if v_context_dim is not None and v_context_dim > 0:
+            self.v_attn = CrossAttention(
                 query_dim=dim,
+                context_dim=v_context_dim,
                 heads=n_heads,
                 dim_head=d_head,
+                dropout=dropout
             )
+            self.v_norm = nn.LayerNorm(dim)
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm3 = nn.LayerNorm(dim)
+        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
+    def forward(self, x, t_context=None, v_context=None):
         x = (
             self.attn1(
                 self.norm1(x),
+                context=None
             )
             + x
         )
+        if hasattr(self, "t_attn"):
             x = (
+                self.t_attn(
+                    self.t_norm(x),
+                    context=t_context
                 )
                 + x
             )
+        if hasattr(self, "v_attn"):
             x = (
+                self.v_attn(
+                    self.v_norm(x),
+                    context=v_context
                 )
                 + x
             )
+        x = self.ff(self.norm3(x)) + x
         return x
+class SpatialTransformer(nn.Module):
     """
     Transformer block for image-like data.
     First, project the input (aka embedding)
         d_head,
         depth=1,
         dropout=0.0,
+        t_context_dim=None,
+        v_context_dim=None,
+        use_linear=False
     ):
         super().__init__()
         self.in_channels = in_channels
         inner_dim = n_heads * d_head
         self.norm = Normalize(in_channels)
                     n_heads,
                     d_head,
                     dropout=dropout,
+                    t_context_dim=t_context_dim,
+                    v_context_dim=v_context_dim
                 )
                 for d in range(depth)
             ]
                 nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
             )
         else:
             self.proj_out = zero_module(nn.Linear(inner_dim, in_channels))
         self.use_linear = use_linear
+    def forward(self, x, t_context=None, v_context=None):
         b, c, h, w = x.shape
         x_in = x
         x = self.norm(x)
         if self.use_linear:
             x = self.proj_in(x)
         for i, block in enumerate(self.transformer_blocks):
+            x = block(x, t_context=t_context, v_context=v_context)
         if self.use_linear:
             x = self.proj_out(x)
         x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w).contiguous()
         if not self.use_linear:
             x = self.proj_out(x)
+        return x + x_in

sgm/modules/diffusionmodules/__init__.py CHANGED Viewed

@@ -2,6 +2,6 @@ from .denoiser import Denoiser
 from .discretizer import Discretization
 from .loss import StandardDiffusionLoss
 from .model import Model, Encoder, Decoder
-from .openaimodel import UNetModel
 from .sampling import BaseDiffusionSampler
 from .wrappers import OpenAIWrapper

 from .discretizer import Discretization
 from .loss import StandardDiffusionLoss
 from .model import Model, Encoder, Decoder
+from .openaimodel import UnifiedUNetModel
 from .sampling import BaseDiffusionSampler
 from .wrappers import OpenAIWrapper

sgm/modules/diffusionmodules/guiders.py CHANGED Viewed

@@ -32,7 +32,7 @@ class VanillaCFG:
         c_out = dict()
         for k in c:
-            if k in ["vector", "crossattn", "add_crossattn", "concat"]:
                 c_out[k] = torch.cat((uc[k], c[k]), 0)
             else:
                 assert c[k] == uc[k]
@@ -40,34 +40,6 @@ class VanillaCFG:
         return torch.cat([x] * 2), torch.cat([s] * 2), c_out
-class DualCFG:
-    def __init__(self, scale):
-        self.scale = scale
-        self.dyn_thresh = instantiate_from_config(
-            {
-                "target": "sgm.modules.diffusionmodules.sampling_utils.DualThresholding"
-            },
-        )
-    def __call__(self, x, sigma):
-        x_u_1, x_u_2, x_c = x.chunk(3)
-        x_pred = self.dyn_thresh(x_u_1, x_u_2, x_c, self.scale)
-        return x_pred
-    def prepare_inputs(self, x, s, c, uc_1, uc_2):
-        c_out = dict()
-        for k in c:
-            if k in ["vector", "crossattn", "concat", "add_crossattn"]:
-                c_out[k] = torch.cat((uc_1[k], uc_2[k], c[k]), 0)
-            else:
-                assert c[k] == uc_1[k]
-                c_out[k] = c[k]
-        return torch.cat([x] * 3), torch.cat([s] * 3), c_out
 class IdentityGuider:
     def __call__(self, x, sigma):
         return x

         c_out = dict()
         for k in c:
+            if k in ["vector", "t_crossattn", "v_crossattn", "concat"]:
                 c_out[k] = torch.cat((uc[k], c[k]), 0)
             else:
                 assert c[k] == uc[k]
         return torch.cat([x] * 2), torch.cat([s] * 2), c_out
 class IdentityGuider:
     def __call__(self, x, sigma):
         return x

sgm/modules/diffusionmodules/loss.py CHANGED Viewed

@@ -4,7 +4,6 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from omegaconf import ListConfig
-# from taming.modules.losses.lpips import LPIPS
 from torchvision.utils import save_image
 from ...util import append_dims, instantiate_from_config
@@ -19,16 +18,13 @@ class StandardDiffusionLoss(nn.Module):
     ):
         super().__init__()
-        assert type in ["l2", "l1", "lpips"]
         self.sigma_sampler = instantiate_from_config(sigma_sampler_config)
         self.type = type
         self.offset_noise_level = offset_noise_level
-        # if type == "lpips":
-        #     self.lpips = LPIPS().eval()
         if not batch2model_keys:
             batch2model_keys = []
@@ -70,9 +66,6 @@ class StandardDiffusionLoss(nn.Module):
             return torch.mean(
                 (w * (model_output - target).abs()).reshape(target.shape[0], -1), 1
             )
-        elif self.type == "lpips":
-            loss = self.lpips(model_output, target).reshape(-1)
-            return loss
 class FullLoss(StandardDiffusionLoss):
@@ -85,7 +78,9 @@ class FullLoss(StandardDiffusionLoss):
         min_attn_size=16,
         lambda_local_loss=0.0,
         lambda_ocr_loss=0.0,
         ocr_enabled = False,
         predictor_config = None,
         *args, **kwarg
     ):
@@ -98,7 +93,9 @@ class FullLoss(StandardDiffusionLoss):
         self.min_attn_size = min_attn_size
         self.lambda_local_loss = lambda_local_loss
         self.lambda_ocr_loss = lambda_ocr_loss
         self.ocr_enabled = ocr_enabled
         if ocr_enabled:
             self.predictor = instantiate_from_config(predictor_config)
@@ -155,9 +152,15 @@ class FullLoss(StandardDiffusionLoss):
             ocr_loss = self.get_ocr_loss(model_output, batch["r_bbox"], batch["label"], first_stage_model, scaler)
             ocr_loss = ocr_loss.mean()
         loss = diff_loss + self.lambda_local_loss * local_loss
         if self.ocr_enabled:
             loss += self.lambda_ocr_loss * ocr_loss
         loss_dict = {
             "loss/diff_loss": diff_loss,
@@ -167,6 +170,8 @@ class FullLoss(StandardDiffusionLoss):
         if self.ocr_enabled:
             loss_dict["loss/ocr_loss"] = ocr_loss
         return loss, loss_dict
@@ -191,6 +196,9 @@ class FullLoss(StandardDiffusionLoss):
         for item in attn_map_cache:
             heads = item["heads"]
             size = item["size"]
             attn_map = item["attn_map"]
@@ -233,6 +241,9 @@ class FullLoss(StandardDiffusionLoss):
         for item in attn_map_cache:
             heads = item["heads"]
             size = item["size"]
             attn_map = item["attn_map"]
@@ -241,7 +252,7 @@ class FullLoss(StandardDiffusionLoss):
             seg_l = seg_mask.shape[1]
-            bh, n, l = attn_map.shape # bh: batch size * heads / n : pixel length(h*w) / l: token length
             attn_map = attn_map.reshape((-1, heads, n, l)) # b, h, n, l
             assert seg_l <= l
@@ -272,4 +283,43 @@ class FullLoss(StandardDiffusionLoss):
         loss = loss / count
         return loss

 import torch.nn as nn
 import torch.nn.functional as F
 from omegaconf import ListConfig
 from torchvision.utils import save_image
 from ...util import append_dims, instantiate_from_config
     ):
         super().__init__()
+        assert type in ["l2", "l1"]
         self.sigma_sampler = instantiate_from_config(sigma_sampler_config)
         self.type = type
         self.offset_noise_level = offset_noise_level
         if not batch2model_keys:
             batch2model_keys = []
             return torch.mean(
                 (w * (model_output - target).abs()).reshape(target.shape[0], -1), 1
             )
 class FullLoss(StandardDiffusionLoss):
         min_attn_size=16,
         lambda_local_loss=0.0,
         lambda_ocr_loss=0.0,
+        lambda_style_loss=0.0,
         ocr_enabled = False,
+        style_enabled = False,
         predictor_config = None,
         *args, **kwarg
     ):
         self.min_attn_size = min_attn_size
         self.lambda_local_loss = lambda_local_loss
         self.lambda_ocr_loss = lambda_ocr_loss
+        self.lambda_style_loss = lambda_style_loss
+        self.style_enabled = style_enabled
         self.ocr_enabled = ocr_enabled
         if ocr_enabled:
             self.predictor = instantiate_from_config(predictor_config)
             ocr_loss = self.get_ocr_loss(model_output, batch["r_bbox"], batch["label"], first_stage_model, scaler)
             ocr_loss = ocr_loss.mean()
+        if self.style_enabled:
+            style_loss = self.get_style_local_loss(network.diffusion_model.attn_map_cache, batch["mask"])
+            style_loss = style_loss.mean()
         loss = diff_loss + self.lambda_local_loss * local_loss
         if self.ocr_enabled:
             loss += self.lambda_ocr_loss * ocr_loss
+        if self.style_enabled:
+            loss += self.lambda_style_loss * style_loss
         loss_dict = {
             "loss/diff_loss": diff_loss,
         if self.ocr_enabled:
             loss_dict["loss/ocr_loss"] = ocr_loss
+        if self.style_enabled:
+            loss_dict["loss/style_loss"] = style_loss
         return loss, loss_dict
         for item in attn_map_cache:
+            name = item["name"]
+            if not name.endswith("t_attn"): continue
             heads = item["heads"]
             size = item["size"]
             attn_map = item["attn_map"]
         for item in attn_map_cache:
+            name = item["name"]
+            if not name.endswith("t_attn"): continue
             heads = item["heads"]
             size = item["size"]
             attn_map = item["attn_map"]
             seg_l = seg_mask.shape[1]
+            bh, n, l = attn_map.shape # bh: batch size * heads / n: pixel length(h*w) / l: token length
             attn_map = attn_map.reshape((-1, heads, n, l)) # b, h, n, l
             assert seg_l <= l
         loss = loss / count
+        return loss
+    def get_style_local_loss(self, attn_map_cache, mask):
+        loss = 0
+        count = 0
+        for item in attn_map_cache:
+            name = item["name"]
+            if not name.endswith("v_attn"): continue
+            heads = item["heads"]
+            size = item["size"]
+            attn_map = item["attn_map"]
+            if size < self.min_attn_size: continue
+            bh, n, l = attn_map.shape # bh: batch size * heads / n: pixel length(h*w) / l: token length
+            attn_map = attn_map.reshape((-1, heads, n, l)) # b, h, n, l
+            attn_map = attn_map.permute(0, 1, 3, 2) # b, h, l, n
+            attn_map = attn_map.mean(dim = 1) # b, l, n
+            mask_map = F.interpolate(mask, (size, size))
+            mask_map = mask_map.reshape((-1, l, n)) # b, l, n
+            n_mask_map = 1 - mask_map
+            p_loss = (mask_map * attn_map).sum(dim = -1) / (mask_map.sum(dim = -1) + 1e-5) # b, l
+            n_loss = (n_mask_map * attn_map).sum(dim = -1) / (n_mask_map.sum(dim = -1) + 1e-5) # b, l
+            p_loss = p_loss.mean(dim = -1)
+            n_loss = n_loss.mean(dim = -1)
+            f_loss = n_loss - p_loss # b,
+            loss += f_loss
+            count += 1
+        loss = loss / count
         return loss

sgm/modules/diffusionmodules/openaimodel.py CHANGED Viewed

@@ -1,6 +1,4 @@
-import math
 from abc import abstractmethod
-from functools import partial
 from typing import Iterable
 import numpy as np
@@ -12,7 +10,6 @@ from einops import rearrange
 from ...modules.attention import SpatialTransformer
 from ...modules.diffusionmodules.util import (
     avg_pool_nd,
-    checkpoint,
     conv_nd,
     linear,
     normalization,
@@ -22,47 +19,14 @@ from ...modules.diffusionmodules.util import (
 from ...util import default, exists
-# dummy replace
-def convert_module_to_f16(x):
-    pass
-def convert_module_to_f32(x):
-    pass
-## go
-class AttentionPool2d(nn.Module):
-    """
-    Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py
-    """
-    def __init__(
-        self,
-        spacial_dim: int,
-        embed_dim: int,
-        num_heads_channels: int,
-        output_dim: int = None,
-    ):
         super().__init__()
-        self.positional_embedding = nn.Parameter(
-            th.randn(embed_dim, spacial_dim**2 + 1) / embed_dim**0.5
-        )
-        self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
-        self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
-        self.num_heads = embed_dim // num_heads_channels
-        self.attention = QKVAttention(self.num_heads)
-    def forward(self, x):
-        b, c, *_spatial = x.shape
-        x = x.reshape(b, c, -1)  # NC(HW)
-        x = th.cat([x.mean(dim=-1, keepdim=True), x], dim=-1)  # NC(HW+1)
-        x = x + self.positional_embedding[None, :, :].to(x.dtype)  # NC(HW+1)
-        x = self.qkv_proj(x)
-        x = self.attention(x)
-        x = self.c_proj(x)
-        return x[:, :, 0]
 class TimestepBlock(nn.Module):
     """
@@ -86,19 +50,14 @@ class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
         self,
         x,
         emb,
-        context=None,
-        add_context=None,
-        skip_time_mix=False,
-        time_context=None,
-        num_video_frames=None,
-        time_context_cat=None,
-        use_crossframe_attention_in_spatial_layers=False,
     ):
         for layer in self:
             if isinstance(layer, TimestepBlock):
                 x = layer(x, emb)
             elif isinstance(layer, SpatialTransformer):
-                x = layer(x, context, add_context)
             else:
                 x = layer(x)
         return x
@@ -143,22 +102,6 @@ class Upsample(nn.Module):
         return x
-class TransposedUpsample(nn.Module):
-    "Learned 2x upsampling without padding"
-    def __init__(self, channels, out_channels=None, ks=5):
-        super().__init__()
-        self.channels = channels
-        self.out_channels = out_channels or channels
-        self.up = nn.ConvTranspose2d(
-            self.channels, self.out_channels, kernel_size=ks, stride=2
-        )
-    def forward(self, x):
-        return self.up(x)
 class Downsample(nn.Module):
     """
     A downsampling layer with an optional convolution.
@@ -206,17 +149,6 @@ class Downsample(nn.Module):
 class ResBlock(TimestepBlock):
     """
     A residual block that can optionally change the number of channels.
-    :param channels: the number of input channels.
-    :param emb_channels: the number of timestep embedding channels.
-    :param dropout: the rate of dropout.
-    :param out_channels: if specified, the number of out channels.
-    :param use_conv: if True and out_channels is specified, use a spatial
-        convolution instead of a smaller 1x1 convolution to change the
-        channels in the skip connection.
-    :param dims: determines if the signal is 1D, 2D, or 3D.
-    :param use_checkpoint: if True, use gradient checkpointing on this module.
-    :param up: if True, use this block for upsampling.
-    :param down: if True, use this block for downsampling.
     """
     def __init__(
@@ -228,12 +160,11 @@ class ResBlock(TimestepBlock):
         use_conv=False,
         use_scale_shift_norm=False,
         dims=2,
-        use_checkpoint=False,
         up=False,
         down=False,
         kernel_size=3,
         exchange_temb_dims=False,
-        skip_t_emb=False,
     ):
         super().__init__()
         self.channels = channels
@@ -241,7 +172,6 @@ class ResBlock(TimestepBlock):
         self.dropout = dropout
         self.out_channels = out_channels or channels
         self.use_conv = use_conv
-        self.use_checkpoint = use_checkpoint
         self.use_scale_shift_norm = use_scale_shift_norm
         self.exchange_temb_dims = exchange_temb_dims
@@ -310,17 +240,6 @@ class ResBlock(TimestepBlock):
             self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
     def forward(self, x, emb):
-        """
-        Apply the block to a Tensor, conditioned on a timestep embedding.
-        :param x: an [N x C x ...] Tensor of features.
-        :param emb: an [N x emb_channels] Tensor of timestep embeddings.
-        :return: an [N x C x ...] Tensor of outputs.
-        """
-        return checkpoint(
-            self._forward, (x, emb), self.parameters(), self.use_checkpoint
-        )
-    def _forward(self, x, emb):
         if self.updown:
             in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
             h = in_rest(x)
@@ -348,233 +267,42 @@ class ResBlock(TimestepBlock):
             h = self.out_layers(h)
         return self.skip_connection(x) + h
-class AttentionBlock(nn.Module):
-    """
-    An attention block that allows spatial positions to attend to each other.
-    Originally ported from here, but adapted to the N-d case.
-    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
-    """
-    def __init__(
-        self,
-        channels,
-        num_heads=1,
-        num_head_channels=-1,
-        use_checkpoint=False,
-        use_new_attention_order=False,
-    ):
-        super().__init__()
-        self.channels = channels
-        if num_head_channels == -1:
-            self.num_heads = num_heads
-        else:
-            assert (
-                channels % num_head_channels == 0
-            ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
-            self.num_heads = channels // num_head_channels
-        self.use_checkpoint = use_checkpoint
-        self.norm = normalization(channels)
-        self.qkv = conv_nd(1, channels, channels * 3, 1)
-        if use_new_attention_order:
-            # split qkv before split heads
-            self.attention = QKVAttention(self.num_heads)
-        else:
-            # split heads before split qkv
-            self.attention = QKVAttentionLegacy(self.num_heads)
-        self.proj_out = zero_module(conv_nd(1, channels, channels, 1))
-    def forward(self, x, **kwargs):
-        # TODO add crossframe attention and use mixed checkpoint
-        return checkpoint(
-            self._forward, (x,), self.parameters(), True
-        )  # TODO: check checkpoint usage, is True # TODO: fix the .half call!!!
-        # return pt_checkpoint(self._forward, x)  # pytorch
-    def _forward(self, x):
-        b, c, *spatial = x.shape
-        x = x.reshape(b, c, -1)
-        qkv = self.qkv(self.norm(x))
-        h = self.attention(qkv)
-        h = self.proj_out(h)
-        return (x + h).reshape(b, c, *spatial)
-def count_flops_attn(model, _x, y):
-    """
-    A counter for the `thop` package to count the operations in an
-    attention operation.
-    Meant to be used like:
-        macs, params = thop.profile(
-            model,
-            inputs=(inputs, timestamps),
-            custom_ops={QKVAttention: QKVAttention.count_flops},
-        )
-    """
-    b, c, *spatial = y[0].shape
-    num_spatial = int(np.prod(spatial))
-    # We perform two matmuls with the same number of ops.
-    # The first computes the weight matrix, the second computes
-    # the combination of the value vectors.
-    matmul_ops = 2 * b * (num_spatial**2) * c
-    model.total_ops += th.DoubleTensor([matmul_ops])
-class QKVAttentionLegacy(nn.Module):
-    """
-    A module which performs QKV attention. Matches legacy QKVAttention + input/ouput heads shaping
-    """
-    def __init__(self, n_heads):
-        super().__init__()
-        self.n_heads = n_heads
-    def forward(self, qkv):
-        """
-        Apply QKV attention.
-        :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
-        :return: an [N x (H * C) x T] tensor after attention.
-        """
-        bs, width, length = qkv.shape
-        assert width % (3 * self.n_heads) == 0
-        ch = width // (3 * self.n_heads)
-        q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
-        scale = 1 / math.sqrt(math.sqrt(ch))
-        weight = th.einsum(
-            "bct,bcs->bts", q * scale, k * scale
-        )  # More stable with f16 than dividing afterwards
-        weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
-        a = th.einsum("bts,bcs->bct", weight, v)
-        return a.reshape(bs, -1, length)
-    @staticmethod
-    def count_flops(model, _x, y):
-        return count_flops_attn(model, _x, y)
-class QKVAttention(nn.Module):
-    """
-    A module which performs QKV attention and splits in a different order.
-    """
-    def __init__(self, n_heads):
-        super().__init__()
-        self.n_heads = n_heads
-    def forward(self, qkv):
-        """
-        Apply QKV attention.
-        :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
-        :return: an [N x (H * C) x T] tensor after attention.
-        """
-        bs, width, length = qkv.shape
-        assert width % (3 * self.n_heads) == 0
-        ch = width // (3 * self.n_heads)
-        q, k, v = qkv.chunk(3, dim=1)
-        scale = 1 / math.sqrt(math.sqrt(ch))
-        weight = th.einsum(
-            "bct,bcs->bts",
-            (q * scale).view(bs * self.n_heads, ch, length),
-            (k * scale).view(bs * self.n_heads, ch, length),
-        )  # More stable with f16 than dividing afterwards
-        weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
-        a = th.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length))
-        return a.reshape(bs, -1, length)
-    @staticmethod
-    def count_flops(model, _x, y):
-        return count_flops_attn(model, _x, y)
-class Timestep(nn.Module):
-    def __init__(self, dim):
-        super().__init__()
-        self.dim = dim
-    def forward(self, t):
-        return timestep_embedding(t, self.dim)
-class UNetModel(nn.Module):
-    """
-    The full UNet model with attention and timestep embedding.
-    :param in_channels: channels in the input Tensor.
-    :param model_channels: base channel count for the model.
-    :param out_channels: channels in the output Tensor.
-    :param num_res_blocks: number of residual blocks per downsample.
-    :param attention_resolutions: a collection of downsample rates at which
-        attention will take place. May be a set, list, or tuple.
-        For example, if this contains 4, then at 4x downsampling, attention
-        will be used.
-    :param dropout: the dropout probability.
-    :param channel_mult: channel multiplier for each level of the UNet.
-    :param conv_resample: if True, use learned convolutions for upsampling and
-        downsampling.
-    :param dims: determines if the signal is 1D, 2D, or 3D.
-    :param num_classes: if specified (as an int), then this model will be
-        class-conditional with `num_classes` classes.
-    :param use_checkpoint: use gradient checkpointing to reduce memory usage.
-    :param num_heads: the number of attention heads in each attention layer.
-    :param num_heads_channels: if specified, ignore num_heads and instead use
-                               a fixed channel width per attention head.
-    :param num_heads_upsample: works with num_heads to set a different number
-                               of heads for upsampling. Deprecated.
-    :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
-    :param resblock_updown: use residual blocks for up/downsampling.
-    :param use_new_attention_order: use a different attention pattern for potentially
-                                    increased efficiency.
-    """
     def __init__(
         self,
         in_channels,
         model_channels,
         out_channels,
         num_res_blocks,
         attention_resolutions,
         dropout=0,
         channel_mult=(1, 2, 4, 8),
         conv_resample=True,
         dims=2,
-        num_classes=None,
-        use_checkpoint=False,
-        use_fp16=False,
         num_heads=-1,
         num_head_channels=-1,
         num_heads_upsample=-1,
         use_scale_shift_norm=False,
         resblock_updown=False,
-        use_new_attention_order=False,
-        use_spatial_transformer=False,  # custom transformer support
-        transformer_depth=1,  # custom transformer support
-        context_dim=None,  # custom transformer support
-        n_embed=None,  # custom support for prediction of discrete ids into codebook of first stage vq model
-        legacy=True,
-        disable_self_attentions=None,
         num_attention_blocks=None,
-        disable_middle_self_attn=False,
         use_linear_in_transformer=False,
-        spatial_transformer_attn_type="softmax",
         adm_in_channels=None,
-        use_fairscale_checkpoint=False,
-        offload_to_cpu=False,
-        transformer_depth_middle=None,
     ):
         super().__init__()
-        from omegaconf.listconfig import ListConfig
-        if use_spatial_transformer:
-            assert (
-                context_dim is not None
-            ), "Fool!! You forgot to include the dimension of your cross-attention conditioning..."
-        if context_dim is not None:
-            assert (
-                use_spatial_transformer
-            ), "Fool!! You forgot to use the spatial transformer for your cross-attention conditioning..."
-            if type(context_dim) == ListConfig:
-                context_dim = list(context_dim)
         if num_heads_upsample == -1:
             num_heads_upsample = num_heads
@@ -590,106 +318,39 @@ class UNetModel(nn.Module):
             ), "Either num_heads or num_head_channels has to be set"
         self.in_channels = in_channels
         self.model_channels = model_channels
         self.out_channels = out_channels
-        if isinstance(transformer_depth, int):
-            transformer_depth = len(channel_mult) * [transformer_depth]
-        elif isinstance(transformer_depth, ListConfig):
-            transformer_depth = list(transformer_depth)
-        transformer_depth_middle = default(
-            transformer_depth_middle, transformer_depth[-1]
-        )
-        if isinstance(num_res_blocks, int):
-            self.num_res_blocks = len(channel_mult) * [num_res_blocks]
-        else:
-            if len(num_res_blocks) != len(channel_mult):
-                raise ValueError(
-                    "provide num_res_blocks either as an int (globally constant) or "
-                    "as a list/tuple (per-level) with the same length as channel_mult"
-                )
-            self.num_res_blocks = num_res_blocks
-        # self.num_res_blocks = num_res_blocks
-        if disable_self_attentions is not None:
-            # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
-            assert len(disable_self_attentions) == len(channel_mult)
-        if num_attention_blocks is not None:
-            assert len(num_attention_blocks) == len(self.num_res_blocks)
-            assert all(
-                map(
-                    lambda i: self.num_res_blocks[i] >= num_attention_blocks[i],
-                    range(len(num_attention_blocks)),
-                )
-            )
-            print(
-                f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
-                f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
-                f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
-                f"attention will still not be set."
-            )  # todo: convert to warning
         self.attention_resolutions = attention_resolutions
         self.dropout = dropout
         self.channel_mult = channel_mult
         self.conv_resample = conv_resample
-        self.num_classes = num_classes
-        self.use_checkpoint = use_checkpoint
-        if use_fp16:
-            print("WARNING: use_fp16 was dropped and has no effect anymore.")
-        # self.dtype = th.float16 if use_fp16 else th.float32
         self.num_heads = num_heads
         self.num_head_channels = num_head_channels
         self.num_heads_upsample = num_heads_upsample
-        self.predict_codebook_ids = n_embed is not None
-        assert use_fairscale_checkpoint != use_checkpoint or not (
-            use_checkpoint or use_fairscale_checkpoint
-        )
-        self.use_fairscale_checkpoint = False
-        checkpoint_wrapper_fn = (
-            partial(checkpoint_wrapper, offload_to_cpu=offload_to_cpu)
-            if self.use_fairscale_checkpoint
-            else lambda x: x
-        )
         time_embed_dim = model_channels * 4
-        self.time_embed = checkpoint_wrapper_fn(
-            nn.Sequential(
-                linear(model_channels, time_embed_dim),
-                nn.SiLU(),
-                linear(time_embed_dim, time_embed_dim),
-            )
         )
-        if self.num_classes is not None:
-            if isinstance(self.num_classes, int):
-                self.label_emb = nn.Embedding(num_classes, time_embed_dim)
-            elif self.num_classes == "continuous":
-                print("setting up linear c_adm embedding layer")
-                self.label_emb = nn.Linear(1, time_embed_dim)
-            elif self.num_classes == "timestep":
-                self.label_emb = checkpoint_wrapper_fn(
-                    nn.Sequential(
-                        Timestep(model_channels),
-                        nn.Sequential(
-                            linear(model_channels, time_embed_dim),
-                            nn.SiLU(),
-                            linear(time_embed_dim, time_embed_dim),
-                        ),
-                    )
-                )
-            elif self.num_classes == "sequential":
-                assert adm_in_channels is not None
-                self.label_emb = nn.Sequential(
-                    nn.Sequential(
-                        linear(adm_in_channels, time_embed_dim),
-                        nn.SiLU(),
-                        linear(time_embed_dim, time_embed_dim),
-                    )
                 )
-            else:
-                raise ValueError()
         self.input_blocks = nn.ModuleList(
             [
@@ -698,6 +359,26 @@ class UNetModel(nn.Module):
                 )
             ]
         )
         self._feature_size = model_channels
         input_block_chans = [model_channels]
         ch = model_channels
@@ -705,16 +386,13 @@ class UNetModel(nn.Module):
         for level, mult in enumerate(channel_mult):
             for nr in range(self.num_res_blocks[level]):
                 layers = [
-                    checkpoint_wrapper_fn(
-                        ResBlock(
-                            ch,
-                            time_embed_dim,
-                            dropout,
-                            out_channels=mult * model_channels,
-                            dims=dims,
-                            use_checkpoint=use_checkpoint,
-                            use_scale_shift_norm=use_scale_shift_norm,
-                        )
                     )
                 ]
                 ch = mult * model_channels
@@ -724,45 +402,19 @@ class UNetModel(nn.Module):
                     else:
                         num_heads = ch // num_head_channels
                         dim_head = num_head_channels
-                    if legacy:
-                        # num_heads = 1
-                        dim_head = (
-                            ch // num_heads
-                            if use_spatial_transformer
-                            else num_head_channels
-                        )
-                    if exists(disable_self_attentions):
-                        disabled_sa = disable_self_attentions[level]
-                    else:
-                        disabled_sa = False
                     if (
                         not exists(num_attention_blocks)
                         or nr < num_attention_blocks[level]
                     ):
                         layers.append(
-                            checkpoint_wrapper_fn(
-                                AttentionBlock(
-                                    ch,
-                                    use_checkpoint=use_checkpoint,
-                                    num_heads=num_heads,
-                                    num_head_channels=dim_head,
-                                    use_new_attention_order=use_new_attention_order,
-                                )
-                            )
-                            if not use_spatial_transformer
-                            else checkpoint_wrapper_fn(
-                                SpatialTransformer(
-                                    ch,
-                                    num_heads,
-                                    dim_head,
-                                    depth=transformer_depth[level],
-                                    context_dim=context_dim,
-                                    disable_self_attn=disabled_sa,
-                                    use_linear=use_linear_in_transformer,
-                                    attn_type=spatial_transformer_attn_type,
-                                    use_checkpoint=use_checkpoint,
-                                )
                             )
                         )
                 self.input_blocks.append(TimestepEmbedSequential(*layers))
@@ -772,17 +424,14 @@ class UNetModel(nn.Module):
                 out_ch = ch
                 self.input_blocks.append(
                     TimestepEmbedSequential(
-                        checkpoint_wrapper_fn(
-                            ResBlock(
-                                ch,
-                                time_embed_dim,
-                                dropout,
-                                out_channels=out_ch,
-                                dims=dims,
-                                use_checkpoint=use_checkpoint,
-                                use_scale_shift_norm=use_scale_shift_norm,
-                                down=True,
-                            )
                         )
                         if resblock_updown
                         else Downsample(
@@ -800,54 +449,33 @@ class UNetModel(nn.Module):
         else:
             num_heads = ch // num_head_channels
             dim_head = num_head_channels
-        if legacy:
-            # num_heads = 1
-            dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
         self.middle_block = TimestepEmbedSequential(
-            checkpoint_wrapper_fn(
-                ResBlock(
-                    ch,
-                    time_embed_dim,
-                    dropout,
-                    dims=dims,
-                    use_checkpoint=use_checkpoint,
-                    use_scale_shift_norm=use_scale_shift_norm,
-                )
-            ),
-            checkpoint_wrapper_fn(
-                AttentionBlock(
-                    ch,
-                    use_checkpoint=use_checkpoint,
-                    num_heads=num_heads,
-                    num_head_channels=dim_head,
-                    use_new_attention_order=use_new_attention_order,
-                )
-            )
-            if not use_spatial_transformer
-            else checkpoint_wrapper_fn(
-                SpatialTransformer(  # always uses a self-attn
-                    ch,
-                    num_heads,
-                    dim_head,
-                    depth=transformer_depth_middle,
-                    context_dim=context_dim,
-                    disable_self_attn=disable_middle_self_attn,
-                    use_linear=use_linear_in_transformer,
-                    attn_type=spatial_transformer_attn_type,
-                    use_checkpoint=use_checkpoint,
-                )
             ),
-            checkpoint_wrapper_fn(
-                ResBlock(
-                    ch,
-                    time_embed_dim,
-                    dropout,
-                    dims=dims,
-                    use_checkpoint=use_checkpoint,
-                    use_scale_shift_norm=use_scale_shift_norm,
-                )
             ),
         )
         self._feature_size += ch
         self.output_blocks = nn.ModuleList([])
@@ -855,16 +483,13 @@ class UNetModel(nn.Module):
             for i in range(self.num_res_blocks[level] + 1):
                 ich = input_block_chans.pop()
                 layers = [
-                    checkpoint_wrapper_fn(
-                        ResBlock(
-                            ch + ich,
-                            time_embed_dim,
-                            dropout,
-                            out_channels=model_channels * mult,
-                            dims=dims,
-                            use_checkpoint=use_checkpoint,
-                            use_scale_shift_norm=use_scale_shift_norm,
-                        )
                     )
                 ]
                 ch = model_channels * mult
@@ -874,61 +499,32 @@ class UNetModel(nn.Module):
                     else:
                         num_heads = ch // num_head_channels
                         dim_head = num_head_channels
-                    if legacy:
-                        # num_heads = 1
-                        dim_head = (
-                            ch // num_heads
-                            if use_spatial_transformer
-                            else num_head_channels
-                        )
-                    if exists(disable_self_attentions):
-                        disabled_sa = disable_self_attentions[level]
-                    else:
-                        disabled_sa = False
                     if (
                         not exists(num_attention_blocks)
                         or i < num_attention_blocks[level]
                     ):
                         layers.append(
-                            checkpoint_wrapper_fn(
-                                AttentionBlock(
-                                    ch,
-                                    use_checkpoint=use_checkpoint,
-                                    num_heads=num_heads_upsample,
-                                    num_head_channels=dim_head,
-                                    use_new_attention_order=use_new_attention_order,
-                                )
-                            )
-                            if not use_spatial_transformer
-                            else checkpoint_wrapper_fn(
-                                SpatialTransformer(
-                                    ch,
-                                    num_heads,
-                                    dim_head,
-                                    depth=transformer_depth[level],
-                                    context_dim=context_dim,
-                                    disable_self_attn=disabled_sa,
-                                    use_linear=use_linear_in_transformer,
-                                    attn_type=spatial_transformer_attn_type,
-                                    use_checkpoint=use_checkpoint,
-                                )
                             )
                         )
                 if level and i == self.num_res_blocks[level]:
                     out_ch = ch
                     layers.append(
-                        checkpoint_wrapper_fn(
-                            ResBlock(
-                                ch,
-                                time_embed_dim,
-                                dropout,
-                                out_channels=out_ch,
-                                dims=dims,
-                                use_checkpoint=use_checkpoint,
-                                use_scale_shift_norm=use_scale_shift_norm,
-                                up=True,
-                            )
                         )
                         if resblock_updown
                         else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
@@ -937,1133 +533,92 @@ class UNetModel(nn.Module):
                 self.output_blocks.append(TimestepEmbedSequential(*layers))
                 self._feature_size += ch
-        self.out = checkpoint_wrapper_fn(
-            nn.Sequential(
-                normalization(ch),
-                nn.SiLU(),
-                zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)),
-            )
         )
-        if self.predict_codebook_ids:
-            self.id_predictor = checkpoint_wrapper_fn(
-                nn.Sequential(
-                    normalization(ch),
-                    conv_nd(dims, model_channels, n_embed, 1),
-                    # nn.LogSoftmax(dim=1)  # change to cross_entropy and produce non-normalized logits
-                )
-            )
-    def convert_to_fp16(self):
-        """
-        Convert the torso of the model to float16.
-        """
-        self.input_blocks.apply(convert_module_to_f16)
-        self.middle_block.apply(convert_module_to_f16)
-        self.output_blocks.apply(convert_module_to_f16)
-    def convert_to_fp32(self):
-        """
-        Convert the torso of the model to float32.
-        """
-        self.input_blocks.apply(convert_module_to_f32)
-        self.middle_block.apply(convert_module_to_f32)
-        self.output_blocks.apply(convert_module_to_f32)
-    def forward(self, x, timesteps=None, context=None, y=None, **kwargs):
-        """
-        Apply the model to an input batch.
-        :param x: an [N x C x ...] Tensor of inputs.
-        :param timesteps: a 1-D batch of timesteps.
-        :param context: conditioning plugged in via crossattn
-        :param y: an [N] Tensor of labels, if class-conditional.
-        :return: an [N x C x ...] Tensor of outputs.
-        """
         assert (y is not None) == (
-            self.num_classes is not None
         ), "must specify y if and only if the model is class-conditional"
         hs = []
         t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
         emb = self.time_embed(t_emb)
-        if self.num_classes is not None:
             assert y.shape[0] == x.shape[0]
             emb = emb + self.label_emb(y)
-        # h = x.type(self.dtype)
-        h = x
-        for i, module in enumerate(self.input_blocks):
-            h = module(h, emb, context)
-            hs.append(h)
-        h = self.middle_block(h, emb, context)
-        for i, module in enumerate(self.output_blocks):
-            h = th.cat([h, hs.pop()], dim=1)
-            h = module(h, emb, context)
-        h = h.type(x.dtype)
-        if self.predict_codebook_ids:
-            assert False, "not supported anymore. what the f*** are you doing?"
-        else:
-            return self.out(h)
-class UNetModel(nn.Module):
-    """
-    The full UNet model with attention and timestep embedding.
-    :param in_channels: channels in the input Tensor.
-    :param model_channels: base channel count for the model.
-    :param out_channels: channels in the output Tensor.
-    :param num_res_blocks: number of residual blocks per downsample.
-    :param attention_resolutions: a collection of downsample rates at which
-        attention will take place. May be a set, list, or tuple.
-        For example, if this contains 4, then at 4x downsampling, attention
-        will be used.
-    :param dropout: the dropout probability.
-    :param channel_mult: channel multiplier for each level of the UNet.
-    :param conv_resample: if True, use learned convolutions for upsampling and
-        downsampling.
-    :param dims: determines if the signal is 1D, 2D, or 3D.
-    :param num_classes: if specified (as an int), then this model will be
-        class-conditional with `num_classes` classes.
-    :param use_checkpoint: use gradient checkpointing to reduce memory usage.
-    :param num_heads: the number of attention heads in each attention layer.
-    :param num_heads_channels: if specified, ignore num_heads and instead use
-                               a fixed channel width per attention head.
-    :param num_heads_upsample: works with num_heads to set a different number
-                               of heads for upsampling. Deprecated.
-    :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
-    :param resblock_updown: use residual blocks for up/downsampling.
-    :param use_new_attention_order: use a different attention pattern for potentially
-                                    increased efficiency.
-    """
-    def __init__(
-        self,
-        in_channels,
-        model_channels,
-        out_channels,
-        num_res_blocks,
-        attention_resolutions,
-        dropout=0,
-        channel_mult=(1, 2, 4, 8),
-        conv_resample=True,
-        dims=2,
-        num_classes=None,
-        use_checkpoint=False,
-        use_fp16=False,
-        num_heads=-1,
-        num_head_channels=-1,
-        num_heads_upsample=-1,
-        use_scale_shift_norm=False,
-        resblock_updown=False,
-        use_new_attention_order=False,
-        use_spatial_transformer=False,  # custom transformer support
-        transformer_depth=1,  # custom transformer support
-        context_dim=None,  # custom transformer support
-        n_embed=None,  # custom support for prediction of discrete ids into codebook of first stage vq model
-        legacy=True,
-        disable_self_attentions=None,
-        num_attention_blocks=None,
-        disable_middle_self_attn=False,
-        use_linear_in_transformer=False,
-        spatial_transformer_attn_type="softmax",
-        adm_in_channels=None,
-        use_fairscale_checkpoint=False,
-        offload_to_cpu=False,
-        transformer_depth_middle=None,
-    ):
-        super().__init__()
-        from omegaconf.listconfig import ListConfig
-        if use_spatial_transformer:
-            assert (
-                context_dim is not None
-            ), "Fool!! You forgot to include the dimension of your cross-attention conditioning..."
-        if context_dim is not None:
-            assert (
-                use_spatial_transformer
-            ), "Fool!! You forgot to use the spatial transformer for your cross-attention conditioning..."
-            if type(context_dim) == ListConfig:
-                context_dim = list(context_dim)
-        if num_heads_upsample == -1:
-            num_heads_upsample = num_heads
-        if num_heads == -1:
-            assert (
-                num_head_channels != -1
-            ), "Either num_heads or num_head_channels has to be set"
-        if num_head_channels == -1:
-            assert (
-                num_heads != -1
-            ), "Either num_heads or num_head_channels has to be set"
-        self.in_channels = in_channels
-        self.model_channels = model_channels
-        self.out_channels = out_channels
-        if isinstance(transformer_depth, int):
-            transformer_depth = len(channel_mult) * [transformer_depth]
-        elif isinstance(transformer_depth, ListConfig):
-            transformer_depth = list(transformer_depth)
-        transformer_depth_middle = default(
-            transformer_depth_middle, transformer_depth[-1]
-        )
-        if isinstance(num_res_blocks, int):
-            self.num_res_blocks = len(channel_mult) * [num_res_blocks]
-        else:
-            if len(num_res_blocks) != len(channel_mult):
-                raise ValueError(
-                    "provide num_res_blocks either as an int (globally constant) or "
-                    "as a list/tuple (per-level) with the same length as channel_mult"
-                )
-            self.num_res_blocks = num_res_blocks
-        # self.num_res_blocks = num_res_blocks
-        if disable_self_attentions is not None:
-            # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
-            assert len(disable_self_attentions) == len(channel_mult)
-        if num_attention_blocks is not None:
-            assert len(num_attention_blocks) == len(self.num_res_blocks)
-            assert all(
-                map(
-                    lambda i: self.num_res_blocks[i] >= num_attention_blocks[i],
-                    range(len(num_attention_blocks)),
-                )
-            )
-            print(
-                f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
-                f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
-                f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
-                f"attention will still not be set."
-            )  # todo: convert to warning
-        self.attention_resolutions = attention_resolutions
-        self.dropout = dropout
-        self.channel_mult = channel_mult
-        self.conv_resample = conv_resample
-        self.num_classes = num_classes
-        self.use_checkpoint = use_checkpoint
-        if use_fp16:
-            print("WARNING: use_fp16 was dropped and has no effect anymore.")
-        # self.dtype = th.float16 if use_fp16 else th.float32
-        self.num_heads = num_heads
-        self.num_head_channels = num_head_channels
-        self.num_heads_upsample = num_heads_upsample
-        self.predict_codebook_ids = n_embed is not None
-        assert use_fairscale_checkpoint != use_checkpoint or not (
-            use_checkpoint or use_fairscale_checkpoint
-        )
-        self.use_fairscale_checkpoint = False
-        checkpoint_wrapper_fn = (
-            partial(checkpoint_wrapper, offload_to_cpu=offload_to_cpu)
-            if self.use_fairscale_checkpoint
-            else lambda x: x
-        )
-        time_embed_dim = model_channels * 4
-        self.time_embed = checkpoint_wrapper_fn(
-            nn.Sequential(
-                linear(model_channels, time_embed_dim),
-                nn.SiLU(),
-                linear(time_embed_dim, time_embed_dim),
-            )
-        )
-        if self.num_classes is not None:
-            if isinstance(self.num_classes, int):
-                self.label_emb = nn.Embedding(num_classes, time_embed_dim)
-            elif self.num_classes == "continuous":
-                print("setting up linear c_adm embedding layer")
-                self.label_emb = nn.Linear(1, time_embed_dim)
-            elif self.num_classes == "timestep":
-                self.label_emb = checkpoint_wrapper_fn(
-                    nn.Sequential(
-                        Timestep(model_channels),
-                        nn.Sequential(
-                            linear(model_channels, time_embed_dim),
-                            nn.SiLU(),
-                            linear(time_embed_dim, time_embed_dim),
-                        ),
-                    )
-                )
-            elif self.num_classes == "sequential":
-                assert adm_in_channels is not None
-                self.label_emb = nn.Sequential(
-                    nn.Sequential(
-                        linear(adm_in_channels, time_embed_dim),
-                        nn.SiLU(),
-                        linear(time_embed_dim, time_embed_dim),
-                    )
-                )
-            else:
-                raise ValueError()
-        self.input_blocks = nn.ModuleList(
-            [
-                TimestepEmbedSequential(
-                    conv_nd(dims, in_channels, model_channels, 3, padding=1)
-                )
-            ]
-        )
-        self._feature_size = model_channels
-        input_block_chans = [model_channels]
-        ch = model_channels
-        ds = 1
-        for level, mult in enumerate(channel_mult):
-            for nr in range(self.num_res_blocks[level]):
-                layers = [
-                    checkpoint_wrapper_fn(
-                        ResBlock(
-                            ch,
-                            time_embed_dim,
-                            dropout,
-                            out_channels=mult * model_channels,
-                            dims=dims,
-                            use_checkpoint=use_checkpoint,
-                            use_scale_shift_norm=use_scale_shift_norm,
-                        )
-                    )
-                ]
-                ch = mult * model_channels
-                if ds in attention_resolutions:
-                    if num_head_channels == -1:
-                        dim_head = ch // num_heads
-                    else:
-                        num_heads = ch // num_head_channels
-                        dim_head = num_head_channels
-                    if legacy:
-                        # num_heads = 1
-                        dim_head = (
-                            ch // num_heads
-                            if use_spatial_transformer
-                            else num_head_channels
-                        )
-                    if exists(disable_self_attentions):
-                        disabled_sa = disable_self_attentions[level]
-                    else:
-                        disabled_sa = False
-                    if (
-                        not exists(num_attention_blocks)
-                        or nr < num_attention_blocks[level]
-                    ):
-                        layers.append(
-                            checkpoint_wrapper_fn(
-                                AttentionBlock(
-                                    ch,
-                                    use_checkpoint=use_checkpoint,
-                                    num_heads=num_heads,
-                                    num_head_channels=dim_head,
-                                    use_new_attention_order=use_new_attention_order,
-                                )
-                            )
-                            if not use_spatial_transformer
-                            else checkpoint_wrapper_fn(
-                                SpatialTransformer(
-                                    ch,
-                                    num_heads,
-                                    dim_head,
-                                    depth=transformer_depth[level],
-                                    context_dim=context_dim,
-                                    disable_self_attn=disabled_sa,
-                                    use_linear=use_linear_in_transformer,
-                                    attn_type=spatial_transformer_attn_type,
-                                    use_checkpoint=use_checkpoint,
-                                )
-                            )
-                        )
-                self.input_blocks.append(TimestepEmbedSequential(*layers))
-                self._feature_size += ch
-                input_block_chans.append(ch)
-            if level != len(channel_mult) - 1:
-                out_ch = ch
-                self.input_blocks.append(
-                    TimestepEmbedSequential(
-                        checkpoint_wrapper_fn(
-                            ResBlock(
-                                ch,
-                                time_embed_dim,
-                                dropout,
-                                out_channels=out_ch,
-                                dims=dims,
-                                use_checkpoint=use_checkpoint,
-                                use_scale_shift_norm=use_scale_shift_norm,
-                                down=True,
-                            )
-                        )
-                        if resblock_updown
-                        else Downsample(
-                            ch, conv_resample, dims=dims, out_channels=out_ch
-                        )
-                    )
-                )
-                ch = out_ch
-                input_block_chans.append(ch)
-                ds *= 2
-                self._feature_size += ch
-        if num_head_channels == -1:
-            dim_head = ch // num_heads
-        else:
-            num_heads = ch // num_head_channels
-            dim_head = num_head_channels
-        if legacy:
-            # num_heads = 1
-            dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
-        self.middle_block = TimestepEmbedSequential(
-            checkpoint_wrapper_fn(
-                ResBlock(
-                    ch,
-                    time_embed_dim,
-                    dropout,
-                    dims=dims,
-                    use_checkpoint=use_checkpoint,
-                    use_scale_shift_norm=use_scale_shift_norm,
-                )
-            ),
-            checkpoint_wrapper_fn(
-                AttentionBlock(
-                    ch,
-                    use_checkpoint=use_checkpoint,
-                    num_heads=num_heads,
-                    num_head_channels=dim_head,
-                    use_new_attention_order=use_new_attention_order,
-                )
-            )
-            if not use_spatial_transformer
-            else checkpoint_wrapper_fn(
-                SpatialTransformer(  # always uses a self-attn
-                    ch,
-                    num_heads,
-                    dim_head,
-                    depth=transformer_depth_middle,
-                    context_dim=context_dim,
-                    disable_self_attn=disable_middle_self_attn,
-                    use_linear=use_linear_in_transformer,
-                    attn_type=spatial_transformer_attn_type,
-                    use_checkpoint=use_checkpoint,
-                )
-            ),
-            checkpoint_wrapper_fn(
-                ResBlock(
-                    ch,
-                    time_embed_dim,
-                    dropout,
-                    dims=dims,
-                    use_checkpoint=use_checkpoint,
-                    use_scale_shift_norm=use_scale_shift_norm,
-                )
-            ),
-        )
-        self._feature_size += ch
-        self.output_blocks = nn.ModuleList([])
-        for level, mult in list(enumerate(channel_mult))[::-1]:
-            for i in range(self.num_res_blocks[level] + 1):
-                ich = input_block_chans.pop()
-                layers = [
-                    checkpoint_wrapper_fn(
-                        ResBlock(
-                            ch + ich,
-                            time_embed_dim,
-                            dropout,
-                            out_channels=model_channels * mult,
-                            dims=dims,
-                            use_checkpoint=use_checkpoint,
-                            use_scale_shift_norm=use_scale_shift_norm,
-                        )
-                    )
-                ]
-                ch = model_channels * mult
-                if ds in attention_resolutions:
-                    if num_head_channels == -1:
-                        dim_head = ch // num_heads
-                    else:
-                        num_heads = ch // num_head_channels
-                        dim_head = num_head_channels
-                    if legacy:
-                        # num_heads = 1
-                        dim_head = (
-                            ch // num_heads
-                            if use_spatial_transformer
-                            else num_head_channels
-                        )
-                    if exists(disable_self_attentions):
-                        disabled_sa = disable_self_attentions[level]
-                    else:
-                        disabled_sa = False
-                    if (
-                        not exists(num_attention_blocks)
-                        or i < num_attention_blocks[level]
-                    ):
-                        layers.append(
-                            checkpoint_wrapper_fn(
-                                AttentionBlock(
-                                    ch,
-                                    use_checkpoint=use_checkpoint,
-                                    num_heads=num_heads_upsample,
-                                    num_head_channels=dim_head,
-                                    use_new_attention_order=use_new_attention_order,
-                                )
-                            )
-                            if not use_spatial_transformer
-                            else checkpoint_wrapper_fn(
-                                SpatialTransformer(
-                                    ch,
-                                    num_heads,
-                                    dim_head,
-                                    depth=transformer_depth[level],
-                                    context_dim=context_dim,
-                                    disable_self_attn=disabled_sa,
-                                    use_linear=use_linear_in_transformer,
-                                    attn_type=spatial_transformer_attn_type,
-                                    use_checkpoint=use_checkpoint,
-                                )
-                            )
-                        )
-                if level and i == self.num_res_blocks[level]:
-                    out_ch = ch
-                    layers.append(
-                        checkpoint_wrapper_fn(
-                            ResBlock(
-                                ch,
-                                time_embed_dim,
-                                dropout,
-                                out_channels=out_ch,
-                                dims=dims,
-                                use_checkpoint=use_checkpoint,
-                                use_scale_shift_norm=use_scale_shift_norm,
-                                up=True,
-                            )
-                        )
-                        if resblock_updown
-                        else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
-                    )
-                    ds //= 2
-                self.output_blocks.append(TimestepEmbedSequential(*layers))
-                self._feature_size += ch
-        self.out = checkpoint_wrapper_fn(
-            nn.Sequential(
-                normalization(ch),
-                nn.SiLU(),
-                zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)),
-            )
-        )
-        if self.predict_codebook_ids:
-            self.id_predictor = checkpoint_wrapper_fn(
-                nn.Sequential(
-                    normalization(ch),
-                    conv_nd(dims, model_channels, n_embed, 1),
-                    # nn.LogSoftmax(dim=1)  # change to cross_entropy and produce non-normalized logits
-                )
-            )
-    def convert_to_fp16(self):  # "torso" below = input_blocks, middle_block and output_blocks only
-        """
-        Convert the torso of the model to float16.
-        """
-        self.input_blocks.apply(convert_module_to_f16)  # convert_module_to_f16 is defined outside this block
-        self.middle_block.apply(convert_module_to_f16)
-        self.output_blocks.apply(convert_module_to_f16)  # time_embed and out are not passed to apply here
-    def convert_to_fp32(self):  # counterpart of convert_to_fp16; touches the same three containers
-        """
-        Convert the torso of the model to float32.
-        """
-        self.input_blocks.apply(convert_module_to_f32)  # convert_module_to_f32 is defined outside this block
-        self.middle_block.apply(convert_module_to_f32)
-        self.output_blocks.apply(convert_module_to_f32)  # time_embed and out are not passed to apply here
-    def forward(self, x, timesteps=None, context=None, y=None, **kwargs):  # extra **kwargs are accepted but never read in this body
-        """
-        Apply the model to an input batch.
-        :param x: an [N x C x ...] Tensor of inputs.
-        :param timesteps: a 1-D batch of timesteps.
-        :param context: conditioning plugged in via crossattn
-        :param y: an [N] Tensor of labels, if class-conditional.
-        :return: an [N x C x ...] Tensor of outputs.
-        """
-        assert (y is not None) == (
-            self.num_classes is not None
-        ), "must specify y if and only if the model is class-conditional"
-        hs = []  # outputs of every input block, popped in reverse order by the output blocks
-        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)  # NOTE(review): timesteps defaults to None but is passed straight through; presumably callers always supply it - confirm
-        emb = self.time_embed(t_emb)
-        if self.num_classes is not None:
-            assert y.shape[0] == x.shape[0]  # one label entry per batch element
-            emb = emb + self.label_emb(y)  # label embedding is summed into the timestep embedding
-        # h = x.type(self.dtype)
-        h = x
-        for i, module in enumerate(self.input_blocks):  # index i is unused
-            h = module(h, emb, context)
-            hs.append(h)
-        h = self.middle_block(h, emb, context)
-        for i, module in enumerate(self.output_blocks):  # index i is unused
-            h = th.cat([h, hs.pop()], dim=1)  # join with the most recent unconsumed input-block output along dim 1
-            h = module(h, emb, context)
-        h = h.type(x.dtype)  # cast back to the dtype of the input x
-        if self.predict_codebook_ids:
-            assert False, "not supported anymore. what the f*** are you doing?"  # NOTE(review): asserts are removed under -O, in which case this branch would fall through and return None
-        else:
-            return self.out(h)
-import seaborn as sns
-import matplotlib.pyplot as plt
-class UNetAddModel(nn.Module):
-    def __init__(
-        self,
-        in_channels,
-        ctrl_channels,
-        model_channels,
-        out_channels,
-        num_res_blocks,
-        attention_resolutions,
-        dropout=0,
-        channel_mult=(1, 2, 4, 8),
-        attn_type="attn2",
-        attn_layers=[],
-        conv_resample=True,
-        dims=2,
-        num_classes=None,
-        use_checkpoint=False,
-        use_fp16=False,
-        num_heads=-1,
-        num_head_channels=-1,
-        num_heads_upsample=-1,
-        use_scale_shift_norm=False,
-        resblock_updown=False,
-        use_new_attention_order=False,
-        use_spatial_transformer=False,  # custom transformer support
-        transformer_depth=1,  # custom transformer support
-        context_dim=None,  # custom transformer support
-        add_context_dim=None,
-        n_embed=None,  # custom support for prediction of discrete ids into codebook of first stage vq model
-        legacy=True,
-        disable_self_attentions=None,
-        num_attention_blocks=None,
-        disable_middle_self_attn=False,
-        use_linear_in_transformer=False,
-        spatial_transformer_attn_type="softmax",
-        adm_in_channels=None,
-        use_fairscale_checkpoint=False,
-        offload_to_cpu=False,
-        transformer_depth_middle=None,
-    ):
-        super().__init__()
-        from omegaconf.listconfig import ListConfig
-        if use_spatial_transformer:
-            assert (
-                context_dim is not None
-            ), "Fool!! You forgot to include the dimension of your cross-attention conditioning..."
-        if context_dim is not None:
-            assert (
-                use_spatial_transformer
-            ), "Fool!! You forgot to use the spatial transformer for your cross-attention conditioning..."
-            if type(context_dim) == ListConfig:
-                context_dim = list(context_dim)
-        if num_heads_upsample == -1:
-            num_heads_upsample = num_heads
-        if num_heads == -1:
-            assert (
-                num_head_channels != -1
-            ), "Either num_heads or num_head_channels has to be set"
-        if num_head_channels == -1:
-            assert (
-                num_heads != -1
-            ), "Either num_heads or num_head_channels has to be set"
-        self.in_channels = in_channels
-        self.ctrl_channels = ctrl_channels
-        self.model_channels = model_channels
-        self.out_channels = out_channels
-        if isinstance(transformer_depth, int):
-            transformer_depth = len(channel_mult) * [transformer_depth]
-        elif isinstance(transformer_depth, ListConfig):
-            transformer_depth = list(transformer_depth)
-        transformer_depth_middle = default(
-            transformer_depth_middle, transformer_depth[-1]
-        )
-        if isinstance(num_res_blocks, int):
-            self.num_res_blocks = len(channel_mult) * [num_res_blocks]
-        else:
-            if len(num_res_blocks) != len(channel_mult):
-                raise ValueError(
-                    "provide num_res_blocks either as an int (globally constant) or "
-                    "as a list/tuple (per-level) with the same length as channel_mult"
-                )
-            self.num_res_blocks = num_res_blocks
-        # self.num_res_blocks = num_res_blocks
-        if disable_self_attentions is not None:
-            # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
-            assert len(disable_self_attentions) == len(channel_mult)
-        if num_attention_blocks is not None:
-            assert len(num_attention_blocks) == len(self.num_res_blocks)
-            assert all(
-                map(
-                    lambda i: self.num_res_blocks[i] >= num_attention_blocks[i],
-                    range(len(num_attention_blocks)),
-                )
-            )
-            print(
-                f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
-                f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
-                f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
-                f"attention will still not be set."
-            )  # todo: convert to warning
-        self.attention_resolutions = attention_resolutions
-        self.dropout = dropout
-        self.channel_mult = channel_mult
-        self.conv_resample = conv_resample
-        self.num_classes = num_classes
-        self.use_checkpoint = use_checkpoint
-        if use_fp16:
-            print("WARNING: use_fp16 was dropped and has no effect anymore.")
-        # self.dtype = th.float16 if use_fp16 else th.float32
-        self.num_heads = num_heads
-        self.num_head_channels = num_head_channels
-        self.num_heads_upsample = num_heads_upsample
-        self.predict_codebook_ids = n_embed is not None
-        assert use_fairscale_checkpoint != use_checkpoint or not (
-            use_checkpoint or use_fairscale_checkpoint
-        )
-        self.use_fairscale_checkpoint = False
-        checkpoint_wrapper_fn = (
-            partial(checkpoint_wrapper, offload_to_cpu=offload_to_cpu)
-            if self.use_fairscale_checkpoint
-            else lambda x: x
-        )
-        time_embed_dim = model_channels * 4
-        self.time_embed = checkpoint_wrapper_fn(
-            nn.Sequential(
-                linear(model_channels, time_embed_dim),
-                nn.SiLU(),
-                linear(time_embed_dim, time_embed_dim),
-            )
-        )
-        if self.num_classes is not None:
-            if isinstance(self.num_classes, int):
-                self.label_emb = nn.Embedding(num_classes, time_embed_dim)
-            elif self.num_classes == "continuous":
-                print("setting up linear c_adm embedding layer")
-                self.label_emb = nn.Linear(1, time_embed_dim)
-            elif self.num_classes == "timestep":
-                self.label_emb = checkpoint_wrapper_fn(
-                    nn.Sequential(
-                        Timestep(model_channels),
-                        nn.Sequential(
-                            linear(model_channels, time_embed_dim),
-                            nn.SiLU(),
-                            linear(time_embed_dim, time_embed_dim),
-                        ),
-                    )
-                )
-            elif self.num_classes == "sequential":
-                assert adm_in_channels is not None
-                self.label_emb = nn.Sequential(
-                    nn.Sequential(
-                        linear(adm_in_channels, time_embed_dim),
-                        nn.SiLU(),
-                        linear(time_embed_dim, time_embed_dim),
-                    )
-                )
-            else:
-                raise ValueError()
-        self.input_blocks = nn.ModuleList(
-            [
-                TimestepEmbedSequential(
-                    conv_nd(dims, in_channels, model_channels, 3, padding=1)
-                )
-            ]
-        )
-        if self.ctrl_channels > 0:
-            self.add_input_block = TimestepEmbedSequential(
-                conv_nd(dims, ctrl_channels, 16, 3, padding=1),
-                nn.SiLU(),
-                conv_nd(dims, 16, 16, 3, padding=1),
-                nn.SiLU(),
-                conv_nd(dims, 16, 32, 3, padding=1),
-                nn.SiLU(),
-                conv_nd(dims, 32, 32, 3, padding=1),
-                nn.SiLU(),
-                conv_nd(dims, 32, 96, 3, padding=1),
-                nn.SiLU(),
-                conv_nd(dims, 96, 96, 3, padding=1),
-                nn.SiLU(),
-                conv_nd(dims, 96, 256, 3, padding=1),
-                nn.SiLU(),
-                zero_module(conv_nd(dims, 256, model_channels, 3, padding=1))
-            )
-        self._feature_size = model_channels
-        input_block_chans = [model_channels]
-        ch = model_channels
-        ds = 1
-        for level, mult in enumerate(channel_mult):
-            for nr in range(self.num_res_blocks[level]):
-                layers = [
-                    checkpoint_wrapper_fn(
-                        ResBlock(
-                            ch,
-                            time_embed_dim,
-                            dropout,
-                            out_channels=mult * model_channels,
-                            dims=dims,
-                            use_checkpoint=use_checkpoint,
-                            use_scale_shift_norm=use_scale_shift_norm,
-                        )
-                    )
-                ]
-                ch = mult * model_channels
-                if ds in attention_resolutions:
-                    if num_head_channels == -1:
-                        dim_head = ch // num_heads
-                    else:
-                        num_heads = ch // num_head_channels
-                        dim_head = num_head_channels
-                    if legacy:
-                        # num_heads = 1
-                        dim_head = (
-                            ch // num_heads
-                            if use_spatial_transformer
-                            else num_head_channels
-                        )
-                    if exists(disable_self_attentions):
-                        disabled_sa = disable_self_attentions[level]
-                    else:
-                        disabled_sa = False
-                    if (
-                        not exists(num_attention_blocks)
-                        or nr < num_attention_blocks[level]
-                    ):
-                        layers.append(
-                            checkpoint_wrapper_fn(
-                                AttentionBlock(
-                                    ch,
-                                    use_checkpoint=use_checkpoint,
-                                    num_heads=num_heads,
-                                    num_head_channels=dim_head,
-                                    use_new_attention_order=use_new_attention_order,
-                                )
-                            )
-                            if not use_spatial_transformer
-                            else checkpoint_wrapper_fn(
-                                SpatialTransformer(
-                                    ch,
-                                    num_heads,
-                                    dim_head,
-                                    depth=transformer_depth[level],
-                                    context_dim=context_dim,
-                                    add_context_dim=add_context_dim,
-                                    disable_self_attn=disabled_sa,
-                                    use_linear=use_linear_in_transformer,
-                                    attn_type=spatial_transformer_attn_type,
-                                    use_checkpoint=use_checkpoint,
-                                )
-                            )
-                        )
-                self.input_blocks.append(TimestepEmbedSequential(*layers))
-                self._feature_size += ch
-                input_block_chans.append(ch)
-            if level != len(channel_mult) - 1:
-                out_ch = ch
-                self.input_blocks.append(
-                    TimestepEmbedSequential(
-                        checkpoint_wrapper_fn(
-                            ResBlock(
-                                ch,
-                                time_embed_dim,
-                                dropout,
-                                out_channels=out_ch,
-                                dims=dims,
-                                use_checkpoint=use_checkpoint,
-                                use_scale_shift_norm=use_scale_shift_norm,
-                                down=True,
-                            )
-                        )
-                        if resblock_updown
-                        else Downsample(
-                            ch, conv_resample, dims=dims, out_channels=out_ch
-                        )
-                    )
-                )
-                ch = out_ch
-                input_block_chans.append(ch)
-                ds *= 2
-                self._feature_size += ch
-        if num_head_channels == -1:
-            dim_head = ch // num_heads
-        else:
-            num_heads = ch // num_head_channels
-            dim_head = num_head_channels
-        if legacy:
-            # num_heads = 1
-            dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
-        self.middle_block = TimestepEmbedSequential(
-            checkpoint_wrapper_fn(
-                ResBlock(
-                    ch,
-                    time_embed_dim,
-                    dropout,
-                    dims=dims,
-                    use_checkpoint=use_checkpoint,
-                    use_scale_shift_norm=use_scale_shift_norm,
-                )
-            ),
-            checkpoint_wrapper_fn(
-                AttentionBlock(
-                    ch,
-                    use_checkpoint=use_checkpoint,
-                    num_heads=num_heads,
-                    num_head_channels=dim_head,
-                    use_new_attention_order=use_new_attention_order,
-                )
-            )
-            if not use_spatial_transformer
-            else checkpoint_wrapper_fn(
-                SpatialTransformer(  # always uses a self-attn
-                    ch,
-                    num_heads,
-                    dim_head,
-                    depth=transformer_depth_middle,
-                    context_dim=context_dim,
-                    add_context_dim=add_context_dim,
-                    disable_self_attn=disable_middle_self_attn,
-                    use_linear=use_linear_in_transformer,
-                    attn_type=spatial_transformer_attn_type,
-                    use_checkpoint=use_checkpoint,
-                )
-            ),
-            checkpoint_wrapper_fn(
-                ResBlock(
-                    ch,
-                    time_embed_dim,
-                    dropout,
-                    dims=dims,
-                    use_checkpoint=use_checkpoint,
-                    use_scale_shift_norm=use_scale_shift_norm,
-                )
-            ),
-        )
-        self._feature_size += ch
-        self.output_blocks = nn.ModuleList([])
-        for level, mult in list(enumerate(channel_mult))[::-1]:
-            for i in range(self.num_res_blocks[level] + 1):
-                ich = input_block_chans.pop()
-                layers = [
-                    checkpoint_wrapper_fn(
-                        ResBlock(
-                            ch + ich,
-                            time_embed_dim,
-                            dropout,
-                            out_channels=model_channels * mult,
-                            dims=dims,
-                            use_checkpoint=use_checkpoint,
-                            use_scale_shift_norm=use_scale_shift_norm,
-                        )
-                    )
-                ]
-                ch = model_channels * mult
-                if ds in attention_resolutions:
-                    if num_head_channels == -1:
-                        dim_head = ch // num_heads
-                    else:
-                        num_heads = ch // num_head_channels
-                        dim_head = num_head_channels
-                    if legacy:
-                        # num_heads = 1
-                        dim_head = (
-                            ch // num_heads
-                            if use_spatial_transformer
-                            else num_head_channels
-                        )
-                    if exists(disable_self_attentions):
-                        disabled_sa = disable_self_attentions[level]
-                    else:
-                        disabled_sa = False
-                    if (
-                        not exists(num_attention_blocks)
-                        or i < num_attention_blocks[level]
-                    ):
-                        layers.append(
-                            checkpoint_wrapper_fn(
-                                AttentionBlock(
-                                    ch,
-                                    use_checkpoint=use_checkpoint,
-                                    num_heads=num_heads_upsample,
-                                    num_head_channels=dim_head,
-                                    use_new_attention_order=use_new_attention_order,
-                                )
-                            )
-                            if not use_spatial_transformer
-                            else checkpoint_wrapper_fn(
-                                SpatialTransformer(
-                                    ch,
-                                    num_heads,
-                                    dim_head,
-                                    depth=transformer_depth[level],
-                                    context_dim=context_dim,
-                                    add_context_dim=add_context_dim,
-                                    disable_self_attn=disabled_sa,
-                                    use_linear=use_linear_in_transformer,
-                                    attn_type=spatial_transformer_attn_type,
-                                    use_checkpoint=use_checkpoint,
-                                )
-                            )
-                        )
-                if level and i == self.num_res_blocks[level]:
-                    out_ch = ch
-                    layers.append(
-                        checkpoint_wrapper_fn(
-                            ResBlock(
-                                ch,
-                                time_embed_dim,
-                                dropout,
-                                out_channels=out_ch,
-                                dims=dims,
-                                use_checkpoint=use_checkpoint,
-                                use_scale_shift_norm=use_scale_shift_norm,
-                                up=True,
-                            )
-                        )
-                        if resblock_updown
-                        else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
-                    )
-                    ds //= 2
-                self.output_blocks.append(TimestepEmbedSequential(*layers))
-                self._feature_size += ch
-        self.out = checkpoint_wrapper_fn(
-            nn.Sequential(
-                normalization(ch),
-                nn.SiLU(),
-                zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)),
-            )
-        )
-        if self.predict_codebook_ids:
-            self.id_predictor = checkpoint_wrapper_fn(
-                nn.Sequential(
-                    normalization(ch),
-                    conv_nd(dims, model_channels, n_embed, 1),
-                    # nn.LogSoftmax(dim=1)  # change to cross_entropy and produce non-normalized logits
-                )
-            )
-        # cache attn map
-        self.attn_type = attn_type
-        self.attn_layers = attn_layers
-        self.attn_map_cache = []
-        for name, module in self.named_modules():
-            if name.endswith(self.attn_type):
-                item = {"name": name, "heads": module.heads, "size": None, "attn_map": None}
-                self.attn_map_cache.append(item)
-                module.attn_map_cache = item
-    def clear_attn_map(self):
-        for item in self.attn_map_cache:
-            if item["attn_map"] is not None:
-                del item["attn_map"]
-                item["attn_map"] = None
-    def save_attn_map(self, save_name="temp", tokens=""):
-        attn_maps = []
-        for item in self.attn_map_cache:
-            name = item["name"]
-            if any([name.startswith(block) for block in self.attn_layers]):
-                heads = item["heads"]
-                attn_maps.append(item["attn_map"].detach().cpu())
-        attn_map = th.stack(attn_maps, dim=0)
-        attn_map = th.mean(attn_map, dim=0)
-        # attn_map: bh * n * l
-        bh, n, l = attn_map.shape # bh: batch size * heads / n : pixel length(h*w) / l: token length
-        attn_map = attn_map.reshape((-1,heads,n,l)).mean(dim=1)
-        b = attn_map.shape[0]
-        h = w = int(n**0.5)
-        attn_map = attn_map.permute(0,2,1).reshape((b,l,h,w)).numpy()
-        attn_map_i = attn_map[-1]
-        l = attn_map_i.shape[0]
-        fig = plt.figure(figsize=(12, 8), dpi=300)
-        for j in range(12):
-            if j >= l: break
-            ax = fig.add_subplot(3, 4, j+1)
-            sns.heatmap(attn_map_i[j], square=True, xticklabels=False, yticklabels=False)
-            if j < len(tokens):
-                ax.set_title(tokens[j])
-        fig.savefig(f"./temp/attn_map/attn_map_{save_name}.png")
-        plt.close()
-        return attn_map_i
-    def forward(self, x, timesteps=None, context=None, add_context=None, y=None, **kwargs):
-        """
-        Apply the model to an input batch.
-        :param x: an [N x C x ...] Tensor of inputs.
-        :param timesteps: a 1-D batch of timesteps.
-        :param context: conditioning plugged in via crossattn
-        :param y: an [N] Tensor of labels, if class-conditional.
-        :return: an [N x C x ...] Tensor of outputs.
-        """
-        assert (y is not None) == (
-            self.num_classes is not None
-        ), "must specify y if and only if the model is class-conditional"
-        self.clear_attn_map()
-        hs = []
-        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
-        emb = self.time_embed(t_emb)
-        if self.num_classes is not None:
-            assert y.shape[0] == x.shape[0]
-            emb = emb + self.label_emb(y)
-        # h = x.type(self.dtype)
         h = x
         if self.ctrl_channels > 0:
             in_h, add_h = th.split(h, [self.in_channels, self.ctrl_channels], dim=1)
         for i, module in enumerate(self.input_blocks):
             if self.ctrl_channels > 0 and i == 0:
-                h = module(in_h, emb, context, add_context) + self.add_input_block(add_h, emb, context, add_context)
             else:
-                h = module(h, emb, context, add_context)
             hs.append(h)
-        h = self.middle_block(h, emb, context, add_context)
         for i, module in enumerate(self.output_blocks):
             h = th.cat([h, hs.pop()], dim=1)
-            h = module(h, emb, context, add_context)
         h = h.type(x.dtype)
         return self.out(h)

 from abc import abstractmethod
 from typing import Iterable
 import numpy as np
 from ...modules.attention import SpatialTransformer
 from ...modules.diffusionmodules.util import (
     avg_pool_nd,
     conv_nd,
     linear,
     normalization,
 from ...util import default, exists
+class Timestep(nn.Module):
+    """Thin ``nn.Module`` wrapper around ``timestep_embedding``.
+
+    Stores ``dim`` at construction time and forwards it, together with the
+    timesteps, to ``timestep_embedding`` (defined outside this hunk).
+    """
+    def __init__(self, dim):
+        # dim: kept unchanged and passed as the second argument of timestep_embedding.
         super().__init__()
+        self.dim = dim
+    def forward(self, t):
+        """Return ``timestep_embedding(t, self.dim)`` for the timesteps ``t``."""
+        return timestep_embedding(t, self.dim)
 class TimestepBlock(nn.Module):
     """
         self,
         x,
         emb,
+        t_context=None,
+        v_context=None
     ):
         for layer in self:
             if isinstance(layer, TimestepBlock):
                 x = layer(x, emb)
             elif isinstance(layer, SpatialTransformer):
+                x = layer(x, t_context, v_context)
             else:
                 x = layer(x)
         return x
         return x
 class Downsample(nn.Module):
     """
     A downsampling layer with an optional convolution.
 class ResBlock(TimestepBlock):
     """
     A residual block that can optionally change the number of channels.
     """
     def __init__(
         use_conv=False,
         use_scale_shift_norm=False,
         dims=2,
         up=False,
         down=False,
         kernel_size=3,
         exchange_temb_dims=False,
+        skip_t_emb=False
     ):
         super().__init__()
         self.channels = channels
         self.dropout = dropout
         self.out_channels = out_channels or channels
         self.use_conv = use_conv
         self.use_scale_shift_norm = use_scale_shift_norm
         self.exchange_temb_dims = exchange_temb_dims
             self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
     def forward(self, x, emb):
         if self.updown:
             in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
             h = in_rest(x)
             h = self.out_layers(h)
         return self.skip_connection(x) + h
+import seaborn as sns
+import matplotlib.pyplot as plt
+class UnifiedUNetModel(nn.Module):
     def __init__(
         self,
         in_channels,
+        ctrl_channels,
         model_channels,
         out_channels,
         num_res_blocks,
         attention_resolutions,
         dropout=0,
         channel_mult=(1, 2, 4, 8),
+        save_attn_type=None,
+        save_attn_layers=[],
         conv_resample=True,
         dims=2,
+        use_label=None,
         num_heads=-1,
         num_head_channels=-1,
         num_heads_upsample=-1,
         use_scale_shift_norm=False,
         resblock_updown=False,
+        transformer_depth=1,
+        t_context_dim=None,
+        v_context_dim=None,
         num_attention_blocks=None,
         use_linear_in_transformer=False,
         adm_in_channels=None,
+        transformer_depth_middle=None
     ):
         super().__init__()
         if num_heads_upsample == -1:
             num_heads_upsample = num_heads
             ), "Either num_heads or num_head_channels has to be set"
         self.in_channels = in_channels
+        self.ctrl_channels = ctrl_channels
         self.model_channels = model_channels
         self.out_channels = out_channels
+        transformer_depth = len(channel_mult) * [transformer_depth]
+        transformer_depth_middle = default(transformer_depth_middle, transformer_depth[-1])
+        self.num_res_blocks = len(channel_mult) * [num_res_blocks]
         self.attention_resolutions = attention_resolutions
         self.dropout = dropout
         self.channel_mult = channel_mult
         self.conv_resample = conv_resample
+        self.use_label = use_label
         self.num_heads = num_heads
         self.num_head_channels = num_head_channels
         self.num_heads_upsample = num_heads_upsample
         time_embed_dim = model_channels * 4
+        self.time_embed = nn.Sequential(
+            linear(model_channels, time_embed_dim),
+            nn.SiLU(),
+            linear(time_embed_dim, time_embed_dim),
         )
+        if self.use_label is not None:
+            self.label_emb = nn.Sequential(
+                nn.Sequential(
+                    linear(adm_in_channels, time_embed_dim),
+                    nn.SiLU(),
+                    linear(time_embed_dim, time_embed_dim),
                 )
+            )
         self.input_blocks = nn.ModuleList(
             [
                 )
             ]
         )
+        if self.ctrl_channels > 0:
+            self.ctrl_block = TimestepEmbedSequential(
+                conv_nd(dims, ctrl_channels, 16, 3, padding=1),
+                nn.SiLU(),
+                conv_nd(dims, 16, 16, 3, padding=1),
+                nn.SiLU(),
+                conv_nd(dims, 16, 32, 3, padding=1),
+                nn.SiLU(),
+                conv_nd(dims, 32, 32, 3, padding=1),
+                nn.SiLU(),
+                conv_nd(dims, 32, 96, 3, padding=1),
+                nn.SiLU(),
+                conv_nd(dims, 96, 96, 3, padding=1),
+                nn.SiLU(),
+                conv_nd(dims, 96, 256, 3, padding=1),
+                nn.SiLU(),
+                zero_module(conv_nd(dims, 256, model_channels, 3, padding=1))
+            )
         self._feature_size = model_channels
         input_block_chans = [model_channels]
         ch = model_channels
         for level, mult in enumerate(channel_mult):
             for nr in range(self.num_res_blocks[level]):
                 layers = [
+                    ResBlock(
+                        ch,
+                        time_embed_dim,
+                        dropout,
+                        out_channels=mult * model_channels,
+                        dims=dims,
+                        use_scale_shift_norm=use_scale_shift_norm
                     )
                 ]
                 ch = mult * model_channels
                     else:
                         num_heads = ch // num_head_channels
                         dim_head = num_head_channels
                     if (
                         not exists(num_attention_blocks)
                         or nr < num_attention_blocks[level]
                     ):
                         layers.append(
+                            SpatialTransformer(
+                                ch,
+                                num_heads,
+                                dim_head,
+                                depth=transformer_depth[level],
+                                t_context_dim=t_context_dim,
+                                v_context_dim=v_context_dim,
+                                use_linear=use_linear_in_transformer
                             )
                         )
                 self.input_blocks.append(TimestepEmbedSequential(*layers))
                 out_ch = ch
                 self.input_blocks.append(
                     TimestepEmbedSequential(
+                        ResBlock(
+                            ch,
+                            time_embed_dim,
+                            dropout,
+                            out_channels=out_ch,
+                            dims=dims,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                            down=True
                         )
                         if resblock_updown
                         else Downsample(
         else:
             num_heads = ch // num_head_channels
             dim_head = num_head_channels
         self.middle_block = TimestepEmbedSequential(
+            ResBlock(
+                ch,
+                time_embed_dim,
+                dropout,
+                dims=dims,
+                use_scale_shift_norm=use_scale_shift_norm
             ),
+            SpatialTransformer(  # always uses a self-attn
+                ch,
+                num_heads,
+                dim_head,
+                depth=transformer_depth_middle,
+                t_context_dim=t_context_dim,
+                v_context_dim=v_context_dim,
+                use_linear=use_linear_in_transformer
             ),
+            ResBlock(
+                ch,
+                time_embed_dim,
+                dropout,
+                dims=dims,
+                use_scale_shift_norm=use_scale_shift_norm
+            )
         )
         self._feature_size += ch
         self.output_blocks = nn.ModuleList([])
             for i in range(self.num_res_blocks[level] + 1):
                 ich = input_block_chans.pop()
                 layers = [
+                    ResBlock(
+                        ch + ich,
+                        time_embed_dim,
+                        dropout,
+                        out_channels=model_channels * mult,
+                        dims=dims,
+                        use_scale_shift_norm=use_scale_shift_norm
                     )
                 ]
                 ch = model_channels * mult
                     else:
                         num_heads = ch // num_head_channels
                         dim_head = num_head_channels
                     if (
                         not exists(num_attention_blocks)
                         or i < num_attention_blocks[level]
                     ):
                         layers.append(
+                            SpatialTransformer(
+                                ch,
+                                num_heads,
+                                dim_head,
+                                depth=transformer_depth[level],
+                                t_context_dim=t_context_dim,
+                                v_context_dim=v_context_dim,
+                                use_linear=use_linear_in_transformer
                             )
                         )
                 if level and i == self.num_res_blocks[level]:
                     out_ch = ch
                     layers.append(
+                        ResBlock(
+                            ch,
+                            time_embed_dim,
+                            dropout,
+                            out_channels=out_ch,
+                            dims=dims,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                            up=True
                         )
                         if resblock_updown
                         else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
                 self.output_blocks.append(TimestepEmbedSequential(*layers))
                 self._feature_size += ch
+        self.out = nn.Sequential(
+            normalization(ch),
+            nn.SiLU(),
+            zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1))
         )
+        # cache attn map
+        self.attn_type = save_attn_type
+        self.attn_layers = save_attn_layers
+        self.attn_map_cache = []
+        for name, module in self.named_modules():
+            if any([name.endswith(attn_type) for attn_type in self.attn_type]):
+                item = {"name": name, "heads": module.heads, "size": None, "attn_map": None}
+                self.attn_map_cache.append(item)
+                module.attn_map_cache = item
+    def clear_attn_map(self):
+        """Reset the ``"attn_map"`` slot of every entry in ``self.attn_map_cache`` to ``None``."""
+        for item in self.attn_map_cache:
+            if item["attn_map"] is not None:
+                # Remove the stored value before re-creating the key as None;
+                # presumably this is meant to drop the reference to the cached map.
+                del item["attn_map"]
+                item["attn_map"] = None
+    def save_attn_map(self, attn_type="t_attn", save_name="temp", tokens=""):
+        """Average the selected cached attention maps and save them as a heatmap grid.
+
+        A cache entry is selected when its module name starts with one of
+        ``self.attn_layers`` and ends with ``attn_type``. The selected maps are
+        averaged over layers and then over heads. Up to 12 per-token maps of
+        the last batch element are drawn on a 3x4 grid and written to
+        ``temp/attn_map/attn_map_{save_name}.png``.
+
+        :param attn_type: suffix a cached module name must end with.
+        :param save_name: tag inserted into the output file name.
+        :param tokens: subplot titles; ``tokens[j]`` labels map ``j`` when present.
+        :return: numpy array of shape (l, h, w) with the maps of the last batch element.
+        :raises ValueError: if no selected cache entry currently holds an attention map.
+        """
+        import os
+
+        attn_maps = []
+        heads = None
+        for item in self.attn_map_cache:
+            name = item["name"]
+            selected = name.endswith(attn_type) and any(
+                name.startswith(block) for block in self.attn_layers
+            )
+            # Entries reset by clear_attn_map() hold None and cannot be averaged.
+            if not selected or item["attn_map"] is None:
+                continue
+            heads = item["heads"]
+            attn_maps.append(item["attn_map"].detach().cpu())
+        if not attn_maps:
+            # Fail with a readable message instead of an opaque th.stack error.
+            raise ValueError(
+                f"no cached attention map for attn_type={attn_type!r} "
+                f"in layers {self.attn_layers!r}"
+            )
+        # Average over the selected layers.
+        attn_map = th.stack(attn_maps, dim=0).mean(dim=0)
+        # attn_map: bh * n * l
+        # bh: batch size * heads / n: pixel length (h*w) / l: token length
+        _, n, l = attn_map.shape
+        # Split the heads back out of the leading axis and average over them.
+        attn_map = attn_map.reshape((-1, heads, n, l)).mean(dim=1)
+        b = attn_map.shape[0]
+        h = w = int(n**0.5)  # assumes the pixel axis is a flattened square grid
+        attn_map = attn_map.permute(0, 2, 1).reshape((b, l, h, w)).numpy()
+        # Only the last batch element is plotted and returned.
+        attn_map_i = attn_map[-1]
+        # savefig does not create missing directories.
+        os.makedirs("temp/attn_map", exist_ok=True)
+        fig = plt.figure(figsize=(12, 8), dpi=300)
+        for j in range(min(12, l)):
+            ax = fig.add_subplot(3, 4, j + 1)
+            # Draw on the subplot just created instead of relying on the current axes.
+            sns.heatmap(attn_map_i[j], square=True, xticklabels=False, yticklabels=False, ax=ax)
+            if j < len(tokens):
+                ax.set_title(tokens[j])
+        fig.savefig(f"temp/attn_map/attn_map_{save_name}.png")
+        # Close this figure explicitly rather than whichever one is current.
+        plt.close(fig)
+        return attn_map_i
+    def forward(self, x, timesteps=None, t_context=None, v_context=None, y=None, **kwargs):
+        """
+        Run the U-Net: input blocks, middle block, then output blocks with skip connections.
+        :param x: input tensor; when ``self.ctrl_channels > 0`` it is split along
+            dim 1 into ``self.in_channels`` and ``self.ctrl_channels`` channels.
+        :param timesteps: passed to ``timestep_embedding`` to build the time embedding.
+        :param t_context: forwarded unchanged to every input, middle and output block.
+        :param v_context: forwarded unchanged to every input, middle and output block.
+        :param y: label input; must be given if and only if ``self.use_label`` is not None.
+        :param kwargs: accepted but not used in this method.
+        :return: ``self.out`` applied to the final feature map, cast to ``x.dtype``.
+        """
         assert (y is not None) == (
+            self.use_label is not None
         ), "must specify y if and only if the model is class-conditional"
+        # Reset the attention-map cache before this pass runs.
+        self.clear_attn_map()
         hs = []
         t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
         emb = self.time_embed(t_emb)
+        if self.use_label is not None:
             assert y.shape[0] == x.shape[0]
+            # The label embedding is added onto the time embedding.
             emb = emb + self.label_emb(y)
         h = x
         if self.ctrl_channels > 0:
             in_h, add_h = th.split(h, [self.in_channels, self.ctrl_channels], dim=1)
         for i, module in enumerate(self.input_blocks):
             if self.ctrl_channels > 0 and i == 0:
+                # First block only: sum the main-input path and the ctrl_block path.
+                h = module(in_h, emb, t_context, v_context) + self.ctrl_block(add_h, emb, t_context, v_context)
             else:
+                h = module(h, emb, t_context, v_context)
             hs.append(h)
+        h = self.middle_block(h, emb, t_context, v_context)
         for i, module in enumerate(self.output_blocks):
+            # Concatenate the most recently stored input-block output (LIFO) along dim 1.
             h = th.cat([h, hs.pop()], dim=1)
+            h = module(h, emb, t_context, v_context)
         h = h.type(x.dtype)
         return self.out(h)

sgm/modules/diffusionmodules/sampling.py CHANGED Viewed

@@ -412,194 +412,12 @@ class EulerEDMSampler(EDMSampler):
                 inter = inter.cpu().numpy().transpose(1, 2, 0) * 255
                 inters.append(inter.astype(np.uint8))
-        print(f"Local losses: {local_losses}")
         if len(inters) > 0:
             imageio.mimsave(f"./temp/inters/{name}.gif", inters, 'GIF', duration=0.02)
         return x
-class EulerEDMDualSampler(EulerEDMSampler):
-    def prepare_sampling_loop(self, x, cond, uc_1=None, uc_2=None, num_steps=None):
-        sigmas = self.discretization(
-            self.num_steps if num_steps is None else num_steps, device=self.device
-        )
-        uc_1 = default(uc_1, cond)
-        uc_2 = default(uc_2, cond)
-        x *= torch.sqrt(1.0 + sigmas[0] ** 2.0)
-        num_sigmas = len(sigmas)
-        s_in = x.new_ones([x.shape[0]])
-        return x, s_in, sigmas, num_sigmas, cond, uc_1, uc_2
-    def denoise(self, x, model, sigma, cond, uc_1, uc_2):
-        denoised = model.denoiser(model.model, *self.guider.prepare_inputs(x, sigma, cond, uc_1, uc_2))
-        denoised = self.guider(denoised, sigma)
-        return denoised
-    def get_init_noise(self, cfgs, model, cond, batch, uc_1=None, uc_2=None):
-        H, W = batch["target_size_as_tuple"][0]
-        shape = (cfgs.batch_size, cfgs.channel, int(H) // cfgs.factor, int(W) // cfgs.factor)
-        randn = torch.randn(shape).to(torch.device("cuda", index=cfgs.gpu))
-        x = randn.clone()
-        xs = []
-        self.verbose = False
-        for _ in range(cfgs.noise_iters):
-            x, s_in, sigmas, num_sigmas, cond, uc_1, uc_2 = self.prepare_sampling_loop(
-                x, cond, uc_1, uc_2, num_steps=2
-            )
-            superv = {
-                "mask": batch["mask"] if "mask" in batch else None,
-                "seg_mask": batch["seg_mask"] if "seg_mask" in batch else None
-            }
-            local_losses = []
-            for i in self.get_sigma_gen(num_sigmas):
-                gamma = (
-                    min(self.s_churn / (num_sigmas - 1), 2**0.5 - 1)
-                    if self.s_tmin <= sigmas[i] <= self.s_tmax
-                    else 0.0
-                )
-                x, inter, local_loss = self.sampler_step(
-                    s_in * sigmas[i],
-                    s_in * sigmas[i + 1],
-                    model,
-                    x,
-                    cond,
-                    superv,
-                    uc_1,
-                    uc_2,
-                    gamma,
-                    save_loss=True
-                )
-                local_losses.append(local_loss.item())
-            xs.append((randn, local_losses[-1]))
-            randn = torch.randn(shape).to(torch.device("cuda", index=cfgs.gpu))
-            x = randn.clone()
-        self.verbose = True
-        xs.sort(key = lambda x: x[-1])
-        if len(xs) > 0:
-            print(f"Init local loss: Best {xs[0][1]} Worst {xs[-1][1]}")
-            x = xs[0][0]
-        return x
-    def sampler_step(self, sigma, next_sigma, model, x, cond, batch=None, uc_1=None, uc_2=None,
-                     gamma=0.0, alpha=0, iter_enabled=False, thres=None, update=False,
-                     name=None, save_loss=False, save_attn=False, save_inter=False):
-        sigma_hat = sigma * (gamma + 1.0)
-        if gamma > 0:
-            eps = torch.randn_like(x) * self.s_noise
-            x = x + eps * append_dims(sigma_hat**2 - sigma**2, x.ndim) ** 0.5
-        if update:
-            x = self.attend_and_excite(x, model, sigma_hat, cond, batch, alpha, iter_enabled, thres)
-        denoised = self.denoise(x, model, sigma_hat, cond, uc_1, uc_2)
-        denoised_decode = model.decode_first_stage(denoised) if save_inter else None
-        if save_loss:
-            local_loss = model.loss_fn.get_min_local_loss(model.model.diffusion_model.attn_map_cache, batch["mask"], batch["seg_mask"])
-            local_loss = local_loss[-local_loss.shape[0]//3:]
-        else:
-            local_loss = torch.zeros(1)
-        if save_attn:
-            attn_map = model.model.diffusion_model.save_attn_map(save_name=name, save_single=True)
-            self.save_segment_map(attn_map, tokens=batch["label"][0], save_name=name)
-        d = to_d(x, sigma_hat, denoised)
-        dt = append_dims(next_sigma - sigma_hat, x.ndim)
-        euler_step = self.euler_step(x, d, dt)
-        return euler_step, denoised_decode, local_loss
-    def __call__(self, model, x, cond, batch=None, uc_1=None, uc_2=None, num_steps=None, init_step=0,
-                 name=None, aae_enabled=False, detailed=False):
-        x, s_in, sigmas, num_sigmas, cond, uc_1, uc_2 = self.prepare_sampling_loop(
-            x, cond, uc_1, uc_2, num_steps
-        )
-        name = batch["name"][0]
-        inters = []
-        local_losses = []
-        scales = np.linspace(start=1.0, stop=0, num=num_sigmas)
-        iter_lst = np.linspace(start=5, stop=25, num=6, dtype=np.int32)
-        thres_lst = np.linspace(start=-0.5, stop=-0.8, num=6)
-        for i in self.get_sigma_gen(num_sigmas, init_step=init_step):
-            gamma = (
-                min(self.s_churn / (num_sigmas - 1), 2**0.5 - 1)
-                if self.s_tmin <= sigmas[i] <= self.s_tmax
-                else 0.0
-            )
-            alpha = 20 * np.sqrt(scales[i])
-            update = aae_enabled
-            save_loss = aae_enabled
-            save_attn = detailed and (i == (num_sigmas-1)//2)
-            save_inter = aae_enabled
-            if i in iter_lst:
-                iter_enabled = True
-                thres = thres_lst[list(iter_lst).index(i)]
-            else:
-                iter_enabled = False
-                thres = 0.0
-            x, inter, local_loss = self.sampler_step(
-                s_in * sigmas[i],
-                s_in * sigmas[i + 1],
-                model,
-                x,
-                cond,
-                batch,
-                uc_1,
-                uc_2,
-                gamma,
-                alpha=alpha,
-                iter_enabled=iter_enabled,
-                thres=thres,
-                update=update,
-                name=name,
-                save_loss=save_loss,
-                save_attn=save_attn,
-                save_inter=save_inter
-            )
-            local_losses.append(local_loss.item())
-            if inter is not None:
-                inter = torch.clamp((inter + 1.0) / 2.0, min=0.0, max=1.0)[0]
-                inter = inter.cpu().numpy().transpose(1, 2, 0) * 255
-                inters.append(inter.astype(np.uint8))
-        print(f"Local losses: {local_losses}")
-        if len(inters) > 0:
-            imageio.mimsave(f"./temp/inters/{name}.gif", inters, 'GIF', duration=0.1)
-        return x
 class HeunEDMSampler(EDMSampler):

                 inter = inter.cpu().numpy().transpose(1, 2, 0) * 255
                 inters.append(inter.astype(np.uint8))
+        # print(f"Local losses: {local_losses}")
         if len(inters) > 0:
             imageio.mimsave(f"./temp/inters/{name}.gif", inters, 'GIF', duration=0.02)
         return x
 class HeunEDMSampler(EDMSampler):

sgm/modules/diffusionmodules/sampling_utils.py CHANGED Viewed

@@ -7,10 +7,7 @@ from ...util import append_dims
 class NoDynamicThresholding:
     def __call__(self, uncond, cond, scale):
         return uncond + scale * (cond - uncond)
-class DualThresholding: # Dual condition CFG (from instructPix2Pix)
-    def __call__(self, uncond_1, uncond_2, cond, scale):
-        return uncond_1 + scale[0] * (uncond_2 - uncond_1) + scale[1] * (cond - uncond_2)
 def linear_multistep_coeff(order, t, i, j, epsrel=1e-4):
     if order - 1 > i:

 class NoDynamicThresholding:
+    """Callable that combines ``uncond`` and ``cond`` linearly, with no clamping or thresholding."""
     def __call__(self, uncond, cond, scale):
+        # Start at uncond and move along (cond - uncond) by a factor of scale.
         return uncond + scale * (cond - uncond)
 def linear_multistep_coeff(order, t, i, j, epsrel=1e-4):
     if order - 1 > i:

sgm/modules/diffusionmodules/wrappers.py CHANGED Viewed

@@ -28,8 +28,8 @@ class OpenAIWrapper(IdentityWrapper):
         return self.diffusion_model(
             x,
             timesteps=t,
-            context=c.get("crossattn", None),
-            add_context=c.get("add_crossattn", None),
             y=c.get("vector", None),
             **kwargs
         )

         return self.diffusion_model(
             x,
             timesteps=t,
+            t_context=c.get("t_crossattn", None),
+            v_context=c.get("v_crossattn", None),
             y=c.get("vector", None),
             **kwargs
         )

sgm/modules/encoders/modules.py CHANGED Viewed

@@ -14,6 +14,7 @@ from transformers import (
     ByT5Tokenizer,
     CLIPTextModel,
     CLIPTokenizer,
     T5EncoderModel,
     T5Tokenizer,
 )
@@ -38,18 +39,19 @@ import pytorch_lightning as pl
 from torchvision import transforms
 from timm.models.vision_transformer import VisionTransformer
 from safetensors.torch import load_file as load_safetensors
 # disable warning
 from transformers import logging
 logging.set_verbosity_error()
 class AbstractEmbModel(nn.Module):
-    def __init__(self, is_add_embedder=False):
         super().__init__()
         self._is_trainable = None
         self._ucg_rate = None
         self._input_key = None
-        self.is_add_embedder = is_add_embedder
     @property
     def is_trainable(self) -> bool:
@@ -63,6 +65,10 @@ class AbstractEmbModel(nn.Module):
     def input_key(self) -> str:
         return self._input_key
     @is_trainable.setter
     def is_trainable(self, value: bool):
         self._is_trainable = value
@@ -75,6 +81,10 @@ class AbstractEmbModel(nn.Module):
     def input_key(self, value: str):
         self._input_key = value
     @is_trainable.deleter
     def is_trainable(self):
         del self._is_trainable
@@ -87,8 +97,13 @@ class AbstractEmbModel(nn.Module):
     def input_key(self):
         del self._input_key
 class GeneralConditioner(nn.Module):
     OUTPUT_DIM2KEYS = {2: "vector", 3: "crossattn", 4: "concat", 5: "concat"}
     KEY2CATDIM = {"vector": 1, "crossattn": 2, "concat": 1}
@@ -109,7 +124,8 @@ class GeneralConditioner(nn.Module):
                 f"Initialized embedder #{n}: {embedder.__class__.__name__} "
                 f"with {count_params(embedder, False)} params. Trainable: {embedder.is_trainable}"
             )
             if "input_key" in embconfig:
                 embedder.input_key = embconfig["input_key"]
             elif "input_keys" in embconfig:
@@ -156,13 +172,10 @@ class GeneralConditioner(nn.Module):
             if not isinstance(emb_out, (list, tuple)):
                 emb_out = [emb_out]
             for emb in emb_out:
-                if embedder.is_add_embedder:
-                    out_key = "add_crossattn"
                 else:
                     out_key = self.OUTPUT_DIM2KEYS[emb.dim()]
-                if embedder.input_key == "mask":
-                    H, W = batch["image"].shape[-2:]
-                    emb = nn.functional.interpolate(emb, (H//8, W//8))
                 if embedder.ucg_rate > 0.0 and embedder.legacy_ucg_val is None:
                     emb = (
                         expand_dims_like(
@@ -204,28 +217,6 @@ class GeneralConditioner(nn.Module):
         return c, uc
-class DualConditioner(GeneralConditioner):
-    def get_unconditional_conditioning(
-        self, batch_c, batch_uc_1=None, batch_uc_2=None, force_uc_zero_embeddings=None
-    ):
-        if force_uc_zero_embeddings is None:
-            force_uc_zero_embeddings = []
-        ucg_rates = list()
-        for embedder in self.embedders:
-            ucg_rates.append(embedder.ucg_rate)
-            embedder.ucg_rate = 0.0
-        c = self(batch_c)
-        uc_1 = self(batch_uc_1, force_uc_zero_embeddings) if batch_uc_1 is not None else None
-        uc_2 = self(batch_uc_2, force_uc_zero_embeddings[:1]) if batch_uc_2 is not None else None
-        for embedder, rate in zip(self.embedders, ucg_rates):
-            embedder.ucg_rate = rate
-        return c, uc_1, uc_2
 class InceptionV3(nn.Module):
     """Wrapper around the https://github.com/mseitzer/pytorch-fid inception
     port with an additional squeeze at the end"""
@@ -409,7 +400,6 @@ class FrozenCLIPEmbedder(AbstractEmbModel):
     def freeze(self):
         self.transformer = self.transformer.eval()
         for param in self.parameters():
             param.requires_grad = False
@@ -694,24 +684,24 @@ class FrozenOpenCLIPImageEmbedder(AbstractEmbModel):
         if self.output_tokens:
             z, tokens = z[0], z[1]
         z = z.to(image.dtype)
-        if self.ucg_rate > 0.0 and not no_dropout and not (self.max_crops > 0):
-            z = (
-                torch.bernoulli(
-                    (1.0 - self.ucg_rate) * torch.ones(z.shape[0], device=z.device)
-                )[:, None]
-                * z
-            )
-            if tokens is not None:
-                tokens = (
-                    expand_dims_like(
-                        torch.bernoulli(
-                            (1.0 - self.ucg_rate)
-                            * torch.ones(tokens.shape[0], device=tokens.device)
-                        ),
-                        tokens,
-                    )
-                    * tokens
-                )
         if self.unsqueeze_dim:
             z = z[:, None, :]
         if self.output_tokens:
@@ -807,7 +797,7 @@ class FrozenCLIPT5Encoder(AbstractEmbModel):
         return [clip_z, t5_z]
-class SpatialRescaler(nn.Module):
     def __init__(
         self,
         n_stages=1,
@@ -846,6 +836,9 @@ class SpatialRescaler(nn.Module):
                 padding=kernel_size // 2,
             )
         self.wrap_video = wrap_video
     def forward(self, x):
         if self.wrap_video and x.ndim == 5:

     ByT5Tokenizer,
     CLIPTextModel,
     CLIPTokenizer,
+    CLIPVisionModel,
     T5EncoderModel,
     T5Tokenizer,
 )
 from torchvision import transforms
 from timm.models.vision_transformer import VisionTransformer
 from safetensors.torch import load_file as load_safetensors
+from torchvision.utils import save_image
 # disable warning
 from transformers import logging
 logging.set_verbosity_error()
 class AbstractEmbModel(nn.Module):
+    def __init__(self):
         super().__init__()
         self._is_trainable = None
         self._ucg_rate = None
         self._input_key = None
+        self._emb_key = None
     @property
     def is_trainable(self) -> bool:
     def input_key(self) -> str:
         return self._input_key
+    @property
+    def emb_key(self) -> str:
+        return self._emb_key
     @is_trainable.setter
     def is_trainable(self, value: bool):
         self._is_trainable = value
     def input_key(self, value: str):
         self._input_key = value
+    @emb_key.setter
+    def emb_key(self, value: str):
+        self._emb_key = value
     @is_trainable.deleter
     def is_trainable(self):
         del self._is_trainable
     def input_key(self):
         del self._input_key
+    @emb_key.deleter
+    def emb_key(self):
+        del self._emb_key
 class GeneralConditioner(nn.Module):
     OUTPUT_DIM2KEYS = {2: "vector", 3: "crossattn", 4: "concat", 5: "concat"}
     KEY2CATDIM = {"vector": 1, "crossattn": 2, "concat": 1}
                 f"Initialized embedder #{n}: {embedder.__class__.__name__} "
                 f"with {count_params(embedder, False)} params. Trainable: {embedder.is_trainable}"
             )
+            if "emb_key" in embconfig:
+                embedder.emb_key = embconfig["emb_key"]
             if "input_key" in embconfig:
                 embedder.input_key = embconfig["input_key"]
             elif "input_keys" in embconfig:
             if not isinstance(emb_out, (list, tuple)):
                 emb_out = [emb_out]
             for emb in emb_out:
+                if embedder.emb_key is not None:
+                    out_key = embedder.emb_key
                 else:
                     out_key = self.OUTPUT_DIM2KEYS[emb.dim()]
                 if embedder.ucg_rate > 0.0 and embedder.legacy_ucg_val is None:
                     emb = (
                         expand_dims_like(
         return c, uc
 class InceptionV3(nn.Module):
     """Wrapper around the https://github.com/mseitzer/pytorch-fid inception
     port with an additional squeeze at the end"""
     def freeze(self):
         self.transformer = self.transformer.eval()
         for param in self.parameters():
             param.requires_grad = False
         if self.output_tokens:
             z, tokens = z[0], z[1]
         z = z.to(image.dtype)
+        # if self.ucg_rate > 0.0 and not no_dropout and not (self.max_crops > 0):
+        #     z = (
+        #         torch.bernoulli(
+        #             (1.0 - self.ucg_rate) * torch.ones(z.shape[0], device=z.device)
+        #         )[:, None]
+        #         * z
+        #     )
+        #     if tokens is not None:
+        #         tokens = (
+        #             expand_dims_like(
+        #                 torch.bernoulli(
+        #                     (1.0 - self.ucg_rate)
+        #                     * torch.ones(tokens.shape[0], device=tokens.device)
+        #                 ),
+        #                 tokens,
+        #             )
+        #             * tokens
+        #         )
         if self.unsqueeze_dim:
             z = z[:, None, :]
         if self.output_tokens:
         return [clip_z, t5_z]
+class SpatialRescaler(AbstractEmbModel):
     def __init__(
         self,
         n_stages=1,
                 padding=kernel_size // 2,
             )
         self.wrap_video = wrap_video
+    def freeze(self):
+        pass
     def forward(self, x):
         if self.wrap_video and x.ndim == 5:

temp/attn_map/attn_map_3.png ADDED Viewed

temp/attn_map/attn_map_4.png ADDED Viewed

temp/attn_map/attn_map_5.png ADDED Viewed

temp/seg_map/seg_3.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ff197cf810e4ba2d26b76265d48530ff03c7b753e1ae6b0b7dfc8d010801df26
+size 20608

temp/seg_map/seg_4.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fc96f8f8a39aa63faa8ece0d8f758520a41d59b881926a9ddcacb6f5d46099dd
+size 20608

temp/seg_map/seg_5.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:16f008e62ab6b2b5b1ca1f58390808b8c9096edb6ddd85570f17232c441114f2
+size 24704

util.py CHANGED Viewed

@@ -3,34 +3,6 @@ from omegaconf import OmegaConf
 from sgm.util import instantiate_from_config
 from sgm.modules.diffusionmodules.sampling import *
-SD_XL_BASE_RATIOS = {
-    "0.5": (704, 1408),
-    "0.52": (704, 1344),
-    "0.57": (768, 1344),
-    "0.6": (768, 1280),
-    "0.68": (832, 1216),
-    "0.72": (832, 1152),
-    "0.78": (896, 1152),
-    "0.82": (896, 1088),
-    "0.88": (960, 1088),
-    "0.94": (960, 1024),
-    "1.0": (1024, 1024),
-    "1.07": (1024, 960),
-    "1.13": (1088, 960),
-    "1.21": (1088, 896),
-    "1.29": (1152, 896),
-    "1.38": (1152, 832),
-    "1.46": (1216, 832),
-    "1.67": (1280, 768),
-    "1.75": (1344, 768),
-    "1.91": (1344, 704),
-    "2.0": (1408, 704),
-    "2.09": (1472, 704),
-    "2.4": (1536, 640),
-    "2.5": (1600, 640),
-    "2.89": (1664, 576),
-    "3.0": (1728, 576),
-}
 def init_model(cfgs):
@@ -43,8 +15,7 @@ def init_model(cfgs):
     if cfgs.type == "train":
         model.train()
     else:
-        if cfgs.use_gpu:
-            model.to(torch.device("cuda", index=cfgs.gpu))
         model.eval()
         model.freeze()
@@ -56,40 +27,22 @@ def init_sampling(cfgs):
         "target": "sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization",
     }
-    if cfgs.dual_conditioner:
-        guider_config = {
-            "target": "sgm.modules.diffusionmodules.guiders.DualCFG",
-            "params": {"scale": cfgs.scale},
-        }
-        sampler = EulerEDMDualSampler(
-            num_steps=cfgs.steps,
-            discretization_config=discretization_config,
-            guider_config=guider_config,
-            s_churn=0.0,
-            s_tmin=0.0,
-            s_tmax=999.0,
-            s_noise=1.0,
-            verbose=True,
-            device=torch.device("cuda", index=cfgs.gpu)
-        )
-    else:
-        guider_config = {
-            "target": "sgm.modules.diffusionmodules.guiders.VanillaCFG",
-            "params": {"scale": cfgs.scale[0]},
-        }
-        sampler = EulerEDMSampler(
-            num_steps=cfgs.steps,
-            discretization_config=discretization_config,
-            guider_config=guider_config,
-            s_churn=0.0,
-            s_tmin=0.0,
-            s_tmax=999.0,
-            s_noise=1.0,
-            verbose=True,
-            device=torch.device("cuda", index=cfgs.gpu)
-        )
     return sampler
@@ -109,29 +62,17 @@ def deep_copy(batch):
 def prepare_batch(cfgs, batch):
     for key in batch:
-        if isinstance(batch[key], torch.Tensor) and cfgs.use_gpu:
             batch[key] = batch[key].to(torch.device("cuda", index=cfgs.gpu))
-    if not cfgs.dual_conditioner:
-        batch_uc = deep_copy(batch)
-        if "ntxt" in batch:
-            batch_uc["txt"] = batch["ntxt"]
-        else:
-            batch_uc["txt"] = ["" for _ in range(len(batch["txt"]))]
-        if "label" in batch:
-            batch_uc["label"] = ["" for _ in range(len(batch["label"]))]
-        return batch, batch_uc, None
     else:
-        batch_uc_1 = deep_copy(batch)
-        batch_uc_2 = deep_copy(batch)
-        batch_uc_1["ref"] = torch.zeros_like(batch["ref"])
-        batch_uc_2["ref"] = torch.zeros_like(batch["ref"])
-        batch_uc_1["label"] = ["" for _ in range(len(batch["label"]))]
-        return batch, batch_uc_1, batch_uc_2

 from sgm.util import instantiate_from_config
 from sgm.modules.diffusionmodules.sampling import *
 def init_model(cfgs):
     if cfgs.type == "train":
         model.train()
     else:
+        model.to(torch.device("cuda", index=cfgs.gpu))
         model.eval()
         model.freeze()
         "target": "sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization",
     }
+    guider_config = {
+        "target": "sgm.modules.diffusionmodules.guiders.VanillaCFG",
+        "params": {"scale": cfgs.scale[0]},
+    }
+    sampler = EulerEDMSampler(
+        num_steps=cfgs.steps,
+        discretization_config=discretization_config,
+        guider_config=guider_config,
+        s_churn=0.0,
+        s_tmin=0.0,
+        s_tmax=999.0,
+        s_noise=1.0,
+        verbose=True,
+        device=torch.device("cuda", index=cfgs.gpu)
+    )
     return sampler
 def prepare_batch(cfgs, batch):
     for key in batch:
+        if isinstance(batch[key], torch.Tensor):
             batch[key] = batch[key].to(torch.device("cuda", index=cfgs.gpu))
+    batch_uc = deep_copy(batch)
+    if "ntxt" in batch:
+        batch_uc["txt"] = batch["ntxt"]
     else:
+        batch_uc["txt"] = ["" for _ in range(len(batch["txt"]))]
+    if "label" in batch:
+        batch_uc["label"] = ["" for _ in range(len(batch["label"]))]
+    return batch, batch_uc