Commit 64bf706 · 1 Parent(s): 3aaab28
Add model and infrance app

Files changed:
- VARtext_v1.pth +3 -0
- app.py +236 -4
- dist.py +211 -0
- models/__init__.py +39 -0
- models/basic_vae.py +226 -0
- models/basic_var.py +174 -0
- models/helpers.py +59 -0
- models/quant.py +281 -0
- models/var.py +360 -0
- models/vqvae.py +95 -0
- utils/amp_sc.py +89 -0
- utils/arg_util.py +284 -0
- utils/data.py +54 -0
- utils/data_sampler.py +103 -0
- utils/lr_control.py +108 -0
- utils/misc.py +381 -0
    	
VARtext_v1.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bbaa03cee25cb0abba7ac5d476f6b800b78dda29c6cb2773a11b584022585fcf
size 1963751390
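The checkpoint above is stored through Git LFS: the pointer only records an oid and a size of roughly 1.96 GB, so a plain clone without LFS yields just this stub. Below is a minimal, hedged sketch of fetching the real weights with huggingface_hub; the repo id is a placeholder, and repo_type may need to be "space" depending on where the file is hosted.

# Hypothetical sketch: resolve the LFS pointer to the actual ~1.96 GB checkpoint.
from huggingface_hub import hf_hub_download

ckpt_path = hf_hub_download(
    repo_id="user/VARtext",        # placeholder; use the actual repo id
    filename="VARtext_v1.pth",
    # repo_type="space",           # uncomment if the file lives in a Space repo
)
print(ckpt_path)                   # local cache path to pass to app.py as the checkpoint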
    	
app.py
CHANGED
@@ -1,7 +1,239 @@
import torch
from models import VQVAE, build_vae_var
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, SiglipTextModel
from peft import LoraConfig, get_peft_model
import random
from torchvision.transforms import ToPILImage
import numpy as np
from moviepy.editor import ImageSequenceClip
import random
import gradio as gr
import tempfile
import os

class SimpleAdapter(nn.Module):
    def __init__(self, input_dim=512, hidden_dim=1024, out_dim=1024):
        super(SimpleAdapter, self).__init__()
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.norm0 = nn.LayerNorm(input_dim)
        self.activation1 = nn.GELU()
        self.layer2 = nn.Linear(hidden_dim, out_dim)
        self.norm2 = nn.LayerNorm(out_dim)
        self._initialize_weights()

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight, gain=0.001)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.LayerNorm):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        x = self.norm0(x)
        x = self.layer1(x)
        x = self.activation1(x)
        x = self.layer2(x)
        x = self.norm2(x)
        return x

class InrenceTextVAR(nn.Module):
    def __init__(self, pl_checkpoint=None, start_class_id=578, hugging_face_token=None, siglip_model='google/siglip-base-patch16-224', device="cpu", MODEL_DEPTH=16):
        super(InrenceTextVAR, self).__init__()
        self.device = device
        self.class_id = start_class_id
        # Define layers
        patch_nums = (1, 2, 3, 4, 5, 6, 8, 10, 13, 16)
        self.vae, self.var = build_vae_var(
            V=4096, Cvae=32, ch=160, share_quant_resi=4,
            device=device, patch_nums=patch_nums,
            num_classes=1000, depth=MODEL_DEPTH, shared_aln=False,
        )
        self.text_processor = AutoTokenizer.from_pretrained(siglip_model, token=hugging_face_token)
        self.siglip_text_encoder = SiglipTextModel.from_pretrained(siglip_model, token=hugging_face_token).to(device)
        self.adapter = SimpleAdapter(
            input_dim=self.siglip_text_encoder.config.hidden_size,
            out_dim=self.var.C  # Ensure dimensional consistency
        ).to(device)
        self.apply_lora_to_var()
        if pl_checkpoint is not None:
            state_dict = torch.load(pl_checkpoint, map_location="cpu")['state_dict']
            var_state_dict = {k[len('var.'):]: v for k, v in state_dict.items() if k.startswith('var.')}
            vae_state_dict = {k[len('vae.'):]: v for k, v in state_dict.items() if k.startswith('vae.')}
            adapter_state_dict = {k[len('adapter.'):]: v for k, v in state_dict.items() if k.startswith('adapter.')}
            self.var.load_state_dict(var_state_dict)
            self.vae.load_state_dict(vae_state_dict)
            self.adapter.load_state_dict(adapter_state_dict)
        del self.vae.encoder

    def apply_lora_to_var(self):
        """
        Applies LoRA (Low-Rank Adaptation) to the VAR model.
        """
        def find_linear_module_names(model):
            linear_module_names = []
            for name, module in model.named_modules():
                if isinstance(module, nn.Linear):
                    linear_module_names.append(name)
            return linear_module_names

        linear_module_names = find_linear_module_names(self.var)

        lora_config = LoraConfig(
            r=8,
            lora_alpha=32,
            target_modules=linear_module_names,
            lora_dropout=0.05,
            bias="none",
        )

        self.var = get_peft_model(self.var, lora_config)

    @torch.no_grad()
    def generate_image(self, text, beta=1, seed=None, more_smooth=False, top_k=0, top_p=0.9):
        if seed is None:
            seed = random.randint(0, 2**32 - 1)
        inputs = self.text_processor([text], padding="max_length", return_tensors="pt").to(self.device)
        outputs = self.siglip_text_encoder(**inputs)
        pooled_output = outputs.pooler_output  # pooled (EOS token) states
        pooled_output = F.normalize(pooled_output, p=2, dim=-1)  # Normalize delta condition
        cond_delta = F.normalize(pooled_output, p=2, dim=-1).to(self.device)  # Use correct device
        cond_delta = self.adapter(cond_delta)
        cond_delta = F.normalize(cond_delta, p=2, dim=-1)  # Normalize delta condition
        generated_images = self.var.autoregressive_infer_cfg(
            B=1,
            label_B=self.class_id,
            delta_condition=cond_delta[:1],
            beta=beta,
            alpha=1,
            top_k=top_k,
            top_p=top_p,
            more_smooth=more_smooth,
            g_seed=seed
        )
        image = ToPILImage()(generated_images[0].cpu())
        return image

    @torch.no_grad()
    def generate_video(self, text, start_beta, target_beta, fps, length, top_k=0, top_p=0.9, seed=None,
                       more_smooth=False,
                       output_filename='output_video.mp4'):

        if seed is None:
            seed = random.randint(0, 2 ** 32 - 1)

        num_frames = int(fps * length)
        images = []

        # Define an easing function for smoother interpolation
        def ease_in_out(t):
            return t * t * (3 - 2 * t)

        # Generate t values between 0 and 1
        t_values = np.linspace(0, 1, num_frames)
        # Apply the easing function
        eased_t_values = ease_in_out(t_values)
        # Interpolate beta values using the eased t values
        beta_values = start_beta + (target_beta - start_beta) * eased_t_values

        for beta in beta_values:
            image = self.generate_image(text, beta=beta, seed=seed, more_smooth=more_smooth, top_k=top_k, top_p=top_p)
            images.append(np.array(image))

        # Create a video from images
        clip = ImageSequenceClip(images, fps=fps)
        clip.write_videofile(output_filename, codec='libx264')

if __name__ == '__main__':

    # Initialize the model
    checkpoint = 'VARtext_v1.pth'  # Replace with your actual checkpoint path
    device = 'cpu' if not torch.cuda.is_available() else 'cuda'
    state_dict = torch.load(checkpoint, map_location="cpu")
    model = InrenceTextVAR(device=device)
    model.load_state_dict(state_dict)
    model.to(device)

    def generate_image_gradio(text, beta=1.0, seed=None, more_smooth=False, top_k=0, top_p=0.9):
        print(f"Generating image for text: {text}\n"
              f"beta: {beta}\n"
              f"seed: {seed}\n"
              f"more_smooth: {more_smooth}\n"
              f"top_k: {top_k}\n"
              f"top_p: {top_p}\n")
        image = model.generate_image(text, beta=beta, seed=seed, more_smooth=more_smooth, top_k=int(top_k), top_p=top_p)
        return image

    def generate_video_gradio(text, start_beta=1.0, target_beta=1.0, fps=10, length=5.0, top_k=0, top_p=0.9, seed=None, more_smooth=False, progress=gr.Progress()):
        print(f"Generating video for text: {text}\n"
              f"start_beta: {start_beta}\n"
              f"target_beta: {target_beta}\n"
              f"seed: {seed}\n"
              f"more_smooth: {more_smooth}\n"
              f"top_k: {top_k}\n"
              f"top_p: {top_p}"
              f"fps: {fps}\n"
              f"length: {length}\n")
        with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmpfile:
            output_filename = tmpfile.name
        num_frames = int(fps * length)
        beta_values = np.linspace(start_beta, target_beta, num_frames)
        images = []

        for i, beta in enumerate(beta_values):
            image = model.generate_image(text, beta=beta, seed=seed, more_smooth=more_smooth, top_k=top_k, top_p=top_p)
            images.append(np.array(image))
            # Update progress
            progress((i + 1) / num_frames)
            # Yield the frame image to update the GUI
            yield image, gr.update()

        # After generating all frames, create the video
        clip = ImageSequenceClip(images, fps=fps)
        clip.write_videofile(output_filename, codec='libx264')

        # Yield the final video output
        yield gr.update(), output_filename

    with gr.Blocks() as demo:
        gr.Markdown("# Text to Image/Video Generator")
        with gr.Tab("Generate Image"):
            text_input = gr.Textbox(label="Input Text")
            beta_input = gr.Slider(label="Beta", minimum=0.0, maximum=2.5, step=0.05, value=1.0)
            seed_input = gr.Number(label="Seed", value=None)
            more_smooth_input = gr.Checkbox(label="More Smooth", value=False)
            top_k_input = gr.Number(label="Top K", value=0)
            top_p_input = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, step=0.01, value=0.9)
            generate_button = gr.Button("Generate Image")
            image_output = gr.Image(label="Generated Image")
            generate_button.click(
                generate_image_gradio,
                inputs=[text_input, beta_input, seed_input, more_smooth_input, top_k_input, top_p_input],
                outputs=image_output
            )

        with gr.Tab("Generate Video"):
            text_input_video = gr.Textbox(label="Input Text")
            start_beta_input = gr.Slider(label="Start Beta", minimum=0.0, maximum=2.5, step=0.05, value=0)
            target_beta_input = gr.Slider(label="Target Beta", minimum=0.0, maximum=2.5, step=0.05, value=1.0)
            fps_input = gr.Number(label="FPS", value=10)
            length_input = gr.Number(label="Length (seconds)", value=5.0)
            seed_input_video = gr.Number(label="Seed", value=None)
            more_smooth_input_video = gr.Checkbox(label="More Smooth", value=False)
            top_k_input_video = gr.Number(label="Top K", value=0)
            top_p_input_video = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, step=0.01, value=0.9)
            generate_video_button = gr.Button("Generate Video")
            frame_output = gr.Image(label="Current Frame")
            video_output = gr.Video(label="Generated Video")

            generate_video_button.click(
                generate_video_gradio,
                inputs=[text_input_video, start_beta_input, target_beta_input, fps_input, length_input, top_k_input_video, top_p_input_video, seed_input_video, more_smooth_input_video],
                outputs=[frame_output, video_output],
                queue=True  # Enable queuing to allow for progress updates
            )

    demo.launch()
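app.py wires the model into a Gradio UI, but the same two entry points can be driven from a script. A short sketch mirroring the __main__ block above; it assumes, as app.py does, that VARtext_v1.pth holds the full module state_dict, and the prompt string is only an example.

# Sketch: scripted use of the inference model, without the Gradio UI.
import torch
from app import InrenceTextVAR

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = InrenceTextVAR(device=device)
model.load_state_dict(torch.load('VARtext_v1.pth', map_location='cpu'))
model.to(device)

# beta scales the text (delta) condition; seed fixes the sampling noise.
image = model.generate_image("a snowy mountain at sunrise", beta=1.0, seed=0, top_k=0, top_p=0.9)
image.save("sample.png")

# Sweep beta from 0 to 1 over a 3-second, 10 fps clip.
model.generate_video("a snowy mountain at sunrise", start_beta=0.0, target_beta=1.0,
                     fps=10, length=3, output_filename="sample.mp4")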
    	
dist.py
ADDED
@@ -0,0 +1,211 @@
import datetime
import functools
import os
import sys
from typing import List
from typing import Union

import torch
import torch.distributed as tdist
import torch.multiprocessing as mp

__rank, __local_rank, __world_size, __device = 0, 0, 1, 'cuda' if torch.cuda.is_available() else 'cpu'
__initialized = False


def initialized():
    return __initialized


def initialize(fork=False, backend='nccl', gpu_id_if_not_distibuted=0, timeout=30):
    global __device
    if not torch.cuda.is_available():
        print(f'[dist initialize] cuda is not available, use cpu instead', file=sys.stderr)
        return
    elif 'RANK' not in os.environ:
        torch.cuda.set_device(gpu_id_if_not_distibuted)
        __device = torch.empty(1).cuda().device
        print(f'[dist initialize] env variable "RANK" is not set, use {__device} as the device', file=sys.stderr)
        return
    # then 'RANK' must exist
    global_rank, num_gpus = int(os.environ['RANK']), torch.cuda.device_count()
    local_rank = global_rank % num_gpus
    torch.cuda.set_device(local_rank)

    # ref: https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/dist_utils.py#L29
    if mp.get_start_method(allow_none=True) is None:
        method = 'fork' if fork else 'spawn'
        print(f'[dist initialize] mp method={method}')
        mp.set_start_method(method)
    tdist.init_process_group(backend=backend, timeout=datetime.timedelta(seconds=timeout*60))

    global __rank, __local_rank, __world_size, __initialized
    __local_rank = local_rank
    __rank, __world_size = tdist.get_rank(), tdist.get_world_size()
    __device = torch.empty(1).cuda().device
    __initialized = True

    assert tdist.is_initialized(), 'torch.distributed is not initialized!'
    print(f'[lrk={get_local_rank()}, rk={get_rank()}]')


def get_rank():
    return __rank


def get_local_rank():
    return __local_rank


def get_world_size():
    return __world_size


def get_device():
    return __device


def set_gpu_id(gpu_id: int):
    if gpu_id is None: return
    global __device
    if isinstance(gpu_id, (str, int)):
        torch.cuda.set_device(int(gpu_id))
        __device = torch.empty(1).cuda().device
    else:
        raise NotImplementedError


def is_master():
    return __rank == 0


def is_local_master():
    return __local_rank == 0


def new_group(ranks: List[int]):
    if __initialized:
        return tdist.new_group(ranks=ranks)
    return None


def barrier():
    if __initialized:
        tdist.barrier()


def allreduce(t: torch.Tensor, async_op=False):
    if __initialized:
        if not t.is_cuda:
            cu = t.detach().cuda()
            ret = tdist.all_reduce(cu, async_op=async_op)
            t.copy_(cu.cpu())
        else:
            ret = tdist.all_reduce(t, async_op=async_op)
        return ret
    return None


def allgather(t: torch.Tensor, cat=True) -> Union[List[torch.Tensor], torch.Tensor]:
    if __initialized:
        if not t.is_cuda:
            t = t.cuda()
        ls = [torch.empty_like(t) for _ in range(__world_size)]
        tdist.all_gather(ls, t)
    else:
        ls = [t]
    if cat:
        ls = torch.cat(ls, dim=0)
    return ls


def allgather_diff_shape(t: torch.Tensor, cat=True) -> Union[List[torch.Tensor], torch.Tensor]:
    if __initialized:
        if not t.is_cuda:
            t = t.cuda()

        t_size = torch.tensor(t.size(), device=t.device)
        ls_size = [torch.empty_like(t_size) for _ in range(__world_size)]
        tdist.all_gather(ls_size, t_size)

        max_B = max(size[0].item() for size in ls_size)
        pad = max_B - t_size[0].item()
        if pad:
            pad_size = (pad, *t.size()[1:])
            t = torch.cat((t, t.new_empty(pad_size)), dim=0)

        ls_padded = [torch.empty_like(t) for _ in range(__world_size)]
        tdist.all_gather(ls_padded, t)
        ls = []
        for t, size in zip(ls_padded, ls_size):
            ls.append(t[:size[0].item()])
    else:
        ls = [t]
    if cat:
        ls = torch.cat(ls, dim=0)
    return ls


def broadcast(t: torch.Tensor, src_rank) -> None:
    if __initialized:
        if not t.is_cuda:
            cu = t.detach().cuda()
            tdist.broadcast(cu, src=src_rank)
            t.copy_(cu.cpu())
        else:
            tdist.broadcast(t, src=src_rank)


def dist_fmt_vals(val: float, fmt: Union[str, None] = '%.2f') -> Union[torch.Tensor, List]:
    if not initialized():
        return torch.tensor([val]) if fmt is None else [fmt % val]

    ts = torch.zeros(__world_size)
    ts[__rank] = val
    allreduce(ts)
    if fmt is None:
        return ts
    return [fmt % v for v in ts.cpu().numpy().tolist()]


def master_only(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        force = kwargs.pop('force', False)
        if force or is_master():
            ret = func(*args, **kwargs)
        else:
            ret = None
        barrier()
        return ret
    return wrapper


def local_master_only(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        force = kwargs.pop('force', False)
        if force or is_local_master():
            ret = func(*args, **kwargs)
        else:
            ret = None
        barrier()
        return ret
    return wrapper


def for_visualize(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if is_master():
            # with torch.no_grad():
            ret = func(*args, **kwargs)
        else:
            ret = None
        return ret
    return wrapper


def finalize():
    if __initialized:
        tdist.destroy_process_group()
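dist.py keeps all torch.distributed state behind module-level globals, so the rest of the code can call the same helpers whether or not a launcher is used. A hedged sketch of the intended call pattern, assuming a torchrun launch (which sets the RANK variable that initialize() checks); with RANK unset or no CUDA, every call quietly degrades to single-process behaviour.

# Sketch: typical use of dist.py under `torchrun --nproc_per_node=N your_script.py`.
import dist

dist.initialize(backend='nccl', timeout=30)   # falls back to single-GPU/CPU if RANK is unset

if dist.is_master():
    print(f'world size: {dist.get_world_size()}')

# Collect a per-rank scalar (e.g. a loss) as one formatted string per rank.
print(dist.dist_fmt_vals(0.123, fmt='%.3f'))

dist.barrier()
dist.finalize()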
    	
models/__init__.py
ADDED
@@ -0,0 +1,39 @@
from typing import Tuple
import torch.nn as nn

from .quant import VectorQuantizer2
from .var import VAR
from .vqvae import VQVAE


def build_vae_var(
    # Shared args
    device, patch_nums=(1, 2, 3, 4, 5, 6, 8, 10, 13, 16),   # 10 steps by default
    # VQVAE args
    V=4096, Cvae=32, ch=160, share_quant_resi=4,
    # VAR args
    num_classes=1000, depth=16, shared_aln=False, attn_l2_norm=True,
    flash_if_available=True, fused_if_available=True,
    init_adaln=0.5, init_adaln_gamma=1e-5, init_head=0.02, init_std=-1,    # init_std < 0: automated
) -> Tuple[VQVAE, VAR]:
    heads = depth
    width = depth * 64
    dpr = 0.1 * depth/24

    # disable built-in initialization for speed
    for clz in (nn.Linear, nn.LayerNorm, nn.BatchNorm2d, nn.SyncBatchNorm, nn.Conv1d, nn.Conv2d, nn.ConvTranspose1d, nn.ConvTranspose2d):
        setattr(clz, 'reset_parameters', lambda self: None)

    # build models
    vae_local = VQVAE(vocab_size=V, z_channels=Cvae, ch=ch, test_mode=True, share_quant_resi=share_quant_resi, v_patch_nums=patch_nums).to(device)
    var_wo_ddp = VAR(
        vae_local=vae_local,
        num_classes=num_classes, depth=depth, embed_dim=width, num_heads=heads, drop_rate=0., attn_drop_rate=0., drop_path_rate=dpr,
        norm_eps=1e-6, shared_aln=shared_aln, cond_drop_rate=0.1,
        attn_l2_norm=attn_l2_norm,
        patch_nums=patch_nums,
        flash_if_available=flash_if_available, fused_if_available=fused_if_available,
    ).to(device)
    var_wo_ddp.init_weights(init_adaln=init_adaln, init_adaln_gamma=init_adaln_gamma, init_head=init_head, init_std=init_std)

    return vae_local, var_wo_ddp
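build_vae_var derives the transformer width and head count from depth (width = 64 * depth, heads = depth), so the depth=16 default gives a 1024-wide, 16-head VAR; app.py sizes its adapter output to self.var.C accordingly. A small sketch, using the same arguments app.py passes, to instantiate the pair and check parameter counts:

# Sketch: build the VQVAE/VAR pair exactly as app.py does (depth=16 -> embed_dim=1024, 16 heads).
import torch
from models import build_vae_var

device = 'cuda' if torch.cuda.is_available() else 'cpu'
vae, var = build_vae_var(
    V=4096, Cvae=32, ch=160, share_quant_resi=4,
    device=device, patch_nums=(1, 2, 3, 4, 5, 6, 8, 10, 13, 16),
    num_classes=1000, depth=16, shared_aln=False,
)
print(f'VQVAE params: {sum(p.numel() for p in vae.parameters()) / 1e6:.1f}M')
print(f'VAR params:   {sum(p.numel() for p in var.parameters()) / 1e6:.1f}M')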
    	
        models/basic_vae.py
    ADDED
    
    | @@ -0,0 +1,226 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import torch
         | 
| 2 | 
            +
            import torch.nn as nn
         | 
| 3 | 
            +
            import torch.nn.functional as F
         | 
| 4 | 
            +
             | 
| 5 | 
            +
             | 
| 6 | 
            +
            # this file only provides the 2 modules used in VQVAE
         | 
| 7 | 
            +
            __all__ = ['Encoder', 'Decoder',]
         | 
| 8 | 
            +
             | 
| 9 | 
            +
             | 
| 10 | 
            +
            """
         | 
| 11 | 
            +
            References: https://github.com/CompVis/stable-diffusion/blob/21f890f9da3cfbeaba8e2ac3c425ee9e998d5229/ldm/modules/diffusionmodules/model.py
         | 
| 12 | 
            +
            """
         | 
| 13 | 
            +
            # swish
         | 
| 14 | 
            +
            def nonlinearity(x):
         | 
| 15 | 
            +
                return x * torch.sigmoid(x)
         | 
| 16 | 
            +
             | 
| 17 | 
            +
             | 
| 18 | 
            +
            def Normalize(in_channels, num_groups=32):
         | 
| 19 | 
            +
                return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)
         | 
| 20 | 
            +
             | 
| 21 | 
            +
             | 
| 22 | 
            +
            class Upsample2x(nn.Module):
         | 
| 23 | 
            +
                def __init__(self, in_channels):
         | 
| 24 | 
            +
                    super().__init__()
         | 
| 25 | 
            +
                    self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
         | 
| 26 | 
            +
                
         | 
| 27 | 
            +
                def forward(self, x):
         | 
| 28 | 
            +
                    return self.conv(F.interpolate(x, scale_factor=2, mode='nearest'))
         | 
| 29 | 
            +
             | 
| 30 | 
            +
             | 
| 31 | 
            +
            class Downsample2x(nn.Module):
         | 
| 32 | 
            +
                def __init__(self, in_channels):
         | 
| 33 | 
            +
                    super().__init__()
         | 
| 34 | 
            +
                    self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
         | 
| 35 | 
            +
                
         | 
| 36 | 
            +
                def forward(self, x):
         | 
| 37 | 
            +
                    return self.conv(F.pad(x, pad=(0, 1, 0, 1), mode='constant', value=0))
         | 
| 38 | 
            +
             | 
| 39 | 
            +
             | 
| 40 | 
            +
            class ResnetBlock(nn.Module):
         | 
| 41 | 
            +
                def __init__(self, *, in_channels, out_channels=None, dropout): # conv_shortcut=False,  # conv_shortcut: always False in VAE
         | 
| 42 | 
            +
                    super().__init__()
         | 
| 43 | 
            +
                    self.in_channels = in_channels
         | 
| 44 | 
            +
                    out_channels = in_channels if out_channels is None else out_channels
         | 
| 45 | 
            +
                    self.out_channels = out_channels
         | 
| 46 | 
            +
                    
         | 
| 47 | 
            +
                    self.norm1 = Normalize(in_channels)
         | 
| 48 | 
            +
                    self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
         | 
| 49 | 
            +
                    self.norm2 = Normalize(out_channels)
         | 
| 50 | 
            +
                    self.dropout = torch.nn.Dropout(dropout) if dropout > 1e-6 else nn.Identity()
         | 
| 51 | 
            +
                    self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
         | 
| 52 | 
            +
                    if self.in_channels != self.out_channels:
         | 
| 53 | 
            +
                        self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
         | 
| 54 | 
            +
                    else:
         | 
| 55 | 
            +
                        self.nin_shortcut = nn.Identity()
         | 
| 56 | 
            +
                
         | 
| 57 | 
            +
                def forward(self, x):
         | 
| 58 | 
            +
                    h = self.conv1(F.silu(self.norm1(x), inplace=True))
         | 
| 59 | 
            +
                    h = self.conv2(self.dropout(F.silu(self.norm2(h), inplace=True)))
         | 
| 60 | 
            +
                    return self.nin_shortcut(x) + h
         | 
| 61 | 
            +
             | 
| 62 | 
            +
             | 
| 63 | 
            +
            class AttnBlock(nn.Module):
         | 
| 64 | 
            +
                def __init__(self, in_channels):
         | 
| 65 | 
            +
                    super().__init__()
         | 
| 66 | 
            +
                    self.C = in_channels
         | 
| 67 | 
            +
                    
         | 
| 68 | 
            +
                    self.norm = Normalize(in_channels)
         | 
| 69 | 
            +
                    self.qkv = torch.nn.Conv2d(in_channels, 3*in_channels, kernel_size=1, stride=1, padding=0)
         | 
| 70 | 
            +
                    self.w_ratio = int(in_channels) ** (-0.5)
         | 
| 71 | 
            +
                    self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
         | 
| 72 | 
            +
                
         | 
| 73 | 
            +
                def forward(self, x):
         | 
| 74 | 
            +
                    qkv = self.qkv(self.norm(x))
         | 
| 75 | 
            +
                    B, _, H, W = qkv.shape  # should be B,3C,H,W
         | 
| 76 | 
            +
                    C = self.C
         | 
| 77 | 
            +
                    q, k, v = qkv.reshape(B, 3, C, H, W).unbind(1)
         | 
| 78 | 
            +
                    
         | 
| 79 | 
            +
                    # compute attention
         | 
| 80 | 
            +
                    q = q.view(B, C, H * W).contiguous()
         | 
| 81 | 
            +
                    q = q.permute(0, 2, 1).contiguous()     # B,HW,C
         | 
| 82 | 
            +
                    k = k.view(B, C, H * W).contiguous()    # B,C,HW
         | 
| 83 | 
            +
                    w = torch.bmm(q, k).mul_(self.w_ratio)  # B,HW,HW    w[B,i,j]=sum_c q[B,i,C]k[B,C,j]
         | 
| 84 | 
            +
                    w = F.softmax(w, dim=2)
         | 
| 85 | 
            +
                    
        # attend to values
        v = v.view(B, C, H * W).contiguous()
        w = w.permute(0, 2, 1).contiguous()  # B,HW,HW (first HW of k, second of q)
        h = torch.bmm(v, w)  # B, C,HW (HW of q) h[B,C,j] = sum_i v[B,C,i] w[B,i,j]
        h = h.view(B, C, H, W).contiguous()
        
        return x + self.proj_out(h)


def make_attn(in_channels, using_sa=True):
    return AttnBlock(in_channels) if using_sa else nn.Identity()


class Encoder(nn.Module):
    def __init__(
        self, *, ch=128, ch_mult=(1, 2, 4, 8), num_res_blocks=2,
        dropout=0.0, in_channels=3,
        z_channels, double_z=False, using_sa=True, using_mid_sa=True,
    ):
        super().__init__()
        self.ch = ch
        self.num_resolutions = len(ch_mult)
        self.downsample_ratio = 2 ** (self.num_resolutions - 1)
        self.num_res_blocks = num_res_blocks
        self.in_channels = in_channels
        
        # downsampling
        self.conv_in = torch.nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)
        
        in_ch_mult = (1,) + tuple(ch_mult)
        self.down = nn.ModuleList()
        for i_level in range(self.num_resolutions):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_in = ch * in_ch_mult[i_level]
            block_out = ch * ch_mult[i_level]
            for i_block in range(self.num_res_blocks):
                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out, dropout=dropout))
                block_in = block_out
                if i_level == self.num_resolutions - 1 and using_sa:
                    attn.append(make_attn(block_in, using_sa=True))
            down = nn.Module()
            down.block = block
            down.attn = attn
            if i_level != self.num_resolutions - 1:
                down.downsample = Downsample2x(block_in)
            self.down.append(down)
        
        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in, dropout=dropout)
        self.mid.attn_1 = make_attn(block_in, using_sa=using_mid_sa)
        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in, dropout=dropout)
        
        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(block_in, (2 * z_channels if double_z else z_channels), kernel_size=3, stride=1, padding=1)
    
    def forward(self, x):
        # downsampling
        h = self.conv_in(x)
        for i_level in range(self.num_resolutions):
            for i_block in range(self.num_res_blocks):
                h = self.down[i_level].block[i_block](h)
                if len(self.down[i_level].attn) > 0:
                    h = self.down[i_level].attn[i_block](h)
            if i_level != self.num_resolutions - 1:
                h = self.down[i_level].downsample(h)
        
        # middle
        h = self.mid.block_2(self.mid.attn_1(self.mid.block_1(h)))
        
        # end
        h = self.conv_out(F.silu(self.norm_out(h), inplace=True))
        return h


class Decoder(nn.Module):
    def __init__(
        self, *, ch=128, ch_mult=(1, 2, 4, 8), num_res_blocks=2,
        dropout=0.0, in_channels=3,  # in_channels: raw img channels
        z_channels, using_sa=True, using_mid_sa=True,
    ):
        super().__init__()
        self.ch = ch
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.in_channels = in_channels
        
        # compute in_ch_mult, block_in and curr_res at lowest res
        in_ch_mult = (1,) + tuple(ch_mult)
        block_in = ch * ch_mult[self.num_resolutions - 1]
        
        # z to block_in
        self.conv_in = torch.nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
        
        # middle
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in, dropout=dropout)
        self.mid.attn_1 = make_attn(block_in, using_sa=using_mid_sa)
        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in, dropout=dropout)
        
        # upsampling
        self.up = nn.ModuleList()
        for i_level in reversed(range(self.num_resolutions)):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_out = ch * ch_mult[i_level]
            for i_block in range(self.num_res_blocks + 1):
                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out, dropout=dropout))
                block_in = block_out
                if i_level == self.num_resolutions - 1 and using_sa:
                    attn.append(make_attn(block_in, using_sa=True))
            up = nn.Module()
            up.block = block
            up.attn = attn
            if i_level != 0:
                up.upsample = Upsample2x(block_in)
            self.up.insert(0, up)  # prepend to get consistent order
        
        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(block_in, in_channels, kernel_size=3, stride=1, padding=1)
    
    def forward(self, z):
        # z to block_in
        # middle
        h = self.mid.block_2(self.mid.attn_1(self.mid.block_1(self.conv_in(z))))
        
        # upsampling
        for i_level in reversed(range(self.num_resolutions)):
            for i_block in range(self.num_res_blocks + 1):
                h = self.up[i_level].block[i_block](h)
                if len(self.up[i_level].attn) > 0:
                    h = self.up[i_level].attn[i_block](h)
            if i_level != 0:
                h = self.up[i_level].upsample(h)
        
        # end
        h = self.conv_out(F.silu(self.norm_out(h), inplace=True))
        return h
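The Encoder halves the spatial resolution at every level except the last (overall downsample ratio 2 ** (len(ch_mult) - 1)), and the Decoder mirrors it back up. A quick shape sanity check, not part of the commit, assuming the Space's repo root is on the import path:

# Illustrative only: round-trip shapes through the Encoder/Decoder pair above.
import torch
from models.basic_vae import Encoder, Decoder

enc = Encoder(ch=64, ch_mult=(1, 2, 4), num_res_blocks=1, z_channels=32, using_sa=False, using_mid_sa=False)
dec = Decoder(ch=64, ch_mult=(1, 2, 4), num_res_blocks=1, z_channels=32, using_sa=False, using_mid_sa=False)

x = torch.randn(1, 3, 64, 64)    # raw image, B=1
z = enc(x)                       # downsampled by 2 ** (len(ch_mult) - 1) = 4 -> (1, 32, 16, 16)
x_rec = dec(z)                   # upsampled back to (1, 3, 64, 64)
print(z.shape, x_rec.shape)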
    	
        models/basic_var.py
    ADDED
    
    | @@ -0,0 +1,174 @@ | |
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

from models.helpers import DropPath, drop_path


# this file only provides the 3 blocks used in VAR transformer
__all__ = ['FFN', 'AdaLNSelfAttn', 'AdaLNBeforeHead']


# automatically import fused operators
dropout_add_layer_norm = fused_mlp_func = memory_efficient_attention = flash_attn_func = None
try:
    from flash_attn.ops.layer_norm import dropout_add_layer_norm
    from flash_attn.ops.fused_dense import fused_mlp_func
except ImportError: pass
# automatically import faster attention implementations
try: from xformers.ops import memory_efficient_attention
except ImportError: pass
try: from flash_attn import flash_attn_func              # qkv: BLHc, ret: BLHcq
except ImportError: pass
try: from torch.nn.functional import scaled_dot_product_attention as slow_attn    # q, k, v: BHLc
except ImportError:
    def slow_attn(query, key, value, scale: float, attn_mask=None, dropout_p=0.0):
        attn = query.mul(scale) @ key.transpose(-2, -1)  # BHLc @ BHcL => BHLL
        if attn_mask is not None: attn.add_(attn_mask)
        return (F.dropout(attn.softmax(dim=-1), p=dropout_p, inplace=True) if dropout_p > 0 else attn.softmax(dim=-1)) @ value


class FFN(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, drop=0., fused_if_available=True):
        super().__init__()
        self.fused_mlp_func = fused_mlp_func if fused_if_available else None
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = nn.GELU(approximate='tanh')
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop, inplace=True) if drop > 0 else nn.Identity()
    
    def forward(self, x):
        if self.fused_mlp_func is not None:
            return self.drop(self.fused_mlp_func(
                x=x, weight1=self.fc1.weight, weight2=self.fc2.weight, bias1=self.fc1.bias, bias2=self.fc2.bias,
                activation='gelu_approx', save_pre_act=self.training, return_residual=False, checkpoint_lvl=0,
                heuristic=0, process_group=None,
            ))
        else:
            return self.drop(self.fc2(self.act(self.fc1(x))))
    
    def extra_repr(self) -> str:
        return f'fused_mlp_func={self.fused_mlp_func is not None}'


class SelfAttention(nn.Module):
    def __init__(
        self, block_idx, embed_dim=768, num_heads=12,
        attn_drop=0., proj_drop=0., attn_l2_norm=False, flash_if_available=True,
    ):
        super().__init__()
        assert embed_dim % num_heads == 0
        self.block_idx, self.num_heads, self.head_dim = block_idx, num_heads, embed_dim // num_heads  # =64
        self.attn_l2_norm = attn_l2_norm
        if self.attn_l2_norm:
            self.scale = 1
            self.scale_mul_1H11 = nn.Parameter(torch.full(size=(1, self.num_heads, 1, 1), fill_value=4.0).log(), requires_grad=True)
            self.max_scale_mul = torch.log(torch.tensor(100)).item()
        else:
            self.scale = 0.25 / math.sqrt(self.head_dim)
        
        self.mat_qkv = nn.Linear(embed_dim, embed_dim * 3, bias=False)
        self.q_bias, self.v_bias = nn.Parameter(torch.zeros(embed_dim)), nn.Parameter(torch.zeros(embed_dim))
        self.register_buffer('zero_k_bias', torch.zeros(embed_dim))
        
        self.proj = nn.Linear(embed_dim, embed_dim)
        self.proj_drop = nn.Dropout(proj_drop, inplace=True) if proj_drop > 0 else nn.Identity()
        self.attn_drop: float = attn_drop
        self.using_flash = flash_if_available and flash_attn_func is not None
        self.using_xform = flash_if_available and memory_efficient_attention is not None
        
        # only used during inference
        self.caching, self.cached_k, self.cached_v = False, None, None
    
    def kv_caching(self, enable: bool): self.caching, self.cached_k, self.cached_v = enable, None, None
    
    # NOTE: attn_bias is None during inference because kv cache is enabled
    def forward(self, x, attn_bias):
        B, L, C = x.shape
        
        qkv = F.linear(input=x, weight=self.mat_qkv.weight, bias=torch.cat((self.q_bias, self.zero_k_bias, self.v_bias))).view(B, L, 3, self.num_heads, self.head_dim)
        main_type = qkv.dtype
        # qkv: BL3Hc
        
        using_flash = self.using_flash and attn_bias is None and qkv.dtype != torch.float32
        if using_flash or self.using_xform: q, k, v = qkv.unbind(dim=2); dim_cat = 1   # q or k or v: BLHc
        else: q, k, v = qkv.permute(2, 0, 3, 1, 4).unbind(dim=0); dim_cat = 2          # q or k or v: BHLc
        
        if self.attn_l2_norm:
            scale_mul = self.scale_mul_1H11.clamp_max(self.max_scale_mul).exp()
            if using_flash or self.using_xform: scale_mul = scale_mul.transpose(1, 2)  # 1H11 to 11H1
            q = F.normalize(q, dim=-1).mul(scale_mul)
            k = F.normalize(k, dim=-1)
        
        if self.caching:
            if self.cached_k is None: self.cached_k = k; self.cached_v = v
            else: k = self.cached_k = torch.cat((self.cached_k, k), dim=dim_cat); v = self.cached_v = torch.cat((self.cached_v, v), dim=dim_cat)
        
        dropout_p = self.attn_drop if self.training else 0.0
        if using_flash:
            oup = flash_attn_func(q.to(dtype=main_type), k.to(dtype=main_type), v.to(dtype=main_type), dropout_p=dropout_p, softmax_scale=self.scale).view(B, L, C)
        elif self.using_xform:
            oup = memory_efficient_attention(q.to(dtype=main_type), k.to(dtype=main_type), v.to(dtype=main_type), attn_bias=None if attn_bias is None else attn_bias.to(dtype=main_type).expand(B, self.num_heads, -1, -1), p=dropout_p, scale=self.scale).view(B, L, C)
        else:
            oup = slow_attn(query=q, key=k, value=v, scale=self.scale, attn_mask=attn_bias, dropout_p=dropout_p).transpose(1, 2).reshape(B, L, C)
        
        return self.proj_drop(self.proj(oup))
        # attn = (q @ k.transpose(-2, -1)).add_(attn_bias + self.local_rpb())  # BHLc @ BHcL => BHLL
        # attn = self.attn_drop(attn.softmax(dim=-1))
        # oup = (attn @ v).transpose_(1, 2).reshape(B, L, -1)     # BHLL @ BHLc = BHLc => BLHc => BLC
    
    def extra_repr(self) -> str:
        return f'using_flash={self.using_flash}, using_xform={self.using_xform}, attn_l2_norm={self.attn_l2_norm}'


class AdaLNSelfAttn(nn.Module):
    def __init__(
        self, block_idx, last_drop_p, embed_dim, cond_dim, shared_aln: bool, norm_layer,
        num_heads, mlp_ratio=4., drop=0., attn_drop=0., drop_path=0., attn_l2_norm=False,
        flash_if_available=False, fused_if_available=True,
    ):
        super(AdaLNSelfAttn, self).__init__()
        self.block_idx, self.last_drop_p, self.C = block_idx, last_drop_p, embed_dim
        self.C, self.D = embed_dim, cond_dim
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.attn = SelfAttention(block_idx=block_idx, embed_dim=embed_dim, num_heads=num_heads, attn_drop=attn_drop, proj_drop=drop, attn_l2_norm=attn_l2_norm, flash_if_available=flash_if_available)
        self.ffn = FFN(in_features=embed_dim, hidden_features=round(embed_dim * mlp_ratio), drop=drop, fused_if_available=fused_if_available)
        
        self.ln_wo_grad = norm_layer(embed_dim, elementwise_affine=False)
        self.shared_aln = shared_aln
        if self.shared_aln:
            self.ada_gss = nn.Parameter(torch.randn(1, 1, 6, embed_dim) / embed_dim**0.5)
        else:
            lin = nn.Linear(cond_dim, 6*embed_dim)
            self.ada_lin = nn.Sequential(nn.SiLU(inplace=False), lin)
        
        self.fused_add_norm_fn = None
    
    # NOTE: attn_bias is None during inference because kv cache is enabled
    def forward(self, x, cond_BD, attn_bias):   # C: embed_dim, D: cond_dim
        if self.shared_aln:
            gamma1, gamma2, scale1, scale2, shift1, shift2 = (self.ada_gss + cond_BD).unbind(2)  # 116C + B16C =unbind(2)=> 6 B1C
        else:
            gamma1, gamma2, scale1, scale2, shift1, shift2 = self.ada_lin(cond_BD).view(-1, 1, 6, self.C).unbind(2)
        x = x + self.drop_path(self.attn( self.ln_wo_grad(x).mul(scale1.add(1)).add_(shift1), attn_bias=attn_bias ).mul_(gamma1))
        x = x + self.drop_path(self.ffn( self.ln_wo_grad(x).mul(scale2.add(1)).add_(shift2) ).mul(gamma2))  # this mul(gamma2) cannot be in-placed when FusedMLP is used
        return x
    
    def extra_repr(self) -> str:
        return f'shared_aln={self.shared_aln}'


class AdaLNBeforeHead(nn.Module):
    def __init__(self, C, D, norm_layer):   # C: embed_dim, D: cond_dim
        super().__init__()
        self.C, self.D = C, D
        self.ln_wo_grad = norm_layer(C, elementwise_affine=False)
        self.ada_lin = nn.Sequential(nn.SiLU(inplace=False), nn.Linear(D, 2*C))
    
    def forward(self, x_BLC: torch.Tensor, cond_BD: torch.Tensor):
        scale, shift = self.ada_lin(cond_BD).view(-1, 1, 2, self.C).unbind(2)
        return self.ln_wo_grad(x_BLC).mul(scale.add(1)).add_(shift)
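Each AdaLNSelfAttn block maps the condition vector to six per-channel modulation terms (two gains, two scales, two shifts) and applies them around the attention and FFN sub-blocks. A minimal forward-pass sketch, illustrative only; it assumes a recent PyTorch with scaled_dot_product_attention and needs neither flash-attn nor xformers:

# Hypothetical shapes, not taken from the app: B=2 tokens sequences of length 16, width 256.
import torch
import torch.nn as nn
from models.basic_var import AdaLNSelfAttn

blk = AdaLNSelfAttn(
    block_idx=0, last_drop_p=0.0, embed_dim=256, cond_dim=256, shared_aln=False,
    norm_layer=nn.LayerNorm, num_heads=8, flash_if_available=False, fused_if_available=False,
)
x = torch.randn(2, 16, 256)        # (B, L, C) token sequence
cond = torch.randn(2, 256)         # (B, D) condition, expanded to the 6 AdaLN params inside the block
out = blk(x, cond_BD=cond, attn_bias=None)
print(out.shape)                   # torch.Size([2, 16, 256])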
    	
        models/helpers.py
    ADDED
    
    | @@ -0,0 +1,59 @@ | |
|  | |
| 1 | 
            +
            import torch
         | 
| 2 | 
            +
            from torch import nn as nn
         | 
| 3 | 
            +
            from torch.nn import functional as F
         | 
| 4 | 
            +
             | 
| 5 | 
            +
             | 
| 6 | 
            +
            def sample_with_top_k_top_p_(logits_BlV: torch.Tensor, top_k: int = 0, top_p: float = 0.0, rng=None, num_samples=1) -> torch.Tensor:  # return idx, shaped (B, l)
         | 
| 7 | 
            +
                B, l, V = logits_BlV.shape
         | 
| 8 | 
            +
                if top_k > 0:
         | 
| 9 | 
            +
                    idx_to_remove = logits_BlV < logits_BlV.topk(top_k, largest=True, sorted=False, dim=-1)[0].amin(dim=-1, keepdim=True)
         | 
| 10 | 
            +
                    logits_BlV.masked_fill_(idx_to_remove, -torch.inf)
         | 
| 11 | 
            +
                if top_p > 0:
         | 
| 12 | 
            +
                    sorted_logits, sorted_idx = logits_BlV.sort(dim=-1, descending=False)
         | 
| 13 | 
            +
                    sorted_idx_to_remove = sorted_logits.softmax(dim=-1).cumsum_(dim=-1) <= (1 - top_p)
         | 
| 14 | 
            +
                    sorted_idx_to_remove[..., -1:] = False
         | 
| 15 | 
            +
                    logits_BlV.masked_fill_(sorted_idx_to_remove.scatter(sorted_idx.ndim - 1, sorted_idx, sorted_idx_to_remove), -torch.inf)
         | 
| 16 | 
            +
                # sample (have to squeeze cuz torch.multinomial can only be used for 2D tensor)
         | 
| 17 | 
            +
                replacement = num_samples >= 0
         | 
| 18 | 
            +
                num_samples = abs(num_samples)
         | 
| 19 | 
            +
                return torch.multinomial(logits_BlV.softmax(dim=-1).view(-1, V), num_samples=num_samples, replacement=replacement, generator=rng).view(B, l, num_samples)
         | 
| 20 | 
            +
             | 
| 21 | 
            +
             | 
| 22 | 
            +
            def gumbel_softmax_with_rng(logits: torch.Tensor, tau: float = 1, hard: bool = False, eps: float = 1e-10, dim: int = -1, rng: torch.Generator = None) -> torch.Tensor:
         | 
| 23 | 
            +
                if rng is None:
         | 
| 24 | 
            +
                    return F.gumbel_softmax(logits=logits, tau=tau, hard=hard, eps=eps, dim=dim)
         | 
| 25 | 
            +
                
         | 
| 26 | 
            +
                gumbels = (-torch.empty_like(logits, memory_format=torch.legacy_contiguous_format).exponential_(generator=rng).log())
         | 
| 27 | 
            +
                gumbels = (logits + gumbels) / tau
         | 
| 28 | 
            +
                y_soft = gumbels.softmax(dim)
         | 
| 29 | 
            +
                
         | 
| 30 | 
            +
                if hard:
         | 
| 31 | 
            +
                    index = y_soft.max(dim, keepdim=True)[1]
         | 
| 32 | 
            +
                    y_hard = torch.zeros_like(logits, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
         | 
| 33 | 
            +
                    ret = y_hard - y_soft.detach() + y_soft
         | 
| 34 | 
            +
                else:
         | 
| 35 | 
            +
                    ret = y_soft
         | 
| 36 | 
            +
                return ret
         | 
| 37 | 
            +
             | 
| 38 | 
            +
             | 
| 39 | 
            +
            def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):    # taken from timm
         | 
| 40 | 
            +
                if drop_prob == 0. or not training: return x
         | 
| 41 | 
            +
                keep_prob = 1 - drop_prob
         | 
| 42 | 
            +
                shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
         | 
| 43 | 
            +
                random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
         | 
| 44 | 
            +
                if keep_prob > 0.0 and scale_by_keep:
         | 
| 45 | 
            +
                    random_tensor.div_(keep_prob)
         | 
| 46 | 
            +
                return x * random_tensor
         | 
| 47 | 
            +
             | 
| 48 | 
            +
             | 
| 49 | 
            +
            class DropPath(nn.Module):  # taken from timm
         | 
| 50 | 
            +
                def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
         | 
| 51 | 
            +
                    super(DropPath, self).__init__()
         | 
| 52 | 
            +
                    self.drop_prob = drop_prob
         | 
| 53 | 
            +
                    self.scale_by_keep = scale_by_keep
         | 
| 54 | 
            +
                
         | 
| 55 | 
            +
                def forward(self, x):
         | 
| 56 | 
            +
                    return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
         | 
| 57 | 
            +
                
         | 
| 58 | 
            +
                def extra_repr(self):
         | 
| 59 | 
            +
                    return f'(drop_prob=...)'
         | 
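sample_with_top_k_top_p_ masks the logits in place (hence the trailing underscore) before drawing token indices, so pass a copy if the logits are reused. A small usage sketch, not part of the commit:

# Illustrative only: draw one token id per position from dummy (B, l, V) logits.
import torch
from models.helpers import sample_with_top_k_top_p_

logits = torch.randn(2, 4, 1000)   # (B, l, V) next-token logits
idx = sample_with_top_k_top_p_(logits.clone(), top_k=50, top_p=0.9, num_samples=1)
print(idx.shape)                   # torch.Size([2, 4, 1]) sampled token ids per position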
    	
        models/quant.py
    ADDED
    
    | @@ -0,0 +1,281 @@ | |
|  | |
| 1 | 
            +
            from typing import List, Optional, Sequence, Tuple, Union
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            import numpy as np
         | 
| 4 | 
            +
            import torch
         | 
| 5 | 
            +
            from torch import distributed as tdist, nn as nn
         | 
| 6 | 
            +
            from torch.nn import functional as F
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            import dist
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            # this file only provides the VectorQuantizer2 used in VQVAE
         | 
| 11 | 
            +
            __all__ = ['VectorQuantizer2', ]
         | 
| 12 | 
            +
             | 
| 13 | 
            +
             | 
| 14 | 
            +
            class VectorQuantizer2(nn.Module):
         | 
| 15 | 
            +
                # VQGAN originally use beta=1.0, never tried 0.25; SD seems using 0.25
         | 
| 16 | 
            +
                def __init__(
         | 
| 17 | 
            +
                        self, vocab_size, Cvae, using_znorm, beta: float = 0.25,
         | 
| 18 | 
            +
                        default_qresi_counts=0, v_patch_nums=None, quant_resi=0.5, share_quant_resi=4,  # share_quant_resi: args.qsr
         | 
| 19 | 
            +
                ):
         | 
| 20 | 
            +
                    super().__init__()
         | 
| 21 | 
            +
                    self.vocab_size: int = vocab_size
         | 
| 22 | 
            +
                    self.Cvae: int = Cvae
         | 
| 23 | 
            +
                    self.using_znorm: bool = using_znorm
         | 
| 24 | 
            +
                    self.v_patch_nums: Tuple[int] = v_patch_nums
         | 
| 25 | 
            +
             | 
| 26 | 
            +
                    self.quant_resi_ratio = quant_resi
         | 
| 27 | 
            +
                    if share_quant_resi == 0:  # non-shared: \phi_{1 to K} for K scales
         | 
| 28 | 
            +
                        self.quant_resi = PhiNonShared(
         | 
| 29 | 
            +
                            [(Phi(Cvae, quant_resi) if abs(quant_resi) > 1e-6 else nn.Identity()) for _ in
         | 
| 30 | 
            +
                             range(default_qresi_counts or len(self.v_patch_nums))])
         | 
| 31 | 
            +
                    elif share_quant_resi == 1:  # fully shared: only a single \phi for K scales
         | 
| 32 | 
            +
                        self.quant_resi = PhiShared(Phi(Cvae, quant_resi) if abs(quant_resi) > 1e-6 else nn.Identity())
         | 
| 33 | 
            +
                    else:  # partially shared: \phi_{1 to share_quant_resi} for K scales
         | 
| 34 | 
            +
                        self.quant_resi = PhiPartiallyShared(nn.ModuleList(
         | 
| 35 | 
            +
                            [(Phi(Cvae, quant_resi) if abs(quant_resi) > 1e-6 else nn.Identity()) for _ in
         | 
| 36 | 
            +
                             range(share_quant_resi)]))
         | 
| 37 | 
            +
             | 
| 38 | 
            +
                    self.register_buffer('ema_vocab_hit_SV', torch.full((len(self.v_patch_nums), self.vocab_size), fill_value=0.0))
         | 
| 39 | 
            +
                    self.record_hit = 0
         | 
| 40 | 
            +
             | 
| 41 | 
            +
                    self.beta: float = beta
         | 
| 42 | 
            +
                    self.embedding = nn.Embedding(self.vocab_size, self.Cvae)
         | 
| 43 | 
            +
             | 
| 44 | 
            +
                    # only used for progressive training of VAR (not supported yet, will be tested and supported in the future)
         | 
| 45 | 
            +
                    self.prog_si = -1  # progressive training: not supported yet, prog_si always -1
         | 
| 46 | 
            +
             | 
| 47 | 
            +
                def eini(self, eini):
         | 
| 48 | 
            +
                    if eini > 0:
         | 
| 49 | 
            +
                        nn.init.trunc_normal_(self.embedding.weight.data, std=eini)
         | 
| 50 | 
            +
                    elif eini < 0:
         | 
| 51 | 
            +
                        self.embedding.weight.data.uniform_(-abs(eini) / self.vocab_size, abs(eini) / self.vocab_size)
         | 
| 52 | 
            +
             | 
| 53 | 
            +
                def extra_repr(self) -> str:
         | 
| 54 | 
            +
                    return f'{self.v_patch_nums}, znorm={self.using_znorm}, beta={self.beta}  |  S={len(self.v_patch_nums)}, quant_resi={self.quant_resi_ratio}'
         | 
| 55 | 
            +
             | 
| 56 | 
            +
                # ===================== `forward` is only used in VAE training =====================
         | 
| 57 | 
            +
                def forward(self, f_BChw: torch.Tensor, ret_usages=False) -> Tuple[torch.Tensor, List[float], torch.Tensor]:
         | 
| 58 | 
            +
                    dtype = f_BChw.dtype
         | 
| 59 | 
            +
                    if dtype != torch.float32: f_BChw = f_BChw.float()
         | 
| 60 | 
            +
                    B, C, H, W = f_BChw.shape
         | 
| 61 | 
            +
                    f_no_grad = f_BChw.detach()
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                    f_rest = f_no_grad.clone()
         | 
| 64 | 
            +
                    f_hat = torch.zeros_like(f_rest)
         | 
| 65 | 
            +
             | 
| 66 | 
            +
                    with torch.cuda.amp.autocast(enabled=False):
         | 
| 67 | 
            +
                        mean_vq_loss: torch.Tensor = 0.0
         | 
| 68 | 
            +
                        vocab_hit_V = torch.zeros(self.vocab_size, dtype=torch.float, device=f_BChw.device)
         | 
| 69 | 
            +
                        SN = len(self.v_patch_nums)
         | 
| 70 | 
            +
                        for si, pn in enumerate(self.v_patch_nums):  # from small to large
         | 
| 71 | 
            +
                            # find the nearest embedding
         | 
| 72 | 
            +
                            if self.using_znorm:
         | 
| 73 | 
            +
                                rest_NC = F.interpolate(f_rest, size=(pn, pn), mode='bilinear').permute(0, 2, 3, 1).reshape(-1,
         | 
| 74 | 
            +
                                                                                                                            C) if (
         | 
| 75 | 
            +
                                            si != SN - 1) else f_rest.permute(0, 2, 3, 1).reshape(-1, C)
         | 
| 76 | 
            +
                                rest_NC = F.normalize(rest_NC, dim=-1)
         | 
| 77 | 
            +
                                idx_N = torch.argmax(rest_NC @ F.normalize(self.embedding.weight.data.T, dim=0), dim=1)
         | 
| 78 | 
            +
                            else:
         | 
| 79 | 
            +
                                rest_NC = F.interpolate(f_rest, size=(pn, pn), mode='bilinear').permute(0, 2, 3, 1).reshape(-1,
         | 
| 80 | 
            +
                                                                                                                            C) if (
         | 
| 81 | 
            +
                                            si != SN - 1) else f_rest.permute(0, 2, 3, 1).reshape(-1, C)
         | 
| 82 | 
            +
                                d_no_grad = torch.sum(rest_NC.square(), dim=1, keepdim=True) + torch.sum(
         | 
| 83 | 
            +
                                    self.embedding.weight.data.square(), dim=1, keepdim=False)
         | 
| 84 | 
            +
                                d_no_grad.addmm_(rest_NC, self.embedding.weight.data.T, alpha=-2, beta=1)  # (B*h*w, vocab_size)
         | 
| 85 | 
            +
                                idx_N = torch.argmin(d_no_grad, dim=1)
         | 
| 86 | 
            +
             | 
| 87 | 
            +
                            hit_V = idx_N.bincount(minlength=self.vocab_size).float()
         | 
| 88 | 
            +
                            if self.training:
         | 
| 89 | 
            +
                                if dist.initialized(): handler = tdist.all_reduce(hit_V, async_op=True)
         | 
| 90 | 
            +
             | 
| 91 | 
            +
                            # calc loss
         | 
| 92 | 
            +
                            idx_Bhw = idx_N.view(B, pn, pn)
         | 
| 93 | 
            +
                            h_BChw = F.interpolate(self.embedding(idx_Bhw).permute(0, 3, 1, 2), size=(H, W),
         | 
| 94 | 
            +
                                                   mode='bilinear').contiguous() if (si != SN - 1) else self.embedding(
         | 
| 95 | 
            +
                                idx_Bhw).permute(0, 3, 1, 2).contiguous()
         | 
| 96 | 
            +
                            h_BChw = self.quant_resi[si / (SN - 1)](h_BChw)
         | 
| 97 | 
            +
                            f_hat = f_hat + h_BChw
         | 
| 98 | 
            +
                            f_rest -= h_BChw
         | 
| 99 | 
            +
             | 
| 100 | 
            +
                            if self.training and dist.initialized():
         | 
| 101 | 
            +
                                handler.wait()
         | 
| 102 | 
            +
                                if self.record_hit == 0:
         | 
| 103 | 
            +
                                    self.ema_vocab_hit_SV[si].copy_(hit_V)
         | 
| 104 | 
            +
                                elif self.record_hit < 100:
         | 
| 105 | 
            +
                                    self.ema_vocab_hit_SV[si].mul_(0.9).add_(hit_V.mul(0.1))
         | 
| 106 | 
            +
                                else:
         | 
| 107 | 
            +
                                    self.ema_vocab_hit_SV[si].mul_(0.99).add_(hit_V.mul(0.01))
         | 
| 108 | 
            +
                                self.record_hit += 1
         | 
| 109 | 
            +
                            vocab_hit_V.add_(hit_V)
         | 
| 110 | 
            +
                            mean_vq_loss += F.mse_loss(f_hat.data, f_BChw).mul_(self.beta) + F.mse_loss(f_hat, f_no_grad)
         | 
| 111 | 
            +
             | 
| 112 | 
            +
                        mean_vq_loss *= 1. / SN
         | 
| 113 | 
            +
                        f_hat = (f_hat.data - f_no_grad).add_(f_BChw)
         | 
| 114 | 
            +
             | 
| 115 | 
            +
                    margin = tdist.get_world_size() * (f_BChw.numel() / f_BChw.shape[1]) / self.vocab_size * 0.08
         | 
| 116 | 
            +
                    # margin = pn*pn / 100
         | 
| 117 | 
            +
                    if ret_usages:
         | 
| 118 | 
            +
                        usages = [(self.ema_vocab_hit_SV[si] >= margin).float().mean().item() * 100 for si, pn in
         | 
| 119 | 
            +
                                  enumerate(self.v_patch_nums)]
         | 
| 120 | 
            +
                    else:
         | 
| 121 | 
            +
                        usages = None
         | 
| 122 | 
            +
                    return f_hat, usages, mean_vq_loss
         | 
| 123 | 
            +
             | 
| 124 | 
            +
                # ===================== `forward` is only used in VAE training =====================
         | 
| 125 | 
            +
             | 
| 126 | 
            +
                def embed_to_fhat(self, ms_h_BChw: List[torch.Tensor], all_to_max_scale=True, last_one=False) -> Union[
         | 
| 127 | 
            +
                    List[torch.Tensor], torch.Tensor]:
         | 
| 128 | 
            +
                    ls_f_hat_BChw = []
         | 
| 129 | 
            +
                    B = ms_h_BChw[0].shape[0]
         | 
| 130 | 
            +
                    H = W = self.v_patch_nums[-1]
         | 
| 131 | 
            +
                    SN = len(self.v_patch_nums)
         | 
| 132 | 
            +
                    if all_to_max_scale:
         | 
| 133 | 
            +
                        f_hat = ms_h_BChw[0].new_zeros(B, self.Cvae, H, W, dtype=torch.float32)
         | 
| 134 | 
            +
                        for si, pn in enumerate(self.v_patch_nums):  # from small to large
         | 
| 135 | 
            +
                            h_BChw = ms_h_BChw[si]
         | 
| 136 | 
            +
                            if si < len(self.v_patch_nums) - 1:
         | 
| 137 | 
            +
                                h_BChw = F.interpolate(h_BChw, size=(H, W), mode='bilinear')
         | 
| 138 | 
            +
                            h_BChw = self.quant_resi[si / (SN - 1)](h_BChw)
         | 
| 139 | 
            +
                            f_hat.add_(h_BChw)
         | 
| 140 | 
            +
                            if last_one:
         | 
| 141 | 
            +
                                ls_f_hat_BChw = f_hat
         | 
| 142 | 
            +
                            else:
         | 
| 143 | 
            +
                                ls_f_hat_BChw.append(f_hat.clone())
         | 
| 144 | 
            +
                    else:
         | 
| 145 | 
            +
                        # WARNING: this is not the case in VQ-VAE training or inference (we'll interpolate every token map to the max H W, like above)
         | 
| 146 | 
            +
                        # WARNING: this should only be used for experimental purpose
         | 
| 147 | 
            +
                        f_hat = ms_h_BChw[0].new_zeros(B, self.Cvae, self.v_patch_nums[0], self.v_patch_nums[0],
         | 
| 148 | 
            +
                                                       dtype=torch.float32)
         | 
| 149 | 
            +
                        for si, pn in enumerate(self.v_patch_nums):  # from small to large
         | 
| 150 | 
            +
                            f_hat = F.interpolate(f_hat, size=(pn, pn), mode='bilinear')
         | 
| 151 | 
            +
                            h_BChw = self.quant_resi[si / (SN - 1)](ms_h_BChw[si])
         | 
| 152 | 
            +
                            f_hat.add_(h_BChw)
         | 
| 153 | 
            +
                            if last_one:
         | 
| 154 | 
            +
                                ls_f_hat_BChw = f_hat
         | 
| 155 | 
            +
                            else:
         | 
| 156 | 
            +
                                ls_f_hat_BChw.append(f_hat)
         | 
| 157 | 
            +
             | 
| 158 | 
            +
                    return ls_f_hat_BChw
         | 
| 159 | 
            +
             | 
| 160 | 
            +
                def f_to_idxBl_or_fhat(self, f_BChw: torch.Tensor, to_fhat: bool,
         | 
| 161 | 
            +
                                       v_patch_nums: Optional[Sequence[Union[int, Tuple[int, int]]]] = None) -> List[
         | 
| 162 | 
            +
                    Union[torch.Tensor, torch.LongTensor]]:  # z_BChw is the feature from inp_img_no_grad
         | 
| 163 | 
            +
                    B, C, H, W = f_BChw.shape
         | 
| 164 | 
            +
                    f_no_grad = f_BChw.detach()
         | 
| 165 | 
            +
                    f_rest = f_no_grad.clone()
         | 
| 166 | 
            +
                    f_hat = torch.zeros_like(f_rest)
         | 
| 167 | 
            +
             | 
| 168 | 
            +
                    f_hat_or_idx_Bl: List[torch.Tensor] = []
         | 
| 169 | 
            +
             | 
| 170 | 
            +
                    patch_hws = [(pn, pn) if isinstance(pn, int) else (pn[0], pn[1]) for pn in
         | 
| 171 | 
            +
                                 (v_patch_nums or self.v_patch_nums)]  # from small to large
         | 
| 172 | 
            +
                    assert patch_hws[-1][0] == H and patch_hws[-1][1] == W, f'{patch_hws[-1]=} != ({H=}, {W=})'
         | 
| 173 | 
            +
             | 
| 174 | 
            +
                    SN = len(patch_hws)
         | 
| 175 | 
            +
                    for si, (ph, pw) in enumerate(patch_hws):  # from small to large
         | 
| 176 | 
            +
                        if 0 <= self.prog_si < si: break  # progressive training: not supported yet, prog_si always -1
         | 
| 177 | 
            +
                        # find the nearest embedding
         | 
| 178 | 
            +
                        z_NC = F.interpolate(f_rest, size=(ph, pw), mode='bilinear').permute(0, 2, 3, 1).reshape(-1, C) if (
         | 
| 179 | 
            +
                                    si != SN - 1) else f_rest.permute(0, 2, 3, 1).reshape(-1, C)
         | 
| 180 | 
            +
                        if self.using_znorm:
         | 
| 181 | 
            +
                            z_NC = F.normalize(z_NC, dim=-1)
         | 
| 182 | 
            +
                            idx_N = torch.argmax(z_NC @ F.normalize(self.embedding.weight.data.T, dim=0), dim=1)
         | 
| 183 | 
            +
                        else:
         | 
| 184 | 
            +
                            d_no_grad = torch.sum(z_NC.square(), dim=1, keepdim=True) + torch.sum(
         | 
| 185 | 
            +
                                self.embedding.weight.data.square(), dim=1, keepdim=False)
         | 
| 186 | 
            +
                            d_no_grad.addmm_(z_NC, self.embedding.weight.data.T, alpha=-2, beta=1)  # (B*h*w, vocab_size)
         | 
| 187 | 
            +
                            idx_N = torch.argmin(d_no_grad, dim=1)
         | 
| 188 | 
            +
             | 
| 189 | 
            +
                        idx_Bhw = idx_N.view(B, ph, pw)
         | 
| 190 | 
            +
                        h_BChw = F.interpolate(self.embedding(idx_Bhw).permute(0, 3, 1, 2), size=(H, W),
         | 
| 191 | 
            +
                                               mode='bilinear').contiguous() if (si != SN - 1) else self.embedding(idx_Bhw).permute(
         | 
| 192 | 
            +
                            0, 3, 1, 2).contiguous()
         | 
| 193 | 
            +
                        h_BChw = self.quant_resi[si / (SN - 1)](h_BChw)
         | 
| 194 | 
            +
                        f_hat.add_(h_BChw)
         | 
| 195 | 
            +
                        f_rest.sub_(h_BChw)
         | 
| 196 | 
            +
                        f_hat_or_idx_Bl.append(f_hat.clone() if to_fhat else idx_N.reshape(B, ph * pw))
         | 
| 197 | 
            +
             | 
| 198 | 
            +
                    return f_hat_or_idx_Bl
         | 
| 199 | 
            +
             | 
| 200 | 
            +
                # ===================== idxBl_to_var_input: only used in VAR training, for getting teacher-forcing input =====================
         | 
| 201 | 
            +
                def idxBl_to_var_input(self, gt_ms_idx_Bl: List[torch.Tensor]) -> torch.Tensor:
         | 
| 202 | 
            +
                    next_scales = []
         | 
| 203 | 
            +
                    B = gt_ms_idx_Bl[0].shape[0]
         | 
| 204 | 
            +
                    C = self.Cvae
         | 
| 205 | 
            +
                    H = W = self.v_patch_nums[-1]
         | 
| 206 | 
            +
                    SN = len(self.v_patch_nums)
         | 
| 207 | 
            +
             | 
| 208 | 
            +
                    f_hat = gt_ms_idx_Bl[0].new_zeros(B, C, H, W, dtype=torch.float32)
         | 
| 209 | 
            +
                    pn_next: int = self.v_patch_nums[0]
         | 
| 210 | 
            +
                    for si in range(SN - 1):
         | 
| 211 | 
            +
                        if self.prog_si == 0 or (
         | 
| 212 | 
            +
                                0 <= self.prog_si - 1 < si): break  # progressive training: not supported yet, prog_si always -1
         | 
| 213 | 
            +
                        h_BChw = F.interpolate(self.embedding(gt_ms_idx_Bl[si]).transpose_(1, 2).view(B, C, pn_next, pn_next),
         | 
| 214 | 
            +
                                               size=(H, W), mode='bilinear')
         | 
| 215 | 
            +
                        f_hat.add_(self.quant_resi[si / (SN - 1)](h_BChw))
         | 
| 216 | 
            +
                        pn_next = self.v_patch_nums[si + 1]
         | 
| 217 | 
            +
                        next_scales.append(
         | 
| 218 | 
            +
                            F.interpolate(f_hat, size=(pn_next, pn_next), mode='bilinear').view(B, C, -1).transpose(1, 2))
         | 
| 219 | 
            +
                    return torch.cat(next_scales, dim=1) if len(next_scales) else None  # cat BlCs to BLC, this should be float32
         | 
| 220 | 
            +
             | 
| 221 | 
            +
                # ===================== get_next_autoregressive_input: only used in VAR inference, for getting next step's input =====================
         | 
| 222 | 
            +
                def get_next_autoregressive_input(self, si: int, SN: int, f_hat: torch.Tensor, h_BChw: torch.Tensor) -> Tuple[
         | 
| 223 | 
            +
                    Optional[torch.Tensor], torch.Tensor]:  # only used in VAR inference
         | 
| 224 | 
            +
                    HW = self.v_patch_nums[-1]
         | 
| 225 | 
            +
                    if si != SN - 1:
         | 
| 226 | 
            +
                        h = self.quant_resi[si / (SN - 1)](
         | 
| 227 | 
            +
                            F.interpolate(h_BChw, size=(HW, HW), mode='bilinear'))  # conv after upsample
         | 
| 228 | 
            +
                        f_hat.add_(h)
         | 
| 229 | 
            +
                        return f_hat, F.interpolate(f_hat, size=(self.v_patch_nums[si + 1], self.v_patch_nums[si + 1]),
         | 
| 230 | 
            +
                                                    mode='bilinear')
         | 
| 231 | 
            +
                    else:
         | 
| 232 | 
            +
                        h = self.quant_resi[si / (SN - 1)](h_BChw)
         | 
| 233 | 
            +
                        f_hat.add_(h)
         | 
| 234 | 
            +
                        return f_hat, f_hat
         | 
| 235 | 
            +
             | 
| 236 | 
            +
             | 
| 237 | 
            +
            class Phi(nn.Conv2d):
         | 
| 238 | 
            +
                def __init__(self, embed_dim, quant_resi):
         | 
| 239 | 
            +
                    ks = 3
         | 
| 240 | 
            +
                    super().__init__(in_channels=embed_dim, out_channels=embed_dim, kernel_size=ks, stride=1, padding=ks // 2)
         | 
| 241 | 
            +
                    self.resi_ratio = abs(quant_resi)
         | 
| 242 | 
            +
             | 
| 243 | 
            +
                def forward(self, h_BChw):
         | 
| 244 | 
            +
                    return h_BChw.mul(1 - self.resi_ratio) + super().forward(h_BChw).mul_(self.resi_ratio)
         | 
| 245 | 
            +
             | 
| 246 | 
            +
             | 
| 247 | 
            +
            class PhiShared(nn.Module):
         | 
| 248 | 
            +
                def __init__(self, qresi: Phi):
         | 
| 249 | 
            +
                    super().__init__()
         | 
| 250 | 
            +
                    self.qresi: Phi = qresi
         | 
| 251 | 
            +
             | 
| 252 | 
            +
                def __getitem__(self, _) -> Phi:
         | 
| 253 | 
            +
                    return self.qresi
         | 
| 254 | 
            +
             | 
| 255 | 
            +
             | 
| 256 | 
            +
            class PhiPartiallyShared(nn.Module):
         | 
| 257 | 
            +
                def __init__(self, qresi_ls: nn.ModuleList):
         | 
| 258 | 
            +
                    super().__init__()
         | 
| 259 | 
            +
                    self.qresi_ls = qresi_ls
         | 
| 260 | 
            +
                    K = len(qresi_ls)
         | 
| 261 | 
            +
                    self.ticks = np.linspace(1 / 3 / K, 1 - 1 / 3 / K, K) if K == 4 else np.linspace(1 / 2 / K, 1 - 1 / 2 / K, K)
         | 
| 262 | 
            +
             | 
| 263 | 
            +
                def __getitem__(self, at_from_0_to_1: float) -> Phi:
         | 
| 264 | 
            +
                    return self.qresi_ls[np.argmin(np.abs(self.ticks - at_from_0_to_1)).item()]
         | 
| 265 | 
            +
             | 
| 266 | 
            +
                def extra_repr(self) -> str:
         | 
| 267 | 
            +
                    return f'ticks={self.ticks}'
         | 
| 268 | 
            +
             | 
| 269 | 
            +
             | 
| 270 | 
            +
            class PhiNonShared(nn.ModuleList):
         | 
| 271 | 
            +
                def __init__(self, qresi: List):
         | 
| 272 | 
            +
                    super().__init__(qresi)
         | 
| 273 | 
            +
                    # self.qresi = qresi
         | 
| 274 | 
            +
                    K = len(qresi)
         | 
| 275 | 
            +
                    self.ticks = np.linspace(1 / 3 / K, 1 - 1 / 3 / K, K) if K == 4 else np.linspace(1 / 2 / K, 1 - 1 / 2 / K, K)
         | 
| 276 | 
            +
             | 
| 277 | 
            +
                def __getitem__(self, at_from_0_to_1: float) -> Phi:
         | 
| 278 | 
            +
                    return super().__getitem__(np.argmin(np.abs(self.ticks - at_from_0_to_1)).item())
         | 
| 279 | 
            +
             | 
| 280 | 
            +
                def extra_repr(self) -> str:
         | 
| 281 | 
            +
                    return f'ticks={self.ticks}'
         | 
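The fractional index passed to `self.quant_resi[si / (SN - 1)]` above is resolved by `PhiPartiallyShared.__getitem__` (or the shared / non-shared variants), which snaps the scale ratio to the nearest tick. A minimal sketch of that lookup, using made-up sizes (embed_dim=32, K=4 shared convs, SN=10 scales) rather than the checkpoint's actual configuration:

# Hypothetical sizes for illustration only; the real model builds these inside VectorQuantizer2.
embed_dim, K, SN = 32, 4, 10
phis = PhiPartiallyShared(nn.ModuleList([Phi(embed_dim, 0.5) for _ in range(K)]))
for si in range(SN):
    ratio = si / (SN - 1)   # scale index mapped into [0, 1]
    phi = phis[ratio]       # nearest-tick lookup via np.argmin over self.ticks
    # neighbouring scales resolve to the same shared Phi conv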
    	
models/var.py
ADDED
@@ -0,0 +1,360 @@
import math
from functools import partial
from typing import Optional, Tuple, Union

import torch
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin

import dist
from models.basic_var import AdaLNBeforeHead, AdaLNSelfAttn
from models.helpers import gumbel_softmax_with_rng, sample_with_top_k_top_p_
from models.vqvae import VQVAE, VectorQuantizer2


class SharedAdaLin(nn.Linear):
    def forward(self, cond_BD):
        C = self.weight.shape[0] // 6
        return super().forward(cond_BD).view(-1, 1, 6, C)  # B16C


class VAR(nn.Module):
    def __init__(
            self, vae_local: VQVAE,
            num_classes=1000, depth=16, embed_dim=1024, num_heads=16, mlp_ratio=4., drop_rate=0., attn_drop_rate=0.,
            drop_path_rate=0.,
            norm_eps=1e-6, shared_aln=False, cond_drop_rate=0.1,
            attn_l2_norm=False,
            patch_nums=(1, 2, 3, 4, 5, 6, 8, 10, 13, 16),  # 10 steps by default
            flash_if_available=True, fused_if_available=True,
    ):
        super().__init__()
        # 0. hyperparameters
        assert embed_dim % num_heads == 0
        self.Cvae, self.V = vae_local.Cvae, vae_local.vocab_size
        self.depth, self.C, self.D, self.num_heads = depth, embed_dim, embed_dim, num_heads

        self.cond_drop_rate = cond_drop_rate
        self.prog_si = -1  # progressive training

        self.patch_nums: Tuple[int] = patch_nums
        self.L = sum(pn ** 2 for pn in self.patch_nums)
        self.first_l = self.patch_nums[0] ** 2
        self.begin_ends = []
        cur = 0
        for i, pn in enumerate(self.patch_nums):
            self.begin_ends.append((cur, cur + pn ** 2))
            cur += pn ** 2

        self.num_stages_minus_1 = len(self.patch_nums) - 1
        self.rng = torch.Generator(device="mps")

        # 1. input (word) embedding
        quant: VectorQuantizer2 = vae_local.quantize
        self.vae_proxy: Tuple[VQVAE] = (vae_local,)
        self.vae_quant_proxy: Tuple[VectorQuantizer2] = (quant,)
        self.word_embed = nn.Linear(self.Cvae, self.C)

        # 2. class embedding
        init_std = math.sqrt(1 / self.C / 3)
        self.num_classes = num_classes
        self.uniform_prob = torch.full((1, num_classes), fill_value=1.0 / num_classes, dtype=torch.float32,
                                       device=dist.get_device())
        self.class_emb = nn.Embedding(self.num_classes + 1, self.C)
        nn.init.trunc_normal_(self.class_emb.weight.data, mean=0, std=init_std)
        self.pos_start = nn.Parameter(torch.empty(1, self.first_l, self.C))
        nn.init.trunc_normal_(self.pos_start.data, mean=0, std=init_std)

        # 3. absolute position embedding
        pos_1LC = []
        for i, pn in enumerate(self.patch_nums):
            pe = torch.empty(1, pn * pn, self.C)
            nn.init.trunc_normal_(pe, mean=0, std=init_std)
            pos_1LC.append(pe)
        pos_1LC = torch.cat(pos_1LC, dim=1)  # 1, L, C
        assert tuple(pos_1LC.shape) == (1, self.L, self.C)
        self.pos_1LC = nn.Parameter(pos_1LC)
        # level embedding (similar to GPT's segment embedding, used to distinguish different levels of token pyramid)
        self.lvl_embed = nn.Embedding(len(self.patch_nums), self.C)
        nn.init.trunc_normal_(self.lvl_embed.weight.data, mean=0, std=init_std)

        # 4. backbone blocks
        self.shared_ada_lin = nn.Sequential(nn.SiLU(inplace=False),
                                            SharedAdaLin(self.D, 6 * self.C)) if shared_aln else nn.Identity()

        norm_layer = partial(nn.LayerNorm, eps=norm_eps)
        self.drop_path_rate = drop_path_rate
        dpr = [x.item() for x in
               torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule (linearly increasing)
        self.blocks = nn.ModuleList([
            AdaLNSelfAttn(
                cond_dim=self.D, shared_aln=shared_aln,
                block_idx=block_idx, embed_dim=self.C, norm_layer=norm_layer, num_heads=num_heads, mlp_ratio=mlp_ratio,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[block_idx],
                last_drop_p=0 if block_idx == 0 else dpr[block_idx - 1],
                attn_l2_norm=attn_l2_norm,
                flash_if_available=flash_if_available, fused_if_available=fused_if_available,
            )
            for block_idx in range(depth)
        ])

        fused_add_norm_fns = [b.fused_add_norm_fn is not None for b in self.blocks]
        self.using_fused_add_norm_fn = any(fused_add_norm_fns)
        print(
            f'\n[constructor]  ==== flash_if_available={flash_if_available} ({sum(b.attn.using_flash for b in self.blocks)}/{self.depth}), fused_if_available={fused_if_available} (fusing_add_ln={sum(fused_add_norm_fns)}/{self.depth}, fusing_mlp={sum(b.ffn.fused_mlp_func is not None for b in self.blocks)}/{self.depth}) ==== \n'
            f'    [VAR config ] embed_dim={embed_dim}, num_heads={num_heads}, depth={depth}, mlp_ratio={mlp_ratio}\n'
            f'    [drop ratios ] drop_rate={drop_rate}, attn_drop_rate={attn_drop_rate}, drop_path_rate={drop_path_rate:g} ({torch.linspace(0, drop_path_rate, depth)})',
            end='\n\n', flush=True
        )

        # 5. attention mask used in training (for masking out the future)
        #    it won't be used in inference, since kv cache is enabled
        d: torch.Tensor = torch.cat([torch.full((pn * pn,), i) for i, pn in enumerate(self.patch_nums)]).view(1, self.L, 1)
        dT = d.transpose(1, 2)  # dT: 11L
        lvl_1L = dT[:, 0].contiguous()
        self.register_buffer('lvl_1L', lvl_1L)
        attn_bias_for_masking = torch.where(d >= dT, 0., -torch.inf).reshape(1, 1, self.L, self.L)
        self.register_buffer('attn_bias_for_masking', attn_bias_for_masking.contiguous())

        # 6. classifier head
        self.head_nm = AdaLNBeforeHead(self.C, self.D, norm_layer=norm_layer)
        self.head = nn.Linear(self.C, self.V)

    def get_logits(self, h_or_h_and_residual: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
                   cond_BD: Optional[torch.Tensor]):
        if not isinstance(h_or_h_and_residual, torch.Tensor):
            h, resi = h_or_h_and_residual  # fused_add_norm must be used
            h = resi + self.blocks[-1].drop_path(h)
        else:  # fused_add_norm is not used
            h = h_or_h_and_residual
        return self.head(self.head_nm(h.float(), cond_BD).float()).float()

    @torch.no_grad()
    def autoregressive_infer_cfg(
            self, B: int, label_B: Optional[Union[int, torch.LongTensor]],
            delta_condition: torch.Tensor, alpha: float, beta: float,
            g_seed: Optional[int] = None, cfg=1.5, top_k=0, top_p=0.0,
            more_smooth=False,
    ) -> torch.Tensor:  # returns reconstructed image (B, 3, H, W) in [0, 1]
        """
        Generate images using autoregressive inference with classifier-free guidance.
        :param B: batch size
        :param label_B: class labels; if None, randomly sampled
        :param delta_condition: tensor of shape (B, D)
        :param alpha: scalar weight for class embedding
        :param beta: scalar weight for delta_condition
        :param g_seed: random seed
        :param cfg: classifier-free guidance ratio
        :param top_k: top-k sampling
        :param top_p: top-p sampling
        :param more_smooth: smoothing the pred using gumbel softmax; only used in visualization, not used in FID/IS benchmarking
        :return: reconstructed images (B, 3, H, W)
        """
        if g_seed is None:
            rng = None
        else:
            self.rng.manual_seed(g_seed)
            rng = self.rng

        device = self.lvl_1L.device
        if label_B is None:
            label_B = torch.multinomial(self.uniform_prob, num_samples=B, replacement=True, generator=rng).reshape(B)
        elif isinstance(label_B, int):
            label_B = torch.full((B,), fill_value=self.num_classes if label_B < 0 else label_B, device=device)

        # Prepare labels for conditioned and unconditioned versions
        label_B_cond = label_B
        label_B_uncond = torch.full_like(label_B, fill_value=self.num_classes)
        label_B = torch.cat((label_B_cond, label_B_uncond), dim=0)  # shape (2B,)

        # Prepare delta_condition for conditioned and unconditioned versions
        delta_condition_uncond = torch.zeros_like(delta_condition)
        delta_condition = torch.cat((delta_condition, delta_condition_uncond), dim=0)  # shape (2B, D)

        class_emb = self.class_emb(label_B)  # shape (2B, D)
        cond_BD = alpha * class_emb + beta * delta_condition  # shape (2B, D)

        sos = cond_BD.unsqueeze(1).expand(2 * B, self.first_l, -1) + self.pos_start.expand(2 * B, self.first_l, -1)

        lvl_pos = self.lvl_embed(self.lvl_1L) + self.pos_1LC
        next_token_map = sos + lvl_pos[:, :self.first_l]

        cur_L = 0
        f_hat = sos.new_zeros(B, self.Cvae, self.patch_nums[-1], self.patch_nums[-1])

        for b in self.blocks:
            b.attn.kv_caching(True)
        for si, pn in enumerate(self.patch_nums):  # si: i-th segment
            ratio = si / self.num_stages_minus_1
            cur_L += pn * pn
            cond_BD_or_gss = self.shared_ada_lin(cond_BD)
            x = next_token_map

            for b in self.blocks:
                x = b(x=x, cond_BD=cond_BD_or_gss, attn_bias=None)
            logits_BlV = self.get_logits(x, cond_BD)

            t = cfg * ratio
            logits_BlV = (1 + t) * logits_BlV[:B] - t * logits_BlV[B:]

            idx_Bl = sample_with_top_k_top_p_(logits_BlV, rng=rng, top_k=top_k, top_p=top_p, num_samples=1)[:, :, 0]
            if not more_smooth:  # this is the default case
                h_BChw = self.vae_quant_proxy[0].embedding(idx_Bl)  # B, l, Cvae
            else:  # not used when evaluating FID/IS/Precision/Recall
                gum_t = max(0.27 * (1 - ratio * 0.95), 0.005)  # refer to mask-git
                h_BChw = gumbel_softmax_with_rng(logits_BlV.mul(1 + ratio), tau=gum_t, hard=False, dim=-1, rng=rng) @ \
                         self.vae_quant_proxy[0].embedding.weight.unsqueeze(0)

            h_BChw = h_BChw.transpose_(1, 2).reshape(B, self.Cvae, pn, pn)
            f_hat, next_token_map = self.vae_quant_proxy[0].get_next_autoregressive_input(si, len(self.patch_nums),
                                                                                          f_hat, h_BChw)
            if si != self.num_stages_minus_1:  # prepare for next stage
                next_token_map = next_token_map.view(B, self.Cvae, -1).transpose(1, 2)
                next_token_map = self.word_embed(next_token_map) + lvl_pos[:, cur_L:cur_L + self.patch_nums[si + 1] ** 2]
                next_token_map = next_token_map.repeat(2, 1, 1)  # double the batch sizes due to CFG

        for b in self.blocks:
            b.attn.kv_caching(False)
        return self.vae_proxy[0].fhat_to_img(f_hat).add_(1).mul_(0.5)  # de-normalize, from [-1, 1] to [0, 1]

    def forward(self, label_B: torch.LongTensor, x_BLCv_wo_first_l: torch.Tensor, delta_condition: torch.Tensor,
                alpha: float, beta: float) -> torch.Tensor:
        """
        :param label_B: class labels of shape (B,)
        :param x_BLCv_wo_first_l: teacher forcing input (B, self.L-self.first_l, self.Cvae)
        :param delta_condition: tensor of shape (B, D)
        :param alpha: scalar weight for class embedding
        :param beta: scalar weight for delta_condition
        :return: logits BLV, V is vocab_size
        """
        bg, ed = self.begin_ends[self.prog_si] if self.prog_si >= 0 else (0, self.L)
        B = x_BLCv_wo_first_l.shape[0]
        with torch.cuda.amp.autocast(enabled=False):
            # Implement conditional dropout
            drop_mask = torch.rand(B, device=label_B.device) < self.cond_drop_rate
            label_B_dropped = torch.where(drop_mask, self.num_classes, label_B)
            delta_condition_dropped = delta_condition.clone()
            delta_condition_dropped[drop_mask] = 0.0  # Drop delta_condition

            class_emb = self.class_emb(label_B_dropped)
            cond_BD = alpha * class_emb + beta * delta_condition_dropped

            sos = cond_BD.unsqueeze(1).expand(B, self.first_l, -1) + self.pos_start.expand(B, self.first_l, -1)

            if self.prog_si == 0:
                x_BLC = sos
            else:
                x_BLC = torch.cat((sos, self.word_embed(x_BLCv_wo_first_l.float())), dim=1)
            x_BLC += self.lvl_embed(self.lvl_1L[:, :ed].expand(B, -1)) + self.pos_1LC[:, :ed]  # lvl: BLC;  pos: 1LC

        attn_bias = self.attn_bias_for_masking[:, :, :ed, :ed]
        cond_BD_or_gss = self.shared_ada_lin(cond_BD)

        # hack: get the dtype if mixed precision is used
        temp = x_BLC.new_ones(8, 8)
        main_type = torch.matmul(temp, temp).dtype

        x_BLC = x_BLC.to(dtype=main_type)
        cond_BD_or_gss = cond_BD_or_gss.to(dtype=main_type)
        attn_bias = attn_bias.to(dtype=main_type)

        AdaLNSelfAttn.forward
        for i, b in enumerate(self.blocks):
            x_BLC = b(x=x_BLC, cond_BD=cond_BD_or_gss, attn_bias=attn_bias)
        x_BLC = self.get_logits(x_BLC.float(), cond_BD)

        if self.prog_si == 0:
            if isinstance(self.word_embed, nn.Linear):
                x_BLC[0, 0, 0] += self.word_embed.weight[0, 0] * 0 + self.word_embed.bias[0] * 0
            else:
                s = 0
                for p in self.word_embed.parameters():
                    if p.requires_grad:
                        s += p.view(-1)[0] * 0
                x_BLC[0, 0, 0] += s
        return x_BLC  # logits BLV, V is vocab_size

    def init_weights(self, init_adaln=0.5, init_adaln_gamma=1e-5, init_head=0.02, init_std=0.02, conv_std_or_gain=0.02):
        if init_std < 0: init_std = (1 / self.C / 3) ** 0.5  # init_std < 0: automated

        print(f'[init_weights] {type(self).__name__} with {init_std=:g}')
        for m in self.modules():
            with_weight = hasattr(m, 'weight') and m.weight is not None
            with_bias = hasattr(m, 'bias') and m.bias is not None
            if isinstance(m, nn.Linear):
                nn.init.trunc_normal_(m.weight.data, std=init_std)
                if with_bias: m.bias.data.zero_()
            elif isinstance(m, nn.Embedding):
                nn.init.trunc_normal_(m.weight.data, std=init_std)
                if m.padding_idx is not None: m.weight.data[m.padding_idx].zero_()
            elif isinstance(m, (
                    nn.LayerNorm, nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm, nn.GroupNorm,
                    nn.InstanceNorm1d, nn.InstanceNorm2d, nn.InstanceNorm3d)):
                if with_weight: m.weight.data.fill_(1.)
                if with_bias: m.bias.data.zero_()
            # conv: VAR has no conv, only VQVAE has conv
            elif isinstance(m, (
                    nn.Conv1d, nn.Conv2d, nn.Conv3d, nn.ConvTranspose1d, nn.ConvTranspose2d, nn.ConvTranspose3d)):
                if conv_std_or_gain > 0:
                    nn.init.trunc_normal_(m.weight.data, std=conv_std_or_gain)
                else:
                    nn.init.xavier_normal_(m.weight.data, gain=-conv_std_or_gain)
                if with_bias: m.bias.data.zero_()

        if init_head >= 0:
            if isinstance(self.head, nn.Linear):
                self.head.weight.data.mul_(init_head)
                self.head.bias.data.zero_()
            elif isinstance(self.head, nn.Sequential):
                self.head[-1].weight.data.mul_(init_head)
                self.head[-1].bias.data.zero_()

        if isinstance(self.head_nm, AdaLNBeforeHead):
            self.head_nm.ada_lin[-1].weight.data.mul_(init_adaln)
            if hasattr(self.head_nm.ada_lin[-1], 'bias') and self.head_nm.ada_lin[-1].bias is not None:
                self.head_nm.ada_lin[-1].bias.data.zero_()

        depth = len(self.blocks)
        for block_idx, sab in enumerate(self.blocks):
            sab: AdaLNSelfAttn
            sab.attn.proj.weight.data.div_(math.sqrt(2 * depth))
            sab.ffn.fc2.weight.data.div_(math.sqrt(2 * depth))
            if hasattr(sab.ffn, 'fcg') and sab.ffn.fcg is not None:
                nn.init.ones_(sab.ffn.fcg.bias)
                nn.init.trunc_normal_(sab.ffn.fcg.weight, std=1e-5)
            if hasattr(sab, 'ada_lin'):
                sab.ada_lin[-1].weight.data[2 * self.C:].mul_(init_adaln)
                sab.ada_lin[-1].weight.data[:2 * self.C].mul_(init_adaln_gamma)
                if hasattr(sab.ada_lin[-1], 'bias') and sab.ada_lin[-1].bias is not None:
                    sab.ada_lin[-1].bias.data.zero_()
            elif hasattr(sab, 'ada_gss'):
                sab.ada_gss.data[:, :, 2:].mul_(init_adaln)
                sab.ada_gss.data[:, :, :2].mul_(init_adaln_gamma)

    def extra_repr(self):
        return f'drop_path_rate={self.drop_path_rate:g}'


class VARHF(VAR, PyTorchModelHubMixin):
    def __init__(
            self,
            vae_kwargs,
            num_classes=1000, depth=16, embed_dim=1024, num_heads=16, mlp_ratio=4., drop_rate=0., attn_drop_rate=0.,
            drop_path_rate=0.,
            norm_eps=1e-6, shared_aln=False, cond_drop_rate=0.1,
            attn_l2_norm=False,
            patch_nums=(1, 2, 3, 4, 5, 6, 8, 10, 13, 16),  # 10 steps by default
            flash_if_available=True, fused_if_available=True,
    ):
        vae_local = VQVAE(**vae_kwargs)
        super().__init__(
            vae_local=vae_local,
            num_classes=num_classes, depth=depth, embed_dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio,
            drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_rate=drop_path_rate,
            norm_eps=norm_eps, shared_aln=shared_aln, cond_drop_rate=cond_drop_rate,
            attn_l2_norm=attn_l2_norm,
            patch_nums=patch_nums,
            flash_if_available=flash_if_available, fused_if_available=fused_if_available,
        )
    	
models/vqvae.py
ADDED
@@ -0,0 +1,95 @@
"""
References:
- VectorQuantizer2: https://github.com/CompVis/taming-transformers/blob/3ba01b241669f5ade541ce990f7650a3b8f65318/taming/modules/vqvae/quantize.py#L110
- GumbelQuantize: https://github.com/CompVis/taming-transformers/blob/3ba01b241669f5ade541ce990f7650a3b8f65318/taming/modules/vqvae/quantize.py#L213
- VQVAE (VQModel): https://github.com/CompVis/stable-diffusion/blob/21f890f9da3cfbeaba8e2ac3c425ee9e998d5229/ldm/models/autoencoder.py#L14
"""
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

import torch
import torch.nn as nn

from .basic_vae import Decoder, Encoder
from .quant import VectorQuantizer2


class VQVAE(nn.Module):
    def __init__(
        self, vocab_size=4096, z_channels=32, ch=128, dropout=0.0,
        beta=0.25,              # commitment loss weight
        using_znorm=False,      # whether to normalize when computing the nearest neighbors
        quant_conv_ks=3,        # quant conv kernel size
        quant_resi=0.5,         # 0.5 means \phi(x) = 0.5conv(x) + (1-0.5)x
        share_quant_resi=4,     # use 4 \phi layers for K scales: partially-shared \phi
        default_qresi_counts=0, # if is 0: automatically set to len(v_patch_nums)
        v_patch_nums=(1, 2, 3, 4, 5, 6, 8, 10, 13, 16), # number of patches for each scale, h_{1 to K} = w_{1 to K} = v_patch_nums[k]
        test_mode=True,
    ):
        super().__init__()
        self.test_mode = test_mode
        self.V, self.Cvae = vocab_size, z_channels
        # ddconfig is copied from https://github.com/CompVis/latent-diffusion/blob/e66308c7f2e64cb581c6d27ab6fbeb846828253b/models/first_stage_models/vq-f16/config.yaml
        ddconfig = dict(
            dropout=dropout, ch=ch, z_channels=z_channels,
            in_channels=3, ch_mult=(1, 1, 2, 2, 4), num_res_blocks=2,   # from vq-f16/config.yaml above
            using_sa=True, using_mid_sa=True,                           # from vq-f16/config.yaml above
            # resamp_with_conv=True,   # always True, removed.
        )
        ddconfig.pop('double_z', None)  # only KL-VAE should use double_z=True
        self.encoder = Encoder(double_z=False, **ddconfig)
        self.decoder = Decoder(**ddconfig)

        self.vocab_size = vocab_size
        self.downsample = 2 ** (len(ddconfig['ch_mult']) - 1)
        self.quantize: VectorQuantizer2 = VectorQuantizer2(
            vocab_size=vocab_size, Cvae=self.Cvae, using_znorm=using_znorm, beta=beta,
            default_qresi_counts=default_qresi_counts, v_patch_nums=v_patch_nums, quant_resi=quant_resi, share_quant_resi=share_quant_resi,
        )
        self.quant_conv = torch.nn.Conv2d(self.Cvae, self.Cvae, quant_conv_ks, stride=1, padding=quant_conv_ks // 2)
        self.post_quant_conv = torch.nn.Conv2d(self.Cvae, self.Cvae, quant_conv_ks, stride=1, padding=quant_conv_ks // 2)

        if self.test_mode:
            self.eval()
            [p.requires_grad_(False) for p in self.parameters()]

    # ===================== `forward` is only used in VAE training =====================
    def forward(self, inp, ret_usages=False):   # -> rec_B3HW, idx_N, loss
        VectorQuantizer2.forward
        f_hat, usages, vq_loss = self.quantize(self.quant_conv(self.encoder(inp)), ret_usages=ret_usages)
        return self.decoder(self.post_quant_conv(f_hat)), usages, vq_loss
    # ===================== `forward` is only used in VAE training =====================

    def fhat_to_img(self, f_hat: torch.Tensor):
        return self.decoder(self.post_quant_conv(f_hat)).clamp_(-1, 1)

    def img_to_idxBl(self, inp_img_no_grad: torch.Tensor, v_patch_nums: Optional[Sequence[Union[int, Tuple[int, int]]]] = None) -> List[torch.LongTensor]:    # return List[Bl]
        f = self.quant_conv(self.encoder(inp_img_no_grad))
        return self.quantize.f_to_idxBl_or_fhat(f, to_fhat=False, v_patch_nums=v_patch_nums)

    def idxBl_to_img(self, ms_idx_Bl: List[torch.Tensor], same_shape: bool, last_one=False) -> Union[List[torch.Tensor], torch.Tensor]:
        B = ms_idx_Bl[0].shape[0]
        ms_h_BChw = []
        for idx_Bl in ms_idx_Bl:
            l = idx_Bl.shape[1]
            pn = round(l ** 0.5)
            ms_h_BChw.append(self.quantize.embedding(idx_Bl).transpose(1, 2).view(B, self.Cvae, pn, pn))
        return self.embed_to_img(ms_h_BChw=ms_h_BChw, all_to_max_scale=same_shape, last_one=last_one)

    def embed_to_img(self, ms_h_BChw: List[torch.Tensor], all_to_max_scale: bool, last_one=False) -> Union[List[torch.Tensor], torch.Tensor]:
        if last_one:
            return self.decoder(self.post_quant_conv(self.quantize.embed_to_fhat(ms_h_BChw, all_to_max_scale=all_to_max_scale, last_one=True))).clamp_(-1, 1)
        else:
            return [self.decoder(self.post_quant_conv(f_hat)).clamp_(-1, 1) for f_hat in self.quantize.embed_to_fhat(ms_h_BChw, all_to_max_scale=all_to_max_scale, last_one=False)]

    def img_to_reconstructed_img(self, x, v_patch_nums: Optional[Sequence[Union[int, Tuple[int, int]]]] = None, last_one=False) -> List[torch.Tensor]:
        f = self.quant_conv(self.encoder(x))
        ls_f_hat_BChw = self.quantize.f_to_idxBl_or_fhat(f, to_fhat=True, v_patch_nums=v_patch_nums)
        if last_one:
            return self.decoder(self.post_quant_conv(ls_f_hat_BChw[-1])).clamp_(-1, 1)
        else:
            return [self.decoder(self.post_quant_conv(f_hat)).clamp_(-1, 1) for f_hat in ls_f_hat_BChw]

    def load_state_dict(self, state_dict: Dict[str, Any], strict=True, assign=False):
        if 'quantize.ema_vocab_hit_SV' in state_dict and state_dict['quantize.ema_vocab_hit_SV'].shape[0] != self.quantize.ema_vocab_hit_SV.shape[0]:
            state_dict['quantize.ema_vocab_hit_SV'] = self.quantize.ema_vocab_hit_SV
        return super().load_state_dict(state_dict=state_dict, strict=strict, assign=assign)
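A minimal round-trip sketch for the wrapper above, assuming a VQVAE instance `vqvae` with loaded weights and the default patch schedule (downsample factor 16, so 256x256 inputs); `img` is a hypothetical batch normalized to [-1, 1]:

import torch

img = torch.randn(1, 3, 256, 256).clamp_(-1, 1)   # hypothetical input batch in [-1, 1]
with torch.no_grad():
    idx_Bl = vqvae.img_to_idxBl(img)               # one (B, pn*pn) LongTensor of code ids per scale
    recon = vqvae.idxBl_to_img(idx_Bl, same_shape=True, last_one=True)   # (B, 3, 256, 256) in [-1, 1]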
    	
utils/amp_sc.py
ADDED
@@ -0,0 +1,89 @@
import math
from typing import List, Optional, Tuple, Union

import torch


class NullCtx:
    def __enter__(self):
        pass

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass


class AmpOptimizer:
    def __init__(
        self,
        mixed_precision: int,
        optimizer: torch.optim.Optimizer, names: List[str], paras: List[torch.nn.Parameter],
        grad_clip: float, n_gradient_accumulation: int = 1,
    ):
        self.enable_amp = mixed_precision > 0
        self.using_fp16_rather_bf16 = mixed_precision == 1

        if self.enable_amp:
            self.amp_ctx = torch.autocast('cuda', enabled=True, dtype=torch.float16 if self.using_fp16_rather_bf16 else torch.bfloat16, cache_enabled=True)
            self.scaler = torch.cuda.amp.GradScaler(init_scale=2. ** 11, growth_interval=1000) if self.using_fp16_rather_bf16 else None  # only fp16 needs a scaler
        else:
            self.amp_ctx = NullCtx()
            self.scaler = None

        self.optimizer, self.names, self.paras = optimizer, names, paras   # paras have been filtered, so every one of them requires grad
        self.grad_clip = grad_clip
        self.early_clipping = self.grad_clip > 0 and not hasattr(optimizer, 'global_grad_norm')
        self.late_clipping = self.grad_clip > 0 and hasattr(optimizer, 'global_grad_norm')

        self.r_accu = 1 / n_gradient_accumulation   # r_accu == 1.0 / n_gradient_accumulation

    def backward_clip_step(
        self, stepping: bool, loss: torch.Tensor,
    ) -> Tuple[Optional[Union[torch.Tensor, float]], Optional[float]]:
        # backward
        loss = loss.mul(self.r_accu)   # r_accu == 1.0 / n_gradient_accumulation
        orig_norm = scaler_sc = None
        if self.scaler is not None:
            self.scaler.scale(loss).backward(retain_graph=False, create_graph=False)
        else:
            loss.backward(retain_graph=False, create_graph=False)

        if stepping:
            if self.scaler is not None: self.scaler.unscale_(self.optimizer)
            if self.early_clipping:
                orig_norm = torch.nn.utils.clip_grad_norm_(self.paras, self.grad_clip)

            if self.scaler is not None:
                self.scaler.step(self.optimizer)
                scaler_sc: float = self.scaler.get_scale()
                if scaler_sc > 32768.:  # fp16 overflows above 65536, so letting the scale grow past 32768 could be dangerous
                    self.scaler.update(new_scale=32768.)
                else:
                    self.scaler.update()
                try:
                    scaler_sc = float(math.log2(scaler_sc))
                except Exception as e:
                    print(f'[scaler_sc = {scaler_sc}]\n' * 15, flush=True)
                    raise e
            else:
                self.optimizer.step()

            if self.late_clipping:
                orig_norm = self.optimizer.global_grad_norm

            self.optimizer.zero_grad(set_to_none=True)

        return orig_norm, scaler_sc

    def state_dict(self):
        return {
            'optimizer': self.optimizer.state_dict()
        } if self.scaler is None else {
            'scaler': self.scaler.state_dict(),
            'optimizer': self.optimizer.state_dict()
        }

    def load_state_dict(self, state, strict=True):
        if self.scaler is not None:
            try: self.scaler.load_state_dict(state['scaler'])
            except Exception as e: print(f'[fp16 load_state_dict err] {e}')
        self.optimizer.load_state_dict(state['optimizer'])
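A minimal usage sketch of the AmpOptimizer above (not part of the commit; it assumes the module lands at utils/amp_sc.py, uses a toy model with plain AdamW, and sets mixed_precision=0 so it also runs without a GPU):

import torch
from utils.amp_sc import AmpOptimizer  # assumed module path

model = torch.nn.Linear(8, 8)
names, paras = zip(*[(n, p) for n, p in model.named_parameters() if p.requires_grad])
base_opt = torch.optim.AdamW(list(paras), lr=1e-4)
amp_opt = AmpOptimizer(mixed_precision=0, optimizer=base_opt, names=list(names), paras=list(paras),
                       grad_clip=2.0, n_gradient_accumulation=1)

x = torch.randn(4, 8)
with amp_opt.amp_ctx:                    # NullCtx here; an autocast context when mixed_precision > 0
    loss = model(x).pow(2).mean()
grad_norm, scale_log2 = amp_opt.backward_clip_step(stepping=True, loss=loss)  # backward + clip + step + zero_grad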
    	
        utils/arg_util.py
    ADDED
    
    | @@ -0,0 +1,284 @@ | |
import json
import os
import random
import re
import subprocess
import sys
import time
from collections import OrderedDict
from typing import Optional, Union

import numpy as np
import torch

try:
    from tap import Tap
except ImportError as e:
    print(f'`>>>>>>>> from tap import Tap` failed, please run:      pip3 install typed-argument-parser     <<<<<<<<', file=sys.stderr, flush=True)
    print(f'`>>>>>>>> from tap import Tap` failed, please run:      pip3 install typed-argument-parser     <<<<<<<<', file=sys.stderr, flush=True)
    time.sleep(5)
    raise e

import dist


class Args(Tap):
    data_path: str = '/path/to/imagenet'
    exp_name: str = 'text'
    
    # VAE
    vfast: int = 0      # torch.compile VAE; 0: do not compile; 1: compile with 'reduce-overhead'; 2: compile with 'max-autotune'
    # VAR
    tfast: int = 0      # torch.compile VAR; 0: do not compile; 1: compile with 'reduce-overhead'; 2: compile with 'max-autotune'
    depth: int = 16     # VAR depth
    # VAR initialization
    ini: float = -1     # -1: automated model parameter initialization
    hd: float = 0.02    # head.w *= hd
    aln: float = 0.5    # the multiplier of ada_lin.w's initialization
    alng: float = 1e-5  # the multiplier of ada_lin.w[gamma channels]'s initialization
    # VAR optimization
    fp16: int = 0           # 1: use fp16, 2: bf16
    tblr: float = 1e-4      # base lr
    tlr: float = None       # lr = base lr * (bs / 256)
    twd: float = 0.05       # initial wd
    twde: float = 0         # final wd, =twde or twd
    tclip: float = 2.       # <=0 for no grad clipping
    ls: float = 0.0         # label smoothing
    
    bs: int = 768           # global batch size
    batch_size: int = 0     # [automatically set; don't specify this] batch size per GPU = round(args.bs / args.ac / dist.get_world_size() / 8) * 8
    glb_batch_size: int = 0 # [automatically set; don't specify this] global batch size = args.batch_size * dist.get_world_size()
    ac: int = 1             # gradient accumulation
    
    ep: int = 250
    wp: float = 0
    wp0: float = 0.005      # initial lr ratio at the beginning of lr warm-up
    wpe: float = 0.01       # final lr ratio at the end of training
    sche: str = 'lin0'      # lr schedule
    
    opt: str = 'adamw'      # lion: https://cloud.tencent.com/developer/article/2336657?areaId=106001 lr=5e-5 (0.25x) wd=0.8 (8x); Lion needs a large bs to work
    afuse: bool = True      # fused adamw
    
    # other hps
    saln: bool = False      # whether to use shared adaln
    anorm: bool = True      # whether to use L2-normalized attention
    fuse: bool = True       # whether to use fused ops like flash attn, xformers, fused MLP, fused LayerNorm, etc.
    
    # data
    pn: str = '1_2_3_4_5_6_8_10_13_16'
    patch_size: int = 16
    patch_nums: tuple = None    # [automatically set; don't specify this] = tuple(map(int, args.pn.replace('-', '_').split('_')))
    resos: tuple = None         # [automatically set; don't specify this] = tuple(pn * args.patch_size for pn in args.patch_nums)
    
    data_load_reso: int = None  # [automatically set; don't specify this] would be max(patch_nums) * patch_size
    mid_reso: float = 1.125     # aug: first resize to mid_reso = 1.125 * data_load_reso, then crop to data_load_reso
    hflip: bool = False         # augmentation: horizontal flip
    workers: int = 0        # num workers; 0: auto, -1: don't use multiprocessing in DataLoader
    
    # progressive training
    pg: float = 0.0         # >0 to use progressive training during [0%, this] of training
    pg0: int = 4            # progressive initial stage, 0: from the 1st token map, 1: from the 2nd token map, etc.
    pgwp: float = 0         # num of warmup epochs at each progressive stage
    
    # would be automatically set at runtime
    cmd: str = ' '.join(sys.argv[1:])  # [automatically set; don't specify this]
    branch: str = subprocess.check_output(f'git symbolic-ref --short HEAD 2>/dev/null || git rev-parse HEAD', shell=True).decode('utf-8').strip() or '[unknown]' # [automatically set; don't specify this]
    commit_id: str = subprocess.check_output(f'git rev-parse HEAD', shell=True).decode('utf-8').strip() or '[unknown]'  # [automatically set; don't specify this]
    commit_msg: str = (subprocess.check_output(f'git log -1', shell=True).decode('utf-8').strip().splitlines() or ['[unknown]'])[-1].strip()    # [automatically set; don't specify this]
    acc_mean: float = None      # [automatically set; don't specify this]
    acc_tail: float = None      # [automatically set; don't specify this]
    L_mean: float = None        # [automatically set; don't specify this]
    L_tail: float = None        # [automatically set; don't specify this]
    vacc_mean: float = None     # [automatically set; don't specify this]
    vacc_tail: float = None     # [automatically set; don't specify this]
    vL_mean: float = None       # [automatically set; don't specify this]
    vL_tail: float = None       # [automatically set; don't specify this]
    grad_norm: float = None     # [automatically set; don't specify this]
    cur_lr: float = None        # [automatically set; don't specify this]
    cur_wd: float = None        # [automatically set; don't specify this]
    cur_it: str = ''            # [automatically set; don't specify this]
    cur_ep: str = ''            # [automatically set; don't specify this]
    remain_time: str = ''       # [automatically set; don't specify this]
    finish_time: str = ''       # [automatically set; don't specify this]
    
    # environment
    local_out_dir_path: str = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'local_output')  # [automatically set; don't specify this]
    tb_log_dir_path: str = '...tb-...'  # [automatically set; don't specify this]
    log_txt_path: str = '...'           # [automatically set; don't specify this]
    last_ckpt_path: str = '...'         # [automatically set; don't specify this]
    
    tf32: bool = True       # whether to use TensorFloat32
    device: str = 'cpu'     # [automatically set; don't specify this]
    seed: int = None        # seed
    def seed_everything(self, benchmark: bool):
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = benchmark
        if self.seed is None:
            torch.backends.cudnn.deterministic = False
        else:
            torch.backends.cudnn.deterministic = True
            seed = self.seed * dist.get_world_size() + dist.get_rank()
            os.environ['PYTHONHASHSEED'] = str(seed)
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)
            if torch.cuda.is_available():
                torch.cuda.manual_seed(seed)
                torch.cuda.manual_seed_all(seed)
    same_seed_for_all_ranks: int = 0     # this is only for the distributed sampler
    def get_different_generator_for_each_rank(self) -> Optional[torch.Generator]:   # for random augmentation
        if self.seed is None:
            return None
        g = torch.Generator()
        g.manual_seed(self.seed * dist.get_world_size() + dist.get_rank())
        return g
    
    local_debug: bool = 'KEVIN_LOCAL' in os.environ
    dbg_nan: bool = False   # 'KEVIN_LOCAL' in os.environ
    
    def compile_model(self, m, fast):
        if fast == 0 or self.local_debug:
            return m
        return torch.compile(m, mode={
            1: 'reduce-overhead',
            2: 'max-autotune',
            3: 'default',
        }[fast]) if hasattr(torch, 'compile') else m
    
    def state_dict(self, key_ordered=True) -> Union[OrderedDict, dict]:
        d = (OrderedDict if key_ordered else dict)()
        # self.as_dict() would contain methods, but we only need variables
        for k in self.class_variables.keys():
            if k not in {'device'}:     # these are not serializable
                d[k] = getattr(self, k)
        return d
    
    def load_state_dict(self, d: Union[OrderedDict, dict, str]):
        if isinstance(d, str):  # for compatibility with old versions
            d: dict = eval('\n'.join([l for l in d.splitlines() if '<bound' not in l and 'device(' not in l]))
        for k in d.keys():
            try:
                setattr(self, k, d[k])
            except Exception as e:
                print(f'k={k}, v={d[k]}')
                raise e
    
    @staticmethod
    def set_tf32(tf32: bool):
        if torch.cuda.is_available():
            torch.backends.cudnn.allow_tf32 = bool(tf32)
            torch.backends.cuda.matmul.allow_tf32 = bool(tf32)
            if hasattr(torch, 'set_float32_matmul_precision'):
                torch.set_float32_matmul_precision('high' if tf32 else 'highest')
                print(f'[tf32] [precis] torch.get_float32_matmul_precision(): {torch.get_float32_matmul_precision()}')
            print(f'[tf32] [ conv ] torch.backends.cudnn.allow_tf32: {torch.backends.cudnn.allow_tf32}')
            print(f'[tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: {torch.backends.cuda.matmul.allow_tf32}')
    
    def dump_log(self):
        if not dist.is_local_master():
            return
        if '1/' in self.cur_ep: # first time dumping the log
            with open(self.log_txt_path, 'w') as fp:
                json.dump({'is_master': dist.is_master(), 'name': self.exp_name, 'cmd': self.cmd, 'commit': self.commit_id, 'branch': self.branch, 'tb_log_dir_path': self.tb_log_dir_path}, fp, indent=0)
                fp.write('\n')
        
        log_dict = {}
        for k, v in {
            'it': self.cur_it, 'ep': self.cur_ep,
            'lr': self.cur_lr, 'wd': self.cur_wd, 'grad_norm': self.grad_norm,
            'L_mean': self.L_mean, 'L_tail': self.L_tail, 'acc_mean': self.acc_mean, 'acc_tail': self.acc_tail,
            'vL_mean': self.vL_mean, 'vL_tail': self.vL_tail, 'vacc_mean': self.vacc_mean, 'vacc_tail': self.vacc_tail,
            'remain_time': self.remain_time, 'finish_time': self.finish_time,
        }.items():
            if hasattr(v, 'item'): v = v.item()
            log_dict[k] = v
        with open(self.log_txt_path, 'a') as fp:
            fp.write(f'{log_dict}\n')
    
    def __str__(self):
        s = []
        for k in self.class_variables.keys():
            if k not in {'device', 'dbg_ks_fp'}:     # these are not serializable
                s.append(f'  {k:20s}: {getattr(self, k)}')
        s = '\n'.join(s)
        return f'{{\n{s}\n}}\n'


def init_dist_and_get_args():
    for i in range(len(sys.argv)):
        if sys.argv[i].startswith('--local-rank=') or sys.argv[i].startswith('--local_rank='):
            del sys.argv[i]
            break
    args = Args(explicit_bool=True).parse_args(known_only=True)
    if args.local_debug:
        args.pn = '1_2_3'
        args.seed = 1
        args.aln = 1e-2
        args.alng = 1e-5
        args.saln = False
        args.afuse = False
        args.pg = 0.8
        args.pg0 = 1
    else:
        if args.data_path == '/path/to/imagenet':
            raise ValueError(f'{"*"*40}  please specify --data_path=/path/to/imagenet  {"*"*40}')
    
    # warn about unexpected extra args
    if len(args.extra_args) > 0:
        print(f'======================================================================================')
        print(f'=========================== WARNING: UNEXPECTED EXTRA ARGS ===========================\n{args.extra_args}')
        print(f'=========================== WARNING: UNEXPECTED EXTRA ARGS ===========================')
        print(f'======================================================================================\n\n')
    
    # init torch distributed
    from utils import misc
    os.makedirs(args.local_out_dir_path, exist_ok=True)
    misc.init_distributed_mode(local_out_path=args.local_out_dir_path, timeout=30)
    
    # set env
    args.set_tf32(args.tf32)
    args.seed_everything(benchmark=args.pg == 0)
    
    # update args: data loading
    args.device = dist.get_device()
    if args.pn == '256':
        args.pn = '1_2_3_4_5_6_8_10_13_16'
    elif args.pn == '512':
        args.pn = '1_2_3_4_6_9_13_18_24_32'
    elif args.pn == '1024':
        args.pn = '1_2_3_4_5_7_9_12_16_21_27_36_48_64'
    args.patch_nums = tuple(map(int, args.pn.replace('-', '_').split('_')))
    args.resos = tuple(pn * args.patch_size for pn in args.patch_nums)
    args.data_load_reso = max(args.resos)
    
    # update args: bs and lr
    bs_per_gpu = round(args.bs / args.ac / dist.get_world_size())
    args.batch_size = bs_per_gpu
    args.bs = args.glb_batch_size = args.batch_size * dist.get_world_size()
    args.workers = min(max(0, args.workers), args.batch_size)
    
    args.tlr = args.ac * args.tblr * args.glb_batch_size / 256
    args.twde = args.twde or args.twd
    
    if args.wp == 0:
        args.wp = args.ep * 1/50
    
    # update args: progressive training
    if args.pgwp == 0:
        args.pgwp = args.ep * 1/300
    if args.pg > 0:
        args.sche = f'lin{args.pg:g}'
    
    # update args: paths
    args.log_txt_path = os.path.join(args.local_out_dir_path, 'log.txt')
    args.last_ckpt_path = os.path.join(args.local_out_dir_path, f'ar-ckpt-last.pth')
    _reg_valid_name = re.compile(r'[^\w\-+,.]')
    tb_name = _reg_valid_name.sub(
        '_',
        f'tb-VARd{args.depth}'
        f'__pn{args.pn}'
        f'__b{args.bs}ep{args.ep}{args.opt[:4]}lr{args.tblr:g}wd{args.twd:g}'
    )
    args.tb_log_dir_path = os.path.join(args.local_out_dir_path, tb_name)
    
    return args
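As a small illustration of the batch-size/lr bookkeeping done in init_dist_and_get_args above (a sketch, not part of the commit; the helper name is hypothetical), the effective peak lr scales linearly with the global batch size relative to a reference of 256:

def scaled_peak_lr(tblr: float, ac: int, glb_batch_size: int) -> float:
    # mirrors: args.tlr = args.ac * args.tblr * args.glb_batch_size / 256
    return ac * tblr * glb_batch_size / 256

print(scaled_peak_lr(1e-4, ac=1, glb_batch_size=768))  # 3e-04 for the default bs=768 and tblr=1e-4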
    	
        utils/data.py
    ADDED
    
    | @@ -0,0 +1,54 @@ | |
import os.path as osp

import PIL.Image as PImage
from torchvision.datasets.folder import DatasetFolder, IMG_EXTENSIONS
from torchvision.transforms import InterpolationMode, transforms


def normalize_01_into_pm1(x):  # normalize x from [0, 1] to [-1, 1] by (x*2) - 1
    return x.add(x).add_(-1)


def build_dataset(
    data_path: str, final_reso: int,
    hflip=False, mid_reso=1.125,
):
    # build augmentations
    mid_reso = round(mid_reso * final_reso)  # first resize to mid_reso, then crop to final_reso
    train_aug, val_aug = [
        transforms.Resize(mid_reso, interpolation=InterpolationMode.LANCZOS),  # transforms.Resize: resize the shorter edge to mid_reso
        transforms.RandomCrop((final_reso, final_reso)),
        transforms.ToTensor(), normalize_01_into_pm1,
    ], [
        transforms.Resize(mid_reso, interpolation=InterpolationMode.LANCZOS),  # transforms.Resize: resize the shorter edge to mid_reso
        transforms.CenterCrop((final_reso, final_reso)),
        transforms.ToTensor(), normalize_01_into_pm1,
    ]
    if hflip: train_aug.insert(0, transforms.RandomHorizontalFlip())
    train_aug, val_aug = transforms.Compose(train_aug), transforms.Compose(val_aug)
    
    # build dataset
    train_set = DatasetFolder(root=osp.join(data_path, 'train'), loader=pil_loader, extensions=IMG_EXTENSIONS, transform=train_aug)
    val_set = DatasetFolder(root=osp.join(data_path, 'val'), loader=pil_loader, extensions=IMG_EXTENSIONS, transform=val_aug)
    num_classes = 1000
    print(f'[Dataset] {len(train_set)=}, {len(val_set)=}, {num_classes=}')
    print_aug(train_aug, '[train]')
    print_aug(val_aug, '[val]')
    
    return num_classes, train_set, val_set


def pil_loader(path):
    with open(path, 'rb') as f:
        img: PImage.Image = PImage.open(f).convert('RGB')
    return img


def print_aug(transform, label):
    print(f'Transform {label} = ')
    if hasattr(transform, 'transforms'):
        for t in transform.transforms:
            print(t)
    else:
        print(transform)
    print('---------------------------\n')
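A minimal usage sketch of build_dataset (not part of the commit; it assumes an ImageNet-style train/val folder layout under data_path):

from utils.data import build_dataset

num_classes, train_set, val_set = build_dataset('/path/to/imagenet', final_reso=256, hflip=True)
img, label = train_set[0]   # img is a 3x256x256 tensor normalized to [-1, 1]
print(num_classes, img.shape, label)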
    	
        utils/data_sampler.py
    ADDED
    
    | @@ -0,0 +1,103 @@ | |
import numpy as np
import torch
from torch.utils.data.sampler import Sampler


class EvalDistributedSampler(Sampler):
    def __init__(self, dataset, num_replicas, rank):
        seps = np.linspace(0, len(dataset), num_replicas+1, dtype=int)
        beg, end = seps[:-1], seps[1:]
        beg, end = beg[rank], end[rank]
        self.indices = tuple(range(beg, end))
    
    def __iter__(self):
        return iter(self.indices)
    
    def __len__(self) -> int:
        return len(self.indices)


class InfiniteBatchSampler(Sampler):
    def __init__(self, dataset_len, batch_size, seed_for_all_rank=0, fill_last=False, shuffle=True, drop_last=False, start_ep=0, start_it=0):
        self.dataset_len = dataset_len
        self.batch_size = batch_size
        self.iters_per_ep = dataset_len // batch_size if drop_last else (dataset_len + batch_size - 1) // batch_size
        self.max_p = self.iters_per_ep * batch_size
        self.fill_last = fill_last
        self.shuffle = shuffle
        self.epoch = start_ep
        self.same_seed_for_all_ranks = seed_for_all_rank
        self.indices = self.gener_indices()
        self.start_ep, self.start_it = start_ep, start_it
    
    def gener_indices(self):
        if self.shuffle:
            g = torch.Generator()
            g.manual_seed(self.epoch + self.same_seed_for_all_ranks)
            indices = torch.randperm(self.dataset_len, generator=g).numpy()
        else:
            indices = torch.arange(self.dataset_len).numpy()
        
        tails = self.batch_size - (self.dataset_len % self.batch_size)
        if tails != self.batch_size and self.fill_last:
            tails = indices[:tails]
            np.random.shuffle(indices)
            indices = np.concatenate((indices, tails))
        
        # built-in list/tuple is faster than np.ndarray (when collating the data via a for-loop)
        # noinspection PyTypeChecker
        return tuple(indices.tolist())
    
    def __iter__(self):
        self.epoch = self.start_ep
        while True:
            self.epoch += 1
            p = (self.start_it * self.batch_size) if self.epoch == self.start_ep else 0
            while p < self.max_p:
                q = p + self.batch_size
                yield self.indices[p:q]
                p = q
            if self.shuffle:
                self.indices = self.gener_indices()
    
    def __len__(self):
        return self.iters_per_ep


class DistInfiniteBatchSampler(InfiniteBatchSampler):
    def __init__(self, world_size, rank, dataset_len, glb_batch_size, same_seed_for_all_ranks=0, repeated_aug=0, fill_last=False, shuffle=True, start_ep=0, start_it=0):
        assert glb_batch_size % world_size == 0
        self.world_size, self.rank = world_size, rank
        self.dataset_len = dataset_len
        self.glb_batch_size = glb_batch_size
        self.batch_size = glb_batch_size // world_size
        
        self.iters_per_ep = (dataset_len + glb_batch_size - 1) // glb_batch_size
        self.fill_last = fill_last
        self.shuffle = shuffle
        self.repeated_aug = repeated_aug
        self.epoch = start_ep
        self.same_seed_for_all_ranks = same_seed_for_all_ranks
        self.indices = self.gener_indices()
        self.start_ep, self.start_it = start_ep, start_it
    
    def gener_indices(self):
        global_max_p = self.iters_per_ep * self.glb_batch_size  # global_max_p % world_size must be 0 because glb_batch_size % world_size == 0
        # print(f'global_max_p = iters_per_ep({self.iters_per_ep}) * glb_batch_size({self.glb_batch_size}) = {global_max_p}')
        if self.shuffle:
            g = torch.Generator()
            g.manual_seed(self.epoch + self.same_seed_for_all_ranks)
            global_indices = torch.randperm(self.dataset_len, generator=g)
            if self.repeated_aug > 1:
                global_indices = global_indices[:(self.dataset_len + self.repeated_aug - 1) // self.repeated_aug].repeat_interleave(self.repeated_aug, dim=0)[:global_max_p]
        else:
            global_indices = torch.arange(self.dataset_len)
        filling = global_max_p - global_indices.shape[0]
        if filling > 0 and self.fill_last:
            global_indices = torch.cat((global_indices, global_indices[:filling]))
        # global_indices = tuple(global_indices.numpy().tolist())
        
        seps = torch.linspace(0, global_indices.shape[0], self.world_size + 1, dtype=torch.int)
        local_indices = global_indices[seps[self.rank].item():seps[self.rank + 1].item()].tolist()
        self.max_p = len(local_indices)
        return local_indices
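A minimal sketch of how the sampler above can drive a DataLoader (not part of the commit; it assumes a single process, i.e. world_size=1 and rank=0, and a toy 10-sample dataset):

import torch
from torch.utils.data import DataLoader, TensorDataset
from utils.data_sampler import DistInfiniteBatchSampler

ds = TensorDataset(torch.arange(10).float())
sampler = DistInfiniteBatchSampler(world_size=1, rank=0, dataset_len=len(ds),
                                   glb_batch_size=4, fill_last=True, shuffle=True)
loader = DataLoader(ds, batch_sampler=sampler, num_workers=0)
print(len(loader))          # iters_per_ep = ceil(10 / 4) = 3
print(next(iter(loader)))   # first batch of 4 samples; iteration never stops on its own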
    	
        utils/lr_control.py
    ADDED
    
    | @@ -0,0 +1,108 @@ | |
| 1 | 
            +
            import math
         | 
| 2 | 
            +
            from pprint import pformat
         | 
| 3 | 
            +
            from typing import Tuple, List, Dict, Union
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            import torch.nn
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            import dist
         | 
| 8 | 
            +
             | 
| 9 | 
            +
             | 
| 10 | 
            +
            def lr_wd_annealing(sche_type: str, optimizer, peak_lr, wd, wd_end, cur_it, wp_it, max_it, wp0=0.005, wpe=0.001):
         | 
| 11 | 
            +
                """Decay the learning rate with half-cycle cosine after warmup"""
         | 
| 12 | 
            +
                wp_it = round(wp_it)
         | 
| 13 | 
            +
                
         | 
| 14 | 
            +
                if cur_it < wp_it:
         | 
| 15 | 
            +
                    cur_lr = wp0 + (1-wp0) * cur_it / wp_it
         | 
| 16 | 
            +
                else:
         | 
| 17 | 
            +
                    pasd = (cur_it - wp_it) / (max_it-1 - wp_it)   # [0, 1]
         | 
| 18 | 
            +
                    rest = 1 - pasd     # [1, 0]
         | 
| 19 | 
            +
                    if sche_type == 'cos':
         | 
| 20 | 
            +
                        cur_lr = wpe + (1-wpe) * (0.5 + 0.5 * math.cos(math.pi * pasd))
         | 
| 21 | 
            +
                    elif sche_type == 'lin':
         | 
| 22 | 
            +
                        T = 0.15; max_rest = 1-T
         | 
| 23 | 
            +
                        if pasd < T: cur_lr = 1
         | 
| 24 | 
            +
                        else: cur_lr = wpe + (1-wpe) * rest / max_rest  # 1 to wpe
         | 
| 25 | 
            +
                    elif sche_type == 'lin0':
         | 
| 26 | 
            +
                        T = 0.05; max_rest = 1-T
         | 
| 27 | 
            +
                        if pasd < T: cur_lr = 1
         | 
| 28 | 
            +
                        else: cur_lr = wpe + (1-wpe) * rest / max_rest
         | 
| 29 | 
            +
                    elif sche_type == 'lin00':
         | 
| 30 | 
            +
                        cur_lr = wpe + (1-wpe) * rest
         | 
| 31 | 
            +
                    elif sche_type.startswith('lin'):
         | 
| 32 | 
            +
                        T = float(sche_type[3:]); max_rest = 1-T
         | 
| 33 | 
            +
                        wpe_mid = wpe + (1-wpe) * max_rest
         | 
| 34 | 
            +
                        wpe_mid = (1 + wpe_mid) / 2
         | 
| 35 | 
            +
                        if pasd < T: cur_lr = 1 + (wpe_mid-1) * pasd / T
         | 
| 36 | 
            +
                        else: cur_lr = wpe + (wpe_mid-wpe) * rest / max_rest
         | 
| 37 | 
            +
                    elif sche_type == 'exp':
         | 
| 38 | 
            +
                        T = 0.15; max_rest = 1-T
         | 
| 39 | 
            +
                        if pasd < T: cur_lr = 1
         | 
| 40 | 
            +
                        else:
         | 
| 41 | 
            +
                            expo = (pasd-T) / max_rest * math.log(wpe)
         | 
| 42 | 
            +
                            cur_lr = math.exp(expo)
         | 
| 43 | 
            +
                    else:
         | 
| 44 | 
            +
                        raise NotImplementedError(f'unknown sche_type {sche_type}')
         | 
| 45 | 
            +
                
         | 
| 46 | 
            +
                cur_lr *= peak_lr
         | 
| 47 | 
            +
                pasd = cur_it / (max_it-1)
         | 
| 48 | 
            +
                cur_wd = wd_end + (wd - wd_end) * (0.5 + 0.5 * math.cos(math.pi * pasd))
         | 
| 49 | 
            +
                
         | 
| 50 | 
            +
                inf = 1e6
         | 
| 51 | 
            +
                min_lr, max_lr = inf, -1
         | 
| 52 | 
            +
                min_wd, max_wd = inf, -1
         | 
| 53 | 
            +
                for param_group in optimizer.param_groups:
        param_group['lr'] = cur_lr * param_group.get('lr_sc', 1)    # 'lr_sc' could be assigned
        max_lr = max(max_lr, param_group['lr'])
        min_lr = min(min_lr, param_group['lr'])
        
        param_group['weight_decay'] = cur_wd * param_group.get('wd_sc', 1)
        max_wd = max(max_wd, param_group['weight_decay'])
        if param_group['weight_decay'] > 0:
            min_wd = min(min_wd, param_group['weight_decay'])
    
    if min_lr == inf: min_lr = -1
    if min_wd == inf: min_wd = -1
    return min_lr, max_lr, min_wd, max_wd


def filter_params(model, nowd_keys=()) -> Tuple[
    List[str], List[torch.nn.Parameter], List[Dict[str, Union[torch.nn.Parameter, float]]]
]:
    para_groups, para_groups_dbg = {}, {}
    names, paras = [], []
    names_no_grad = []
    count, numel = 0, 0
    for name, para in model.named_parameters():
        name = name.replace('_fsdp_wrapped_module.', '')
        if not para.requires_grad:
            names_no_grad.append(name)
            continue  # frozen weights
        count += 1
        numel += para.numel()
        names.append(name)
        paras.append(para)
        
        if para.ndim == 1 or name.endswith('bias') or any(k in name for k in nowd_keys):
            cur_wd_sc, group_name = 0., 'ND'
        else:
            cur_wd_sc, group_name = 1., 'D'
        cur_lr_sc = 1.
        if group_name not in para_groups:
            para_groups[group_name] = {'params': [], 'wd_sc': cur_wd_sc, 'lr_sc': cur_lr_sc}
            para_groups_dbg[group_name] = {'params': [], 'wd_sc': cur_wd_sc, 'lr_sc': cur_lr_sc}
        para_groups[group_name]['params'].append(para)
        para_groups_dbg[group_name]['params'].append(name)
    
    for g in para_groups_dbg.values():
        g['params'] = pformat(', '.join(g['params']), width=200)
    
    print(f'[get_param_groups] param_groups = \n{pformat(para_groups_dbg, indent=2, width=240)}\n')
    
    for rk in range(dist.get_world_size()):
        dist.barrier()
        if dist.get_rank() == rk:
            print(f'[get_param_groups][rank{dist.get_rank()}] {type(model).__name__=} {count=}, {numel=}', flush=True, force=True)
    print('')
    
    assert len(names_no_grad) == 0, f'[get_param_groups] names_no_grad = \n{pformat(names_no_grad, indent=2, width=240)}\n'
    return names, paras, list(para_groups.values())
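The helpers above are meant to be used together: filter_params splits the trainable weights into a decay group ('D') and a no-decay group ('ND'), and the per-group 'lr_sc' / 'wd_sc' scales are exactly what the annealing loop above multiplies into each param_group every step. Below is a minimal sketch of the wiring, assuming the dist helpers (and the patched print from utils/misc.py) are already initialized; the toy model, the nowd_keys values, and the import path are hypothetical, not taken from the training script:

    import torch
    from utils.lr_control import filter_params  # module path assumed from the repository layout

    toy = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.LayerNorm(8))   # hypothetical model
    names, paras, para_groups = filter_params(toy, nowd_keys=('gamma', 'beta'))
    opt = torch.optim.AdamW(para_groups, lr=1e-4, weight_decay=0.05)
    # each optimizer param_group now carries 'lr_sc' and 'wd_sc' for the annealing loop above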
    	
        utils/misc.py
    ADDED
    
import datetime
import functools
import glob
import os
import subprocess
import sys
import time
from collections import defaultdict, deque
from typing import Iterator, List, Tuple

import numpy as np
import pytz
import torch
import torch.distributed as tdist

import dist
from utils import arg_util

os_system = functools.partial(subprocess.call, shell=True)
def echo(info):
    os_system(f'echo "[$(date "+%m-%d-%H:%M:%S")] ({os.path.basename(sys._getframe().f_back.f_code.co_filename)}, line{sys._getframe().f_back.f_lineno})=> {info}"')
def os_system_get_stdout(cmd):
    return subprocess.run(cmd, shell=True, stdout=subprocess.PIPE).stdout.decode('utf-8')
def os_system_get_stdout_stderr(cmd):
    cnt = 0
    while True:
        try:
            sp = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=30)
        except subprocess.TimeoutExpired:
            cnt += 1
            print(f'[fetch free_port file] timeout cnt={cnt}')
        else:
            return sp.stdout.decode('utf-8'), sp.stderr.decode('utf-8')


def time_str(fmt='[%m-%d %H:%M:%S]'):
    return datetime.datetime.now(tz=pytz.timezone('Asia/Shanghai')).strftime(fmt)


def init_distributed_mode(local_out_path, only_sync_master=False, timeout=30):
    try:
        dist.initialize(fork=False, timeout=timeout)
        dist.barrier()
    except RuntimeError:
        print(f'{">"*75}  NCCL Error  {"<"*75}', flush=True)
        time.sleep(10)
    
    if local_out_path is not None: os.makedirs(local_out_path, exist_ok=True)
    _change_builtin_print(dist.is_local_master())
    if (dist.is_master() if only_sync_master else dist.is_local_master()) and local_out_path is not None and len(local_out_path):
        sys.stdout, sys.stderr = SyncPrint(local_out_path, sync_stdout=True), SyncPrint(local_out_path, sync_stdout=False)

def _change_builtin_print(is_master):
    import builtins as __builtin__
    
    builtin_print = __builtin__.print
    if type(builtin_print) != type(open):
        # print is no longer the C builtin, i.e. it has already been patched; don't wrap it twice
        return
    
    def prt(*args, **kwargs):
        force = kwargs.pop('force', False)
        clean = kwargs.pop('clean', False)
        deeper = kwargs.pop('deeper', False)
        if is_master or force:
            if not clean:
                f_back = sys._getframe().f_back
                if deeper and f_back.f_back is not None:
                    f_back = f_back.f_back
                file_desc = f'{f_back.f_code.co_filename:24s}'[-24:]
                builtin_print(f'{time_str()} ({file_desc}, line{f_back.f_lineno:-4d})=>', *args, **kwargs)
            else:
                builtin_print(*args, **kwargs)
    
    __builtin__.print = prt

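Once init_distributed_mode has installed this wrapper, a plain print call is silently dropped on non-master ranks and, on the master, is prefixed with a timestamp plus the caller's file and line; the extra keyword arguments below are consumed by the wrapper itself. A short sketch of the added behaviour, assuming the patch is active:

    print('master rank only, with time + caller prefix')
    print('printed on every rank', force=True)
    print('no prefix at all', clean=True)
    print('caller reported one stack frame higher', deeper=True)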
class SyncPrint(object):
    def __init__(self, local_output_dir, sync_stdout=True):
        self.sync_stdout = sync_stdout
        self.terminal_stream = sys.stdout if sync_stdout else sys.stderr
        fname = os.path.join(local_output_dir, 'stdout.txt' if sync_stdout else 'stderr.txt')
        existing = os.path.exists(fname)
        self.file_stream = open(fname, 'a')
        if existing:
            self.file_stream.write('\n'*7 + '='*55 + f'   RESTART {time_str()}   ' + '='*55 + '\n')
        self.file_stream.flush()
        self.enabled = True
    
    def write(self, message):
        self.terminal_stream.write(message)
        self.file_stream.write(message)
    
    def flush(self):
        self.terminal_stream.flush()
        self.file_stream.flush()
    
    def close(self):
        if not self.enabled:
            return
        self.enabled = False
        self.file_stream.flush()
        self.file_stream.close()
        if self.sync_stdout:
            sys.stdout = self.terminal_stream
            sys.stdout.flush()
        else:
            sys.stderr = self.terminal_stream
            sys.stderr.flush()
    
    def __del__(self):
        self.close()

class DistLogger(object):
    def __init__(self, lg, verbose):
        self._lg, self._verbose = lg, verbose
    
    @staticmethod
    def do_nothing(*args, **kwargs):
        pass
    
    def __getattr__(self, attr: str):
        return getattr(self._lg, attr) if self._verbose else DistLogger.do_nothing

class TensorboardLogger(object):
    def __init__(self, log_dir, filename_suffix):
        try: import tensorflow_io as tfio
        except: pass
        from torch.utils.tensorboard import SummaryWriter
        self.writer = SummaryWriter(log_dir=log_dir, filename_suffix=filename_suffix)
        self.step = 0
    
    def set_step(self, step=None):
        if step is not None:
            self.step = step
        else:
            self.step += 1
    
    def update(self, head='scalar', step=None, **kwargs):
        for k, v in kwargs.items():
            if v is None:
                continue
            # assert isinstance(v, (float, int)), type(v)
            if step is None:  # iter wise
                it = self.step
                if it == 0 or (it + 1) % 500 == 0:
                    if hasattr(v, 'item'): v = v.item()
                    self.writer.add_scalar(f'{head}/{k}', v, it)
            else:  # epoch wise
                if hasattr(v, 'item'): v = v.item()
                self.writer.add_scalar(f'{head}/{k}', v, step)
    
    def log_tensor_as_distri(self, tag, tensor1d, step=None):
        if step is None:  # iter wise
            step = self.step
            loggable = step == 0 or (step + 1) % 500 == 0
        else:  # epoch wise
            loggable = True
        if loggable:
            try:
                self.writer.add_histogram(tag=tag, values=tensor1d, global_step=step)
            except Exception as e:
                print(f'[log_tensor_as_distri writer.add_histogram failed]: {e}')
    
    def log_image(self, tag, img_chw, step=None):
        if step is None:  # iter wise
            step = self.step
            loggable = step == 0 or (step + 1) % 500 == 0
        else:  # epoch wise
            loggable = True
        if loggable:
            self.writer.add_image(tag, img_chw, step, dataformats='CHW')
    
    def flush(self):
        self.writer.flush()
    
    def close(self):
        self.writer.close()

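In a multi-process run the TensorBoard writer is typically wrapped in DistLogger so that only one rank actually writes events; on the other ranks every attribute resolves to do_nothing. A minimal sketch, assuming dist is initialized and 'tb-logs' is a hypothetical output directory:

    tb_lg = DistLogger(TensorboardLogger(log_dir='tb-logs', filename_suffix='_var'), verbose=dist.is_master())
    tb_lg.set_step(100)
    tb_lg.update(head='train', Lm=0.42, Acc=55.1)   # iter-wise: only written at step 0 or every 500 steps
    tb_lg.update(head='val', step=1, Lm=0.40)       # epoch-wise: written immediately at the given step
    tb_lg.flush()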
class SmoothedValue(object):
    """Track a series of values and provide access to smoothed values over a
    window or the global series average.
    """
    
    def __init__(self, window_size=30, fmt=None):
        if fmt is None:
            fmt = "{median:.4f} ({global_avg:.4f})"
        self.deque = deque(maxlen=window_size)
        self.total = 0.0
        self.count = 0
        self.fmt = fmt
    
    def update(self, value, n=1):
        self.deque.append(value)
        self.count += n
        self.total += value * n
    
    def synchronize_between_processes(self):
        """
        Warning: does not synchronize the deque!
        """
        t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda')
        tdist.barrier()
        tdist.all_reduce(t)
        t = t.tolist()
        self.count = int(t[0])
        self.total = t[1]
    
    @property
    def median(self):
        return np.median(self.deque) if len(self.deque) else 0
    
    @property
    def avg(self):
        return sum(self.deque) / (len(self.deque) or 1)
    
    @property
    def global_avg(self):
        return self.total / (self.count or 1)
    
    @property
    def max(self):
        return max(self.deque)
    
    @property
    def value(self):
        return self.deque[-1] if len(self.deque) else 0
    
    def time_preds(self, counts) -> Tuple[float, str, str]:
        remain_secs = counts * self.median
        return remain_secs, str(datetime.timedelta(seconds=round(remain_secs))), time.strftime("%Y-%m-%d %H:%M", time.localtime(time.time() + remain_secs))
    
    def __str__(self):
        return self.fmt.format(
            median=self.median,
            avg=self.avg,
            global_avg=self.global_avg,
            max=self.max,
            value=self.value)

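SmoothedValue is the building block of MetricLogger below: a sliding window for the recent median/average plus global totals, with time_preds turning the median cost per step into a remaining-time estimate. A small self-contained sketch:

    iter_time = SmoothedValue(window_size=30, fmt='{median:.3f} ({global_avg:.3f})')
    for t in (0.21, 0.19, 0.20):
        iter_time.update(t)
    print(str(iter_time))                                   # -> "0.200 (0.200)"
    secs, dur_str, finish_at = iter_time.time_preds(1000)   # ETA for 1000 more steps at the median speed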
class MetricLogger(object):
    def __init__(self, delimiter='  '):
        self.meters = defaultdict(SmoothedValue)
        self.delimiter = delimiter
        self.iter_end_t = time.time()
        self.log_iters = []
    
    def update(self, **kwargs):
        for k, v in kwargs.items():
            if v is None:
                continue
            if hasattr(v, 'item'): v = v.item()
            # assert isinstance(v, (float, int)), type(v)
            assert isinstance(v, (float, int))
            self.meters[k].update(v)
    
    def __getattr__(self, attr):
        if attr in self.meters:
            return self.meters[attr]
        if attr in self.__dict__:
            return self.__dict__[attr]
        raise AttributeError("'{}' object has no attribute '{}'".format(
            type(self).__name__, attr))
    
    def __str__(self):
        loss_str = []
        for name, meter in self.meters.items():
            if len(meter.deque):
                loss_str.append(
                    "{}: {}".format(name, str(meter))
                )
        return self.delimiter.join(loss_str)
    
    def synchronize_between_processes(self):
        for meter in self.meters.values():
            meter.synchronize_between_processes()
    
    def add_meter(self, name, meter):
        self.meters[name] = meter
    
    def log_every(self, start_it, max_iters, itrt, print_freq, header=None):
        self.log_iters = set(np.linspace(0, max_iters-1, print_freq, dtype=int).tolist())  # print_freq is the number of evenly spaced log points, not a period
        self.log_iters.add(start_it)
        if not header:
            header = ''
        start_time = time.time()
        self.iter_end_t = time.time()
        self.iter_time = SmoothedValue(fmt='{avg:.4f}')
        self.data_time = SmoothedValue(fmt='{avg:.4f}')
        space_fmt = ':' + str(len(str(max_iters))) + 'd'
        log_msg = [
            header,
            '[{0' + space_fmt + '}/{1}]',
            'eta: {eta}',
            '{meters}',
            'time: {time}',
            'data: {data}'
        ]
        log_msg = self.delimiter.join(log_msg)
        
        if isinstance(itrt, Iterator) and not hasattr(itrt, 'preload') and not hasattr(itrt, 'set_epoch'):
            for i in range(start_it, max_iters):
                obj = next(itrt)
                self.data_time.update(time.time() - self.iter_end_t)
                yield i, obj
                self.iter_time.update(time.time() - self.iter_end_t)
                if i in self.log_iters:
                    eta_seconds = self.iter_time.global_avg * (max_iters - i)
                    eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                    print(log_msg.format(
                        i, max_iters, eta=eta_string,
                        meters=str(self),
                        time=str(self.iter_time), data=str(self.data_time)), flush=True)
                self.iter_end_t = time.time()
        else:
            if isinstance(itrt, int): itrt = range(itrt)
            for i, obj in enumerate(itrt):
                self.data_time.update(time.time() - self.iter_end_t)
                yield i, obj
                self.iter_time.update(time.time() - self.iter_end_t)
                if i in self.log_iters:
                    eta_seconds = self.iter_time.global_avg * (max_iters - i)
                    eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                    print(log_msg.format(
                        i, max_iters, eta=eta_string,
                        meters=str(self),
                        time=str(self.iter_time), data=str(self.data_time)), flush=True)
                self.iter_end_t = time.time()
        
        total_time = time.time() - start_time
        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
        print('{}   Total time:      {}   ({:.3f} s / it)'.format(
            header, total_time_str, total_time / max_iters), flush=True)

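MetricLogger.update feeds named SmoothedValues, and log_every is a generator that wraps the training iterator, timing data loading and iteration and printing print_freq evenly spaced progress lines with an ETA. A sketch of a typical loop; train_loader and train_step are hypothetical placeholders:

    me = MetricLogger(delimiter='  ')
    max_it = len(train_loader)
    for it, batch in me.log_every(0, max_it, iter(train_loader), 10, header='[Ep 1]'):
        loss = train_step(batch)             # hypothetical step returning a float loss
        me.update(Lm=loss)
    me.synchronize_between_processes()       # distributed runs only: all-reduces counts and totals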
def glob_with_latest_modified_first(pattern, recursive=False):
    return sorted(glob.glob(pattern, recursive=recursive), key=os.path.getmtime, reverse=True)

def auto_resume(args: arg_util.Args, pattern='ckpt*.pth') -> Tuple[List[str], int, int, dict, dict]:
    info = []
    file = os.path.join(args.local_out_dir_path, pattern)
    all_ckpt = glob_with_latest_modified_first(file)
    if len(all_ckpt) == 0:
        info.append(f'[auto_resume] no ckpt found @ {file}')
        info.append(f'[auto_resume quit]')
        return info, 0, 0, {}, {}
    else:
        info.append(f'[auto_resume] load ckpt from @ {all_ckpt[0]} ...')
        ckpt = torch.load(all_ckpt[0], map_location='cpu')
        ep, it = ckpt['epoch'], ckpt['iter']
        info.append(f'[auto_resume success] resume from ep{ep}, it{it}')
        return info, ep, it, ckpt['trainer'], ckpt['args']

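auto_resume picks the newest checkpoint matching pattern under args.local_out_dir_path and hands back enough state to continue training; with no checkpoint it falls back to epoch 0, iteration 0. A sketch of the call site; the trainer restore is hypothetical, since that logic lives in the training script rather than here:

    info, start_ep, start_it, trainer_state, args_state = auto_resume(args, 'ckpt*.pth')
    print('\n'.join(info))
    if trainer_state:
        trainer.load_state_dict(trainer_state)   # hypothetical trainer object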
def create_npz_from_sample_folder(sample_folder: str):
    """
    Builds a single .npz file from a folder of .png samples. Refer to DiT.
    """
    import os, glob
    import numpy as np
    from tqdm import tqdm
    from PIL import Image
    
    samples = []
    pngs = glob.glob(os.path.join(sample_folder, '*.png')) + glob.glob(os.path.join(sample_folder, '*.PNG'))
    assert len(pngs) == 50_000, f'{len(pngs)} png files found in {sample_folder}, but expected 50,000'
    for png in tqdm(pngs, desc='Building .npz file from samples (png only)'):
        with Image.open(png) as sample_pil:
            sample_np = np.asarray(sample_pil).astype(np.uint8)
        samples.append(sample_np)
    samples = np.stack(samples)
    assert samples.shape == (50_000, samples.shape[1], samples.shape[2], 3)
    npz_path = f'{sample_folder}.npz'
    np.savez(npz_path, arr_0=samples)
    print(f'Saved .npz file to {npz_path} [shape={samples.shape}].')
    return npz_path
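The .npz helper follows the DiT evaluation convention: exactly 50,000 PNG samples are packed into a single arr_0 array so the file can be consumed by the standard FID evaluation tooling. For example, with a hypothetical folder name:

    npz_path = create_npz_from_sample_folder('samples/var_50k')   # the folder must hold exactly 50,000 .png files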