Spaces:

Ryukijano
/

Splat_to_mesh

Runtime error

App Files Files Community

Ryukijano committed on May 30, 2024

Commit

913bca8

verified ·

1 Parent(s): 750d16c

Upload 14 files

Browse files

Files changed (15) hide show

.gitattributes +1 -0
Dockerfile +54 -0
__pycache__/app.cpython-310.pyc +0 -0
core/__init__.py +0 -0
core/__pycache__/__init__.cpython-310.pyc +0 -0
core/__pycache__/gs.cpython-310.pyc +0 -0
core/__pycache__/options.cpython-310.pyc +0 -0
core/attention.py +156 -0
core/gs.py +190 -0
core/models.py +174 -0
core/options.py +120 -0
core/provider_objaverse.py +172 -0
core/unet.py +319 -0
core/utils.py +109 -0
data_test/catstatue.ply +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data_test/catstatue.ply filter=lfs diff=lfs merge=lfs -text

Dockerfile ADDED Viewed

	@@ -0,0 +1,54 @@

+FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
+# Suppress interactive prompts from apt during the build
+ENV DEBIAN_FRONTEND=noninteractive
+# Install software-properties-common, needed for add-apt-repository below
+RUN apt-get update && apt-get install -y \
+    software-properties-common
+# Add the deadsnakes PPA
+RUN add-apt-repository ppa:deadsnakes/ppa
+# Install Python 3.10 (interpreter, headers, distutils, venv) and pip
+RUN apt-get update && apt-get install -y \
+    python3.10 \
+    python3.10-dev \
+    python3.10-distutils \
+    python3.10-venv \
+    python3-pip
+# Install build tools and native libraries
+# NOTE(review): this layer relies on the package index fetched by the previous RUN layer
+RUN apt-get install -y \
+    git \
+    gcc \
+    g++ \
+    libgl1 \
+    libglib2.0.0 \
+    ffmpeg \
+    cmake \
+    libgtk2.0.0
+# Create an unprivileged user (uid 1000), switch to it, and work from $HOME/app
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+WORKDIR $HOME/app
+# Install the required Python packages (installed per-user under /home/user/.local)
+RUN pip install wheel
+RUN pip install torch==2.1.0+cu121 torchvision==0.16.0+cu121 torchaudio==2.1.0+cu121 torchtext==0.16.0 torchdata==0.7.0 --extra-index-url https://download.pytorch.org/whl/cu121 -U
+# Patch the pybind11 header bundled with torch (cast.h) in place
+# NOTE(review): presumably a workaround for a compile error when building CUDA extensions — confirm
+RUN sed -i 's/return caster.operator typename make_caster<T>::template cast_op_type<T>();/return caster;/' /home/user/.local/lib/python3.10/site-packages/torch/include/pybind11/cast.h
+RUN pip install tyro kiui PyMCubes nerfacc trimesh pymeshlab ninja plyfile xatlas pygltflib gradio opencv-python scikit-learn
+# Prebuilt wheels for the Gaussian rasterizer and nvdiffrast
+RUN pip install https://github.com/dylanebert/wheels/releases/download/1.0.0/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl
+RUN pip install https://github.com/dylanebert/wheels/releases/download/1.0.0/nvdiffrast-0.3.1-py3-none-any.whl
+# Install kiuikit from its git repository (kiui was already installed from PyPI above)
+# NOTE(review): presumably to pick up a newer version than the PyPI release — confirm
+RUN pip install git+https://github.com/ashawkey/kiuikit.git
+# Copy all files to the working directory
+COPY --chown=user . $HOME/app
+EXPOSE 7860
+# Run the gradio app
+CMD ["python3.10", "app.py"]

__pycache__/app.cpython-310.pyc ADDED Viewed

Binary file (1.95 kB). View file

core/__init__.py ADDED Viewed

File without changes

core/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (135 Bytes). View file

core/__pycache__/gs.cpython-310.pyc ADDED Viewed

Binary file (5.45 kB). View file

core/__pycache__/options.cpython-310.pyc ADDED Viewed

Binary file (2.49 kB). View file

core/attention.py ADDED Viewed

	@@ -0,0 +1,156 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
+import os
+import warnings
+from torch import Tensor
+from torch import nn
+# xFormers is opt-out: it is attempted unless the XFORMERS_DISABLED environment variable is set (to any value).
+XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
+try:
+    if XFORMERS_ENABLED:
+        from xformers.ops import memory_efficient_attention, unbind
+        XFORMERS_AVAILABLE = True
+        warnings.warn("xFormers is available (Attention)")
+    else:
+        warnings.warn("xFormers is disabled (Attention)")
+        # Reuse the ImportError handler below so "disabled" and "not installed" end in the same fallback state.
+        raise ImportError
+except ImportError:
+    # Fallback state: the MemEff* classes check this flag and use the plain attention path instead.
+    XFORMERS_AVAILABLE = False
+    warnings.warn("xFormers is not available (Attention)")
+class Attention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        proj_bias: bool = True,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+    ) -> None:
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim, bias=proj_bias)
+        self.proj_drop = nn.Dropout(proj_drop)
+    def forward(self, x: Tensor) -> Tensor:
+        B, N, C = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
+        attn = q @ k.transpose(-2, -1)
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class MemEffAttention(Attention):
+    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
+        if not XFORMERS_AVAILABLE:
+            if attn_bias is not None:
+                raise AssertionError("xFormers is required for using nested tensors")
+            return super().forward(x)
+        B, N, C = x.shape
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
+        q, k, v = unbind(qkv, 2)
+        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
+        x = x.reshape([B, N, C])
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class CrossAttention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        dim_q: int,
+        dim_k: int,
+        dim_v: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        proj_bias: bool = True,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+    ) -> None:
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+        self.to_q = nn.Linear(dim_q, dim, bias=qkv_bias)
+        self.to_k = nn.Linear(dim_k, dim, bias=qkv_bias)
+        self.to_v = nn.Linear(dim_v, dim, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim, bias=proj_bias)
+        self.proj_drop = nn.Dropout(proj_drop)
+    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
+        # q: [B, N, Cq]
+        # k: [B, M, Ck]
+        # v: [B, M, Cv]
+        # return: [B, N, C]
+        B, N, _ = q.shape
+        M = k.shape[1]
+        q = self.scale * self.to_q(q).reshape(B, N, self.num_heads, self.dim // self.num_heads).permute(0, 2, 1, 3) # [B, nh, N, C/nh]
+        k = self.to_k(k).reshape(B, M, self.num_heads, self.dim // self.num_heads).permute(0, 2, 1, 3) # [B, nh, M, C/nh]
+        v = self.to_v(v).reshape(B, M, self.num_heads, self.dim // self.num_heads).permute(0, 2, 1, 3) # [B, nh, M, C/nh]
+        attn = q @ k.transpose(-2, -1) # [B, nh, N, M]
+        attn = attn.softmax(dim=-1) # [B, nh, N, M]
+        attn = self.attn_drop(attn)
+        x = (attn @ v).transpose(1, 2).reshape(B, N, -1) # [B, nh, N, M] @ [B, nh, M, C/nh] --> [B, nh, N, C/nh] --> [B, N, nh, C/nh] --> [B, N, C]
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class MemEffCrossAttention(CrossAttention):
+    def forward(self, q: Tensor, k: Tensor, v: Tensor, attn_bias=None) -> Tensor:
+        if not XFORMERS_AVAILABLE:
+            if attn_bias is not None:
+                raise AssertionError("xFormers is required for using nested tensors")
+            return super().forward(x)
+        B, N, _ = q.shape
+        M = k.shape[1]
+        q = self.scale * self.to_q(q).reshape(B, N, self.num_heads, self.dim // self.num_heads) # [B, N, nh, C/nh]
+        k = self.to_k(k).reshape(B, M, self.num_heads, self.dim // self.num_heads) # [B, M, nh, C/nh]
+        v = self.to_v(v).reshape(B, M, self.num_heads, self.dim // self.num_heads) # [B, M, nh, C/nh]
+        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
+        x = x.reshape(B, N, -1)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x

core/gs.py ADDED Viewed

	@@ -0,0 +1,190 @@

+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from diff_gaussian_rasterization import (
+    GaussianRasterizationSettings,
+    GaussianRasterizer,
+)
+from core.options import Options
+import kiui
+class GaussianRenderer:
+    def __init__(self, opt: Options):
+        self.opt = opt
+        self.bg_color = torch.tensor([1, 1, 1], dtype=torch.float32, device="cuda")
+        # intrinsics
+        self.tan_half_fov = np.tan(0.5 * np.deg2rad(self.opt.fovy))
+        self.proj_matrix = torch.zeros(4, 4, dtype=torch.float32)
+        self.proj_matrix[0, 0] = 1 / self.tan_half_fov
+        self.proj_matrix[1, 1] = 1 / self.tan_half_fov
+        self.proj_matrix[2, 2] = (opt.zfar + opt.znear) / (opt.zfar - opt.znear)
+        self.proj_matrix[3, 2] = - (opt.zfar * opt.znear) / (opt.zfar - opt.znear)
+        self.proj_matrix[2, 3] = 1
+    def render(self, gaussians, cam_view, cam_view_proj, cam_pos, bg_color=None, scale_modifier=1):
+        """Rasterize every batch item from every camera and stack the results.
+
+        Args:
+            gaussians: [B, N, 14], laid out as pos(3), opacity(1), scale(3), rotation(4), rgb(3).
+            cam_view, cam_view_proj: [B, V, 4, 4] matrices passed to the rasterizer.
+            cam_pos: [B, V, 3] camera positions.
+            bg_color: background colour tensor; ``self.bg_color`` is used when None.
+            scale_modifier: forwarded to the rasterization settings.
+
+        Returns:
+            dict with "image" [B, V, 3, H, W] (clamped to [0, 1]) and "alpha" [B, V, 1, H, W],
+            where H = W = ``opt.output_size``.
+        """
+        # gaussians: [B, N, 14]
+        # cam_view, cam_view_proj: [B, V, 4, 4]
+        # cam_pos: [B, V, 3]
+        device = gaussians.device
+        B, V = cam_view.shape[:2]
+        # loop of loop...
+        images = []
+        alphas = []
+        for b in range(B):
+            # pos, opacity, scale, rotation, shs
+            means3D = gaussians[b, :, 0:3].contiguous().float()
+            opacity = gaussians[b, :, 3:4].contiguous().float()
+            scales = gaussians[b, :, 4:7].contiguous().float()
+            rotations = gaussians[b, :, 7:11].contiguous().float()
+            rgbs = gaussians[b, :, 11:].contiguous().float() # [N, 3]
+            for v in range(V):
+                # render novel views
+                view_matrix = cam_view[b, v].float()
+                view_proj_matrix = cam_view_proj[b, v].float()
+                campos = cam_pos[b, v].float()
+                # Square output; the same half-FOV tangent is used for both axes.
+                raster_settings = GaussianRasterizationSettings(
+                    image_height=self.opt.output_size,
+                    image_width=self.opt.output_size,
+                    tanfovx=self.tan_half_fov,
+                    tanfovy=self.tan_half_fov,
+                    bg=self.bg_color if bg_color is None else bg_color,
+                    scale_modifier=scale_modifier,
+                    viewmatrix=view_matrix,
+                    projmatrix=view_proj_matrix,
+                    sh_degree=0,
+                    campos=campos,
+                    prefiltered=False,
+                    debug=False,
+                )
+                rasterizer = GaussianRasterizer(raster_settings=raster_settings)
+                # Rasterize visible Gaussians to image, obtain their radii (on screen).
+                # Colours are passed precomputed (colors_precomp), so no SH coefficients are given.
+                rendered_image, radii, rendered_depth, rendered_alpha = rasterizer(
+                    means3D=means3D,
+                    means2D=torch.zeros_like(means3D, dtype=torch.float32, device=device),
+                    shs=None,
+                    colors_precomp=rgbs,
+                    opacities=opacity,
+                    scales=scales,
+                    rotations=rotations,
+                    cov3D_precomp=None,
+                )
+                rendered_image = rendered_image.clamp(0, 1)
+                images.append(rendered_image)
+                alphas.append(rendered_alpha)
+        # The flat list is ordered batch-major, then view, so it reshapes directly to [B, V, ...].
+        images = torch.stack(images, dim=0).view(B, V, 3, self.opt.output_size, self.opt.output_size)
+        alphas = torch.stack(alphas, dim=0).view(B, V, 1, self.opt.output_size, self.opt.output_size)
+        return {
+            "image": images, # [B, V, 3, H, W]
+            "alpha": alphas, # [B, V, 1, H, W]
+        }
+    def save_ply(self, gaussians, path, compatible=True):
+        # gaussians: [B, N, 14]
+        # compatible: save pre-activated gaussians as in the original paper
+        assert gaussians.shape[0] == 1, 'only support batch size 1'
+        from plyfile import PlyData, PlyElement
+        means3D = gaussians[0, :, 0:3].contiguous().float()
+        opacity = gaussians[0, :, 3:4].contiguous().float()
+        scales = gaussians[0, :, 4:7].contiguous().float()
+        rotations = gaussians[0, :, 7:11].contiguous().float()
+        shs = gaussians[0, :, 11:].unsqueeze(1).contiguous().float() # [N, 1, 3]
+        # prune by opacity
+        mask = opacity.squeeze(-1) >= 0.005
+        means3D = means3D[mask]
+        opacity = opacity[mask]
+        scales = scales[mask]
+        rotations = rotations[mask]
+        shs = shs[mask]
+        # invert activation to make it compatible with the original ply format
+        if compatible:
+            opacity = kiui.op.inverse_sigmoid(opacity)
+            scales = torch.log(scales + 1e-8)
+            shs = (shs - 0.5) / 0.28209479177387814
+        xyzs = means3D.detach().cpu().numpy()
+        f_dc = shs.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy()
+        opacities = opacity.detach().cpu().numpy()
+        scales = scales.detach().cpu().numpy()
+        rotations = rotations.detach().cpu().numpy()
+        l = ['x', 'y', 'z']
+        # All channels except the 3 DC
+        for i in range(f_dc.shape[1]):
+            l.append('f_dc_{}'.format(i))
+        l.append('opacity')
+        for i in range(scales.shape[1]):
+            l.append('scale_{}'.format(i))
+        for i in range(rotations.shape[1]):
+            l.append('rot_{}'.format(i))
+        dtype_full = [(attribute, 'f4') for attribute in l]
+        elements = np.empty(xyzs.shape[0], dtype=dtype_full)
+        attributes = np.concatenate((xyzs, f_dc, opacities, scales, rotations), axis=1)
+        elements[:] = list(map(tuple, attributes))
+        el = PlyElement.describe(elements, 'vertex')
+        PlyData([el]).write(path)
+    def load_ply(self, path, compatible=True):
+        from plyfile import PlyData, PlyElement
+        plydata = PlyData.read(path)
+        xyz = np.stack((np.asarray(plydata.elements[0]["x"]),
+                        np.asarray(plydata.elements[0]["y"]),
+                        np.asarray(plydata.elements[0]["z"])),  axis=1)
+        print("Number of points at loading : ", xyz.shape[0])
+        opacities = np.asarray(plydata.elements[0]["opacity"])[..., np.newaxis]
+        shs = np.zeros((xyz.shape[0], 3))
+        shs[:, 0] = np.asarray(plydata.elements[0]["f_dc_0"])
+        shs[:, 1] = np.asarray(plydata.elements[0]["f_dc_1"])
+        shs[:, 2] = np.asarray(plydata.elements[0]["f_dc_2"])
+        scale_names = [p.name for p in plydata.elements[0].properties if p.name.startswith("scale_")]
+        scales = np.zeros((xyz.shape[0], len(scale_names)))
+        for idx, attr_name in enumerate(scale_names):
+            scales[:, idx] = np.asarray(plydata.elements[0][attr_name])
+        rot_names = [p.name for p in plydata.elements[0].properties if p.name.startswith("rot_")]
+        rots = np.zeros((xyz.shape[0], len(rot_names)))
+        for idx, attr_name in enumerate(rot_names):
+            rots[:, idx] = np.asarray(plydata.elements[0][attr_name])
+        gaussians = np.concatenate([xyz, opacities, scales, rots, shs], axis=1)
+        gaussians = torch.from_numpy(gaussians).float() # cpu
+        if compatible:
+            gaussians[..., 3:4] = torch.sigmoid(gaussians[..., 3:4])
+            gaussians[..., 4:7] = torch.exp(gaussians[..., 4:7])
+            gaussians[..., 11:] = 0.28209479177387814 * gaussians[..., 11:] + 0.5
+        return gaussians

core/models.py ADDED Viewed

	@@ -0,0 +1,174 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+import kiui
+from kiui.lpips import LPIPS
+from core.unet import UNet
+from core.options import Options
+from core.gs import GaussianRenderer
+class LGM(nn.Module):
+    def __init__(
+        self,
+        opt: Options,
+    ):
+        """Build the U-Net, the final 1x1 conv, the Gaussian renderer and the activations.
+
+        The LPIPS network is only created when ``opt.lambda_lpips > 0`` and is frozen.
+        """
+        super().__init__()
+        self.opt = opt
+        # unet: 9 input channels, 14 output channels (one 14-value Gaussian per output pixel)
+        self.unet = UNet(
+            9, 14,
+            down_channels=self.opt.down_channels,
+            down_attention=self.opt.down_attention,
+            mid_attention=self.opt.mid_attention,
+            up_channels=self.opt.up_channels,
+            up_attention=self.opt.up_attention,
+        )
+        # last conv
+        self.conv = nn.Conv2d(14, 14, kernel_size=1) # NOTE: maybe remove it if train again
+        # Gaussian Renderer
+        self.gs = GaussianRenderer(opt)
+        # activations, applied per attribute group in forward_gaussians
+        self.pos_act = lambda x: x.clamp(-1, 1)
+        self.scale_act = lambda x: 0.1 * F.softplus(x)
+        self.opacity_act = lambda x: torch.sigmoid(x)
+        self.rot_act = F.normalize
+        self.rgb_act = lambda x: 0.5 * torch.tanh(x) + 0.5 # NOTE: may use sigmoid if train again
+        # LPIPS loss
+        if self.opt.lambda_lpips > 0:
+            self.lpips_loss = LPIPS(net='vgg')
+            self.lpips_loss.requires_grad_(False)
+    def state_dict(self, **kwargs):
+        # remove lpips_loss
+        state_dict = super().state_dict(**kwargs)
+        for k in list(state_dict.keys()):
+            if 'lpips_loss' in k:
+                del state_dict[k]
+        return state_dict
+    def prepare_default_rays(self, device, elevation=0):
+        from kiui.cam import orbit_camera
+        from core.utils import get_rays
+        cam_poses = np.stack([
+            orbit_camera(elevation, 0, radius=self.opt.cam_radius),
+            orbit_camera(elevation, 90, radius=self.opt.cam_radius),
+            orbit_camera(elevation, 180, radius=self.opt.cam_radius),
+            orbit_camera(elevation, 270, radius=self.opt.cam_radius),
+        ], axis=0) # [4, 4, 4]
+        cam_poses = torch.from_numpy(cam_poses)
+        rays_embeddings = []
+        for i in range(cam_poses.shape[0]):
+            rays_o, rays_d = get_rays(cam_poses[i], self.opt.input_size, self.opt.input_size, self.opt.fovy) # [h, w, 3]
+            rays_plucker = torch.cat([torch.cross(rays_o, rays_d, dim=-1), rays_d], dim=-1) # [h, w, 6]
+            rays_embeddings.append(rays_plucker)
+            ## visualize rays for plotting figure
+            # kiui.vis.plot_image(rays_d * 0.5 + 0.5, save=True)
+        rays_embeddings = torch.stack(rays_embeddings, dim=0).permute(0, 3, 1, 2).contiguous().to(device) # [V, 6, h, w]
+        return rays_embeddings
+    def forward_gaussians(self, images):
+        # images: [B, 4, 9, H, W]
+        # return: Gaussians: [B, dim_t]
+        B, V, C, H, W = images.shape
+        images = images.view(B*V, C, H, W)
+        x = self.unet(images) # [B*4, 14, h, w]
+        x = self.conv(x) # [B*4, 14, h, w]
+        x = x.reshape(B, 4, 14, self.opt.splat_size, self.opt.splat_size)
+        ## visualize multi-view gaussian features for plotting figure
+        # tmp_alpha = self.opacity_act(x[0, :, 3:4])
+        # tmp_img_rgb = self.rgb_act(x[0, :, 11:]) * tmp_alpha + (1 - tmp_alpha)
+        # tmp_img_pos = self.pos_act(x[0, :, 0:3]) * 0.5 + 0.5
+        # kiui.vis.plot_image(tmp_img_rgb, save=True)
+        # kiui.vis.plot_image(tmp_img_pos, save=True)
+        x = x.permute(0, 1, 3, 4, 2).reshape(B, -1, 14)
+        pos = self.pos_act(x[..., 0:3]) # [B, N, 3]
+        opacity = self.opacity_act(x[..., 3:4])
+        scale = self.scale_act(x[..., 4:7])
+        rotation = self.rot_act(x[..., 7:11])
+        rgbs = self.rgb_act(x[..., 11:])
+        gaussians = torch.cat([pos, opacity, scale, rotation, rgbs], dim=-1) # [B, N, 14]
+        return gaussians
+    def forward(self, data, step_ratio=1):
+        # data: output of the dataloader
+        # return: loss
+        results = {}
+        loss = 0
+        images = data['input'] # [B, 4, 9, h, W], input features
+        # use the first view to predict gaussians
+        gaussians = self.forward_gaussians(images) # [B, N, 14]
+        results['gaussians'] = gaussians
+        # random bg for training
+        if self.training:
+            bg_color = torch.rand(3, dtype=torch.float32, device=gaussians.device)
+        else:
+            bg_color = torch.ones(3, dtype=torch.float32, device=gaussians.device)
+        # use the other views for rendering and supervision
+        results = self.gs.render(gaussians, data['cam_view'], data['cam_view_proj'], data['cam_pos'], bg_color=bg_color)
+        pred_images = results['image'] # [B, V, C, output_size, output_size]
+        pred_alphas = results['alpha'] # [B, V, 1, output_size, output_size]
+        results['images_pred'] = pred_images
+        results['alphas_pred'] = pred_alphas
+        gt_images = data['images_output'] # [B, V, 3, output_size, output_size], ground-truth novel views
+        gt_masks = data['masks_output'] # [B, V, 1, output_size, output_size], ground-truth masks
+        gt_images = gt_images * gt_masks + bg_color.view(1, 1, 3, 1, 1) * (1 - gt_masks)
+        loss_mse = F.mse_loss(pred_images, gt_images) + F.mse_loss(pred_alphas, gt_masks)
+        loss = loss + loss_mse
+        if self.opt.lambda_lpips > 0:
+            loss_lpips = self.lpips_loss(
+                # gt_images.view(-1, 3, self.opt.output_size, self.opt.output_size) * 2 - 1,
+                # pred_images.view(-1, 3, self.opt.output_size, self.opt.output_size) * 2 - 1,
+                # downsampled to at most 256 to reduce memory cost
+                F.interpolate(gt_images.view(-1, 3, self.opt.output_size, self.opt.output_size) * 2 - 1, (256, 256), mode='bilinear', align_corners=False),
+                F.interpolate(pred_images.view(-1, 3, self.opt.output_size, self.opt.output_size) * 2 - 1, (256, 256), mode='bilinear', align_corners=False),
+            ).mean()
+            results['loss_lpips'] = loss_lpips
+            loss = loss + self.opt.lambda_lpips * loss_lpips
+        results['loss'] = loss
+        # metric
+        with torch.no_grad():
+            psnr = -10 * torch.log10(torch.mean((pred_images.detach() - gt_images) ** 2))
+            results['psnr'] = psnr
+        return results

core/options.py ADDED Viewed

	@@ -0,0 +1,120 @@

+import tyro
+from dataclasses import dataclass
+from typing import Tuple, Literal, Dict, Optional
+@dataclass
+class Options:
+    """Model, dataset, training and testing settings, with their default values."""
+    ### model
+    # Unet image input size
+    input_size: int = 256
+    # Unet definition: channel width and attention flag per down / up block
+    down_channels: Tuple[int, ...] = (64, 128, 256, 512, 1024, 1024)
+    down_attention: Tuple[bool, ...] = (False, False, False, True, True, True)
+    mid_attention: bool = True
+    up_channels: Tuple[int, ...] = (1024, 1024, 512, 256)
+    up_attention: Tuple[bool, ...] = (True, True, True, False)
+    # Unet output size, dependent on the input_size and U-Net structure!
+    splat_size: int = 64
+    # gaussian render size
+    output_size: int = 256
+    ### dataset
+    # data mode (only 's3' is supported for now)
+    data_mode: Literal['s3'] = 's3'
+    # fovy of the dataset (passed to np.deg2rad by its consumers, i.e. degrees)
+    fovy: float = 49.1
+    # camera near plane
+    znear: float = 0.5
+    # camera far plane
+    zfar: float = 2.5
+    # number of all views (input + output)
+    num_views: int = 12
+    # number of input views
+    num_input_views: int = 4
+    # camera radius
+    cam_radius: float = 1.5 # to better use [-1, 1]^3 space
+    # num workers
+    num_workers: int = 8
+    ### training
+    # workspace
+    workspace: str = './workspace'
+    # resume
+    resume: Optional[str] = None
+    # batch size (per-GPU)
+    batch_size: int = 8
+    # gradient accumulation
+    gradient_accumulation_steps: int = 1
+    # training epochs
+    num_epochs: int = 30
+    # lpips loss weight (the LPIPS network is only built when this is > 0)
+    lambda_lpips: float = 1.0
+    # gradient clip
+    gradient_clip: float = 1.0
+    # mixed precision
+    mixed_precision: str = 'bf16'
+    # learning rate
+    lr: float = 4e-4
+    # augmentation prob for grid distortion
+    prob_grid_distortion: float = 0.5
+    # augmentation prob for camera jitter
+    prob_cam_jitter: float = 0.5
+    ### testing
+    # test image path
+    test_path: Optional[str] = None
+    ### misc
+    # nvdiffrast backend setting
+    force_cuda_rast: bool = False
+    # render fancy video with gaussian scaling effect
+    fancy_video: bool = False
+# all the default settings
+config_defaults: Dict[str, Options] = {}
+config_doc: Dict[str, str] = {}
+config_doc['lrm'] = 'the default settings for LGM'
+config_defaults['lrm'] = Options()
+config_doc['small'] = 'small model with lower resolution Gaussians'
+config_defaults['small'] = Options(
+    input_size=256,
+    splat_size=64,
+    output_size=256,
+    batch_size=8,
+    gradient_accumulation_steps=1,
+    mixed_precision='bf16',
+)
+config_doc['big'] = 'big model with higher resolution Gaussians'
+config_defaults['big'] = Options(
+    input_size=256,
+    up_channels=(1024, 1024, 512, 256, 128), # one more decoder
+    up_attention=(True, True, True, False, False),
+    splat_size=128,
+    output_size=512, # render & supervise Gaussians at a higher resolution.
+    batch_size=8,
+    num_views=8,
+    gradient_accumulation_steps=1,
+    mixed_precision='bf16',
+)
+config_doc['tiny'] = 'tiny model for ablation'
+config_defaults['tiny'] = Options(
+    input_size=256,
+    down_channels=(32, 64, 128, 256, 512),
+    down_attention=(False, False, False, False, True),
+    up_channels=(512, 256, 128),
+    up_attention=(True, False, False, False),
+    splat_size=64,
+    output_size=256,
+    batch_size=16,
+    num_views=8,
+    gradient_accumulation_steps=1,
+    mixed_precision='bf16',
+)
+AllConfigs = tyro.extras.subcommand_type_from_defaults(config_defaults, config_doc)

core/provider_objaverse.py ADDED Viewed

	@@ -0,0 +1,172 @@

+import os
+import cv2
+import random
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision.transforms.functional as TF
+from torch.utils.data import Dataset
+import kiui
+from core.options import Options
+from core.utils import get_rays, grid_distortion, orbit_camera_jitter
+# Per-channel mean and std used to normalize the input images (see TF.normalize in __getitem__).
+IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
+class ObjaverseDataset(Dataset):
+    def _warn(self):
+        raise NotImplementedError('this dataset is just an example and cannot be used directly, you should modify it to your own setting! (search keyword TODO)')
+    def __init__(self, opt: Options, training=True):
+        self.opt = opt
+        self.training = training
+        # TODO: remove this barrier
+        self._warn()
+        # TODO: load the list of objects for training
+        self.items = []
+        with open('TODO: file containing the list', 'r') as f:
+            for line in f.readlines():
+                self.items.append(line.strip())
+        # naive split
+        if self.training:
+            self.items = self.items[:-self.opt.batch_size]
+        else:
+            self.items = self.items[-self.opt.batch_size:]
+        # default camera intrinsics
+        self.tan_half_fov = np.tan(0.5 * np.deg2rad(self.opt.fovy))
+        self.proj_matrix = torch.zeros(4, 4, dtype=torch.float32)
+        self.proj_matrix[0, 0] = 1 / self.tan_half_fov
+        self.proj_matrix[1, 1] = 1 / self.tan_half_fov
+        self.proj_matrix[2, 2] = (self.opt.zfar + self.opt.znear) / (self.opt.zfar - self.opt.znear)
+        self.proj_matrix[3, 2] = - (self.opt.zfar * self.opt.znear) / (self.opt.zfar - self.opt.znear)
+        self.proj_matrix[2, 3] = 1
+    def __len__(self):
+        """Return the number of objects in the current split."""
+        return len(self.items)
+    def __getitem__(self, idx):
+        """Load one object and build the network input plus the rendering supervision.
+
+        Returns a dict with:
+            'input': [V_in, 9, H, W], normalized RGB concatenated with Plucker ray embeddings
+            'images_output': [V, 3, output_size, output_size], in [0, 1] on a white background
+            'masks_output': [V, 1, output_size, output_size]
+            'cam_view', 'cam_view_proj': [V, 4, 4]
+            'cam_pos': [V, 3]
+
+        NOTE(review): ``self.client`` is never assigned in this class; it must be provided
+        before this method can load anything.
+        NOTE(review): if no view loads at all, ``images`` stays empty and the padding
+        below fails on ``images[-1]``.
+        """
+        uid = self.items[idx]
+        results = {}
+        # load num_views images
+        images = []
+        masks = []
+        cam_poses = []
+        vid_cnt = 0
+        # TODO: choose views, based on your rendering settings
+        if self.training:
+            # input views are drawn from [36, 72], other views are randomly selected
+            vids = np.random.permutation(np.arange(36, 73))[:self.opt.num_input_views].tolist() + np.random.permutation(100).tolist()
+        else:
+            # fixed views
+            vids = np.arange(36, 73, 4).tolist() + np.arange(100).tolist()
+        for vid in vids:
+            image_path = os.path.join(uid, 'rgb', f'{vid:03d}.png')
+            camera_path = os.path.join(uid, 'pose', f'{vid:03d}.txt')
+            try:
+                # TODO: load data (modify self.client here)
+                image = np.frombuffer(self.client.get(image_path), np.uint8)
+                image = torch.from_numpy(cv2.imdecode(image, cv2.IMREAD_UNCHANGED).astype(np.float32) / 255) # [512, 512, 4] in [0, 1]
+                c2w = [float(t) for t in self.client.get(camera_path).decode().strip().split(' ')]
+                c2w = torch.tensor(c2w, dtype=torch.float32).reshape(4, 4)
+            except Exception as e:
+                # Best effort: a view that fails to load is skipped silently.
+                # print(f'[WARN] dataset {uid} {vid}: {e}')
+                continue
+            # TODO: you may have a different camera system
+            # blender world + opencv cam --> opengl world & cam
+            c2w[1] *= -1
+            c2w[[1, 2]] = c2w[[2, 1]]
+            c2w[:3, 1:3] *= -1 # invert up and forward direction
+            # scale up radius to fully use the [-1, 1]^3 space!
+            c2w[:3, 3] *= self.opt.cam_radius / 1.5 # 1.5 is the default scale
+            image = image.permute(2, 0, 1) # [4, 512, 512]
+            mask = image[3:4] # [1, 512, 512]
+            image = image[:3] * mask + (1 - mask) # [3, 512, 512], to white bg
+            image = image[[2,1,0]].contiguous() # bgr to rgb
+            images.append(image)
+            masks.append(mask.squeeze(0))
+            cam_poses.append(c2w)
+            vid_cnt += 1
+            if vid_cnt == self.opt.num_views:
+                break
+        if vid_cnt < self.opt.num_views:
+            # Pad by repeating the last valid view so the batch shape stays fixed.
+            print(f'[WARN] dataset {uid}: not enough valid views, only {vid_cnt} views found!')
+            n = self.opt.num_views - vid_cnt
+            images = images + [images[-1]] * n
+            masks = masks + [masks[-1]] * n
+            cam_poses = cam_poses + [cam_poses[-1]] * n
+        images = torch.stack(images, dim=0) # [V, C, H, W]
+        masks = torch.stack(masks, dim=0) # [V, H, W]
+        cam_poses = torch.stack(cam_poses, dim=0) # [V, 4, 4]
+        # normalized camera feats as in paper (transform the first pose to a fixed position)
+        transform = torch.tensor([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, self.opt.cam_radius], [0, 0, 0, 1]], dtype=torch.float32) @ torch.inverse(cam_poses[0])
+        cam_poses = transform.unsqueeze(0) @ cam_poses  # [V, 4, 4]
+        images_input = F.interpolate(images[:self.opt.num_input_views].clone(), size=(self.opt.input_size, self.opt.input_size), mode='bilinear', align_corners=False) # [V, C, H, W]
+        cam_poses_input = cam_poses[:self.opt.num_input_views].clone()
+        # data augmentation (the first input view is never augmented)
+        if self.training:
+            # apply random grid distortion to simulate 3D inconsistency
+            if random.random() < self.opt.prob_grid_distortion:
+                images_input[1:] = grid_distortion(images_input[1:])
+            # apply camera jittering (only to input!)
+            if random.random() < self.opt.prob_cam_jitter:
+                cam_poses_input[1:] = orbit_camera_jitter(cam_poses_input[1:])
+        images_input = TF.normalize(images_input, IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD)
+        # resize render ground-truth images, range still in [0, 1]
+        results['images_output'] = F.interpolate(images, size=(self.opt.output_size, self.opt.output_size), mode='bilinear', align_corners=False) # [V, C, output_size, output_size]
+        results['masks_output'] = F.interpolate(masks.unsqueeze(1), size=(self.opt.output_size, self.opt.output_size), mode='bilinear', align_corners=False) # [V, 1, output_size, output_size]
+        # build rays for input views
+        rays_embeddings = []
+        for i in range(self.opt.num_input_views):
+            rays_o, rays_d = get_rays(cam_poses_input[i], self.opt.input_size, self.opt.input_size, self.opt.fovy) # [h, w, 3]
+            rays_plucker = torch.cat([torch.cross(rays_o, rays_d, dim=-1), rays_d], dim=-1) # [h, w, 6]
+            rays_embeddings.append(rays_plucker)
+        rays_embeddings = torch.stack(rays_embeddings, dim=0).permute(0, 3, 1, 2).contiguous() # [V, 6, h, w]
+        final_input = torch.cat([images_input, rays_embeddings], dim=1) # [V=4, 9, H, W]
+        results['input'] = final_input
+        # opengl to colmap camera for gaussian renderer
+        cam_poses[:, :3, 1:3] *= -1 # invert up & forward direction
+        # cameras needed by gaussian rasterizer
+        cam_view = torch.inverse(cam_poses).transpose(1, 2) # [V, 4, 4]
+        cam_view_proj = cam_view @ self.proj_matrix # [V, 4, 4]
+        cam_pos = - cam_poses[:, :3, 3] # [V, 3]
+        results['cam_view'] = cam_view
+        results['cam_view_proj'] = cam_view_proj
+        results['cam_pos'] = cam_pos
+        return results

core/unet.py ADDED Viewed

	@@ -0,0 +1,319 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from typing import Tuple, Literal
+from functools import partial
+from core.attention import MemEffAttention
+class MVAttention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        proj_bias: bool = True,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+        groups: int = 32,
+        eps: float = 1e-5,
+        residual: bool = True,
+        skip_scale: float = 1,
+        num_frames: int = 4, # WARN: hardcoded!
+    ):
+        super().__init__()
+        self.residual = residual
+        self.skip_scale = skip_scale
+        self.num_frames = num_frames
+        self.norm = nn.GroupNorm(num_groups=groups, num_channels=dim, eps=eps, affine=True)
+        self.attn = MemEffAttention(dim, num_heads, qkv_bias, proj_bias, attn_drop, proj_drop)
+    def forward(self, x):
+        # x: [B*V, C, H, W]
+        BV, C, H, W = x.shape
+        B = BV // self.num_frames # assert BV % self.num_frames == 0
+        res = x
+        x = self.norm(x)
+        x = x.reshape(B, self.num_frames, C, H, W).permute(0, 1, 3, 4, 2).reshape(B, -1, C)
+        x = self.attn(x)
+        x = x.reshape(B, self.num_frames, H, W, C).permute(0, 1, 4, 2, 3).reshape(BV, C, H, W)
+        if self.residual:
+            x = (x + res) * self.skip_scale
+        return x
+class ResnetBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        resample: Literal['default', 'up', 'down'] = 'default',
+        groups: int = 32,
+        eps: float = 1e-5,
+        skip_scale: float = 1, # multiplied to output
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.skip_scale = skip_scale
+        self.norm1 = nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
+        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        self.norm2 = nn.GroupNorm(num_groups=groups, num_channels=out_channels, eps=eps, affine=True)
+        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        self.act = F.silu
+        self.resample = None
+        if resample == 'up':
+            self.resample = partial(F.interpolate, scale_factor=2.0, mode="nearest")
+        elif resample == 'down':
+            self.resample = nn.AvgPool2d(kernel_size=2, stride=2)
+        self.shortcut = nn.Identity()
+        if self.in_channels != self.out_channels:
+            self.shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=True)
+    def forward(self, x):
+        res = x
+        x = self.norm1(x)
+        x = self.act(x)
+        if self.resample:
+            res = self.resample(res)
+            x = self.resample(x)
+        x = self.conv1(x)
+        x = self.norm2(x)
+        x = self.act(x)
+        x = self.conv2(x)
+        x = (x + self.shortcut(res)) * self.skip_scale
+        return x
+class DownBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        num_layers: int = 1,
+        downsample: bool = True,
+        attention: bool = True,
+        attention_heads: int = 16,
+        skip_scale: float = 1,
+    ):
+        super().__init__()
+        nets = []
+        attns = []
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            nets.append(ResnetBlock(in_channels, out_channels, skip_scale=skip_scale))
+            if attention:
+                attns.append(MVAttention(out_channels, attention_heads, skip_scale=skip_scale))
+            else:
+                attns.append(None)
+        self.nets = nn.ModuleList(nets)
+        self.attns = nn.ModuleList(attns)
+        self.downsample = None
+        if downsample:
+            self.downsample = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=2, padding=1)
+    def forward(self, x):
+        xs = []
+        for attn, net in zip(self.attns, self.nets):
+            x = net(x)
+            if attn:
+                x = attn(x)
+            xs.append(x)
+        if self.downsample:
+            x = self.downsample(x)
+            xs.append(x)
+        return x, xs
+class MidBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        num_layers: int = 1,
+        attention: bool = True,
+        attention_heads: int = 16,
+        skip_scale: float = 1,
+    ):
+        super().__init__()
+        nets = []
+        attns = []
+        # first layer
+        nets.append(ResnetBlock(in_channels, in_channels, skip_scale=skip_scale))
+        # more layers
+        for i in range(num_layers):
+            nets.append(ResnetBlock(in_channels, in_channels, skip_scale=skip_scale))
+            if attention:
+                attns.append(MVAttention(in_channels, attention_heads, skip_scale=skip_scale))
+            else:
+                attns.append(None)
+        self.nets = nn.ModuleList(nets)
+        self.attns = nn.ModuleList(attns)
+    def forward(self, x):
+        x = self.nets[0](x)
+        for attn, net in zip(self.attns, self.nets[1:]):
+            if attn:
+                x = attn(x)
+            x = net(x)
+        return x
+class UpBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_out_channels: int,
+        out_channels: int,
+        num_layers: int = 1,
+        upsample: bool = True,
+        attention: bool = True,
+        attention_heads: int = 16,
+        skip_scale: float = 1,
+    ):
+        super().__init__()
+        nets = []
+        attns = []
+        for i in range(num_layers):
+            cin = in_channels if i == 0 else out_channels
+            cskip = prev_out_channels if (i == num_layers - 1) else out_channels
+            nets.append(ResnetBlock(cin + cskip, out_channels, skip_scale=skip_scale))
+            if attention:
+                attns.append(MVAttention(out_channels, attention_heads, skip_scale=skip_scale))
+            else:
+                attns.append(None)
+        self.nets = nn.ModuleList(nets)
+        self.attns = nn.ModuleList(attns)
+        self.upsample = None
+        if upsample:
+            self.upsample = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+    def forward(self, x, xs):
+        for attn, net in zip(self.attns, self.nets):
+            res_x = xs[-1]
+            xs = xs[:-1]
+            x = torch.cat([x, res_x], dim=1)
+            x = net(x)
+            if attn:
+                x = attn(x)
+        if self.upsample:
+            x = F.interpolate(x, scale_factor=2.0, mode='nearest')
+            x = self.upsample(x)
+        return x
+# it could be asymmetric!
+class UNet(nn.Module):
+    def __init__(
+        self,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        down_channels: Tuple[int, ...] = (64, 128, 256, 512, 1024),
+        down_attention: Tuple[bool, ...] = (False, False, False, True, True),
+        mid_attention: bool = True,
+        up_channels: Tuple[int, ...] = (1024, 512, 256),
+        up_attention: Tuple[bool, ...] = (True, True, False),
+        layers_per_block: int = 2,
+        skip_scale: float = np.sqrt(0.5),
+    ):
+        super().__init__()
+        # first
+        self.conv_in = nn.Conv2d(in_channels, down_channels[0], kernel_size=3, stride=1, padding=1)
+        # down
+        down_blocks = []
+        cout = down_channels[0]
+        for i in range(len(down_channels)):
+            cin = cout
+            cout = down_channels[i]
+            down_blocks.append(DownBlock(
+                cin, cout,
+                num_layers=layers_per_block,
+                downsample=(i != len(down_channels) - 1), # not final layer
+                attention=down_attention[i],
+                skip_scale=skip_scale,
+            ))
+        self.down_blocks = nn.ModuleList(down_blocks)
+        # mid
+        self.mid_block = MidBlock(down_channels[-1], attention=mid_attention, skip_scale=skip_scale)
+        # up
+        up_blocks = []
+        cout = up_channels[0]
+        for i in range(len(up_channels)):
+            cin = cout
+            cout = up_channels[i]
+            cskip = down_channels[max(-2 - i, -len(down_channels))] # for assymetric
+            up_blocks.append(UpBlock(
+                cin, cskip, cout,
+                num_layers=layers_per_block + 1, # one more layer for up
+                upsample=(i != len(up_channels) - 1), # not final layer
+                attention=up_attention[i],
+                skip_scale=skip_scale,
+            ))
+        self.up_blocks = nn.ModuleList(up_blocks)
+        # last
+        self.norm_out = nn.GroupNorm(num_channels=up_channels[-1], num_groups=32, eps=1e-5)
+        self.conv_out = nn.Conv2d(up_channels[-1], out_channels, kernel_size=3, stride=1, padding=1)
+    def forward(self, x):
+        # x: [B, Cin, H, W]
+        # first
+        x = self.conv_in(x)
+        # down
+        xss = [x]
+        for block in self.down_blocks:
+            x, xs = block(x)
+            xss.extend(xs)
+        # mid
+        x = self.mid_block(x)
+        # up
+        for block in self.up_blocks:
+            xs = xss[-len(block.nets):]
+            xss = xss[:-len(block.nets)]
+            x = block(x, xs)
+        # last
+        x = self.norm_out(x)
+        x = F.silu(x)
+        x = self.conv_out(x) # [B, Cout, H', W']
+        return x

core/utils.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import roma
+from kiui.op import safe_normalize
+def get_rays(pose, h, w, fovy, opengl=True):
+    x, y = torch.meshgrid(
+        torch.arange(w, device=pose.device),
+        torch.arange(h, device=pose.device),
+        indexing="xy",
+    )
+    x = x.flatten()
+    y = y.flatten()
+    cx = w * 0.5
+    cy = h * 0.5
+    focal = h * 0.5 / np.tan(0.5 * np.deg2rad(fovy))
+    camera_dirs = F.pad(
+        torch.stack(
+            [
+                (x - cx + 0.5) / focal,
+                (y - cy + 0.5) / focal * (-1.0 if opengl else 1.0),
+            ],
+            dim=-1,
+        ),
+        (0, 1),
+        value=(-1.0 if opengl else 1.0),
+    )  # [hw, 3]
+    rays_d = camera_dirs @ pose[:3, :3].transpose(0, 1)  # [hw, 3]
+    rays_o = pose[:3, 3].unsqueeze(0).expand_as(rays_d) # [hw, 3]
+    rays_o = rays_o.view(h, w, 3)
+    rays_d = safe_normalize(rays_d).view(h, w, 3)
+    return rays_o, rays_d
+def orbit_camera_jitter(poses, strength=0.1):
+    # poses: [B, 4, 4], assume orbit camera in opengl format
+    # random orbital rotate
+    B = poses.shape[0]
+    rotvec_x = poses[:, :3, 1] * strength * np.pi * (torch.rand(B, 1, device=poses.device) * 2 - 1)
+    rotvec_y = poses[:, :3, 0] * strength * np.pi / 2 * (torch.rand(B, 1, device=poses.device) * 2 - 1)
+    rot = roma.rotvec_to_rotmat(rotvec_x) @ roma.rotvec_to_rotmat(rotvec_y)
+    R = rot @ poses[:, :3, :3]
+    T = rot @ poses[:, :3, 3:]
+    new_poses = poses.clone()
+    new_poses[:, :3, :3] = R
+    new_poses[:, :3, 3:] = T
+    return new_poses
+def grid_distortion(images, strength=0.5):
+    # images: [B, C, H, W]
+    # num_steps: int, grid resolution for distortion
+    # strength: float in [0, 1], strength of distortion
+    B, C, H, W = images.shape
+    num_steps = np.random.randint(8, 17)
+    grid_steps = torch.linspace(-1, 1, num_steps)
+    # have to loop batch...
+    grids = []
+    for b in range(B):
+        # construct displacement
+        x_steps = torch.linspace(0, 1, num_steps) # [num_steps], inclusive
+        x_steps = (x_steps + strength * (torch.rand_like(x_steps) - 0.5) / (num_steps - 1)).clamp(0, 1) # perturb
+        x_steps = (x_steps * W).long() # [num_steps]
+        x_steps[0] = 0
+        x_steps[-1] = W
+        xs = []
+        for i in range(num_steps - 1):
+            xs.append(torch.linspace(grid_steps[i], grid_steps[i + 1], x_steps[i + 1] - x_steps[i]))
+        xs = torch.cat(xs, dim=0) # [W]
+        y_steps = torch.linspace(0, 1, num_steps) # [num_steps], inclusive
+        y_steps = (y_steps + strength * (torch.rand_like(y_steps) - 0.5) / (num_steps - 1)).clamp(0, 1) # perturb
+        y_steps = (y_steps * H).long() # [num_steps]
+        y_steps[0] = 0
+        y_steps[-1] = H
+        ys = []
+        for i in range(num_steps - 1):
+            ys.append(torch.linspace(grid_steps[i], grid_steps[i + 1], y_steps[i + 1] - y_steps[i]))
+        ys = torch.cat(ys, dim=0) # [H]
+        # construct grid
+        grid_x, grid_y = torch.meshgrid(xs, ys, indexing='xy') # [H, W]
+        grid = torch.stack([grid_x, grid_y], dim=-1) # [H, W, 2]
+        grids.append(grid)
+    grids = torch.stack(grids, dim=0).to(images.device) # [B, H, W, 2]
+    # grid sample
+    images = F.grid_sample(images, grids, align_corners=False)
+    return images

data_test/catstatue.ply ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:57dc6f5902301d7577c53a73ce4c9d1bbff2fca86bf93d015b6cdfa1d3de9b18
+size 2390497