Commit a342aa8 · Initial commit (no parent)
Co-authored-by: Aaryaman Vasishta <aaryaman.vasishta@stability.ai>
- .gitattributes +1 -0
- .gitignore +50 -0
- .gitmodules +3 -0
- LICENSE +124 -0
- README.md +17 -0
- assets/advance/backyard-7_0.jpg +3 -0
- assets/advance/backyard-7_1.jpg +3 -0
- assets/advance/backyard-7_2.jpg +3 -0
- assets/advance/backyard-7_3.jpg +3 -0
- assets/advance/backyard-7_4.jpg +3 -0
- assets/advance/backyard-7_5.jpg +3 -0
- assets/advance/backyard-7_6.jpg +3 -0
- assets/advance/blue-car.jpg +1 -0
- assets/advance/garden-4_0.jpg +3 -0
- assets/advance/garden-4_1.jpg +3 -0
- assets/advance/garden-4_2.jpg +3 -0
- assets/advance/garden-4_3.jpg +3 -0
- assets/advance/telebooth-2_0.jpg +3 -0
- assets/advance/telebooth-2_1.jpg +3 -0
- assets/advance/vgg-lab-4_0.png +3 -0
- assets/advance/vgg-lab-4_1.png +3 -0
- assets/advance/vgg-lab-4_2.png +3 -0
- assets/advance/vgg-lab-4_3.png +3 -0
- assets/basic/blue-car.jpg +3 -0
- assets/basic/hilly-countryside.jpg +3 -0
- assets/basic/lily-dragon.png +3 -0
- assets/basic/llff-room.jpg +3 -0
- assets/basic/mountain-lake.jpg +3 -0
- assets/basic/vasedeck.jpg +3 -0
- assets/basic/vgg-lab-4_0.png +1 -0
- demo_gr.py +1238 -0
- requirements.txt +35 -0
- seva/__init__.py +0 -0
- seva/data_io.py +553 -0
- seva/eval.py +1988 -0
- seva/geometry.py +811 -0
- seva/gui.py +975 -0
- seva/model.py +234 -0
- seva/modules/__init__.py +0 -0
- seva/modules/autoencoder.py +51 -0
- seva/modules/conditioner.py +39 -0
- seva/modules/layers.py +140 -0
- seva/modules/preprocessor.py +116 -0
- seva/modules/transformer.py +247 -0
- seva/sampling.py +405 -0
- seva/utils.py +56 -0
- third_party/dust3r +1 -0
.gitattributes
ADDED
@@ -0,0 +1 @@
assets/** filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,50 @@
.envrc
.venv/
.gradio/
work_dirs*
logs*
pull_changes.sh

# Byte-compiled files
__pycache__/
*.py[cod]

# Virtual environments
env/
venv/
ENV/
.VENV/

# Distribution files
build/
dist/
*.egg-info/

# Logs and temporary files
*.log
*.tmp
*.bak
*.swp

# IDE files
.idea/
.vscode/
*.sublime-workspace
*.sublime-project

# OS files
.DS_Store
Thumbs.db

# Testing and coverage
htmlcov/
.coverage
*.cover
*.py,cover
.cache/

# Jupyter Notebook checkpoints
.ipynb_checkpoints/

# Pre-commit hooks
.pre-commit-config.yaml~
.gitmodules
ADDED
@@ -0,0 +1,3 @@
[submodule "third_party/dust3r"]
	path = third_party/dust3r
	url = https://github.com/jensenstability/dust3r
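
The `third_party/dust3r` submodule powers the `Advanced` demo mode: `demo_gr.py` below wraps it behind `seva.modules.preprocessor.Dust3rPipeline` to estimate cameras and a point cloud from the input images. As a rough usage sketch based only on the calls visible in `demo_gr.py` (the image paths and the CUDA device string are assumptions, not part of this commit):

```python
# Sketch only: mirrors the Dust3rPipeline calls that appear in demo_gr.py below.
# The device string and image paths are assumptions, not part of this commit.
from seva.modules.preprocessor import Dust3rPipeline

pipeline = Dust3rPipeline(device="cuda:0")
# Returns per-image data: images, intrinsics (Ks), camera-to-world poses (c2ws),
# 3D points, and per-point colors.
imgs, Ks, c2ws, points, point_colors = pipeline.infer_cameras_and_points(
    ["assets/advance/garden-4_0.jpg", "assets/advance/garden-4_1.jpg"]
)
```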
LICENSE
ADDED
@@ -0,0 +1,124 @@
Stability AI Non-Commercial License Agreement
Last Updated: February 20, 2025

I. INTRODUCTION

This Stability AI Non-Commercial License Agreement (the "Agreement") applies to any individual person or entity ("You", "Your" or "Licensee") that uses or distributes any portion or element of the Stability AI Materials or Derivative Works thereof for any Research & Non-Commercial use. Capitalized terms not otherwise defined herein are defined in Section IV below.

This Agreement is intended to allow research and non-commercial uses of the Model free of charge.

By clicking "I Accept" or by using or distributing any portion or element of the Stability AI Materials or Derivative Works, You agree that You have read, understood and are bound by the terms of this Agreement.

If You are acting on behalf of a company, organization, or other entity, then "You" includes you and that entity, and You agree that You:
(i) are an authorized representative of such entity with the authority to bind such entity to this Agreement, and
(ii) You agree to the terms of this Agreement on that entity's behalf.

---

II. RESEARCH & NON-COMMERCIAL USE LICENSE

Subject to the terms of this Agreement, Stability AI grants You a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, and royalty-free limited license under Stability AI's intellectual property or other rights owned by Stability AI embodied in the Stability AI Materials to use, reproduce, distribute, and create Derivative Works of, and make modifications to, the Stability AI Materials for any Research or Non-Commercial Purpose.

- **"Research Purpose"** means academic or scientific advancement, and in each case, is not primarily intended for commercial advantage or monetary compensation to You or others.
- **"Non-Commercial Purpose"** means any purpose other than a Research Purpose that is not primarily intended for commercial advantage or monetary compensation to You or others, such as personal use (i.e., hobbyist) or evaluation and testing.

---

III. GENERAL TERMS

Your Research or Non-Commercial license under this Agreement is subject to the following terms.

### a. Distribution & Attribution
If You distribute or make available the Stability AI Materials or a Derivative Work to a third party, or a product or service that uses any portion of them, You shall:
1. Provide a copy of this Agreement to that third party.
2. Retain the following attribution notice within a **"Notice"** text file distributed as a part of such copies:

   **"This Stability AI Model is licensed under the Stability AI Non-Commercial License, Copyright © Stability AI Ltd. All Rights Reserved."**

3. Prominently display **"Powered by Stability AI"** on a related website, user interface, blog post, about page, or product documentation.
4. If You create a Derivative Work, You may add your own attribution notice(s) to the **"Notice"** text file included with that Derivative Work, provided that You clearly indicate which attributions apply to the Stability AI Materials and state in the **"Notice"** text file that You changed the Stability AI Materials and how it was modified.

### b. Use Restrictions
Your use of the Stability AI Materials and Derivative Works, including any output or results of the Stability AI Materials or Derivative Works, must comply with applicable laws and regulations (including Trade Control Laws and equivalent regulations) and adhere to the Documentation and Stability AI's AUP, which is hereby incorporated by reference.

Furthermore, You will not use the Stability AI Materials or Derivative Works, or any output or results of the Stability AI Materials or Derivative Works, to create or improve any foundational generative AI model (excluding the Model or Derivative Works).

### c. Intellectual Property

#### (i) Trademark License
No trademark licenses are granted under this Agreement, and in connection with the Stability AI Materials or Derivative Works, You may not use any name or mark owned by or associated with Stability AI or any of its Affiliates, except as required under Section IV(a) herein.

#### (ii) Ownership of Derivative Works
As between You and Stability AI, You are the owner of Derivative Works You create, subject to Stability AI's ownership of the Stability AI Materials and any Derivative Works made by or for Stability AI.

#### (iii) Ownership of Outputs
As between You and Stability AI, You own any outputs generated from the Model or Derivative Works to the extent permitted by applicable law.

#### (iv) Disputes
If You or Your Affiliate(s) institute litigation or other proceedings against Stability AI (including a cross-claim or counterclaim in a lawsuit) alleging that the Stability AI Materials, Derivative Works, or associated outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by You, then any licenses granted to You under this Agreement shall terminate as of the date such litigation or claim is filed or instituted.

You will indemnify and hold harmless Stability AI from and against any claim by any third party arising out of or related to Your use or distribution of the Stability AI Materials or Derivative Works in violation of this Agreement.

#### (v) Feedback
From time to time, You may provide Stability AI with verbal and/or written suggestions, comments, or other feedback related to Stability AI's existing or prospective technology, products, or services (collectively, "Feedback").

You are not obligated to provide Stability AI with Feedback, but to the extent that You do, You hereby grant Stability AI a **perpetual, irrevocable, royalty-free, fully-paid, sub-licensable, transferable, non-exclusive, worldwide right and license** to exploit the Feedback in any manner without restriction.

Your Feedback is provided **"AS IS"** and You make no warranties whatsoever about any Feedback.

---

IV. DEFINITIONS

- **"Affiliate(s)"** means any entity that directly or indirectly controls, is controlled by, or is under common control with the subject entity. For purposes of this definition, "control" means direct or indirect ownership or control of more than 50% of the voting interests of the subject entity.
- **"AUP"** means the Stability AI Acceptable Use Policy available at https://stability.ai/use-policy, as may be updated from time to time.
- **"Derivative Work(s)"** means:
  (a) Any derivative work of the Stability AI Materials as recognized by U.S. copyright laws.
  (b) Any modifications to a Model, and any other model created which is based on or derived from the Model or the Model's output, including **fine-tune** and **low-rank adaptation** models derived from a Model or a Model's output, but does not include the output of any Model.
- **"Model"** means Stability AI's Stable Virtual Camera model.
- **"Stability AI" or "we"** means Stability AI Ltd. and its Affiliates.
- **"Software"** means Stability AI's proprietary software made available under this Agreement now or in the future.
- **"Stability AI Materials"** means, collectively, Stability's proprietary Model, Software, and Documentation (and any portion or combination thereof) made available under this Agreement.
- **"Trade Control Laws"** means any applicable U.S. and non-U.S. export control and trade sanctions laws and regulations.
README.md
ADDED
@@ -0,0 +1,17 @@
---
title: Stable Virtual Camera
emoji: ⚡
colorFrom: yellow
colorTo: yellow
sdk: gradio
sdk_version: 5.17.0
app_file: demo_gr.py
pinned: false
---

- **Project Page**: [https://stable-virtual-camera.github.io/](https://stable-virtual-camera.github.io/)
- **Paper**: [https://stable-virtual-camera.github.io/assets/paper.pdf](https://stable-virtual-camera.github.io/assets/paper.pdf)
- **Blog**: [https://stability.ai/news/introducing-stable-virtual-camera-multi-view-video-generation-with-3d-camera-control](https://stability.ai/news/introducing-stable-virtual-camera-multi-view-video-generation-with-3d-camera-control)
- **Code**: [https://github.com/Stability-AI/stable-virtual-camera](https://github.com/Stability-AI/stable-virtual-camera)
- **Model Card**: [https://huggingface.co/stabilityai/stable-virtual-camera](https://huggingface.co/stabilityai/stable-virtual-camera)
- **Video**: [https://www.youtube.com/channel/UCLLlVDcS7nNenT_zzO3OPxQ](https://www.youtube.com/channel/UCLLlVDcS7nNenT_zzO3OPxQ)
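
The YAML front matter above is the Hugging Face Spaces configuration: it selects the Gradio SDK (pinned at 5.17.0) and runs `demo_gr.py` as the app entry point. For reference, a minimal local-run sketch, with assumptions that are not part of this commit (packages from `requirements.txt` installed, a CUDA GPU available since the demo hardcodes `cuda:0`, and an arbitrary port):

```python
# Local-run sketch. Assumptions (not in this commit): requirements installed,
# a CUDA GPU present, and port 7860 chosen arbitrarily.
from demo_gr import main  # importing demo_gr loads the models at module scope

if __name__ == "__main__":
    # main(server_port, share) is defined near the end of demo_gr.py below.
    main(server_port=7860, share=False)
```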
assets/advance/backyard-7_0.jpg: ADDED (Git LFS)
assets/advance/backyard-7_1.jpg: ADDED (Git LFS)
assets/advance/backyard-7_2.jpg: ADDED (Git LFS)
assets/advance/backyard-7_3.jpg: ADDED (Git LFS)
assets/advance/backyard-7_4.jpg: ADDED (Git LFS)
assets/advance/backyard-7_5.jpg: ADDED (Git LFS)
assets/advance/backyard-7_6.jpg: ADDED (Git LFS)
assets/advance/blue-car.jpg: ADDED
assets/advance/garden-4_0.jpg: ADDED (Git LFS)
assets/advance/garden-4_1.jpg: ADDED (Git LFS)
assets/advance/garden-4_2.jpg: ADDED (Git LFS)
assets/advance/garden-4_3.jpg: ADDED (Git LFS)
assets/advance/telebooth-2_0.jpg: ADDED (Git LFS)
assets/advance/telebooth-2_1.jpg: ADDED (Git LFS)
assets/advance/vgg-lab-4_0.png: ADDED (Git LFS)
assets/advance/vgg-lab-4_1.png: ADDED (Git LFS)
assets/advance/vgg-lab-4_2.png: ADDED (Git LFS)
assets/advance/vgg-lab-4_3.png: ADDED (Git LFS)
assets/basic/blue-car.jpg: ADDED (Git LFS)
assets/basic/hilly-countryside.jpg: ADDED (Git LFS)
assets/basic/lily-dragon.png: ADDED (Git LFS)
assets/basic/llff-room.jpg: ADDED (Git LFS)
assets/basic/mountain-lake.jpg: ADDED (Git LFS)
assets/basic/vasedeck.jpg: ADDED (Git LFS)
assets/basic/vgg-lab-4_0.png: ADDED
demo_gr.py
ADDED
|
@@ -0,0 +1,1238 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import copy
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import os.path as osp
|
| 5 |
+
import queue
|
| 6 |
+
import secrets
|
| 7 |
+
import threading
|
| 8 |
+
import time
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
from glob import glob
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import Literal
|
| 13 |
+
|
| 14 |
+
import gradio as gr
|
| 15 |
+
import httpx
|
| 16 |
+
import imageio.v3 as iio
|
| 17 |
+
import numpy as np
|
| 18 |
+
import torch
|
| 19 |
+
import torch.nn.functional as F
|
| 20 |
+
import tyro
|
| 21 |
+
import viser
|
| 22 |
+
import viser.transforms as vt
|
| 23 |
+
from einops import rearrange
|
| 24 |
+
from gradio import networking
|
| 25 |
+
from gradio.context import LocalContext
|
| 26 |
+
from gradio.tunneling import CERTIFICATE_PATH, Tunnel
|
| 27 |
+
|
| 28 |
+
from seva.eval import (
|
| 29 |
+
IS_TORCH_NIGHTLY,
|
| 30 |
+
chunk_input_and_test,
|
| 31 |
+
create_transforms_simple,
|
| 32 |
+
infer_prior_stats,
|
| 33 |
+
run_one_scene,
|
| 34 |
+
transform_img_and_K,
|
| 35 |
+
)
|
| 36 |
+
from seva.geometry import (
|
| 37 |
+
DEFAULT_FOV_RAD,
|
| 38 |
+
get_default_intrinsics,
|
| 39 |
+
get_preset_pose_fov,
|
| 40 |
+
normalize_scene,
|
| 41 |
+
)
|
| 42 |
+
from seva.gui import define_gui
|
| 43 |
+
from seva.model import SGMWrapper
|
| 44 |
+
from seva.modules.autoencoder import AutoEncoder
|
| 45 |
+
from seva.modules.conditioner import CLIPConditioner
|
| 46 |
+
from seva.modules.preprocessor import Dust3rPipeline
|
| 47 |
+
from seva.sampling import DDPMDiscretization, DiscreteDenoiser
|
| 48 |
+
from seva.utils import load_model
|
| 49 |
+
|
| 50 |
+
device = "cuda:0"
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# Constants.
|
| 54 |
+
WORK_DIR = "work_dirs/demo_gr"
|
| 55 |
+
MAX_SESSIONS = 1
|
| 56 |
+
ADVANCE_EXAMPLE_MAP = [
|
| 57 |
+
(
|
| 58 |
+
"assets/advance/blue-car.jpg",
|
| 59 |
+
["assets/advance/blue-car.jpg"],
|
| 60 |
+
),
|
| 61 |
+
(
|
| 62 |
+
"assets/advance/garden-4_0.jpg",
|
| 63 |
+
[
|
| 64 |
+
"assets/advance/garden-4_0.jpg",
|
| 65 |
+
"assets/advance/garden-4_1.jpg",
|
| 66 |
+
"assets/advance/garden-4_2.jpg",
|
| 67 |
+
"assets/advance/garden-4_3.jpg",
|
| 68 |
+
],
|
| 69 |
+
),
|
| 70 |
+
(
|
| 71 |
+
"assets/advance/vgg-lab-4_0.png",
|
| 72 |
+
[
|
| 73 |
+
"assets/advance/vgg-lab-4_0.png",
|
| 74 |
+
"assets/advance/vgg-lab-4_1.png",
|
| 75 |
+
"assets/advance/vgg-lab-4_2.png",
|
| 76 |
+
"assets/advance/vgg-lab-4_3.png",
|
| 77 |
+
],
|
| 78 |
+
),
|
| 79 |
+
(
|
| 80 |
+
"assets/advance/telebooth-2_0.jpg",
|
| 81 |
+
[
|
| 82 |
+
"assets/advance/telebooth-2_0.jpg",
|
| 83 |
+
"assets/advance/telebooth-2_1.jpg",
|
| 84 |
+
],
|
| 85 |
+
),
|
| 86 |
+
(
|
| 87 |
+
"assets/advance/backyard-7_0.jpg",
|
| 88 |
+
[
|
| 89 |
+
"assets/advance/backyard-7_0.jpg",
|
| 90 |
+
"assets/advance/backyard-7_1.jpg",
|
| 91 |
+
"assets/advance/backyard-7_2.jpg",
|
| 92 |
+
"assets/advance/backyard-7_3.jpg",
|
| 93 |
+
"assets/advance/backyard-7_4.jpg",
|
| 94 |
+
"assets/advance/backyard-7_5.jpg",
|
| 95 |
+
"assets/advance/backyard-7_6.jpg",
|
| 96 |
+
],
|
| 97 |
+
),
|
| 98 |
+
]
|
| 99 |
+
|
| 100 |
+
if IS_TORCH_NIGHTLY:
|
| 101 |
+
COMPILE = True
|
| 102 |
+
os.environ["TORCHINDUCTOR_AUTOGRAD_CACHE"] = "1"
|
| 103 |
+
os.environ["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "1"
|
| 104 |
+
else:
|
| 105 |
+
COMPILE = False
|
| 106 |
+
|
| 107 |
+
# Shared global variables across sessions.
|
| 108 |
+
DUST3R = Dust3rPipeline(device=device) # type: ignore
|
| 109 |
+
MODEL = SGMWrapper(load_model(device="cpu", verbose=True).eval()).to(device)
|
| 110 |
+
AE = AutoEncoder(chunk_size=1).to(device)
|
| 111 |
+
CONDITIONER = CLIPConditioner().to(device)
|
| 112 |
+
DISCRETIZATION = DDPMDiscretization()
|
| 113 |
+
DENOISER = DiscreteDenoiser(discretization=DISCRETIZATION, num_idx=1000, device=device)
|
| 114 |
+
VERSION_DICT = {
|
| 115 |
+
"H": 576,
|
| 116 |
+
"W": 576,
|
| 117 |
+
"T": 21,
|
| 118 |
+
"C": 4,
|
| 119 |
+
"f": 8,
|
| 120 |
+
"options": {},
|
| 121 |
+
}
|
| 122 |
+
SERVERS = {}
|
| 123 |
+
ABORT_EVENTS = {}
|
| 124 |
+
|
| 125 |
+
if COMPILE:
|
| 126 |
+
MODEL = torch.compile(MODEL)
|
| 127 |
+
CONDITIONER = torch.compile(CONDITIONER)
|
| 128 |
+
AE = torch.compile(AE)
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
class SevaRenderer(object):
|
| 132 |
+
def __init__(self, server: viser.ViserServer):
|
| 133 |
+
self.server = server
|
| 134 |
+
self.gui_state = None
|
| 135 |
+
|
| 136 |
+
def preprocess(
|
| 137 |
+
self, input_img_path_or_tuples: list[tuple[str, None]] | str
|
| 138 |
+
) -> tuple[dict, dict, dict]:
|
| 139 |
+
# Simply hardcode these such that aspect ratio is always kept and
|
| 140 |
+
# shorter side is resized to 576. This is only to make GUI option fewer
|
| 141 |
+
# though, changing it still works.
|
| 142 |
+
shorter: int = 576
|
| 143 |
+
# Has to be 64 multiple for the network.
|
| 144 |
+
shorter = round(shorter / 64) * 64
|
| 145 |
+
|
| 146 |
+
if isinstance(input_img_path_or_tuples, str):
|
| 147 |
+
# Assume `Basic` demo mode: just hardcode the camera parameters and ignore points.
|
| 148 |
+
input_imgs = torch.as_tensor(
|
| 149 |
+
iio.imread(input_img_path_or_tuples) / 255.0, dtype=torch.float32
|
| 150 |
+
)[None, ..., :3]
|
| 151 |
+
input_imgs = transform_img_and_K(
|
| 152 |
+
input_imgs.permute(0, 3, 1, 2),
|
| 153 |
+
shorter,
|
| 154 |
+
K=None,
|
| 155 |
+
size_stride=64,
|
| 156 |
+
)[0].permute(0, 2, 3, 1)
|
| 157 |
+
input_Ks = get_default_intrinsics(
|
| 158 |
+
aspect_ratio=input_imgs.shape[2] / input_imgs.shape[1]
|
| 159 |
+
)
|
| 160 |
+
input_c2ws = torch.eye(4)[None]
|
| 161 |
+
# Simulate a small time interval such that gradio can update
|
| 162 |
+
# propgress properly.
|
| 163 |
+
time.sleep(0.1)
|
| 164 |
+
return (
|
| 165 |
+
{
|
| 166 |
+
"input_imgs": input_imgs,
|
| 167 |
+
"input_Ks": input_Ks,
|
| 168 |
+
"input_c2ws": input_c2ws,
|
| 169 |
+
"input_wh": (input_imgs.shape[2], input_imgs.shape[1]),
|
| 170 |
+
"points": [np.zeros((0, 3))],
|
| 171 |
+
"point_colors": [np.zeros((0, 3))],
|
| 172 |
+
"scene_scale": 1.0,
|
| 173 |
+
},
|
| 174 |
+
gr.update(visible=False),
|
| 175 |
+
gr.update(),
|
| 176 |
+
)
|
| 177 |
+
else:
|
| 178 |
+
# Assume `Advance` demo mode: use dust3r to extract camera parameters and points.
|
| 179 |
+
img_paths = [p for (p, _) in input_img_path_or_tuples]
|
| 180 |
+
(
|
| 181 |
+
input_imgs,
|
| 182 |
+
input_Ks,
|
| 183 |
+
input_c2ws,
|
| 184 |
+
points,
|
| 185 |
+
point_colors,
|
| 186 |
+
) = DUST3R.infer_cameras_and_points(img_paths)
|
| 187 |
+
num_inputs = len(img_paths)
|
| 188 |
+
if num_inputs == 1:
|
| 189 |
+
input_imgs, input_Ks, input_c2ws, points, point_colors = (
|
| 190 |
+
input_imgs[:1],
|
| 191 |
+
input_Ks[:1],
|
| 192 |
+
input_c2ws[:1],
|
| 193 |
+
points[:1],
|
| 194 |
+
point_colors[:1],
|
| 195 |
+
)
|
| 196 |
+
input_imgs = [img[..., :3] for img in input_imgs]
|
| 197 |
+
# Normalize the scene.
|
| 198 |
+
point_chunks = [p.shape[0] for p in points]
|
| 199 |
+
point_indices = np.cumsum(point_chunks)[:-1]
|
| 200 |
+
input_c2ws, points, _ = normalize_scene( # type: ignore
|
| 201 |
+
input_c2ws,
|
| 202 |
+
np.concatenate(points, 0),
|
| 203 |
+
camera_center_method="poses",
|
| 204 |
+
)
|
| 205 |
+
points = np.split(points, point_indices, 0)
|
| 206 |
+
# Scale camera and points for viewport visualization.
|
| 207 |
+
scene_scale = np.median(
|
| 208 |
+
np.ptp(np.concatenate([input_c2ws[:, :3, 3], *points], 0), -1)
|
| 209 |
+
)
|
| 210 |
+
input_c2ws[:, :3, 3] /= scene_scale
|
| 211 |
+
points = [point / scene_scale for point in points]
|
| 212 |
+
input_imgs = [
|
| 213 |
+
torch.as_tensor(img / 255.0, dtype=torch.float32) for img in input_imgs
|
| 214 |
+
]
|
| 215 |
+
input_Ks = torch.as_tensor(input_Ks)
|
| 216 |
+
input_c2ws = torch.as_tensor(input_c2ws)
|
| 217 |
+
new_input_imgs, new_input_Ks = [], []
|
| 218 |
+
for img, K in zip(input_imgs, input_Ks):
|
| 219 |
+
img = rearrange(img, "h w c -> 1 c h w")
|
| 220 |
+
# If you don't want to keep aspect ratio and want to always center crop, use this:
|
| 221 |
+
# img, K = transform_img_and_K(img, (shorter, shorter), K=K[None])
|
| 222 |
+
img, K = transform_img_and_K(img, shorter, K=K[None], size_stride=64)
|
| 223 |
+
assert isinstance(K, torch.Tensor)
|
| 224 |
+
K = K / K.new_tensor([img.shape[-1], img.shape[-2], 1])[:, None]
|
| 225 |
+
new_input_imgs.append(img)
|
| 226 |
+
new_input_Ks.append(K)
|
| 227 |
+
input_imgs = torch.cat(new_input_imgs, 0)
|
| 228 |
+
input_imgs = rearrange(input_imgs, "b c h w -> b h w c")[..., :3]
|
| 229 |
+
input_Ks = torch.cat(new_input_Ks, 0)
|
| 230 |
+
return (
|
| 231 |
+
{
|
| 232 |
+
"input_imgs": input_imgs,
|
| 233 |
+
"input_Ks": input_Ks,
|
| 234 |
+
"input_c2ws": input_c2ws,
|
| 235 |
+
"input_wh": (input_imgs.shape[2], input_imgs.shape[1]),
|
| 236 |
+
"points": points,
|
| 237 |
+
"point_colors": point_colors,
|
| 238 |
+
"scene_scale": scene_scale,
|
| 239 |
+
},
|
| 240 |
+
gr.update(visible=False),
|
| 241 |
+
gr.update()
|
| 242 |
+
if num_inputs <= 10
|
| 243 |
+
else gr.update(choices=["interp"], value="interp"),
|
| 244 |
+
)
|
| 245 |
+
|
| 246 |
+
def visualize_scene(self, preprocessed: dict):
|
| 247 |
+
server = self.server
|
| 248 |
+
server.scene.reset()
|
| 249 |
+
server.gui.reset()
|
| 250 |
+
set_bkgd_color(server)
|
| 251 |
+
|
| 252 |
+
(
|
| 253 |
+
input_imgs,
|
| 254 |
+
input_Ks,
|
| 255 |
+
input_c2ws,
|
| 256 |
+
input_wh,
|
| 257 |
+
points,
|
| 258 |
+
point_colors,
|
| 259 |
+
scene_scale,
|
| 260 |
+
) = (
|
| 261 |
+
preprocessed["input_imgs"],
|
| 262 |
+
preprocessed["input_Ks"],
|
| 263 |
+
preprocessed["input_c2ws"],
|
| 264 |
+
preprocessed["input_wh"],
|
| 265 |
+
preprocessed["points"],
|
| 266 |
+
preprocessed["point_colors"],
|
| 267 |
+
preprocessed["scene_scale"],
|
| 268 |
+
)
|
| 269 |
+
W, H = input_wh
|
| 270 |
+
|
| 271 |
+
server.scene.set_up_direction(-input_c2ws[..., :3, 1].mean(0).numpy())
|
| 272 |
+
|
| 273 |
+
# Use first image as default fov.
|
| 274 |
+
assert input_imgs[0].shape[:2] == (H, W)
|
| 275 |
+
if H > W:
|
| 276 |
+
init_fov = 2 * np.arctan(1 / (2 * input_Ks[0, 0, 0].item()))
|
| 277 |
+
else:
|
| 278 |
+
init_fov = 2 * np.arctan(1 / (2 * input_Ks[0, 1, 1].item()))
|
| 279 |
+
init_fov_deg = float(init_fov / np.pi * 180.0)
|
| 280 |
+
|
| 281 |
+
frustum_nodes, pcd_nodes = [], []
|
| 282 |
+
for i in range(len(input_imgs)):
|
| 283 |
+
K = input_Ks[i]
|
| 284 |
+
frustum = server.scene.add_camera_frustum(
|
| 285 |
+
f"/scene_assets/cameras/{i}",
|
| 286 |
+
fov=2 * np.arctan(1 / (2 * K[1, 1].item())),
|
| 287 |
+
aspect=W / H,
|
| 288 |
+
scale=0.1 * scene_scale,
|
| 289 |
+
image=(input_imgs[i].numpy() * 255.0).astype(np.uint8),
|
| 290 |
+
wxyz=vt.SO3.from_matrix(input_c2ws[i, :3, :3].numpy()).wxyz,
|
| 291 |
+
position=input_c2ws[i, :3, 3].numpy(),
|
| 292 |
+
)
|
| 293 |
+
|
| 294 |
+
def get_handler(frustum):
|
| 295 |
+
def handler(event: viser.GuiEvent) -> None:
|
| 296 |
+
assert event.client_id is not None
|
| 297 |
+
client = server.get_clients()[event.client_id]
|
| 298 |
+
with client.atomic():
|
| 299 |
+
client.camera.position = frustum.position
|
| 300 |
+
client.camera.wxyz = frustum.wxyz
|
| 301 |
+
# Set look_at as the projected origin onto the
|
| 302 |
+
# frustum's forward direction.
|
| 303 |
+
look_direction = vt.SO3(frustum.wxyz).as_matrix()[:, 2]
|
| 304 |
+
position_origin = -frustum.position
|
| 305 |
+
client.camera.look_at = (
|
| 306 |
+
frustum.position
|
| 307 |
+
+ np.dot(look_direction, position_origin)
|
| 308 |
+
/ np.linalg.norm(position_origin)
|
| 309 |
+
* look_direction
|
| 310 |
+
)
|
| 311 |
+
|
| 312 |
+
return handler
|
| 313 |
+
|
| 314 |
+
frustum.on_click(get_handler(frustum)) # type: ignore
|
| 315 |
+
frustum_nodes.append(frustum)
|
| 316 |
+
|
| 317 |
+
pcd = server.scene.add_point_cloud(
|
| 318 |
+
f"/scene_assets/points/{i}",
|
| 319 |
+
points[i],
|
| 320 |
+
point_colors[i],
|
| 321 |
+
point_size=0.01 * scene_scale,
|
| 322 |
+
point_shape="circle",
|
| 323 |
+
)
|
| 324 |
+
pcd_nodes.append(pcd)
|
| 325 |
+
|
| 326 |
+
with server.gui.add_folder("Scene scale", expand_by_default=False, order=200):
|
| 327 |
+
camera_scale_slider = server.gui.add_slider(
|
| 328 |
+
"Log camera scale", initial_value=0.0, min=-2.0, max=2.0, step=0.1
|
| 329 |
+
)
|
| 330 |
+
|
| 331 |
+
@camera_scale_slider.on_update
|
| 332 |
+
def _(_) -> None:
|
| 333 |
+
for i in range(len(frustum_nodes)):
|
| 334 |
+
frustum_nodes[i].scale = (
|
| 335 |
+
0.1 * scene_scale * 10**camera_scale_slider.value
|
| 336 |
+
)
|
| 337 |
+
|
| 338 |
+
point_scale_slider = server.gui.add_slider(
|
| 339 |
+
"Log point scale", initial_value=0.0, min=-2.0, max=2.0, step=0.1
|
| 340 |
+
)
|
| 341 |
+
|
| 342 |
+
@point_scale_slider.on_update
|
| 343 |
+
def _(_) -> None:
|
| 344 |
+
for i in range(len(pcd_nodes)):
|
| 345 |
+
pcd_nodes[i].point_size = (
|
| 346 |
+
0.01 * scene_scale * 10**point_scale_slider.value
|
| 347 |
+
)
|
| 348 |
+
|
| 349 |
+
self.gui_state = define_gui(
|
| 350 |
+
server,
|
| 351 |
+
init_fov=init_fov_deg,
|
| 352 |
+
img_wh=input_wh,
|
| 353 |
+
scene_scale=scene_scale,
|
| 354 |
+
)
|
| 355 |
+
|
| 356 |
+
def get_target_c2ws_and_Ks_from_gui(self, preprocessed: dict):
|
| 357 |
+
input_wh = preprocessed["input_wh"]
|
| 358 |
+
W, H = input_wh
|
| 359 |
+
gui_state = self.gui_state
|
| 360 |
+
assert gui_state is not None and gui_state.camera_traj_list is not None
|
| 361 |
+
target_c2ws, target_Ks = [], []
|
| 362 |
+
for item in gui_state.camera_traj_list:
|
| 363 |
+
target_c2ws.append(item["w2c"])
|
| 364 |
+
assert item["img_wh"] == input_wh
|
| 365 |
+
K = np.array(item["K"]).reshape(3, 3) / np.array([W, H, 1])[:, None]
|
| 366 |
+
target_Ks.append(K)
|
| 367 |
+
target_c2ws = torch.as_tensor(
|
| 368 |
+
np.linalg.inv(np.array(target_c2ws).reshape(-1, 4, 4))
|
| 369 |
+
)
|
| 370 |
+
target_Ks = torch.as_tensor(np.array(target_Ks).reshape(-1, 3, 3))
|
| 371 |
+
return target_c2ws, target_Ks
|
| 372 |
+
|
| 373 |
+
def get_target_c2ws_and_Ks_from_preset(
|
| 374 |
+
self,
|
| 375 |
+
preprocessed: dict,
|
| 376 |
+
preset_traj: Literal[
|
| 377 |
+
"orbit",
|
| 378 |
+
"spiral",
|
| 379 |
+
"lemniscate",
|
| 380 |
+
"zoom-in",
|
| 381 |
+
"zoom-out",
|
| 382 |
+
"dolly zoom-in",
|
| 383 |
+
"dolly zoom-out",
|
| 384 |
+
"move-forward",
|
| 385 |
+
"move-backward",
|
| 386 |
+
"move-up",
|
| 387 |
+
"move-down",
|
| 388 |
+
"move-left",
|
| 389 |
+
"move-right",
|
| 390 |
+
],
|
| 391 |
+
num_frames: int,
|
| 392 |
+
zoom_factor: float | None,
|
| 393 |
+
):
|
| 394 |
+
img_wh = preprocessed["input_wh"]
|
| 395 |
+
start_c2w = preprocessed["input_c2ws"][0]
|
| 396 |
+
start_w2c = torch.linalg.inv(start_c2w)
|
| 397 |
+
look_at = torch.tensor([0, 0, 10])
|
| 398 |
+
start_fov = DEFAULT_FOV_RAD
|
| 399 |
+
target_c2ws, target_fovs = get_preset_pose_fov(
|
| 400 |
+
preset_traj,
|
| 401 |
+
num_frames,
|
| 402 |
+
start_w2c,
|
| 403 |
+
look_at,
|
| 404 |
+
-start_c2w[:3, 1],
|
| 405 |
+
start_fov,
|
| 406 |
+
spiral_radii=[1.0, 1.0, 0.5],
|
| 407 |
+
zoom_factor=zoom_factor,
|
| 408 |
+
)
|
| 409 |
+
target_c2ws = torch.as_tensor(target_c2ws)
|
| 410 |
+
target_fovs = torch.as_tensor(target_fovs)
|
| 411 |
+
target_Ks = get_default_intrinsics(
|
| 412 |
+
target_fovs, # type: ignore
|
| 413 |
+
aspect_ratio=img_wh[0] / img_wh[1],
|
| 414 |
+
)
|
| 415 |
+
return target_c2ws, target_Ks
|
| 416 |
+
|
| 417 |
+
def export_output_data(self, preprocessed: dict, output_dir: str):
|
| 418 |
+
input_imgs, input_Ks, input_c2ws, input_wh = (
|
| 419 |
+
preprocessed["input_imgs"],
|
| 420 |
+
preprocessed["input_Ks"],
|
| 421 |
+
preprocessed["input_c2ws"],
|
| 422 |
+
preprocessed["input_wh"],
|
| 423 |
+
)
|
| 424 |
+
target_c2ws, target_Ks = self.get_target_c2ws_and_Ks_from_gui(preprocessed)
|
| 425 |
+
|
| 426 |
+
num_inputs = len(input_imgs)
|
| 427 |
+
num_targets = len(target_c2ws)
|
| 428 |
+
|
| 429 |
+
input_imgs = (input_imgs.cpu().numpy() * 255.0).astype(np.uint8)
|
| 430 |
+
input_c2ws = input_c2ws.cpu().numpy()
|
| 431 |
+
input_Ks = input_Ks.cpu().numpy()
|
| 432 |
+
target_c2ws = target_c2ws.cpu().numpy()
|
| 433 |
+
target_Ks = target_Ks.cpu().numpy()
|
| 434 |
+
img_whs = np.array(input_wh)[None].repeat(len(input_imgs) + len(target_Ks), 0)
|
| 435 |
+
|
| 436 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 437 |
+
img_paths = []
|
| 438 |
+
for i, img in enumerate(input_imgs):
|
| 439 |
+
iio.imwrite(img_path := osp.join(output_dir, f"{i:03d}.png"), img)
|
| 440 |
+
img_paths.append(img_path)
|
| 441 |
+
for i in range(num_targets):
|
| 442 |
+
iio.imwrite(
|
| 443 |
+
img_path := osp.join(output_dir, f"{i + num_inputs:03d}.png"),
|
| 444 |
+
np.zeros((input_wh[1], input_wh[0], 3), dtype=np.uint8),
|
| 445 |
+
)
|
| 446 |
+
img_paths.append(img_path)
|
| 447 |
+
|
| 448 |
+
# Convert from OpenCV to OpenGL camera format.
|
| 449 |
+
all_c2ws = np.concatenate([input_c2ws, target_c2ws])
|
| 450 |
+
all_Ks = np.concatenate([input_Ks, target_Ks])
|
| 451 |
+
all_c2ws = all_c2ws @ np.diag([1, -1, -1, 1])
|
| 452 |
+
create_transforms_simple(output_dir, img_paths, img_whs, all_c2ws, all_Ks)
|
| 453 |
+
split_dict = {
|
| 454 |
+
"train_ids": list(range(num_inputs)),
|
| 455 |
+
"test_ids": list(range(num_inputs, num_inputs + num_targets)),
|
| 456 |
+
}
|
| 457 |
+
with open(
|
| 458 |
+
osp.join(output_dir, f"train_test_split_{num_inputs}.json"), "w"
|
| 459 |
+
) as f:
|
| 460 |
+
json.dump(split_dict, f, indent=4)
|
| 461 |
+
gr.Info(f"Output data saved to {output_dir}", duration=1)
|
| 462 |
+
|
| 463 |
+
def render(
|
| 464 |
+
self,
|
| 465 |
+
preprocessed: dict,
|
| 466 |
+
session_hash: str,
|
| 467 |
+
seed: int,
|
| 468 |
+
chunk_strategy: str,
|
| 469 |
+
cfg: float,
|
| 470 |
+
preset_traj: Literal[
|
| 471 |
+
"orbit",
|
| 472 |
+
"spiral",
|
| 473 |
+
"lemniscate",
|
| 474 |
+
"zoom-in",
|
| 475 |
+
"zoom-out",
|
| 476 |
+
"dolly zoom-in",
|
| 477 |
+
"dolly zoom-out",
|
| 478 |
+
"move-forward",
|
| 479 |
+
"move-backward",
|
| 480 |
+
"move-up",
|
| 481 |
+
"move-down",
|
| 482 |
+
"move-left",
|
| 483 |
+
"move-right",
|
| 484 |
+
]
|
| 485 |
+
| None,
|
| 486 |
+
num_frames: int | None,
|
| 487 |
+
zoom_factor: float | None,
|
| 488 |
+
camera_scale: float,
|
| 489 |
+
):
|
| 490 |
+
render_name = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 491 |
+
render_dir = osp.join(WORK_DIR, render_name)
|
| 492 |
+
|
| 493 |
+
input_imgs, input_Ks, input_c2ws, (W, H) = (
|
| 494 |
+
preprocessed["input_imgs"],
|
| 495 |
+
preprocessed["input_Ks"],
|
| 496 |
+
preprocessed["input_c2ws"],
|
| 497 |
+
preprocessed["input_wh"],
|
| 498 |
+
)
|
| 499 |
+
num_inputs = len(input_imgs)
|
| 500 |
+
if preset_traj is None:
|
| 501 |
+
target_c2ws, target_Ks = self.get_target_c2ws_and_Ks_from_gui(preprocessed)
|
| 502 |
+
else:
|
| 503 |
+
assert num_frames is not None
|
| 504 |
+
assert num_inputs == 1
|
| 505 |
+
input_c2ws = torch.eye(4)[None].to(dtype=input_c2ws.dtype)
|
| 506 |
+
target_c2ws, target_Ks = self.get_target_c2ws_and_Ks_from_preset(
|
| 507 |
+
preprocessed, preset_traj, num_frames, zoom_factor
|
| 508 |
+
)
|
| 509 |
+
all_c2ws = torch.cat([input_c2ws, target_c2ws], 0)
|
| 510 |
+
all_Ks = (
|
| 511 |
+
torch.cat([input_Ks, target_Ks], 0)
|
| 512 |
+
* input_Ks.new_tensor([W, H, 1])[:, None]
|
| 513 |
+
)
|
| 514 |
+
num_targets = len(target_c2ws)
|
| 515 |
+
input_indices = list(range(num_inputs))
|
| 516 |
+
target_indices = np.arange(num_inputs, num_inputs + num_targets).tolist()
|
| 517 |
+
# Get anchor cameras.
|
| 518 |
+
T = VERSION_DICT["T"]
|
| 519 |
+
version_dict = copy.deepcopy(VERSION_DICT)
|
| 520 |
+
num_anchors = infer_prior_stats(
|
| 521 |
+
T,
|
| 522 |
+
num_inputs,
|
| 523 |
+
num_total_frames=num_targets,
|
| 524 |
+
version_dict=version_dict,
|
| 525 |
+
)
|
| 526 |
+
# infer_prior_stats modifies T in-place.
|
| 527 |
+
T = version_dict["T"]
|
| 528 |
+
assert isinstance(num_anchors, int)
|
| 529 |
+
anchor_indices = np.linspace(
|
| 530 |
+
num_inputs,
|
| 531 |
+
num_inputs + num_targets - 1,
|
| 532 |
+
num_anchors,
|
| 533 |
+
).tolist()
|
| 534 |
+
anchor_c2ws = all_c2ws[[round(ind) for ind in anchor_indices]]
|
| 535 |
+
anchor_Ks = all_Ks[[round(ind) for ind in anchor_indices]]
|
| 536 |
+
# Create image conditioning.
|
| 537 |
+
all_imgs_np = (
|
| 538 |
+
F.pad(input_imgs, (0, 0, 0, 0, 0, 0, 0, num_targets), value=0.0).numpy()
|
| 539 |
+
* 255.0
|
| 540 |
+
).astype(np.uint8)
|
| 541 |
+
image_cond = {
|
| 542 |
+
"img": all_imgs_np,
|
| 543 |
+
"input_indices": input_indices,
|
| 544 |
+
"prior_indices": anchor_indices,
|
| 545 |
+
}
|
| 546 |
+
# Create camera conditioning (K is unnormalized).
|
| 547 |
+
camera_cond = {
|
| 548 |
+
"c2w": all_c2ws,
|
| 549 |
+
"K": all_Ks,
|
| 550 |
+
"input_indices": list(range(num_inputs + num_targets)),
|
| 551 |
+
}
|
| 552 |
+
# Run rendering.
|
| 553 |
+
num_steps = 50
|
| 554 |
+
options_ori = VERSION_DICT["options"]
|
| 555 |
+
options = copy.deepcopy(options_ori)
|
| 556 |
+
options["chunk_strategy"] = chunk_strategy
|
| 557 |
+
options["video_save_fps"] = 30.0
|
| 558 |
+
options["beta_linear_start"] = 5e-6
|
| 559 |
+
options["log_snr_shift"] = 2.4
|
| 560 |
+
options["guider_types"] = [1, 2]
|
| 561 |
+
options["cfg"] = [
|
| 562 |
+
float(cfg),
|
| 563 |
+
3.0 if num_inputs >= 9 else 2.0,
|
| 564 |
+
] # We define semi-dense-view regime to have 9 input views.
|
| 565 |
+
options["camera_scale"] = camera_scale
|
| 566 |
+
options["num_steps"] = num_steps
|
| 567 |
+
options["cfg_min"] = 1.2
|
| 568 |
+
options["encoding_t"] = 1
|
| 569 |
+
options["decoding_t"] = 1
|
| 570 |
+
assert session_hash in ABORT_EVENTS
|
| 571 |
+
abort_event = ABORT_EVENTS[session_hash]
|
| 572 |
+
abort_event.clear()
|
| 573 |
+
options["abort_event"] = abort_event
|
| 574 |
+
task = "img2trajvid"
|
| 575 |
+
# Get number of first pass chunks.
|
| 576 |
+
T_first_pass = T[0] if isinstance(T, (list, tuple)) else T
|
| 577 |
+
chunk_strategy_first_pass = options.get(
|
| 578 |
+
"chunk_strategy_first_pass", "gt-nearest"
|
| 579 |
+
)
|
| 580 |
+
num_chunks_0 = len(
|
| 581 |
+
chunk_input_and_test(
|
| 582 |
+
T_first_pass,
|
| 583 |
+
input_c2ws,
|
| 584 |
+
anchor_c2ws,
|
| 585 |
+
input_indices,
|
| 586 |
+
image_cond["prior_indices"],
|
| 587 |
+
options={**options, "sampler_verbose": False},
|
| 588 |
+
task=task,
|
| 589 |
+
chunk_strategy=chunk_strategy_first_pass,
|
| 590 |
+
gt_input_inds=list(range(input_c2ws.shape[0])),
|
| 591 |
+
)[1]
|
| 592 |
+
)
|
| 593 |
+
# Get number of second pass chunks.
|
| 594 |
+
anchor_argsort = np.argsort(input_indices + anchor_indices).tolist()
|
| 595 |
+
anchor_indices = np.array(input_indices + anchor_indices)[
|
| 596 |
+
anchor_argsort
|
| 597 |
+
].tolist()
|
| 598 |
+
gt_input_inds = [anchor_argsort.index(i) for i in range(input_c2ws.shape[0])]
|
| 599 |
+
anchor_c2ws_second_pass = torch.cat([input_c2ws, anchor_c2ws], dim=0)[
|
| 600 |
+
anchor_argsort
|
| 601 |
+
]
|
| 602 |
+
T_second_pass = T[1] if isinstance(T, (list, tuple)) else T
|
| 603 |
+
chunk_strategy = options.get("chunk_strategy", "nearest")
|
| 604 |
+
num_chunks_1 = len(
|
| 605 |
+
chunk_input_and_test(
|
| 606 |
+
T_second_pass,
|
| 607 |
+
anchor_c2ws_second_pass,
|
| 608 |
+
target_c2ws,
|
| 609 |
+
anchor_indices,
|
| 610 |
+
target_indices,
|
| 611 |
+
options={**options, "sampler_verbose": False},
|
| 612 |
+
task=task,
|
| 613 |
+
chunk_strategy=chunk_strategy,
|
| 614 |
+
gt_input_inds=gt_input_inds,
|
| 615 |
+
)[1]
|
| 616 |
+
)
|
| 617 |
+
second_pass_pbar = gr.Progress().tqdm(
|
| 618 |
+
iterable=None,
|
| 619 |
+
desc="Second pass sampling",
|
| 620 |
+
total=num_chunks_1 * num_steps,
|
| 621 |
+
)
|
| 622 |
+
first_pass_pbar = gr.Progress().tqdm(
|
| 623 |
+
iterable=None,
|
| 624 |
+
desc="First pass sampling",
|
| 625 |
+
total=num_chunks_0 * num_steps,
|
| 626 |
+
)
|
| 627 |
+
video_path_generator = run_one_scene(
|
| 628 |
+
task=task,
|
| 629 |
+
version_dict={
|
| 630 |
+
"H": H,
|
| 631 |
+
"W": W,
|
| 632 |
+
"T": T,
|
| 633 |
+
"C": VERSION_DICT["C"],
|
| 634 |
+
"f": VERSION_DICT["f"],
|
| 635 |
+
"options": options,
|
| 636 |
+
},
|
| 637 |
+
model=MODEL,
|
| 638 |
+
ae=AE,
|
| 639 |
+
conditioner=CONDITIONER,
|
| 640 |
+
denoiser=DENOISER,
|
| 641 |
+
image_cond=image_cond,
|
| 642 |
+
camera_cond=camera_cond,
|
| 643 |
+
save_path=render_dir,
|
| 644 |
+
use_traj_prior=True,
|
| 645 |
+
traj_prior_c2ws=anchor_c2ws,
|
| 646 |
+
traj_prior_Ks=anchor_Ks,
|
| 647 |
+
seed=seed,
|
| 648 |
+
gradio=True,
|
| 649 |
+
first_pass_pbar=first_pass_pbar,
|
| 650 |
+
second_pass_pbar=second_pass_pbar,
|
| 651 |
+
abort_event=abort_event,
|
| 652 |
+
)
|
| 653 |
+
output_queue = queue.Queue()
|
| 654 |
+
|
| 655 |
+
blocks = LocalContext.blocks.get()
|
| 656 |
+
event_id = LocalContext.event_id.get()
|
| 657 |
+
|
| 658 |
+
def worker():
|
| 659 |
+
# gradio doesn't support threading with progress intentionally, so
|
| 660 |
+
# we need to hack this.
|
| 661 |
+
LocalContext.blocks.set(blocks)
|
| 662 |
+
LocalContext.event_id.set(event_id)
|
| 663 |
+
for i, video_path in enumerate(video_path_generator):
|
| 664 |
+
if i == 0:
|
| 665 |
+
output_queue.put(
|
| 666 |
+
(
|
| 667 |
+
video_path,
|
| 668 |
+
gr.update(),
|
| 669 |
+
gr.update(),
|
| 670 |
+
gr.update(),
|
| 671 |
+
)
|
| 672 |
+
)
|
| 673 |
+
elif i == 1:
|
| 674 |
+
output_queue.put(
|
| 675 |
+
(
|
| 676 |
+
video_path,
|
| 677 |
+
gr.update(visible=True),
|
| 678 |
+
gr.update(visible=False),
|
| 679 |
+
gr.update(visible=False),
|
| 680 |
+
)
|
| 681 |
+
)
|
| 682 |
+
else:
|
| 683 |
+
gr.Error("More than two passes during rendering.")
|
| 684 |
+
|
| 685 |
+
thread = threading.Thread(target=worker, daemon=True)
|
| 686 |
+
thread.start()
|
| 687 |
+
|
| 688 |
+
while thread.is_alive() or not output_queue.empty():
|
| 689 |
+
if abort_event.is_set():
|
| 690 |
+
thread.join()
|
| 691 |
+
abort_event.clear()
|
| 692 |
+
yield (
|
| 693 |
+
gr.update(),
|
| 694 |
+
gr.update(visible=True),
|
| 695 |
+
gr.update(visible=False),
|
| 696 |
+
gr.update(visible=False),
|
| 697 |
+
)
|
| 698 |
+
time.sleep(0.1)
|
| 699 |
+
while not output_queue.empty():
|
| 700 |
+
yield output_queue.get()
|
| 701 |
+
|
| 702 |
+
|
| 703 |
+
# This is basically a copy of the original `networking.setup_tunnel` function,
|
| 704 |
+
# but it also returns the tunnel object for proper cleanup.
|
| 705 |
+
def setup_tunnel(
|
| 706 |
+
local_host: str, local_port: int, share_token: str, share_server_address: str | None
|
| 707 |
+
) -> tuple[str, Tunnel]:
|
| 708 |
+
share_server_address = (
|
| 709 |
+
networking.GRADIO_SHARE_SERVER_ADDRESS
|
| 710 |
+
if share_server_address is None
|
| 711 |
+
else share_server_address
|
| 712 |
+
)
|
| 713 |
+
if share_server_address is None:
|
| 714 |
+
try:
|
| 715 |
+
response = httpx.get(networking.GRADIO_API_SERVER, timeout=30)
|
| 716 |
+
payload = response.json()[0]
|
| 717 |
+
remote_host, remote_port = payload["host"], int(payload["port"])
|
| 718 |
+
certificate = payload["root_ca"]
|
| 719 |
+
Path(CERTIFICATE_PATH).parent.mkdir(parents=True, exist_ok=True)
|
| 720 |
+
with open(CERTIFICATE_PATH, "w") as f:
|
| 721 |
+
f.write(certificate)
|
| 722 |
+
except Exception as e:
|
| 723 |
+
raise RuntimeError(
|
| 724 |
+
"Could not get share link from Gradio API Server."
|
| 725 |
+
) from e
|
| 726 |
+
else:
|
| 727 |
+
remote_host, remote_port = share_server_address.split(":")
|
| 728 |
+
remote_port = int(remote_port)
|
| 729 |
+
tunnel = Tunnel(remote_host, remote_port, local_host, local_port, share_token)
|
| 730 |
+
address = tunnel.start_tunnel()
|
| 731 |
+
return address, tunnel
|
| 732 |
+
|
| 733 |
+
|
| 734 |
+
def set_bkgd_color(server: viser.ViserServer | viser.ClientHandle):
|
| 735 |
+
server.scene.set_background_image(np.array([[[39, 39, 42]]], dtype=np.uint8))
|
| 736 |
+
|
| 737 |
+
|
| 738 |
+
def start_server_and_abort_event(request: gr.Request):
|
| 739 |
+
server = viser.ViserServer()
|
| 740 |
+
|
| 741 |
+
@server.on_client_connect
|
| 742 |
+
def _(client: viser.ClientHandle):
|
| 743 |
+
# Force dark mode that blends well with gradio's dark theme.
|
| 744 |
+
client.gui.configure_theme(
|
| 745 |
+
dark_mode=True,
|
| 746 |
+
show_share_button=False,
|
| 747 |
+
control_layout="collapsible",
|
| 748 |
+
)
|
| 749 |
+
set_bkgd_color(client)
|
| 750 |
+
|
| 751 |
+
print(f"Starting server {server.get_port()}")
|
| 752 |
+
server_url, tunnel = setup_tunnel(
|
| 753 |
+
local_host=server.get_host(),
|
| 754 |
+
local_port=server.get_port(),
|
| 755 |
+
share_token=secrets.token_urlsafe(32),
|
| 756 |
+
share_server_address=None,
|
| 757 |
+
)
|
| 758 |
+
SERVERS[request.session_hash] = (server, tunnel)
|
| 759 |
+
if server_url is None:
|
| 760 |
+
raise gr.Error(
|
| 761 |
+
"Failed to get a viewport URL. Please check your network connection."
|
| 762 |
+
)
|
| 763 |
+
# Give it enough time to start.
|
| 764 |
+
time.sleep(1)
|
| 765 |
+
|
| 766 |
+
ABORT_EVENTS[request.session_hash] = threading.Event()
|
| 767 |
+
|
| 768 |
+
return (
|
| 769 |
+
SevaRenderer(server),
|
| 770 |
+
gr.HTML(
|
| 771 |
+
f'<iframe src="{server_url}" style="display: block; margin: auto; width: 100%; height: min(60vh, 600px);" frameborder="0"></iframe>',
|
| 772 |
+
container=True,
|
| 773 |
+
),
|
| 774 |
+
request.session_hash,
|
| 775 |
+
)
|
| 776 |
+
|
| 777 |
+
|
| 778 |
+
def stop_server_and_abort_event(request: gr.Request):
|
| 779 |
+
if request.session_hash in SERVERS:
|
| 780 |
+
print(f"Stopping server {request.session_hash}")
|
| 781 |
+
server, tunnel = SERVERS.pop(request.session_hash)
|
| 782 |
+
server.stop()
|
| 783 |
+
tunnel.kill()
|
| 784 |
+
|
| 785 |
+
if request.session_hash in ABORT_EVENTS:
|
| 786 |
+
print(f"Setting abort event {request.session_hash}")
|
| 787 |
+
ABORT_EVENTS[request.session_hash].set()
|
| 788 |
+
# Give it enough time to abort jobs.
|
| 789 |
+
time.sleep(5)
|
| 790 |
+
ABORT_EVENTS.pop(request.session_hash)
|
| 791 |
+
|
| 792 |
+
|
| 793 |
+
def set_abort_event(request: gr.Request):
|
| 794 |
+
if request.session_hash in ABORT_EVENTS:
|
| 795 |
+
print(f"Setting abort event {request.session_hash}")
|
| 796 |
+
ABORT_EVENTS[request.session_hash].set()
|
| 797 |
+
|
| 798 |
+
|
| 799 |
+
def get_advance_examples(selection: gr.SelectData):
|
| 800 |
+
index = selection.index
|
| 801 |
+
return (
|
| 802 |
+
gr.Gallery(ADVANCE_EXAMPLE_MAP[index][1], visible=True),
|
| 803 |
+
gr.update(visible=True),
|
| 804 |
+
gr.update(visible=True),
|
| 805 |
+
gr.Gallery(visible=False),
|
| 806 |
+
)
|
| 807 |
+
|
| 808 |
+
|
| 809 |
+
def get_preamble():
|
| 810 |
+
gr.Markdown("""
|
| 811 |
+
# Stable Virtual Camera
|
| 812 |
+
<span style="display: flex; flex-wrap: wrap; gap: 5px;">
|
| 813 |
+
<a href="https://stable-virtual-camera.github.io"><img src="https://img.shields.io/badge/%F0%9F%8F%A0%20Project%20Page-gray.svg"></a>
|
| 814 |
+
<a href="https://stable-virtual-camera.github.io/pdf/paper.pdf"><img src="https://img.shields.io/badge/%F0%9F%93%84%20Paper-gray.svg"></a>
|
| 815 |
+
<a href="https://stability.ai/news/introducing-stable-virtual-camera-multi-view-video-generation-with-3d-camera-control"><img src="https://img.shields.io/badge/%F0%9F%93%83%20Blog-Stability%20AI-orange.svg"></a>
|
| 816 |
+
<a href="https://huggingface.co/stabilityai/stable-virtual-camera"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Model_Card-Huggingface-orange"></a>
|
| 817 |
+
<a href="https://huggingface.co/spaces/stabilityai/stable-virtual-camera"><img src="https://img.shields.io/badge/%F0%9F%9A%80%20Gradio%20Demo-Huggingface-orange"></a>
|
| 818 |
+
<a href="https://www.youtube.com/channel/UCLLlVDcS7nNenT_zzO3OPxQ"><img src="https://img.shields.io/badge/%F0%9F%8E%AC%20Video-YouTube-orange"></a>
|
| 819 |
+
</span>
|
| 820 |
+
|
| 821 |
+
Welcome to the demo of <strong>Stable Virtual Camera (Seva)</strong>! Given any number of input views and their cameras, this demo will allow you to generate novel views of a scene at any target camera of interest.
|
| 822 |
+
|
| 823 |
+
We provide two ways to use our demo (selected by the tab below, documented [here](https://github.com/Stability-AI/stable-virtual-camera/blob/main/docs/GR_USAGE.md)):
|
| 824 |
+
1. **[Basic](https://github.com/user-attachments/assets/4d965fa6-d8eb-452c-b773-6e09c88ca705)**: Given a single image, you can generate a video following one of our preset camera trajectories.
|
| 825 |
+
2. **[Advanced](https://github.com/user-attachments/assets/dcec1be0-bd10-441e-879c-d1c2b63091ba)**: Given any number of input images, you can generate a video following any camera trajectory of your choice by our key-frame-based interface.
|
| 826 |
+
|
| 827 |
+
> This is a research preview and comes with a few [limitations](https://stable-virtual-camera.github.io/#limitations):
|
| 828 |
+
> - Limited quality in certain subjects due to training data, including humans, animals, and dynamic textures.
|
| 829 |
+
> - Limited quality in some highly ambiguous scenes and camera trajectories, including extreme views and collision into objects.
|
| 830 |
+
""")
|
| 831 |
+
|
| 832 |
+
|
| 833 |
+
# Make sure that gradio uses dark theme.
|
| 834 |
+
_APP_JS = """
|
| 835 |
+
function refresh() {
|
| 836 |
+
const url = new URL(window.location);
|
| 837 |
+
if (url.searchParams.get('__theme') !== 'dark') {
|
| 838 |
+
url.searchParams.set('__theme', 'dark');
|
| 839 |
+
}
|
| 840 |
+
}
|
| 841 |
+
"""
|
| 842 |
+
|
| 843 |
+
|
| 844 |
+
def main(server_port: int | None = None, share: bool = True):
|
| 845 |
+
with gr.Blocks(js=_APP_JS) as app:
|
| 846 |
+
renderer = gr.State()
|
| 847 |
+
session_hash = gr.State()
|
| 848 |
+
_ = get_preamble()
|
| 849 |
+
with gr.Tabs():
|
| 850 |
+
with gr.Tab("Basic"):
|
| 851 |
+
render_btn = gr.Button("Render video", interactive=False, render=False)
|
| 852 |
+
with gr.Row():
|
| 853 |
+
with gr.Column():
|
| 854 |
+
with gr.Group():
|
| 855 |
+
preprocess_btn = gr.Button("Preprocess images")
|
| 856 |
+
preprocess_progress = gr.Textbox(
|
| 857 |
+
label="",
|
| 858 |
+
visible=False,
|
| 859 |
+
interactive=False,
|
| 860 |
+
)
|
| 861 |
+
with gr.Group():
|
| 862 |
+
input_imgs = gr.Image(
|
| 863 |
+
type="filepath",
|
| 864 |
+
label="Input",
|
| 865 |
+
height=200,
|
| 866 |
+
)
|
| 867 |
+
_ = gr.Examples(
|
| 868 |
+
examples=sorted(glob("assets/basic/*")),
|
| 869 |
+
inputs=[input_imgs],
|
| 870 |
+
label="Example",
|
| 871 |
+
)
|
| 872 |
+
chunk_strategy = gr.Dropdown(
|
| 873 |
+
["interp", "interp-gt"],
|
| 874 |
+
label="Chunk strategy",
|
| 875 |
+
render=False,
|
| 876 |
+
)
|
| 877 |
+
preprocessed = gr.State()
|
| 878 |
+
preprocess_btn.click(
|
| 879 |
+
lambda r, *args: [
|
| 880 |
+
*r.preprocess(*args),
|
| 881 |
+
gr.update(interactive=True),
|
| 882 |
+
],
|
| 883 |
+
inputs=[renderer, input_imgs],
|
| 884 |
+
outputs=[
|
| 885 |
+
preprocessed,
|
| 886 |
+
preprocess_progress,
|
| 887 |
+
chunk_strategy,
|
| 888 |
+
render_btn,
|
| 889 |
+
],
|
| 890 |
+
show_progress_on=[preprocess_progress],
|
| 891 |
+
concurrency_limit=1,
|
| 892 |
+
concurrency_id="gpu_queue",
|
| 893 |
+
)
|
| 894 |
+
preprocess_btn.click(
|
| 895 |
+
lambda: gr.update(visible=True),
|
| 896 |
+
outputs=[preprocess_progress],
|
| 897 |
+
)
|
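                    # Note: this second click handler runs outside the shared "gpu_queue",
                    # so the progress textbox becomes visible immediately while the actual
                    # preprocessing above may still be waiting for the GPU.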
| 898 |
+
with gr.Row():
|
| 899 |
+
preset_traj = gr.Dropdown(
|
| 900 |
+
choices=[
|
| 901 |
+
"orbit",
|
| 902 |
+
"spiral",
|
| 903 |
+
"lemniscate",
|
| 904 |
+
"zoom-in",
|
| 905 |
+
"zoom-out",
|
| 906 |
+
"dolly zoom-in",
|
| 907 |
+
"dolly zoom-out",
|
| 908 |
+
"move-forward",
|
| 909 |
+
"move-backward",
|
| 910 |
+
"move-up",
|
| 911 |
+
"move-down",
|
| 912 |
+
"move-left",
|
| 913 |
+
"move-right",
|
| 914 |
+
],
|
| 915 |
+
label="Preset trajectory",
|
| 916 |
+
value="orbit",
|
| 917 |
+
)
|
| 918 |
+
num_frames = gr.Slider(30, 150, 80, label="#Frames")
|
| 919 |
+
zoom_factor = gr.Slider(
|
| 920 |
+
step=0.01, label="Zoom factor", visible=False
|
| 921 |
+
)
|
| 922 |
+
with gr.Row():
|
| 923 |
+
seed = gr.Number(value=23, label="Random seed")
|
| 924 |
+
chunk_strategy.render()
|
| 925 |
+
cfg = gr.Slider(1.0, 7.0, value=4.0, label="CFG value")
|
| 926 |
+
with gr.Row():
|
| 927 |
+
camera_scale = gr.Slider(
|
| 928 |
+
0.1,
|
| 929 |
+
15.0,
|
| 930 |
+
value=2.0,
|
| 931 |
+
label="Camera scale",
|
| 932 |
+
)
|
| 933 |
+
|
| 934 |
+
def default_cfg_preset_traj(traj):
|
| 935 |
+
# These are just some hand-tuned values that we
|
| 936 |
+
# found to work best.
|
| 937 |
+
if traj in ["zoom-out", "move-down"]:
|
| 938 |
+
value = 5.0
|
| 939 |
+
elif traj in [
|
| 940 |
+
"orbit",
|
| 941 |
+
"dolly zoom-out",
|
| 942 |
+
"move-backward",
|
| 943 |
+
"move-up",
|
| 944 |
+
"move-left",
|
| 945 |
+
"move-right",
|
| 946 |
+
]:
|
| 947 |
+
value = 4.0
|
| 948 |
+
else:
|
| 949 |
+
value = 3.0
|
| 950 |
+
return value
|
| 951 |
+
|
| 952 |
+
preset_traj.change(
|
| 953 |
+
default_cfg_preset_traj,
|
| 954 |
+
inputs=[preset_traj],
|
| 955 |
+
outputs=[cfg],
|
| 956 |
+
)
|
| 957 |
+
preset_traj.change(
|
| 958 |
+
lambda traj: gr.update(
|
| 959 |
+
value=(
|
| 960 |
+
10.0 if "dolly" in traj or "pan" in traj else 2.0
|
| 961 |
+
)
|
| 962 |
+
),
|
| 963 |
+
inputs=[preset_traj],
|
| 964 |
+
outputs=[camera_scale],
|
| 965 |
+
)
|
| 966 |
+
|
| 967 |
+
def zoom_factor_preset_traj(traj):
|
| 968 |
+
visible = traj in [
|
| 969 |
+
"zoom-in",
|
| 970 |
+
"zoom-out",
|
| 971 |
+
"dolly zoom-in",
|
| 972 |
+
"dolly zoom-out",
|
| 973 |
+
]
|
| 974 |
+
is_zoomin = traj.endswith("zoom-in")
|
| 975 |
+
if is_zoomin:
|
| 976 |
+
minimum = 0.1
|
| 977 |
+
maximum = 0.5
|
| 978 |
+
value = 0.28
|
| 979 |
+
else:
|
| 980 |
+
minimum = 1.2
|
| 981 |
+
maximum = 3
|
| 982 |
+
value = 1.5
|
| 983 |
+
return gr.update(
|
| 984 |
+
visible=visible,
|
| 985 |
+
minimum=minimum,
|
| 986 |
+
maximum=maximum,
|
| 987 |
+
value=value,
|
| 988 |
+
)
|
| 989 |
+
|
| 990 |
+
preset_traj.change(
|
| 991 |
+
zoom_factor_preset_traj,
|
| 992 |
+
inputs=[preset_traj],
|
| 993 |
+
outputs=[zoom_factor],
|
| 994 |
+
)
|
| 995 |
+
with gr.Column():
|
| 996 |
+
with gr.Group():
|
| 997 |
+
abort_btn = gr.Button("Abort rendering", visible=False)
|
| 998 |
+
render_btn.render()
|
| 999 |
+
render_progress = gr.Textbox(
|
| 1000 |
+
label="", visible=False, interactive=False
|
| 1001 |
+
)
|
| 1002 |
+
output_video = gr.Video(
|
| 1003 |
+
label="Output", interactive=False, autoplay=True, loop=True
|
| 1004 |
+
)
|
| 1005 |
+
render_btn.click(
|
| 1006 |
+
lambda r, *args: (yield from r.render(*args)),
|
| 1007 |
+
inputs=[
|
| 1008 |
+
renderer,
|
| 1009 |
+
preprocessed,
|
| 1010 |
+
session_hash,
|
| 1011 |
+
seed,
|
| 1012 |
+
chunk_strategy,
|
| 1013 |
+
cfg,
|
| 1014 |
+
preset_traj,
|
| 1015 |
+
num_frames,
|
| 1016 |
+
zoom_factor,
|
| 1017 |
+
camera_scale,
|
| 1018 |
+
],
|
| 1019 |
+
outputs=[
|
| 1020 |
+
output_video,
|
| 1021 |
+
render_btn,
|
| 1022 |
+
abort_btn,
|
| 1023 |
+
render_progress,
|
| 1024 |
+
],
|
| 1025 |
+
show_progress_on=[render_progress],
|
| 1026 |
+
concurrency_id="gpu_queue",
|
| 1027 |
+
)
|
| 1028 |
+
render_btn.click(
|
| 1029 |
+
lambda: [
|
| 1030 |
+
gr.update(visible=False),
|
| 1031 |
+
gr.update(visible=True),
|
| 1032 |
+
gr.update(visible=True),
|
| 1033 |
+
],
|
| 1034 |
+
outputs=[render_btn, abort_btn, render_progress],
|
| 1035 |
+
)
|
| 1036 |
+
abort_btn.click(set_abort_event)
|
| 1037 |
+
with gr.Tab("Advanced"):
|
| 1038 |
+
render_btn = gr.Button("Render video", interactive=False, render=False)
|
| 1039 |
+
viewport = gr.HTML(container=True, render=False)
|
| 1040 |
+
gr.Timer(0.1).tick(
|
| 1041 |
+
lambda renderer: gr.update(
|
| 1042 |
+
interactive=renderer is not None
|
| 1043 |
+
and renderer.gui_state is not None
|
| 1044 |
+
and renderer.gui_state.camera_traj_list is not None
|
| 1045 |
+
),
|
| 1046 |
+
inputs=[renderer],
|
| 1047 |
+
outputs=[render_btn],
|
| 1048 |
+
)
|
| 1049 |
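                # Note: this 0.1 s poll keeps the Render button disabled until the viewport
                # has produced a camera trajectory (renderer.gui_state.camera_traj_list).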
+
with gr.Row():
|
| 1050 |
+
viewport.render()
|
| 1051 |
+
with gr.Row():
|
| 1052 |
+
with gr.Column():
|
| 1053 |
+
with gr.Group():
|
| 1054 |
+
preprocess_btn = gr.Button("Preprocess images")
|
| 1055 |
+
preprocess_progress = gr.Textbox(
|
| 1056 |
+
label="",
|
| 1057 |
+
visible=False,
|
| 1058 |
+
interactive=False,
|
| 1059 |
+
)
|
| 1060 |
+
with gr.Group():
|
| 1061 |
+
input_imgs = gr.Gallery(
|
| 1062 |
+
interactive=True,
|
| 1063 |
+
label="Input",
|
| 1064 |
+
columns=4,
|
| 1065 |
+
height=200,
|
| 1066 |
+
)
|
| 1067 |
+
# Define example images (gradio doesn't support variable length
|
| 1068 |
+
# examples, so we work around it here).
|
| 1069 |
+
example_imgs = gr.Gallery(
|
| 1070 |
+
[e[0] for e in ADVANCE_EXAMPLE_MAP],
|
| 1071 |
+
allow_preview=False,
|
| 1072 |
+
preview=False,
|
| 1073 |
+
label="Example",
|
| 1074 |
+
columns=20,
|
| 1075 |
+
rows=1,
|
| 1076 |
+
height=115,
|
| 1077 |
+
)
|
| 1078 |
+
example_imgs_expander = gr.Gallery(
|
| 1079 |
+
visible=False,
|
| 1080 |
+
interactive=False,
|
| 1081 |
+
label="Example",
|
| 1082 |
+
preview=True,
|
| 1083 |
+
columns=20,
|
| 1084 |
+
rows=1,
|
| 1085 |
+
)
|
| 1086 |
+
chunk_strategy = gr.Dropdown(
|
| 1087 |
+
["interp-gt", "interp"],
|
| 1088 |
+
label="Chunk strategy",
|
| 1089 |
+
value="interp-gt",
|
| 1090 |
+
render=False,
|
| 1091 |
+
)
|
| 1092 |
+
with gr.Row():
|
| 1093 |
+
example_imgs_backer = gr.Button(
|
| 1094 |
+
"Go back", visible=False
|
| 1095 |
+
)
|
| 1096 |
+
example_imgs_confirmer = gr.Button(
|
| 1097 |
+
"Confirm", visible=False
|
| 1098 |
+
)
|
| 1099 |
+
example_imgs.select(
|
| 1100 |
+
get_advance_examples,
|
| 1101 |
+
outputs=[
|
| 1102 |
+
example_imgs_expander,
|
| 1103 |
+
example_imgs_confirmer,
|
| 1104 |
+
example_imgs_backer,
|
| 1105 |
+
example_imgs,
|
| 1106 |
+
],
|
| 1107 |
+
)
|
| 1108 |
+
example_imgs_confirmer.click(
|
| 1109 |
+
lambda x: (
|
| 1110 |
+
x,
|
| 1111 |
+
gr.update(visible=False),
|
| 1112 |
+
gr.update(visible=False),
|
| 1113 |
+
gr.update(visible=False),
|
| 1114 |
+
gr.update(visible=True),
|
| 1115 |
+
),
|
| 1116 |
+
inputs=[example_imgs_expander],
|
| 1117 |
+
outputs=[
|
| 1118 |
+
input_imgs,
|
| 1119 |
+
example_imgs_expander,
|
| 1120 |
+
example_imgs_confirmer,
|
| 1121 |
+
example_imgs_backer,
|
| 1122 |
+
example_imgs,
|
| 1123 |
+
],
|
| 1124 |
+
)
|
| 1125 |
+
example_imgs_backer.click(
|
| 1126 |
+
lambda: (
|
| 1127 |
+
gr.update(visible=False),
|
| 1128 |
+
gr.update(visible=False),
|
| 1129 |
+
gr.update(visible=False),
|
| 1130 |
+
gr.update(visible=True),
|
| 1131 |
+
),
|
| 1132 |
+
outputs=[
|
| 1133 |
+
example_imgs_expander,
|
| 1134 |
+
example_imgs_confirmer,
|
| 1135 |
+
example_imgs_backer,
|
| 1136 |
+
example_imgs,
|
| 1137 |
+
],
|
| 1138 |
+
)
|
| 1139 |
+
preprocessed = gr.State()
|
| 1140 |
+
preprocess_btn.click(
|
| 1141 |
+
lambda r, *args: r.preprocess(*args),
|
| 1142 |
+
inputs=[renderer, input_imgs],
|
| 1143 |
+
outputs=[
|
| 1144 |
+
preprocessed,
|
| 1145 |
+
preprocess_progress,
|
| 1146 |
+
chunk_strategy,
|
| 1147 |
+
],
|
| 1148 |
+
show_progress_on=[preprocess_progress],
|
| 1149 |
+
concurrency_id="gpu_queue",
|
| 1150 |
+
)
|
| 1151 |
+
preprocess_btn.click(
|
| 1152 |
+
lambda: gr.update(visible=True),
|
| 1153 |
+
outputs=[preprocess_progress],
|
| 1154 |
+
)
|
| 1155 |
+
preprocessed.change(
|
| 1156 |
+
lambda r, *args: r.visualize_scene(*args),
|
| 1157 |
+
inputs=[renderer, preprocessed],
|
| 1158 |
+
)
|
| 1159 |
+
with gr.Row():
|
| 1160 |
+
seed = gr.Number(value=23, label="Random seed")
|
| 1161 |
+
chunk_strategy.render()
|
| 1162 |
+
cfg = gr.Slider(1.0, 7.0, value=3.0, label="CFG value")
|
| 1163 |
+
with gr.Row():
|
| 1164 |
+
camera_scale = gr.Slider(
|
| 1165 |
+
0.1,
|
| 1166 |
+
15.0,
|
| 1167 |
+
value=2.0,
|
| 1168 |
+
label="Camera scale (useful for single-view input)",
|
| 1169 |
+
)
|
| 1170 |
+
with gr.Group():
|
| 1171 |
+
output_data_dir = gr.Textbox(label="Output data directory")
|
| 1172 |
+
output_data_btn = gr.Button("Export output data")
|
| 1173 |
+
output_data_btn.click(
|
| 1174 |
+
lambda r, *args: r.export_output_data(*args),
|
| 1175 |
+
inputs=[renderer, preprocessed, output_data_dir],
|
| 1176 |
+
)
|
| 1177 |
+
with gr.Column():
|
| 1178 |
+
with gr.Group():
|
| 1179 |
+
abort_btn = gr.Button("Abort rendering", visible=False)
|
| 1180 |
+
render_btn.render()
|
| 1181 |
+
render_progress = gr.Textbox(
|
| 1182 |
+
label="", visible=False, interactive=False
|
| 1183 |
+
)
|
| 1184 |
+
output_video = gr.Video(
|
| 1185 |
+
label="Output", interactive=False, autoplay=True, loop=True
|
| 1186 |
+
)
|
| 1187 |
+
render_btn.click(
|
| 1188 |
+
lambda r, *args: (yield from r.render(*args)),
|
| 1189 |
+
inputs=[
|
| 1190 |
+
renderer,
|
| 1191 |
+
preprocessed,
|
| 1192 |
+
session_hash,
|
| 1193 |
+
seed,
|
| 1194 |
+
chunk_strategy,
|
| 1195 |
+
cfg,
|
| 1196 |
+
gr.State(),
|
| 1197 |
+
gr.State(),
|
| 1198 |
+
gr.State(),
|
| 1199 |
+
camera_scale,
|
| 1200 |
+
],
|
| 1201 |
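                    # Note: the three gr.State() placeholders above stand in for preset_traj,
                    # num_frames, and zoom_factor, which only exist in the Basic tab; render()
                    # receives None for them here.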
+
outputs=[
|
| 1202 |
+
output_video,
|
| 1203 |
+
render_btn,
|
| 1204 |
+
abort_btn,
|
| 1205 |
+
render_progress,
|
| 1206 |
+
],
|
| 1207 |
+
show_progress_on=[render_progress],
|
| 1208 |
+
concurrency_id="gpu_queue",
|
| 1209 |
+
)
|
| 1210 |
+
render_btn.click(
|
| 1211 |
+
lambda: [
|
| 1212 |
+
gr.update(visible=False),
|
| 1213 |
+
gr.update(visible=True),
|
| 1214 |
+
gr.update(visible=True),
|
| 1215 |
+
],
|
| 1216 |
+
outputs=[render_btn, abort_btn, render_progress],
|
| 1217 |
+
)
|
| 1218 |
+
abort_btn.click(set_abort_event)
|
| 1219 |
+
|
| 1220 |
+
# Register the session initialization and cleanup functions.
|
| 1221 |
+
app.load(
|
| 1222 |
+
start_server_and_abort_event,
|
| 1223 |
+
outputs=[renderer, viewport, session_hash],
|
| 1224 |
+
)
|
| 1225 |
+
app.unload(stop_server_and_abort_event)
|
| 1226 |
+
|
| 1227 |
+
app.queue(max_size=5).launch(
|
| 1228 |
+
share=share,
|
| 1229 |
+
server_port=server_port,
|
| 1230 |
+
show_error=True,
|
| 1231 |
+
allowed_paths=[WORK_DIR],
|
| 1232 |
+
# Badge rendering will be broken otherwise.
|
| 1233 |
+
ssr_mode=False,
|
| 1234 |
+
)
|
| 1235 |
+
|
| 1236 |
+
|
| 1237 |
+
if __name__ == "__main__":
|
| 1238 |
+
tyro.cli(main)
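    # Usage sketch (not part of the original file): tyro maps main()'s keyword arguments to
    # CLI flags, so launching on a fixed port without a public share link would look roughly
    # like `python demo_gr.py --server-port 7860 --no-share` (flag names assumed from tyro's
    # default naming scheme).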
|
requirements.txt
ADDED
|
@@ -0,0 +1,35 @@
|
| 1 |
+
--extra-index-url https://download.pytorch.org/whl/nightly/cu124
|
| 2 |
+
torch==2.7.0.dev20250218+cu124
|
| 3 |
+
torchvision==0.22.0.dev20250219+cu124
|
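# Note: the two nightly cu124 wheels above are only resolvable through the extra index URL
# declared on the first line of this file.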
| 4 |
+
roma
|
| 5 |
+
gradio==5.17.0
|
| 6 |
+
matplotlib
|
| 7 |
+
tqdm
|
| 8 |
+
opencv-python
|
| 9 |
+
scipy
|
| 10 |
+
einops
|
| 11 |
+
trimesh
|
| 12 |
+
tensorboard
|
| 13 |
+
git+https://github.com/jensenz-sai/pycolmap@543266bc316df2fe407b3a33d454b310b1641042
|
| 14 |
+
pyglet<2
|
| 15 |
+
huggingface-hub[torch]>=0.22
|
| 16 |
+
pillow-heif # add heif/heic image support
|
| 17 |
+
pyrender # for rendering depths in scannetpp
|
| 18 |
+
kapture # for visloc data loading
|
| 19 |
+
kapture-localization
|
| 20 |
+
numpy==1.24.4
|
| 21 |
+
numpy-quaternion
|
| 22 |
+
pycolmap # for pnp
|
| 23 |
+
poselib # for pnp
|
| 24 |
+
viser
|
| 25 |
+
tyro
|
| 26 |
+
ninja
|
| 27 |
+
colorama
|
| 28 |
+
pytorch-lightning
|
| 29 |
+
splines
|
| 30 |
+
diffusers
|
| 31 |
+
kornia
|
| 32 |
+
open-clip-torch
|
| 33 |
+
accelerate
|
| 34 |
+
pyav
|
| 35 |
+
imageio[ffmpeg]
|
seva/__init__.py
ADDED
|
File without changes
|
seva/data_io.py
ADDED
|
@@ -0,0 +1,553 @@
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import os.path as osp
|
| 4 |
+
from glob import glob
|
| 5 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 6 |
+
|
| 7 |
+
import cv2
|
| 8 |
+
import imageio.v3 as iio
|
| 9 |
+
import numpy as np
|
| 10 |
+
import torch
|
| 11 |
+
|
| 12 |
+
from seva.geometry import (
|
| 13 |
+
align_principle_axes,
|
| 14 |
+
similarity_from_cameras,
|
| 15 |
+
transform_cameras,
|
| 16 |
+
transform_points,
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _get_rel_paths(path_dir: str) -> List[str]:
|
| 21 |
+
"""Recursively get relative paths of files in a directory."""
|
| 22 |
+
paths = []
|
| 23 |
+
for dp, _, fn in os.walk(path_dir):
|
| 24 |
+
for f in fn:
|
| 25 |
+
paths.append(os.path.relpath(os.path.join(dp, f), path_dir))
|
| 26 |
+
return paths
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class BaseParser(object):
|
| 30 |
+
def __init__(
|
| 31 |
+
self,
|
| 32 |
+
data_dir: str,
|
| 33 |
+
factor: int = 1,
|
| 34 |
+
normalize: bool = False,
|
| 35 |
+
test_every: Optional[int] = 8,
|
| 36 |
+
):
|
| 37 |
+
self.data_dir = data_dir
|
| 38 |
+
self.factor = factor
|
| 39 |
+
self.normalize = normalize
|
| 40 |
+
self.test_every = test_every
|
| 41 |
+
|
| 42 |
+
self.image_names: List[str] = [] # (num_images,)
|
| 43 |
+
self.image_paths: List[str] = [] # (num_images,)
|
| 44 |
+
self.camtoworlds: np.ndarray = np.zeros((0, 4, 4)) # (num_images, 4, 4)
|
| 45 |
+
self.camera_ids: List[int] = [] # (num_images,)
|
| 46 |
+
self.Ks_dict: Dict[int, np.ndarray] = {} # Dict of camera_id -> K
|
| 47 |
+
self.params_dict: Dict[int, np.ndarray] = {} # Dict of camera_id -> params
|
| 48 |
+
self.imsize_dict: Dict[
|
| 49 |
+
int, Tuple[int, int]
|
| 50 |
+
] = {} # Dict of camera_id -> (width, height)
|
| 51 |
+
self.points: np.ndarray = np.zeros((0, 3)) # (num_points, 3)
|
| 52 |
+
self.points_err: np.ndarray = np.zeros((0,)) # (num_points,)
|
| 53 |
+
self.points_rgb: np.ndarray = np.zeros((0, 3)) # (num_points, 3)
|
| 54 |
+
self.point_indices: Dict[str, np.ndarray] = {} # Dict of image_name -> (M,)
|
| 55 |
+
self.transform: np.ndarray = np.zeros((4, 4)) # (4, 4)
|
| 56 |
+
|
| 57 |
+
self.mapx_dict: Dict[int, np.ndarray] = {} # Dict of camera_id -> (H, W)
|
| 58 |
+
self.mapy_dict: Dict[int, np.ndarray] = {} # Dict of camera_id -> (H, W)
|
| 59 |
+
self.roi_undist_dict: Dict[int, Tuple[int, int, int, int]] = (
|
| 60 |
+
dict()
|
| 61 |
+
) # Dict of camera_id -> (x, y, w, h)
|
| 62 |
+
self.scene_scale: float = 1.0
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class DirectParser(BaseParser):
|
| 66 |
+
def __init__(
|
| 67 |
+
self,
|
| 68 |
+
imgs: List[np.ndarray],
|
| 69 |
+
c2ws: np.ndarray,
|
| 70 |
+
Ks: np.ndarray,
|
| 71 |
+
points: Optional[np.ndarray] = None,
|
| 72 |
+
points_rgb: Optional[np.ndarray] = None, # uint8
|
| 73 |
+
mono_disps: Optional[List[np.ndarray]] = None,
|
| 74 |
+
normalize: bool = False,
|
| 75 |
+
test_every: Optional[int] = None,
|
| 76 |
+
):
|
| 77 |
+
super().__init__("", 1, normalize, test_every)
|
| 78 |
+
|
| 79 |
+
self.image_names = [f"{i:06d}" for i in range(len(imgs))]
|
| 80 |
+
self.image_paths = ["null" for _ in range(len(imgs))]
|
| 81 |
+
self.camtoworlds = c2ws
|
| 82 |
+
self.camera_ids = [i for i in range(len(imgs))]
|
| 83 |
+
self.Ks_dict = {i: K for i, K in enumerate(Ks)}
|
| 84 |
+
self.imsize_dict = {
|
| 85 |
+
i: (img.shape[1], img.shape[0]) for i, img in enumerate(imgs)
|
| 86 |
+
}
|
| 87 |
+
if points is not None:
|
| 88 |
+
self.points = points
|
| 89 |
+
assert points_rgb is not None
|
| 90 |
+
self.points_rgb = points_rgb
|
| 91 |
+
self.points_err = np.zeros((len(points),))
|
| 92 |
+
|
| 93 |
+
self.imgs = imgs
|
| 94 |
+
self.mono_disps = mono_disps
|
| 95 |
+
|
| 96 |
+
# Normalize the world space.
|
| 97 |
+
if normalize:
|
| 98 |
+
T1 = similarity_from_cameras(self.camtoworlds)
|
| 99 |
+
self.camtoworlds = transform_cameras(T1, self.camtoworlds)
|
| 100 |
+
|
| 101 |
+
if points is not None:
|
| 102 |
+
self.points = transform_points(T1, self.points)
|
| 103 |
+
T2 = align_principle_axes(self.points)
|
| 104 |
+
self.camtoworlds = transform_cameras(T2, self.camtoworlds)
|
| 105 |
+
self.points = transform_points(T2, self.points)
|
| 106 |
+
else:
|
| 107 |
+
T2 = np.eye(4)
|
| 108 |
+
|
| 109 |
+
self.transform = T2 @ T1
|
| 110 |
+
else:
|
| 111 |
+
self.transform = np.eye(4)
|
| 112 |
+
|
| 113 |
+
# size of the scene measured by cameras
|
| 114 |
+
camera_locations = self.camtoworlds[:, :3, 3]
|
| 115 |
+
scene_center = np.mean(camera_locations, axis=0)
|
| 116 |
+
dists = np.linalg.norm(camera_locations - scene_center, axis=1)
|
| 117 |
+
self.scene_scale = np.max(dists)
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
class COLMAPParser(BaseParser):
|
| 121 |
+
"""COLMAP parser."""
|
| 122 |
+
|
| 123 |
+
def __init__(
|
| 124 |
+
self,
|
| 125 |
+
data_dir: str,
|
| 126 |
+
factor: int = 1,
|
| 127 |
+
normalize: bool = False,
|
| 128 |
+
test_every: Optional[int] = 8,
|
| 129 |
+
image_folder: str = "images",
|
| 130 |
+
colmap_folder: str = "sparse/0",
|
| 131 |
+
):
|
| 132 |
+
super().__init__(data_dir, factor, normalize, test_every)
|
| 133 |
+
|
| 134 |
+
colmap_dir = os.path.join(data_dir, colmap_folder)
|
| 135 |
+
assert os.path.exists(
|
| 136 |
+
colmap_dir
|
| 137 |
+
), f"COLMAP directory {colmap_dir} does not exist."
|
| 138 |
+
|
| 139 |
+
try:
|
| 140 |
+
from pycolmap import SceneManager
|
| 141 |
+
except ImportError:
|
| 142 |
+
raise ImportError(
|
| 143 |
+
"Please install pycolmap to use the data parsers: "
|
| 144 |
+
" `pip install git+https://github.com/jensenz-sai/pycolmap.git@543266bc316df2fe407b3a33d454b310b1641042`"
|
| 145 |
+
)
|
| 146 |
+
|
| 147 |
+
manager = SceneManager(colmap_dir)
|
| 148 |
+
manager.load_cameras()
|
| 149 |
+
manager.load_images()
|
| 150 |
+
manager.load_points3D()
|
| 151 |
+
|
| 152 |
+
# Extract extrinsic matrices in world-to-camera format.
|
| 153 |
+
imdata = manager.images
|
| 154 |
+
w2c_mats = []
|
| 155 |
+
camera_ids = []
|
| 156 |
+
Ks_dict = dict()
|
| 157 |
+
params_dict = dict()
|
| 158 |
+
imsize_dict = dict() # width, height
|
| 159 |
+
bottom = np.array([0, 0, 0, 1]).reshape(1, 4)
|
| 160 |
+
for k in imdata:
|
| 161 |
+
im = imdata[k]
|
| 162 |
+
rot = im.R()
|
| 163 |
+
trans = im.tvec.reshape(3, 1)
|
| 164 |
+
w2c = np.concatenate([np.concatenate([rot, trans], 1), bottom], axis=0)
|
| 165 |
+
w2c_mats.append(w2c)
|
| 166 |
+
|
| 167 |
+
# support different camera intrinsics
|
| 168 |
+
camera_id = im.camera_id
|
| 169 |
+
camera_ids.append(camera_id)
|
| 170 |
+
|
| 171 |
+
# camera intrinsics
|
| 172 |
+
cam = manager.cameras[camera_id]
|
| 173 |
+
fx, fy, cx, cy = cam.fx, cam.fy, cam.cx, cam.cy
|
| 174 |
+
K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
|
| 175 |
+
K[:2, :] /= factor
|
| 176 |
+
Ks_dict[camera_id] = K
|
| 177 |
+
|
| 178 |
+
# Get distortion parameters.
|
| 179 |
+
type_ = cam.camera_type
|
| 180 |
+
if type_ == 0 or type_ == "SIMPLE_PINHOLE":
|
| 181 |
+
params = np.empty(0, dtype=np.float32)
|
| 182 |
+
camtype = "perspective"
|
| 183 |
+
elif type_ == 1 or type_ == "PINHOLE":
|
| 184 |
+
params = np.empty(0, dtype=np.float32)
|
| 185 |
+
camtype = "perspective"
|
| 186 |
+
if type_ == 2 or type_ == "SIMPLE_RADIAL":
|
| 187 |
+
params = np.array([cam.k1, 0.0, 0.0, 0.0], dtype=np.float32)
|
| 188 |
+
camtype = "perspective"
|
| 189 |
+
elif type_ == 3 or type_ == "RADIAL":
|
| 190 |
+
params = np.array([cam.k1, cam.k2, 0.0, 0.0], dtype=np.float32)
|
| 191 |
+
camtype = "perspective"
|
| 192 |
+
elif type_ == 4 or type_ == "OPENCV":
|
| 193 |
+
params = np.array([cam.k1, cam.k2, cam.p1, cam.p2], dtype=np.float32)
|
| 194 |
+
camtype = "perspective"
|
| 195 |
+
elif type_ == 5 or type_ == "OPENCV_FISHEYE":
|
| 196 |
+
params = np.array([cam.k1, cam.k2, cam.k3, cam.k4], dtype=np.float32)
|
| 197 |
+
camtype = "fisheye"
|
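            # Note: COLMAP camera_type ids map to 0/1 pinhole (no distortion), 2 SIMPLE_RADIAL,
            # 3 RADIAL, 4 OPENCV, and 5 OPENCV_FISHEYE; only perspective models pass the
            # assertion below, and any distortion parameters are undistorted away later.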
| 198 |
+
assert (
|
| 199 |
+
camtype == "perspective" # type: ignore
|
| 200 |
+
), f"Only support perspective camera model, got {type_}"
|
| 201 |
+
|
| 202 |
+
params_dict[camera_id] = params # type: ignore
|
| 203 |
+
|
| 204 |
+
# image size
|
| 205 |
+
imsize_dict[camera_id] = (cam.width // factor, cam.height // factor)
|
| 206 |
+
|
| 207 |
+
print(
|
| 208 |
+
f"[Parser] {len(imdata)} images, taken by {len(set(camera_ids))} cameras."
|
| 209 |
+
)
|
| 210 |
+
|
| 211 |
+
if len(imdata) == 0:
|
| 212 |
+
raise ValueError("No images found in COLMAP.")
|
| 213 |
+
if not (type_ == 0 or type_ == 1): # type: ignore
|
| 214 |
+
print("Warning: COLMAP Camera is not PINHOLE. Images have distortion.")
|
| 215 |
+
|
| 216 |
+
w2c_mats = np.stack(w2c_mats, axis=0)
|
| 217 |
+
|
| 218 |
+
# Convert extrinsics to camera-to-world.
|
| 219 |
+
camtoworlds = np.linalg.inv(w2c_mats)
|
| 220 |
+
|
| 221 |
+
# Image names from COLMAP. No need for permuting the poses according to
|
| 222 |
+
# image names anymore.
|
| 223 |
+
image_names = [imdata[k].name for k in imdata]
|
| 224 |
+
|
| 225 |
+
# Previous NeRF results were generated with images sorted by filename,
|
| 226 |
+
# so sort here to ensure metrics are reported on the same test set.
|
| 227 |
+
inds = np.argsort(image_names)
|
| 228 |
+
image_names = [image_names[i] for i in inds]
|
| 229 |
+
camtoworlds = camtoworlds[inds]
|
| 230 |
+
camera_ids = [camera_ids[i] for i in inds]
|
| 231 |
+
|
| 232 |
+
# Load images.
|
| 233 |
+
if factor > 1:
|
| 234 |
+
image_dir_suffix = f"_{factor}"
|
| 235 |
+
else:
|
| 236 |
+
image_dir_suffix = ""
|
| 237 |
+
colmap_image_dir = os.path.join(data_dir, image_folder)
|
| 238 |
+
image_dir = os.path.join(data_dir, image_folder + image_dir_suffix)
|
| 239 |
+
for d in [image_dir, colmap_image_dir]:
|
| 240 |
+
if not os.path.exists(d):
|
| 241 |
+
raise ValueError(f"Image folder {d} does not exist.")
|
| 242 |
+
|
| 243 |
+
# Downsampled images may have different names vs images used for COLMAP,
|
| 244 |
+
# so we need to map between the two sorted lists of files.
|
| 245 |
+
colmap_files = sorted(_get_rel_paths(colmap_image_dir))
|
| 246 |
+
image_files = sorted(_get_rel_paths(image_dir))
|
| 247 |
+
colmap_to_image = dict(zip(colmap_files, image_files))
|
| 248 |
+
image_paths = [os.path.join(image_dir, colmap_to_image[f]) for f in image_names]
|
| 249 |
+
|
| 250 |
+
# 3D points and {image_name -> [point_idx]}
|
| 251 |
+
points = manager.points3D.astype(np.float32) # type: ignore
|
| 252 |
+
points_err = manager.point3D_errors.astype(np.float32) # type: ignore
|
| 253 |
+
points_rgb = manager.point3D_colors.astype(np.uint8) # type: ignore
|
| 254 |
+
point_indices = dict()
|
| 255 |
+
|
| 256 |
+
image_id_to_name = {v: k for k, v in manager.name_to_image_id.items()}
|
| 257 |
+
for point_id, data in manager.point3D_id_to_images.items():
|
| 258 |
+
for image_id, _ in data:
|
| 259 |
+
image_name = image_id_to_name[image_id]
|
| 260 |
+
point_idx = manager.point3D_id_to_point3D_idx[point_id]
|
| 261 |
+
point_indices.setdefault(image_name, []).append(point_idx)
|
| 262 |
+
point_indices = {
|
| 263 |
+
k: np.array(v).astype(np.int32) for k, v in point_indices.items()
|
| 264 |
+
}
|
| 265 |
+
|
| 266 |
+
# Normalize the world space.
|
| 267 |
+
if normalize:
|
| 268 |
+
T1 = similarity_from_cameras(camtoworlds)
|
| 269 |
+
camtoworlds = transform_cameras(T1, camtoworlds)
|
| 270 |
+
points = transform_points(T1, points)
|
| 271 |
+
|
| 272 |
+
T2 = align_principle_axes(points)
|
| 273 |
+
camtoworlds = transform_cameras(T2, camtoworlds)
|
| 274 |
+
points = transform_points(T2, points)
|
| 275 |
+
|
| 276 |
+
transform = T2 @ T1
|
| 277 |
+
else:
|
| 278 |
+
transform = np.eye(4)
|
| 279 |
+
|
| 280 |
+
self.image_names = image_names # List[str], (num_images,)
|
| 281 |
+
self.image_paths = image_paths # List[str], (num_images,)
|
| 282 |
+
self.camtoworlds = camtoworlds # np.ndarray, (num_images, 4, 4)
|
| 283 |
+
self.camera_ids = camera_ids # List[int], (num_images,)
|
| 284 |
+
self.Ks_dict = Ks_dict # Dict of camera_id -> K
|
| 285 |
+
self.params_dict = params_dict # Dict of camera_id -> params
|
| 286 |
+
self.imsize_dict = imsize_dict # Dict of camera_id -> (width, height)
|
| 287 |
+
self.points = points # np.ndarray, (num_points, 3)
|
| 288 |
+
self.points_err = points_err # np.ndarray, (num_points,)
|
| 289 |
+
self.points_rgb = points_rgb # np.ndarray, (num_points, 3)
|
| 290 |
+
self.point_indices = point_indices # Dict[str, np.ndarray], image_name -> [M,]
|
| 291 |
+
self.transform = transform # np.ndarray, (4, 4)
|
| 292 |
+
|
| 293 |
+
# undistortion
|
| 294 |
+
self.mapx_dict = dict()
|
| 295 |
+
self.mapy_dict = dict()
|
| 296 |
+
self.roi_undist_dict = dict()
|
| 297 |
+
for camera_id in self.params_dict.keys():
|
| 298 |
+
params = self.params_dict[camera_id]
|
| 299 |
+
if len(params) == 0:
|
| 300 |
+
continue # no distortion
|
| 301 |
+
assert camera_id in self.Ks_dict, f"Missing K for camera {camera_id}"
|
| 302 |
+
assert (
|
| 303 |
+
camera_id in self.params_dict
|
| 304 |
+
), f"Missing params for camera {camera_id}"
|
| 305 |
+
K = self.Ks_dict[camera_id]
|
| 306 |
+
width, height = self.imsize_dict[camera_id]
|
| 307 |
+
K_undist, roi_undist = cv2.getOptimalNewCameraMatrix(
|
| 308 |
+
K, params, (width, height), 0
|
| 309 |
+
)
|
| 310 |
+
mapx, mapy = cv2.initUndistortRectifyMap(
|
| 311 |
+
K,
|
| 312 |
+
params,
|
| 313 |
+
None,
|
| 314 |
+
K_undist,
|
| 315 |
+
(width, height),
|
| 316 |
+
cv2.CV_32FC1, # type: ignore
|
| 317 |
+
)
|
| 318 |
+
self.Ks_dict[camera_id] = K_undist
|
| 319 |
+
self.mapx_dict[camera_id] = mapx
|
| 320 |
+
self.mapy_dict[camera_id] = mapy
|
| 321 |
+
self.roi_undist_dict[camera_id] = roi_undist # type: ignore
|
| 322 |
+
|
| 323 |
+
# size of the scene measured by cameras
|
| 324 |
+
camera_locations = camtoworlds[:, :3, 3]
|
| 325 |
+
scene_center = np.mean(camera_locations, axis=0)
|
| 326 |
+
dists = np.linalg.norm(camera_locations - scene_center, axis=1)
|
| 327 |
+
self.scene_scale = np.max(dists)
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
class ReconfusionParser(BaseParser):
|
| 331 |
+
def __init__(self, data_dir: str, normalize: bool = False):
|
| 332 |
+
super().__init__(data_dir, 1, normalize, test_every=None)
|
| 333 |
+
|
| 334 |
+
def get_num(p):
|
| 335 |
+
return p.split("_")[-1].removesuffix(".json")
|
| 336 |
+
|
| 337 |
+
splits_per_num_input_frames = {}
|
| 338 |
+
num_input_frames_list = [
|
| 339 |
+
int(get_num(p)) if get_num(p).isdigit() else get_num(p)
|
| 340 |
+
for p in sorted(glob(osp.join(data_dir, "train_test_split_*.json")))
|
| 341 |
+
]
|
| 342 |
+
for num_input_frames in num_input_frames_list:
|
| 343 |
+
with open(
|
| 344 |
+
osp.join(
|
| 345 |
+
data_dir,
|
| 346 |
+
f"train_test_split_{num_input_frames}.json",
|
| 347 |
+
)
|
| 348 |
+
) as f:
|
| 349 |
+
splits_per_num_input_frames[num_input_frames] = json.load(f)
|
| 350 |
+
self.splits_per_num_input_frames = splits_per_num_input_frames
|
| 351 |
+
|
| 352 |
+
with open(osp.join(data_dir, "transforms.json")) as f:
|
| 353 |
+
metadata = json.load(f)
|
| 354 |
+
|
| 355 |
+
image_names, image_paths, camtoworlds = [], [], []
|
| 356 |
+
for frame in metadata["frames"]:
|
| 357 |
+
if frame["file_path"] is None:
|
| 358 |
+
image_path = image_name = None
|
| 359 |
+
else:
|
| 360 |
+
image_path = osp.join(data_dir, frame["file_path"])
|
| 361 |
+
image_name = osp.basename(image_path)
|
| 362 |
+
image_paths.append(image_path)
|
| 363 |
+
image_names.append(image_name)
|
| 364 |
+
camtoworld = np.array(frame["transform_matrix"])
|
| 365 |
+
if "applied_transform" in metadata:
|
| 366 |
+
applied_transform = np.concatenate(
|
| 367 |
+
[metadata["applied_transform"], [[0, 0, 0, 1]]], axis=0
|
| 368 |
+
)
|
| 369 |
+
camtoworld = applied_transform @ camtoworld
|
| 370 |
+
camtoworlds.append(camtoworld)
|
| 371 |
+
camtoworlds = np.array(camtoworlds)
|
| 372 |
+
camtoworlds[:, :, [1, 2]] *= -1
|
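        # Note: transforms.json poses follow the OpenGL convention (y up, z backward); negating
        # the y and z columns of each c2w matrix converts them to the OpenCV convention
        # (y down, z forward) used elsewhere in this codebase.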
| 373 |
+
|
| 374 |
+
# Normalize the world space.
|
| 375 |
+
if normalize:
|
| 376 |
+
T1 = similarity_from_cameras(camtoworlds)
|
| 377 |
+
camtoworlds = transform_cameras(T1, camtoworlds)
|
| 378 |
+
self.transform = T1
|
| 379 |
+
else:
|
| 380 |
+
self.transform = np.eye(4)
|
| 381 |
+
|
| 382 |
+
self.image_names = image_names
|
| 383 |
+
self.image_paths = image_paths
|
| 384 |
+
self.camtoworlds = camtoworlds
|
| 385 |
+
self.camera_ids = list(range(len(image_paths)))
|
| 386 |
+
self.Ks_dict = {
|
| 387 |
+
i: np.array(
|
| 388 |
+
[
|
| 389 |
+
[
|
| 390 |
+
metadata.get("fl_x", frame.get("fl_x", None)),
|
| 391 |
+
0.0,
|
| 392 |
+
metadata.get("cx", frame.get("cx", None)),
|
| 393 |
+
],
|
| 394 |
+
[
|
| 395 |
+
0.0,
|
| 396 |
+
metadata.get("fl_y", frame.get("fl_y", None)),
|
| 397 |
+
metadata.get("cy", frame.get("cy", None)),
|
| 398 |
+
],
|
| 399 |
+
[0.0, 0.0, 1.0],
|
| 400 |
+
]
|
| 401 |
+
)
|
| 402 |
+
for i, frame in enumerate(metadata["frames"])
|
| 403 |
+
}
|
| 404 |
+
self.imsize_dict = {
|
| 405 |
+
i: (
|
| 406 |
+
metadata.get("w", frame.get("w", None)),
|
| 407 |
+
metadata.get("h", frame.get("h", None)),
|
| 408 |
+
)
|
| 409 |
+
for i, frame in enumerate(metadata["frames"])
|
| 410 |
+
}
|
| 411 |
+
# When num_input_frames is None, use all frames for both training and
|
| 412 |
+
# testing.
|
| 413 |
+
# self.splits_per_num_input_frames[None] = {
|
| 414 |
+
# "train_ids": list(range(len(image_paths))),
|
| 415 |
+
# "test_ids": list(range(len(image_paths))),
|
| 416 |
+
# }
|
| 417 |
+
|
| 418 |
+
# size of the scene measured by cameras
|
| 419 |
+
camera_locations = camtoworlds[:, :3, 3]
|
| 420 |
+
scene_center = np.mean(camera_locations, axis=0)
|
| 421 |
+
dists = np.linalg.norm(camera_locations - scene_center, axis=1)
|
| 422 |
+
self.scene_scale = np.max(dists)
|
| 423 |
+
|
| 424 |
+
self.bounds = None
|
| 425 |
+
if osp.exists(osp.join(data_dir, "bounds.npy")):
|
| 426 |
+
self.bounds = np.load(osp.join(data_dir, "bounds.npy"))
|
| 427 |
+
scaling = np.linalg.norm(self.transform[0, :3])
|
| 428 |
+
self.bounds = self.bounds / scaling
|
| 429 |
+
|
| 430 |
+
|
| 431 |
+
class Dataset(torch.utils.data.Dataset):
|
| 432 |
+
"""A simple dataset class."""
|
| 433 |
+
|
| 434 |
+
def __init__(
|
| 435 |
+
self,
|
| 436 |
+
parser: BaseParser,
|
| 437 |
+
split: str = "train",
|
| 438 |
+
num_input_frames: Optional[int] = None,
|
| 439 |
+
patch_size: Optional[int] = None,
|
| 440 |
+
load_depths: bool = False,
|
| 441 |
+
load_mono_disps: bool = False,
|
| 442 |
+
):
|
| 443 |
+
self.parser = parser
|
| 444 |
+
self.split = split
|
| 445 |
+
self.num_input_frames = num_input_frames
|
| 446 |
+
self.patch_size = patch_size
|
| 447 |
+
self.load_depths = load_depths
|
| 448 |
+
self.load_mono_disps = load_mono_disps
|
| 449 |
+
if load_mono_disps:
|
| 450 |
+
assert isinstance(parser, DirectParser)
|
| 451 |
+
assert parser.mono_disps is not None
|
| 452 |
+
if isinstance(parser, ReconfusionParser):
|
| 453 |
+
ids_per_split = parser.splits_per_num_input_frames[num_input_frames]
|
| 454 |
+
self.indices = ids_per_split[
|
| 455 |
+
"train_ids" if split == "train" else "test_ids"
|
| 456 |
+
]
|
| 457 |
+
else:
|
| 458 |
+
indices = np.arange(len(self.parser.image_names))
|
| 459 |
+
if split == "train":
|
| 460 |
+
self.indices = (
|
| 461 |
+
indices[indices % self.parser.test_every != 0]
|
| 462 |
+
if self.parser.test_every is not None
|
| 463 |
+
else indices
|
| 464 |
+
)
|
| 465 |
+
else:
|
| 466 |
+
self.indices = (
|
| 467 |
+
indices[indices % self.parser.test_every == 0]
|
| 468 |
+
if self.parser.test_every is not None
|
| 469 |
+
else indices
|
| 470 |
+
)
|
| 471 |
+
|
| 472 |
+
def __len__(self):
|
| 473 |
+
return len(self.indices)
|
| 474 |
+
|
| 475 |
+
def __getitem__(self, item: int) -> Dict[str, Any]:
|
| 476 |
+
index = self.indices[item]
|
| 477 |
+
if isinstance(self.parser, DirectParser):
|
| 478 |
+
image = self.parser.imgs[index]
|
| 479 |
+
else:
|
| 480 |
+
image = iio.imread(self.parser.image_paths[index])[..., :3]
|
| 481 |
+
camera_id = self.parser.camera_ids[index]
|
| 482 |
+
K = self.parser.Ks_dict[camera_id].copy() # undistorted K
|
| 483 |
+
params = self.parser.params_dict.get(camera_id, None)
|
| 484 |
+
camtoworlds = self.parser.camtoworlds[index]
|
| 485 |
+
|
| 486 |
+
x, y, w, h = 0, 0, image.shape[1], image.shape[0]
|
| 487 |
+
if params is not None and len(params) > 0:
|
| 488 |
+
# Images are distorted. Undistort them.
|
| 489 |
+
mapx, mapy = (
|
| 490 |
+
self.parser.mapx_dict[camera_id],
|
| 491 |
+
self.parser.mapy_dict[camera_id],
|
| 492 |
+
)
|
| 493 |
+
image = cv2.remap(image, mapx, mapy, cv2.INTER_LINEAR)
|
| 494 |
+
x, y, w, h = self.parser.roi_undist_dict[camera_id]
|
| 495 |
+
image = image[y : y + h, x : x + w]
|
| 496 |
+
|
| 497 |
+
if self.patch_size is not None:
|
| 498 |
+
# Random crop.
|
| 499 |
+
h, w = image.shape[:2]
|
| 500 |
+
x = np.random.randint(0, max(w - self.patch_size, 1))
|
| 501 |
+
y = np.random.randint(0, max(h - self.patch_size, 1))
|
| 502 |
+
image = image[y : y + self.patch_size, x : x + self.patch_size]
|
| 503 |
+
K[0, 2] -= x
|
| 504 |
+
K[1, 2] -= y
|
| 505 |
+
|
| 506 |
+
data = {
|
| 507 |
+
"K": torch.from_numpy(K).float(),
|
| 508 |
+
"camtoworld": torch.from_numpy(camtoworlds).float(),
|
| 509 |
+
"image": torch.from_numpy(image).float(),
|
| 510 |
+
"image_id": item, # the index of the image in the dataset
|
| 511 |
+
}
|
| 512 |
+
|
| 513 |
+
if self.load_depths:
|
| 514 |
+
# projected points to image plane to get depths
|
| 515 |
+
worldtocams = np.linalg.inv(camtoworlds)
|
| 516 |
+
image_name = self.parser.image_names[index]
|
| 517 |
+
point_indices = self.parser.point_indices[image_name]
|
| 518 |
+
points_world = self.parser.points[point_indices]
|
| 519 |
+
points_cam = (worldtocams[:3, :3] @ points_world.T + worldtocams[:3, 3:4]).T
|
| 520 |
+
points_proj = (K @ points_cam.T).T
|
| 521 |
+
points = points_proj[:, :2] / points_proj[:, 2:3] # (M, 2)
|
| 522 |
+
depths = points_cam[:, 2] # (M,)
|
| 523 |
+
if self.patch_size is not None:
|
| 524 |
+
points[:, 0] -= x
|
| 525 |
+
points[:, 1] -= y
|
| 526 |
+
# filter out points outside the image
|
| 527 |
+
selector = (
|
| 528 |
+
(points[:, 0] >= 0)
|
| 529 |
+
& (points[:, 0] < image.shape[1])
|
| 530 |
+
& (points[:, 1] >= 0)
|
| 531 |
+
& (points[:, 1] < image.shape[0])
|
| 532 |
+
& (depths > 0)
|
| 533 |
+
)
|
| 534 |
+
points = points[selector]
|
| 535 |
+
depths = depths[selector]
|
| 536 |
+
data["points"] = torch.from_numpy(points).float()
|
| 537 |
+
data["depths"] = torch.from_numpy(depths).float()
|
| 538 |
+
if self.load_mono_disps:
|
| 539 |
+
data["mono_disps"] = torch.from_numpy(self.parser.mono_disps[index]).float() # type: ignore
|
| 540 |
+
|
| 541 |
+
return data
|
| 542 |
+
|
| 543 |
+
|
| 544 |
+
def get_parser(parser_type: str, **kwargs) -> BaseParser:
|
| 545 |
+
if parser_type == "colmap":
|
| 546 |
+
parser = COLMAPParser(**kwargs)
|
| 547 |
+
elif parser_type == "direct":
|
| 548 |
+
parser = DirectParser(**kwargs)
|
| 549 |
+
elif parser_type == "reconfusion":
|
| 550 |
+
parser = ReconfusionParser(**kwargs)
|
| 551 |
+
else:
|
| 552 |
+
raise ValueError(f"Unknown parser type: {parser_type}")
|
| 553 |
+
return parser
|
seva/eval.py
ADDED
|
@@ -0,0 +1,1988 @@
|
| 1 |
+
import collections
|
| 2 |
+
import json
|
| 3 |
+
import math
|
| 4 |
+
import os
|
| 5 |
+
import re
|
| 6 |
+
import threading
|
| 7 |
+
from typing import List, Literal, Optional, Tuple, Union
|
| 8 |
+
|
| 9 |
+
import gradio as gr
|
| 10 |
+
from colorama import Fore, Style, init
|
| 11 |
+
|
| 12 |
+
init(autoreset=True)
|
| 13 |
+
|
| 14 |
+
import imageio.v3 as iio
|
| 15 |
+
import numpy as np
|
| 16 |
+
import torch
|
| 17 |
+
import torch.nn.functional as F
|
| 18 |
+
import torchvision.transforms.functional as TF
|
| 19 |
+
from einops import repeat
|
| 20 |
+
from PIL import Image
|
| 21 |
+
from tqdm.auto import tqdm
|
| 22 |
+
|
| 23 |
+
from seva.geometry import get_camera_dist, get_plucker_coordinates, to_hom_pose
|
| 24 |
+
from seva.sampling import (
|
| 25 |
+
EulerEDMSampler,
|
| 26 |
+
MultiviewCFG,
|
| 27 |
+
MultiviewTemporalCFG,
|
| 28 |
+
VanillaCFG,
|
| 29 |
+
)
|
| 30 |
+
from seva.utils import seed_everything
|
| 31 |
+
|
| 32 |
+
try:
|
| 33 |
+
# Nightly builds carry 'dev' in their version string.
|
| 34 |
+
version = torch.__version__
|
| 35 |
+
IS_TORCH_NIGHTLY = "dev" in version
|
| 36 |
+
if IS_TORCH_NIGHTLY:
|
| 37 |
+
torch._dynamo.config.cache_size_limit = 128 # type: ignore[assignment]
|
| 38 |
+
torch._dynamo.config.accumulated_cache_size_limit = 1024 # type: ignore[assignment]
|
| 39 |
+
torch._dynamo.config.force_parameter_static_shapes = False # type: ignore[assignment]
|
| 40 |
+
except Exception:
|
| 41 |
+
IS_TORCH_NIGHTLY = False
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def pad_indices(
|
| 45 |
+
input_indices: List[int],
|
| 46 |
+
test_indices: List[int],
|
| 47 |
+
T: int,
|
| 48 |
+
padding_mode: Literal["first", "last", "none"] = "last",
|
| 49 |
+
):
|
| 50 |
+
assert padding_mode in ["last", "none"], "`first` padding is not supported yet."
|
| 51 |
+
if padding_mode == "last":
|
| 52 |
+
padded_indices = [
|
| 53 |
+
i for i in range(T) if i not in (input_indices + test_indices)
|
| 54 |
+
]
|
| 55 |
+
else:
|
| 56 |
+
padded_indices = []
|
| 57 |
+
input_selects = list(range(len(input_indices)))
|
| 58 |
+
test_selects = list(range(len(test_indices)))
|
| 59 |
+
if max(input_indices) > max(test_indices):
|
| 60 |
+
# last elem from input
|
| 61 |
+
input_selects += [input_selects[-1]] * len(padded_indices)
|
| 62 |
+
input_indices = input_indices + padded_indices
|
| 63 |
+
sorted_inds = np.argsort(input_indices)
|
| 64 |
+
input_indices = [input_indices[ind] for ind in sorted_inds]
|
| 65 |
+
input_selects = [input_selects[ind] for ind in sorted_inds]
|
| 66 |
+
else:
|
| 67 |
+
# last elem from test
|
| 68 |
+
test_selects += [test_selects[-1]] * len(padded_indices)
|
| 69 |
+
test_indices = test_indices + padded_indices
|
| 70 |
+
sorted_inds = np.argsort(test_indices)
|
| 71 |
+
test_indices = [test_indices[ind] for ind in sorted_inds]
|
| 72 |
+
test_selects = [test_selects[ind] for ind in sorted_inds]
|
| 73 |
+
|
| 74 |
+
if padding_mode == "last":
|
| 75 |
+
input_maps = np.array([-1] * T)
|
| 76 |
+
test_maps = np.array([-1] * T)
|
| 77 |
+
else:
|
| 78 |
+
input_maps = np.array([-1] * (len(input_indices) + len(test_indices)))
|
| 79 |
+
test_maps = np.array([-1] * (len(input_indices) + len(test_indices)))
|
| 80 |
+
input_maps[input_indices] = input_selects
|
| 81 |
+
test_maps[test_indices] = test_selects
|
| 82 |
+
return input_indices, test_indices, input_maps, test_maps
|
| 83 |
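# Illustrative example (not in the original): pad_indices([0], [1, 2, 3], T=5) pads the unused
# frame index 4 onto the test side, giving input_maps == [0, -1, -1, -1, -1] and
# test_maps == [-1, 0, 1, 2, 2], i.e. the last test frame is reused for the padded slot.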
+
|
| 84 |
+
|
| 85 |
+
def assemble(
|
| 86 |
+
input,
|
| 87 |
+
test,
|
| 88 |
+
input_maps,
|
| 89 |
+
test_maps,
|
| 90 |
+
):
|
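    # Note: scatter the encoded input frames and the generated test frames back into a single
    # length-T sequence according to the index maps produced by pad_indices.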
| 91 |
+
T = len(input_maps)
|
| 92 |
+
assembled = torch.zeros_like(test[-1:]).repeat_interleave(T, dim=0)
|
| 93 |
+
assembled[input_maps != -1] = input[input_maps[input_maps != -1]]
|
| 94 |
+
assembled[test_maps != -1] = test[test_maps[test_maps != -1]]
|
| 95 |
+
assert np.logical_xor(input_maps != -1, test_maps != -1).all()
|
| 96 |
+
return assembled
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def get_resizing_factor(
|
| 100 |
+
target_shape: Tuple[int, int], # H, W
|
| 101 |
+
current_shape: Tuple[int, int], # H, W
|
| 102 |
+
cover_target: bool = True,
|
| 103 |
+
# If True, the output shape will fully cover the target shape.
|
| 104 |
+
# If False, the target shape will fully cover the output shape.
|
| 105 |
+
) -> float:
|
| 106 |
+
r_bound = target_shape[1] / target_shape[0]
|
| 107 |
+
aspect_r = current_shape[1] / current_shape[0]
|
| 108 |
+
if r_bound >= 1.0:
|
| 109 |
+
if cover_target:
|
| 110 |
+
if aspect_r >= r_bound:
|
| 111 |
+
factor = min(target_shape) / min(current_shape)
|
| 112 |
+
elif aspect_r < 1.0:
|
| 113 |
+
factor = max(target_shape) / min(current_shape)
|
| 114 |
+
else:
|
| 115 |
+
factor = max(target_shape) / max(current_shape)
|
| 116 |
+
else:
|
| 117 |
+
if aspect_r >= r_bound:
|
| 118 |
+
factor = max(target_shape) / max(current_shape)
|
| 119 |
+
elif aspect_r < 1.0:
|
| 120 |
+
factor = min(target_shape) / max(current_shape)
|
| 121 |
+
else:
|
| 122 |
+
factor = min(target_shape) / min(current_shape)
|
| 123 |
+
else:
|
| 124 |
+
if cover_target:
|
| 125 |
+
if aspect_r <= r_bound:
|
| 126 |
+
factor = min(target_shape) / min(current_shape)
|
| 127 |
+
elif aspect_r > 1.0:
|
| 128 |
+
factor = max(target_shape) / min(current_shape)
|
| 129 |
+
else:
|
| 130 |
+
factor = max(target_shape) / max(current_shape)
|
| 131 |
+
else:
|
| 132 |
+
if aspect_r <= r_bound:
|
| 133 |
+
factor = max(target_shape) / max(current_shape)
|
| 134 |
+
elif aspect_r > 1.0:
|
| 135 |
+
factor = min(target_shape) / max(current_shape)
|
| 136 |
+
else:
|
| 137 |
+
factor = min(target_shape) / min(current_shape)
|
| 138 |
+
return factor
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
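# Illustrative example for `get_resizing_factor` (not from the original
# source): for a target of (H, W) = (576, 576) and a current shape of
# (480, 640), r_bound = 1.0 and aspect_r = 4/3, so with cover_target=True the
# factor is min(target) / min(current) = 576 / 480 = 1.2 -- the resized image
# (576 x 768) fully covers the 576 x 576 target and is cropped afterwards.
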
def get_unique_embedder_keys_from_conditioner(conditioner):
    keys = [x.input_key for x in conditioner.embedders if x.input_key is not None]
    keys = [item for sublist in keys for item in sublist]  # Flatten list
    return set(keys)


def get_wh_with_fixed_shortest_side(w, h, size):
    # if size is smaller than or equal to zero, we return the original w, h
    if size is None or size <= 0:
        return w, h
    if w < h:
        new_w = size
        new_h = int(size * h / w)
    else:
        new_h = size
        new_w = int(size * w / h)
    return new_w, new_h


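# Illustrative example for `get_wh_with_fixed_shortest_side` (not from the
# original source): get_wh_with_fixed_shortest_side(640, 480, 576) returns
# (768, 576), keeping the 4:3 aspect ratio while fixing the shorter side (the
# height here) to 576.
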
def load_img_and_K(
    image_path_or_size: Union[str, torch.Size],
    size: Optional[Union[int, Tuple[int, int]]],
    scale: float = 1.0,
    center: Tuple[float, float] = (0.5, 0.5),
    K: torch.Tensor | None = None,
    size_stride: int = 1,
    center_crop: bool = False,
    image_as_tensor: bool = True,
    context_rgb: np.ndarray | None = None,
    device: str = "cuda",
):
    if isinstance(image_path_or_size, torch.Size):
        image = Image.new("RGBA", image_path_or_size[::-1])
    else:
        image = Image.open(image_path_or_size).convert("RGBA")

    w, h = image.size
    if size is None:
        size = (w, h)

    image = np.array(image).astype(np.float32) / 255
    if image.shape[-1] == 4:
        rgb, alpha = image[:, :, :3], image[:, :, 3:]
        if context_rgb is not None:
            image = rgb * alpha + context_rgb * (1 - alpha)
        else:
            image = rgb * alpha + (1 - alpha)
    image = image.transpose(2, 0, 1)
    image = torch.from_numpy(image).to(dtype=torch.float32)
    image = image.unsqueeze(0)

    if isinstance(size, (tuple, list)):
        # => if size is a tuple or list, we first rescale to fully cover the `size`
        # area and then crop the `size` area from the rescaled image
        W, H = size
    else:
        # => if size is int, we rescale the image to fit the shortest side to size
        # => if size is None, no rescaling is applied
        W, H = get_wh_with_fixed_shortest_side(w, h, size)
    W, H = (
        math.floor(W / size_stride + 0.5) * size_stride,
        math.floor(H / size_stride + 0.5) * size_stride,
    )

    rfs = get_resizing_factor((math.floor(H * scale), math.floor(W * scale)), (h, w))
    resize_size = rh, rw = [int(np.ceil(rfs * s)) for s in (h, w)]
    image = torch.nn.functional.interpolate(
        image, resize_size, mode="area", antialias=False
    )
    if scale < 1.0:
        pw = math.ceil((W - resize_size[1]) * 0.5)
        ph = math.ceil((H - resize_size[0]) * 0.5)
        image = F.pad(image, (pw, pw, ph, ph), "constant", 1.0)

    cy_center = int(center[1] * image.shape[-2])
    cx_center = int(center[0] * image.shape[-1])
    if center_crop:
        side = min(H, W)
        ct = max(0, cy_center - side // 2)
        cl = max(0, cx_center - side // 2)
        ct = min(ct, image.shape[-2] - side)
        cl = min(cl, image.shape[-1] - side)
        image = TF.crop(image, top=ct, left=cl, height=side, width=side)
    else:
        ct = max(0, cy_center - H // 2)
        cl = max(0, cx_center - W // 2)
        ct = min(ct, image.shape[-2] - H)
        cl = min(cl, image.shape[-1] - W)
        image = TF.crop(image, top=ct, left=cl, height=H, width=W)

    if K is not None:
        K = K.clone()
        if torch.all(K[:2, -1] >= 0) and torch.all(K[:2, -1] <= 1):
            K[:2] *= K.new_tensor([rw, rh])[:, None]  # normalized K
        else:
            K[:2] *= K.new_tensor([rw / w, rh / h])[:, None]  # unnormalized K
        K[:2, 2] -= K.new_tensor([cl, ct])

    if image_as_tensor:
        # tensor of shape (1, 3, H, W) with values ranging from (-1, 1)
        image = image.to(device) * 2.0 - 1.0
    else:
        # PIL Image with values ranging from (0, 255)
        image = image.permute(0, 2, 3, 1).numpy()[0]
        image = Image.fromarray((image * 255).astype(np.uint8))
    return image, K


def transform_img_and_K(
    image: torch.Tensor,
    size: Union[int, Tuple[int, int]],
    scale: float = 1.0,
    center: Tuple[float, float] = (0.5, 0.5),
    K: torch.Tensor | None = None,
    size_stride: int = 1,
    mode: str = "crop",
):
    assert mode in [
        "crop",
        "pad",
        "stretch",
    ], f"mode should be one of ['crop', 'pad', 'stretch'], got {mode}"

    h, w = image.shape[-2:]
    if isinstance(size, (tuple, list)):
        # => if size is a tuple or list, we first rescale to fully cover the `size`
        # area and then crop the `size` area from the rescaled image
        W, H = size
    else:
        # => if size is int, we rescale the image to fit the shortest side to size
        # => if size is None, no rescaling is applied
        W, H = get_wh_with_fixed_shortest_side(w, h, size)
    W, H = (
        math.floor(W / size_stride + 0.5) * size_stride,
        math.floor(H / size_stride + 0.5) * size_stride,
    )

    if mode == "stretch":
        rh, rw = H, W
    else:
        rfs = get_resizing_factor(
            (H, W),
            (h, w),
            cover_target=mode != "pad",
        )
        (rh, rw) = [int(np.ceil(rfs * s)) for s in (h, w)]

    rh, rw = int(rh / scale), int(rw / scale)
    image = torch.nn.functional.interpolate(
        image, (rh, rw), mode="area", antialias=False
    )

    cy_center = int(center[1] * image.shape[-2])
    cx_center = int(center[0] * image.shape[-1])
    if mode != "pad":
        ct = max(0, cy_center - H // 2)
        cl = max(0, cx_center - W // 2)
        ct = min(ct, image.shape[-2] - H)
        cl = min(cl, image.shape[-1] - W)
        image = TF.crop(image, top=ct, left=cl, height=H, width=W)
        pl, pt = 0, 0
    else:
        pt = max(0, H // 2 - cy_center)
        pl = max(0, W // 2 - cx_center)
        pb = max(0, H - pt - image.shape[-2])
        pr = max(0, W - pl - image.shape[-1])
        image = TF.pad(
            image,
            [pl, pt, pr, pb],
        )
        cl, ct = 0, 0

    if K is not None:
        K = K.clone()
        # K[:, :2, 2] += K.new_tensor([pl, pt])
        if torch.all(K[:, :2, -1] >= 0) and torch.all(K[:, :2, -1] <= 1):
            K[:, :2] *= K.new_tensor([rw, rh])[None, :, None]  # normalized K
        else:
            K[:, :2] *= K.new_tensor([rw / w, rh / h])[None, :, None]  # unnormalized K
        K[:, :2, 2] += K.new_tensor([pl - cl, pt - ct])

    return image, K


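# Note on the three modes of `transform_img_and_K` (a sketch, not from the
# original source): "crop" resizes so the image covers (H, W) and crops the
# excess around `center`, "pad" resizes so the image fits inside (H, W) and
# pads the border, and "stretch" resizes each axis independently to exactly
# (H, W); in every case the intrinsics K are rescaled and re-centered so they
# stay consistent with the transformed pixels.
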
lowvram_mode = False


def set_lowvram_mode(mode):
    global lowvram_mode
    lowvram_mode = mode


def load_model(model, device: str = "cuda"):
    model.to(device)


def unload_model(model):
    global lowvram_mode
    if lowvram_mode:
        model.cpu()
        torch.cuda.empty_cache()


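# Usage sketch (assumption, not from the original source): calling
# set_lowvram_mode(True) before sampling makes every unload_model() call move
# the module back to CPU and empty the CUDA cache, trading speed for peak
# VRAM; with the default False, unload_model() is a no-op and modules stay on
# the GPU between stages.
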
def infer_prior_stats(
    T,
    num_input_frames,
    num_total_frames,
    version_dict,
):
    options = version_dict["options"]
    chunk_strategy = options.get("chunk_strategy", "nearest")
    T_first_pass = T[0] if isinstance(T, (list, tuple)) else T
    T_second_pass = T[1] if isinstance(T, (list, tuple)) else T
    # get traj_prior_c2ws for 2-pass sampling
    if chunk_strategy.startswith("interp"):
        # Start and end have already taken up two slots
        # +1 means we need X + 1 prior frames to bound X times forwards for all test frames

        # Tuning up `num_prior_frames_ratio` is helpful when you observe a sudden jump in the
        # generated frames due to insufficient prior frames. This option is effective for
        # complicated trajectories and when the `interp` strategy is used (usually the
        # semi-dense-view regime). Recommended range is [1.0 (default), 1.5].
        if num_input_frames >= options.get("num_input_semi_dense", 9):
            num_prior_frames = (
                math.ceil(
                    num_total_frames
                    / (T_second_pass - 2)
                    * options.get("num_prior_frames_ratio", 1.0)
                )
                + 1
            )

            if num_prior_frames + num_input_frames < T_first_pass:
                num_prior_frames = T_first_pass - num_input_frames

            num_prior_frames = max(
                num_prior_frames,
                options.get("num_prior_frames", 0),
            )

            T_first_pass = num_prior_frames + num_input_frames

            if "gt" in chunk_strategy:
                T_second_pass = T_second_pass + num_input_frames

            # Dynamically update context window length.
            version_dict["T"] = [T_first_pass, T_second_pass]

        else:
            num_prior_frames = (
                math.ceil(
                    num_total_frames
                    / (
                        T_second_pass
                        - 2
                        - (num_input_frames if "gt" in chunk_strategy else 0)
                    )
                    * options.get("num_prior_frames_ratio", 1.0)
                )
                + 1
            )

            if num_prior_frames + num_input_frames < T_first_pass:
                num_prior_frames = T_first_pass - num_input_frames

            num_prior_frames = max(
                num_prior_frames,
                options.get("num_prior_frames", 0),
            )
    else:
        num_prior_frames = max(
            T_first_pass - num_input_frames,
            options.get("num_prior_frames", 0),
        )

        if num_input_frames >= options.get("num_input_semi_dense", 9):
            T_first_pass = num_prior_frames + num_input_frames

            # Dynamically update context window length.
            version_dict["T"] = [T_first_pass, T_second_pass]

    return num_prior_frames


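# Worked example for `infer_prior_stats` (hypothetical numbers, not from the
# original source): with T=[21, 21], num_input_frames=2, num_total_frames=80
# and chunk_strategy="interp-gt" (sparse regime, 2 < num_input_semi_dense),
# the first estimate is ceil(80 / (21 - 2 - 2)) + 1 = 6 prior frames; since
# 6 + 2 < 21 it is raised to T_first_pass - num_input_frames = 19 so the
# first pass fills its context window.
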
def infer_prior_inds(
    c2ws,
    num_prior_frames,
    input_frame_indices,
    options,
):
    chunk_strategy = options.get("chunk_strategy", "nearest")
    if chunk_strategy.startswith("interp"):
        prior_frame_indices = np.array(
            [i for i in range(c2ws.shape[0]) if i not in input_frame_indices]
        )
        prior_frame_indices = prior_frame_indices[
            np.ceil(
                np.linspace(
                    0, prior_frame_indices.shape[0] - 1, num_prior_frames, endpoint=True
                )
            ).astype(int)
        ]  # having a ceil here is actually safer for corner case
    else:
        prior_frame_indices = []
        while len(prior_frame_indices) < num_prior_frames:
            closest_distance = np.abs(
                np.arange(c2ws.shape[0])[None]
                - np.concatenate(
                    [np.array(input_frame_indices), np.array(prior_frame_indices)]
                )[:, None]
            ).min(0)
            prior_frame_indices.append(np.argsort(closest_distance)[-1])
    return np.sort(prior_frame_indices)


def compute_relative_inds(
    source_inds,
    target_inds,
):
    assert len(source_inds) > 2
    # compute relative indices of target_inds within source_inds
    relative_inds = []
    for ind in target_inds:
        if ind in source_inds:
            relative_ind = int(np.where(source_inds == ind)[0][0])
        elif ind < source_inds[0]:
            # extrapolate
            relative_ind = -((source_inds[0] - ind) / (source_inds[1] - source_inds[0]))
        elif ind > source_inds[-1]:
            # extrapolate
            relative_ind = len(source_inds) + (
                (ind - source_inds[-1]) / (source_inds[-1] - source_inds[-2])
            )
        else:
            # interpolate
            lower_inds = source_inds[source_inds < ind]
            upper_inds = source_inds[source_inds > ind]
            if len(lower_inds) > 0 and len(upper_inds) > 0:
                lower_ind = lower_inds[-1]
                upper_ind = upper_inds[0]
                relative_lower_ind = int(np.where(source_inds == lower_ind)[0][0])
                relative_upper_ind = int(np.where(source_inds == upper_ind)[0][0])
                relative_ind = relative_lower_ind + (ind - lower_ind) / (
                    upper_ind - lower_ind
                ) * (relative_upper_ind - relative_lower_ind)
            else:
                # Out of range
                relative_inds.append(float("nan"))  # Or some other placeholder
                continue  # skip the generic append below: `relative_ind` is undefined here
        relative_inds.append(relative_ind)
    return relative_inds


def find_nearest_source_inds(
    source_c2ws,
    target_c2ws,
    nearest_num=1,
    mode="translation",
):
    dists = get_camera_dist(source_c2ws, target_c2ws, mode=mode).cpu().numpy()
    sorted_inds = np.argsort(dists, axis=0).T
    return sorted_inds[:, :nearest_num]


def chunk_input_and_test(
    T,
    input_c2ws,
    test_c2ws,
    input_ords,  # orders
    test_ords,  # orders
    options,
    task: str = "img2img",
    chunk_strategy: str = "gt",
    gt_input_inds: list = [],
):
    M, N = input_c2ws.shape[0], test_c2ws.shape[0]

    chunks = []
    if chunk_strategy.startswith("gt"):
        assert len(gt_input_inds) < T, (
            f"Number of gt input frames {len(gt_input_inds)} should be "
            f"less than {T} when `gt` chunking strategy is used."
        )
        assert (
            list(range(M)) == gt_input_inds
        ), "All input_c2ws should be gt when `gt` chunking strategy is used."

        # LEGACY CHUNKING STRATEGY
        # num_test_per_chunk = T - len(gt_input_inds)
        # test_inds_per_chunk = [i for i in range(T) if i not in gt_input_inds]
        # for i in range(0, test_c2ws.shape[0], num_test_per_chunk):
        #     chunk = ["NULL"] * T
        #     for j, k in enumerate(gt_input_inds):
        #         chunk[k] = f"!{j:03d}"
        #     for j, k in enumerate(
        #         test_inds_per_chunk[: test_c2ws[i : i + num_test_per_chunk].shape[0]]
        #     ):
        #         chunk[k] = f">{i + j:03d}"
        #     chunks.append(chunk)

        num_test_seen = 0
        while num_test_seen < N:
            chunk = [f"!{i:03d}" for i in gt_input_inds]
            if chunk_strategy != "gt" and num_test_seen > 0:
                pseudo_num_ratio = options.get("pseudo_num_ratio", 0.33)
                if (N - num_test_seen) >= math.floor(
                    (T - len(gt_input_inds)) * pseudo_num_ratio
                ):
                    pseudo_num = math.ceil((T - len(gt_input_inds)) * pseudo_num_ratio)
                else:
                    pseudo_num = (T - len(gt_input_inds)) - (N - num_test_seen)
                pseudo_num = min(pseudo_num, options.get("pseudo_num_max", 10000))

                if "ltr" in chunk_strategy:
                    chunk.extend(
                        [
                            f"!{i + len(gt_input_inds):03d}"
                            for i in range(num_test_seen - pseudo_num, num_test_seen)
                        ]
                    )
                elif "nearest" in chunk_strategy:
                    source_inds = np.concatenate(
                        [
                            find_nearest_source_inds(
                                test_c2ws[:num_test_seen],
                                test_c2ws[num_test_seen:],
                                nearest_num=1,  # pseudo_num,
                                mode="rotation",
                            ),
                            find_nearest_source_inds(
                                test_c2ws[:num_test_seen],
                                test_c2ws[num_test_seen:],
                                nearest_num=1,  # pseudo_num,
                                mode="translation",
                            ),
                        ],
                        axis=1,
                    )
                    ####### [HACK ALERT] keep running until pseudo num is stabilized ########
                    temp_pseudo_num = pseudo_num
                    while True:
                        nearest_source_inds = np.concatenate(
                            [
                                np.sort(
                                    [
                                        ind
                                        for (ind, _) in collections.Counter(
                                            [
                                                item
                                                for item in source_inds[
                                                    : T
                                                    - len(gt_input_inds)
                                                    - temp_pseudo_num
                                                ]
                                                .flatten()
                                                .tolist()
                                                if item
                                                != (
                                                    num_test_seen - 1
                                                )  # exclude the last one here
                                            ]
                                        ).most_common(pseudo_num - 1)
                                    ],
                                ).astype(int),
                                [num_test_seen - 1],  # always keep the last one
                            ]
                        )
                        if len(nearest_source_inds) >= temp_pseudo_num:
                            break  # stabilized
                        else:
                            temp_pseudo_num = len(nearest_source_inds)
                    pseudo_num = len(nearest_source_inds)
                    ########################################################################
                    chunk.extend(
                        [f"!{i + len(gt_input_inds):03d}" for i in nearest_source_inds]
                    )
                else:
                    raise NotImplementedError(
                        f"Chunking strategy {chunk_strategy} for the first pass is not implemented."
                    )

                chunk.extend(
                    [
                        f">{i:03d}"
                        for i in range(
                            num_test_seen,
                            min(num_test_seen + T - len(gt_input_inds) - pseudo_num, N),
                        )
                    ]
                )
            else:
                chunk.extend(
                    [
                        f">{i:03d}"
                        for i in range(
                            num_test_seen,
                            min(num_test_seen + T - len(gt_input_inds), N),
                        )
                    ]
                )

            num_test_seen += sum([1 for c in chunk if c.startswith(">")])
            if len(chunk) < T:
                chunk.extend(["NULL"] * (T - len(chunk)))
            chunks.append(chunk)

    elif chunk_strategy.startswith("nearest"):
        input_imgs = np.array([f"!{i:03d}" for i in range(M)])
        test_imgs = np.array([f">{i:03d}" for i in range(N)])

        match = re.match(r"^nearest-(\d+)$", chunk_strategy)
        if match:
            nearest_num = int(match.group(1))
            assert (
                nearest_num < T
            ), f"Nearest number of {nearest_num} should be less than {T}."
            source_inds = find_nearest_source_inds(
                input_c2ws,
                test_c2ws,
                nearest_num=nearest_num,
                mode="translation",  # during the second pass, considering translation only is enough
            )

            for i in range(0, N, T - nearest_num):
                nearest_source_inds = np.sort(
                    [
                        ind
                        for (ind, _) in collections.Counter(
                            source_inds[i : i + T - nearest_num].flatten().tolist()
                        ).most_common(nearest_num)
                    ]
                )
                chunk = (
                    input_imgs[nearest_source_inds].tolist()
                    + test_imgs[i : i + T - nearest_num].tolist()
                )
                chunks.append(chunk + ["NULL"] * (T - len(chunk)))

        else:
            # do not always condition on gt cond frames
            if "gt" not in chunk_strategy:
                gt_input_inds = []

            source_inds = find_nearest_source_inds(
                input_c2ws,
                test_c2ws,
                nearest_num=1,
                mode="translation",  # during the second pass, considering translation only is enough
            )[:, 0]

            test_inds_per_input = {}
            for test_idx, input_idx in enumerate(source_inds):
                if input_idx not in test_inds_per_input:
                    test_inds_per_input[input_idx] = []
                test_inds_per_input[input_idx].append(test_idx)

            num_test_seen = 0
            chunk = input_imgs[gt_input_inds].tolist()
            candidate_input_inds = sorted(list(test_inds_per_input.keys()))

            while num_test_seen < N:
                input_idx = candidate_input_inds[0]
                test_inds = test_inds_per_input[input_idx]
                input_is_cond = input_idx in gt_input_inds
                prefix_inds = [] if input_is_cond else [input_idx]

                if len(chunk) == T - len(prefix_inds) or not candidate_input_inds:
                    if chunk:
                        chunk += ["NULL"] * (T - len(chunk))
                        chunks.append(chunk)
                        chunk = input_imgs[gt_input_inds].tolist()
                    if num_test_seen >= N:
                        break
                    continue

                candidate_chunk = (
                    input_imgs[prefix_inds].tolist() + test_imgs[test_inds].tolist()
                )

                space_left = T - len(chunk)
                if len(candidate_chunk) <= space_left:
                    chunk.extend(candidate_chunk)
                    num_test_seen += len(test_inds)
                    candidate_input_inds.pop(0)
                else:
                    chunk.extend(candidate_chunk[:space_left])
                    num_input_idx = 0 if input_is_cond else 1
                    num_test_seen += space_left - num_input_idx
                    test_inds_per_input[input_idx] = test_inds[
                        space_left - num_input_idx :
                    ]

                if len(chunk) == T:
                    chunks.append(chunk)
                    chunk = input_imgs[gt_input_inds].tolist()

            if chunk and chunk != input_imgs[gt_input_inds].tolist():
                chunks.append(chunk + ["NULL"] * (T - len(chunk)))

    elif chunk_strategy.startswith("interp"):
        # `interp` chunk requires ordering info
        assert input_ords is not None and test_ords is not None, (
            "When using `interp` chunking strategy, ordering of input "
            "and test frames should be provided."
        )

        # if chunk_strategy is `interp*` and task is `img2trajvid*`, we will not
        # use input views since their order info within target views is unknown
        if "img2trajvid" in task:
            assert (
                list(range(len(gt_input_inds))) == gt_input_inds
            ), "`img2trajvid` task should put `gt_input_inds` at the start."
            input_c2ws = input_c2ws[
                [ind for ind in range(M) if ind not in gt_input_inds]
            ]
            input_ords = [
                input_ords[ind] for ind in range(M) if ind not in gt_input_inds
            ]
            M = input_c2ws.shape[0]

        input_ords = [0] + input_ords  # this is a hack accounting for test views
        # before the first input view
        input_ords[-1] += 0.01  # this is a hack ensuring the last test stop is included
        # in the last forward when input_ords[-1] == test_ords[-1]
        input_ords = np.array(input_ords)[:, None]
        input_ords_ = np.concatenate([input_ords[1:], np.full((1, 1), np.inf)])
        test_ords = np.array(test_ords)[None]

        in_stop_ranges = np.logical_and(
            np.repeat(input_ords, N, axis=1) <= np.repeat(test_ords, M + 1, axis=0),
            np.repeat(input_ords_, N, axis=1) > np.repeat(test_ords, M + 1, axis=0),
        )  # (M, N)
        assert (in_stop_ranges.sum(1) <= T - 2).all(), (
            "More input frames need to be sampled during the first pass to ensure "
            f"#test frames during each forward in the second pass will not exceed {T - 2}."
        )
        if input_ords[1, 0] <= test_ords[0, 0]:
            assert not in_stop_ranges[0].any()
        if input_ords[-1, 0] >= test_ords[0, -1]:
            assert not in_stop_ranges[-1].any()

        gt_chunk = (
            [f"!{i:03d}" for i in gt_input_inds] if "gt" in chunk_strategy else []
        )
        chunk = gt_chunk + []
        # any test views before the first input views
        if in_stop_ranges[0].any():
            for j, in_range in enumerate(in_stop_ranges[0]):
                if in_range:
                    chunk.append(f">{j:03d}")
        in_stop_ranges = in_stop_ranges[1:]

        i = 0
        base_i = len(gt_input_inds) if "img2trajvid" in task else 0
        chunk.append(f"!{i + base_i:03d}")
        while i < len(in_stop_ranges):
            in_stop_range = in_stop_ranges[i]
            if not in_stop_range.any():
                i += 1
                continue

            input_left = i + 1 < M
            space_left = T - len(chunk)
            if sum(in_stop_range) + input_left <= space_left:
                for j, in_range in enumerate(in_stop_range):
                    if in_range:
                        chunk.append(f">{j:03d}")
                i += 1
                if input_left:
                    chunk.append(f"!{i + base_i:03d}")

            else:
                chunk += ["NULL"] * space_left
                chunks.append(chunk)
                chunk = gt_chunk + [f"!{i + base_i:03d}"]

        if len(chunk) > 1:
            chunk += ["NULL"] * (T - len(chunk))
            chunks.append(chunk)

    else:
        raise NotImplementedError

    (
        input_inds_per_chunk,
        input_sels_per_chunk,
        test_inds_per_chunk,
        test_sels_per_chunk,
    ) = (
        [],
        [],
        [],
        [],
    )
    for chunk in chunks:
        input_inds = [
            int(img.removeprefix("!")) for img in chunk if img.startswith("!")
        ]
        input_sels = [chunk.index(img) for img in chunk if img.startswith("!")]
        test_inds = [int(img.removeprefix(">")) for img in chunk if img.startswith(">")]
        test_sels = [chunk.index(img) for img in chunk if img.startswith(">")]
        input_inds_per_chunk.append(input_inds)
        input_sels_per_chunk.append(input_sels)
        test_inds_per_chunk.append(test_inds)
        test_sels_per_chunk.append(test_sels)

    if options.get("sampler_verbose", True):

        def colorize(item):
            if item.startswith("!"):
                return f"{Fore.RED}{item}{Style.RESET_ALL}"  # Red for items starting with '!'
            elif item.startswith(">"):
                return f"{Fore.GREEN}{item}{Style.RESET_ALL}"  # Green for items starting with '>'
            return item  # Default color if neither '!' nor '>'

        print("\nchunks:")
        for chunk in chunks:
            print(", ".join(colorize(item) for item in chunk))

    return (
        chunks,
        input_inds_per_chunk,  # ordering of input in raw sequence
        input_sels_per_chunk,  # ordering of input in one-forward sequence of length T
        test_inds_per_chunk,  # ordering of test in raw sequence
        test_sels_per_chunk,  # ordering of test in one-forward sequence of length T
    )


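# Token convention used in `chunks` (summary, not from the original source):
# "!{i:03d}" marks a conditioning (input or already-generated) frame,
# ">{i:03d}" marks a frame to be generated in that forward pass, and "NULL"
# pads a chunk to exactly T slots; e.g. a chunk like
# ["!000", "!001", ">000", ">001", "NULL"] conditions on two frames,
# generates two, and pads one slot.
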
def is_k_in_dict(d, k):
    return any(map(lambda x: x.startswith(k), d.keys()))


def get_k_from_dict(d, k):
    media_d = {}
    for key, value in d.items():
        if key == k:
            return value
        if key.startswith(k):
            media = key.split("/")[-1]
            if media == "raw":
                return value
            media_d[media] = value
    if len(media_d) == 0:
        return torch.tensor([])
    assert (
        len(media_d) == 1
    ), f"multiple media found in {d} for key {k}: {media_d.keys()}"
    return media_d[media]


def update_kv_for_dict(d, k, v):
    for key in d.keys():
        if key.startswith(k):
            d[key] = v
    return d


def extend_dict(ds, d):
    for key in d.keys():
        if key in ds:
            ds[key] = torch.cat([ds[key], d[key]], 0)
        else:
            ds[key] = d[key]
    return ds


def replace_or_include_input_for_dict(
    samples,
    test_indices,
    imgs,
    c2w,
    K,
):
    samples_new = {}
    for sample, value in samples.items():
        if "rgb" in sample:
            imgs[test_indices] = (
                value[test_indices] if value.shape[0] == imgs.shape[0] else value
            ).to(device=imgs.device, dtype=imgs.dtype)
            samples_new[sample] = imgs
        elif "c2w" in sample:
            c2w[test_indices] = (
                value[test_indices] if value.shape[0] == c2w.shape[0] else value
            ).to(device=c2w.device, dtype=c2w.dtype)
            samples_new[sample] = c2w
        elif "intrinsics" in sample:
            K[test_indices] = (
                value[test_indices] if value.shape[0] == K.shape[0] else value
            ).to(device=K.device, dtype=K.dtype)
            samples_new[sample] = K
        else:
            samples_new[sample] = value
    return samples_new


def decode_output(
    samples,
    T,
    indices=None,
):
    # decode model output into dict if it is not
    if isinstance(samples, dict):
        # model with postprocessor and outputs dict
        for sample, value in samples.items():
            if isinstance(value, torch.Tensor):
                value = value.detach().cpu()
            elif isinstance(value, np.ndarray):
                value = torch.from_numpy(value)
            else:
                value = torch.tensor(value)

            if indices is not None and value.shape[0] == T:
                value = value[indices]
            samples[sample] = value
    else:
        # model without postprocessor and outputs tensor (rgb)
        samples = samples.detach().cpu()

        if indices is not None and samples.shape[0] == T:
            samples = samples[indices]
        samples = {"samples-rgb/image": samples}

    return samples


def save_output(
    samples,
    save_path,
    video_save_fps=2,
):
    os.makedirs(save_path, exist_ok=True)
    for sample in samples:
        media_type = "video"
        if "/" in sample:
            sample_, media_type = sample.split("/")
        else:
            sample_ = sample

        value = samples[sample]
        if isinstance(value, torch.Tensor):
            value = value.detach().cpu()
        elif isinstance(value, np.ndarray):
            value = torch.from_numpy(value)
        else:
            value = torch.tensor(value)

        if media_type == "image":
            value = (value.permute(0, 2, 3, 1) + 1) / 2.0
            value = (value * 255).clamp(0, 255).to(torch.uint8)
            iio.imwrite(
                os.path.join(save_path, f"{sample_}.mp4")
                if sample_
                else f"{save_path}.mp4",
                value,
                fps=video_save_fps,
                macro_block_size=1,
                ffmpeg_log_level="error",
            )
            os.makedirs(os.path.join(save_path, sample_), exist_ok=True)
            for i, s in enumerate(value):
                iio.imwrite(
                    os.path.join(save_path, sample_, f"{i:03d}.png"),
                    s,
                )
        elif media_type == "video":
            value = (value.permute(0, 2, 3, 1) + 1) / 2.0
            value = (value * 255).clamp(0, 255).to(torch.uint8)
            iio.imwrite(
                os.path.join(save_path, f"{sample_}.mp4"),
                value,
                fps=video_save_fps,
                macro_block_size=1,
                ffmpeg_log_level="error",
            )
        elif media_type == "raw":
            torch.save(
                value,
                os.path.join(save_path, f"{sample_}.pt"),
            )
        else:
            pass


def create_transforms_simple(save_path, img_paths, img_whs, c2ws, Ks):
    import os.path as osp

    out_frames = []
    for img_path, img_wh, c2w, K in zip(img_paths, img_whs, c2ws, Ks):
        out_frame = {
            "fl_x": K[0][0].item(),
            "fl_y": K[1][1].item(),
            "cx": K[0][2].item(),
            "cy": K[1][2].item(),
            "w": img_wh[0].item(),
            "h": img_wh[1].item(),
            "file_path": f"./{osp.relpath(img_path, start=save_path)}"
            if img_path is not None
            else None,
            "transform_matrix": c2w.tolist(),
        }
        out_frames.append(out_frame)
    out = {
        # "camera_model": "PINHOLE",
        "orientation_override": "none",
        "frames": out_frames,
    }
    with open(osp.join(save_path, "transforms.json"), "w") as of:
        json.dump(out, of, indent=5)


class GradioTrackedSampler(EulerEDMSampler):
    """
    A thin wrapper around the EulerEDMSampler that allows tracking progress and
    aborting sampling for gradio demo.
    """

    def __init__(self, abort_event: threading.Event, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.abort_event = abort_event

    def __call__(  # type: ignore
        self,
        denoiser,
        x: torch.Tensor,
        scale: float | torch.Tensor,
        cond: dict,
        uc: dict | None = None,
        num_steps: int | None = None,
        verbose: bool = True,
        global_pbar: gr.Progress | None = None,
        **guider_kwargs,
    ) -> torch.Tensor | None:
        uc = cond if uc is None else uc
        x, s_in, sigmas, num_sigmas, cond, uc = self.prepare_sampling_loop(
            x,
            cond,
            uc,
            num_steps,
        )
        for i in self.get_sigma_gen(num_sigmas, verbose=verbose):
            gamma = (
                min(self.s_churn / (num_sigmas - 1), 2**0.5 - 1)
                if self.s_tmin <= sigmas[i] <= self.s_tmax
                else 0.0
            )
            x = self.sampler_step(
                s_in * sigmas[i],
                s_in * sigmas[i + 1],
                denoiser,
                x,
                scale,
                cond,
                uc,
                gamma,
                **guider_kwargs,
            )
            # Allow tracking progress in gradio demo.
            if global_pbar is not None:
                global_pbar.update()
            # Allow aborting sampling in gradio demo.
            if self.abort_event.is_set():
                return None
        return x


def create_samplers(
    guider_types: int | list[int],
    discretization,
    num_frames: list[int] | None,
    num_steps: int,
    cfg_min: float = 1.0,
    device: str | torch.device = "cuda",
    abort_event: threading.Event | None = None,
):
    guider_mapping = {
        0: VanillaCFG,
        1: MultiviewCFG,
        2: MultiviewTemporalCFG,
    }
    samplers = []
    if not isinstance(guider_types, (list, tuple)):
        guider_types = [guider_types]
    for i, guider_type in enumerate(guider_types):
        if guider_type not in guider_mapping:
            raise ValueError(
                f"Invalid guider type {guider_type}. Must be one of {list(guider_mapping.keys())}"
            )
        guider_cls = guider_mapping[guider_type]
        guider_args = ()
        if guider_type > 0:
            guider_args += (cfg_min,)
            if guider_type == 2:
                assert num_frames is not None
                guider_args = (num_frames[i], cfg_min)
        guider = guider_cls(*guider_args)

        if abort_event is not None:
            sampler = GradioTrackedSampler(
                abort_event,
                discretization=discretization,
                guider=guider,
                num_steps=num_steps,
                s_churn=0.0,
                s_tmin=0.0,
                s_tmax=999.0,
                s_noise=1.0,
                verbose=True,
                device=device,
            )
        else:
            sampler = EulerEDMSampler(
                discretization=discretization,
                guider=guider,
                num_steps=num_steps,
                s_churn=0.0,
                s_tmin=0.0,
                s_tmax=999.0,
                s_noise=1.0,
                verbose=True,
                device=device,
            )
        samplers.append(sampler)
    return samplers


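# Usage sketch for `create_samplers` (assumption, not from the original
# source): for two-pass sampling one might call
#   samplers = create_samplers([1, 2], discretization, [T_first, T_second],
#                              num_steps=50, cfg_min=1.2)
# which yields a MultiviewCFG-guided sampler for the first pass and a
# MultiviewTemporalCFG-guided sampler (aware of T_second frames) for the
# second; passing abort_event wraps both in GradioTrackedSampler instead.
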
def get_value_dict(
    curr_imgs,
    curr_imgs_clip,
    curr_input_frame_indices,
    curr_c2ws,
    curr_Ks,
    curr_input_camera_indices,
    all_c2ws,
    camera_scale=2.0,
):
    assert sorted(curr_input_camera_indices) == sorted(
        range(len(curr_input_camera_indices))
    )
    H, W, T, F = curr_imgs.shape[-2], curr_imgs.shape[-1], len(curr_imgs), 8

    value_dict = {}
    value_dict["cond_frames_without_noise"] = curr_imgs_clip[curr_input_frame_indices]
    value_dict["cond_frames"] = curr_imgs + 0.0 * torch.randn_like(curr_imgs)
    value_dict["cond_frames_mask"] = torch.zeros(T, dtype=torch.bool)
    value_dict["cond_frames_mask"][curr_input_frame_indices] = True
    value_dict["cond_aug"] = 0.0

    c2w = to_hom_pose(curr_c2ws.float())
    w2c = torch.linalg.inv(c2w)

    # camera centering
    ref_c2ws = all_c2ws
    camera_dist_2med = torch.norm(
        ref_c2ws[:, :3, 3] - ref_c2ws[:, :3, 3].median(0, keepdim=True).values,
        dim=-1,
    )
    valid_mask = camera_dist_2med <= torch.clamp(
        torch.quantile(camera_dist_2med, 0.97) * 10,
        max=1e6,
    )
    c2w[:, :3, 3] -= ref_c2ws[valid_mask, :3, 3].mean(0, keepdim=True)
    w2c = torch.linalg.inv(c2w)

    # camera normalization
    camera_dists = c2w[:, :3, 3].clone()
    translation_scaling_factor = (
        camera_scale
        if torch.isclose(
            torch.norm(camera_dists[0]),
            torch.zeros(1),
            atol=1e-5,
        ).any()
        else (camera_scale / torch.norm(camera_dists[0]))
    )
    w2c[:, :3, 3] *= translation_scaling_factor
    c2w[:, :3, 3] *= translation_scaling_factor
    value_dict["plucker_coordinate"], _ = get_plucker_coordinates(
        extrinsics_src=w2c[0],
        extrinsics=w2c,
        intrinsics=curr_Ks.float().clone(),
        mode="plucker",
        rel_zero_translation=True,
        target_size=(H // F, W // F),
        return_grid_cam=True,
    )

    value_dict["c2w"] = c2w
    value_dict["K"] = curr_Ks
    value_dict["camera_mask"] = torch.zeros(T, dtype=torch.bool)
    value_dict["camera_mask"][curr_input_camera_indices] = True

    return value_dict


def do_sample(
    model,
    ae,
    conditioner,
    denoiser,
    sampler,
    value_dict,
    H,
    W,
    C,
    F,
    T,
    cfg,
    encoding_t=1,
    decoding_t=1,
    verbose=True,
    global_pbar=None,
    **_,
):
    imgs = value_dict["cond_frames"].to("cuda")
    input_masks = value_dict["cond_frames_mask"].to("cuda")
    pluckers = value_dict["plucker_coordinate"].to("cuda")

    num_samples = [1, T]
    with torch.inference_mode(), torch.autocast("cuda"):
        load_model(ae)
        load_model(conditioner)
        latents = torch.nn.functional.pad(
            ae.encode(imgs[input_masks], encoding_t), (0, 0, 0, 0, 0, 1), value=1.0
        )
        c_crossattn = repeat(conditioner(imgs[input_masks]).mean(0), "d -> n 1 d", n=T)
        uc_crossattn = torch.zeros_like(c_crossattn)
        c_replace = latents.new_zeros(T, *latents.shape[1:])
        c_replace[input_masks] = latents
        uc_replace = torch.zeros_like(c_replace)
        c_concat = torch.cat(
            [
                repeat(
                    input_masks,
                    "n -> n 1 h w",
                    h=pluckers.shape[2],
                    w=pluckers.shape[3],
                ),
                pluckers,
            ],
            1,
        )
        uc_concat = torch.cat(
            [pluckers.new_zeros(T, 1, *pluckers.shape[-2:]), pluckers], 1
        )
        c_dense_vector = pluckers
        uc_dense_vector = c_dense_vector
        # TODO(hangg): concat and dense are problematic.
        c = {
            "crossattn": c_crossattn,
            "replace": c_replace,
            "concat": c_concat,
            "dense_vector": c_dense_vector,
        }
        uc = {
            "crossattn": uc_crossattn,
            "replace": uc_replace,
            "concat": uc_concat,
            "dense_vector": uc_dense_vector,
        }
        unload_model(ae)
        unload_model(conditioner)

        additional_model_inputs = {"num_frames": T}
        additional_sampler_inputs = {
            "c2w": value_dict["c2w"].to("cuda"),
            "K": value_dict["K"].to("cuda"),
            "input_frame_mask": value_dict["cond_frames_mask"].to("cuda"),
        }
        if global_pbar is not None:
            additional_sampler_inputs["global_pbar"] = global_pbar

        shape = (math.prod(num_samples), C, H // F, W // F)
        randn = torch.randn(shape).to("cuda")

        load_model(model)
        samples_z = sampler(
            lambda input, sigma, c: denoiser(
                model,
                input,
                sigma,
                c,
                **additional_model_inputs,
            ),
            randn,
            scale=cfg,
            cond=c,
            uc=uc,
            verbose=verbose,
            **additional_sampler_inputs,
        )
        if samples_z is None:
            return
        unload_model(model)

        load_model(ae)
        samples = ae.decode(samples_z, decoding_t)
        unload_model(ae)

    return samples


def run_one_scene(
    task,
    version_dict,
    model,
    ae,
    conditioner,
    denoiser,
    image_cond,
    camera_cond,
    save_path,
    use_traj_prior,
    traj_prior_Ks,
    traj_prior_c2ws,
    seed=23,
    gradio=False,
    abort_event=None,
    first_pass_pbar=None,
    second_pass_pbar=None,
):
    H, W, T, C, F, options = (
        version_dict["H"],
        version_dict["W"],
        version_dict["T"],
        version_dict["C"],
        version_dict["f"],
        version_dict["options"],
    )

    if isinstance(image_cond, str):
        image_cond = {"img": [image_cond]}
    imgs_clip, imgs, img_size = [], [], None
    for i, (img, K) in enumerate(zip(image_cond["img"], camera_cond["K"])):
        if isinstance(img, str) or img is None:
            img, K = load_img_and_K(img or img_size, None, K=K, device="cpu")  # type: ignore
            img_size = img.shape[-2:]
            if options.get("L_short", -1) == -1:
                img, K = transform_img_and_K(
                    img,
                    (W, H),
                    K=K[None],
                    mode=(
                        options.get("transform_input", "crop")
                        if i in image_cond["input_indices"]
                        else options.get("transform_target", "crop")
                    ),
                    scale=(
                        1.0
                        if i in image_cond["input_indices"]
                        else options.get("transform_scale", 1.0)
                    ),
                )
            else:
                downsample = 3
                assert options["L_short"] % (F * 2**downsample) == 0, (
                    "Short side of the image should be divisible by "
                    f"F*2**{downsample}={F * 2**downsample}."
                )
                img, K = transform_img_and_K(
                    img,
                    options["L_short"],
                    K=K[None],
                    size_stride=F * 2**downsample,
                    mode=(
                        options.get("transform_input", "crop")
                        if i in image_cond["input_indices"]
                        else options.get("transform_target", "crop")
                    ),
                    scale=(
                        1.0
                        if i in image_cond["input_indices"]
                        else options.get("transform_scale", 1.0)
                    ),
                )
                version_dict["W"] = W = img.shape[-1]
                version_dict["H"] = H = img.shape[-2]
            K = K[0]
            K[0] /= W
            K[1] /= H
            camera_cond["K"][i] = K
            img_clip = img
        elif isinstance(img, np.ndarray):
            img_size = torch.Size(img.shape[:2])
            img = torch.as_tensor(img).permute(2, 0, 1)
            img = img.unsqueeze(0)
            img = img / 255.0 * 2.0 - 1.0
            if not gradio:
                img, K = transform_img_and_K(img, (W, H), K=K[None])
                assert K is not None
                K = K[0]
            K[0] /= W
            K[1] /= H
            camera_cond["K"][i] = K
            img_clip = img
        else:
            assert (
                False
            ), f"Variable `img` got {type(img)} type which is not supported!!!"
        imgs_clip.append(img_clip)
        imgs.append(img)
    imgs_clip = torch.cat(imgs_clip, dim=0)
    imgs = torch.cat(imgs, dim=0)

    if traj_prior_Ks is not None:
        assert img_size is not None
        for i, prior_k in enumerate(traj_prior_Ks):
            img, prior_k = load_img_and_K(img_size, None, K=prior_k, device="cpu")  # type: ignore
            img, prior_k = transform_img_and_K(
                img,
                (W, H),
                K=prior_k[None],
                mode=options.get(
                    "transform_target", "crop"
                ),  # mode for prior is always the same as for target
                scale=options.get(
                    "transform_scale", 1.0
                ),  # scale for prior is always the same as for target
            )
            prior_k = prior_k[0]
            prior_k[0] /= W
            prior_k[1] /= H
            traj_prior_Ks[i] = prior_k

    options["num_frames"] = T
    discretization = denoiser.discretization
    torch.cuda.empty_cache()

    seed_everything(seed)

    # Get Data
    input_indices = image_cond["input_indices"]
    input_imgs = imgs[input_indices]
    input_imgs_clip = imgs_clip[input_indices]
    input_c2ws = camera_cond["c2w"][input_indices]
    input_Ks = camera_cond["K"][input_indices]

    test_indices = [i for i in range(len(imgs)) if i not in input_indices]
    test_imgs = imgs[test_indices]
    test_imgs_clip = imgs_clip[test_indices]
    test_c2ws = camera_cond["c2w"][test_indices]
    test_Ks = camera_cond["K"][test_indices]

    if options.get("save_input", True):
        save_output(
            {"/image": input_imgs},
            save_path=os.path.join(save_path, "input"),
            video_save_fps=2,
        )

    if not use_traj_prior:
        chunk_strategy = options.get("chunk_strategy", "gt")

        (
            _,
            input_inds_per_chunk,
            input_sels_per_chunk,
            test_inds_per_chunk,
            test_sels_per_chunk,
        ) = chunk_input_and_test(
            T,
            input_c2ws,
            test_c2ws,
            input_indices,
            test_indices,
            options=options,
            task=task,
            chunk_strategy=chunk_strategy,
            gt_input_inds=list(range(input_c2ws.shape[0])),
        )
        print(
            f"One pass - chunking with `{chunk_strategy}` strategy: total "
            f"{len(input_inds_per_chunk)} forward(s) ..."
        )

        all_samples = {}
        all_test_inds = []
        for i, (
            chunk_input_inds,
            chunk_input_sels,
            chunk_test_inds,
            chunk_test_sels,
        ) in tqdm(
            enumerate(
                zip(
                    input_inds_per_chunk,
                    input_sels_per_chunk,
                    test_inds_per_chunk,
                    test_sels_per_chunk,
                )
            ),
            total=len(input_inds_per_chunk),
            leave=False,
        ):
            (
                curr_input_sels,
                curr_test_sels,
                curr_input_maps,
                curr_test_maps,
            ) = pad_indices(
                chunk_input_sels,
                chunk_test_sels,
                T=T,
                padding_mode=options.get("t_padding_mode", "last"),
            )
            curr_imgs, curr_imgs_clip, curr_c2ws, curr_Ks = [
                assemble(
                    input=x[chunk_input_inds],
                    test=y[chunk_test_inds],
                    input_maps=curr_input_maps,
                    test_maps=curr_test_maps,
                )
                for x, y in zip(
                    [
                        torch.cat(
                            [
                                input_imgs,
                                get_k_from_dict(all_samples, "samples-rgb").to(
                                    input_imgs.device
                                ),
                            ],
                            dim=0,
                        ),
                        torch.cat(
                            [
                                input_imgs_clip,
                                get_k_from_dict(all_samples, "samples-rgb").to(
                                    input_imgs.device
                                ),
                            ],
                            dim=0,
                        ),
                        torch.cat([input_c2ws, test_c2ws[all_test_inds]], dim=0),
                        torch.cat([input_Ks, test_Ks[all_test_inds]], dim=0),
                    ],  # procedurally append generated prior views to the input views
                    [test_imgs, test_imgs_clip, test_c2ws, test_Ks],
                )
            ]
            value_dict = get_value_dict(
                curr_imgs.to("cuda"),
                curr_imgs_clip.to("cuda"),
                curr_input_sels
                + [
                    sel
                    for (ind, sel) in zip(
                        np.array(chunk_test_inds)[curr_test_maps[curr_test_maps != -1]],
                        curr_test_sels,
                    )
                    if test_indices[ind] in image_cond["input_indices"]
                ],
                curr_c2ws,
                curr_Ks,
                curr_input_sels
                + [
                    sel
                    for (ind, sel) in zip(
                        np.array(chunk_test_inds)[curr_test_maps[curr_test_maps != -1]],
                        curr_test_sels,
                    )
                    if test_indices[ind] in camera_cond["input_indices"]
                ],
                all_c2ws=camera_cond["c2w"],
            )
            samplers = create_samplers(
                options["guider_types"],
                discretization,
                [len(curr_imgs)],
                options["num_steps"],
                options["cfg_min"],
                abort_event=abort_event,
            )
            assert len(samplers) == 1
            samples = do_sample(
                model,
                ae,
                conditioner,
                denoiser,
                samplers[0],
                value_dict,
                H,
                W,
                C,
                F,
                T=len(curr_imgs),
                cfg=(
                    options["cfg"][0]
                    if isinstance(options["cfg"], (list, tuple))
                    else options["cfg"]
                ),
                **{k: options[k] for k in options if k not in ["cfg", "T"]},
            )
            samples = decode_output(
                samples, len(curr_imgs), chunk_test_sels
            )  # decode into dict
            if options.get("save_first_pass", False):
                save_output(
                    replace_or_include_input_for_dict(
                        samples,
                        chunk_test_sels,
                        curr_imgs,
                        curr_c2ws,
                        curr_Ks,
                    ),
                    save_path=os.path.join(save_path, "first-pass", f"forward_{i}"),
                    video_save_fps=2,
                )
            extend_dict(all_samples, samples)
            all_test_inds.extend(chunk_test_inds)
    else:
        assert traj_prior_c2ws is not None, (
            "`traj_prior_c2ws` should be set when using 2-pass sampling. One "
            "potential reason is that the number of input frames is larger than "
            "T. Set `num_prior_frames` manually to overwrite the inferred stats."
        )
        traj_prior_c2ws = torch.as_tensor(
            traj_prior_c2ws,
            device=input_c2ws.device,
            dtype=input_c2ws.dtype,
        )

        if traj_prior_Ks is None:
            traj_prior_Ks = test_Ks[:1].repeat_interleave(
                traj_prior_c2ws.shape[0], dim=0
            )

        traj_prior_imgs = imgs.new_zeros(traj_prior_c2ws.shape[0], *imgs.shape[1:])
        traj_prior_imgs_clip = imgs_clip.new_zeros(
            traj_prior_c2ws.shape[0], *imgs_clip.shape[1:]
        )

        # ---------------------------------- first pass ----------------------------------
        T_first_pass = T[0] if isinstance(T, (list, tuple)) else T
        T_second_pass = T[1] if isinstance(T, (list, tuple)) else T
        chunk_strategy_first_pass = options.get(
            "chunk_strategy_first_pass", "gt-nearest"
        )
        (
            _,
            input_inds_per_chunk,
            input_sels_per_chunk,
            prior_inds_per_chunk,
            prior_sels_per_chunk,
        ) = chunk_input_and_test(
            T_first_pass,
            input_c2ws,
            traj_prior_c2ws,
            input_indices,
            image_cond["prior_indices"],
            options=options,
            task=task,
            chunk_strategy=chunk_strategy_first_pass,
            gt_input_inds=list(range(input_c2ws.shape[0])),
        )
        print(
            f"Two passes (first) - chunking with `{chunk_strategy_first_pass}` strategy: total "
            f"{len(input_inds_per_chunk)} forward(s) ..."
        )

        all_samples = {}
        all_prior_inds = []
        for i, (
            chunk_input_inds,
            chunk_input_sels,
            chunk_prior_inds,
            chunk_prior_sels,
        ) in tqdm(
            enumerate(
                zip(
                    input_inds_per_chunk,
                    input_sels_per_chunk,
                    prior_inds_per_chunk,
                    prior_sels_per_chunk,
                )
            ),
            total=len(input_inds_per_chunk),
            leave=False,
        ):
            (
                curr_input_sels,
                curr_prior_sels,
                curr_input_maps,
                curr_prior_maps,
            ) = pad_indices(
                chunk_input_sels,
                chunk_prior_sels,
                T=T_first_pass,
                padding_mode=options.get("t_padding_mode", "last"),
            )
            curr_imgs, curr_imgs_clip, curr_c2ws, curr_Ks = [
                assemble(
                    input=x[chunk_input_inds],
                    test=y[chunk_prior_inds],
                    input_maps=curr_input_maps,
                    test_maps=curr_prior_maps,
                )
                for x, y in zip(
                    [
                        torch.cat(
                            [
                                input_imgs,
                                get_k_from_dict(all_samples, "samples-rgb").to(
                                    input_imgs.device
                                ),
                            ],
                            dim=0,
                        ),
                        torch.cat(
                            [
                                input_imgs_clip,
                                get_k_from_dict(all_samples, "samples-rgb").to(
                                    input_imgs.device
                                ),
                            ],
                            dim=0,
                        ),
                        torch.cat([input_c2ws, traj_prior_c2ws[all_prior_inds]], dim=0),
                        torch.cat([input_Ks, traj_prior_Ks[all_prior_inds]], dim=0),
                    ],  # procedurally append generated prior views to the input views
                    [
                        traj_prior_imgs,
                        traj_prior_imgs_clip,
                        traj_prior_c2ws,
                        traj_prior_Ks,
                    ],
                )
            ]
            value_dict = get_value_dict(
                curr_imgs.to("cuda"),
                curr_imgs_clip.to("cuda"),
                curr_input_sels,
                curr_c2ws,
                curr_Ks,
                list(range(T_first_pass)),
                all_c2ws=camera_cond["c2w"],  # traj_prior_c2ws,
            )
            samplers = create_samplers(
                options["guider_types"],
                discretization,
                [T_first_pass, T_second_pass],
                options["num_steps"],
                options["cfg_min"],
                abort_event=abort_event,
            )
            samples = do_sample(
                model,
                ae,
                conditioner,
                denoiser,
                (
                    samplers[1]
+
if len(samplers) > 1
|
| 1790 |
+
and options.get("ltr_first_pass", False)
|
| 1791 |
+
and chunk_strategy_first_pass != "gt"
|
| 1792 |
+
and i > 0
|
| 1793 |
+
else samplers[0]
|
| 1794 |
+
),
|
| 1795 |
+
value_dict,
|
| 1796 |
+
H,
|
| 1797 |
+
W,
|
| 1798 |
+
C,
|
| 1799 |
+
F,
|
| 1800 |
+
cfg=(
|
| 1801 |
+
options["cfg"][0]
|
| 1802 |
+
if isinstance(options["cfg"], (list, tuple))
|
| 1803 |
+
else options["cfg"]
|
| 1804 |
+
),
|
| 1805 |
+
T=T_first_pass,
|
| 1806 |
+
global_pbar=first_pass_pbar,
|
| 1807 |
+
**{k: options[k] for k in options if k not in ["cfg", "T", "sampler"]},
|
| 1808 |
+
)
|
| 1809 |
+
if samples is None:
|
| 1810 |
+
return
|
| 1811 |
+
samples = decode_output(
|
| 1812 |
+
samples, T_first_pass, chunk_prior_sels
|
| 1813 |
+
) # decode into dict
|
| 1814 |
+
extend_dict(all_samples, samples)
|
| 1815 |
+
all_prior_inds.extend(chunk_prior_inds)
|
| 1816 |
+
|
| 1817 |
+
if options.get("save_first_pass", True):
|
| 1818 |
+
save_output(
|
| 1819 |
+
all_samples,
|
| 1820 |
+
save_path=os.path.join(save_path, "first-pass"),
|
| 1821 |
+
video_save_fps=5,
|
| 1822 |
+
)
|
| 1823 |
+
video_path_0 = os.path.join(save_path, "first-pass", "samples-rgb.mp4")
|
| 1824 |
+
yield video_path_0
|
| 1825 |
+
|
| 1826 |
+
# ---------------------------------- second pass ----------------------------------
|
| 1827 |
+
prior_indices = image_cond["prior_indices"]
|
| 1828 |
+
assert (
|
| 1829 |
+
prior_indices is not None
|
| 1830 |
+
), "`prior_frame_indices` needs to be set if using 2-pass sampling."
|
| 1831 |
+
prior_argsort = np.argsort(input_indices + prior_indices).tolist()
|
| 1832 |
+
prior_indices = np.array(input_indices + prior_indices)[prior_argsort].tolist()
|
| 1833 |
+
gt_input_inds = [prior_argsort.index(i) for i in range(input_c2ws.shape[0])]
|
| 1834 |
+
|
| 1835 |
+
traj_prior_imgs = torch.cat(
|
| 1836 |
+
[input_imgs, get_k_from_dict(all_samples, "samples-rgb")], dim=0
|
| 1837 |
+
)[prior_argsort]
|
| 1838 |
+
traj_prior_imgs_clip = torch.cat(
|
| 1839 |
+
[
|
| 1840 |
+
input_imgs_clip,
|
| 1841 |
+
get_k_from_dict(all_samples, "samples-rgb"),
|
| 1842 |
+
],
|
| 1843 |
+
dim=0,
|
| 1844 |
+
)[prior_argsort]
|
| 1845 |
+
traj_prior_c2ws = torch.cat([input_c2ws, traj_prior_c2ws], dim=0)[prior_argsort]
|
| 1846 |
+
traj_prior_Ks = torch.cat([input_Ks, traj_prior_Ks], dim=0)[prior_argsort]
|
| 1847 |
+
|
| 1848 |
+
update_kv_for_dict(all_samples, "samples-rgb", traj_prior_imgs)
|
| 1849 |
+
update_kv_for_dict(all_samples, "samples-c2ws", traj_prior_c2ws)
|
| 1850 |
+
update_kv_for_dict(all_samples, "samples-intrinsics", traj_prior_Ks)
|
| 1851 |
+
|
| 1852 |
+
chunk_strategy = options.get("chunk_strategy", "nearest")
|
| 1853 |
+
(
|
| 1854 |
+
_,
|
| 1855 |
+
prior_inds_per_chunk,
|
| 1856 |
+
prior_sels_per_chunk,
|
| 1857 |
+
test_inds_per_chunk,
|
| 1858 |
+
test_sels_per_chunk,
|
| 1859 |
+
) = chunk_input_and_test(
|
| 1860 |
+
T_second_pass,
|
| 1861 |
+
traj_prior_c2ws,
|
| 1862 |
+
test_c2ws,
|
| 1863 |
+
prior_indices,
|
| 1864 |
+
test_indices,
|
| 1865 |
+
options=options,
|
| 1866 |
+
task=task,
|
| 1867 |
+
chunk_strategy=chunk_strategy,
|
| 1868 |
+
gt_input_inds=gt_input_inds,
|
| 1869 |
+
)
|
| 1870 |
+
print(
|
| 1871 |
+
f"Two passes (second) - chunking with `{chunk_strategy}` strategy: total "
|
| 1872 |
+
f"{len(prior_inds_per_chunk)} forward(s) ..."
|
| 1873 |
+
)
|
| 1874 |
+
|
| 1875 |
+
all_samples = {}
|
| 1876 |
+
all_test_inds = []
|
| 1877 |
+
for i, (
|
| 1878 |
+
chunk_prior_inds,
|
| 1879 |
+
chunk_prior_sels,
|
| 1880 |
+
chunk_test_inds,
|
| 1881 |
+
chunk_test_sels,
|
| 1882 |
+
) in tqdm(
|
| 1883 |
+
enumerate(
|
| 1884 |
+
zip(
|
| 1885 |
+
prior_inds_per_chunk,
|
| 1886 |
+
prior_sels_per_chunk,
|
| 1887 |
+
test_inds_per_chunk,
|
| 1888 |
+
test_sels_per_chunk,
|
| 1889 |
+
)
|
| 1890 |
+
),
|
| 1891 |
+
total=len(prior_inds_per_chunk),
|
| 1892 |
+
leave=False,
|
| 1893 |
+
):
|
| 1894 |
+
(
|
| 1895 |
+
curr_prior_sels,
|
| 1896 |
+
curr_test_sels,
|
| 1897 |
+
curr_prior_maps,
|
| 1898 |
+
curr_test_maps,
|
| 1899 |
+
) = pad_indices(
|
| 1900 |
+
chunk_prior_sels,
|
| 1901 |
+
chunk_test_sels,
|
| 1902 |
+
T=T_second_pass,
|
| 1903 |
+
padding_mode="last",
|
| 1904 |
+
)
|
| 1905 |
+
curr_imgs, curr_imgs_clip, curr_c2ws, curr_Ks = [
|
| 1906 |
+
assemble(
|
| 1907 |
+
input=x[chunk_prior_inds],
|
| 1908 |
+
test=y[chunk_test_inds],
|
| 1909 |
+
input_maps=curr_prior_maps,
|
| 1910 |
+
test_maps=curr_test_maps,
|
| 1911 |
+
)
|
| 1912 |
+
for x, y in zip(
|
| 1913 |
+
[
|
| 1914 |
+
traj_prior_imgs,
|
| 1915 |
+
traj_prior_imgs_clip,
|
| 1916 |
+
traj_prior_c2ws,
|
| 1917 |
+
traj_prior_Ks,
|
| 1918 |
+
],
|
| 1919 |
+
[test_imgs, test_imgs_clip, test_c2ws, test_Ks],
|
| 1920 |
+
)
|
| 1921 |
+
]
|
| 1922 |
+
value_dict = get_value_dict(
|
| 1923 |
+
curr_imgs.to("cuda"),
|
| 1924 |
+
curr_imgs_clip.to("cuda"),
|
| 1925 |
+
curr_prior_sels,
|
| 1926 |
+
curr_c2ws,
|
| 1927 |
+
curr_Ks,
|
| 1928 |
+
list(range(T_second_pass)),
|
| 1929 |
+
all_c2ws=camera_cond["c2w"], # test_c2ws,
|
| 1930 |
+
)
|
| 1931 |
+
samples = do_sample(
|
| 1932 |
+
model,
|
| 1933 |
+
ae,
|
| 1934 |
+
conditioner,
|
| 1935 |
+
denoiser,
|
| 1936 |
+
samplers[1] if len(samplers) > 1 else samplers[0],
|
| 1937 |
+
value_dict,
|
| 1938 |
+
H,
|
| 1939 |
+
W,
|
| 1940 |
+
C,
|
| 1941 |
+
F,
|
| 1942 |
+
T=T_second_pass,
|
| 1943 |
+
cfg=(
|
| 1944 |
+
options["cfg"][1]
|
| 1945 |
+
if isinstance(options["cfg"], (list, tuple))
|
| 1946 |
+
and len(options["cfg"]) > 1
|
| 1947 |
+
else options["cfg"]
|
| 1948 |
+
),
|
| 1949 |
+
global_pbar=second_pass_pbar,
|
| 1950 |
+
**{k: options[k] for k in options if k not in ["cfg", "T", "sampler"]},
|
| 1951 |
+
)
|
| 1952 |
+
if samples is None:
|
| 1953 |
+
return
|
| 1954 |
+
samples = decode_output(
|
| 1955 |
+
samples, T_second_pass, chunk_test_sels
|
| 1956 |
+
) # decode into dict
|
| 1957 |
+
if options.get("save_second_pass", False):
|
| 1958 |
+
save_output(
|
| 1959 |
+
replace_or_include_input_for_dict(
|
| 1960 |
+
samples,
|
| 1961 |
+
chunk_test_sels,
|
| 1962 |
+
curr_imgs,
|
| 1963 |
+
curr_c2ws,
|
| 1964 |
+
curr_Ks,
|
| 1965 |
+
),
|
| 1966 |
+
save_path=os.path.join(save_path, "second-pass", f"forward_{i}"),
|
| 1967 |
+
video_save_fps=2,
|
| 1968 |
+
)
|
| 1969 |
+
extend_dict(all_samples, samples)
|
| 1970 |
+
all_test_inds.extend(chunk_test_inds)
|
| 1971 |
+
all_samples = {
|
| 1972 |
+
key: value[np.argsort(all_test_inds)] for key, value in all_samples.items()
|
| 1973 |
+
}
|
| 1974 |
+
save_output(
|
| 1975 |
+
replace_or_include_input_for_dict(
|
| 1976 |
+
all_samples,
|
| 1977 |
+
test_indices,
|
| 1978 |
+
imgs.clone(),
|
| 1979 |
+
camera_cond["c2w"].clone(),
|
| 1980 |
+
camera_cond["K"].clone(),
|
| 1981 |
+
)
|
| 1982 |
+
if options.get("replace_or_include_input", False)
|
| 1983 |
+
else all_samples,
|
| 1984 |
+
save_path=save_path,
|
| 1985 |
+
video_save_fps=options.get("video_save_fps", 2),
|
| 1986 |
+
)
|
| 1987 |
+
video_path_1 = os.path.join(save_path, "samples-rgb.mp4")
|
| 1988 |
+
yield video_path_1
|
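Editor's note on the two-pass sampling code above (not part of the diff): `cfg` and `T` may be given either as scalars or as per-pass pairs, the first pass defaults to the "gt-nearest" chunking strategy and the second pass to "nearest", and index padding uses `t_padding_mode` ("last" by default). The sketch below is a hypothetical options dict showing only the keys that are read in the code above; the concrete values, and any further keys required elsewhere in seva/eval.py, are assumptions rather than values taken from this commit.

# Hypothetical illustration of the options consumed by the two-pass loop above.
# Values are placeholders; only the key names and stated defaults mirror the code.
options = {
    "cfg": [3.0, 2.0],                          # scalar or [first_pass, second_pass] (assumed values)
    "T": [21, 21],                              # scalar or [T_first_pass, T_second_pass] (assumed values)
    "num_steps": 50,                            # assumed value
    "cfg_min": 1.2,                             # assumed value
    "guider_types": [1, 2],                     # assumed value; forwarded to create_samplers
    "chunk_strategy_first_pass": "gt-nearest",  # default used above
    "chunk_strategy": "nearest",                # default used above
    "t_padding_mode": "last",                   # default used above
    "save_first_pass": True,
    "save_second_pass": False,
    "replace_or_include_input": False,
    "video_save_fps": 2,
}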
seva/geometry.py
ADDED
@@ -0,0 +1,811 @@
from typing import Literal

import numpy as np
import roma
import scipy.interpolate
import torch
import torch.nn.functional as F

DEFAULT_FOV_RAD = 0.9424777960769379  # 54 degrees by default


def get_camera_dist(
    source_c2ws: torch.Tensor,  # N x 3 x 4
    target_c2ws: torch.Tensor,  # M x 3 x 4
    mode: str = "translation",
):
    if mode == "rotation":
        dists = torch.acos(
            (
                (
                    torch.matmul(
                        source_c2ws[:, None, :3, :3],
                        target_c2ws[None, :, :3, :3].transpose(-1, -2),
                    )
                    .diagonal(offset=0, dim1=-2, dim2=-1)
                    .sum(-1)
                    - 1
                )
                / 2
            ).clamp(-1, 1)
        ) * (180 / torch.pi)
    elif mode == "translation":
        dists = torch.norm(
            source_c2ws[:, None, :3, 3] - target_c2ws[None, :, :3, 3], dim=-1
        )
    else:
        raise NotImplementedError(
            f"Mode {mode} is not implemented for finding nearest source indices."
        )
    return dists


def to_hom(X):
    # get homogeneous coordinates of the input
    X_hom = torch.cat([X, torch.ones_like(X[..., :1])], dim=-1)
    return X_hom


def to_hom_pose(pose):
    # get homogeneous coordinates of the input pose
    if pose.shape[-2:] == (3, 4):
        pose_hom = torch.eye(4, device=pose.device)[None].repeat(pose.shape[0], 1, 1)
        pose_hom[:, :3, :] = pose
        return pose_hom
    return pose


def get_default_intrinsics(
    fov_rad=DEFAULT_FOV_RAD,
    aspect_ratio=1.0,
):
    if not isinstance(fov_rad, torch.Tensor):
        fov_rad = torch.tensor(
            [fov_rad] if isinstance(fov_rad, (int, float)) else fov_rad
        )
    if aspect_ratio >= 1.0:  # W >= H
        focal_x = 0.5 / torch.tan(0.5 * fov_rad)
        focal_y = focal_x * aspect_ratio
    else:  # W < H
        focal_y = 0.5 / torch.tan(0.5 * fov_rad)
        focal_x = focal_y / aspect_ratio
    intrinsics = focal_x.new_zeros((focal_x.shape[0], 3, 3))
    intrinsics[:, torch.eye(3, device=focal_x.device, dtype=bool)] = torch.stack(
        [focal_x, focal_y, torch.ones_like(focal_x)], dim=-1
    )
    intrinsics[:, :, -1] = torch.tensor(
        [0.5, 0.5, 1.0], device=focal_x.device, dtype=focal_x.dtype
    )
    return intrinsics


def get_image_grid(img_h, img_w):
    # add 0.5 is VERY important especially when your img_h and img_w
    # is not very large (e.g., 72)!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    y_range = torch.arange(img_h, dtype=torch.float32).add_(0.5)
    x_range = torch.arange(img_w, dtype=torch.float32).add_(0.5)
    Y, X = torch.meshgrid(y_range, x_range, indexing="ij")  # [H,W]
    xy_grid = torch.stack([X, Y], dim=-1).view(-1, 2)  # [HW,2]
    return to_hom(xy_grid)  # [HW,3]


def img2cam(X, cam_intr):
    return X @ cam_intr.inverse().transpose(-1, -2)


def cam2world(X, pose):
    X_hom = to_hom(X)
    pose_inv = torch.linalg.inv(to_hom_pose(pose))[..., :3, :4]
    return X_hom @ pose_inv.transpose(-1, -2)


def get_center_and_ray(
    img_h, img_w, pose, intr, zero_center_for_debugging=False
):  # [HW,2]
    # given the intrinsic/extrinsic matrices, get the camera center and ray directions]
    # assert(opt.camera.model=="perspective")

    # compute center and ray
    grid_img = get_image_grid(img_h, img_w)  # [HW,3]
    grid_3D_cam = img2cam(grid_img.to(intr.device), intr.float())  # [B,HW,3]
    center_3D_cam = torch.zeros_like(grid_3D_cam)  # [B,HW,3]

    # transform from camera to world coordinates
    grid_3D = cam2world(grid_3D_cam, pose)  # [B,HW,3]
    center_3D = cam2world(center_3D_cam, pose)  # [B,HW,3]
    ray = grid_3D - center_3D  # [B,HW,3]

    return center_3D_cam if zero_center_for_debugging else center_3D, ray, grid_3D_cam


def get_plucker_coordinates(
    extrinsics_src,
    extrinsics,
    intrinsics=None,
    fov_rad=DEFAULT_FOV_RAD,
    mode="plucker",
    rel_zero_translation=True,
    zero_center_for_debugging=False,
    target_size=[72, 72],  # 576-size image
    return_grid_cam=False,  # save for later use if want restore
):
    if intrinsics is None:
        intrinsics = get_default_intrinsics(fov_rad).to(extrinsics.device)
    else:
        # for some data preprocessed in the early stage (e.g., MVI and CO3D),
        # intrinsics are expressed in raw pixel space (e.g., 576x576) instead
        # of normalized image coordinates
        if not (
            torch.all(intrinsics[:, :2, -1] >= 0)
            and torch.all(intrinsics[:, :2, -1] <= 1)
        ):
            intrinsics[:, :2] /= intrinsics.new_tensor(target_size).view(1, -1, 1) * 8
        # you should ensure the intrisics are expressed in
        # resolution-independent normalized image coordinates just performing a
        # very simple verification here checking if principal points are
        # between 0 and 1
        assert (
            torch.all(intrinsics[:, :2, -1] >= 0)
            and torch.all(intrinsics[:, :2, -1] <= 1)
        ), "Intrinsics should be expressed in resolution-independent normalized image coordinates."

    c2w_src = torch.linalg.inv(extrinsics_src)
    if not rel_zero_translation:
        c2w_src[:3, 3] = c2w_src[3, :3] = 0.0
    # transform coordinates from the source camera's coordinate system to the coordinate system of the respective camera
    extrinsics_rel = torch.einsum(
        "vnm,vmp->vnp", extrinsics, c2w_src[None].repeat(extrinsics.shape[0], 1, 1)
    )

    intrinsics[:, :2] *= extrinsics.new_tensor(
        [
            target_size[1],  # w
            target_size[0],  # h
        ]
    ).view(1, -1, 1)
    centers, rays, grid_cam = get_center_and_ray(
        img_h=target_size[0],
        img_w=target_size[1],
        pose=extrinsics_rel[:, :3, :],
        intr=intrinsics,
        zero_center_for_debugging=zero_center_for_debugging,
    )

    if mode == "plucker" or "v1" in mode:
        rays = torch.nn.functional.normalize(rays, dim=-1)
        plucker = torch.cat((rays, torch.cross(centers, rays, dim=-1)), dim=-1)
    else:
        raise ValueError(f"Unknown Plucker coordinate mode: {mode}")

    plucker = plucker.permute(0, 2, 1).reshape(plucker.shape[0], -1, *target_size)
    if return_grid_cam:
        return plucker, grid_cam.reshape(-1, *target_size, 3)
    return plucker


def rt_to_mat4(
    R: torch.Tensor, t: torch.Tensor, s: torch.Tensor | None = None
) -> torch.Tensor:
    """
    Args:
        R (torch.Tensor): (..., 3, 3).
        t (torch.Tensor): (..., 3).
        s (torch.Tensor): (...,).

    Returns:
        torch.Tensor: (..., 4, 4)
    """
    mat34 = torch.cat([R, t[..., None]], dim=-1)
    if s is None:
        bottom = (
            mat34.new_tensor([[0.0, 0.0, 0.0, 1.0]])
            .reshape((1,) * (mat34.dim() - 2) + (1, 4))
            .expand(mat34.shape[:-2] + (1, 4))
        )
    else:
        bottom = F.pad(1.0 / s[..., None, None], (3, 0), value=0.0)
    mat4 = torch.cat([mat34, bottom], dim=-2)
    return mat4


def get_preset_pose_fov(
    option: Literal[
        "orbit",
        "spiral",
        "lemniscate",
        "zoom-in",
        "zoom-out",
        "dolly zoom-in",
        "dolly zoom-out",
        "move-forward",
        "move-backward",
        "move-up",
        "move-down",
        "move-left",
        "move-right",
        "roll",
    ],
    num_frames: int,
    start_w2c: torch.Tensor,
    look_at: torch.Tensor,
    up_direction: torch.Tensor | None = None,
    fov: float = DEFAULT_FOV_RAD,
    spiral_radii: list[float] = [0.5, 0.5, 0.2],
    zoom_factor: float | None = None,
):
    poses = fovs = None
    if option == "orbit":
        poses = torch.linalg.inv(
            get_arc_horizontal_w2cs(
                start_w2c,
                look_at,
                up_direction,
                num_frames=num_frames,
                endpoint=False,
            )
        ).numpy()
        fovs = np.full((num_frames,), fov)
    elif option == "spiral":
        poses = generate_spiral_path(
            torch.linalg.inv(start_w2c)[None].numpy() @ np.diagflat([1, -1, -1, 1]),
            np.array([1, 5]),
            n_frames=num_frames,
            n_rots=2,
            zrate=0.5,
            radii=spiral_radii,
            endpoint=False,
        ) @ np.diagflat([1, -1, -1, 1])
        poses = np.concatenate(
            [
                poses,
                np.array([0.0, 0.0, 0.0, 1.0])[None, None].repeat(len(poses), 0),
            ],
            1,
        )
        # We want the spiral trajectory to always start from start_w2c. Thus we
        # apply the relative pose to get the final trajectory.
        poses = (
            np.linalg.inv(start_w2c.numpy())[None] @ np.linalg.inv(poses[:1]) @ poses
        )
        fovs = np.full((num_frames,), fov)
    elif option == "lemniscate":
        poses = torch.linalg.inv(
            get_lemniscate_w2cs(
                start_w2c,
                look_at,
                up_direction,
                num_frames,
                degree=60.0,
                endpoint=False,
            )
        ).numpy()
        fovs = np.full((num_frames,), fov)
    elif option == "roll":
        poses = torch.linalg.inv(
            get_roll_w2cs(
                start_w2c,
                look_at,
                None,
                num_frames,
                degree=360.0,
                endpoint=False,
            )
        ).numpy()
        fovs = np.full((num_frames,), fov)
    elif option in [
        "dolly zoom-in",
        "dolly zoom-out",
        "zoom-in",
        "zoom-out",
    ]:
        if option.startswith("dolly"):
            direction = "backward" if option == "dolly zoom-in" else "forward"
            poses = torch.linalg.inv(
                get_moving_w2cs(
                    start_w2c,
                    look_at,
                    up_direction,
                    num_frames,
                    endpoint=True,
                    direction=direction,
                )
            ).numpy()
        else:
            poses = torch.linalg.inv(start_w2c)[None].repeat(num_frames, 1, 1).numpy()
        fov_rad_start = fov
        if zoom_factor is None:
            zoom_factor = 0.28 if option.endswith("zoom-in") else 1.5
        fov_rad_end = zoom_factor * fov
        fovs = (
            np.linspace(0, 1, num_frames) * (fov_rad_end - fov_rad_start)
            + fov_rad_start
        )
    elif option in [
        "move-forward",
        "move-backward",
        "move-up",
        "move-down",
        "move-left",
        "move-right",
    ]:
        poses = torch.linalg.inv(
            get_moving_w2cs(
                start_w2c,
                look_at,
                up_direction,
                num_frames,
                endpoint=True,
                direction=option.removeprefix("move-"),
            )
        ).numpy()
        fovs = np.full((num_frames,), fov)
    else:
        raise ValueError(f"Unknown preset option {option}.")

    return poses, fovs


def get_lookat(origins: torch.Tensor, viewdirs: torch.Tensor) -> torch.Tensor:
    """Triangulate a set of rays to find a single lookat point.

    Args:
        origins (torch.Tensor): A (N, 3) array of ray origins.
        viewdirs (torch.Tensor): A (N, 3) array of ray view directions.

    Returns:
        torch.Tensor: A (3,) lookat point.
    """

    viewdirs = torch.nn.functional.normalize(viewdirs, dim=-1)
    eye = torch.eye(3, device=origins.device, dtype=origins.dtype)[None]
    # Calculate projection matrix I - rr^T
    I_min_cov = eye - (viewdirs[..., None] * viewdirs[..., None, :])
    # Compute sum of projections
    sum_proj = I_min_cov.matmul(origins[..., None]).sum(dim=-3)
    # Solve for the intersection point using least squares
    lookat = torch.linalg.lstsq(I_min_cov.sum(dim=-3), sum_proj).solution[..., 0]
    # Check NaNs.
    assert not torch.any(torch.isnan(lookat))
    return lookat


def get_lookat_w2cs(
    positions: torch.Tensor,
    lookat: torch.Tensor,
    up: torch.Tensor,
    face_off: bool = False,
):
    """
    Args:
        positions: (N, 3) tensor of camera positions
        lookat: (3,) tensor of lookat point
        up: (3,) or (N, 3) tensor of up vector

    Returns:
        w2cs: (N, 3, 3) tensor of world to camera rotation matrices
    """
    forward_vectors = F.normalize(lookat - positions, dim=-1)
    if face_off:
        forward_vectors = -forward_vectors
    if up.dim() == 1:
        up = up[None]
    right_vectors = F.normalize(torch.cross(forward_vectors, up, dim=-1), dim=-1)
    down_vectors = F.normalize(
        torch.cross(forward_vectors, right_vectors, dim=-1), dim=-1
    )
    Rs = torch.stack([right_vectors, down_vectors, forward_vectors], dim=-1)
    w2cs = torch.linalg.inv(rt_to_mat4(Rs, positions))
    return w2cs


def get_arc_horizontal_w2cs(
    ref_w2c: torch.Tensor,
    lookat: torch.Tensor,
    up: torch.Tensor | None,
    num_frames: int,
    clockwise: bool = True,
    face_off: bool = False,
    endpoint: bool = False,
    degree: float = 360.0,
    ref_up_shift: float = 0.0,
    ref_radius_scale: float = 1.0,
    **_,
) -> torch.Tensor:
    ref_c2w = torch.linalg.inv(ref_w2c)
    ref_position = ref_c2w[:3, 3]
    if up is None:
        up = -ref_c2w[:3, 1]
    assert up is not None
    ref_position += up * ref_up_shift
    ref_position *= ref_radius_scale
    thetas = (
        torch.linspace(0.0, torch.pi * degree / 180, num_frames, device=ref_w2c.device)
        if endpoint
        else torch.linspace(
            0.0, torch.pi * degree / 180, num_frames + 1, device=ref_w2c.device
        )[:-1]
    )
    if not clockwise:
        thetas = -thetas
    positions = (
        torch.einsum(
            "nij,j->ni",
            roma.rotvec_to_rotmat(thetas[:, None] * up[None]),
            ref_position - lookat,
        )
        + lookat
    )
    return get_lookat_w2cs(positions, lookat, up, face_off=face_off)


def get_lemniscate_w2cs(
    ref_w2c: torch.Tensor,
    lookat: torch.Tensor,
    up: torch.Tensor | None,
    num_frames: int,
    degree: float,
    endpoint: bool = False,
    **_,
) -> torch.Tensor:
    ref_c2w = torch.linalg.inv(ref_w2c)
    a = torch.linalg.norm(ref_c2w[:3, 3] - lookat) * np.tan(degree / 360 * np.pi)
    # Lemniscate curve in camera space. Starting at the origin.
    thetas = (
        torch.linspace(0, 2 * torch.pi, num_frames, device=ref_w2c.device)
        if endpoint
        else torch.linspace(0, 2 * torch.pi, num_frames + 1, device=ref_w2c.device)[:-1]
    ) + torch.pi / 2
    positions = torch.stack(
        [
            a * torch.cos(thetas) / (1 + torch.sin(thetas) ** 2),
            a * torch.cos(thetas) * torch.sin(thetas) / (1 + torch.sin(thetas) ** 2),
            torch.zeros(num_frames, device=ref_w2c.device),
        ],
        dim=-1,
    )
    # Transform to world space.
    positions = torch.einsum(
        "ij,nj->ni", ref_c2w[:3], F.pad(positions, (0, 1), value=1.0)
    )
    if up is None:
        up = -ref_c2w[:3, 1]
    assert up is not None
    return get_lookat_w2cs(positions, lookat, up)


def get_moving_w2cs(
    ref_w2c: torch.Tensor,
    lookat: torch.Tensor,
    up: torch.Tensor | None,
    num_frames: int,
    endpoint: bool = False,
    direction: str = "forward",
    tilt_xy: torch.Tensor = None,
):
    """
    Args:
        ref_w2c: (4, 4) tensor of the reference wolrd-to-camera matrix
        lookat: (3,) tensor of lookat point
        up: (3,) tensor of up vector

    Returns:
        w2cs: (N, 3, 3) tensor of world to camera rotation matrices
    """
    ref_c2w = torch.linalg.inv(ref_w2c)
    ref_position = ref_c2w[:3, -1]
    if up is None:
        up = -ref_c2w[:3, 1]

    direction_vectors = {
        "forward": (lookat - ref_position).clone(),
        "backward": -(lookat - ref_position).clone(),
        "up": up.clone(),
        "down": -up.clone(),
        "right": torch.cross((lookat - ref_position), up, dim=0),
        "left": -torch.cross((lookat - ref_position), up, dim=0),
    }
    if direction not in direction_vectors:
        raise ValueError(
            f"Invalid direction: {direction}. Must be one of {list(direction_vectors.keys())}"
        )

    positions = ref_position + (
        F.normalize(direction_vectors[direction], dim=0)
        * (
            torch.linspace(0, 0.99, num_frames, device=ref_w2c.device)
            if endpoint
            else torch.linspace(0, 1, num_frames + 1, device=ref_w2c.device)[:-1]
        )[:, None]
    )

    if tilt_xy is not None:
        positions[:, :2] += tilt_xy

    return get_lookat_w2cs(positions, lookat, up)


def get_roll_w2cs(
    ref_w2c: torch.Tensor,
    lookat: torch.Tensor,
    up: torch.Tensor | None,
    num_frames: int,
    endpoint: bool = False,
    degree: float = 360.0,
    **_,
) -> torch.Tensor:
    ref_c2w = torch.linalg.inv(ref_w2c)
    ref_position = ref_c2w[:3, 3]
    if up is None:
        up = -ref_c2w[:3, 1]  # Infer the up vector from the reference.

    # Create vertical angles
    thetas = (
        torch.linspace(0.0, torch.pi * degree / 180, num_frames, device=ref_w2c.device)
        if endpoint
        else torch.linspace(
            0.0, torch.pi * degree / 180, num_frames + 1, device=ref_w2c.device
        )[:-1]
    )[:, None]

    lookat_vector = F.normalize(lookat[None].float(), dim=-1)
    up = up[None]
    up = (
        up * torch.cos(thetas)
        + torch.cross(lookat_vector, up) * torch.sin(thetas)
        + lookat_vector
        * torch.einsum("ij,ij->i", lookat_vector, up)[:, None]
        * (1 - torch.cos(thetas))
    )

    # Normalize the camera orientation
    return get_lookat_w2cs(ref_position[None].repeat(num_frames, 1), lookat, up)


def normalize(x):
    """Normalization helper function."""
    return x / np.linalg.norm(x)


def viewmatrix(lookdir, up, position, subtract_position=False):
    """Construct lookat view matrix."""
    vec2 = normalize((lookdir - position) if subtract_position else lookdir)
    vec0 = normalize(np.cross(up, vec2))
    vec1 = normalize(np.cross(vec2, vec0))
    m = np.stack([vec0, vec1, vec2, position], axis=1)
    return m


def poses_avg(poses):
    """New pose using average position, z-axis, and up vector of input poses."""
    position = poses[:, :3, 3].mean(0)
    z_axis = poses[:, :3, 2].mean(0)
    up = poses[:, :3, 1].mean(0)
    cam2world = viewmatrix(z_axis, up, position)
    return cam2world


def generate_spiral_path(
    poses, bounds, n_frames=120, n_rots=2, zrate=0.5, endpoint=False, radii=None
):
    """Calculates a forward facing spiral path for rendering."""
    # Find a reasonable 'focus depth' for this dataset as a weighted average
    # of near and far bounds in disparity space.
    close_depth, inf_depth = bounds.min() * 0.9, bounds.max() * 5.0
    dt = 0.75
    focal = 1 / ((1 - dt) / close_depth + dt / inf_depth)

    # Get radii for spiral path using 90th percentile of camera positions.
    positions = poses[:, :3, 3]
    if radii is None:
        radii = np.percentile(np.abs(positions), 90, 0)
    radii = np.concatenate([radii, [1.0]])

    # Generate poses for spiral path.
    render_poses = []
    cam2world = poses_avg(poses)
    up = poses[:, :3, 1].mean(0)
    for theta in np.linspace(0.0, 2.0 * np.pi * n_rots, n_frames, endpoint=endpoint):
        t = radii * [np.cos(theta), -np.sin(theta), -np.sin(theta * zrate), 1.0]
        position = cam2world @ t
        lookat = cam2world @ [0, 0, -focal, 1.0]
        z_axis = position - lookat
        render_poses.append(viewmatrix(z_axis, up, position))
    render_poses = np.stack(render_poses, axis=0)
    return render_poses


def generate_interpolated_path(
    poses: np.ndarray,
    n_interp: int,
    spline_degree: int = 5,
    smoothness: float = 0.03,
    rot_weight: float = 0.1,
    endpoint: bool = False,
):
    """Creates a smooth spline path between input keyframe camera poses.

    Spline is calculated with poses in format (position, lookat-point, up-point).

    Args:
        poses: (n, 3, 4) array of input pose keyframes.
        n_interp: returned path will have n_interp * (n - 1) total poses.
        spline_degree: polynomial degree of B-spline.
        smoothness: parameter for spline smoothing, 0 forces exact interpolation.
        rot_weight: relative weighting of rotation/translation in spline solve.

    Returns:
        Array of new camera poses with shape (n_interp * (n - 1), 3, 4).
    """

    def poses_to_points(poses, dist):
        """Converts from pose matrices to (position, lookat, up) format."""
        pos = poses[:, :3, -1]
        lookat = poses[:, :3, -1] - dist * poses[:, :3, 2]
        up = poses[:, :3, -1] + dist * poses[:, :3, 1]
        return np.stack([pos, lookat, up], 1)

    def points_to_poses(points):
        """Converts from (position, lookat, up) format to pose matrices."""
        return np.array([viewmatrix(p - l, u - p, p) for p, l, u in points])

    def interp(points, n, k, s):
        """Runs multidimensional B-spline interpolation on the input points."""
        sh = points.shape
        pts = np.reshape(points, (sh[0], -1))
        k = min(k, sh[0] - 1)
        tck, _ = scipy.interpolate.splprep(pts.T, k=k, s=s)
        u = np.linspace(0, 1, n, endpoint=endpoint)
        new_points = np.array(scipy.interpolate.splev(u, tck))
        new_points = np.reshape(new_points.T, (n, sh[1], sh[2]))
        return new_points

    points = poses_to_points(poses, dist=rot_weight)
    new_points = interp(
        points, n_interp * (points.shape[0] - 1), k=spline_degree, s=smoothness
    )
    return points_to_poses(new_points)


def similarity_from_cameras(c2w, strict_scaling=False, center_method="focus"):
    """
    reference: nerf-factory
    Get a similarity transform to normalize dataset
    from c2w (OpenCV convention) cameras
    :param c2w: (N, 4)
    :return T (4,4) , scale (float)
    """
    t = c2w[:, :3, 3]
    R = c2w[:, :3, :3]

    # (1) Rotate the world so that z+ is the up axis
    # we estimate the up axis by averaging the camera up axes
    ups = np.sum(R * np.array([0, -1.0, 0]), axis=-1)
    world_up = np.mean(ups, axis=0)
    world_up /= np.linalg.norm(world_up)

    up_camspace = np.array([0.0, -1.0, 0.0])
    c = (up_camspace * world_up).sum()
    cross = np.cross(world_up, up_camspace)
    skew = np.array(
        [
            [0.0, -cross[2], cross[1]],
            [cross[2], 0.0, -cross[0]],
            [-cross[1], cross[0], 0.0],
        ]
    )
    if c > -1:
        R_align = np.eye(3) + skew + (skew @ skew) * 1 / (1 + c)
    else:
        # In the unlikely case the original data has y+ up axis,
        # rotate 180-deg about x axis
        R_align = np.array([[-1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]])

    # R_align = np.eye(3) # DEBUG
    R = R_align @ R
    fwds = np.sum(R * np.array([0, 0.0, 1.0]), axis=-1)
    t = (R_align @ t[..., None])[..., 0]

    # (2) Recenter the scene.
    if center_method == "focus":
        # find the closest point to the origin for each camera's center ray
        nearest = t + (fwds * -t).sum(-1)[:, None] * fwds
        translate = -np.median(nearest, axis=0)
    elif center_method == "poses":
        # use center of the camera positions
        translate = -np.median(t, axis=0)
    else:
        raise ValueError(f"Unknown center_method {center_method}")

    transform = np.eye(4)
    transform[:3, 3] = translate
    transform[:3, :3] = R_align

    # (3) Rescale the scene using camera distances
    scale_fn = np.max if strict_scaling else np.median
    inv_scale = scale_fn(np.linalg.norm(t + translate, axis=-1))
    if inv_scale == 0:
        inv_scale = 1.0
    scale = 1.0 / inv_scale
    transform[:3, :] *= scale

    return transform


def align_principle_axes(point_cloud):
    # Compute centroid
    centroid = np.median(point_cloud, axis=0)

    # Translate point cloud to centroid
    translated_point_cloud = point_cloud - centroid

    # Compute covariance matrix
    covariance_matrix = np.cov(translated_point_cloud, rowvar=False)

    # Compute eigenvectors and eigenvalues
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

    # Sort eigenvectors by eigenvalues (descending order) so that the z-axis
    # is the principal axis with the smallest eigenvalue.
    sort_indices = eigenvalues.argsort()[::-1]
    eigenvectors = eigenvectors[:, sort_indices]

    # Check orientation of eigenvectors. If the determinant of the eigenvectors is
    # negative, then we need to flip the sign of one of the eigenvectors.
    if np.linalg.det(eigenvectors) < 0:
        eigenvectors[:, 0] *= -1

    # Create rotation matrix
    rotation_matrix = eigenvectors.T

    # Create SE(3) matrix (4x4 transformation matrix)
    transform = np.eye(4)
    transform[:3, :3] = rotation_matrix
    transform[:3, 3] = -rotation_matrix @ centroid

    return transform


def transform_points(matrix, points):
    """Transform points using a SE(4) matrix.

    Args:
        matrix: 4x4 SE(4) matrix
        points: Nx3 array of points

    Returns:
        Nx3 array of transformed points
    """
    assert matrix.shape == (4, 4)
    assert len(points.shape) == 2 and points.shape[1] == 3
    return points @ matrix[:3, :3].T + matrix[:3, 3]


def transform_cameras(matrix, camtoworlds):
    """Transform cameras using a SE(4) matrix.

    Args:
        matrix: 4x4 SE(4) matrix
        camtoworlds: Nx4x4 array of camera-to-world matrices

    Returns:
        Nx4x4 array of transformed camera-to-world matrices
    """
    assert matrix.shape == (4, 4)
    assert len(camtoworlds.shape) == 3 and camtoworlds.shape[1:] == (4, 4)
    camtoworlds = np.einsum("nij, ki -> nkj", camtoworlds, matrix)
    scaling = np.linalg.norm(camtoworlds[:, 0, :3], axis=1)
    camtoworlds[:, :3, :3] = camtoworlds[:, :3, :3] / scaling[:, None, None]
    return camtoworlds


def normalize_scene(camtoworlds, points=None, camera_center_method="focus"):
    T1 = similarity_from_cameras(camtoworlds, center_method=camera_center_method)
    camtoworlds = transform_cameras(T1, camtoworlds)
    if points is not None:
        points = transform_points(T1, points)
        T2 = align_principle_axes(points)
        camtoworlds = transform_cameras(T2, camtoworlds)
        points = transform_points(T2, points)
        return camtoworlds, points, T2 @ T1
    else:
        return camtoworlds, T1
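Editor's note on seva/geometry.py above (not part of the diff): the preset trajectories are generated by composing the w2c helpers (get_arc_horizontal_w2cs, get_lemniscate_w2cs, get_moving_w2cs, get_roll_w2cs) behind get_preset_pose_fov. A minimal usage sketch follows, assuming the seva package and its dependencies (torch, roma, scipy) are installed; the start pose and look-at point are illustrative placeholders.

import torch
from seva.geometry import DEFAULT_FOV_RAD, get_preset_pose_fov

# Start from an identity world-to-camera pose and orbit around a point one unit ahead.
start_w2c = torch.eye(4)
look_at = torch.tensor([0.0, 0.0, 1.0])
up = torch.tensor([0.0, -1.0, 0.0])  # "-y up" convention, matching the code above
poses, fovs = get_preset_pose_fov(
    "orbit",
    num_frames=80,
    start_w2c=start_w2c,
    look_at=look_at,
    up_direction=up,
    fov=DEFAULT_FOV_RAD,
)
# poses: (80, 4, 4) camera-to-world matrices as numpy; fovs: (80,) field of view in radians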
seva/gui.py
ADDED
@@ -0,0 +1,975 @@
import colorsys
import dataclasses
import threading
import time
from pathlib import Path

import numpy as np
import scipy
import splines
import splines.quaternion
import torch
import viser
import viser.transforms as vt

from seva.geometry import get_preset_pose_fov


@dataclasses.dataclass
class Keyframe(object):
    position: np.ndarray
    wxyz: np.ndarray
    override_fov_enabled: bool
    override_fov_rad: float
    aspect: float
    override_transition_enabled: bool
    override_transition_sec: float | None

    @staticmethod
    def from_camera(camera: viser.CameraHandle, aspect: float) -> "Keyframe":
        return Keyframe(
            camera.position,
            camera.wxyz,
            override_fov_enabled=False,
            override_fov_rad=camera.fov,
            aspect=aspect,
            override_transition_enabled=False,
            override_transition_sec=None,
        )

    @staticmethod
    def from_se3(se3: vt.SE3, fov: float, aspect: float) -> "Keyframe":
        return Keyframe(
            se3.translation(),
            se3.rotation().wxyz,
            override_fov_enabled=False,
            override_fov_rad=fov,
            aspect=aspect,
            override_transition_enabled=False,
            override_transition_sec=None,
        )


class CameraTrajectory(object):
    def __init__(
        self,
        server: viser.ViserServer,
        duration_element: viser.GuiInputHandle[float],
        scene_scale: float,
        scene_node_prefix: str = "/",
    ):
        self._server = server
        self._keyframes: dict[int, tuple[Keyframe, viser.CameraFrustumHandle]] = {}
        self._keyframe_counter: int = 0
        self._spline_nodes: list[viser.SceneNodeHandle] = []
        self._camera_edit_panel: viser.Gui3dContainerHandle | None = None

        self._orientation_spline: splines.quaternion.KochanekBartels | None = None
        self._position_spline: splines.KochanekBartels | None = None
        self._fov_spline: splines.KochanekBartels | None = None

        self._keyframes_visible: bool = True

        self._duration_element = duration_element
        self._scene_node_prefix = scene_node_prefix

        self.scene_scale = scene_scale
        # These parameters should be overridden externally.
        self.loop: bool = False
        self.framerate: float = 30.0
        self.tension: float = 0.0  # Tension / alpha term.
        self.default_fov: float = 0.0
        self.default_transition_sec: float = 0.0
        self.show_spline: bool = True

    def set_keyframes_visible(self, visible: bool) -> None:
        self._keyframes_visible = visible
        for keyframe in self._keyframes.values():
            keyframe[1].visible = visible

    def add_camera(self, keyframe: Keyframe, keyframe_index: int | None = None) -> None:
        """Add a new camera, or replace an old one if `keyframe_index` is passed in."""
        server = self._server

        # Add a keyframe if we aren't replacing an existing one.
        if keyframe_index is None:
            keyframe_index = self._keyframe_counter
            self._keyframe_counter += 1

        print(
            f"{keyframe.wxyz=} {keyframe.position=} {keyframe_index=} {keyframe.aspect=}"
        )
        frustum_handle = server.scene.add_camera_frustum(
            str(Path(self._scene_node_prefix) / f"cameras/{keyframe_index}"),
            fov=(
                keyframe.override_fov_rad
                if keyframe.override_fov_enabled
                else self.default_fov
            ),
            aspect=keyframe.aspect,
            scale=0.1 * self.scene_scale,
            color=(200, 10, 30),
            wxyz=keyframe.wxyz,
            position=keyframe.position,
            visible=self._keyframes_visible,
        )
        self._server.scene.add_icosphere(
            str(Path(self._scene_node_prefix) / f"cameras/{keyframe_index}/sphere"),
            radius=0.03,
            color=(200, 10, 30),
        )

        @frustum_handle.on_click
        def _(_) -> None:
            if self._camera_edit_panel is not None:
                self._camera_edit_panel.remove()
                self._camera_edit_panel = None

            with server.scene.add_3d_gui_container(
                "/camera_edit_panel",
                position=keyframe.position,
            ) as camera_edit_panel:
                self._camera_edit_panel = camera_edit_panel
                override_fov = server.gui.add_checkbox(
                    "Override FOV", initial_value=keyframe.override_fov_enabled
                )
                override_fov_degrees = server.gui.add_slider(
                    "Override FOV (degrees)",
                    5.0,
                    175.0,
                    step=0.1,
                    initial_value=keyframe.override_fov_rad * 180.0 / np.pi,
                    disabled=not keyframe.override_fov_enabled,
                )
                delete_button = server.gui.add_button(
                    "Delete", color="red", icon=viser.Icon.TRASH
                )
                go_to_button = server.gui.add_button("Go to")
                close_button = server.gui.add_button("Close")

                @override_fov.on_update
                def _(_) -> None:
                    keyframe.override_fov_enabled = override_fov.value
                    override_fov_degrees.disabled = not override_fov.value
                    self.add_camera(keyframe, keyframe_index)

                @override_fov_degrees.on_update
                def _(_) -> None:
                    keyframe.override_fov_rad = override_fov_degrees.value / 180.0 * np.pi
                    self.add_camera(keyframe, keyframe_index)

                @delete_button.on_click
                def _(event: viser.GuiEvent) -> None:
                    assert event.client is not None
                    with event.client.gui.add_modal("Confirm") as modal:
                        event.client.gui.add_markdown("Delete keyframe?")
                        confirm_button = event.client.gui.add_button(
                            "Yes", color="red", icon=viser.Icon.TRASH
                        )
                        exit_button = event.client.gui.add_button("Cancel")

                        @confirm_button.on_click
                        def _(_) -> None:
                            assert camera_edit_panel is not None

                            keyframe_id = None
                            for i, keyframe_tuple in self._keyframes.items():
                                if keyframe_tuple[1] is frustum_handle:
                                    keyframe_id = i
                                    break
                            assert keyframe_id is not None

                            self._keyframes.pop(keyframe_id)
                            frustum_handle.remove()
                            camera_edit_panel.remove()
                            self._camera_edit_panel = None
                            modal.close()
                            self.update_spline()

                        @exit_button.on_click
                        def _(_) -> None:
                            modal.close()

                @go_to_button.on_click
                def _(event: viser.GuiEvent) -> None:
                    assert event.client is not None
                    client = event.client
                    T_world_current = vt.SE3.from_rotation_and_translation(
                        vt.SO3(client.camera.wxyz), client.camera.position
|
| 199 |
+
)
|
| 200 |
+
T_world_target = vt.SE3.from_rotation_and_translation(
|
| 201 |
+
vt.SO3(keyframe.wxyz), keyframe.position
|
| 202 |
+
) @ vt.SE3.from_translation(np.array([0.0, 0.0, -0.5]))
|
| 203 |
+
|
| 204 |
+
T_current_target = T_world_current.inverse() @ T_world_target
|
| 205 |
+
|
| 206 |
+
for j in range(10):
|
| 207 |
+
T_world_set = T_world_current @ vt.SE3.exp(
|
| 208 |
+
T_current_target.log() * j / 9.0
|
| 209 |
+
)
|
| 210 |
+
|
| 211 |
+
# Important bit: we atomically set both the orientation and
|
| 212 |
+
# the position of the camera.
|
| 213 |
+
with client.atomic():
|
| 214 |
+
client.camera.wxyz = T_world_set.rotation().wxyz
|
| 215 |
+
client.camera.position = T_world_set.translation()
|
| 216 |
+
time.sleep(1.0 / 30.0)
|
| 217 |
+
|
| 218 |
+
@close_button.on_click
|
| 219 |
+
def _(_) -> None:
|
| 220 |
+
assert camera_edit_panel is not None
|
| 221 |
+
camera_edit_panel.remove()
|
| 222 |
+
self._camera_edit_panel = None
|
| 223 |
+
|
| 224 |
+
self._keyframes[keyframe_index] = (keyframe, frustum_handle)
|
| 225 |
+
|
| 226 |
+
def update_aspect(self, aspect: float) -> None:
|
| 227 |
+
for keyframe_index, frame in self._keyframes.items():
|
| 228 |
+
frame = dataclasses.replace(frame[0], aspect=aspect)
|
| 229 |
+
self.add_camera(frame, keyframe_index=keyframe_index)
|
| 230 |
+
|
| 231 |
+
def get_aspect(self) -> float:
|
| 232 |
+
"""Get W/H aspect ratio, which is shared across all keyframes."""
|
| 233 |
+
assert len(self._keyframes) > 0
|
| 234 |
+
return next(iter(self._keyframes.values()))[0].aspect
|
| 235 |
+
|
| 236 |
+
def reset(self) -> None:
|
| 237 |
+
for frame in self._keyframes.values():
|
| 238 |
+
print(f"removing {frame[1]}")
|
| 239 |
+
frame[1].remove()
|
| 240 |
+
self._keyframes.clear()
|
| 241 |
+
self.update_spline()
|
| 242 |
+
print("camera traj reset")
|
| 243 |
+
|
| 244 |
+
def spline_t_from_t_sec(self, time: np.ndarray) -> np.ndarray:
|
| 245 |
+
"""From a time value in seconds, compute a t value for our geometric
|
| 246 |
+
spline interpolation. An increment of 1 for the latter will move the
|
| 247 |
+
camera forward by one keyframe.
|
| 248 |
+
|
| 249 |
+
We use a PCHIP spline here to guarantee monotonicity.
|
| 250 |
+
"""
|
| 251 |
+
transition_times_cumsum = self.compute_transition_times_cumsum()
|
| 252 |
+
spline_indices = np.arange(transition_times_cumsum.shape[0])
|
| 253 |
+
|
| 254 |
+
if self.loop:
|
| 255 |
+
# In the case of a loop, we pad the spline to match the start/end
|
| 256 |
+
# slopes.
|
| 257 |
+
interpolator = scipy.interpolate.PchipInterpolator(
|
| 258 |
+
x=np.concatenate(
|
| 259 |
+
[
|
| 260 |
+
[-(transition_times_cumsum[-1] - transition_times_cumsum[-2])],
|
| 261 |
+
transition_times_cumsum,
|
| 262 |
+
transition_times_cumsum[-1:] + transition_times_cumsum[1:2],
|
| 263 |
+
],
|
| 264 |
+
axis=0,
|
| 265 |
+
),
|
| 266 |
+
y=np.concatenate(
|
| 267 |
+
[[-1], spline_indices, [spline_indices[-1] + 1]], # type: ignore
|
| 268 |
+
axis=0,
|
| 269 |
+
),
|
| 270 |
+
)
|
| 271 |
+
else:
|
| 272 |
+
interpolator = scipy.interpolate.PchipInterpolator(
|
| 273 |
+
x=transition_times_cumsum, y=spline_indices
|
| 274 |
+
)
|
| 275 |
+
|
| 276 |
+
# Clip to account for floating point error.
|
| 277 |
+
return np.clip(interpolator(time), 0, spline_indices[-1])
|
| 278 |
+
|
| 279 |
+
def interpolate_pose_and_fov_rad(
|
| 280 |
+
self, normalized_t: float
|
| 281 |
+
) -> tuple[vt.SE3, float] | None:
|
| 282 |
+
if len(self._keyframes) < 2:
|
| 283 |
+
return None
|
| 284 |
+
|
| 285 |
+
self._fov_spline = splines.KochanekBartels(
|
| 286 |
+
[
|
| 287 |
+
(
|
| 288 |
+
keyframe[0].override_fov_rad
|
| 289 |
+
if keyframe[0].override_fov_enabled
|
| 290 |
+
else self.default_fov
|
| 291 |
+
)
|
| 292 |
+
for keyframe in self._keyframes.values()
|
| 293 |
+
],
|
| 294 |
+
tcb=(self.tension, 0.0, 0.0),
|
| 295 |
+
endconditions="closed" if self.loop else "natural",
|
| 296 |
+
)
|
| 297 |
+
|
| 298 |
+
assert self._orientation_spline is not None
|
| 299 |
+
assert self._position_spline is not None
|
| 300 |
+
assert self._fov_spline is not None
|
| 301 |
+
|
| 302 |
+
max_t = self.compute_duration()
|
| 303 |
+
t = max_t * normalized_t
|
| 304 |
+
spline_t = float(self.spline_t_from_t_sec(np.array(t)))
|
| 305 |
+
|
| 306 |
+
quat = self._orientation_spline.evaluate(spline_t)
|
| 307 |
+
assert isinstance(quat, splines.quaternion.UnitQuaternion)
|
| 308 |
+
return (
|
| 309 |
+
vt.SE3.from_rotation_and_translation(
|
| 310 |
+
vt.SO3(np.array([quat.scalar, *quat.vector])),
|
| 311 |
+
self._position_spline.evaluate(spline_t),
|
| 312 |
+
),
|
| 313 |
+
float(self._fov_spline.evaluate(spline_t)),
|
| 314 |
+
)
|
| 315 |
+
|
| 316 |
+
def update_spline(self) -> None:
|
| 317 |
+
num_frames = int(self.compute_duration() * self.framerate)
|
| 318 |
+
keyframes = list(self._keyframes.values())
|
| 319 |
+
|
| 320 |
+
if num_frames <= 0 or not self.show_spline or len(keyframes) < 2:
|
| 321 |
+
for node in self._spline_nodes:
|
| 322 |
+
node.remove()
|
| 323 |
+
self._spline_nodes.clear()
|
| 324 |
+
return
|
| 325 |
+
|
| 326 |
+
transition_times_cumsum = self.compute_transition_times_cumsum()
|
| 327 |
+
|
| 328 |
+
self._orientation_spline = splines.quaternion.KochanekBartels(
|
| 329 |
+
[
|
| 330 |
+
splines.quaternion.UnitQuaternion.from_unit_xyzw(
|
| 331 |
+
np.roll(keyframe[0].wxyz, shift=-1)
|
| 332 |
+
)
|
| 333 |
+
for keyframe in keyframes
|
| 334 |
+
],
|
| 335 |
+
tcb=(self.tension, 0.0, 0.0),
|
| 336 |
+
endconditions="closed" if self.loop else "natural",
|
| 337 |
+
)
|
| 338 |
+
self._position_spline = splines.KochanekBartels(
|
| 339 |
+
[keyframe[0].position for keyframe in keyframes],
|
| 340 |
+
tcb=(self.tension, 0.0, 0.0),
|
| 341 |
+
endconditions="closed" if self.loop else "natural",
|
| 342 |
+
)
|
| 343 |
+
|
| 344 |
+
# Update visualized spline.
|
| 345 |
+
points_array = self._position_spline.evaluate(
|
| 346 |
+
self.spline_t_from_t_sec(
|
| 347 |
+
np.linspace(0, transition_times_cumsum[-1], num_frames)
|
| 348 |
+
)
|
| 349 |
+
)
|
| 350 |
+
colors_array = np.array(
|
| 351 |
+
[
|
| 352 |
+
colorsys.hls_to_rgb(h, 0.5, 1.0)
|
| 353 |
+
for h in np.linspace(0.0, 1.0, len(points_array))
|
| 354 |
+
]
|
| 355 |
+
)
|
| 356 |
+
|
| 357 |
+
# Clear prior spline nodes.
|
| 358 |
+
for node in self._spline_nodes:
|
| 359 |
+
node.remove()
|
| 360 |
+
self._spline_nodes.clear()
|
| 361 |
+
|
| 362 |
+
self._spline_nodes.append(
|
| 363 |
+
self._server.scene.add_spline_catmull_rom(
|
| 364 |
+
str(Path(self._scene_node_prefix) / "camera_spline"),
|
| 365 |
+
positions=points_array,
|
| 366 |
+
color=(220, 220, 220),
|
| 367 |
+
closed=self.loop,
|
| 368 |
+
line_width=1.0,
|
| 369 |
+
segments=points_array.shape[0] + 1,
|
| 370 |
+
)
|
| 371 |
+
)
|
| 372 |
+
self._spline_nodes.append(
|
| 373 |
+
self._server.scene.add_point_cloud(
|
| 374 |
+
str(Path(self._scene_node_prefix) / "camera_spline/points"),
|
| 375 |
+
points=points_array,
|
| 376 |
+
colors=colors_array,
|
| 377 |
+
point_size=0.04,
|
| 378 |
+
)
|
| 379 |
+
)
|
| 380 |
+
|
| 381 |
+
def make_transition_handle(i: int) -> None:
|
| 382 |
+
assert self._position_spline is not None
|
| 383 |
+
transition_pos = self._position_spline.evaluate(
|
| 384 |
+
float(
|
| 385 |
+
self.spline_t_from_t_sec(
|
| 386 |
+
(transition_times_cumsum[i] + transition_times_cumsum[i + 1])
|
| 387 |
+
/ 2.0,
|
| 388 |
+
)
|
| 389 |
+
)
|
| 390 |
+
)
|
| 391 |
+
transition_sphere = self._server.scene.add_icosphere(
|
| 392 |
+
str(Path(self._scene_node_prefix) / f"camera_spline/transition_{i}"),
|
| 393 |
+
radius=0.04,
|
| 394 |
+
color=(255, 0, 0),
|
| 395 |
+
position=transition_pos,
|
| 396 |
+
)
|
| 397 |
+
self._spline_nodes.append(transition_sphere)
|
| 398 |
+
|
| 399 |
+
@transition_sphere.on_click
|
| 400 |
+
def _(_) -> None:
|
| 401 |
+
server = self._server
|
| 402 |
+
|
| 403 |
+
if self._camera_edit_panel is not None:
|
| 404 |
+
self._camera_edit_panel.remove()
|
| 405 |
+
self._camera_edit_panel = None
|
| 406 |
+
|
| 407 |
+
keyframe_index = (i + 1) % len(self._keyframes)
|
| 408 |
+
keyframe = keyframes[keyframe_index][0]
|
| 409 |
+
|
| 410 |
+
with server.scene.add_3d_gui_container(
|
| 411 |
+
"/camera_edit_panel",
|
| 412 |
+
position=transition_pos,
|
| 413 |
+
) as camera_edit_panel:
|
| 414 |
+
self._camera_edit_panel = camera_edit_panel
|
| 415 |
+
override_transition_enabled = server.gui.add_checkbox(
|
| 416 |
+
"Override transition",
|
| 417 |
+
initial_value=keyframe.override_transition_enabled,
|
| 418 |
+
)
|
| 419 |
+
override_transition_sec = server.gui.add_number(
|
| 420 |
+
"Override transition (sec)",
|
| 421 |
+
initial_value=(
|
| 422 |
+
keyframe.override_transition_sec
|
| 423 |
+
if keyframe.override_transition_sec is not None
|
| 424 |
+
else self.default_transition_sec
|
| 425 |
+
),
|
| 426 |
+
min=0.001,
|
| 427 |
+
max=30.0,
|
| 428 |
+
step=0.001,
|
| 429 |
+
disabled=not override_transition_enabled.value,
|
| 430 |
+
)
|
| 431 |
+
close_button = server.gui.add_button("Close")
|
| 432 |
+
|
| 433 |
+
@override_transition_enabled.on_update
|
| 434 |
+
def _(_) -> None:
|
| 435 |
+
keyframe.override_transition_enabled = (
|
| 436 |
+
override_transition_enabled.value
|
| 437 |
+
)
|
| 438 |
+
override_transition_sec.disabled = (
|
| 439 |
+
not override_transition_enabled.value
|
| 440 |
+
)
|
| 441 |
+
self._duration_element.value = self.compute_duration()
|
| 442 |
+
|
| 443 |
+
@override_transition_sec.on_update
|
| 444 |
+
def _(_) -> None:
|
| 445 |
+
keyframe.override_transition_sec = override_transition_sec.value
|
| 446 |
+
self._duration_element.value = self.compute_duration()
|
| 447 |
+
|
| 448 |
+
@close_button.on_click
|
| 449 |
+
def _(_) -> None:
|
| 450 |
+
assert camera_edit_panel is not None
|
| 451 |
+
camera_edit_panel.remove()
|
| 452 |
+
self._camera_edit_panel = None
|
| 453 |
+
|
| 454 |
+
(num_transitions_plus_1,) = transition_times_cumsum.shape
|
| 455 |
+
for i in range(num_transitions_plus_1 - 1):
|
| 456 |
+
make_transition_handle(i)
|
| 457 |
+
|
| 458 |
+
def compute_duration(self) -> float:
|
| 459 |
+
"""Compute the total duration of the trajectory."""
|
| 460 |
+
total = 0.0
|
| 461 |
+
for i, (keyframe, frustum) in enumerate(self._keyframes.values()):
|
| 462 |
+
if i == 0 and not self.loop:
|
| 463 |
+
continue
|
| 464 |
+
del frustum
|
| 465 |
+
total += (
|
| 466 |
+
keyframe.override_transition_sec
|
| 467 |
+
if keyframe.override_transition_enabled
|
| 468 |
+
and keyframe.override_transition_sec is not None
|
| 469 |
+
else self.default_transition_sec
|
| 470 |
+
)
|
| 471 |
+
return total
|
| 472 |
+
|
| 473 |
+
def compute_transition_times_cumsum(self) -> np.ndarray:
|
| 474 |
+
"""Compute the total duration of the trajectory."""
|
| 475 |
+
total = 0.0
|
| 476 |
+
out = [0.0]
|
| 477 |
+
for i, (keyframe, frustum) in enumerate(self._keyframes.values()):
|
| 478 |
+
if i == 0:
|
| 479 |
+
continue
|
| 480 |
+
del frustum
|
| 481 |
+
total += (
|
| 482 |
+
keyframe.override_transition_sec
|
| 483 |
+
if keyframe.override_transition_enabled
|
| 484 |
+
and keyframe.override_transition_sec is not None
|
| 485 |
+
else self.default_transition_sec
|
| 486 |
+
)
|
| 487 |
+
out.append(total)
|
| 488 |
+
|
| 489 |
+
if self.loop:
|
| 490 |
+
keyframe = next(iter(self._keyframes.values()))[0]
|
| 491 |
+
total += (
|
| 492 |
+
keyframe.override_transition_sec
|
| 493 |
+
if keyframe.override_transition_enabled
|
| 494 |
+
and keyframe.override_transition_sec is not None
|
| 495 |
+
else self.default_transition_sec
|
| 496 |
+
)
|
| 497 |
+
out.append(total)
|
| 498 |
+
|
| 499 |
+
return np.array(out)
|
| 500 |
+
|
| 501 |
+
|
| 502 |
+
@dataclasses.dataclass
|
| 503 |
+
class GuiState:
|
| 504 |
+
preview_render: bool
|
| 505 |
+
preview_fov: float
|
| 506 |
+
preview_aspect: float
|
| 507 |
+
camera_traj_list: list | None
|
| 508 |
+
active_input_index: int
|
| 509 |
+
|
| 510 |
+
|
| 511 |
+
def define_gui(
|
| 512 |
+
server: viser.ViserServer,
|
| 513 |
+
init_fov: float = 75.0,
|
| 514 |
+
img_wh: tuple[int, int] = (576, 576),
|
| 515 |
+
**kwargs,
|
| 516 |
+
) -> GuiState:
|
| 517 |
+
gui_state = GuiState(
|
| 518 |
+
preview_render=False,
|
| 519 |
+
preview_fov=0.0,
|
| 520 |
+
preview_aspect=1.0,
|
| 521 |
+
camera_traj_list=None,
|
| 522 |
+
active_input_index=0,
|
| 523 |
+
)
|
| 524 |
+
|
| 525 |
+
with server.gui.add_folder(
|
| 526 |
+
"Preset camera trajectories", order=99, expand_by_default=False
|
| 527 |
+
):
|
| 528 |
+
preset_traj_dropdown = server.gui.add_dropdown(
|
| 529 |
+
"Options",
|
| 530 |
+
[
|
| 531 |
+
"orbit",
|
| 532 |
+
"spiral",
|
| 533 |
+
"lemniscate",
|
| 534 |
+
"zoom-out",
|
| 535 |
+
"dolly zoom-out",
|
| 536 |
+
],
|
| 537 |
+
initial_value="orbit",
|
| 538 |
+
hint="Select a preset camera trajectory.",
|
| 539 |
+
)
|
| 540 |
+
preset_duration_num = server.gui.add_number(
|
| 541 |
+
"Duration (sec)",
|
| 542 |
+
min=1.0,
|
| 543 |
+
max=60.0,
|
| 544 |
+
step=0.5,
|
| 545 |
+
initial_value=2.0,
|
| 546 |
+
)
|
| 547 |
+
preset_submit_button = server.gui.add_button(
|
| 548 |
+
"Submit",
|
| 549 |
+
icon=viser.Icon.PICK,
|
| 550 |
+
hint="Add a new keyframe at the current pose.",
|
| 551 |
+
)
|
| 552 |
+
|
| 553 |
+
@preset_submit_button.on_click
|
| 554 |
+
def _(event: viser.GuiEvent) -> None:
|
| 555 |
+
camera_traj.reset()
|
| 556 |
+
gui_state.camera_traj_list = None
|
| 557 |
+
|
| 558 |
+
duration = preset_duration_num.value
|
| 559 |
+
fps = framerate_number.value
|
| 560 |
+
num_frames = int(duration * fps)
|
| 561 |
+
transition_sec = duration / num_frames
|
| 562 |
+
transition_sec_number.value = transition_sec
|
| 563 |
+
assert event.client_id is not None
|
| 564 |
+
transition_sec_number.disabled = True
|
| 565 |
+
loop_checkbox.disabled = True
|
| 566 |
+
add_keyframe_button.disabled = True
|
| 567 |
+
|
| 568 |
+
camera = server.get_clients()[event.client_id].camera
|
| 569 |
+
start_w2c = torch.linalg.inv(
|
| 570 |
+
torch.as_tensor(
|
| 571 |
+
vt.SE3.from_rotation_and_translation(
|
| 572 |
+
vt.SO3(camera.wxyz), camera.position
|
| 573 |
+
).as_matrix(),
|
| 574 |
+
dtype=torch.float32,
|
| 575 |
+
)
|
| 576 |
+
)
|
| 577 |
+
look_at = torch.as_tensor(camera.look_at, dtype=torch.float32)
|
| 578 |
+
up_direction = torch.as_tensor(camera.up_direction, dtype=torch.float32)
|
| 579 |
+
poses, fovs = get_preset_pose_fov(
|
| 580 |
+
option=preset_traj_dropdown.value, # type: ignore
|
| 581 |
+
num_frames=num_frames,
|
| 582 |
+
start_w2c=start_w2c,
|
| 583 |
+
look_at=look_at,
|
| 584 |
+
up_direction=up_direction,
|
| 585 |
+
fov=camera.fov,
|
| 586 |
+
)
|
| 587 |
+
assert poses is not None and fovs is not None
|
| 588 |
+
for pose, fov in zip(poses, fovs):
|
| 589 |
+
camera_traj.add_camera(
|
| 590 |
+
Keyframe.from_se3(
|
| 591 |
+
vt.SE3.from_matrix(pose),
|
| 592 |
+
fov=fov,
|
| 593 |
+
aspect=img_wh[0] / img_wh[1],
|
| 594 |
+
)
|
| 595 |
+
)
|
| 596 |
+
|
| 597 |
+
duration_number.value = camera_traj.compute_duration()
|
| 598 |
+
camera_traj.update_spline()
|
| 599 |
+
|
| 600 |
+
with server.gui.add_folder("Advanced", expand_by_default=False, order=100):
|
| 601 |
+
transition_sec_number = server.gui.add_number(
|
| 602 |
+
"Transition (sec)",
|
| 603 |
+
min=0.001,
|
| 604 |
+
max=30.0,
|
| 605 |
+
step=0.001,
|
| 606 |
+
initial_value=1.5,
|
| 607 |
+
hint="Time in seconds between each keyframe, which can also be overridden on a per-transition basis.",
|
| 608 |
+
)
|
| 609 |
+
framerate_number = server.gui.add_number(
|
| 610 |
+
"FPS", min=0.1, max=240.0, step=1e-2, initial_value=30.0
|
| 611 |
+
)
|
| 612 |
+
framerate_buttons = server.gui.add_button_group("", ("24", "30", "60"))
|
| 613 |
+
duration_number = server.gui.add_number(
|
| 614 |
+
"Duration (sec)",
|
| 615 |
+
min=0.0,
|
| 616 |
+
max=1e8,
|
| 617 |
+
step=0.001,
|
| 618 |
+
initial_value=0.0,
|
| 619 |
+
disabled=True,
|
| 620 |
+
)
|
| 621 |
+
|
| 622 |
+
@framerate_buttons.on_click
|
| 623 |
+
def _(_) -> None:
|
| 624 |
+
framerate_number.value = float(framerate_buttons.value)
|
| 625 |
+
|
| 626 |
+
fov_degree_slider = server.gui.add_slider(
|
| 627 |
+
"FOV",
|
| 628 |
+
initial_value=init_fov,
|
| 629 |
+
min=0.1,
|
| 630 |
+
max=175.0,
|
| 631 |
+
step=0.01,
|
| 632 |
+
hint="Field-of-view for rendering, which can also be overridden on a per-keyframe basis.",
|
| 633 |
+
)
|
| 634 |
+
|
| 635 |
+
@fov_degree_slider.on_update
|
| 636 |
+
def _(_) -> None:
|
| 637 |
+
fov_radians = fov_degree_slider.value / 180.0 * np.pi
|
| 638 |
+
for client in server.get_clients().values():
|
| 639 |
+
client.camera.fov = fov_radians
|
| 640 |
+
camera_traj.default_fov = fov_radians
|
| 641 |
+
|
| 642 |
+
# Updating the aspect ratio will also re-render the camera frustums.
|
| 643 |
+
# Could rethink this.
|
| 644 |
+
camera_traj.update_aspect(img_wh[0] / img_wh[1])
|
| 645 |
+
compute_and_update_preview_camera_state()
|
| 646 |
+
|
| 647 |
+
scene_node_prefix = "/render_assets"
|
| 648 |
+
base_scene_node = server.scene.add_frame(scene_node_prefix, show_axes=False)
|
| 649 |
+
add_keyframe_button = server.gui.add_button(
|
| 650 |
+
"Add keyframe",
|
| 651 |
+
icon=viser.Icon.PLUS,
|
| 652 |
+
hint="Add a new keyframe at the current pose.",
|
| 653 |
+
)
|
| 654 |
+
|
| 655 |
+
@add_keyframe_button.on_click
|
| 656 |
+
def _(event: viser.GuiEvent) -> None:
|
| 657 |
+
assert event.client_id is not None
|
| 658 |
+
camera = server.get_clients()[event.client_id].camera
|
| 659 |
+
pose = vt.SE3.from_rotation_and_translation(
|
| 660 |
+
vt.SO3(camera.wxyz), camera.position
|
| 661 |
+
)
|
| 662 |
+
print(f"client {event.client_id} at {camera.position} {camera.wxyz}")
|
| 663 |
+
print(f"camera pose {pose.as_matrix()}")
|
| 664 |
+
|
| 665 |
+
# Add this camera to the trajectory.
|
| 666 |
+
camera_traj.add_camera(
|
| 667 |
+
Keyframe.from_camera(
|
| 668 |
+
camera,
|
| 669 |
+
aspect=img_wh[0] / img_wh[1],
|
| 670 |
+
),
|
| 671 |
+
)
|
| 672 |
+
duration_number.value = camera_traj.compute_duration()
|
| 673 |
+
camera_traj.update_spline()
|
| 674 |
+
|
| 675 |
+
clear_keyframes_button = server.gui.add_button(
|
| 676 |
+
"Clear keyframes",
|
| 677 |
+
icon=viser.Icon.TRASH,
|
| 678 |
+
hint="Remove all keyframes from the render trajectory.",
|
| 679 |
+
)
|
| 680 |
+
|
| 681 |
+
@clear_keyframes_button.on_click
|
| 682 |
+
def _(event: viser.GuiEvent) -> None:
|
| 683 |
+
assert event.client_id is not None
|
| 684 |
+
client = server.get_clients()[event.client_id]
|
| 685 |
+
with client.atomic(), client.gui.add_modal("Confirm") as modal:
|
| 686 |
+
client.gui.add_markdown("Clear all keyframes?")
|
| 687 |
+
confirm_button = client.gui.add_button(
|
| 688 |
+
"Yes", color="red", icon=viser.Icon.TRASH
|
| 689 |
+
)
|
| 690 |
+
exit_button = client.gui.add_button("Cancel")
|
| 691 |
+
|
| 692 |
+
@confirm_button.on_click
|
| 693 |
+
def _(_) -> None:
|
| 694 |
+
camera_traj.reset()
|
| 695 |
+
modal.close()
|
| 696 |
+
|
| 697 |
+
duration_number.value = camera_traj.compute_duration()
|
| 698 |
+
add_keyframe_button.disabled = False
|
| 699 |
+
transition_sec_number.disabled = False
|
| 700 |
+
transition_sec_number.value = 1.5
|
| 701 |
+
loop_checkbox.disabled = False
|
| 702 |
+
|
| 703 |
+
nonlocal gui_state
|
| 704 |
+
gui_state.camera_traj_list = None
|
| 705 |
+
|
| 706 |
+
@exit_button.on_click
|
| 707 |
+
def _(_) -> None:
|
| 708 |
+
modal.close()
|
| 709 |
+
|
| 710 |
+
play_button = server.gui.add_button("Play", icon=viser.Icon.PLAYER_PLAY)
|
| 711 |
+
pause_button = server.gui.add_button(
|
| 712 |
+
"Pause", icon=viser.Icon.PLAYER_PAUSE, visible=False
|
| 713 |
+
)
|
| 714 |
+
|
| 715 |
+
# Poll the play button to see if we should be playing endlessly.
|
| 716 |
+
def play() -> None:
|
| 717 |
+
while True:
|
| 718 |
+
while not play_button.visible:
|
| 719 |
+
max_frame = int(framerate_number.value * duration_number.value)
|
| 720 |
+
if max_frame > 0:
|
| 721 |
+
assert preview_frame_slider is not None
|
| 722 |
+
preview_frame_slider.value = (
|
| 723 |
+
preview_frame_slider.value + 1
|
| 724 |
+
) % max_frame
|
| 725 |
+
time.sleep(1.0 / framerate_number.value)
|
| 726 |
+
time.sleep(0.1)
|
| 727 |
+
|
| 728 |
+
threading.Thread(target=play).start()
|
| 729 |
+
|
| 730 |
+
# Play the camera trajectory when the play button is pressed.
|
| 731 |
+
@play_button.on_click
|
| 732 |
+
def _(_) -> None:
|
| 733 |
+
play_button.visible = False
|
| 734 |
+
pause_button.visible = True
|
| 735 |
+
|
| 736 |
+
# Play the camera trajectory when the play button is pressed.
|
| 737 |
+
@pause_button.on_click
|
| 738 |
+
def _(_) -> None:
|
| 739 |
+
play_button.visible = True
|
| 740 |
+
pause_button.visible = False
|
| 741 |
+
|
| 742 |
+
preview_render_button = server.gui.add_button(
|
| 743 |
+
"Preview render",
|
| 744 |
+
hint="Show a preview of the render in the viewport.",
|
| 745 |
+
icon=viser.Icon.CAMERA_CHECK,
|
| 746 |
+
)
|
| 747 |
+
preview_render_stop_button = server.gui.add_button(
|
| 748 |
+
"Exit render preview",
|
| 749 |
+
color="red",
|
| 750 |
+
icon=viser.Icon.CAMERA_CANCEL,
|
| 751 |
+
visible=False,
|
| 752 |
+
)
|
| 753 |
+
|
| 754 |
+
@preview_render_button.on_click
|
| 755 |
+
def _(_) -> None:
|
| 756 |
+
gui_state.preview_render = True
|
| 757 |
+
preview_render_button.visible = False
|
| 758 |
+
preview_render_stop_button.visible = True
|
| 759 |
+
play_button.visible = False
|
| 760 |
+
pause_button.visible = True
|
| 761 |
+
preset_submit_button.disabled = True
|
| 762 |
+
|
| 763 |
+
maybe_pose_and_fov_rad = compute_and_update_preview_camera_state()
|
| 764 |
+
if maybe_pose_and_fov_rad is None:
|
| 765 |
+
remove_preview_camera()
|
| 766 |
+
return
|
| 767 |
+
pose, fov = maybe_pose_and_fov_rad
|
| 768 |
+
del fov
|
| 769 |
+
|
| 770 |
+
# Hide all render assets when we're previewing the render.
|
| 771 |
+
nonlocal base_scene_node
|
| 772 |
+
base_scene_node.visible = False
|
| 773 |
+
|
| 774 |
+
# Back up and then set camera poses.
|
| 775 |
+
for client in server.get_clients().values():
|
| 776 |
+
camera_pose_backup_from_id[client.client_id] = (
|
| 777 |
+
client.camera.position,
|
| 778 |
+
client.camera.look_at,
|
| 779 |
+
client.camera.up_direction,
|
| 780 |
+
)
|
| 781 |
+
with client.atomic():
|
| 782 |
+
client.camera.wxyz = pose.rotation().wxyz
|
| 783 |
+
client.camera.position = pose.translation()
|
| 784 |
+
|
| 785 |
+
def stop_preview_render() -> None:
|
| 786 |
+
gui_state.preview_render = False
|
| 787 |
+
preview_render_button.visible = True
|
| 788 |
+
preview_render_stop_button.visible = False
|
| 789 |
+
play_button.visible = True
|
| 790 |
+
pause_button.visible = False
|
| 791 |
+
preset_submit_button.disabled = False
|
| 792 |
+
|
| 793 |
+
# Revert camera poses.
|
| 794 |
+
for client in server.get_clients().values():
|
| 795 |
+
if client.client_id not in camera_pose_backup_from_id:
|
| 796 |
+
continue
|
| 797 |
+
cam_position, cam_look_at, cam_up = camera_pose_backup_from_id.pop(
|
| 798 |
+
client.client_id
|
| 799 |
+
)
|
| 800 |
+
with client.atomic():
|
| 801 |
+
client.camera.position = cam_position
|
| 802 |
+
client.camera.look_at = cam_look_at
|
| 803 |
+
client.camera.up_direction = cam_up
|
| 804 |
+
client.flush()
|
| 805 |
+
|
| 806 |
+
# Un-hide render assets.
|
| 807 |
+
nonlocal base_scene_node
|
| 808 |
+
base_scene_node.visible = True
|
| 809 |
+
remove_preview_camera()
|
| 810 |
+
|
| 811 |
+
@preview_render_stop_button.on_click
|
| 812 |
+
def _(_) -> None:
|
| 813 |
+
stop_preview_render()
|
| 814 |
+
|
| 815 |
+
def get_max_frame_index() -> int:
|
| 816 |
+
return max(1, int(framerate_number.value * duration_number.value) - 1)
|
| 817 |
+
|
| 818 |
+
def add_preview_frame_slider() -> viser.GuiInputHandle[int] | None:
|
| 819 |
+
"""Helper for creating the current frame # slider. This is removed and
|
| 820 |
+
re-added anytime the `max` value changes."""
|
| 821 |
+
|
| 822 |
+
preview_frame_slider = server.gui.add_slider(
|
| 823 |
+
"Preview frame",
|
| 824 |
+
min=0,
|
| 825 |
+
max=get_max_frame_index(),
|
| 826 |
+
step=1,
|
| 827 |
+
initial_value=0,
|
| 828 |
+
order=set_traj_button.order + 0.01,
|
| 829 |
+
disabled=get_max_frame_index() == 1,
|
| 830 |
+
)
|
| 831 |
+
play_button.disabled = preview_frame_slider.disabled
|
| 832 |
+
preview_render_button.disabled = preview_frame_slider.disabled
|
| 833 |
+
set_traj_button.disabled = preview_frame_slider.disabled
|
| 834 |
+
|
| 835 |
+
@preview_frame_slider.on_update
|
| 836 |
+
def _(_) -> None:
|
| 837 |
+
nonlocal preview_camera_handle
|
| 838 |
+
maybe_pose_and_fov_rad = compute_and_update_preview_camera_state()
|
| 839 |
+
if maybe_pose_and_fov_rad is None:
|
| 840 |
+
return
|
| 841 |
+
pose, fov_rad = maybe_pose_and_fov_rad
|
| 842 |
+
|
| 843 |
+
preview_camera_handle = server.scene.add_camera_frustum(
|
| 844 |
+
str(Path(scene_node_prefix) / "preview_camera"),
|
| 845 |
+
fov=fov_rad,
|
| 846 |
+
aspect=img_wh[0] / img_wh[1],
|
| 847 |
+
scale=0.35,
|
| 848 |
+
wxyz=pose.rotation().wxyz,
|
| 849 |
+
position=pose.translation(),
|
| 850 |
+
color=(10, 200, 30),
|
| 851 |
+
)
|
| 852 |
+
if gui_state.preview_render:
|
| 853 |
+
for client in server.get_clients().values():
|
| 854 |
+
with client.atomic():
|
| 855 |
+
client.camera.wxyz = pose.rotation().wxyz
|
| 856 |
+
client.camera.position = pose.translation()
|
| 857 |
+
|
| 858 |
+
return preview_frame_slider
|
| 859 |
+
|
| 860 |
+
set_traj_button = server.gui.add_button(
|
| 861 |
+
"Set camera trajectory",
|
| 862 |
+
color="green",
|
| 863 |
+
icon=viser.Icon.CHECK,
|
| 864 |
+
hint="Save the camera trajectory for rendering.",
|
| 865 |
+
)
|
| 866 |
+
|
| 867 |
+
@set_traj_button.on_click
|
| 868 |
+
def _(event: viser.GuiEvent) -> None:
|
| 869 |
+
assert event.client is not None
|
| 870 |
+
num_frames = int(framerate_number.value * duration_number.value)
|
| 871 |
+
|
| 872 |
+
def get_intrinsics(W, H, fov_rad):
|
| 873 |
+
focal = 0.5 * H / np.tan(0.5 * fov_rad)
|
| 874 |
+
return np.array(
|
| 875 |
+
[[focal, 0.0, 0.5 * W], [0.0, focal, 0.5 * H], [0.0, 0.0, 1.0]]
|
| 876 |
+
)
|
| 877 |
+
|
| 878 |
+
camera_traj_list = []
|
| 879 |
+
for i in range(num_frames):
|
| 880 |
+
maybe_pose_and_fov_rad = camera_traj.interpolate_pose_and_fov_rad(
|
| 881 |
+
i / num_frames
|
| 882 |
+
)
|
| 883 |
+
if maybe_pose_and_fov_rad is None:
|
| 884 |
+
return
|
| 885 |
+
pose, fov_rad = maybe_pose_and_fov_rad
|
| 886 |
+
H = img_wh[1]
|
| 887 |
+
W = img_wh[0]
|
| 888 |
+
K = get_intrinsics(W, H, fov_rad)
|
| 889 |
+
w2c = pose.inverse().as_matrix()
|
| 890 |
+
camera_traj_list.append(
|
| 891 |
+
{
|
| 892 |
+
"w2c": w2c.flatten().tolist(),
|
| 893 |
+
"K": K.flatten().tolist(),
|
| 894 |
+
"img_wh": (W, H),
|
| 895 |
+
}
|
| 896 |
+
)
|
| 897 |
+
nonlocal gui_state
|
| 898 |
+
gui_state.camera_traj_list = camera_traj_list
|
| 899 |
+
print(f"Get camera_traj_list: {gui_state.camera_traj_list}")
|
| 900 |
+
|
| 901 |
+
stop_preview_render()
|
| 902 |
+
|
| 903 |
+
preview_frame_slider = add_preview_frame_slider()
|
| 904 |
+
|
| 905 |
+
loop_checkbox = server.gui.add_checkbox(
|
| 906 |
+
"Loop", False, hint="Add a segment between the first and last keyframes."
|
| 907 |
+
)
|
| 908 |
+
|
| 909 |
+
@loop_checkbox.on_update
|
| 910 |
+
def _(_) -> None:
|
| 911 |
+
camera_traj.loop = loop_checkbox.value
|
| 912 |
+
duration_number.value = camera_traj.compute_duration()
|
| 913 |
+
|
| 914 |
+
@transition_sec_number.on_update
|
| 915 |
+
def _(_) -> None:
|
| 916 |
+
camera_traj.default_transition_sec = transition_sec_number.value
|
| 917 |
+
duration_number.value = camera_traj.compute_duration()
|
| 918 |
+
|
| 919 |
+
preview_camera_handle: viser.SceneNodeHandle | None = None
|
| 920 |
+
|
| 921 |
+
def remove_preview_camera() -> None:
|
| 922 |
+
nonlocal preview_camera_handle
|
| 923 |
+
if preview_camera_handle is not None:
|
| 924 |
+
preview_camera_handle.remove()
|
| 925 |
+
preview_camera_handle = None
|
| 926 |
+
|
| 927 |
+
def compute_and_update_preview_camera_state() -> tuple[vt.SE3, float] | None:
|
| 928 |
+
"""Update the render tab state with the current preview camera pose.
|
| 929 |
+
Returns current camera pose + FOV if available."""
|
| 930 |
+
|
| 931 |
+
if preview_frame_slider is None:
|
| 932 |
+
return None
|
| 933 |
+
maybe_pose_and_fov_rad = camera_traj.interpolate_pose_and_fov_rad(
|
| 934 |
+
preview_frame_slider.value / get_max_frame_index()
|
| 935 |
+
)
|
| 936 |
+
if maybe_pose_and_fov_rad is None:
|
| 937 |
+
remove_preview_camera()
|
| 938 |
+
return None
|
| 939 |
+
pose, fov_rad = maybe_pose_and_fov_rad
|
| 940 |
+
gui_state.preview_fov = fov_rad
|
| 941 |
+
gui_state.preview_aspect = camera_traj.get_aspect()
|
| 942 |
+
return pose, fov_rad
|
| 943 |
+
|
| 944 |
+
# We back up the camera poses before and after we start previewing renders.
|
| 945 |
+
camera_pose_backup_from_id: dict[int, tuple] = {}
|
| 946 |
+
|
| 947 |
+
# Update the # of frames.
|
| 948 |
+
@duration_number.on_update
|
| 949 |
+
@framerate_number.on_update
|
| 950 |
+
def _(_) -> None:
|
| 951 |
+
remove_preview_camera() # Will be re-added when slider is updated.
|
| 952 |
+
|
| 953 |
+
nonlocal preview_frame_slider
|
| 954 |
+
old = preview_frame_slider
|
| 955 |
+
assert old is not None
|
| 956 |
+
|
| 957 |
+
preview_frame_slider = add_preview_frame_slider()
|
| 958 |
+
if preview_frame_slider is not None:
|
| 959 |
+
old.remove()
|
| 960 |
+
else:
|
| 961 |
+
preview_frame_slider = old
|
| 962 |
+
|
| 963 |
+
camera_traj.framerate = framerate_number.value
|
| 964 |
+
camera_traj.update_spline()
|
| 965 |
+
|
| 966 |
+
camera_traj = CameraTrajectory(
|
| 967 |
+
server,
|
| 968 |
+
duration_number,
|
| 969 |
+
scene_node_prefix=scene_node_prefix,
|
| 970 |
+
**kwargs,
|
| 971 |
+
)
|
| 972 |
+
camera_traj.default_fov = fov_degree_slider.value / 180.0 * np.pi
|
| 973 |
+
camera_traj.default_transition_sec = transition_sec_number.value
|
| 974 |
+
|
| 975 |
+
return gui_state
|
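Note (illustrative, not part of the commit): define_gui is meant to be driven by a host script that owns the viser server. A minimal sketch, assuming only the symbols added above plus viser, with an arbitrary scene_scale of 1.0 forwarded through **kwargs to CameraTrajectory:

import time

import viser

server = viser.ViserServer(port=8080)  # port chosen arbitrarily for this sketch
gui_state = define_gui(server, init_fov=60.0, img_wh=(576, 576), scene_scale=1.0)

# Block until the user clicks "Set camera trajectory" in the browser; the
# per-frame dicts (flattened w2c, K, img_wh) are then ready for a renderer.
while gui_state.camera_traj_list is None:
    time.sleep(0.1)
print(f"Received {len(gui_state.camera_traj_list)} camera frames.")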
seva/model.py
ADDED
@@ -0,0 +1,234 @@
from dataclasses import dataclass, field

import torch
import torch.nn as nn

from seva.modules.layers import (
    Downsample,
    GroupNorm32,
    ResBlock,
    TimestepEmbedSequential,
    Upsample,
    timestep_embedding,
)
from seva.modules.transformer import MultiviewTransformer


@dataclass
class SevaParams(object):
    in_channels: int = 11
    model_channels: int = 320
    out_channels: int = 4
    num_frames: int = 21
    num_res_blocks: int = 2
    attention_resolutions: list[int] = field(default_factory=lambda: [4, 2, 1])
    channel_mult: list[int] = field(default_factory=lambda: [1, 2, 4, 4])
    num_head_channels: int = 64
    transformer_depth: list[int] = field(default_factory=lambda: [1, 1, 1, 1])
    context_dim: int = 1024
    dense_in_channels: int = 6
    dropout: float = 0.0
    unflatten_names: list[str] = field(
        default_factory=lambda: ["middle_ds8", "output_ds4", "output_ds2"]
    )

    def __post_init__(self):
        assert len(self.channel_mult) == len(self.transformer_depth)


class Seva(nn.Module):
    def __init__(self, params: SevaParams) -> None:
        super().__init__()
        self.params = params
        self.model_channels = params.model_channels
        self.out_channels = params.out_channels
        self.num_head_channels = params.num_head_channels

        time_embed_dim = params.model_channels * 4
        self.time_embed = nn.Sequential(
            nn.Linear(params.model_channels, time_embed_dim),
            nn.SiLU(),
            nn.Linear(time_embed_dim, time_embed_dim),
        )

        self.input_blocks = nn.ModuleList(
            [
                TimestepEmbedSequential(
                    nn.Conv2d(params.in_channels, params.model_channels, 3, padding=1)
                )
            ]
        )
        self._feature_size = params.model_channels
        input_block_chans = [params.model_channels]
        ch = params.model_channels
        ds = 1
        for level, mult in enumerate(params.channel_mult):
            for _ in range(params.num_res_blocks):
                input_layers: list[ResBlock | MultiviewTransformer | Downsample] = [
                    ResBlock(
                        channels=ch,
                        emb_channels=time_embed_dim,
                        out_channels=mult * params.model_channels,
                        dense_in_channels=params.dense_in_channels,
                        dropout=params.dropout,
                    )
                ]
                ch = mult * params.model_channels
                if ds in params.attention_resolutions:
                    num_heads = ch // params.num_head_channels
                    dim_head = params.num_head_channels
                    input_layers.append(
                        MultiviewTransformer(
                            ch,
                            num_heads,
                            dim_head,
                            name=f"input_ds{ds}",
                            depth=params.transformer_depth[level],
                            context_dim=params.context_dim,
                            unflatten_names=params.unflatten_names,
                        )
                    )
                self.input_blocks.append(TimestepEmbedSequential(*input_layers))
                self._feature_size += ch
                input_block_chans.append(ch)
            if level != len(params.channel_mult) - 1:
                ds *= 2
                out_ch = ch
                self.input_blocks.append(
                    TimestepEmbedSequential(Downsample(ch, out_channels=out_ch))
                )
                ch = out_ch
                input_block_chans.append(ch)
                self._feature_size += ch

        num_heads = ch // params.num_head_channels
        dim_head = params.num_head_channels

        self.middle_block = TimestepEmbedSequential(
            ResBlock(
                channels=ch,
                emb_channels=time_embed_dim,
                out_channels=None,
                dense_in_channels=params.dense_in_channels,
                dropout=params.dropout,
            ),
            MultiviewTransformer(
                ch,
                num_heads,
                dim_head,
                name=f"middle_ds{ds}",
                depth=params.transformer_depth[-1],
                context_dim=params.context_dim,
                unflatten_names=params.unflatten_names,
            ),
            ResBlock(
                channels=ch,
                emb_channels=time_embed_dim,
                out_channels=None,
                dense_in_channels=params.dense_in_channels,
                dropout=params.dropout,
            ),
        )
        self._feature_size += ch

        self.output_blocks = nn.ModuleList([])
        for level, mult in list(enumerate(params.channel_mult))[::-1]:
            for i in range(params.num_res_blocks + 1):
                ich = input_block_chans.pop()
                output_layers: list[ResBlock | MultiviewTransformer | Upsample] = [
                    ResBlock(
                        channels=ch + ich,
                        emb_channels=time_embed_dim,
                        out_channels=params.model_channels * mult,
                        dense_in_channels=params.dense_in_channels,
                        dropout=params.dropout,
                    )
                ]
                ch = params.model_channels * mult
                if ds in params.attention_resolutions:
                    num_heads = ch // params.num_head_channels
                    dim_head = params.num_head_channels

                    output_layers.append(
                        MultiviewTransformer(
                            ch,
                            num_heads,
                            dim_head,
                            name=f"output_ds{ds}",
                            depth=params.transformer_depth[level],
                            context_dim=params.context_dim,
                            unflatten_names=params.unflatten_names,
                        )
                    )
                if level and i == params.num_res_blocks:
                    out_ch = ch
                    ds //= 2
                    output_layers.append(Upsample(ch, out_ch))
                self.output_blocks.append(TimestepEmbedSequential(*output_layers))
                self._feature_size += ch

        self.out = nn.Sequential(
            GroupNorm32(32, ch),
            nn.SiLU(),
            nn.Conv2d(self.model_channels, params.out_channels, 3, padding=1),
        )

    def forward(
        self,
        x: torch.Tensor,
        t: torch.Tensor,
        y: torch.Tensor,
        dense_y: torch.Tensor,
        num_frames: int | None = None,
    ) -> torch.Tensor:
        num_frames = num_frames or self.params.num_frames
        t_emb = timestep_embedding(t, self.model_channels)
        t_emb = self.time_embed(t_emb)

        hs = []
        h = x
        for module in self.input_blocks:
            h = module(
                h,
                emb=t_emb,
                context=y,
                dense_emb=dense_y,
                num_frames=num_frames,
            )
            hs.append(h)
        h = self.middle_block(
            h,
            emb=t_emb,
            context=y,
            dense_emb=dense_y,
            num_frames=num_frames,
        )
        for module in self.output_blocks:
            h = torch.cat([h, hs.pop()], dim=1)
            h = module(
                h,
                emb=t_emb,
                context=y,
                dense_emb=dense_y,
                num_frames=num_frames,
            )
        h = h.type(x.dtype)
        return self.out(h)


class SGMWrapper(nn.Module):
    def __init__(self, module: Seva):
        super().__init__()
        self.module = module

    def forward(
        self, x: torch.Tensor, t: torch.Tensor, c: dict, **kwargs
    ) -> torch.Tensor:
        x = torch.cat((x, c.get("concat", torch.Tensor([]).type_as(x))), dim=1)
        return self.module(
            x,
            t=t,
            y=c["crossattn"],
            dense_y=c["dense_vector"],
            **kwargs,
        )
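Note (illustrative, not part of the commit): with the default SevaParams, the denoiser can be built and inspected as below; a minimal sketch assuming the seva.modules package from this same commit is importable.

params = SevaParams()                    # 11 input channels, 21-frame multiview attention
model = SGMWrapper(Seva(params)).eval()  # wrapper concatenates c["concat"] onto the latent
num_params = sum(p.numel() for p in model.parameters())
print(f"U-Net parameters: {num_params / 1e6:.1f}M")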
seva/modules/__init__.py
ADDED
File without changes
seva/modules/autoencoder.py
ADDED
@@ -0,0 +1,51 @@
import torch
from diffusers.models import AutoencoderKL  # type: ignore
from torch import nn


class AutoEncoder(nn.Module):
    scale_factor: float = 0.18215
    downsample: int = 8

    def __init__(self, chunk_size: int | None = None):
        super().__init__()
        self.module = AutoencoderKL.from_pretrained(
            "stabilityai/stable-diffusion-2-1-base",
            subfolder="vae",
            force_download=False,
            low_cpu_mem_usage=False,
        )
        self.module.eval().requires_grad_(False)  # type: ignore
        self.chunk_size = chunk_size

    def _encode(self, x: torch.Tensor) -> torch.Tensor:
        return (
            self.module.encode(x).latent_dist.mean  # type: ignore
            * self.scale_factor
        )

    def encode(self, x: torch.Tensor, chunk_size: int | None = None) -> torch.Tensor:
        chunk_size = chunk_size or self.chunk_size
        if chunk_size is not None:
            return torch.cat(
                [self._encode(x_chunk) for x_chunk in x.split(chunk_size)],
                dim=0,
            )
        else:
            return self._encode(x)

    def _decode(self, z: torch.Tensor) -> torch.Tensor:
        return self.module.decode(z / self.scale_factor).sample  # type: ignore

    def decode(self, z: torch.Tensor, chunk_size: int | None = None) -> torch.Tensor:
        chunk_size = chunk_size or self.chunk_size
        if chunk_size is not None:
            return torch.cat(
                [self._decode(z_chunk) for z_chunk in z.split(chunk_size)],
                dim=0,
            )
        else:
            return self._decode(z)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.decode(self.encode(x))
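Note (illustrative, not part of the commit): a round-trip sketch of the chunked VAE, assuming a CUDA device and inputs scaled to [-1, 1]; the usual SD 8x spatial downsample applies.

vae = AutoEncoder(chunk_size=2).cuda()
images = torch.randn(4, 3, 576, 576, device="cuda")  # stand-in batch in [-1, 1]
with torch.no_grad():
    latents = vae.encode(images)  # (4, 4, 72, 72), encoded two images at a time
    recon = vae.decode(latents)   # (4, 3, 576, 576)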
seva/modules/conditioner.py
ADDED
@@ -0,0 +1,39 @@
import kornia
import open_clip
import torch
from torch import nn


class CLIPConditioner(nn.Module):
    mean: torch.Tensor
    std: torch.Tensor

    def __init__(self):
        super().__init__()
        self.module = open_clip.create_model_and_transforms(
            "ViT-H-14", pretrained="laion2b_s32b_b79k"
        )[0]
        self.module.eval().requires_grad_(False)  # type: ignore
        self.register_buffer(
            "mean", torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False
        )
        self.register_buffer(
            "std", torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False
        )

    def preprocess(self, x: torch.Tensor) -> torch.Tensor:
        x = kornia.geometry.resize(
            x,
            (224, 224),
            interpolation="bicubic",
            align_corners=True,
            antialias=True,
        )
        x = (x + 1.0) / 2.0
        x = kornia.enhance.normalize(x, self.mean, self.std)
        return x

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.preprocess(x)
        x = self.module.encode_image(x)
        return x
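Note (illustrative, not part of the commit): the conditioner expects inputs in [-1, 1] (note the (x + 1.0) / 2.0 in preprocess) and returns ViT-H-14 image embeddings; a small sketch assuming a CUDA device.

clip = CLIPConditioner().cuda().eval()
frames = torch.rand(2, 3, 576, 576, device="cuda") * 2.0 - 1.0
with torch.no_grad():
    embeddings = clip(frames)  # (2, 1024) image embeddings used as cross-attention context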
seva/modules/layers.py
ADDED
@@ -0,0 +1,140 @@
import math

import torch
import torch.nn.functional as F
from einops import repeat
from torch import nn

from .transformer import MultiviewTransformer


def timestep_embedding(
    timesteps: torch.Tensor,
    dim: int,
    max_period: int = 10000,
    repeat_only: bool = False,
) -> torch.Tensor:
    if not repeat_only:
        half = dim // 2
        freqs = torch.exp(
            -math.log(max_period)
            * torch.arange(start=0, end=half, dtype=torch.float32)
            / half
        ).to(device=timesteps.device)
        args = timesteps[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
            embedding = torch.cat(
                [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
            )
    else:
        embedding = repeat(timesteps, "b -> b d", d=dim)
    return embedding


class Upsample(nn.Module):
    def __init__(self, channels: int, out_channels: int | None = None):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.conv = nn.Conv2d(self.channels, self.out_channels, 3, 1, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        assert x.shape[1] == self.channels
        x = F.interpolate(x, scale_factor=2, mode="nearest")
        x = self.conv(x)
        return x


class Downsample(nn.Module):
    def __init__(self, channels: int, out_channels: int | None = None):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.op = nn.Conv2d(self.channels, self.out_channels, 3, 2, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        assert x.shape[1] == self.channels
        return self.op(x)


class GroupNorm32(nn.GroupNorm):
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return super().forward(input.float()).type(input.dtype)


class TimestepEmbedSequential(nn.Sequential):
    def forward(  # type: ignore[override]
        self,
        x: torch.Tensor,
        emb: torch.Tensor,
        context: torch.Tensor,
        dense_emb: torch.Tensor,
        num_frames: int,
    ) -> torch.Tensor:
        for layer in self:
            if isinstance(layer, MultiviewTransformer):
                assert num_frames is not None
                x = layer(x, context, num_frames)
            elif isinstance(layer, ResBlock):
                x = layer(x, emb, dense_emb)
            else:
                x = layer(x)
        return x


class ResBlock(nn.Module):
    def __init__(
        self,
        channels: int,
        emb_channels: int,
        out_channels: int | None,
        dense_in_channels: int,
        dropout: float,
    ):
        super().__init__()
        out_channels = out_channels or channels

        self.in_layers = nn.Sequential(
            GroupNorm32(32, channels),
            nn.SiLU(),
            nn.Conv2d(channels, out_channels, 3, 1, 1),
        )
        self.emb_layers = nn.Sequential(
            nn.SiLU(), nn.Linear(emb_channels, out_channels)
        )
        self.dense_emb_layers = nn.Sequential(
            nn.Conv2d(dense_in_channels, 2 * channels, 1, 1, 0)
        )
        self.out_layers = nn.Sequential(
            GroupNorm32(32, out_channels),
            nn.SiLU(),
            nn.Dropout(dropout),
            nn.Conv2d(out_channels, out_channels, 3, 1, 1),
        )
        if out_channels == channels:
            self.skip_connection = nn.Identity()
        else:
            self.skip_connection = nn.Conv2d(channels, out_channels, 1, 1, 0)

    def forward(
        self, x: torch.Tensor, emb: torch.Tensor, dense_emb: torch.Tensor
    ) -> torch.Tensor:
        in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
        h = in_rest(x)
        dense = self.dense_emb_layers(
            F.interpolate(
                dense_emb, size=h.shape[2:], mode="bilinear", align_corners=True
            )
        ).type(h.dtype)
        dense_scale, dense_shift = torch.chunk(dense, 2, dim=1)
        h = h * (1 + dense_scale) + dense_shift
        h = in_conv(h)
        emb_out = self.emb_layers(emb).type(h.dtype)
        # TODO(hangg): Optimize this?
        while len(emb_out.shape) < len(h.shape):
            emb_out = emb_out[..., None]
        h = h + emb_out
        h = self.out_layers(h)
        h = self.skip_connection(x) + h
        return h
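Note (illustrative, not part of the commit): a quick shape check of the sinusoidal timestep_embedding that Seva.forward feeds through time_embed.

t = torch.tensor([0.25, 500.0])
emb = timestep_embedding(t, dim=320)  # (2, 320): cosine half followed by sine half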
seva/modules/preprocessor.py
ADDED
@@ -0,0 +1,116 @@
import contextlib
import os
import os.path as osp
import sys
from typing import cast

import imageio.v3 as iio
import numpy as np
import torch


class Dust3rPipeline(object):
    def __init__(self, device: str | torch.device = "cuda"):
        submodule_path = osp.realpath(
            osp.join(osp.dirname(__file__), "../../third_party/dust3r/")
        )
        if submodule_path not in sys.path:
            sys.path.insert(0, submodule_path)
        try:
            with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
                from dust3r.cloud_opt import (  # type: ignore[import]
                    GlobalAlignerMode,
                    global_aligner,
                )
                from dust3r.image_pairs import make_pairs  # type: ignore[import]
                from dust3r.inference import inference  # type: ignore[import]
                from dust3r.model import AsymmetricCroCo3DStereo  # type: ignore[import]
                from dust3r.utils.image import load_images  # type: ignore[import]
        except ImportError:
            raise ImportError(
                "Missing required submodule: 'dust3r'. Please ensure that all submodules are properly set up.\n\n"
                "To initialize them, run the following command in the project root:\n"
                "  git submodule update --init --recursive"
            )

        self.device = torch.device(device)
        self.model = AsymmetricCroCo3DStereo.from_pretrained(
            "naver/DUSt3R_ViTLarge_BaseDecoder_512_dpt"
        ).to(self.device)

        self._GlobalAlignerMode = GlobalAlignerMode
        self._global_aligner = global_aligner
        self._make_pairs = make_pairs
        self._inference = inference
        self._load_images = load_images

    def infer_cameras_and_points(
        self,
        img_paths: list[str],
        Ks: list[list] | None = None,
        c2ws: list[list] | None = None,
        batch_size: int = 16,
        schedule: str = "cosine",
        lr: float = 0.01,
        niter: int = 500,
        min_conf_thr: int = 3,
    ) -> tuple[
        list[np.ndarray], np.ndarray, np.ndarray, list[np.ndarray], list[np.ndarray]
    ]:
        num_img = len(img_paths)
        if num_img == 1:
            print("Only one image found, duplicating it to create a stereo pair.")
            img_paths = img_paths * 2

        images = self._load_images(img_paths, size=512)
        pairs = self._make_pairs(
            images,
            scene_graph="complete",
            prefilter=None,
            symmetrize=True,
        )
        output = self._inference(pairs, self.model, self.device, batch_size=batch_size)

        ori_imgs = [iio.imread(p) for p in img_paths]
        ori_img_whs = np.array([img.shape[1::-1] for img in ori_imgs])
        img_whs = np.concatenate([image["true_shape"][:, ::-1] for image in images], 0)

        scene = self._global_aligner(
            output,
            device=self.device,
            mode=self._GlobalAlignerMode.PointCloudOptimizer,
            same_focals=True,
            optimize_pp=False,  # True,
            min_conf_thr=min_conf_thr,
        )

        # if Ks is not None:
        #     scene.preset_focal(
        #         torch.tensor([[K[0, 0], K[1, 1]] for K in Ks])
        #     )

        if c2ws is not None:
            scene.preset_pose(c2ws)

        _ = scene.compute_global_alignment(
            init="msp", niter=niter, schedule=schedule, lr=lr
        )

        imgs = cast(list, scene.imgs)
        Ks = scene.get_intrinsics().detach().cpu().numpy().copy()
        c2ws = scene.get_im_poses().detach().cpu().numpy()  # type: ignore
        pts3d = [x.detach().cpu().numpy() for x in scene.get_pts3d()]  # type: ignore
        if num_img > 1:
            masks = [x.detach().cpu().numpy() for x in scene.get_masks()]
            points = [p[m] for p, m in zip(pts3d, masks)]
            point_colors = [img[m] for img, m in zip(imgs, masks)]
        else:
            points = [p.reshape(-1, 3) for p in pts3d]
            point_colors = [img.reshape(-1, 3) for img in imgs]

        # Convert back to the original image size.
        imgs = ori_imgs
        Ks[:, :2, -1] *= ori_img_whs / img_whs
        Ks[:, :2, :2] *= (ori_img_whs / img_whs).mean(axis=1, keepdims=True)[..., None]

        return imgs, Ks, c2ws, points, point_colors
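Usage note (not part of the commit): a minimal sketch of how this pipeline can be driven, assuming the dust3r submodule is initialized, a CUDA device is available, and the DUSt3R checkpoint can be downloaded. The image paths are example assets from this repo and are purely illustrative.

# Hedged sketch: illustrative only; inputs and device are assumptions.
import numpy as np

from seva.modules.preprocessor import Dust3rPipeline

pipeline = Dust3rPipeline(device="cuda")
img_paths = ["assets/basic/blue-car.jpg", "assets/basic/llff-room.jpg"]  # example inputs
imgs, Ks, c2ws, points, point_colors = pipeline.infer_cameras_and_points(img_paths)

print(Ks.shape, c2ws.shape)          # (N, 3, 3) intrinsics and (N, 4, 4) camera-to-world poses
print(np.concatenate(points).shape)  # merged confidence-filtered point cloud

The intrinsics returned here are rescaled to the original image resolution, so downstream code can work with the full-size frames rather than the 512-pixel DUSt3R inputs.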
seva/modules/transformer.py
ADDED
@@ -0,0 +1,247 @@
import torch
import torch.nn.functional as F
from einops import rearrange, repeat
from torch import nn
from torch.nn.attention import SDPBackend, sdpa_kernel


class GEGLU(nn.Module):
    def __init__(self, dim_in: int, dim_out: int):
        super().__init__()
        self.proj = nn.Linear(dim_in, dim_out * 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x, gate = self.proj(x).chunk(2, dim=-1)
        return x * F.gelu(gate)


class FeedForward(nn.Module):
    def __init__(
        self,
        dim: int,
        dim_out: int | None = None,
        mult: int = 4,
        dropout: float = 0.0,
    ):
        super().__init__()
        inner_dim = int(dim * mult)
        dim_out = dim_out or dim
        self.net = nn.Sequential(
            GEGLU(dim, inner_dim), nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)


class Attention(nn.Module):
    def __init__(
        self,
        query_dim: int,
        context_dim: int | None = None,
        heads: int = 8,
        dim_head: int = 64,
        dropout: float = 0.0,
    ):
        super().__init__()
        self.heads = heads
        self.dim_head = dim_head
        inner_dim = dim_head * heads
        context_dim = context_dim or query_dim

        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)
        )

    def forward(
        self, x: torch.Tensor, context: torch.Tensor | None = None
    ) -> torch.Tensor:
        q = self.to_q(x)
        context = context if context is not None else x
        k = self.to_k(context)
        v = self.to_v(context)
        q, k, v = map(
            lambda t: rearrange(t, "b l (h d) -> b h l d", h=self.heads),
            (q, k, v),
        )
        with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
            out = F.scaled_dot_product_attention(q, k, v)
        out = rearrange(out, "b h l d -> b l (h d)")
        out = self.to_out(out)
        return out


class TransformerBlock(nn.Module):
    def __init__(
        self,
        dim: int,
        n_heads: int,
        d_head: int,
        context_dim: int,
        dropout: float = 0.0,
    ):
        super().__init__()
        self.attn1 = Attention(
            query_dim=dim,
            context_dim=None,
            heads=n_heads,
            dim_head=d_head,
            dropout=dropout,
        )
        self.ff = FeedForward(dim, dropout=dropout)
        self.attn2 = Attention(
            query_dim=dim,
            context_dim=context_dim,
            heads=n_heads,
            dim_head=d_head,
            dropout=dropout,
        )
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.norm3 = nn.LayerNorm(dim)

    def forward(self, x: torch.Tensor, context: torch.Tensor) -> torch.Tensor:
        x = self.attn1(self.norm1(x)) + x
        x = self.attn2(self.norm2(x), context=context) + x
        x = self.ff(self.norm3(x)) + x
        return x


class TransformerBlockTimeMix(nn.Module):
    def __init__(
        self,
        dim: int,
        n_heads: int,
        d_head: int,
        context_dim: int,
        dropout: float = 0.0,
    ):
        super().__init__()
        inner_dim = n_heads * d_head
        self.norm_in = nn.LayerNorm(dim)
        self.ff_in = FeedForward(dim, dim_out=inner_dim, dropout=dropout)
        self.attn1 = Attention(
            query_dim=inner_dim,
            context_dim=None,
            heads=n_heads,
            dim_head=d_head,
            dropout=dropout,
        )
        self.ff = FeedForward(inner_dim, dim_out=dim, dropout=dropout)
        self.attn2 = Attention(
            query_dim=inner_dim,
            context_dim=context_dim,
            heads=n_heads,
            dim_head=d_head,
            dropout=dropout,
        )
        self.norm1 = nn.LayerNorm(inner_dim)
        self.norm2 = nn.LayerNorm(inner_dim)
        self.norm3 = nn.LayerNorm(inner_dim)

    def forward(
        self, x: torch.Tensor, context: torch.Tensor, num_frames: int
    ) -> torch.Tensor:
        _, s, _ = x.shape
        x = rearrange(x, "(b t) s c -> (b s) t c", t=num_frames)
        x = self.ff_in(self.norm_in(x)) + x
        x = self.attn1(self.norm1(x), context=None) + x
        x = self.attn2(self.norm2(x), context=context) + x
        x = self.ff(self.norm3(x))
        x = rearrange(x, "(b s) t c -> (b t) s c", s=s)
        return x


class SkipConnect(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(
        self, x_spatial: torch.Tensor, x_temporal: torch.Tensor
    ) -> torch.Tensor:
        return x_spatial + x_temporal


class MultiviewTransformer(nn.Module):
    def __init__(
        self,
        in_channels: int,
        n_heads: int,
        d_head: int,
        name: str,
        unflatten_names: list[str] = [],
        depth: int = 1,
        context_dim: int = 1024,
        dropout: float = 0.0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.name = name
        self.unflatten_names = unflatten_names

        inner_dim = n_heads * d_head
        self.norm = nn.GroupNorm(32, in_channels, eps=1e-6)
        self.proj_in = nn.Linear(in_channels, inner_dim)
        self.transformer_blocks = nn.ModuleList(
            [
                TransformerBlock(
                    inner_dim,
                    n_heads,
                    d_head,
                    context_dim=context_dim,
                    dropout=dropout,
                )
                for _ in range(depth)
            ]
        )
        self.proj_out = nn.Linear(inner_dim, in_channels)
        self.time_mixer = SkipConnect()
        self.time_mix_blocks = nn.ModuleList(
            [
                TransformerBlockTimeMix(
                    inner_dim,
                    n_heads,
                    d_head,
                    context_dim=context_dim,
                    dropout=dropout,
                )
                for _ in range(depth)
            ]
        )

    def forward(
        self, x: torch.Tensor, context: torch.Tensor, num_frames: int
    ) -> torch.Tensor:
        assert context.ndim == 3
        _, _, h, w = x.shape
        x_in = x

        time_context = context
        time_context_first_timestep = time_context[::num_frames]
        time_context = repeat(
            time_context_first_timestep, "b ... -> (b n) ...", n=h * w
        )

        if self.name in self.unflatten_names:
            context = context[::num_frames]

        x = self.norm(x)
        x = rearrange(x, "b c h w -> b (h w) c")
        x = self.proj_in(x)

        for block, mix_block in zip(self.transformer_blocks, self.time_mix_blocks):
            if self.name in self.unflatten_names:
                x = rearrange(x, "(b t) (h w) c -> b (t h w) c", t=num_frames, h=h, w=w)
            x = block(x, context=context)
            if self.name in self.unflatten_names:
                x = rearrange(x, "b (t h w) c -> (b t) (h w) c", t=num_frames, h=h, w=w)
            x_mix = mix_block(x, context=time_context, num_frames=num_frames)
            x = self.time_mixer(x_spatial=x, x_temporal=x_mix)

        x = self.proj_out(x)
        x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w)
        out = x + x_in
        return out
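Usage note (not part of the commit): a hedged sketch with toy sizes showing the tensor layout `MultiviewTransformer` expects, namely a batch of `num_frames` views flattened into the batch dimension plus a per-frame cross-attention context; the output keeps the input shape. The channel counts below are illustrative, not the production configuration, and the sketch assumes a CUDA device in half precision because the attention path pins the flash-attention SDP backend.

# Hedged sketch with toy sizes; the real model wires this module inside its UNet blocks.
import torch

from seva.modules.transformer import MultiviewTransformer

block = (
    MultiviewTransformer(
        in_channels=64, n_heads=4, d_head=16, name="toy", depth=1, context_dim=1024
    )
    .cuda()
    .half()
)
num_frames, h, w = 4, 8, 8
x = torch.randn(num_frames, 64, h, w, device="cuda", dtype=torch.float16)        # (b * t, c, h, w), b = 1
context = torch.randn(num_frames, 77, 1024, device="cuda", dtype=torch.float16)  # per-frame context tokens
out = block(x, context=context, num_frames=num_frames)
print(out.shape)  # torch.Size([4, 64, 8, 8])

Each depth level runs a spatial block over the (h w) tokens of every view and a time-mix block over the `num_frames` axis, then merges the two paths with the plain residual `SkipConnect`.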
seva/sampling.py
ADDED
@@ -0,0 +1,405 @@
import numpy as np
import torch
import torch.nn as nn
from einops import rearrange
from tqdm import tqdm

from seva.geometry import get_camera_dist


def append_dims(x: torch.Tensor, target_dims: int) -> torch.Tensor:
    """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
    dims_to_append = target_dims - x.ndim
    if dims_to_append < 0:
        raise ValueError(
            f"input has {x.ndim} dims but target_dims is {target_dims}, which is less"
        )
    return x[(...,) + (None,) * dims_to_append]


def append_zero(x: torch.Tensor) -> torch.Tensor:
    return torch.cat([x, x.new_zeros([1])])


def to_d(x: torch.Tensor, sigma: torch.Tensor, denoised: torch.Tensor) -> torch.Tensor:
    return (x - denoised) / append_dims(sigma, x.ndim)


def make_betas(
    num_timesteps: int, linear_start: float = 1e-4, linear_end: float = 2e-2
) -> np.ndarray:
    betas = (
        torch.linspace(
            linear_start**0.5, linear_end**0.5, num_timesteps, dtype=torch.float64
        )
        ** 2
    )
    return betas.numpy()


def generate_roughly_equally_spaced_steps(
    num_substeps: int, max_step: int
) -> np.ndarray:
    return np.linspace(max_step - 1, 0, num_substeps, endpoint=False).astype(int)[::-1]


class EpsScaling(object):
    def __call__(
        self, sigma: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        c_skip = torch.ones_like(sigma, device=sigma.device)
        c_out = -sigma
        c_in = 1 / (sigma**2 + 1.0) ** 0.5
        c_noise = sigma.clone()
        return c_skip, c_out, c_in, c_noise


class DDPMDiscretization(object):
    def __init__(
        self,
        linear_start: float = 5e-06,
        linear_end: float = 0.012,
        num_timesteps: int = 1000,
        log_snr_shift: float | None = 2.4,
    ):
        self.num_timesteps = num_timesteps

        betas = make_betas(
            num_timesteps,
            linear_start=linear_start,
            linear_end=linear_end,
        )
        self.log_snr_shift = log_snr_shift

        alphas = 1.0 - betas  # first alpha here is on data side
        self.alphas_cumprod = np.cumprod(alphas, axis=0)

    def get_sigmas(self, n: int, device: str | torch.device = "cpu") -> torch.Tensor:
        if n < self.num_timesteps:
            timesteps = generate_roughly_equally_spaced_steps(n, self.num_timesteps)
            alphas_cumprod = self.alphas_cumprod[timesteps]
        elif n == self.num_timesteps:
            alphas_cumprod = self.alphas_cumprod
        else:
            raise ValueError(f"Expected n <= {self.num_timesteps}, but got n = {n}.")

        sigmas = ((1 - alphas_cumprod) / alphas_cumprod) ** 0.5
        if self.log_snr_shift is not None:
            sigmas = sigmas * np.exp(self.log_snr_shift)
        return torch.flip(
            torch.tensor(sigmas, dtype=torch.float32, device=device), (0,)
        )

    def __call__(
        self,
        n: int,
        do_append_zero: bool = True,
        flip: bool = False,
        device: str | torch.device = "cpu",
    ) -> torch.Tensor:
        sigmas = self.get_sigmas(n, device=device)
        sigmas = append_zero(sigmas) if do_append_zero else sigmas
        return sigmas if not flip else torch.flip(sigmas, (0,))


class DiscreteDenoiser(object):
    sigmas: torch.Tensor

    def __init__(
        self,
        discretization: DDPMDiscretization,
        num_idx: int = 1000,
        device: str | torch.device = "cpu",
    ):
        self.scaling = EpsScaling()
        self.discretization = discretization
        self.num_idx = num_idx
        self.device = device

        self.register_sigmas()

    def register_sigmas(self):
        self.sigmas = self.discretization(
            self.num_idx, do_append_zero=False, flip=True, device=self.device
        )

    def sigma_to_idx(self, sigma: torch.Tensor) -> torch.Tensor:
        dists = sigma - self.sigmas[:, None]
        return dists.abs().argmin(dim=0).view(sigma.shape)

    def idx_to_sigma(self, idx: torch.Tensor | int) -> torch.Tensor:
        return self.sigmas[idx]

    def __call__(
        self,
        network: nn.Module,
        input: torch.Tensor,
        sigma: torch.Tensor,
        cond: dict,
        **additional_model_inputs,
    ) -> torch.Tensor:
        sigma = self.idx_to_sigma(self.sigma_to_idx(sigma))
        sigma_shape = sigma.shape
        sigma = append_dims(sigma, input.ndim)
        c_skip, c_out, c_in, c_noise = self.scaling(sigma)
        c_noise = self.sigma_to_idx(c_noise.reshape(sigma_shape))
        if "replace" in cond:
            x, mask = cond.pop("replace").split((input.shape[1], 1), dim=1)
            input = input * (1 - mask) + x * mask
        return (
            network(input * c_in, c_noise, cond, **additional_model_inputs) * c_out
            + input * c_skip
        )


class ConstantScaleRule(object):
    def __call__(self, scale: float | torch.Tensor) -> float | torch.Tensor:
        return scale


class MultiviewScaleRule(object):
    def __init__(self, min_scale: float = 1.0):
        self.min_scale = min_scale

    def __call__(
        self,
        scale: float | torch.Tensor,
        c2w: torch.Tensor,
        K: torch.Tensor,
        input_frame_mask: torch.Tensor,
    ) -> torch.Tensor:
        c2w_input = c2w[input_frame_mask]
        rotation_diff = get_camera_dist(c2w, c2w_input, mode="rotation").min(-1).values
        translation_diff = (
            get_camera_dist(c2w, c2w_input, mode="translation").min(-1).values
        )
        K_diff = (
            ((K[:, None] - K[input_frame_mask][None]).flatten(-2) == 0).all(-1).any(-1)
        )
        close_frame = (rotation_diff < 10.0) & (translation_diff < 1e-5) & K_diff
        if isinstance(scale, torch.Tensor):
            scale = scale.clone()
            scale[close_frame] = self.min_scale
        elif isinstance(scale, float):
            scale = torch.where(close_frame, self.min_scale, scale)
        else:
            raise ValueError(f"Invalid scale type {type(scale)}.")
        return scale


class ConstantScaleSchedule(object):
    def __call__(
        self, sigma: float | torch.Tensor, scale: float | torch.Tensor
    ) -> float | torch.Tensor:
        if isinstance(sigma, float):
            return scale
        elif isinstance(sigma, torch.Tensor):
            if len(sigma.shape) == 1 and isinstance(scale, torch.Tensor):
                sigma = append_dims(sigma, scale.ndim)
            return scale * torch.ones_like(sigma)
        else:
            raise ValueError(f"Invalid sigma type {type(sigma)}.")


class ConstantGuidance(object):
    def __call__(
        self,
        uncond: torch.Tensor,
        cond: torch.Tensor,
        scale: float | torch.Tensor,
    ) -> torch.Tensor:
        if isinstance(scale, torch.Tensor) and len(scale.shape) == 1:
            scale = append_dims(scale, cond.ndim)
        return uncond + scale * (cond - uncond)


class VanillaCFG(object):
    def __init__(self):
        self.scale_rule = ConstantScaleRule()
        self.scale_schedule = ConstantScaleSchedule()
        self.guidance = ConstantGuidance()

    def __call__(
        self, x: torch.Tensor, sigma: float | torch.Tensor, scale: float | torch.Tensor
    ) -> torch.Tensor:
        x_u, x_c = x.chunk(2)
        scale = self.scale_rule(scale)
        scale_value = self.scale_schedule(sigma, scale)
        x_pred = self.guidance(x_u, x_c, scale_value)
        return x_pred

    def prepare_inputs(
        self, x: torch.Tensor, s: torch.Tensor, c: dict, uc: dict
    ) -> tuple[torch.Tensor, torch.Tensor, dict]:
        c_out = dict()

        for k in c:
            if k in ["vector", "crossattn", "concat", "replace", "dense_vector"]:
                c_out[k] = torch.cat((uc[k], c[k]), 0)
            else:
                assert c[k] == uc[k]
                c_out[k] = c[k]
        return torch.cat([x] * 2), torch.cat([s] * 2), c_out


class MultiviewCFG(VanillaCFG):
    def __init__(self, cfg_min: float = 1.0):
        self.scale_min = cfg_min
        self.scale_rule = MultiviewScaleRule(min_scale=cfg_min)
        self.scale_schedule = ConstantScaleSchedule()
        self.guidance = ConstantGuidance()

    def __call__(  # type: ignore
        self,
        x: torch.Tensor,
        sigma: float | torch.Tensor,
        scale: float | torch.Tensor,
        c2w: torch.Tensor,
        K: torch.Tensor,
        input_frame_mask: torch.Tensor,
    ) -> torch.Tensor:
        x_u, x_c = x.chunk(2)
        scale = self.scale_rule(scale, c2w, K, input_frame_mask)
        scale_value = self.scale_schedule(sigma, scale)
        x_pred = self.guidance(x_u, x_c, scale_value)
        return x_pred


class MultiviewTemporalCFG(MultiviewCFG):
    def __init__(self, num_frames: int, cfg_min: float = 1.0):
        super().__init__(cfg_min=cfg_min)

        self.num_frames = num_frames
        distance_matrix = (
            torch.arange(num_frames)[None] - torch.arange(num_frames)[:, None]
        ).abs()
        self.distance_matrix = distance_matrix

    def __call__(
        self,
        x: torch.Tensor,
        sigma: float | torch.Tensor,
        scale: float | torch.Tensor,
        c2w: torch.Tensor,
        K: torch.Tensor,
        input_frame_mask: torch.Tensor,
    ) -> torch.Tensor:
        input_frame_mask = rearrange(
            input_frame_mask, "(b t) ... -> b t ...", t=self.num_frames
        )
        min_distance = (
            self.distance_matrix[None].to(x.device)
            + (~input_frame_mask[:, None]) * self.num_frames
        ).min(-1)[0]
        min_distance = min_distance / min_distance.max(-1, keepdim=True)[0].clamp(min=1)
        scale = min_distance * (scale - self.scale_min) + self.scale_min
        scale = rearrange(scale, "b t ... -> (b t) ...")
        scale = append_dims(scale, x.ndim)
        return super().__call__(x, sigma, scale, c2w, K, input_frame_mask.flatten(0, 1))


class EulerEDMSampler(object):
    def __init__(
        self,
        discretization: DDPMDiscretization,
        guider: VanillaCFG | MultiviewCFG | MultiviewTemporalCFG,
        num_steps: int | None = None,
        verbose: bool = False,
        device: str | torch.device = "cuda",
        s_churn: float = 0.0,
        s_tmin: float = 0.0,
        s_tmax: float = float("inf"),
        s_noise: float = 1.0,
    ):
        self.num_steps = num_steps
        self.discretization = discretization
        self.guider = guider
        self.verbose = verbose
        self.device = device

        self.s_churn = s_churn
        self.s_tmin = s_tmin
        self.s_tmax = s_tmax
        self.s_noise = s_noise

    def prepare_sampling_loop(
        self, x: torch.Tensor, cond: dict, uc: dict, num_steps: int | None = None
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, dict, dict]:
        num_steps = num_steps or self.num_steps
        assert num_steps is not None, "num_steps must be specified"
        sigmas = self.discretization(num_steps, device=self.device)
        x *= torch.sqrt(1.0 + sigmas[0] ** 2.0)
        num_sigmas = len(sigmas)
        s_in = x.new_ones([x.shape[0]])
        return x, s_in, sigmas, num_sigmas, cond, uc

    def get_sigma_gen(self, num_sigmas: int, verbose: bool = True) -> range | tqdm:
        sigma_generator = range(num_sigmas - 1)
        if self.verbose and verbose:
            sigma_generator = tqdm(
                sigma_generator,
                total=num_sigmas - 1,
                desc="Sampling",
                leave=False,
            )
        return sigma_generator

    def sampler_step(
        self,
        sigma: torch.Tensor,
        next_sigma: torch.Tensor,
        denoiser,
        x: torch.Tensor,
        scale: float | torch.Tensor,
        cond: dict,
        uc: dict,
        gamma: float = 0.0,
        **guider_kwargs,
    ) -> torch.Tensor:
        sigma_hat = sigma * (gamma + 1.0) + 1e-6

        eps = torch.randn_like(x) * self.s_noise
        x = x + eps * append_dims(sigma_hat**2 - sigma**2, x.ndim) ** 0.5

        denoised = denoiser(*self.guider.prepare_inputs(x, sigma_hat, cond, uc))
        denoised = self.guider(denoised, sigma_hat, scale, **guider_kwargs)
        d = to_d(x, sigma_hat, denoised)
        dt = append_dims(next_sigma - sigma_hat, x.ndim)
        return x + dt * d

    def __call__(
        self,
        denoiser,
        x: torch.Tensor,
        scale: float | torch.Tensor,
        cond: dict,
        uc: dict | None = None,
        num_steps: int | None = None,
        verbose: bool = True,
        **guider_kwargs,
    ) -> torch.Tensor:
        uc = cond if uc is None else uc
        x, s_in, sigmas, num_sigmas, cond, uc = self.prepare_sampling_loop(
            x,
            cond,
            uc,
            num_steps,
        )
        for i in self.get_sigma_gen(num_sigmas, verbose=verbose):
            gamma = (
                min(self.s_churn / (num_sigmas - 1), 2**0.5 - 1)
                if self.s_tmin <= sigmas[i] <= self.s_tmax
                else 0.0
            )
            x = self.sampler_step(
                s_in * sigmas[i],
                s_in * sigmas[i + 1],
                denoiser,
                x,
                scale,
                cond,
                uc,
                gamma,
                **guider_kwargs,
            )
        return x
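Usage note (not part of the commit): the pieces above compose into a full sampler, and the hedged sketch below shows the intended wiring with a stand-in denoiser. `toy_denoiser` is a placeholder, not the real network call; in the repo the denoiser argument is presumably a closure around `DiscreteDenoiser` and the Seva network. The DDPM discretization supplies the sigma schedule, the CFG guider doubles the batch and merges the conditional and unconditional branches, and the Euler-EDM loop integrates from high to low noise.

# Hedged sketch: `toy_denoiser` and all sizes are illustrative assumptions.
import torch

from seva.sampling import DDPMDiscretization, EulerEDMSampler, VanillaCFG

discretization = DDPMDiscretization()
sampler = EulerEDMSampler(
    discretization=discretization,
    guider=VanillaCFG(),
    num_steps=10,
    verbose=True,
    device="cpu",
)

def toy_denoiser(x, sigma, cond):
    # Pretend the network predicts an all-zero clean sample.
    return torch.zeros_like(x)

x = torch.randn(2, 4, 8, 8)                        # noised latents
cond = {"crossattn": torch.randn(2, 77, 1024)}     # doubled inside prepare_inputs
samples = sampler(toy_denoiser, x, scale=2.0, cond=cond)
print(samples.shape)  # torch.Size([2, 4, 8, 8])

Swapping `VanillaCFG` for `MultiviewCFG` or `MultiviewTemporalCFG` additionally requires passing `c2w`, `K`, and `input_frame_mask` through `guider_kwargs`, so the guidance scale can be lowered on frames that coincide with the input views.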
seva/utils.py
ADDED
@@ -0,0 +1,56 @@
import os

import safetensors.torch
import torch
from huggingface_hub import hf_hub_download

from seva.model import Seva, SevaParams


def seed_everything(seed: int = 0):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def print_load_warning(missing: list[str], unexpected: list[str]) -> None:
    if len(missing) > 0 and len(unexpected) > 0:
        print(f"Got {len(missing)} missing keys:\n\t" + "\n\t".join(missing))
        print("\n" + "-" * 79 + "\n")
        print(f"Got {len(unexpected)} unexpected keys:\n\t" + "\n\t".join(unexpected))
    elif len(missing) > 0:
        print(f"Got {len(missing)} missing keys:\n\t" + "\n\t".join(missing))
    elif len(unexpected) > 0:
        print(f"Got {len(unexpected)} unexpected keys:\n\t" + "\n\t".join(unexpected))


def load_model(
    pretrained_model_name_or_path: str = "stabilityai/stable-virtual-camera",
    weight_name: str = "model.safetensors",
    device: str | torch.device = "cuda",
    verbose: bool = False,
) -> Seva:
    if os.path.isdir(pretrained_model_name_or_path):
        weight_path = os.path.join(pretrained_model_name_or_path, weight_name)
    else:
        weight_path = hf_hub_download(
            repo_id=pretrained_model_name_or_path, filename=weight_name
        )
        _ = hf_hub_download(
            repo_id=pretrained_model_name_or_path, filename="config.yaml"
        )

    state_dict = safetensors.torch.load_file(
        weight_path,
        device=str(device),
    )

    with torch.device("meta"):
        model = Seva(SevaParams()).to(torch.bfloat16)

    missing, unexpected = model.load_state_dict(state_dict, strict=False, assign=True)
    if verbose:
        print_load_warning(missing, unexpected)
    return model
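Usage note (not part of the commit): a hedged example of how these helpers are typically combined, assuming you have access to the checkpoint repo on the Hugging Face Hub (the first call downloads the weights) and a CUDA device. The seed value and the parameter count printout are illustrative.

# Hedged sketch: seed, load the released checkpoint, and inspect the model.
from seva.utils import load_model, seed_everything

seed_everything(23)
model = load_model(device="cuda", verbose=True).eval()
n_params = sum(p.numel() for p in model.parameters())
print(f"Loaded Seva with ~{n_params / 1e9:.2f}B parameters in bfloat16.")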
third_party/dust3r
ADDED
@@ -0,0 +1 @@
Subproject commit 44b87f5a466ec32435036e40125d0b87a5746c20