Commit · 90e5afe
Parent(s): c1a4c61
Delete scripts
- scripts/.DS_Store +0 -0
- scripts/__init__.py +0 -0
- scripts/demo/__init__.py +0 -0
- scripts/demo/detect.py +0 -156
- scripts/demo/discretization.py +0 -59
- scripts/demo/sampling.py +0 -364
- scripts/demo/streamlit_helpers.py +0 -928
- scripts/demo/video_sampling.py +0 -200
- scripts/sampling/configs/svd.yaml +0 -146
- scripts/sampling/configs/svd_image_decoder.yaml +0 -129
- scripts/sampling/configs/svd_xt.yaml +0 -146
- scripts/sampling/configs/svd_xt_image_decoder.yaml +0 -129
- scripts/sampling/simple_video_sample.py +0 -278
- scripts/tests/attention.py +0 -319
- scripts/util/__init__.py +0 -0
- scripts/util/detection/__init__.py +0 -0
- scripts/util/detection/nsfw_and_watermark_dectection.py +0 -110
- scripts/util/detection/p_head_v1.npz +0 -3
- scripts/util/detection/w_head_v1.npz +0 -3
scripts/.DS_Store
DELETED
Binary file (6.15 kB)

scripts/__init__.py
DELETED
File without changes

scripts/demo/__init__.py
DELETED
File without changes
scripts/demo/detect.py
DELETED
@@ -1,156 +0,0 @@
-import argparse
-
-import cv2
-import numpy as np
-
-try:
-    from imwatermark import WatermarkDecoder
-except ImportError as e:
-    try:
-        # Assume some of the other dependencies such as torch are not fulfilled
-        # import file without loading unnecessary libraries.
-        import importlib.util
-        import sys
-
-        spec = importlib.util.find_spec("imwatermark.maxDct")
-        assert spec is not None
-        maxDct = importlib.util.module_from_spec(spec)
-        sys.modules["maxDct"] = maxDct
-        spec.loader.exec_module(maxDct)
-
-        class WatermarkDecoder(object):
-            """A minimal version of
-            https://github.com/ShieldMnt/invisible-watermark/blob/main/imwatermark/watermark.py
-            to only reconstruct bits using dwtDct"""
-
-            def __init__(self, wm_type="bytes", length=0):
-                assert wm_type == "bits", "Only bits defined in minimal import"
-                self._wmType = wm_type
-                self._wmLen = length
-
-            def reconstruct(self, bits):
-                if len(bits) != self._wmLen:
-                    raise RuntimeError("bits are not matched with watermark length")
-
-                return bits
-
-            def decode(self, cv2Image, method="dwtDct", **configs):
-                (r, c, channels) = cv2Image.shape
-                if r * c < 256 * 256:
-                    raise RuntimeError("image too small, should be larger than 256x256")
-
-                bits = []
-                assert method == "dwtDct"
-                embed = maxDct.EmbedMaxDct(watermarks=[], wmLen=self._wmLen, **configs)
-                bits = embed.decode(cv2Image)
-                return self.reconstruct(bits)
-
-    except:
-        raise e
-
-
-# A fixed 48-bit message that was choosen at random
-# WATERMARK_MESSAGE = 0xB3EC907BB19E
-WATERMARK_MESSAGE = 0b101100111110110010010000011110111011000110011110
-# bin(x)[2:] gives bits of x as str, use int to convert them to 0/1
-WATERMARK_BITS = [int(bit) for bit in bin(WATERMARK_MESSAGE)[2:]]
-MATCH_VALUES = [
-    [27, "No watermark detected"],
-    [33, "Partial watermark match. Cannot determine with certainty."],
-    [
-        35,
-        (
-            "Likely watermarked. In our test 0.02% of real images were "
-            'falsely detected as "Likely watermarked"'
-        ),
-    ],
-    [
-        49,
-        (
-            "Very likely watermarked. In our test no real images were "
-            'falsely detected as "Very likely watermarked"'
-        ),
-    ],
-]
-
-
-class GetWatermarkMatch:
-    def __init__(self, watermark):
-        self.watermark = watermark
-        self.num_bits = len(self.watermark)
-        self.decoder = WatermarkDecoder("bits", self.num_bits)
-
-    def __call__(self, x: np.ndarray) -> np.ndarray:
-        """
-        Detects the number of matching bits the predefined watermark with one
-        or multiple images. Images should be in cv2 format, e.g. h x w x c BGR.
-
-        Args:
-            x: ([B], h w, c) in range [0, 255]
-
-        Returns:
-            number of matched bits ([B],)
-        """
-        squeeze = len(x.shape) == 3
-        if squeeze:
-            x = x[None, ...]
-
-        bs = x.shape[0]
-        detected = np.empty((bs, self.num_bits), dtype=bool)
-        for k in range(bs):
-            detected[k] = self.decoder.decode(x[k], "dwtDct")
-        result = np.sum(detected == self.watermark, axis=-1)
-        if squeeze:
-            return result[0]
-        else:
-            return result
-
-
-get_watermark_match = GetWatermarkMatch(WATERMARK_BITS)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "filename",
-        nargs="+",
-        type=str,
-        help="Image files to check for watermarks",
-    )
-    opts = parser.parse_args()
-
-    print(
-        """
-        This script tries to detect watermarked images. Please be aware of
-        the following:
-        - As the watermark is supposed to be invisible, there is the risk that
-          watermarked images may not be detected.
-        - To maximize the chance of detection make sure that the image has the same
-          dimensions as when the watermark was applied (most likely 1024x1024
-          or 512x512).
-        - Specific image manipulation may drastically decrease the chance that
-          watermarks can be detected.
-        - There is also the chance that an image has the characteristics of the
-          watermark by chance.
-        - The watermark script is public, anybody may watermark any images, and
-          could therefore claim it to be generated.
-        - All numbers below are based on a test using 10,000 images without any
-          modifications after applying the watermark.
-        """
-    )
-
-    for fn in opts.filename:
-        image = cv2.imread(fn)
-        if image is None:
-            print(f"Couldn't read {fn}. Skipping")
-            continue
-
-        num_bits = get_watermark_match(image)
-        k = 0
-        while num_bits > MATCH_VALUES[k][0]:
-            k += 1
-        print(
-            f"{fn}: {MATCH_VALUES[k][1]}",
-            f"Bits that matched the watermark {num_bits} from {len(WATERMARK_BITS)}\n",
-            sep="\n\t",
-        )
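For reference, the deleted detector can also be called programmatically instead of through the CLI above. A minimal sketch, assuming the repository and its dependencies (numpy, opencv-python, invisible-watermark) are importable; "sample.png" is a hypothetical input path, and the threshold walk mirrors the MATCH_VALUES loop in __main__:

import cv2

# get_watermark_match, MATCH_VALUES and WATERMARK_BITS come from the deleted module above.
from scripts.demo.detect import MATCH_VALUES, WATERMARK_BITS, get_watermark_match

image = cv2.imread("sample.png")  # hypothetical path; BGR image, at least 256x256
num_bits = get_watermark_match(image)  # bits matching the fixed 48-bit watermark

# Walk the thresholds exactly like the script's __main__ does.
k = 0
while num_bits > MATCH_VALUES[k][0]:
    k += 1
print(f"{num_bits}/{len(WATERMARK_BITS)} bits matched: {MATCH_VALUES[k][1]}")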
scripts/demo/discretization.py
DELETED
@@ -1,59 +0,0 @@
-import torch
-
-from sgm.modules.diffusionmodules.discretizer import Discretization
-
-
-class Img2ImgDiscretizationWrapper:
-    """
-    wraps a discretizer, and prunes the sigmas
-    params:
-        strength: float between 0.0 and 1.0. 1.0 means full sampling (all sigmas are returned)
-    """
-
-    def __init__(self, discretization: Discretization, strength: float = 1.0):
-        self.discretization = discretization
-        self.strength = strength
-        assert 0.0 <= self.strength <= 1.0
-
-    def __call__(self, *args, **kwargs):
-        # sigmas start large first, and decrease then
-        sigmas = self.discretization(*args, **kwargs)
-        print(f"sigmas after discretization, before pruning img2img: ", sigmas)
-        sigmas = torch.flip(sigmas, (0,))
-        sigmas = sigmas[: max(int(self.strength * len(sigmas)), 1)]
-        print("prune index:", max(int(self.strength * len(sigmas)), 1))
-        sigmas = torch.flip(sigmas, (0,))
-        print(f"sigmas after pruning: ", sigmas)
-        return sigmas
-
-
-class Txt2NoisyDiscretizationWrapper:
-    """
-    wraps a discretizer, and prunes the sigmas
-    params:
-        strength: float between 0.0 and 1.0. 0.0 means full sampling (all sigmas are returned)
-    """
-
-    def __init__(
-        self, discretization: Discretization, strength: float = 0.0, original_steps=None
-    ):
-        self.discretization = discretization
-        self.strength = strength
-        self.original_steps = original_steps
-        assert 0.0 <= self.strength <= 1.0
-
-    def __call__(self, *args, **kwargs):
-        # sigmas start large first, and decrease then
-        sigmas = self.discretization(*args, **kwargs)
-        print(f"sigmas after discretization, before pruning img2img: ", sigmas)
-        sigmas = torch.flip(sigmas, (0,))
-        if self.original_steps is None:
-            steps = len(sigmas)
-        else:
-            steps = self.original_steps + 1
-        prune_index = max(min(int(self.strength * steps) - 1, steps - 1), 0)
-        sigmas = sigmas[prune_index:]
-        print("prune index:", prune_index)
-        sigmas = torch.flip(sigmas, (0,))
-        print(f"sigmas after pruning: ", sigmas)
-        return sigmas
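For reference, a minimal sketch of how the img2img wrapper above prunes a noise schedule. The linear_discretization helper is a hypothetical stand-in for an sgm Discretization instance; only the repository import is assumed:

import torch

from scripts.demo.discretization import Img2ImgDiscretizationWrapper


def linear_discretization(n_steps):
    # Toy stand-in (hypothetical): descending sigmas, largest first, as the wrapper expects.
    return torch.linspace(14.6, 0.03, n_steps)


wrapped = Img2ImgDiscretizationWrapper(linear_discretization, strength=0.6)
sigmas = wrapped(50)
# With strength=0.6 only the lowest ~60% of the noise levels survive,
# so img2img sampling starts from a partially noised input.
assert len(sigmas) == max(int(0.6 * 50), 1)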
scripts/demo/sampling.py
DELETED
@@ -1,364 +0,0 @@
-from pytorch_lightning import seed_everything
-
-from scripts.demo.streamlit_helpers import *
-
-SAVE_PATH = "outputs/demo/txt2img/"
-
-SD_XL_BASE_RATIOS = {
-    "0.5": (704, 1408),
-    "0.52": (704, 1344),
-    "0.57": (768, 1344),
-    "0.6": (768, 1280),
-    "0.68": (832, 1216),
-    "0.72": (832, 1152),
-    "0.78": (896, 1152),
-    "0.82": (896, 1088),
-    "0.88": (960, 1088),
-    "0.94": (960, 1024),
-    "1.0": (1024, 1024),
-    "1.07": (1024, 960),
-    "1.13": (1088, 960),
-    "1.21": (1088, 896),
-    "1.29": (1152, 896),
-    "1.38": (1152, 832),
-    "1.46": (1216, 832),
-    "1.67": (1280, 768),
-    "1.75": (1344, 768),
-    "1.91": (1344, 704),
-    "2.0": (1408, 704),
-    "2.09": (1472, 704),
-    "2.4": (1536, 640),
-    "2.5": (1600, 640),
-    "2.89": (1664, 576),
-    "3.0": (1728, 576),
-}
-
-VERSION2SPECS = {
-    "SDXL-base-1.0": {
-        "H": 1024,
-        "W": 1024,
-        "C": 4,
-        "f": 8,
-        "is_legacy": False,
-        "config": "configs/inference/sd_xl_base.yaml",
-        "ckpt": "checkpoints/sd_xl_base_1.0.safetensors",
-    },
-    "SDXL-base-0.9": {
-        "H": 1024,
-        "W": 1024,
-        "C": 4,
-        "f": 8,
-        "is_legacy": False,
-        "config": "configs/inference/sd_xl_base.yaml",
-        "ckpt": "checkpoints/sd_xl_base_0.9.safetensors",
-    },
-    "SD-2.1": {
-        "H": 512,
-        "W": 512,
-        "C": 4,
-        "f": 8,
-        "is_legacy": True,
-        "config": "configs/inference/sd_2_1.yaml",
-        "ckpt": "checkpoints/v2-1_512-ema-pruned.safetensors",
-    },
-    "SD-2.1-768": {
-        "H": 768,
-        "W": 768,
-        "C": 4,
-        "f": 8,
-        "is_legacy": True,
-        "config": "configs/inference/sd_2_1_768.yaml",
-        "ckpt": "checkpoints/v2-1_768-ema-pruned.safetensors",
-    },
-    "SDXL-refiner-0.9": {
-        "H": 1024,
-        "W": 1024,
-        "C": 4,
-        "f": 8,
-        "is_legacy": True,
-        "config": "configs/inference/sd_xl_refiner.yaml",
-        "ckpt": "checkpoints/sd_xl_refiner_0.9.safetensors",
-    },
-    "SDXL-refiner-1.0": {
-        "H": 1024,
-        "W": 1024,
-        "C": 4,
-        "f": 8,
-        "is_legacy": True,
-        "config": "configs/inference/sd_xl_refiner.yaml",
-        "ckpt": "checkpoints/sd_xl_refiner_1.0.safetensors",
-    },
-}
-
-
-def load_img(display=True, key=None, device="cuda"):
-    image = get_interactive_image(key=key)
-    if image is None:
-        return None
-    if display:
-        st.image(image)
-    w, h = image.size
-    print(f"loaded input image of size ({w}, {h})")
-    width, height = map(
-        lambda x: x - x % 64, (w, h)
-    )  # resize to integer multiple of 64
-    image = image.resize((width, height))
-    image = np.array(image.convert("RGB"))
-    image = image[None].transpose(0, 3, 1, 2)
-    image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
-    return image.to(device)
-
-
-def run_txt2img(
-    state,
-    version,
-    version_dict,
-    is_legacy=False,
-    return_latents=False,
-    filter=None,
-    stage2strength=None,
-):
-    if version.startswith("SDXL-base"):
-        W, H = st.selectbox("Resolution:", list(SD_XL_BASE_RATIOS.values()), 10)
-    else:
-        H = st.number_input("H", value=version_dict["H"], min_value=64, max_value=2048)
-        W = st.number_input("W", value=version_dict["W"], min_value=64, max_value=2048)
-    C = version_dict["C"]
-    F = version_dict["f"]
-
-    init_dict = {
-        "orig_width": W,
-        "orig_height": H,
-        "target_width": W,
-        "target_height": H,
-    }
-    value_dict = init_embedder_options(
-        get_unique_embedder_keys_from_conditioner(state["model"].conditioner),
-        init_dict,
-        prompt=prompt,
-        negative_prompt=negative_prompt,
-    )
-    sampler, num_rows, num_cols = init_sampling(stage2strength=stage2strength)
-    num_samples = num_rows * num_cols
-
-    if st.button("Sample"):
-        st.write(f"**Model I:** {version}")
-        out = do_sample(
-            state["model"],
-            sampler,
-            value_dict,
-            num_samples,
-            H,
-            W,
-            C,
-            F,
-            force_uc_zero_embeddings=["txt"] if not is_legacy else [],
-            return_latents=return_latents,
-            filter=filter,
-        )
-        return out
-
-
-def run_img2img(
-    state,
-    version_dict,
-    is_legacy=False,
-    return_latents=False,
-    filter=None,
-    stage2strength=None,
-):
-    img = load_img()
-    if img is None:
-        return None
-    H, W = img.shape[2], img.shape[3]
-
-    init_dict = {
-        "orig_width": W,
-        "orig_height": H,
-        "target_width": W,
-        "target_height": H,
-    }
-    value_dict = init_embedder_options(
-        get_unique_embedder_keys_from_conditioner(state["model"].conditioner),
-        init_dict,
-        prompt=prompt,
-        negative_prompt=negative_prompt,
-    )
-    strength = st.number_input(
-        "**Img2Img Strength**", value=0.75, min_value=0.0, max_value=1.0
-    )
-    sampler, num_rows, num_cols = init_sampling(
-        img2img_strength=strength,
-        stage2strength=stage2strength,
-    )
-    num_samples = num_rows * num_cols
-
-    if st.button("Sample"):
-        out = do_img2img(
-            repeat(img, "1 ... -> n ...", n=num_samples),
-            state["model"],
-            sampler,
-            value_dict,
-            num_samples,
-            force_uc_zero_embeddings=["txt"] if not is_legacy else [],
-            return_latents=return_latents,
-            filter=filter,
-        )
-        return out
-
-
-def apply_refiner(
-    input,
-    state,
-    sampler,
-    num_samples,
-    prompt,
-    negative_prompt,
-    filter=None,
-    finish_denoising=False,
-):
-    init_dict = {
-        "orig_width": input.shape[3] * 8,
-        "orig_height": input.shape[2] * 8,
-        "target_width": input.shape[3] * 8,
-        "target_height": input.shape[2] * 8,
-    }
-
-    value_dict = init_dict
-    value_dict["prompt"] = prompt
-    value_dict["negative_prompt"] = negative_prompt
-
-    value_dict["crop_coords_top"] = 0
-    value_dict["crop_coords_left"] = 0
-
-    value_dict["aesthetic_score"] = 6.0
-    value_dict["negative_aesthetic_score"] = 2.5
-
-    st.warning(f"refiner input shape: {input.shape}")
-    samples = do_img2img(
-        input,
-        state["model"],
-        sampler,
-        value_dict,
-        num_samples,
-        skip_encode=True,
-        filter=filter,
-        add_noise=not finish_denoising,
-    )
-
-    return samples
-
-
-if __name__ == "__main__":
-    st.title("Stable Diffusion")
-    version = st.selectbox("Model Version", list(VERSION2SPECS.keys()), 0)
-    version_dict = VERSION2SPECS[version]
-    if st.checkbox("Load Model"):
-        mode = st.radio("Mode", ("txt2img", "img2img"), 0)
-    else:
-        mode = "skip"
-    st.write("__________________________")
-
-    set_lowvram_mode(st.checkbox("Low vram mode", True))
-
-    if version.startswith("SDXL-base"):
-        add_pipeline = st.checkbox("Load SDXL-refiner?", False)
-        st.write("__________________________")
-    else:
-        add_pipeline = False
-
-    seed = st.sidebar.number_input("seed", value=42, min_value=0, max_value=int(1e9))
-    seed_everything(seed)
-
-    save_locally, save_path = init_save_locally(os.path.join(SAVE_PATH, version))
-
-    if mode != "skip":
-        state = init_st(version_dict, load_filter=True)
-        if state["msg"]:
-            st.info(state["msg"])
-        model = state["model"]
-
-    is_legacy = version_dict["is_legacy"]
-
-    prompt = st.text_input(
-        "prompt",
-        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
-    )
-    if is_legacy:
-        negative_prompt = st.text_input("negative prompt", "")
-    else:
-        negative_prompt = ""  # which is unused
-
-    stage2strength = None
-    finish_denoising = False
-
-    if add_pipeline:
-        st.write("__________________________")
-        version2 = st.selectbox("Refiner:", ["SDXL-refiner-1.0", "SDXL-refiner-0.9"])
-        st.warning(
-            f"Running with {version2} as the second stage model. Make sure to provide (V)RAM :) "
-        )
-        st.write("**Refiner Options:**")
-
-        version_dict2 = VERSION2SPECS[version2]
-        state2 = init_st(version_dict2, load_filter=False)
-        st.info(state2["msg"])
-
-        stage2strength = st.number_input(
-            "**Refinement strength**", value=0.15, min_value=0.0, max_value=1.0
-        )
-
-        sampler2, *_ = init_sampling(
-            key=2,
-            img2img_strength=stage2strength,
-            specify_num_samples=False,
-        )
-        st.write("__________________________")
-        finish_denoising = st.checkbox("Finish denoising with refiner.", True)
-        if not finish_denoising:
-            stage2strength = None
-
-    if mode == "txt2img":
-        out = run_txt2img(
-            state,
-            version,
-            version_dict,
-            is_legacy=is_legacy,
-            return_latents=add_pipeline,
-            filter=state.get("filter"),
-            stage2strength=stage2strength,
-        )
-    elif mode == "img2img":
-        out = run_img2img(
-            state,
-            version_dict,
-            is_legacy=is_legacy,
-            return_latents=add_pipeline,
-            filter=state.get("filter"),
-            stage2strength=stage2strength,
-        )
-    elif mode == "skip":
-        out = None
-    else:
-        raise ValueError(f"unknown mode {mode}")
-    if isinstance(out, (tuple, list)):
-        samples, samples_z = out
-    else:
-        samples = out
-        samples_z = None
-
-    if add_pipeline and samples_z is not None:
-        st.write("**Running Refinement Stage**")
-        samples = apply_refiner(
-            samples_z,
-            state2,
-            sampler2,
-            samples_z.shape[0],
-            prompt=prompt,
-            negative_prompt=negative_prompt if is_legacy else "",
-            filter=state.get("filter"),
-            finish_denoising=finish_denoising,
-        )
-
-    if save_locally and samples is not None:
-        perform_save_locally(save_path, samples)
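For reference, a small sketch of using the SD_XL_BASE_RATIOS table above outside of Streamlit, picking the resolution bucket whose aspect ratio is closest to a requested size. The helper name and the target size are hypothetical; importing the module assumes the repository and its dependencies are installed:

from scripts.demo.sampling import SD_XL_BASE_RATIOS


def closest_sdxl_resolution(width: int, height: int) -> tuple:
    """Return the (W, H) bucket whose aspect ratio best matches width/height."""
    target = width / height
    key = min(SD_XL_BASE_RATIOS, key=lambda k: abs(float(k) - target))
    return SD_XL_BASE_RATIOS[key]


print(closest_sdxl_resolution(1920, 1080))  # -> (1344, 768), the "1.75" bucket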
scripts/demo/streamlit_helpers.py
DELETED
@@ -1,928 +0,0 @@
-import copy
-import math
-import os
-from glob import glob
-from typing import Dict, List, Optional, Tuple, Union
-
-import cv2
-import numpy as np
-import streamlit as st
-import torch
-import torch.nn as nn
-import torchvision.transforms as TT
-from einops import rearrange, repeat
-from imwatermark import WatermarkEncoder
-from omegaconf import ListConfig, OmegaConf
-from PIL import Image
-from safetensors.torch import load_file as load_safetensors
-from torch import autocast
-from torchvision import transforms
-from torchvision.utils import make_grid, save_image
-
-from scripts.demo.discretization import (Img2ImgDiscretizationWrapper,
-                                         Txt2NoisyDiscretizationWrapper)
-from scripts.util.detection.nsfw_and_watermark_dectection import \
-    DeepFloydDataFiltering
-from sgm.inference.helpers import embed_watermark
-from sgm.modules.diffusionmodules.guiders import (LinearPredictionGuider,
-                                                  VanillaCFG)
-from sgm.modules.diffusionmodules.sampling import (DPMPP2MSampler,
-                                                   DPMPP2SAncestralSampler,
-                                                   EulerAncestralSampler,
-                                                   EulerEDMSampler,
-                                                   HeunEDMSampler,
-                                                   LinearMultistepSampler)
-from sgm.util import append_dims, default, instantiate_from_config
-
-
-@st.cache_resource()
-def init_st(version_dict, load_ckpt=True, load_filter=True):
-    state = dict()
-    if not "model" in state:
-        config = version_dict["config"]
-        ckpt = version_dict["ckpt"]
-
-        config = OmegaConf.load(config)
-        model, msg = load_model_from_config(config, ckpt if load_ckpt else None)
-
-        state["msg"] = msg
-        state["model"] = model
-        state["ckpt"] = ckpt if load_ckpt else None
-        state["config"] = config
-        if load_filter:
-            state["filter"] = DeepFloydDataFiltering(verbose=False)
-    return state
-
-
-def load_model(model):
-    model.cuda()
-
-
-lowvram_mode = False
-
-
-def set_lowvram_mode(mode):
-    global lowvram_mode
-    lowvram_mode = mode
-
-
-def initial_model_load(model):
-    global lowvram_mode
-    if lowvram_mode:
-        model.model.half()
-    else:
-        model.cuda()
-    return model
-
-
-def unload_model(model):
-    global lowvram_mode
-    if lowvram_mode:
-        model.cpu()
-        torch.cuda.empty_cache()
-
-
-def load_model_from_config(config, ckpt=None, verbose=True):
-    model = instantiate_from_config(config.model)
-
-    if ckpt is not None:
-        print(f"Loading model from {ckpt}")
-        if ckpt.endswith("ckpt"):
-            pl_sd = torch.load(ckpt, map_location="cpu")
-            if "global_step" in pl_sd:
-                global_step = pl_sd["global_step"]
-                st.info(f"loaded ckpt from global step {global_step}")
-                print(f"Global Step: {pl_sd['global_step']}")
-            sd = pl_sd["state_dict"]
-        elif ckpt.endswith("safetensors"):
-            sd = load_safetensors(ckpt)
-        else:
-            raise NotImplementedError
-
-        msg = None
-
-        m, u = model.load_state_dict(sd, strict=False)
-
-        if len(m) > 0 and verbose:
-            print("missing keys:")
-            print(m)
-        if len(u) > 0 and verbose:
-            print("unexpected keys:")
-            print(u)
-    else:
-        msg = None
-
-    model = initial_model_load(model)
-    model.eval()
-    return model, msg
-
-
-def get_unique_embedder_keys_from_conditioner(conditioner):
-    return list(set([x.input_key for x in conditioner.embedders]))
-
-
-def init_embedder_options(keys, init_dict, prompt=None, negative_prompt=None):
-    # Hardcoded demo settings; might undergo some changes in the future
-
-    value_dict = {}
-    for key in keys:
-        if key == "txt":
-            if prompt is None:
-                prompt = "A professional photograph of an astronaut riding a pig"
-            if negative_prompt is None:
-                negative_prompt = ""
-
-            prompt = st.text_input("Prompt", prompt)
-            negative_prompt = st.text_input("Negative prompt", negative_prompt)
-
-            value_dict["prompt"] = prompt
-            value_dict["negative_prompt"] = negative_prompt
-
-        if key == "original_size_as_tuple":
-            orig_width = st.number_input(
-                "orig_width",
-                value=init_dict["orig_width"],
-                min_value=16,
-            )
-            orig_height = st.number_input(
-                "orig_height",
-                value=init_dict["orig_height"],
-                min_value=16,
-            )
-
-            value_dict["orig_width"] = orig_width
-            value_dict["orig_height"] = orig_height
-
-        if key == "crop_coords_top_left":
-            crop_coord_top = st.number_input("crop_coords_top", value=0, min_value=0)
-            crop_coord_left = st.number_input("crop_coords_left", value=0, min_value=0)
-
-            value_dict["crop_coords_top"] = crop_coord_top
-            value_dict["crop_coords_left"] = crop_coord_left
-
-        if key == "aesthetic_score":
-            value_dict["aesthetic_score"] = 6.0
-            value_dict["negative_aesthetic_score"] = 2.5
-
-        if key == "target_size_as_tuple":
-            value_dict["target_width"] = init_dict["target_width"]
-            value_dict["target_height"] = init_dict["target_height"]
-
-        if key in ["fps_id", "fps"]:
-            fps = st.number_input("fps", value=6, min_value=1)
-
-            value_dict["fps"] = fps
-            value_dict["fps_id"] = fps - 1
-
-        if key == "motion_bucket_id":
-            mb_id = st.number_input("motion bucket id", 0, 511, value=127)
-            value_dict["motion_bucket_id"] = mb_id
-
-        if key == "pool_image":
-            st.text("Image for pool conditioning")
-            image = load_img(
-                key="pool_image_input",
-                size=224,
-                center_crop=True,
-            )
-            if image is None:
-                st.info("Need an image here")
-                image = torch.zeros(1, 3, 224, 224)
-            value_dict["pool_image"] = image
-
-    return value_dict
-
-
-def perform_save_locally(save_path, samples):
-    os.makedirs(os.path.join(save_path), exist_ok=True)
-    base_count = len(os.listdir(os.path.join(save_path)))
-    samples = embed_watermark(samples)
-    for sample in samples:
-        sample = 255.0 * rearrange(sample.cpu().numpy(), "c h w -> h w c")
-        Image.fromarray(sample.astype(np.uint8)).save(
-            os.path.join(save_path, f"{base_count:09}.png")
-        )
-        base_count += 1
-
-
-def init_save_locally(_dir, init_value: bool = False):
-    save_locally = st.sidebar.checkbox("Save images locally", value=init_value)
-    if save_locally:
-        save_path = st.text_input("Save path", value=os.path.join(_dir, "samples"))
-    else:
-        save_path = None
-
-    return save_locally, save_path
-
-
-def get_guider(options, key):
-    guider = st.sidebar.selectbox(
-        f"Discretization #{key}",
-        [
-            "VanillaCFG",
-            "IdentityGuider",
-            "LinearPredictionGuider",
-        ],
-        options.get("guider", 0),
-    )
-
-    additional_guider_kwargs = options.pop("additional_guider_kwargs", {})
-
-    if guider == "IdentityGuider":
-        guider_config = {
-            "target": "sgm.modules.diffusionmodules.guiders.IdentityGuider"
-        }
-    elif guider == "VanillaCFG":
-        scale_schedule = st.sidebar.selectbox(
-            f"Scale schedule #{key}",
-            ["Identity", "Oscillating"],
-        )
-
-        if scale_schedule == "Identity":
-            scale = st.number_input(
-                f"cfg-scale #{key}",
-                value=options.get("cfg", 5.0),
-                min_value=0.0,
-            )
-
-            scale_schedule_config = {
-                "target": "sgm.modules.diffusionmodules.guiders.IdentitySchedule",
-                "params": {"scale": scale},
-            }
-
-        elif scale_schedule == "Oscillating":
-            small_scale = st.number_input(
-                f"small cfg-scale #{key}",
-                value=4.0,
-                min_value=0.0,
-            )
-
-            large_scale = st.number_input(
-                f"large cfg-scale #{key}",
-                value=16.0,
-                min_value=0.0,
-            )
-
-            sigma_cutoff = st.number_input(
-                f"sigma cutoff #{key}",
-                value=1.0,
-                min_value=0.0,
-            )
-
-            scale_schedule_config = {
-                "target": "sgm.modules.diffusionmodules.guiders.OscillatingSchedule",
-                "params": {
-                    "small_scale": small_scale,
-                    "large_scale": large_scale,
-                    "sigma_cutoff": sigma_cutoff,
-                },
-            }
-        else:
-            raise NotImplementedError
-
-        guider_config = {
-            "target": "sgm.modules.diffusionmodules.guiders.VanillaCFG",
-            "params": {
-                "scale_schedule_config": scale_schedule_config,
-                **additional_guider_kwargs,
-            },
-        }
-    elif guider == "LinearPredictionGuider":
-        max_scale = st.number_input(
-            f"max-cfg-scale #{key}",
-            value=options.get("cfg", 1.5),
-            min_value=1.0,
-        )
-        min_scale = st.number_input(
-            f"min guidance scale",
-            value=options.get("min_cfg", 1.0),
-            min_value=1.0,
-            max_value=10.0,
-        )
-
-        guider_config = {
-            "target": "sgm.modules.diffusionmodules.guiders.LinearPredictionGuider",
-            "params": {
-                "max_scale": max_scale,
-                "min_scale": min_scale,
-                "num_frames": options["num_frames"],
-                **additional_guider_kwargs,
-            },
-        }
-    else:
-        raise NotImplementedError
-    return guider_config
-
-
-def init_sampling(
-    key=1,
-    img2img_strength: Optional[float] = None,
-    specify_num_samples: bool = True,
-    stage2strength: Optional[float] = None,
-    options: Optional[Dict[str, int]] = None,
-):
-    options = {} if options is None else options
-
-    num_rows, num_cols = 1, 1
-    if specify_num_samples:
-        num_cols = st.number_input(
-            f"num cols #{key}", value=num_cols, min_value=1, max_value=10
-        )
-
-    steps = st.sidebar.number_input(
-        f"steps #{key}", value=options.get("num_steps", 40), min_value=1, max_value=1000
-    )
-    sampler = st.sidebar.selectbox(
-        f"Sampler #{key}",
-        [
-            "EulerEDMSampler",
-            "HeunEDMSampler",
-            "EulerAncestralSampler",
-            "DPMPP2SAncestralSampler",
-            "DPMPP2MSampler",
-            "LinearMultistepSampler",
-        ],
-        options.get("sampler", 0),
-    )
-    discretization = st.sidebar.selectbox(
-        f"Discretization #{key}",
-        [
-            "LegacyDDPMDiscretization",
-            "EDMDiscretization",
-        ],
-        options.get("discretization", 0),
-    )
-
-    discretization_config = get_discretization(discretization, options=options, key=key)
-
-    guider_config = get_guider(options=options, key=key)
-
-    sampler = get_sampler(sampler, steps, discretization_config, guider_config, key=key)
-    if img2img_strength is not None:
-        st.warning(
-            f"Wrapping {sampler.__class__.__name__} with Img2ImgDiscretizationWrapper"
-        )
-        sampler.discretization = Img2ImgDiscretizationWrapper(
-            sampler.discretization, strength=img2img_strength
-        )
-    if stage2strength is not None:
-        sampler.discretization = Txt2NoisyDiscretizationWrapper(
-            sampler.discretization, strength=stage2strength, original_steps=steps
-        )
-    return sampler, num_rows, num_cols
-
-
-def get_discretization(discretization, options, key=1):
-    if discretization == "LegacyDDPMDiscretization":
-        discretization_config = {
-            "target": "sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization",
-        }
-    elif discretization == "EDMDiscretization":
-        sigma_min = st.number_input(
-            f"sigma_min #{key}", value=options.get("sigma_min", 0.03)
-        )  # 0.0292
-        sigma_max = st.number_input(
-            f"sigma_max #{key}", value=options.get("sigma_max", 14.61)
-        )  # 14.6146
-        rho = st.number_input(f"rho #{key}", value=options.get("rho", 3.0))
-        discretization_config = {
-            "target": "sgm.modules.diffusionmodules.discretizer.EDMDiscretization",
-            "params": {
-                "sigma_min": sigma_min,
-                "sigma_max": sigma_max,
-                "rho": rho,
-            },
-        }
-
-    return discretization_config
-
-
-def get_sampler(sampler_name, steps, discretization_config, guider_config, key=1):
-    if sampler_name == "EulerEDMSampler" or sampler_name == "HeunEDMSampler":
-        s_churn = st.sidebar.number_input(f"s_churn #{key}", value=0.0, min_value=0.0)
-        s_tmin = st.sidebar.number_input(f"s_tmin #{key}", value=0.0, min_value=0.0)
-        s_tmax = st.sidebar.number_input(f"s_tmax #{key}", value=999.0, min_value=0.0)
-        s_noise = st.sidebar.number_input(f"s_noise #{key}", value=1.0, min_value=0.0)
-
-        if sampler_name == "EulerEDMSampler":
-            sampler = EulerEDMSampler(
-                num_steps=steps,
-                discretization_config=discretization_config,
-                guider_config=guider_config,
-                s_churn=s_churn,
-                s_tmin=s_tmin,
-                s_tmax=s_tmax,
-                s_noise=s_noise,
-                verbose=True,
-            )
-        elif sampler_name == "HeunEDMSampler":
-            sampler = HeunEDMSampler(
-                num_steps=steps,
-                discretization_config=discretization_config,
-                guider_config=guider_config,
-                s_churn=s_churn,
-                s_tmin=s_tmin,
-                s_tmax=s_tmax,
-                s_noise=s_noise,
-                verbose=True,
-            )
-    elif (
-        sampler_name == "EulerAncestralSampler"
-        or sampler_name == "DPMPP2SAncestralSampler"
-    ):
-        s_noise = st.sidebar.number_input("s_noise", value=1.0, min_value=0.0)
-        eta = st.sidebar.number_input("eta", value=1.0, min_value=0.0)
-
-        if sampler_name == "EulerAncestralSampler":
-            sampler = EulerAncestralSampler(
-                num_steps=steps,
-                discretization_config=discretization_config,
-                guider_config=guider_config,
-                eta=eta,
-                s_noise=s_noise,
-                verbose=True,
-            )
-        elif sampler_name == "DPMPP2SAncestralSampler":
-            sampler = DPMPP2SAncestralSampler(
-                num_steps=steps,
-                discretization_config=discretization_config,
-                guider_config=guider_config,
-                eta=eta,
-                s_noise=s_noise,
-                verbose=True,
-            )
-    elif sampler_name == "DPMPP2MSampler":
-        sampler = DPMPP2MSampler(
-            num_steps=steps,
-            discretization_config=discretization_config,
-            guider_config=guider_config,
-            verbose=True,
-        )
-    elif sampler_name == "LinearMultistepSampler":
-        order = st.sidebar.number_input("order", value=4, min_value=1)
-        sampler = LinearMultistepSampler(
-            num_steps=steps,
-            discretization_config=discretization_config,
-            guider_config=guider_config,
-            order=order,
-            verbose=True,
-        )
-    else:
-        raise ValueError(f"unknown sampler {sampler_name}!")
-
-    return sampler
-
-
-def get_interactive_image() -> Image.Image:
-    image = st.file_uploader("Input", type=["jpg", "JPEG", "png"])
-    if image is not None:
-        image = Image.open(image)
-        if not image.mode == "RGB":
-            image = image.convert("RGB")
-        return image
-
-
-def load_img(
-    display: bool = True,
-    size: Union[None, int, Tuple[int, int]] = None,
-    center_crop: bool = False,
-):
-    image = get_interactive_image()
-    if image is None:
-        return None
-    if display:
-        st.image(image)
-    w, h = image.size
-    print(f"loaded input image of size ({w}, {h})")
-
-    transform = []
-    if size is not None:
-        transform.append(transforms.Resize(size))
-    if center_crop:
-        transform.append(transforms.CenterCrop(size))
-    transform.append(transforms.ToTensor())
-    transform.append(transforms.Lambda(lambda x: 2.0 * x - 1.0))
-
-    transform = transforms.Compose(transform)
-    img = transform(image)[None, ...]
-    st.text(f"input min/max/mean: {img.min():.3f}/{img.max():.3f}/{img.mean():.3f}")
-    return img
-
-
-def get_init_img(batch_size=1, key=None):
-    init_image = load_img(key=key).cuda()
-    init_image = repeat(init_image, "1 ... -> b ...", b=batch_size)
-    return init_image
-
-
-def do_sample(
-    model,
-    sampler,
-    value_dict,
-    num_samples,
-    H,
-    W,
-    C,
-    F,
-    force_uc_zero_embeddings: Optional[List] = None,
-    force_cond_zero_embeddings: Optional[List] = None,
-    batch2model_input: List = None,
-    return_latents=False,
-    filter=None,
-    T=None,
-    additional_batch_uc_fields=None,
-    decoding_t=None,
-):
-    force_uc_zero_embeddings = default(force_uc_zero_embeddings, [])
-    batch2model_input = default(batch2model_input, [])
-    additional_batch_uc_fields = default(additional_batch_uc_fields, [])
-
-    st.text("Sampling")
-
-    outputs = st.empty()
-    precision_scope = autocast
-    with torch.no_grad():
-        with precision_scope("cuda"):
-            with model.ema_scope():
-                if T is not None:
-                    num_samples = [num_samples, T]
-                else:
-                    num_samples = [num_samples]
-
-                load_model(model.conditioner)
-                batch, batch_uc = get_batch(
-                    get_unique_embedder_keys_from_conditioner(model.conditioner),
-                    value_dict,
-                    num_samples,
-                    T=T,
-                    additional_batch_uc_fields=additional_batch_uc_fields,
-                )
-
-                c, uc = model.conditioner.get_unconditional_conditioning(
-                    batch,
-                    batch_uc=batch_uc,
-                    force_uc_zero_embeddings=force_uc_zero_embeddings,
-                    force_cond_zero_embeddings=force_cond_zero_embeddings,
-                )
-                unload_model(model.conditioner)
-
-                for k in c:
-                    if not k == "crossattn":
-                        c[k], uc[k] = map(
-                            lambda y: y[k][: math.prod(num_samples)].to("cuda"), (c, uc)
-                        )
-                    if k in ["crossattn", "concat"] and T is not None:
-                        uc[k] = repeat(uc[k], "b ... -> b t ...", t=T)
-                        uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=T)
-                        c[k] = repeat(c[k], "b ... -> b t ...", t=T)
-                        c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=T)
-
-                additional_model_inputs = {}
-                for k in batch2model_input:
-                    if k == "image_only_indicator":
-                        assert T is not None
-
-                        if isinstance(
-                            sampler.guider, (VanillaCFG, LinearPredictionGuider)
-                        ):
-                            additional_model_inputs[k] = torch.zeros(
-                                num_samples[0] * 2, num_samples[1]
-                            ).to("cuda")
-                        else:
-                            additional_model_inputs[k] = torch.zeros(num_samples).to(
-                                "cuda"
-                            )
-                    else:
-                        additional_model_inputs[k] = batch[k]
-
-                shape = (math.prod(num_samples), C, H // F, W // F)
-                randn = torch.randn(shape).to("cuda")
-
-                def denoiser(input, sigma, c):
-                    return model.denoiser(
-                        model.model, input, sigma, c, **additional_model_inputs
-                    )
-
-                load_model(model.denoiser)
-                load_model(model.model)
-                samples_z = sampler(denoiser, randn, cond=c, uc=uc)
-                unload_model(model.model)
-                unload_model(model.denoiser)
-
-                load_model(model.first_stage_model)
-                model.en_and_decode_n_samples_a_time = (
-                    decoding_t  # Decode n frames at a time
-                )
-                samples_x = model.decode_first_stage(samples_z)
-                samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
-                unload_model(model.first_stage_model)
-
-                if filter is not None:
-                    samples = filter(samples)
-
-                if T is None:
-                    grid = torch.stack([samples])
-                    grid = rearrange(grid, "n b c h w -> (n h) (b w) c")
-                    outputs.image(grid.cpu().numpy())
-                else:
-                    as_vids = rearrange(samples, "(b t) c h w -> b t c h w", t=T)
-                    for i, vid in enumerate(as_vids):
-                        grid = rearrange(make_grid(vid, nrow=4), "c h w -> h w c")
-                        st.image(
-                            grid.cpu().numpy(),
-                            f"Sample #{i} as image",
-                        )
-
-                if return_latents:
-                    return samples, samples_z
-                return samples
-
-
-def get_batch(
-    keys,
-    value_dict: dict,
-    N: Union[List, ListConfig],
-    device: str = "cuda",
-    T: int = None,
-    additional_batch_uc_fields: List[str] = [],
-):
-    # Hardcoded demo setups; might undergo some changes in the future
-
-    batch = {}
-    batch_uc = {}
-
-    for key in keys:
-        if key == "txt":
-            batch["txt"] = [value_dict["prompt"]] * math.prod(N)
-
-            batch_uc["txt"] = [value_dict["negative_prompt"]] * math.prod(N)
-
-        elif key == "original_size_as_tuple":
-            batch["original_size_as_tuple"] = (
-                torch.tensor([value_dict["orig_height"], value_dict["orig_width"]])
-                .to(device)
-                .repeat(math.prod(N), 1)
-            )
-        elif key == "crop_coords_top_left":
-            batch["crop_coords_top_left"] = (
-                torch.tensor(
-                    [value_dict["crop_coords_top"], value_dict["crop_coords_left"]]
-                )
-                .to(device)
-                .repeat(math.prod(N), 1)
-            )
-        elif key == "aesthetic_score":
-            batch["aesthetic_score"] = (
-                torch.tensor([value_dict["aesthetic_score"]])
-                .to(device)
-                .repeat(math.prod(N), 1)
-            )
-            batch_uc["aesthetic_score"] = (
-                torch.tensor([value_dict["negative_aesthetic_score"]])
-                .to(device)
-                .repeat(math.prod(N), 1)
-            )
-
-        elif key == "target_size_as_tuple":
-            batch["target_size_as_tuple"] = (
-                torch.tensor([value_dict["target_height"], value_dict["target_width"]])
-                .to(device)
-                .repeat(math.prod(N), 1)
-            )
-        elif key == "fps":
-            batch[key] = (
-                torch.tensor([value_dict["fps"]]).to(device).repeat(math.prod(N))
-            )
-        elif key == "fps_id":
-            batch[key] = (
-                torch.tensor([value_dict["fps_id"]]).to(device).repeat(math.prod(N))
-            )
-        elif key == "motion_bucket_id":
-            batch[key] = (
-                torch.tensor([value_dict["motion_bucket_id"]])
-                .to(device)
-                .repeat(math.prod(N))
-            )
-        elif key == "pool_image":
-            batch[key] = repeat(value_dict[key], "1 ... -> b ...", b=math.prod(N)).to(
-                device, dtype=torch.half
-            )
-        elif key == "cond_aug":
-            batch[key] = repeat(
-                torch.tensor([value_dict["cond_aug"]]).to("cuda"),
-                "1 -> b",
-                b=math.prod(N),
-            )
-        elif key == "cond_frames":
-            batch[key] = repeat(value_dict["cond_frames"], "1 ... -> b ...", b=N[0])
-        elif key == "cond_frames_without_noise":
-            batch[key] = repeat(
-                value_dict["cond_frames_without_noise"], "1 ... -> b ...", b=N[0]
-            )
-        else:
-            batch[key] = value_dict[key]
-
-    if T is not None:
-        batch["num_video_frames"] = T
-
-    for key in batch.keys():
-        if key not in batch_uc and isinstance(batch[key], torch.Tensor):
-            batch_uc[key] = torch.clone(batch[key])
-        elif key in additional_batch_uc_fields and key not in batch_uc:
-            batch_uc[key] = copy.copy(batch[key])
-    return batch, batch_uc
-
-
-@torch.no_grad()
-def do_img2img(
-    img,
-    model,
-    sampler,
-    value_dict,
-    num_samples,
-    force_uc_zero_embeddings: Optional[List] = None,
-    force_cond_zero_embeddings: Optional[List] = None,
-    additional_kwargs={},
-    offset_noise_level: int = 0.0,
-    return_latents=False,
-    skip_encode=False,
-    filter=None,
-    add_noise=True,
-):
-    st.text("Sampling")
-
-    outputs = st.empty()
-    precision_scope = autocast
-    with torch.no_grad():
-        with precision_scope("cuda"):
-            with model.ema_scope():
-                load_model(model.conditioner)
-                batch, batch_uc = get_batch(
-                    get_unique_embedder_keys_from_conditioner(model.conditioner),
-                    value_dict,
-                    [num_samples],
-                )
-                c, uc = model.conditioner.get_unconditional_conditioning(
-                    batch,
-                    batch_uc=batch_uc,
-                    force_uc_zero_embeddings=force_uc_zero_embeddings,
-                    force_cond_zero_embeddings=force_cond_zero_embeddings,
-                )
-                unload_model(model.conditioner)
-                for k in c:
-                    c[k], uc[k] = map(lambda y: y[k][:num_samples].to("cuda"), (c, uc))
-
-                for k in additional_kwargs:
-                    c[k] = uc[k] = additional_kwargs[k]
-                if skip_encode:
-                    z = img
-                else:
-                    load_model(model.first_stage_model)
-                    z = model.encode_first_stage(img)
-                    unload_model(model.first_stage_model)
-
-                noise = torch.randn_like(z)
-
-                sigmas = sampler.discretization(sampler.num_steps).cuda()
-                sigma = sigmas[0]
-
-                st.info(f"all sigmas: {sigmas}")
-                st.info(f"noising sigma: {sigma}")
-                if offset_noise_level > 0.0:
-                    noise = noise + offset_noise_level * append_dims(
-                        torch.randn(z.shape[0], device=z.device), z.ndim
-                    )
-                if add_noise:
-                    noised_z = z + noise * append_dims(sigma, z.ndim).cuda()
-                    noised_z = noised_z / torch.sqrt(
-                        1.0 + sigmas[0] ** 2.0
-                    )  # Note: hardcoded to DDPM-like scaling. need to generalize later.
-                else:
-                    noised_z = z / torch.sqrt(1.0 + sigmas[0] ** 2.0)
-
-                def denoiser(x, sigma, c):
-                    return model.denoiser(model.model, x, sigma, c)
-
-                load_model(model.denoiser)
-                load_model(model.model)
-                samples_z = sampler(denoiser, noised_z, cond=c, uc=uc)
-                unload_model(model.model)
-                unload_model(model.denoiser)
-
-                load_model(model.first_stage_model)
-                samples_x = model.decode_first_stage(samples_z)
-                unload_model(model.first_stage_model)
-                samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
-
-                if filter is not None:
-                    samples = filter(samples)
-
-                grid = rearrange(grid, "n b c h w -> (n h) (b w) c")
-                outputs.image(grid.cpu().numpy())
-                if return_latents:
-                    return samples, samples_z
-                return samples
-
-
-def get_resizing_factor(
-    desired_shape: Tuple[int, int], current_shape: Tuple[int, int]
-) -> float:
-    r_bound = desired_shape[1] / desired_shape[0]
-    aspect_r = current_shape[1] / current_shape[0]
-    if r_bound >= 1.0:
-        if aspect_r >= r_bound:
-            factor = min(desired_shape) / min(current_shape)
-        else:
-            if aspect_r < 1.0:
-                factor = max(desired_shape) / min(current_shape)
-            else:
-                factor = max(desired_shape) / max(current_shape)
-    else:
-        if aspect_r <= r_bound:
-            factor = min(desired_shape) / min(current_shape)
-        else:
-            if aspect_r > 1:
-                factor = max(desired_shape) / min(current_shape)
-            else:
-                factor = max(desired_shape) / max(current_shape)
-
-    return factor
-
-
-def get_interactive_image(key=None) -> Image.Image:
-    image = st.file_uploader("Input", type=["jpg", "JPEG", "png"], key=key)
-    if image is not None:
-        image = Image.open(image)
-        if not image.mode == "RGB":
-            image = image.convert("RGB")
-        return image
-
-
-def load_img_for_prediction(
-    W: int, H: int, display=True, key=None, device="cuda"
-) -> torch.Tensor:
-    image = get_interactive_image(key=key)
-    if image is None:
-        return None
-    if display:
-        st.image(image)
-    w, h = image.size
-
-    image = np.array(image).transpose(2, 0, 1)
-    image = torch.from_numpy(image).to(dtype=torch.float32) / 255.0
-    image = image.unsqueeze(0)
-
-    rfs = get_resizing_factor((H, W), (h, w))
-    resize_size = [int(np.ceil(rfs * s)) for s in (h, w)]
-    top = (resize_size[0] - H) // 2
-    left = (resize_size[1] - W) // 2
-
-    image = torch.nn.functional.interpolate(
-        image, resize_size, mode="area", antialias=False
-    )
-    image = TT.functional.crop(image, top=top, left=left, height=H, width=W)
-
-    if display:
-        numpy_img = np.transpose(image[0].numpy(), (1, 2, 0))
-        pil_image = Image.fromarray((numpy_img * 255).astype(np.uint8))
-        st.image(pil_image)
-    return image.to(device) * 2.0 - 1.0
-
-
-def save_video_as_grid_and_mp4(
-    video_batch: torch.Tensor, save_path: str, T: int, fps: int = 5
-):
-    os.makedirs(save_path, exist_ok=True)
-    base_count = len(glob(os.path.join(save_path, "*.mp4")))
-
-    video_batch = rearrange(video_batch, "(b t) c h w -> b t c h w", t=T)
-    video_batch = embed_watermark(video_batch)
-    for vid in video_batch:
-        save_image(vid, fp=os.path.join(save_path, f"{base_count:06d}.png"), nrow=4)
-
-        video_path = os.path.join(save_path, f"{base_count:06d}.mp4")
-
-        writer = cv2.VideoWriter(
-            video_path,
-            cv2.VideoWriter_fourcc(*"MP4V"),
-            fps,
-            (vid.shape[-1], vid.shape[-2]),
-        )
-
-        vid = (
-            (rearrange(vid, "t c h w -> t h w c") * 255).cpu().numpy().astype(np.uint8)
-        )
-        for frame in vid:
-            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
-            writer.write(frame)
-
-        writer.release()
-
-        video_path_h264 = video_path[:-4] + "_h264.mp4"
-        os.system(f"ffmpeg -i {video_path} -c:v libx264 {video_path_h264}")
-
-        with open(video_path_h264, "rb") as f:
-            video_bytes = f.read()
-        st.video(video_bytes)
-
-        base_count += 1
scripts/demo/video_sampling.py
DELETED
@@ -1,200 +0,0 @@
import os

from pytorch_lightning import seed_everything

from scripts.demo.streamlit_helpers import *

SAVE_PATH = "outputs/demo/vid/"

VERSION2SPECS = {
    "svd": {
        "T": 14,
        "H": 576,
        "W": 1024,
        "C": 4,
        "f": 8,
        "config": "configs/inference/svd.yaml",
        "ckpt": "checkpoints/svd.safetensors",
        "options": {
            "discretization": 1,
            "cfg": 2.5,
            "sigma_min": 0.002,
            "sigma_max": 700.0,
            "rho": 7.0,
            "guider": 2,
            "force_uc_zero_embeddings": ["cond_frames", "cond_frames_without_noise"],
            "num_steps": 25,
        },
    },
    "svd_image_decoder": {
        "T": 14,
        "H": 576,
        "W": 1024,
        "C": 4,
        "f": 8,
        "config": "configs/inference/svd_image_decoder.yaml",
        "ckpt": "checkpoints/svd_image_decoder.safetensors",
        "options": {
            "discretization": 1,
            "cfg": 2.5,
            "sigma_min": 0.002,
            "sigma_max": 700.0,
            "rho": 7.0,
            "guider": 2,
            "force_uc_zero_embeddings": ["cond_frames", "cond_frames_without_noise"],
            "num_steps": 25,
        },
    },
    "svd_xt": {
        "T": 25,
        "H": 576,
        "W": 1024,
        "C": 4,
        "f": 8,
        "config": "configs/inference/svd.yaml",
        "ckpt": "checkpoints/svd_xt.safetensors",
        "options": {
            "discretization": 1,
            "cfg": 3.0,
            "min_cfg": 1.5,
            "sigma_min": 0.002,
            "sigma_max": 700.0,
            "rho": 7.0,
            "guider": 2,
            "force_uc_zero_embeddings": ["cond_frames", "cond_frames_without_noise"],
            "num_steps": 30,
            "decoding_t": 14,
        },
    },
    "svd_xt_image_decoder": {
        "T": 25,
        "H": 576,
        "W": 1024,
        "C": 4,
        "f": 8,
        "config": "configs/inference/svd_image_decoder.yaml",
        "ckpt": "checkpoints/svd_xt_image_decoder.safetensors",
        "options": {
            "discretization": 1,
            "cfg": 3.0,
            "min_cfg": 1.5,
            "sigma_min": 0.002,
            "sigma_max": 700.0,
            "rho": 7.0,
            "guider": 2,
            "force_uc_zero_embeddings": ["cond_frames", "cond_frames_without_noise"],
            "num_steps": 30,
            "decoding_t": 14,
        },
    },
}


if __name__ == "__main__":
    st.title("Stable Video Diffusion")
    version = st.selectbox(
        "Model Version",
        [k for k in VERSION2SPECS.keys()],
        0,
    )
    version_dict = VERSION2SPECS[version]
    if st.checkbox("Load Model"):
        mode = "img2vid"
    else:
        mode = "skip"

    H = st.sidebar.number_input(
        "H", value=version_dict["H"], min_value=64, max_value=2048
    )
    W = st.sidebar.number_input(
        "W", value=version_dict["W"], min_value=64, max_value=2048
    )
    T = st.sidebar.number_input(
        "T", value=version_dict["T"], min_value=0, max_value=128
    )
    C = version_dict["C"]
    F = version_dict["f"]
    options = version_dict["options"]

    if mode != "skip":
        state = init_st(version_dict, load_filter=True)
        if state["msg"]:
            st.info(state["msg"])
        model = state["model"]

        ukeys = set(
            get_unique_embedder_keys_from_conditioner(state["model"].conditioner)
        )

        value_dict = init_embedder_options(
            ukeys,
            {},
        )

        value_dict["image_only_indicator"] = 0

        if mode == "img2vid":
            img = load_img_for_prediction(W, H)
            cond_aug = st.number_input(
                "Conditioning augmentation:", value=0.02, min_value=0.0
            )
            value_dict["cond_frames_without_noise"] = img
            value_dict["cond_frames"] = img + cond_aug * torch.randn_like(img)
            value_dict["cond_aug"] = cond_aug

        seed = st.sidebar.number_input(
            "seed", value=23, min_value=0, max_value=int(1e9)
        )
        seed_everything(seed)

        save_locally, save_path = init_save_locally(
            os.path.join(SAVE_PATH, version), init_value=True
        )

        options["num_frames"] = T

        sampler, num_rows, num_cols = init_sampling(options=options)
        num_samples = num_rows * num_cols

        decoding_t = st.number_input(
            "Decode t frames at a time (set small if you are low on VRAM)",
            value=options.get("decoding_t", T),
            min_value=1,
            max_value=int(1e9),
        )

        if st.checkbox("Overwrite fps in mp4 generator", False):
            saving_fps = st.number_input(
                f"saving video at fps:", value=value_dict["fps"], min_value=1
            )
        else:
            saving_fps = value_dict["fps"]

        if st.button("Sample"):
            out = do_sample(
                model,
                sampler,
                value_dict,
                num_samples,
                H,
                W,
                C,
                F,
                T=T,
                batch2model_input=["num_video_frames", "image_only_indicator"],
                force_uc_zero_embeddings=options.get("force_uc_zero_embeddings", None),
                force_cond_zero_embeddings=options.get(
                    "force_cond_zero_embeddings", None
                ),
                return_latents=False,
                decoding_t=decoding_t,
            )

            if isinstance(out, (tuple, list)):
                samples, samples_z = out
            else:
                samples = out
                samples_z = None

            if save_locally:
                save_video_as_grid_and_mp4(samples, save_path, T, fps=saving_fps)
scripts/sampling/configs/svd.yaml
DELETED
@@ -1,146 +0,0 @@
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.18215
    disable_first_stage_autocast: True
    ckpt_path: checkpoints/svd.safetensors

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.Denoiser
      params:
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise

    network_config:
      target: sgm.modules.diffusionmodules.video_model.VideoUNet
      params:
        adm_in_channels: 768
        num_classes: sequential
        use_checkpoint: True
        in_channels: 8
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        spatial_transformer_attn_type: softmax-xformers
        extra_ff_mix_layer: True
        use_spatial_context: True
        merge_strategy: learned_with_images
        video_kernel_size: [3, 1, 1]

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
        - is_trainable: False
          input_key: cond_frames_without_noise
          target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
          params:
            n_cond_frames: 1
            n_copies: 1
            open_clip_embedding_config:
              target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
              params:
                freeze: True

        - input_key: fps_id
          is_trainable: False
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256

        - input_key: motion_bucket_id
          is_trainable: False
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256

        - input_key: cond_frames
          is_trainable: False
          target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
          params:
            disable_encoder_autocast: True
            n_cond_frames: 1
            n_copies: 1
            is_ae: True
            encoder_config:
              target: sgm.models.autoencoder.AutoencoderKLModeOnly
              params:
                embed_dim: 4
                monitor: val/rec_loss
                ddconfig:
                  attn_type: vanilla-xformers
                  double_z: True
                  z_channels: 4
                  resolution: 256
                  in_channels: 3
                  out_ch: 3
                  ch: 128
                  ch_mult: [1, 2, 4, 4]
                  num_res_blocks: 2
                  attn_resolutions: []
                  dropout: 0.0
                lossconfig:
                  target: torch.nn.Identity

        - input_key: cond_aug
          is_trainable: False
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256

    first_stage_config:
      target: sgm.models.autoencoder.AutoencodingEngine
      params:
        loss_config:
          target: torch.nn.Identity
        regularizer_config:
          target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
        encoder_config:
          target: sgm.modules.diffusionmodules.model.Encoder
          params:
            attn_type: vanilla
            double_z: True
            z_channels: 4
            resolution: 256
            in_channels: 3
            out_ch: 3
            ch: 128
            ch_mult: [1, 2, 4, 4]
            num_res_blocks: 2
            attn_resolutions: []
            dropout: 0.0
        decoder_config:
          target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
          params:
            attn_type: vanilla
            double_z: True
            z_channels: 4
            resolution: 256
            in_channels: 3
            out_ch: 3
            ch: 128
            ch_mult: [1, 2, 4, 4]
            num_res_blocks: 2
            attn_resolutions: []
            dropout: 0.0
            video_kernel_size: [3, 1, 1]

    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
          params:
            sigma_max: 700.0

        guider_config:
          target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
          params:
            max_scale: 2.5
            min_scale: 1.0
scripts/sampling/configs/svd_image_decoder.yaml
DELETED
@@ -1,129 +0,0 @@
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.18215
    disable_first_stage_autocast: True
    ckpt_path: checkpoints/svd_image_decoder.safetensors

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.Denoiser
      params:
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise

    network_config:
      target: sgm.modules.diffusionmodules.video_model.VideoUNet
      params:
        adm_in_channels: 768
        num_classes: sequential
        use_checkpoint: True
        in_channels: 8
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        spatial_transformer_attn_type: softmax-xformers
        extra_ff_mix_layer: True
        use_spatial_context: True
        merge_strategy: learned_with_images
        video_kernel_size: [3, 1, 1]

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
        - is_trainable: False
          input_key: cond_frames_without_noise
          target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
          params:
            n_cond_frames: 1
            n_copies: 1
            open_clip_embedding_config:
              target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
              params:
                freeze: True

        - input_key: fps_id
          is_trainable: False
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256

        - input_key: motion_bucket_id
          is_trainable: False
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256

        - input_key: cond_frames
          is_trainable: False
          target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
          params:
            disable_encoder_autocast: True
            n_cond_frames: 1
            n_copies: 1
            is_ae: True
            encoder_config:
              target: sgm.models.autoencoder.AutoencoderKLModeOnly
              params:
                embed_dim: 4
                monitor: val/rec_loss
                ddconfig:
                  attn_type: vanilla-xformers
                  double_z: True
                  z_channels: 4
                  resolution: 256
                  in_channels: 3
                  out_ch: 3
                  ch: 128
                  ch_mult: [1, 2, 4, 4]
                  num_res_blocks: 2
                  attn_resolutions: []
                  dropout: 0.0
                lossconfig:
                  target: torch.nn.Identity

        - input_key: cond_aug
          is_trainable: False
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: True
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
          params:
            sigma_max: 700.0

        guider_config:
          target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
          params:
            max_scale: 2.5
            min_scale: 1.0
scripts/sampling/configs/svd_xt.yaml
DELETED
@@ -1,146 +0,0 @@
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.18215
    disable_first_stage_autocast: True
    ckpt_path: checkpoints/svd_xt.safetensors

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.Denoiser
      params:
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise

    network_config:
      target: sgm.modules.diffusionmodules.video_model.VideoUNet
      params:
        adm_in_channels: 768
        num_classes: sequential
        use_checkpoint: True
        in_channels: 8
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        spatial_transformer_attn_type: softmax-xformers
        extra_ff_mix_layer: True
        use_spatial_context: True
        merge_strategy: learned_with_images
        video_kernel_size: [3, 1, 1]

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
        - is_trainable: False
          input_key: cond_frames_without_noise
          target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
          params:
            n_cond_frames: 1
            n_copies: 1
            open_clip_embedding_config:
              target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
              params:
                freeze: True

        - input_key: fps_id
          is_trainable: False
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256

        - input_key: motion_bucket_id
          is_trainable: False
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256

        - input_key: cond_frames
          is_trainable: False
          target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
          params:
            disable_encoder_autocast: True
            n_cond_frames: 1
            n_copies: 1
            is_ae: True
            encoder_config:
              target: sgm.models.autoencoder.AutoencoderKLModeOnly
              params:
                embed_dim: 4
                monitor: val/rec_loss
                ddconfig:
                  attn_type: vanilla-xformers
                  double_z: True
                  z_channels: 4
                  resolution: 256
                  in_channels: 3
                  out_ch: 3
                  ch: 128
                  ch_mult: [1, 2, 4, 4]
                  num_res_blocks: 2
                  attn_resolutions: []
                  dropout: 0.0
                lossconfig:
                  target: torch.nn.Identity

        - input_key: cond_aug
          is_trainable: False
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256

    first_stage_config:
      target: sgm.models.autoencoder.AutoencodingEngine
      params:
        loss_config:
          target: torch.nn.Identity
        regularizer_config:
          target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
        encoder_config:
          target: sgm.modules.diffusionmodules.model.Encoder
          params:
            attn_type: vanilla
            double_z: True
            z_channels: 4
            resolution: 256
            in_channels: 3
            out_ch: 3
            ch: 128
            ch_mult: [1, 2, 4, 4]
            num_res_blocks: 2
            attn_resolutions: []
            dropout: 0.0
        decoder_config:
          target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
          params:
            attn_type: vanilla
            double_z: True
            z_channels: 4
            resolution: 256
            in_channels: 3
            out_ch: 3
            ch: 128
            ch_mult: [1, 2, 4, 4]
            num_res_blocks: 2
            attn_resolutions: []
            dropout: 0.0
            video_kernel_size: [3, 1, 1]

    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
          params:
            sigma_max: 700.0

        guider_config:
          target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
          params:
            max_scale: 3.0
            min_scale: 1.5
scripts/sampling/configs/svd_xt_image_decoder.yaml
DELETED
@@ -1,129 +0,0 @@
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.18215
    disable_first_stage_autocast: True
    ckpt_path: checkpoints/svd_xt_image_decoder.safetensors

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.Denoiser
      params:
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise

    network_config:
      target: sgm.modules.diffusionmodules.video_model.VideoUNet
      params:
        adm_in_channels: 768
        num_classes: sequential
        use_checkpoint: True
        in_channels: 8
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        spatial_transformer_attn_type: softmax-xformers
        extra_ff_mix_layer: True
        use_spatial_context: True
        merge_strategy: learned_with_images
        video_kernel_size: [3, 1, 1]

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
        - is_trainable: False
          input_key: cond_frames_without_noise
          target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
          params:
            n_cond_frames: 1
            n_copies: 1
            open_clip_embedding_config:
              target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
              params:
                freeze: True

        - input_key: fps_id
          is_trainable: False
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256

        - input_key: motion_bucket_id
          is_trainable: False
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256

        - input_key: cond_frames
          is_trainable: False
          target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
          params:
            disable_encoder_autocast: True
            n_cond_frames: 1
            n_copies: 1
            is_ae: True
            encoder_config:
              target: sgm.models.autoencoder.AutoencoderKLModeOnly
              params:
                embed_dim: 4
                monitor: val/rec_loss
                ddconfig:
                  attn_type: vanilla-xformers
                  double_z: True
                  z_channels: 4
                  resolution: 256
                  in_channels: 3
                  out_ch: 3
                  ch: 128
                  ch_mult: [1, 2, 4, 4]
                  num_res_blocks: 2
                  attn_resolutions: []
                  dropout: 0.0
                lossconfig:
                  target: torch.nn.Identity

        - input_key: cond_aug
          is_trainable: False
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: True
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
          params:
            sigma_max: 700.0

        guider_config:
          target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
          params:
            max_scale: 3.0
            min_scale: 1.5
scripts/sampling/simple_video_sample.py
DELETED
@@ -1,278 +0,0 @@
import math
import os
from glob import glob
from pathlib import Path
from typing import Optional

import cv2
import numpy as np
import torch
from einops import rearrange, repeat
from fire import Fire
from omegaconf import OmegaConf
from PIL import Image
from torchvision.transforms import ToTensor

from scripts.util.detection.nsfw_and_watermark_dectection import \
    DeepFloydDataFiltering
from sgm.inference.helpers import embed_watermark
from sgm.util import default, instantiate_from_config


def sample(
    input_path: str = "assets/test_image.png",  # Can either be image file or folder with image files
    num_frames: Optional[int] = None,
    num_steps: Optional[int] = None,
    version: str = "svd",
    fps_id: int = 6,
    motion_bucket_id: int = 127,
    cond_aug: float = 0.02,
    seed: int = 23,
    decoding_t: int = 14,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
    device: str = "cuda",
    output_folder: Optional[str] = None,
):
    """
    Simple script to generate a single sample conditioned on an image `input_path` or multiple images, one for each
    image file in folder `input_path`. If you run out of VRAM, try decreasing `decoding_t`.
    """

    if version == "svd":
        num_frames = default(num_frames, 14)
        num_steps = default(num_steps, 25)
        output_folder = default(output_folder, "outputs/simple_video_sample/svd/")
        model_config = "scripts/sampling/configs/svd.yaml"
    elif version == "svd_xt":
        num_frames = default(num_frames, 25)
        num_steps = default(num_steps, 30)
        output_folder = default(output_folder, "outputs/simple_video_sample/svd_xt/")
        model_config = "scripts/sampling/configs/svd_xt.yaml"
    elif version == "svd_image_decoder":
        num_frames = default(num_frames, 14)
        num_steps = default(num_steps, 25)
        output_folder = default(
            output_folder, "outputs/simple_video_sample/svd_image_decoder/"
        )
        model_config = "scripts/sampling/configs/svd_image_decoder.yaml"
    elif version == "svd_xt_image_decoder":
        num_frames = default(num_frames, 25)
        num_steps = default(num_steps, 30)
        output_folder = default(
            output_folder, "outputs/simple_video_sample/svd_xt_image_decoder/"
        )
        model_config = "scripts/sampling/configs/svd_xt_image_decoder.yaml"
    else:
        raise ValueError(f"Version {version} does not exist.")

    model, filter = load_model(
        model_config,
        device,
        num_frames,
        num_steps,
    )
    torch.manual_seed(seed)

    path = Path(input_path)
    all_img_paths = []
    if path.is_file():
        if any([input_path.endswith(x) for x in ["jpg", "jpeg", "png"]]):
            all_img_paths = [input_path]
        else:
            raise ValueError("Path is not valid image file.")
    elif path.is_dir():
        all_img_paths = sorted(
            [
                f
                for f in path.iterdir()
                if f.is_file() and f.suffix.lower() in [".jpg", ".jpeg", ".png"]
            ]
        )
        if len(all_img_paths) == 0:
            raise ValueError("Folder does not contain any images.")
    else:
        raise ValueError

    for input_img_path in all_img_paths:
        with Image.open(input_img_path) as image:
            if image.mode == "RGBA":
                image = image.convert("RGB")
            w, h = image.size

            if h % 64 != 0 or w % 64 != 0:
                width, height = map(lambda x: x - x % 64, (w, h))
                image = image.resize((width, height))
                print(
                    f"WARNING: Your image is of size {h}x{w} which is not divisible by 64. We are resizing to {height}x{width}!"
                )

            image = ToTensor()(image)
            image = image * 2.0 - 1.0

        image = image.unsqueeze(0).to(device)
        H, W = image.shape[2:]
        assert image.shape[1] == 3
        F = 8
        C = 4
        shape = (num_frames, C, H // F, W // F)
        if (H, W) != (576, 1024):
            print(
                "WARNING: The conditioning frame you provided is not 576x1024. This leads to suboptimal performance as model was only trained on 576x1024. Consider increasing `cond_aug`."
            )
        if motion_bucket_id > 255:
            print(
                "WARNING: High motion bucket! This may lead to suboptimal performance."
            )

        if fps_id < 5:
            print("WARNING: Small fps value! This may lead to suboptimal performance.")

        if fps_id > 30:
            print("WARNING: Large fps value! This may lead to suboptimal performance.")

        value_dict = {}
        value_dict["motion_bucket_id"] = motion_bucket_id
        value_dict["fps_id"] = fps_id
        value_dict["cond_aug"] = cond_aug
        value_dict["cond_frames_without_noise"] = image
        value_dict["cond_frames"] = image + cond_aug * torch.randn_like(image)
        value_dict["cond_aug"] = cond_aug

        with torch.no_grad():
            with torch.autocast(device):
                batch, batch_uc = get_batch(
                    get_unique_embedder_keys_from_conditioner(model.conditioner),
                    value_dict,
                    [1, num_frames],
                    T=num_frames,
                    device=device,
                )
                c, uc = model.conditioner.get_unconditional_conditioning(
                    batch,
                    batch_uc=batch_uc,
                    force_uc_zero_embeddings=[
                        "cond_frames",
                        "cond_frames_without_noise",
                    ],
                )

                for k in ["crossattn", "concat"]:
                    uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames)
                    uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames)
                    c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames)
                    c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames)

                randn = torch.randn(shape, device=device)

                additional_model_inputs = {}
                additional_model_inputs["image_only_indicator"] = torch.zeros(
                    2, num_frames
                ).to(device)
                additional_model_inputs["num_video_frames"] = batch["num_video_frames"]

                def denoiser(input, sigma, c):
                    return model.denoiser(
                        model.model, input, sigma, c, **additional_model_inputs
                    )

                samples_z = model.sampler(denoiser, randn, cond=c, uc=uc)
                model.en_and_decode_n_samples_a_time = decoding_t
                samples_x = model.decode_first_stage(samples_z)
                samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)

                os.makedirs(output_folder, exist_ok=True)
                base_count = len(glob(os.path.join(output_folder, "*.mp4")))
                video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")
                writer = cv2.VideoWriter(
                    video_path,
                    cv2.VideoWriter_fourcc(*"MP4V"),
                    fps_id + 1,
                    (samples.shape[-1], samples.shape[-2]),
                )

                samples = embed_watermark(samples)
                samples = filter(samples)
                vid = (
                    (rearrange(samples, "t c h w -> t h w c") * 255)
                    .cpu()
                    .numpy()
                    .astype(np.uint8)
                )
                for frame in vid:
                    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                    writer.write(frame)
                writer.release()


def get_unique_embedder_keys_from_conditioner(conditioner):
    return list(set([x.input_key for x in conditioner.embedders]))


def get_batch(keys, value_dict, N, T, device):
    batch = {}
    batch_uc = {}

    for key in keys:
        if key == "fps_id":
            batch[key] = (
                torch.tensor([value_dict["fps_id"]])
                .to(device)
                .repeat(int(math.prod(N)))
            )
        elif key == "motion_bucket_id":
            batch[key] = (
                torch.tensor([value_dict["motion_bucket_id"]])
                .to(device)
                .repeat(int(math.prod(N)))
            )
        elif key == "cond_aug":
            batch[key] = repeat(
                torch.tensor([value_dict["cond_aug"]]).to(device),
                "1 -> b",
                b=math.prod(N),
            )
        elif key == "cond_frames":
            batch[key] = repeat(value_dict["cond_frames"], "1 ... -> b ...", b=N[0])
        elif key == "cond_frames_without_noise":
            batch[key] = repeat(
                value_dict["cond_frames_without_noise"], "1 ... -> b ...", b=N[0]
            )
        else:
            batch[key] = value_dict[key]

    if T is not None:
        batch["num_video_frames"] = T

    for key in batch.keys():
        if key not in batch_uc and isinstance(batch[key], torch.Tensor):
            batch_uc[key] = torch.clone(batch[key])
    return batch, batch_uc


def load_model(
    config: str,
    device: str,
    num_frames: int,
    num_steps: int,
):
    config = OmegaConf.load(config)
    if device == "cuda":
        config.model.params.conditioner_config.params.emb_models[
            0
        ].params.open_clip_embedding_config.params.init_device = device

    config.model.params.sampler_config.params.num_steps = num_steps
    config.model.params.sampler_config.params.guider_config.params.num_frames = (
        num_frames
    )
    if device == "cuda":
        with torch.device(device):
            model = instantiate_from_config(config.model).to(device).eval()
    else:
        model = instantiate_from_config(config.model).to(device).eval()

    filter = DeepFloydDataFiltering(verbose=False, device=device)
    return model, filter


if __name__ == "__main__":
    Fire(sample)
scripts/tests/attention.py
DELETED
@@ -1,319 +0,0 @@
import einops
import torch
import torch.nn.functional as F
import torch.utils.benchmark as benchmark
from torch.backends.cuda import SDPBackend

from sgm.modules.attention import BasicTransformerBlock, SpatialTransformer


def benchmark_attn():
    # Lets define a helpful benchmarking function:
    # https://pytorch.org/tutorials/intermediate/scaled_dot_product_attention_tutorial.html
    device = "cuda" if torch.cuda.is_available() else "cpu"

    def benchmark_torch_function_in_microseconds(f, *args, **kwargs):
        t0 = benchmark.Timer(
            stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f}
        )
        return t0.blocked_autorange().mean * 1e6

    # Lets define the hyper-parameters of our input
    batch_size = 32
    max_sequence_len = 1024
    num_heads = 32
    embed_dimension = 32

    dtype = torch.float16

    query = torch.rand(
        batch_size,
        num_heads,
        max_sequence_len,
        embed_dimension,
        device=device,
        dtype=dtype,
    )
    key = torch.rand(
        batch_size,
        num_heads,
        max_sequence_len,
        embed_dimension,
        device=device,
        dtype=dtype,
    )
    value = torch.rand(
        batch_size,
        num_heads,
        max_sequence_len,
        embed_dimension,
        device=device,
        dtype=dtype,
    )

    print(f"q/k/v shape:", query.shape, key.shape, value.shape)

    # Lets explore the speed of each of the 3 implementations
    from torch.backends.cuda import SDPBackend, sdp_kernel

    # Helpful arguments mapper
    backend_map = {
        SDPBackend.MATH: {
            "enable_math": True,
            "enable_flash": False,
            "enable_mem_efficient": False,
        },
        SDPBackend.FLASH_ATTENTION: {
            "enable_math": False,
            "enable_flash": True,
            "enable_mem_efficient": False,
        },
        SDPBackend.EFFICIENT_ATTENTION: {
            "enable_math": False,
            "enable_flash": False,
            "enable_mem_efficient": True,
        },
    }

    from torch.profiler import ProfilerActivity, profile, record_function

    activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]

    print(
        f"The default implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds"
    )
    with profile(
        activities=activities, record_shapes=False, profile_memory=True
    ) as prof:
        with record_function("Default detailed stats"):
            for _ in range(25):
                o = F.scaled_dot_product_attention(query, key, value)
        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

    print(
        f"The math implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds"
    )
    with sdp_kernel(**backend_map[SDPBackend.MATH]):
        with profile(
            activities=activities, record_shapes=False, profile_memory=True
        ) as prof:
            with record_function("Math implmentation stats"):
                for _ in range(25):
                    o = F.scaled_dot_product_attention(query, key, value)
            print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

    with sdp_kernel(**backend_map[SDPBackend.FLASH_ATTENTION]):
        try:
            print(
                f"The flash attention implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds"
            )
        except RuntimeError:
            print("FlashAttention is not supported. See warnings for reasons.")
        with profile(
            activities=activities, record_shapes=False, profile_memory=True
        ) as prof:
            with record_function("FlashAttention stats"):
                for _ in range(25):
                    o = F.scaled_dot_product_attention(query, key, value)
            print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

    with sdp_kernel(**backend_map[SDPBackend.EFFICIENT_ATTENTION]):
        try:
            print(
                f"The memory efficient implementation runs in {benchmark_torch_function_in_microseconds(F.scaled_dot_product_attention, query, key, value):.3f} microseconds"
            )
        except RuntimeError:
            print("EfficientAttention is not supported. See warnings for reasons.")
        with profile(
            activities=activities, record_shapes=False, profile_memory=True
        ) as prof:
            with record_function("EfficientAttention stats"):
                for _ in range(25):
                    o = F.scaled_dot_product_attention(query, key, value)
            print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))


def run_model(model, x, context):
    return model(x, context)


def benchmark_transformer_blocks():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    import torch.utils.benchmark as benchmark

    def benchmark_torch_function_in_microseconds(f, *args, **kwargs):
        t0 = benchmark.Timer(
            stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f}
        )
        return t0.blocked_autorange().mean * 1e6

    checkpoint = True
    compile = False

    batch_size = 32
    h, w = 64, 64
    context_len = 77
    embed_dimension = 1024
    context_dim = 1024
    d_head = 64

    transformer_depth = 4

    n_heads = embed_dimension // d_head

    dtype = torch.float16

    model_native = SpatialTransformer(
        embed_dimension,
        n_heads,
        d_head,
        context_dim=context_dim,
        use_linear=True,
        use_checkpoint=checkpoint,
        attn_type="softmax",
        depth=transformer_depth,
        sdp_backend=SDPBackend.FLASH_ATTENTION,
    ).to(device)
    model_efficient_attn = SpatialTransformer(
        embed_dimension,
        n_heads,
        d_head,
        context_dim=context_dim,
        use_linear=True,
        depth=transformer_depth,
        use_checkpoint=checkpoint,
        attn_type="softmax-xformers",
    ).to(device)
    if not checkpoint and compile:
        print("compiling models")
        model_native = torch.compile(model_native)
        model_efficient_attn = torch.compile(model_efficient_attn)

    x = torch.rand(batch_size, embed_dimension, h, w, device=device, dtype=dtype)
    c = torch.rand(batch_size, context_len, context_dim, device=device, dtype=dtype)

    from torch.profiler import ProfilerActivity, profile, record_function

    activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]

    with torch.autocast("cuda"):
        print(
            f"The native model runs in {benchmark_torch_function_in_microseconds(model_native.forward, x, c):.3f} microseconds"
        )
        print(
            f"The efficientattn model runs in {benchmark_torch_function_in_microseconds(model_efficient_attn.forward, x, c):.3f} microseconds"
        )

        print(75 * "+")
        print("NATIVE")
        print(75 * "+")
        torch.cuda.reset_peak_memory_stats()
        with profile(
            activities=activities, record_shapes=False, profile_memory=True
        ) as prof:
            with record_function("NativeAttention stats"):
                for _ in range(25):
                    model_native(x, c)
        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
        print(torch.cuda.max_memory_allocated() * 1e-9, "GB used by native block")

        print(75 * "+")
        print("Xformers")
        print(75 * "+")
        torch.cuda.reset_peak_memory_stats()
        with profile(
            activities=activities, record_shapes=False, profile_memory=True
        ) as prof:
            with record_function("xformers stats"):
                for _ in range(25):
                    model_efficient_attn(x, c)
        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
        print(torch.cuda.max_memory_allocated() * 1e-9, "GB used by xformers block")


def test01():
    # conv1x1 vs linear
    from sgm.util import count_params

    conv = torch.nn.Conv2d(3, 32, kernel_size=1).cuda()
    print(count_params(conv))
    linear = torch.nn.Linear(3, 32).cuda()
    print(count_params(linear))

    print(conv.weight.shape)

    # use same initialization
    linear.weight = torch.nn.Parameter(conv.weight.squeeze(-1).squeeze(-1))
    linear.bias = torch.nn.Parameter(conv.bias)

    print(linear.weight.shape)

    x = torch.randn(11, 3, 64, 64).cuda()

    xr = einops.rearrange(x, "b c h w -> b (h w) c").contiguous()
    print(xr.shape)
    out_linear = linear(xr)
    print(out_linear.mean(), out_linear.shape)

    out_conv = conv(x)
    print(out_conv.mean(), out_conv.shape)
    print("done with test01.\n")
| 263 |
-
def test02():
|
| 264 |
-
# try cosine flash attention
|
| 265 |
-
import time
|
| 266 |
-
|
| 267 |
-
torch.backends.cuda.matmul.allow_tf32 = True
|
| 268 |
-
torch.backends.cudnn.allow_tf32 = True
|
| 269 |
-
torch.backends.cudnn.benchmark = True
|
| 270 |
-
print("testing cosine flash attention...")
|
| 271 |
-
DIM = 1024
|
| 272 |
-
SEQLEN = 4096
|
| 273 |
-
BS = 16
|
| 274 |
-
|
| 275 |
-
print(" softmax (vanilla) first...")
|
| 276 |
-
model = BasicTransformerBlock(
|
| 277 |
-
dim=DIM,
|
| 278 |
-
n_heads=16,
|
| 279 |
-
d_head=64,
|
| 280 |
-
dropout=0.0,
|
| 281 |
-
context_dim=None,
|
| 282 |
-
attn_mode="softmax",
|
| 283 |
-
).cuda()
|
| 284 |
-
try:
|
| 285 |
-
x = torch.randn(BS, SEQLEN, DIM).cuda()
|
| 286 |
-
tic = time.time()
|
| 287 |
-
y = model(x)
|
| 288 |
-
toc = time.time()
|
| 289 |
-
print(y.shape, toc - tic)
|
| 290 |
-
except RuntimeError as e:
|
| 291 |
-
# likely oom
|
| 292 |
-
print(str(e))
|
| 293 |
-
|
| 294 |
-
print("\n now flash-cosine...")
|
| 295 |
-
model = BasicTransformerBlock(
|
| 296 |
-
dim=DIM,
|
| 297 |
-
n_heads=16,
|
| 298 |
-
d_head=64,
|
| 299 |
-
dropout=0.0,
|
| 300 |
-
context_dim=None,
|
| 301 |
-
attn_mode="flash-cosine",
|
| 302 |
-
).cuda()
|
| 303 |
-
x = torch.randn(BS, SEQLEN, DIM).cuda()
|
| 304 |
-
tic = time.time()
|
| 305 |
-
y = model(x)
|
| 306 |
-
toc = time.time()
|
| 307 |
-
print(y.shape, toc - tic)
|
| 308 |
-
print("done with test02.\n")
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
if __name__ == "__main__":
|
| 312 |
-
# test01()
|
| 313 |
-
# test02()
|
| 314 |
-
# test03()
|
| 315 |
-
|
| 316 |
-
# benchmark_attn()
|
| 317 |
-
benchmark_transformer_blocks()
|
| 318 |
-
|
| 319 |
-
print("done.")
|
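The deleted benchmark above is built around torch.utils.benchmark.Timer and F.scaled_dot_product_attention, both part of stock PyTorch. A minimal, self-contained sketch of the same timing pattern (tensor shapes below are illustrative only, not taken from the deleted file, and a recent PyTorch with an optional CUDA device is assumed):

import torch
import torch.nn.functional as F
import torch.utils.benchmark as benchmark


def time_in_microseconds(f, *args, **kwargs):
    # Same pattern as the deleted helper: Timer picks the run count adaptively.
    t = benchmark.Timer(
        stmt="f(*args, **kwargs)", globals={"f": f, "args": args, "kwargs": kwargs}
    )
    return t.blocked_autorange().mean * 1e6


device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32
# (batch, heads, seq_len, head_dim) -- illustrative sizes only
q = torch.rand(8, 8, 256, 64, device=device, dtype=dtype)
k = torch.rand(8, 8, 256, 64, device=device, dtype=dtype)
v = torch.rand(8, 8, 256, 64, device=device, dtype=dtype)
print(f"SDPA: {time_in_microseconds(F.scaled_dot_product_attention, q, k, v):.3f} us")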

scripts/util/__init__.py
DELETED
File without changes

scripts/util/detection/__init__.py
DELETED
File without changes

scripts/util/detection/nsfw_and_watermark_dectection.py
DELETED
@@ -1,110 +0,0 @@
import os

import clip
import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image

RESOURCES_ROOT = "scripts/util/detection/"


def predict_proba(X, weights, biases):
    logits = X @ weights.T + biases
    proba = np.where(
        logits >= 0, 1 / (1 + np.exp(-logits)), np.exp(logits) / (1 + np.exp(logits))
    )
    return proba.T


def load_model_weights(path: str):
    model_weights = np.load(path)
    return model_weights["weights"], model_weights["biases"]


def clip_process_images(images: torch.Tensor) -> torch.Tensor:
    min_size = min(images.shape[-2:])
    return T.Compose(
        [
            T.CenterCrop(min_size),  # TODO: this might affect the watermark, check this
            T.Resize(224, interpolation=T.InterpolationMode.BICUBIC, antialias=True),
            T.Normalize(
                (0.48145466, 0.4578275, 0.40821073),
                (0.26862954, 0.26130258, 0.27577711),
            ),
        ]
    )(images)


class DeepFloydDataFiltering(object):
    def __init__(
        self, verbose: bool = False, device: torch.device = torch.device("cpu")
    ):
        super().__init__()
        self.verbose = verbose
        self._device = None
        self.clip_model, _ = clip.load("ViT-L/14", device=device)
        self.clip_model.eval()

        self.cpu_w_weights, self.cpu_w_biases = load_model_weights(
            os.path.join(RESOURCES_ROOT, "w_head_v1.npz")
        )
        self.cpu_p_weights, self.cpu_p_biases = load_model_weights(
            os.path.join(RESOURCES_ROOT, "p_head_v1.npz")
        )
        self.w_threshold, self.p_threshold = 0.5, 0.5

    @torch.inference_mode()
    def __call__(self, images: torch.Tensor) -> torch.Tensor:
        imgs = clip_process_images(images)
        if self._device is None:
            self._device = next(p for p in self.clip_model.parameters()).device
        image_features = self.clip_model.encode_image(imgs.to(self._device))
        image_features = image_features.detach().cpu().numpy().astype(np.float16)
        p_pred = predict_proba(image_features, self.cpu_p_weights, self.cpu_p_biases)
        w_pred = predict_proba(image_features, self.cpu_w_weights, self.cpu_w_biases)
        print(f"p_pred = {p_pred}, w_pred = {w_pred}") if self.verbose else None
        query = p_pred > self.p_threshold
        if query.sum() > 0:
            print(f"Hit for p_threshold: {p_pred}") if self.verbose else None
            images[query] = T.GaussianBlur(99, sigma=(100.0, 100.0))(images[query])
        query = w_pred > self.w_threshold
        if query.sum() > 0:
            print(f"Hit for w_threshold: {w_pred}") if self.verbose else None
            images[query] = T.GaussianBlur(99, sigma=(100.0, 100.0))(images[query])
        return images


def load_img(path: str) -> torch.Tensor:
    image = Image.open(path)
    if not image.mode == "RGB":
        image = image.convert("RGB")
    image_transforms = T.Compose(
        [
            T.ToTensor(),
        ]
    )
    return image_transforms(image)[None, ...]


def test(root):
    from einops import rearrange

    filter = DeepFloydDataFiltering(verbose=True)
    for p in os.listdir((root)):
        print(f"running on {p}...")
        img = load_img(os.path.join(root, p))
        filtered_img = filter(img)
        filtered_img = rearrange(
            255.0 * (filtered_img.numpy())[0], "c h w -> h w c"
        ).astype(np.uint8)
        Image.fromarray(filtered_img).save(
            os.path.join(root, f"{os.path.splitext(p)[0]}-filtered.jpg")
        )


if __name__ == "__main__":
    import fire

    fire.Fire(test)
    print("done.")
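The predict_proba helper in the deleted file is a numerically stable logistic-regression head applied to CLIP image features. A tiny self-contained sketch of that calculation follows; the random weights, the small scale factor, and the 768-dimensional feature size are stand-ins chosen for illustration, not values from the deleted *_head_v1.npz files:

import numpy as np


def predict_proba(X, weights, biases):
    # Logistic head over feature vectors; the sigmoid is written in a split
    # form so the value returned for each sample stays numerically stable.
    logits = X @ weights.T + biases
    proba = np.where(
        logits >= 0, 1 / (1 + np.exp(-logits)), np.exp(logits) / (1 + np.exp(logits))
    )
    return proba.T


rng = np.random.default_rng(0)
X = rng.normal(size=(4, 768)).astype(np.float32)                 # 4 stand-in CLIP embeddings
weights = (0.01 * rng.normal(size=(1, 768))).astype(np.float32)  # stand-in classifier head
biases = np.zeros(1, dtype=np.float32)
print(predict_proba(X, weights, biases))                         # shape (1, 4), values in (0, 1)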
scripts/util/detection/p_head_v1.npz
DELETED
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b4653a64d5f85d8d4c5f6c5ec175f1c5c5e37db8f38d39b2ed8b5979da7fdc76
size 3588

scripts/util/detection/w_head_v1.npz
DELETED
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b6af23687aa347073e692025f405ccc48c14aadc5dbe775b3312041006d496d1
size 3588