Spaces:

TempoFunk
/

makeavid-sd-jax

Runtime error

App Files Files Community

lopho committed on May 6, 2023

Commit

b83ebfb

1 Parent(s): 5b09d17

nicer defaults, selectable scheduler, image cfg separate

Browse files

Files changed (5) hide show

README.md +1 -1
app.py +127 -73
example.webp +2 -2
example_input.png +0 -0
makeavid_sd/inference.py +91 -55

README.md CHANGED Viewed

@@ -12,7 +12,7 @@ library_name: diffusers
 pipeline_tag: text-to-video
 datasets:
 - TempoFunk/tempofunk-sdance
-- TempoFunk/tempofunk-m
 models:
 - TempoFunk/makeavid-sd-jax
 - runwayml/stable-diffusion-v1-5

 pipeline_tag: text-to-video
 datasets:
 - TempoFunk/tempofunk-sdance
+- TempoFunk/small
 models:
 - TempoFunk/makeavid-sd-jax
 - runwayml/stable-diffusion-v1-5

app.py CHANGED Viewed

@@ -7,7 +7,11 @@ from functools import partial
 from PIL import Image, ImageOps
 import gradio as gr
-from makeavid_sd.inference import InferenceUNetPseudo3D, FlaxDPMSolverMultistepScheduler, jnp
 print(os.environ.get('XLA_PYTHON_CLIENT_PREALLOCATE', 'NotSet'))
 print(os.environ.get('XLA_PYTHON_CLIENT_ALLOCATOR', 'NotSet'))
@@ -17,8 +21,7 @@ _preheat: bool = False
 _seen_compilations = set()
 _model = InferenceUNetPseudo3D(
-        model_path = 'TempoFunk/makeavid-sd-jax',
-        scheduler_cls = FlaxDPMSolverMultistepScheduler,
         dtype = jnp.float16,
         hf_auth_token = os.environ.get('HUGGING_FACE_HUB_TOKEN', None)
 )
@@ -30,69 +33,85 @@ if _model.failed != False:
     demo.launch()
 # gradio is illiterate. type hints make it go poopoo in pantsu.
 def generate(
         prompt = 'An elderly man having a great time in the park.',
         neg_prompt = '',
-        image = None,
         inference_steps = 20,
-        cfg = 12.0,
         seed = 0,
         fps = 24,
         num_frames = 24,
         height = 512,
-        width = 512
 ) -> str:
     height = int(height)
     width = int(width)
-    num_frames = int(num_frames)
-    seed = int(seed)
     height = (height // 64) * 64
     width = (width // 64) * 64
     if seed < 0:
         seed = -seed
-    inference_steps = int(inference_steps)
-    hint_image = image
     if hint_image is not None:
         if hint_image.mode != 'RGB':
             hint_image = hint_image.convert('RGB')
         if hint_image.size != (width, height):
             hint_image = ImageOps.fit(hint_image, (width, height), method = Image.Resampling.LANCZOS)
     images = _model.generate(
             prompt = [prompt] * _model.device_count,
             neg_prompt = neg_prompt,
             hint_image = hint_image,
-            mask_image = None,
             inference_steps = inference_steps,
             cfg = cfg,
             height = height,
             width = width,
             num_frames = num_frames,
-            seed = seed
     )
     _seen_compilations.add((hint_image is None, inference_steps, height, width, num_frames))
     buffer = BytesIO()
-    images[0].save(
             buffer,
-            format = 'webp',
             save_all = True,
-            append_images = images[1:],
             loop = 0,
             duration = round(1000 / fps),
             allow_mixed = True
     )
     data = base64.b64encode(buffer.getvalue()).decode()
-    data = 'data:image/webp;base64,' + data
     buffer.close()
     return data
-def check_if_compiled(image, inference_steps, height, width, num_frames, message):
     height = int(height)
     width = int(width)
     height = (height // 64) * 64
     width = (width // 64) * 64
-    hint_image = image
-    if (hint_image is None, inference_steps, height, width, num_frames) in _seen_compilations:
         return ''
     else:
         return  f"""{message}"""
@@ -126,19 +145,19 @@ with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled =
                         # Make-A-Video Stable Diffusion JAX
                         We have extended a pretrained LDM inpainting image generation model with temporal convolutions and attention.
-                        We take advantage of the extra 5 input channels of the inpaint model to guide the video generation with a hint image and mask.
-                        The hint image can be given by the user, otherwise it is generated by an generative image model.
-                        The temporal convolution and attention is a port of [Make-A-Video Pytorch](https://github.com/lucidrains/make-a-video-pytorch/blob/main/make_a_video_pytorch) to FLAX.
-                        It is a pseudo 3D convolution that seperately convolves accross the spatial dimension in 2D and over the temporal dimension in 1D.
-                        Temporal attention is purely self attention and also separately attends to time and space.
                         Only the new temporal layers have been fine tuned on a dataset of videos themed around dance.
-                        The model has been trained for 60 epochs on a dataset of 10,000 Videos with 120 frames each, randomly selecting a 24 frame range from each sample.
                         See model and dataset links in the metadata.
-                        Model implementation and training code can be found at [https://github.com/lopho/makeavid-sd-tpu](https://github.com/lopho/makeavid-sd-tpu)
             """)
         with gr.Column():
             intro3 = gr.Markdown("""
@@ -151,40 +170,44 @@ with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled =
                         Changes to the following parameters require the model to compile
                         - Number of frames
                         - Width & Height
-                        - Steps
                         - Input image vs. no input image
             """)
     with gr.Row(variant = variant):
-        with gr.Column(variant = variant):
             with gr.Row():
                 #cancel_button = gr.Button(value = 'Cancel')
                 submit_button = gr.Button(value = 'Make A Video', variant = 'primary')
             prompt_input = gr.Textbox(
                     label = 'Prompt',
-                    value = 'They are dancing in the club while sweat drips from the ceiling.',
                     interactive = True
             )
             neg_prompt_input = gr.Textbox(
                     label = 'Negative prompt (optional)',
-                    value = '',
                     interactive = True
             )
-            inference_steps_input = gr.Slider(
-                label = 'Steps',
-                minimum = 2,
-                maximum = 100,
-                value = 20,
-                step = 1
-            )
             cfg_input = gr.Slider(
-                    label = 'Guidance scale',
                     minimum = 1.0,
                     maximum = 20.0,
                     step = 0.1,
                     value = 15.0,
                     interactive = True
             )
             seed_input = gr.Number(
                     label = 'Random seed',
                     value = 0,
@@ -192,43 +215,68 @@ with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled =
                     precision = 0
             )
             image_input = gr.Image(
-                    label = 'Input image (optional)',
                     interactive = True,
                     image_mode = 'RGB',
                     type = 'pil',
                     optional = True,
-                    source = 'upload'
             )
             num_frames_input = gr.Slider(
                     label = 'Number of frames to generate',
                     minimum = 1,
                     maximum = 24,
                     step = 1,
-                    value = 24
             )
             width_input = gr.Slider(
                     label = 'Width',
                     minimum = 64,
-                    maximum = 512,
                     step = 64,
-                    value = 448
             )
             height_input = gr.Slider(
                     label = 'Height',
                     minimum = 64,
-                    maximum = 512,
                     step = 64,
-                    value = 448
             )
-            fps_input = gr.Slider(
-                    label = 'Output FPS',
-                    minimum = 1,
-                    maximum = 1000,
-                    step = 1,
-                    value = 12
             )
-        with gr.Column(variant = variant):
-            #no_gpu = gr.Markdown('**Until a GPU is assigned expect extremely long runtimes up to 1h+**')
             #will_trigger = gr.Markdown('')
             patience = gr.Markdown('**Please be patient. The model might have to compile with current parameters.**')
             image_output = gr.Image(
@@ -236,33 +284,39 @@ with gr.Blocks(title = 'Make-A-Video Stable Diffusion JAX', analytics_enabled =
                     value = 'example.webp',
                     interactive = False
             )
-    #trigger_inputs = [ image_input, inference_steps_input, height_input, width_input, num_frames_input ]
-    #trigger_check_fun = partial(check_if_compiled, message = 'Current parameters will trigger compilation.')
     #height_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
     #width_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
     #num_frames_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
     #image_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
     #inference_steps_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
-    #will_trigger.value = trigger_check_fun(image_input.value, inference_steps_input.value, height_input.value, width_input.value, num_frames_input.value)
-    ev = submit_button.click(
-        fn = generate,
-        inputs = [
-                prompt_input,
-                neg_prompt_input,
-                image_input,
-                inference_steps_input,
-                cfg_input,
-                seed_input,
-                fps_input,
-                num_frames_input,
-                height_input,
-                width_input
-        ],
-        outputs = image_output,
-        postprocess = False
     )
     #cancel_button.click(fn = lambda: None, cancels = ev)
-demo.queue(concurrency_count = 1, max_size = 32)
 demo.launch()

 from PIL import Image, ImageOps
 import gradio as gr
+from makeavid_sd.inference import (
+        InferenceUNetPseudo3D,
+        jnp,
+        SCHEDULERS
+)
 print(os.environ.get('XLA_PYTHON_CLIENT_PREALLOCATE', 'NotSet'))
 print(os.environ.get('XLA_PYTHON_CLIENT_ALLOCATOR', 'NotSet'))
 _seen_compilations = set()
 _model = InferenceUNetPseudo3D(
+        model_path = '/mnt/work1/make_a_vid/makeavid-space/model/model',
         dtype = jnp.float16,
         hf_auth_token = os.environ.get('HUGGING_FACE_HUB_TOKEN', None)
 )
     demo.launch()
# Animated image container formats the UI may request; anything else falls back to 'webp'.
_output_formats = (
    'webp', 'gif'
)

# gradio is illiterate. type hints make it go poopoo in pantsu.
def generate(
        prompt = 'An elderly man having a great time in the park.',
        neg_prompt = '',
        hint_image = None,
        inference_steps = 20,
        cfg = 15.0,
        cfg_image = 9.0,
        seed = 0,
        fps = 24,
        num_frames = 24,
        height = 512,
        width = 512,
        scheduler_type = 'DPM',
        output_format = 'webp'
) -> str:
    """Generate an animation with the global model and return it as a data URI.

    Parameters are left unannotated on purpose (see the comment above the
    function). Numeric inputs are coerced to ``int`` where the model expects
    integers, ``height`` and ``width`` are rounded down to a multiple of 64,
    both guidance scales are clamped to at least 1.0 and a negative seed is
    replaced by its absolute value. An unknown ``scheduler_type`` falls back
    to ``'DPM'`` and an unknown ``output_format`` falls back to ``'webp'``.

    Args:
        prompt: Text prompt; repeated once per device before being passed on.
        neg_prompt: Negative prompt, forwarded unchanged.
        hint_image: Optional PIL image; converted to RGB and fitted to
            ``(width, height)`` when it does not already match.
        inference_steps: Number of sampling steps.
        cfg: Guidance scale forwarded as ``cfg``.
        cfg_image: Guidance scale forwarded as ``cfg_image``.
        seed: Random seed.
        fps: Playback rate; the per-frame duration is ``round(1000 / fps)`` ms.
        num_frames: Number of frames requested from the model.
        height: Requested output height in pixels.
        width: Requested output width in pixels.
        scheduler_type: Key into ``SCHEDULERS``.
        output_format: One of ``_output_formats`` (case-insensitive).

    Returns:
        A ``data:image/<format>;base64,...`` string holding the animation.
    """
    num_frames = int(num_frames)
    inference_steps = int(inference_steps)
    # Round the spatial size down to the nearest multiple of 64.
    height = (int(height) // 64) * 64
    width = (int(width) // 64) * 64
    cfg = max(cfg, 1.0)
    cfg_image = max(cfg_image, 1.0)
    seed = abs(int(seed))
    if hint_image is not None:
        if hint_image.mode != 'RGB':
            hint_image = hint_image.convert('RGB')
        if hint_image.size != (width, height):
            hint_image = ImageOps.fit(hint_image, (width, height), method = Image.Resampling.LANCZOS)
    if scheduler_type not in SCHEDULERS:
        scheduler_type = 'DPM'
    output_format = output_format.lower()
    if output_format not in _output_formats:
        output_format = 'webp'
    mask_image = None
    images = _model.generate(
            prompt = [prompt] * _model.device_count,
            neg_prompt = neg_prompt,
            hint_image = hint_image,
            mask_image = mask_image,
            inference_steps = inference_steps,
            cfg = cfg,
            cfg_image = cfg_image,
            height = height,
            width = width,
            num_frames = num_frames,
            seed = seed,
            scheduler_type = scheduler_type
    )
    # The key must have the same shape as the one check_if_compiled looks up,
    # which includes the scheduler type; without it the lookup never matches.
    _seen_compilations.add(
            (hint_image is None, inference_steps, height, width, num_frames, scheduler_type)
    )
    # The first returned frame is skipped, as before.
    # NOTE(review): the reason for dropping frame 0 is not visible here - confirm.
    # With a single returned frame there is nothing to skip, so keep it instead
    # of failing with an IndexError.
    frames = images[1:] if len(images) > 1 else images
    # The context manager closes the buffer even if saving raises.
    with BytesIO() as buffer:
        frames[0].save(
                buffer,
                format = output_format,
                save_all = True,
                append_images = frames[1:],
                loop = 0,
                duration = round(1000 / fps),
                allow_mixed = True
        )
        data = base64.b64encode(buffer.getvalue()).decode()
    return f'data:image/{output_format};base64,' + data
def check_if_compiled(hint_image, inference_steps, height, width, num_frames, scheduler_type, message):
    """Return ``message`` unless this parameter combination is already recorded.

    ``height`` and ``width`` are converted to ``int`` and rounded down to a
    multiple of 64 and ``inference_steps`` is converted to ``int`` before the
    lookup in the module-level ``_seen_compilations`` set. An empty string is
    returned when the combination is found there.
    """
    snapped_height = int(height)
    snapped_width = int(width)
    steps = int(inference_steps)
    snapped_height = (snapped_height // 64) * 64
    snapped_width = (snapped_width // 64) * 64
    lookup_key = (
            hint_image is None,
            steps,
            snapped_height,
            snapped_width,
            num_frames,
            scheduler_type
    )
    if lookup_key not in _seen_compilations:
        return f"""{message}"""
    return ''
                         # Make-A-Video Stable Diffusion JAX
                         We have extended a pretrained LDM inpainting image generation model with temporal convolutions and attention.
+                        By taking advantage of the extra 5 input channels of the inpaint model, we guide the video generation with a hint image.
+                        In this demo the hint image can be given by the user, otherwise it is generated by an generative image model.
+                        The temporal layers are a port of [Make-A-Video PyTorch](https://github.com/lucidrains/make-a-video-pytorch) to FLAX.
+                        The convolution is pseudo 3D and seperately convolves accross the spatial dimension in 2D and over the temporal dimension in 1D.
+                        Temporal attention is purely self attention and also separately attends to time.
                         Only the new temporal layers have been fine tuned on a dataset of videos themed around dance.
+                        The model has been trained for 80 epochs on a dataset of 18,000 Videos with 120 frames each, randomly selecting a 24 frame range from each sample.
                         See model and dataset links in the metadata.
+                        Model implementation and training code can be found at <https://github.com/lopho/makeavid-sd-tpu>
             """)
         with gr.Column():
             intro3 = gr.Markdown("""
                         Changes to the following parameters require the model to compile
                         - Number of frames
                         - Width & Height
+                        - Inference steps
                         - Input image vs. no input image
+                        - Noise scheduler type
+                        If you encounter any issues, please report them here: [Space discussions](https://huggingface.co/spaces/TempoFunk/makeavid-sd-jax/discussions)
             """)
     with gr.Row(variant = variant):
+        with gr.Column():
             with gr.Row():
                 #cancel_button = gr.Button(value = 'Cancel')
                 submit_button = gr.Button(value = 'Make A Video', variant = 'primary')
             prompt_input = gr.Textbox(
                     label = 'Prompt',
+                    value = 'They are dancing in the club but everybody is a 3d cg  hairy monster wearing a hairy costume.',
                     interactive = True
             )
             neg_prompt_input = gr.Textbox(
                     label = 'Negative prompt (optional)',
+                    value = 'monochrome, saturated',
                     interactive = True
             )
             cfg_input = gr.Slider(
+                    label = 'Guidance scale video',
                     minimum = 1.0,
                     maximum = 20.0,
                     step = 0.1,
                     value = 15.0,
                     interactive = True
             )
+            cfg_image_input = gr.Slider(
+                    label = 'Guidance scale hint (no effect with input image)',
+                    minimum = 1.0,
+                    maximum = 20.0,
+                    step = 0.1,
+                    value = 9.0,
+                    interactive = True
+            )
             seed_input = gr.Number(
                     label = 'Random seed',
                     value = 0,
                     precision = 0
             )
             image_input = gr.Image(
+                    label = 'Hint image (optional)',
                     interactive = True,
                     image_mode = 'RGB',
                     type = 'pil',
                     optional = True,
+                    source = 'upload',
+                    value = 'example_input.png'
+            )
+            inference_steps_input = gr.Slider(
+                    label = 'Steps',
+                    minimum = 2,
+                    maximum = 100,
+                    value = 20,
+                    step = 1,
+                    interactive = True
             )
             num_frames_input = gr.Slider(
                     label = 'Number of frames to generate',
                     minimum = 1,
                     maximum = 24,
                     step = 1,
+                    value = 24,
+                    interactive = True
             )
             width_input = gr.Slider(
                     label = 'Width',
                     minimum = 64,
+                    maximum = 576,
                     step = 64,
+                    value = 512,
+                    interactive = True
             )
             height_input = gr.Slider(
                     label = 'Height',
                     minimum = 64,
+                    maximum = 576,
                     step = 64,
+                    value = 512,
+                    interactive = True
             )
+            scheduler_input = gr.Dropdown(
+                    label = 'Noise scheduler',
+                    choices = list(SCHEDULERS.keys()),
+                    value = 'DPM',
+                    interactive = True
             )
+            with gr.Row():
+                fps_input = gr.Slider(
+                        label = 'Output FPS',
+                        minimum = 1,
+                        maximum = 1000,
+                        step = 1,
+                        value = 12,
+                        interactive = True
+                )
+                output_format = gr.Dropdown(
+                        label = 'Output format',
+                        choices = _output_formats,
+                        value = 'gif',
+                        interactive = True
+                )
+        with gr.Column():
             #will_trigger = gr.Markdown('')
             patience = gr.Markdown('**Please be patient. The model might have to compile with current parameters.**')
             image_output = gr.Image(
                     value = 'example.webp',
                     interactive = False
             )
+    #trigger_inputs = [ image_input, inference_steps_input, height_input, width_input, num_frames_input, scheduler_input ]
+    #trigger_check_fun = partial(check_if_compiled, message = 'Current parameters need compilation.')
     #height_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
     #width_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
     #num_frames_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
     #image_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
     #inference_steps_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
+    #scheduler_input.change(fn = trigger_check_fun, inputs = trigger_inputs, outputs = will_trigger)
+    submit_button.click(
+            fn = generate,
+            inputs = [
+                    prompt_input,
+                    neg_prompt_input,
+                    image_input,
+                    inference_steps_input,
+                    cfg_input,
+                    cfg_image_input,
+                    seed_input,
+                    fps_input,
+                    num_frames_input,
+                    height_input,
+                    width_input,
+                    scheduler_input,
+                    output_format
+            ],
+            outputs = image_output,
+            postprocess = False
     )
     #cancel_button.click(fn = lambda: None, cancels = ev)
+demo.queue(concurrency_count = 1, max_size = 12)
 demo.launch()
+# Photorealistic fantasy oil painting of  the angry minotaur in a threatening pose by Randy Vargas.
+# A girl is dancing by a beautiful lake by sophie anderson and greg rutkowski and alphonse mucha.
+# They are dancing in the club but everybody is a 3d cg  hairy monster wearing a hairy costume.

example.webp CHANGED Viewed

Git LFS Details

SHA256: e04074345eb8c6157398eef5db65167ebaa29356c16a087555d4058cbe2cad6a
Pointer size: 132 Bytes
Size of remote file: 1.2 MB

Git LFS Details

SHA256: ffd7cb93989a8e311395799f6d6e566e698ad7654f9f5a471196d8c781f46c1f
Pointer size: 132 Bytes
Size of remote file: 1.45 MB

example_input.png ADDED Viewed

makeavid_sd/inference.py CHANGED Viewed

@@ -1,5 +1,5 @@
-from typing import Any, Union, Tuple, List, Dict
 import os
 import gc
 from functools import partial
@@ -17,13 +17,14 @@ import einops
 from diffusers import FlaxAutoencoderKL, FlaxUNet2DConditionModel
 from diffusers import (
         FlaxDDIMScheduler,
-        FlaxDDPMScheduler,
         FlaxPNDMScheduler,
         FlaxLMSDiscreteScheduler,
         FlaxDPMSolverMultistepScheduler,
-        FlaxKarrasVeScheduler,
-        FlaxScoreSdeVeScheduler
 )
 from transformers import FlaxCLIPTextModel, CLIPTokenizer
@@ -31,14 +32,31 @@ from .flax_impl.flax_unet_pseudo3d_condition import UNetPseudo3DConditionModel
 SchedulerType = Union[
         FlaxDDIMScheduler,
-        FlaxDDPMScheduler,
         FlaxPNDMScheduler,
         FlaxLMSDiscreteScheduler,
         FlaxDPMSolverMultistepScheduler,
-        FlaxKarrasVeScheduler,
-        FlaxScoreSdeVeScheduler
 ]
 def dtypestr(x: jnp.dtype):
     if x == jnp.float32: return 'float32'
     elif x == jnp.float16: return 'float16'
@@ -53,7 +71,6 @@ def castto(dtype, m, x):
 class InferenceUNetPseudo3D:
     def __init__(self,
             model_path: str,
-            scheduler_cls: SchedulerType = FlaxDDIMScheduler,
             dtype: jnp.dtype = jnp.float16,
             hf_auth_token: Union[str, None] = None
     ) -> None:
@@ -129,28 +146,27 @@ class InferenceUNetPseudo3D:
                 subfolder = 'tokenizer',
                 use_auth_token = self.hf_auth_token
         )
-        scheduler, scheduler_state = scheduler_cls.from_pretrained(
-                self.model_path,
-                subfolder = 'scheduler',
-                dtype = jnp.float32,
-                use_auth_token = self.hf_auth_token
-        )
-        self.scheduler: scheduler_cls = scheduler
-        self.params['scheduler'] = scheduler_state
         self.vae_scale_factor: int = int(2 ** (len(self.vae.config.block_out_channels) - 1))
         self.device_count = jax.device_count()
         gc.collect()
-    def set_scheduler(self, scheduler_cls: SchedulerType) -> None:
-        scheduler, scheduler_state = scheduler_cls.from_pretrained(
-                self.model_path,
-                subfolder = 'scheduler',
-                dtype = jnp.float32,
-                use_auth_token = self.hf_auth_token
-        )
-        self.scheduler: scheduler_cls = scheduler
-        self.params['scheduler'] = scheduler_state
     def prepare_inputs(self,
             prompt: List[str],
             neg_prompt: List[str],
@@ -213,11 +229,13 @@ class InferenceUNetPseudo3D:
             hint_image: Union[Image.Image, List[Image.Image], None] = None,
             mask_image: Union[Image.Image, List[Image.Image], None] = None,
             neg_prompt: Union[str, List[str]] = '',
-            cfg: float = 10.0,
             num_frames: int = 24,
             width: int = 512,
             height: int = 512,
-            seed: int = 0
     ) -> List[List[Image.Image]]:
         assert inference_steps > 0, f'number of inference steps must be > 0 but is {inference_steps}'
         assert num_frames > 0, f'number of frames must be > 0 but is {num_frames}'
@@ -243,6 +261,7 @@ class InferenceUNetPseudo3D:
         if isinstance(neg_prompt, str):
             neg_prompt = [ neg_prompt ] * batch_size
         assert len(neg_prompt) == batch_size, f'number of negative prompts must be equal to batch size {batch_size} but is {len(neg_prompt)}'
         tokens, neg_tokens, hint, mask = self.prepare_inputs(
                 prompt = prompt,
                 neg_prompt = neg_prompt,
@@ -251,11 +270,14 @@ class InferenceUNetPseudo3D:
                 width = width,
                 height = height
         )
         # NOTE splitting rngs is not deterministic,
         # running on different device counts gives different seeds
         #rng = jax.random.PRNGKey(seed)
         #rngs = jax.random.split(rng, self.device_count)
-        # manually assign seeded RNGs to devices for reproducability
         rngs = jnp.array([ jax.random.PRNGKey(seed + i) for i in range(self.device_count) ])
         params = jax_utils.replicate(self.params)
         tokens = shard(tokens)
@@ -272,9 +294,11 @@ class InferenceUNetPseudo3D:
             height,
             width,
             cfg,
             rngs,
             params,
-            use_imagegen
         )
         if images.ndim == 5:
             images = einops.rearrange(images, 'd f c h w -> (d f) h w c')
@@ -295,9 +319,11 @@ class InferenceUNetPseudo3D:
             height,
             width,
             cfg: float,
             rng: jax.random.KeyArray,
             params: Union[Dict[str, Any], FrozenDict[str, Any]],
-            use_imagegen: bool
     ) -> List[Image.Image]:
         batch_size = tokens.shape[0]
         latent_h = height // self.vae_scale_factor
@@ -312,15 +338,18 @@ class InferenceUNetPseudo3D:
         encoded_prompt = self.text_encoder(tokens, params = params['text_encoder'])[0]
         encoded_neg_prompt = self.text_encoder(neg_tokens, params = params['text_encoder'])[0]
         if use_imagegen:
             image_latent_shape = (batch_size, self.vae.config.latent_channels, latent_h, latent_w)
             image_latents = jax.random.normal(
                     rng,
                     shape = image_latent_shape,
                     dtype = jnp.float32
-            ) * params['scheduler'].init_noise_sigma
-            image_scheduler_state = self.scheduler.set_timesteps(
-                    params['scheduler'],
                     num_inference_steps = inference_steps,
                     shape = image_latents.shape
             )
@@ -328,21 +357,21 @@ class InferenceUNetPseudo3D:
                 image_latents, image_scheduler_state = args
                 t = image_scheduler_state.timesteps[step]
                 tt = jnp.broadcast_to(t, image_latents.shape[0])
-                latents_input = self.scheduler.scale_model_input(image_scheduler_state, image_latents, t)
                 noise_pred = self.imunet.apply(
-                        {'params': params['imunet']},
                         latents_input,
                         tt,
                         encoder_hidden_states = encoded_prompt
                 ).sample
                 noise_pred_uncond = self.imunet.apply(
-                        {'params': params['imunet']},
                         latents_input,
                         tt,
                         encoder_hidden_states = encoded_neg_prompt
                 ).sample
                 noise_pred = noise_pred_uncond + cfg * (noise_pred - noise_pred_uncond)
-                image_latents, image_scheduler_state = self.scheduler.step(
                         image_scheduler_state,
                         noise_pred.astype(jnp.float32),
                         t,
@@ -357,7 +386,7 @@ class InferenceUNetPseudo3D:
             hint = image_latents
         else:
             hint = self.vae.apply(
-                    {'params': params['vae']},
                     hint,
                     method = self.vae.encode
             ).latent_dist.mean * self.vae.config.scaling_factor
@@ -375,9 +404,9 @@ class InferenceUNetPseudo3D:
                 rng,
                 shape = latent_shape,
                 dtype = jnp.float32
-        ) * params['scheduler'].init_noise_sigma
-        scheduler_state = self.scheduler.set_timesteps(
-                params['scheduler'],
                 num_inference_steps = inference_steps,
                 shape = latents.shape
         )
@@ -386,7 +415,7 @@ class InferenceUNetPseudo3D:
             latents, scheduler_state = args
             t = scheduler_state.timesteps[step]#jnp.array(scheduler_state.timesteps, dtype = jnp.int32)[step]
             tt = jnp.broadcast_to(t, latents.shape[0])
-            latents_input = self.scheduler.scale_model_input(scheduler_state, latents, t)
             latents_input = jnp.concatenate([latents_input, mask, hint], axis = 1)
             noise_pred = self.unet.apply(
                     { 'params': params['unet'] },
@@ -401,7 +430,7 @@ class InferenceUNetPseudo3D:
                     encoded_neg_prompt
             ).sample
             noise_pred = noise_pred_uncond + cfg * (noise_pred - noise_pred_uncond)
-            latents, scheduler_state = self.scheduler.step(
                     scheduler_state,
                     noise_pred.astype(jnp.float32),
                     t,
@@ -453,9 +482,11 @@ class InferenceUNetPseudo3D:
                 None,   #  7 height
                 None,   #  8 width
                 None,   #  9 cfg
-                0,      # 10 rng
-                0,      # 11 params
-                None,   # 12 use_imagegen
         ),
         static_broadcasted_argnums = ( # trigger recompilation on change
                 0,      # inference_class
@@ -463,7 +494,8 @@ class InferenceUNetPseudo3D:
                 6,      # num_frames
                 7,      # height
                 8,      # width
-                12,     # use_imagegen
         )
 )
 def _p_generate(
@@ -472,14 +504,16 @@ def _p_generate(
         neg_tokens,
         hint,
         mask,
-        inference_steps,
-        num_frames,
-        height,
-        width,
-        cfg,
         rng,
         params,
-        use_imagegen
 ):
     return inference_class._generate(
             tokens,
@@ -491,8 +525,10 @@ def _p_generate(
             height,
             width,
             cfg,
             rng,
             params,
-            use_imagegen
     )

+from typing import Any, Union, Optional, Tuple, List, Dict
 import os
 import gc
 from functools import partial
 from diffusers import FlaxAutoencoderKL, FlaxUNet2DConditionModel
 from diffusers import (
         FlaxDDIMScheduler,
         FlaxPNDMScheduler,
         FlaxLMSDiscreteScheduler,
         FlaxDPMSolverMultistepScheduler,
 )
+from diffusers.schedulers.scheduling_ddim_flax import DDIMSchedulerState
+from diffusers.schedulers.scheduling_pndm_flax import PNDMSchedulerState
+from diffusers.schedulers.scheduling_lms_discrete_flax import LMSDiscreteSchedulerState
+from diffusers.schedulers.scheduling_dpmsolver_multistep_flax import DPMSolverMultistepSchedulerState
 from transformers import FlaxCLIPTextModel, CLIPTokenizer
 SchedulerType = Union[
         FlaxDDIMScheduler,
         FlaxPNDMScheduler,
         FlaxLMSDiscreteScheduler,
         FlaxDPMSolverMultistepScheduler,
 ]
+SchedulerStateType = Union[
+        DDIMSchedulerState,
+        PNDMSchedulerState,
+        LMSDiscreteSchedulerState,
+        DPMSolverMultistepSchedulerState,
+]
+SCHEDULERS: Dict[str, SchedulerType] = {
+        'DPM': FlaxDPMSolverMultistepScheduler, # husbando
+        'DDIM': FlaxDDIMScheduler,
+        #'PLMS': FlaxPNDMScheduler, # it's not correctly implemented in diffusers, output is bad, but at least it "works"
+        #'LMS': FlaxLMSDiscreteScheduler, # borked
+        #    image_latents, image_scheduler_state = scheduler.step(
+        #    File "/mnt/work1/make_a_vid/makeavid-space/.venv/lib/python3.10/site-packages/diffusers/schedulers/scheduling_lms_discrete_flax.py", line 255, in step
+        #    order = min(timestep + 1, order)
+        #    jax._src.errors.ConcretizationTypeError: Abstract tracer value encountered where concrete value is expected: Traced<ShapedArray(bool[])>with<DynamicJaxprTrace(level=1/1)>
+        #    The problem arose with the `bool` function.
+        # The error occurred while tracing the function scanned_fun at /mnt/work1/make_a_vid/makeavid-space/.venv/lib/python3.10/site-packages/jax/_src/lax/control_flow/loops.py:1668 for scan. This concrete value was not available in Python because it depends on the values of the arguments loop_carry[0] and loop_carry[1][1].timesteps
+}
 def dtypestr(x: jnp.dtype):
     if x == jnp.float32: return 'float32'
     elif x == jnp.float16: return 'float16'
 class InferenceUNetPseudo3D:
     def __init__(self,
             model_path: str,
             dtype: jnp.dtype = jnp.float16,
             hf_auth_token: Union[str, None] = None
     ) -> None:
                 subfolder = 'tokenizer',
                 use_auth_token = self.hf_auth_token
         )
+        self.schedulers: Dict[str, Dict[str, SchedulerType]] = {}
+        for scheduler_name in SCHEDULERS:
+            if scheduler_name not in ['KarrasVe', 'SDEVe']:
+                scheduler, scheduler_state = SCHEDULERS[scheduler_name].from_pretrained(
+                        self.model_path,
+                        subfolder = 'scheduler',
+                        dtype = jnp.float32,
+                        use_auth_token = self.hf_auth_token
+                )
+            else:
+                scheduler, scheduler_state = SCHEDULERS[scheduler_name].from_pretrained(
+                        self.model_path,
+                        subfolder = 'scheduler',
+                        use_auth_token = self.hf_auth_token
+                )
+            self.schedulers[scheduler_name] = scheduler
+            self.params[scheduler_name] = scheduler_state
         self.vae_scale_factor: int = int(2 ** (len(self.vae.config.block_out_channels) - 1))
         self.device_count = jax.device_count()
         gc.collect()
     def prepare_inputs(self,
             prompt: List[str],
             neg_prompt: List[str],
             hint_image: Union[Image.Image, List[Image.Image], None] = None,
             mask_image: Union[Image.Image, List[Image.Image], None] = None,
             neg_prompt: Union[str, List[str]] = '',
+            cfg: float = 15.0,
+            cfg_image: Optional[float] = None,
             num_frames: int = 24,
             width: int = 512,
             height: int = 512,
+            seed: int = 0,
+            scheduler_type: str = 'DDIM'
     ) -> List[List[Image.Image]]:
         assert inference_steps > 0, f'number of inference steps must be > 0 but is {inference_steps}'
         assert num_frames > 0, f'number of frames must be > 0 but is {num_frames}'
         if isinstance(neg_prompt, str):
             neg_prompt = [ neg_prompt ] * batch_size
         assert len(neg_prompt) == batch_size, f'number of negative prompts must be equal to batch size {batch_size} but is {len(neg_prompt)}'
+        assert scheduler_type in SCHEDULERS, f'unknown type of noise scheduler: {scheduler_type}, must be one of {list(SCHEDULERS.keys())}'
         tokens, neg_tokens, hint, mask = self.prepare_inputs(
                 prompt = prompt,
                 neg_prompt = neg_prompt,
                 width = width,
                 height = height
         )
+        if cfg_image is None:
+            cfg_image = cfg
+        #params['scheduler'] = scheduler_state
         # NOTE splitting rngs is not deterministic,
         # running on different device counts gives different seeds
         #rng = jax.random.PRNGKey(seed)
         #rngs = jax.random.split(rng, self.device_count)
+        # manually assign seeded RNGs to devices for reproducibility
         rngs = jnp.array([ jax.random.PRNGKey(seed + i) for i in range(self.device_count) ])
         params = jax_utils.replicate(self.params)
         tokens = shard(tokens)
             height,
             width,
             cfg,
+            cfg_image,
             rngs,
             params,
+            use_imagegen,
+            scheduler_type,
         )
         if images.ndim == 5:
             images = einops.rearrange(images, 'd f c h w -> (d f) h w c')
             height,
             width,
             cfg: float,
+            cfg_image: float,
             rng: jax.random.KeyArray,
             params: Union[Dict[str, Any], FrozenDict[str, Any]],
+            use_imagegen: bool,
+            scheduler_type: str
     ) -> List[Image.Image]:
         batch_size = tokens.shape[0]
         latent_h = height // self.vae_scale_factor
         encoded_prompt = self.text_encoder(tokens, params = params['text_encoder'])[0]
         encoded_neg_prompt = self.text_encoder(neg_tokens, params = params['text_encoder'])[0]
+        scheduler = self.schedulers[scheduler_type]
+        scheduler_state = params[scheduler_type]
         if use_imagegen:
             image_latent_shape = (batch_size, self.vae.config.latent_channels, latent_h, latent_w)
             image_latents = jax.random.normal(
                     rng,
                     shape = image_latent_shape,
                     dtype = jnp.float32
+            ) * scheduler_state.init_noise_sigma
+            image_scheduler_state = scheduler.set_timesteps(
+                    scheduler_state,
                     num_inference_steps = inference_steps,
                     shape = image_latents.shape
             )
                 image_latents, image_scheduler_state = args
                 t = image_scheduler_state.timesteps[step]
                 tt = jnp.broadcast_to(t, image_latents.shape[0])
+                latents_input = scheduler.scale_model_input(image_scheduler_state, image_latents, t)
                 noise_pred = self.imunet.apply(
+                        { 'params': params['imunet']} ,
                         latents_input,
                         tt,
                         encoder_hidden_states = encoded_prompt
                 ).sample
                 noise_pred_uncond = self.imunet.apply(
+                        { 'params': params['imunet'] },
                         latents_input,
                         tt,
                         encoder_hidden_states = encoded_neg_prompt
                 ).sample
                 noise_pred = noise_pred_uncond + cfg * (noise_pred - noise_pred_uncond)
+                image_latents, image_scheduler_state = scheduler.step(
                         image_scheduler_state,
                         noise_pred.astype(jnp.float32),
                         t,
             hint = image_latents
         else:
             hint = self.vae.apply(
+                    { 'params': params['vae'] },
                     hint,
                     method = self.vae.encode
             ).latent_dist.mean * self.vae.config.scaling_factor
                 rng,
                 shape = latent_shape,
                 dtype = jnp.float32
+        ) * scheduler_state.init_noise_sigma
+        scheduler_state = scheduler.set_timesteps(
+                scheduler_state,
                 num_inference_steps = inference_steps,
                 shape = latents.shape
         )
             latents, scheduler_state = args
             t = scheduler_state.timesteps[step]#jnp.array(scheduler_state.timesteps, dtype = jnp.int32)[step]
             tt = jnp.broadcast_to(t, latents.shape[0])
+            latents_input = scheduler.scale_model_input(scheduler_state, latents, t)
             latents_input = jnp.concatenate([latents_input, mask, hint], axis = 1)
             noise_pred = self.unet.apply(
                     { 'params': params['unet'] },
                     encoded_neg_prompt
             ).sample
             noise_pred = noise_pred_uncond + cfg * (noise_pred - noise_pred_uncond)
+            latents, scheduler_state = scheduler.step(
                     scheduler_state,
                     noise_pred.astype(jnp.float32),
                     t,
                 None,   #  7 height
                 None,   #  8 width
                 None,   #  9 cfg
+                None,   # 10 cfg_image
+                0,      # 11 rng
+                0,      # 12 params
+                None,   # 13 use_imagegen
+                None,   # 14 scheduler_type
         ),
         static_broadcasted_argnums = ( # trigger recompilation on change
                 0,      # inference_class
                 6,      # num_frames
                 7,      # height
                 8,      # width
+                13,     # use_imagegen
+                14,     # scheduler_type
         )
 )
 def _p_generate(
         neg_tokens,
         hint,
         mask,
+        inference_steps: int,
+        num_frames: int,
+        height: int,
+        width: int,
+        cfg: float,
+        cfg_image: float,
         rng,
         params,
+        use_imagegen: bool,
+        scheduler_type: str
 ):
     return inference_class._generate(
             tokens,
             height,
             width,
             cfg,
+            cfg_image,
             rng,
             params,
+            use_imagegen,
+            scheduler_type
     )