orronai committed on
Commit 8d5a128 · 1 Parent(s): 492742b

feat: add application files

Files changed (5)
  1. README.md +6 -9
  2. app.py +207 -0
  3. requirements.txt +6 -0
  4. utils/flux.py +374 -0
  5. utils/sd3.py +264 -0
README.md CHANGED
@@ -1,14 +1,11 @@
- ---
  title: FlowOpt
- emoji: 📈
- colorFrom: purple
- colorTo: pink
+ emoji: 📚
+ colorFrom: blue
+ colorTo: green
  sdk: gradio
- sdk_version: 5.48.0
+ sdk_version: 5.8.0
  app_file: app.py
  pinned: false
  license: mit
- short_description: 'FlowOpt Gradio: Fast Optimization for Training-Free Editing'
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ hf_oauth: true
+ short_description: 'FlowOpt Gradio: Fast-Optimization for Training-Free Editing.'
app.py ADDED
@@ -0,0 +1,207 @@
+ import os
+ import random
+ from typing import Tuple
+
+ import numpy as np
+ import spaces
+ import torch
+ from diffusers import FluxPipeline, StableDiffusion3Pipeline
+ from PIL import Image
+
+ import gradio as gr
+ from utils.flux import flux_editing
+ from utils.sd3 import sd3_editing
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ pipe_sd3 = StableDiffusion3Pipeline.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16, token=os.getenv('HF_ACCESS_TOK'))
+ pipe_flux = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.float16, token=os.getenv('HF_ACCESS_TOK'))
+
+
+ def seed_everything(seed: int) -> None:
+     """
+     Set the random seed for reproducibility.
+
+     Args:
+         seed (int): The seed value to set.
+     """
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed_all(seed)
+
+ def on_T_steps_change(T_steps: int) -> gr.update:
+     """
+     Update the maximum value of the n_max slider based on the T_steps value.
+
+     Args:
+         T_steps (int): The current value of the T_steps slider.
+     Returns:
+         gr.update: An update object to modify the n_max slider's maximum value.
+     """
+     return gr.update(maximum=T_steps)
+
+ def on_model_change(model_type: str) -> Tuple[int, int, float]:
+     if model_type == 'SD3':
+         T_steps_value = 15
+         n_max_value = 12
+         eta_value = 0.01
+     elif model_type == 'FLUX':
+         T_steps_value = 15
+         n_max_value = 13
+         eta_value = 0.0025
+     else:
+         raise NotImplementedError(f"Model type {model_type} not implemented")
+
+     return T_steps_value, n_max_value, eta_value
+
+ def get_examples():
+     case = [
+         ["inputs/corgi_walking.png", "FLUX", 15, 13, 0.0025, 7, "A cute brown and white dog walking on a sidewalk near a body of water. The dog is wearing a pink vest, adding a touch of color to the scene.", "A cute brown and white guinea pig walking on a sidewalk near a body of water. The guinea pig is wearing a pink vest, adding a touch of color to the scene.", 1.0, 3.5, [(f"example_outputs/corgi_walking/guinea_pig/flux_iterations={i}.png", f"Iteration {i}") for i in range(8)]],
+         ["inputs/corgi_walking.png", "SD3", 15, 12, 0.01, 7, "A cute brown and white dog walking on a sidewalk near a body of water. The dog is wearing a pink vest, adding a touch of color to the scene.", "A cute brown and white rabbit walking on a sidewalk near a body of water. The rabbit is wearing a pink vest, adding a touch of color to the scene.", 1.0, 3.5, [(f"example_outputs/corgi_walking/rabbit/sd3_iterations={i}.png", f"Iteration {i}") for i in range(8)]],
+         ["inputs/puppies.png", "FLUX", 15, 13, 0.0025, 7, "Two adorable golden retriever puppies sitting in a grassy field. They are positioned close to each other, with one dog on the left and the other on the right. Both dogs have their mouths open, possibly panting.", "Two adorable crochet golden retriever puppies sitting in a grassy field. They are positioned close to each other, with one dog on the left and the other on the right. Both dogs have their mouths open, possibly panting or enjoying the outdoor environment.", 1.0, 3.5, [(f"example_outputs/puppies/crochet/flux_iterations={i}.png", f"Iteration {i}") for i in range(8)]],
+         ["inputs/puppies.png", "SD3", 15, 12, 0.01, 5, "Two adorable golden retriever puppies sitting in a grassy field. They are positioned close to each other, with one dog on the left and the other on the right. Both dogs have their mouths open, possibly panting.", "Two adorable husky puppies sitting in a grassy field. They are positioned close to each other, with one dog on the left and the other on the right. Both dogs have their mouths open, possibly panting or enjoying the outdoor environment.", 1.0, 3.5, [(f"example_outputs/puppies/husky/sd3_iterations={i}.png", f"Iteration {i}") for i in range(6)]],
+         ["inputs/iguana.png", "FLUX", 15, 13, 0.0025, 7, "A large orange lizard sitting on a rock near the ocean. The lizard is positioned in the center of the scene, with the ocean waves visible in the background. The rock is located close to the water, providing a picturesque setting for the lizard's resting spot.", "A large crochet lizard sitting on a rock near the ocean. The lizard is positioned in the center of the scene, with the ocean waves visible in the background. The rock is located close to the water, providing a picturesque setting for the lizard's resting spot.", 1.0, 3.5, [(f"example_outputs/iguana/crochet/flux_iterations={i}.png", f"Iteration {i}") for i in range(8)]],
+         ["inputs/iguana.png", "FLUX", 15, 13, 0.0025, 7, "A large orange lizard sitting on a rock near the ocean. The lizard is positioned in the center of the scene, with the ocean waves visible in the background. The rock is located close to the water, providing a picturesque setting for the lizard's resting spot.", "A large lizard made out of lego bricks sitting on a rock near the ocean. The lizard is positioned in the center of the scene, with the ocean waves visible in the background. The rock is located close to the water, providing a picturesque setting for the lizard's resting spot.", 1.0, 3.5, [(f"example_outputs/iguana/lego_bricks/flux_iterations={i}.png", f"Iteration {i}") for i in range(8)]],
+         ["inputs/cow_grass2.png", "FLUX", 15, 12, 0.0025, 6, "A large brown and white cow standing in a grassy field. The cow is positioned towards the center of the scene. The field is lush and green, providing a perfect environment for the cow to graze.", "A large cow made out of colorful toy bricks standing in a grassy field. The colorful toy brick cow is positioned towards the center of the scene. The field is lush and green, providing a perfect environment for the cow to graze.", 1.0, 3.5, [(f"example_outputs/cow_grass2/colorful_toy_bricks/flux_iterations={i}.png", f"Iteration {i}") for i in range(7)]],
+         ["inputs/cow_grass2.png", "FLUX", 15, 13, 0.0025, 5, "A large brown and white cow standing in a grassy field. The cow is positioned towards the center of the scene. The field is lush and green, providing a perfect environment for the cow to graze.", "A large cow made out of flowers standing in a grassy field. The flower cow is positioned towards the center of the scene. The field is lush and green, providing a perfect environment for the cow to graze.", 1.0, 3.5, [(f"example_outputs/cow_grass2/flowers/flux_iterations={i}.png", f"Iteration {i}") for i in range(6)]],
+         ["inputs/cow_grass2.png", "SD3", 15, 12, 0.01, 8, "A large brown and white cow standing in a grassy field. The cow is positioned towards the center of the scene. The field is lush and green, providing a perfect environment for the cow to graze.", "A large cow made out of wooden blocks standing in a grassy field. The wooden block cow is positioned towards the center of the scene. The field is lush and green, providing a perfect environment for the cow to graze.", 1.0, 3.5, [(f"example_outputs/cow_grass2/wooden_blocks/sd3_iterations={i}.png", f"Iteration {i}") for i in range(9)]],
+         ["inputs/cat_fridge.png", "SD3", 15, 12, 0.01, 8, "A cat sitting on top of a counter in a store. The cat is positioned towards the right side of the counter, and it appears to be looking at the camera. The store has a variety of items displayed, including several bottles scattered around the counter.", "A cat sitting on top of a counter in a store, with the cat and counter crafted using origami folded paper art techniques. The cat has a delicate and intricate appearance, with paper folds used to create its fur and features. The store has a variety of items displayed, including several bottles scattered around the counter.", 1.0, 3.5, [(f"example_outputs/cat_fridge/origami/sd3_iterations={i}.png", f"Iteration {i}") for i in range(9)]],
+         ["inputs/cat.png", "FLUX", 15, 13, 0.0025, 7, "A small, fluffy kitten sitting in a grassy field. The kitten is positioned in the center of the scene, surrounded by a field. The kitten appears to be looking at something in the field.", "A small bear cub sitting in a grassy field. The bear cub is positioned in the center of the scene, surrounded by a field. The bear cub appears to be looking at something in the field.", 1.0, 3.5, [(f"example_outputs/cat/bear/flux_iterations={i}.png", f"Iteration {i}") for i in range(8)]],
+         ["inputs/cat.png", "SD3", 15, 12, 0.01, 6, "A small, fluffy kitten sitting in a grassy field. The kitten is positioned in the center of the scene, surrounded by a field. The kitten appears to be looking at something in the field.", "A small puppy sitting in a grassy field. The puppy is positioned in the center of the scene, surrounded by a field. The puppy appears to be looking at something in the field.", 1.0, 3.5, [(f"example_outputs/cat/puppy/sd3_iterations={i}.png", f"Iteration {i}") for i in range(7)]],
+         ["inputs/wolf_grass.png", "FLUX", 15, 13, 0.0025, 7, "A wolf standing in a grassy field with yellow flowers. The wolf is positioned towards the center of the scene, and its body is facing the camera. The field is filled with grass, and the yellow flowers are scattered throughout the area.", "A fox standing in a grassy field with yellow flowers. The fox is positioned towards the center of the scene, and its body is facing the camera. The field is filled with grass, and the yellow flowers are scattered throughout the area.", 1.0, 3.5, [(f"example_outputs/wolf_grass/fox/flux_iterations={i}.png", f"Iteration {i}") for i in range(8)]],
+         ["inputs/wolf_grass.png", "SD3", 15, 12, 0.01, 4, "A wolf standing in a grassy field with yellow flowers. The wolf is positioned towards the center of the scene, and its body is facing the camera. The field is filled with grass, and the yellow flowers are scattered throughout the area.", "A baby deer standing in a grassy field with yellow flowers. The baby deer is positioned towards the center of the scene, and its body is facing the camera. The field is filled with grass, and the yellow flowers are scattered throughout the area.", 1.0, 3.5, [(f"example_outputs/wolf_grass/deer/sd3_iterations={i}.png", f"Iteration {i}") for i in range(5)]],
+     ]
+     return case
+
+ @spaces.GPU(duration=200)
+ def FlowOpt_run(
+     image_src_val: str, model_type_val: str, T_steps_val: int,
+     n_max_val: int, eta_val: float, flowopt_iterations_val: int,
+     src_prompt_val: str, tar_prompt_val: str,
+     src_guidance_scale_val: float, tar_guidance_scale_val: float,
+ ):
+     if not len(src_prompt_val):
+         raise gr.Error("Source prompt cannot be empty")
+     if not len(tar_prompt_val):
+         raise gr.Error("Target prompt cannot be empty")
+
+     if model_type_val == 'FLUX':
+         pipe = pipe_flux.to(device)
+     elif model_type_val == 'SD3':
+         pipe = pipe_sd3.to(device)
+     else:
+         raise NotImplementedError(f"Model type {model_type_val} not implemented")
+
+     scheduler = pipe.scheduler
+
+     # set seed
+     seed = 1024
+     seed_everything(seed)
+     # load image
+     image = Image.open(image_src_val)
+     # crop image to have both dimensions divisible by 16 - avoids issues with resizing
+     image = image.crop((0, 0, image.width - image.width % 16, image.height - image.height % 16))
+     image_src_val = pipe.image_processor.preprocess(image)
+
+     # cast image to half precision
+     image_src_val = image_src_val.to(device).half()
+     with torch.autocast("cuda"), torch.inference_mode():
+         x0_src_denorm = pipe.vae.encode(image_src_val).latent_dist.mode()
+     x0_src = (x0_src_denorm - pipe.vae.config.shift_factor) * pipe.vae.config.scaling_factor
+     # send to cuda
+     x0_src = x0_src.to(device)
+     negative_prompt = "" # (SD3)
+
+     if model_type_val == 'SD3':
+         yield from sd3_editing(
+             pipe, scheduler, T_steps_val, n_max_val, x0_src,
+             src_prompt_val, tar_prompt_val, negative_prompt,
+             src_guidance_scale_val, tar_guidance_scale_val,
+             flowopt_iterations_val, eta_val,
+         )
+     elif model_type_val == 'FLUX':
+         yield from flux_editing(
+             pipe, scheduler, T_steps_val, n_max_val, x0_src,
+             src_prompt_val, tar_prompt_val,
+             src_guidance_scale_val, tar_guidance_scale_val,
+             flowopt_iterations_val, eta_val,
+         )
+     else:
+         raise NotImplementedError(f"Sampler type {model_type_val} not implemented")
+
+
+ intro = """
+ <h1 style="font-weight: 1000; text-align: center; margin: 0px;">FlowOpt: Fast Optimization Through Whole Flow Processes for Training-Free Editing</h1>
+ <h3 style="margin-bottom: 10px; text-align: center;">
+     <a href="">[Paper]</a>&nbsp;|&nbsp;
+     <a href="https://orronai.github.io/FlowOpt/">[Project Page]</a>&nbsp;|&nbsp;
+     <a href="https://github.com/orronai/FlowOpt">[Code]</a>
+ </h3>
+ <br> 🎨 Edit your image using FlowOpt for Flow models! Upload an image, add a description of it, and specify the edits you want to make.
+ <h3>Notes:</h3>
+ <ol>
+     <li>We use FLUX.1 dev and SD3 for the demo. The models are large and may take a while to load.</li>
+     <li>We recommend 1024x1024 images for the best results. If the input images are too large, there may be out-of-memory errors. For other resolutions, we encourage you to find a suitable set of hyperparameters.</li>
+     <li>Default hyperparameters for each model used in the paper are provided as examples.</li>
+ </ol>
+ """
+
+ css="""
+ #col-container {
+     margin: 0 auto;
+     max-width: 960px;
+ }
+ """
+ with gr.Blocks(css=css) as demo:
+     with gr.Column(elem_id="col-container"):
+         gr.HTML(intro)
+
+         with gr.Row():
+             with gr.Column():
+                 image_src = gr.Image(type="filepath", label="Source Image", value="inputs/cat.png",)
+                 src_prompt = gr.Textbox(lines=2, label="Source Prompt", value="A cat sitting in the grass")
+                 tar_prompt = gr.Textbox(lines=2, label="Target Prompt", value="A puppy sitting in the grass")
+                 submit_button = gr.Button("Run FlowOpt", variant="primary")
+
+                 with gr.Row():
+                     model_type = gr.Dropdown(["FLUX", "SD3"], label="Model Type", value="FLUX")
+                     T_steps = gr.Slider(value=15, minimum=10, maximum=50, step=1, label="Total Steps", info="Total number of discretization steps.")
+                     n_max = gr.Slider(value=13, minimum=1, maximum=15, step=1, label="n_max", info="Control the strength of the edit.")
+                     eta = gr.Slider(value=0.0025, minimum=0.0001, maximum=0.05, label="eta", info="Control the optimization step-size.")
+                     flowopt_iterations = gr.Number(value=10, minimum=1, maximum=15, label="flowopt_iterations", info="Max number of FlowOpt iterations")
+
+             with gr.Column():
+                 image_tar = gr.Gallery(
+                     label="Outputs", show_label=True, format="png",
+                     columns=[3], rows=[3], height="auto",
+                 )
+                 with gr.Accordion(label="Advanced Settings", open=False):
+                     src_guidance_scale = gr.Slider(value=1.0, minimum=0.0, maximum=15.0, label="src_guidance_scale", info="Source prompt CFG scale.")
+                     tar_guidance_scale = gr.Slider(value=3.5, minimum=1.0, maximum=15.0, label="tar_guidance_scale", info="Target prompt CFG scale.")
+
+         submit_button.click(
+             fn=FlowOpt_run,
+             inputs=[
+                 image_src, model_type, T_steps, n_max, eta, flowopt_iterations,
+                 src_prompt, tar_prompt, src_guidance_scale, tar_guidance_scale,
+             ],
+             outputs=[image_tar],
+         )
+
+         gr.Examples(
+             label="Examples",
+             examples=get_examples(),
+             inputs=[
+                 image_src, model_type, T_steps, n_max, eta,
+                 flowopt_iterations, src_prompt, tar_prompt,
+                 src_guidance_scale, tar_guidance_scale, image_tar,
+             ],
+             outputs=[image_tar],
+         )
+
+     model_type.input(fn=on_model_change, inputs=[model_type], outputs=[T_steps, n_max, eta])
+     T_steps.change(fn=on_T_steps_change, inputs=[T_steps], outputs=[n_max])
+
+ demo.queue()
+ demo.launch()
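
The `FlowOpt_run` handler above is a generator: each `yield history` pushes the growing list of intermediate edits to the `gr.Gallery` as soon as an iteration finishes, instead of waiting for all FlowOpt iterations. A minimal, self-contained sketch of that streaming pattern is shown below; the `fake_edit` function and its placeholder images are hypothetical stand-ins and not part of this Space.

```python
# Minimal sketch of a generator-style Gradio handler that streams results
# to a Gallery, mirroring FlowOpt_run's `yield history` pattern.
import gradio as gr
from PIL import Image

def fake_edit(n_iters: float):
    history = []
    for i in range(int(n_iters)):
        # placeholder image standing in for the decoded FlowOpt iterate
        img = Image.new("RGB", (64, 64), color=(20 * i % 255, 100, 150))
        history.append((img, f"Iteration {i}"))
        yield history  # each yield refreshes the Gallery with all iterates so far

with gr.Blocks() as demo:
    n = gr.Number(value=5, label="iterations")
    gallery = gr.Gallery(label="Outputs")
    gr.Button("Run").click(fn=fake_edit, inputs=[n], outputs=[gallery])

if __name__ == "__main__":
    demo.launch()
```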
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ torch
+ diffusers
+ transformers
+ accelerate
+ sentencepiece
+ protobuf
utils/flux.py ADDED
@@ -0,0 +1,374 @@
+ from typing import Iterator, List, Tuple
+
+ import numpy as np
+ import torch
+ from diffusers import FlowMatchEulerDiscreteScheduler, FluxPipeline
+ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import retrieve_timesteps
+ from PIL import Image
+
+
+ @torch.no_grad()
+ def calculate_shift(
+     image_seq_len: int,
+     base_seq_len: int = 256,
+     max_seq_len: int = 4096,
+     base_shift: float = 0.5,
+     max_shift: float = 1.16,
+ ) -> float:
+     m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+     b = base_shift - m * base_seq_len
+     mu = image_seq_len * m + b
+     return mu
+
+ @torch.no_grad()
+ def calc_v_flux(
+     pipe: FluxPipeline, latents: torch.Tensor, prompt_embeds: torch.Tensor,
+     pooled_prompt_embeds: torch.Tensor, guidance: torch.Tensor,
+     text_ids: torch.Tensor, latent_image_ids: torch.Tensor, t: torch.Tensor,
+ ) -> torch.Tensor:
+     """
+     Calculate the velocity (v) for FLUX.
+
+     Args:
+         pipe (FluxPipeline): The FLUX pipeline.
+         latents (torch.Tensor): The latent tensor at the current timestep.
+         prompt_embeds (torch.Tensor): The prompt embeddings.
+         pooled_prompt_embeds (torch.Tensor): The pooled prompt embeddings.
+         guidance (torch.Tensor): The guidance scale tensor.
+         text_ids (torch.Tensor): The text token IDs.
+         latent_image_ids (torch.Tensor): The latent image token IDs.
+         t (torch.Tensor): The current timestep.
+     Returns:
+         torch.Tensor: The predicted noise (velocity).
+     """
+     timestep = t.expand(latents.shape[0])
+
+     noise_pred = pipe.transformer(
+         hidden_states=latents,
+         timestep=timestep / 1000,
+         guidance=guidance,
+         encoder_hidden_states=prompt_embeds,
+         txt_ids=text_ids,
+         img_ids=latent_image_ids,
+         pooled_projections=pooled_prompt_embeds,
+         joint_attention_kwargs=None,
+         return_dict=False,
+     )[0]
+
+     return noise_pred
+
+ @torch.no_grad()
+ def prep_input(
+     pipe: FluxPipeline, scheduler: FlowMatchEulerDiscreteScheduler,
+     T_steps: int, x0_src: torch.Tensor, src_prompt: str,
+     src_guidance_scale: float,
+ ) -> Tuple[
+     torch.Tensor, torch.Tensor, torch.Tensor, int, int,
+     torch.Tensor, torch.Tensor, torch.Tensor,
+ ]:
+     """
+     Prepare the input tensors for the FLUX pipeline.
+
+     Args:
+         pipe (FluxPipeline): The FLUX pipeline.
+         scheduler (FlowMatchEulerDiscreteScheduler): The scheduler for the diffusion process.
+         T_steps (int): The total number of timesteps for the diffusion process.
+         x0_src (torch.Tensor): The source latent tensor.
+         src_prompt (str): The source text prompt.
+         src_guidance_scale (float): The guidance scale for classifier-free guidance.
+     Returns:
+         Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, torch.Tensor, torch.Tensor, torch.Tensor]:
+             - Prepared source latent tensor.
+             - Latent image token IDs.
+             - Timesteps tensor for the diffusion process.
+             - Original height of the input image.
+             - Original width of the input image.
+             - Source prompt embeddings.
+             - Source pooled prompt embeddings.
+             - Source text token IDs.
+     """
+     orig_height, orig_width = x0_src.shape[2] * pipe.vae_scale_factor, x0_src.shape[3] * pipe.vae_scale_factor
+     num_channels_latents = pipe.transformer.config.in_channels // 4
+
+     pipe.check_inputs(
+         prompt=src_prompt,
+         prompt_2=None,
+         height=orig_height,
+         width=orig_width,
+         callback_on_step_end_tensor_inputs=None,
+         max_sequence_length=512,
+     )
+
+     x0_src, latent_src_image_ids = pipe.prepare_latents(
+         batch_size=x0_src.shape[0], num_channels_latents=num_channels_latents,
+         height=orig_height, width=orig_width,
+         dtype=x0_src.dtype, device=x0_src.device, generator=None, latents=x0_src,
+     )
+     x0_src = pipe._pack_latents(x0_src, x0_src.shape[0], num_channels_latents, x0_src.shape[2], x0_src.shape[3])
+
+     sigmas = np.linspace(1.0, 1 / T_steps, T_steps)
+     image_seq_len = x0_src.shape[1]
+     mu = calculate_shift(
+         image_seq_len,
+         scheduler.config.base_image_seq_len,
+         scheduler.config.max_image_seq_len,
+         scheduler.config.base_shift,
+         scheduler.config.max_shift,
+     )
+     timesteps, T_steps = retrieve_timesteps(
+         scheduler,
+         T_steps,
+         x0_src.device,
+         timesteps=None,
+         sigmas=sigmas,
+         mu=mu,
+     )
+     pipe._num_timesteps = len(timesteps)
+
+     pipe._guidance_scale = src_guidance_scale
+     (
+         src_prompt_embeds,
+         src_pooled_prompt_embeds,
+         src_text_ids,
+     ) = pipe.encode_prompt(
+         prompt=src_prompt,
+         prompt_2=None,
+         device=x0_src.device,
+     )
+
+     return (
+         x0_src, latent_src_image_ids, timesteps, orig_height, orig_width,
+         src_prompt_embeds, src_pooled_prompt_embeds, src_text_ids
+     )
+
+ # https://github.com/DSL-Lab/UniEdit-Flow
+ @torch.no_grad()
+ def uniinv(
+     pipe: FluxPipeline, scheduler: FlowMatchEulerDiscreteScheduler,
+     timesteps: torch.Tensor, n_start: int, x0_src: torch.Tensor,
+     src_prompt_embeds: torch.Tensor, src_pooled_prompt_embeds: torch.Tensor,
+     src_guidance: torch.Tensor, src_text_ids: torch.Tensor,
+     latent_src_image_ids: torch.Tensor,
+ ) -> torch.Tensor:
+     """
+     Perform the UniInv inversion process for FLUX.
+
+     Args:
+         pipe (FluxPipeline): The FLUX pipeline.
+         scheduler (FlowMatchEulerDiscreteScheduler): The scheduler for the diffusion process.
+         timesteps (torch.Tensor): The timesteps for the diffusion process.
+         n_start (int): The number of initial timesteps to skip.
+         x0_src (torch.Tensor): The source latent tensor.
+         src_prompt_embeds (torch.Tensor): The source prompt embeddings.
+         src_pooled_prompt_embeds (torch.Tensor): The source pooled prompt embeddings.
+         src_guidance (torch.Tensor): The guidance scale tensor.
+         src_text_ids (torch.Tensor): The source text token IDs.
+         latent_src_image_ids (torch.Tensor): The latent image token IDs.
+     Returns:
+         torch.Tensor: The inverted latent tensor.
+     """
+     x_t = x0_src.clone()
+     timesteps_inv = timesteps.flip(dims=(0,))[:-n_start] if n_start > 0 else timesteps.flip(dims=(0,))
+     next_v = None
+     for _i, t in enumerate(timesteps_inv):
+         scheduler._init_step_index(t)
+         t_i = scheduler.sigmas[scheduler.step_index]
+         t_ip1 = scheduler.sigmas[scheduler.step_index + 1]
+         dt = t_i - t_ip1
+
+         if next_v is None:
+             v_tar = calc_v_flux(
+                 pipe, latents=x_t, prompt_embeds=src_prompt_embeds,
+                 pooled_prompt_embeds=src_pooled_prompt_embeds, guidance=src_guidance,
+                 text_ids=src_text_ids, latent_image_ids=latent_src_image_ids, t=t_ip1 * 1000,
+             )
+         else:
+             v_tar = next_v
+         x_t = x_t.to(torch.float32)
+         x_t_next = x_t + v_tar * dt
+         x_t_next = x_t_next.to(pipe.dtype)
+
+         v_tar_next = calc_v_flux(
+             pipe, latents=x_t_next, prompt_embeds=src_prompt_embeds,
+             pooled_prompt_embeds=src_pooled_prompt_embeds, guidance=src_guidance,
+             text_ids=src_text_ids, latent_image_ids=latent_src_image_ids, t=t,
+         )
+         next_v = v_tar_next
+         x_t = x_t + v_tar_next * dt
+         x_t = x_t.to(pipe.dtype)
+
+     return x_t
+
+ @torch.no_grad()
+ def initialization(
+     pipe: FluxPipeline, scheduler: FlowMatchEulerDiscreteScheduler,
+     T_steps: int, n_start: int, x0_src: torch.Tensor, src_prompt: str,
+     src_guidance_scale: float,
+ ) -> Tuple[
+     torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int, int,
+ ]:
+     """
+     Initialize the inversion process by preparing the latent tensor and prompt embeddings, and performing UniInv.
+
+     Args:
+         pipe (FluxPipeline): The FLUX pipeline.
+         scheduler (FlowMatchEulerDiscreteScheduler): The scheduler for the diffusion process.
+         T_steps (int): The total number of timesteps for the diffusion process.
+         n_start (int): The number of initial timesteps to skip.
+         x0_src (torch.Tensor): The source latent tensor.
+         src_prompt (str): The source text prompt.
+         src_guidance_scale (float): The guidance scale for classifier-free guidance.
+     Returns:
+         Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int, int]:
+             - The inverted latent tensor.
+             - The packed source latent tensor.
+             - The timesteps for the diffusion process.
+             - The latent image token IDs.
+             - The original height of the input image.
+             - The original width of the input image.
+     """
+     (
+         x0_src, latent_src_image_ids, timesteps, orig_height, orig_width,
+         src_prompt_embeds, src_pooled_prompt_embeds, src_text_ids
+     ) = prep_input(pipe, scheduler, T_steps, x0_src, src_prompt, src_guidance_scale)
+
+     # handle guidance
+     if pipe.transformer.config.guidance_embeds:
+         src_guidance = torch.tensor([src_guidance_scale], device=pipe.device)
+         src_guidance = src_guidance.expand(x0_src.shape[0])
+     else:
+         src_guidance = None
+
+     x_t = uniinv(
+         pipe, scheduler, timesteps, n_start, x0_src,
+         src_prompt_embeds, src_pooled_prompt_embeds, src_guidance,
+         src_text_ids, latent_src_image_ids,
+     )
+
+     return (
+         x_t, x0_src, timesteps, latent_src_image_ids, orig_height, orig_width,
+     )
+
+ @torch.no_grad()
+ def flux_denoise(
+     pipe: FluxPipeline, scheduler: FlowMatchEulerDiscreteScheduler,
+     timesteps: torch.Tensor, n_start: int, x_t: torch.Tensor,
+     prompt_embeds: torch.Tensor, pooled_prompt_embeds: torch.Tensor,
+     guidance: torch.Tensor, text_ids: torch.Tensor,
+     latent_image_ids: torch.Tensor,
+ ) -> torch.Tensor:
+     """
+     Perform the denoising process for FLUX.
+
+     Args:
+         pipe (FluxPipeline): The FLUX pipeline.
+         scheduler (FlowMatchEulerDiscreteScheduler): The scheduler for the diffusion process.
+         timesteps (torch.Tensor): The timesteps for the diffusion process.
+         n_start (int): The number of initial timesteps to skip.
+         x_t (torch.Tensor): The latent tensor at the starting timestep.
+         prompt_embeds (torch.Tensor): The prompt embeddings.
+         pooled_prompt_embeds (torch.Tensor): The pooled prompt embeddings.
+         guidance (torch.Tensor): The guidance scale tensor.
+         text_ids (torch.Tensor): The text token IDs.
+         latent_image_ids (torch.Tensor): The latent image token IDs.
+     Returns:
+         torch.Tensor: The denoised latent tensor.
+     """
+     f_xt = x_t.clone()
+     for _i, t in enumerate(timesteps[n_start:]):
+         scheduler._init_step_index(t)
+         t_i = scheduler.sigmas[scheduler.step_index]
+         t_im1 = scheduler.sigmas[scheduler.step_index + 1]
+         dt = t_im1 - t_i
+
+         v_tar = calc_v_flux(
+             pipe, latents=f_xt, prompt_embeds=prompt_embeds,
+             pooled_prompt_embeds=pooled_prompt_embeds, guidance=guidance,
+             text_ids=text_ids, latent_image_ids=latent_image_ids, t=t,
+         )
+         f_xt = f_xt.to(torch.float32)
+         f_xt = f_xt + v_tar * dt
+         f_xt = f_xt.to(pipe.dtype)
+
+     return f_xt
+
+ @torch.no_grad()
+ def flux_editing(
+     pipe: FluxPipeline, scheduler: FlowMatchEulerDiscreteScheduler,
+     T_steps: int, n_max: int, x0_src: torch.Tensor, src_prompt: str,
+     tar_prompt: str, src_guidance_scale: float, tar_guidance_scale: float,
+     flowopt_iterations: int, eta: float,
+ ) -> Iterator[List[Tuple[Image.Image, str]]]:
+     """
+     Perform the editing process for FLUX using FlowOpt.
+
+     Args:
+         pipe (FluxPipeline): The FLUX pipeline.
+         scheduler (FlowMatchEulerDiscreteScheduler): The scheduler for the diffusion process.
+         T_steps (int): The total number of timesteps for the diffusion process.
+         n_max (int): The maximum number of timesteps to consider.
+         x0_src (torch.Tensor): The source latent tensor.
+         src_prompt (str): The source text prompt.
+         tar_prompt (str): The target text prompt for editing.
+         src_guidance_scale (float): The guidance scale for the source prompt.
+         tar_guidance_scale (float): The guidance scale for the target prompt.
+         flowopt_iterations (int): The number of FlowOpt iterations to perform.
+         eta (float): The step size for the FlowOpt update.
+     Yields:
+         Iterator[List[Tuple[Image.Image, str]]]: A list of tuples containing the generated images and their corresponding iteration labels.
+     """
+     n_start = T_steps - n_max
+     (
+         x_t, x0_src, timesteps, latent_src_image_ids, orig_height, orig_width,
+     ) = initialization(
+         pipe, scheduler, T_steps, n_start, x0_src, src_prompt, src_guidance_scale,
+     )
+
+     pipe._guidance_scale = tar_guidance_scale
+     (
+         tar_prompt_embeds,
+         pooled_tar_prompt_embeds,
+         tar_text_ids,
+     ) = pipe.encode_prompt(
+         prompt=tar_prompt,
+         prompt_2=None,
+         device=pipe.device,
+     )
+
+     # handle guidance
+     if pipe.transformer.config.guidance_embeds:
+         tar_guidance = torch.tensor([tar_guidance_scale], device=pipe.device)
+         tar_guidance = tar_guidance.expand(x0_src.shape[0])
+     else:
+         tar_guidance = None
+
+     history = []
+     j_star = x0_src.clone().to(torch.float32) # y
+     for flowopt_iter in range(flowopt_iterations + 1):
+         f_xt = flux_denoise(
+             pipe, scheduler, timesteps, n_start, x_t,
+             tar_prompt_embeds, pooled_tar_prompt_embeds, tar_guidance,
+             tar_text_ids, latent_src_image_ids,
+         ) # Eq. (3)
+
+         if flowopt_iter < flowopt_iterations:
+             x_t = x_t.to(torch.float32)
+             x_t = x_t - eta * (f_xt - j_star) # Eq. (6) with c = c_tar
+             x_t = x_t.to(x0_src.dtype)
+
+         x0_flowopt = f_xt.clone()
+         unpacked_x0_flowopt = pipe._unpack_latents(x0_flowopt, orig_height, orig_width, pipe.vae_scale_factor)
+         x0_flowopt_denorm = (unpacked_x0_flowopt / pipe.vae.config.scaling_factor) + pipe.vae.config.shift_factor
+         with torch.autocast("cuda"), torch.inference_mode():
+             x0_flowopt_image = pipe.vae.decode(x0_flowopt_denorm, return_dict=False)[0].clamp(-1, 1)
+         x0_flowopt_image_pil = pipe.image_processor.postprocess(x0_flowopt_image)[0]
+         history.append((x0_flowopt_image_pil, f"Iteration {flowopt_iter}"))
+         yield history
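
The loop at the end of `flux_editing` is the core FlowOpt step: the frozen denoising pass `flux_denoise` is treated as a black-box map f, and the inverted latent is nudged by `x_t <- x_t - eta * (f_xt - j_star)`, where `j_star` is the packed source latent. Below is a toy sketch of that fixed-point update using a linear stand-in for f rather than the FLUX pipeline; all names here (`A`, `f`, `y`, `x`) are illustrative assumptions, not part of the Space.

```python
# Toy illustration of the FlowOpt-style update x <- x - eta * (f(x) - y),
# with a linear map standing in for the frozen flow process.
import torch

torch.manual_seed(0)
A = torch.eye(4) + 0.1 * torch.randn(4, 4)   # stand-in for the black-box map f
f = lambda x: A @ x
y = torch.randn(4)                           # stand-in for the source latent ("j_star")
x = torch.zeros(4)                           # stand-in for the inverted latent x_t

eta = 0.5
for it in range(50):
    residual = f(x) - y                      # analogous to `f_xt - j_star`
    x = x - eta * residual                   # Eq. (6)-style step
    if residual.norm() < 1e-6:
        break

print(it, residual.norm().item())            # the residual shrinks as x approaches a solution of f(x) = y
```

In the actual demo, the same residual is driven toward zero while f uses the target prompt, which is what pulls the reconstruction toward the edit.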
utils/sd3.py ADDED
@@ -0,0 +1,264 @@
+ from typing import Iterator, List, Tuple
+
+ import torch
+ from diffusers import FlowMatchEulerDiscreteScheduler, StableDiffusion3Pipeline
+ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import retrieve_timesteps
+ from PIL import Image
+
+ @torch.no_grad()
+ def calc_v_sd3(
+     pipe: StableDiffusion3Pipeline, latent_model_input: torch.Tensor,
+     prompt_embeds: torch.Tensor, pooled_prompt_embeds: torch.Tensor,
+     guidance_scale: float, t: torch.Tensor,
+ ) -> torch.Tensor:
+     """
+     Calculate the velocity (v) for Stable Diffusion 3.
+
+     Args:
+         pipe (StableDiffusion3Pipeline): The Stable Diffusion 3 pipeline.
+         latent_model_input (torch.Tensor): The input latent tensor.
+         prompt_embeds (torch.Tensor): The text embeddings for the prompt.
+         pooled_prompt_embeds (torch.Tensor): The pooled text embeddings for the prompt.
+         guidance_scale (float): The guidance scale for classifier-free guidance.
+         t (torch.Tensor): The current timestep.
+     Returns:
+         torch.Tensor: The predicted noise (velocity).
+     """
+     timestep = t.expand(latent_model_input.shape[0])
+
+     noise_pred = pipe.transformer(
+         hidden_states=latent_model_input,
+         timestep=timestep,
+         encoder_hidden_states=prompt_embeds,
+         pooled_projections=pooled_prompt_embeds,
+         joint_attention_kwargs=None,
+         return_dict=False,
+     )[0]
+
+     # perform source guidance
+     if pipe.do_classifier_free_guidance:
+         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+         noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+     return noise_pred
+
+ # https://github.com/DSL-Lab/UniEdit-Flow
+ @torch.no_grad()
+ def uniinv(
+     pipe: StableDiffusion3Pipeline, timesteps: torch.Tensor, n_start: int,
+     x0_src: torch.Tensor, src_prompt_embeds_all: torch.Tensor,
+     src_pooled_prompt_embeds_all: torch.Tensor, src_guidance_scale: float,
+ ) -> torch.Tensor:
+     """
+     Perform the UniInv inversion process for Stable Diffusion 3.
+
+     Args:
+         pipe (StableDiffusion3Pipeline): The Stable Diffusion 3 pipeline.
+         timesteps (torch.Tensor): The timesteps for the diffusion process.
+         n_start (int): The number of initial timesteps to skip.
+         x0_src (torch.Tensor): The source latent tensor.
+         src_prompt_embeds_all (torch.Tensor): The text embeddings for the source prompt.
+         src_pooled_prompt_embeds_all (torch.Tensor): The pooled text embeddings for the source prompt.
+         src_guidance_scale (float): The guidance scale for classifier-free guidance.
+     Returns:
+         torch.Tensor: The inverted latent tensor.
+     """
+     x_t = x0_src.clone()
+     timesteps_inv = torch.cat([torch.tensor([0.0], device=pipe.device), timesteps.flip(dims=(0,))], dim=0)
+     if n_start > 0:
+         zipped_timesteps_inv = zip(timesteps_inv[:-n_start - 1], timesteps_inv[1:-n_start])
+     else:
+         zipped_timesteps_inv = zip(timesteps_inv[:-1], timesteps_inv[1:])
+     next_v = None
+     for _i, (t_cur, t_prev) in enumerate(zipped_timesteps_inv):
+         t_i = t_cur / 1000
+         t_ip1 = t_prev / 1000
+         dt = t_ip1 - t_i
+
+         if next_v is None:
+             latent_model_input = torch.cat([x_t, x_t]) if pipe.do_classifier_free_guidance else (x_t)
+             v_tar = calc_v_sd3(
+                 pipe, latent_model_input, src_prompt_embeds_all,
+                 src_pooled_prompt_embeds_all, src_guidance_scale, t_cur,
+             )
+         else:
+             v_tar = next_v
+
+         x_t = x_t.to(torch.float32)
+         x_t_next = x_t + v_tar * dt
+         x_t_next = x_t_next.to(pipe.dtype)
+
+         latent_model_input = torch.cat([x_t_next, x_t_next]) if pipe.do_classifier_free_guidance else (x_t_next)
+         v_tar_next = calc_v_sd3(
+             pipe, latent_model_input, src_prompt_embeds_all,
+             src_pooled_prompt_embeds_all, src_guidance_scale, t_prev,
+         )
+         next_v = v_tar_next
+         x_t = x_t + v_tar_next * dt
+         x_t = x_t.to(pipe.dtype)
+
+     return x_t
+
+ @torch.no_grad()
+ def initialization(
+     pipe: StableDiffusion3Pipeline, scheduler: FlowMatchEulerDiscreteScheduler,
+     T_steps: int, n_start: int, x0_src: torch.Tensor,
+     src_prompt: str, negative_prompt: str, src_guidance_scale: float,
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+     """
+     Initialize the inversion process by preparing the latent tensor and prompt embeddings, and performing UniInv.
+
+     Args:
+         pipe (StableDiffusion3Pipeline): The Stable Diffusion 3 pipeline.
+         scheduler (FlowMatchEulerDiscreteScheduler): The scheduler for the diffusion process.
+         T_steps (int): The total number of timesteps for the diffusion process.
+         n_start (int): The number of initial timesteps to skip.
+         x0_src (torch.Tensor): The source latent tensor.
+         src_prompt (str): The source text prompt.
+         negative_prompt (str): The negative text prompt for classifier-free guidance.
+         src_guidance_scale (float): The guidance scale for classifier-free guidance.
+     Returns:
+         Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+             - The inverted latent tensor.
+             - The original source latent tensor.
+             - The timesteps for the diffusion process.
+     """
+     pipe._guidance_scale = src_guidance_scale
+     (
+         src_prompt_embeds,
+         src_negative_prompt_embeds,
+         src_pooled_prompt_embeds,
+         src_negative_pooled_prompt_embeds,
+     ) = pipe.encode_prompt(
+         prompt=src_prompt,
+         prompt_2=None,
+         prompt_3=None,
+         negative_prompt=negative_prompt,
+         do_classifier_free_guidance=pipe.do_classifier_free_guidance,
+         device=pipe.device,
+     )
+     src_prompt_embeds_all = torch.cat([src_negative_prompt_embeds, src_prompt_embeds], dim=0) if pipe.do_classifier_free_guidance else src_prompt_embeds
+     src_pooled_prompt_embeds_all = torch.cat([src_negative_pooled_prompt_embeds, src_pooled_prompt_embeds], dim=0) if pipe.do_classifier_free_guidance else src_pooled_prompt_embeds
+
+     timesteps, T_steps = retrieve_timesteps(scheduler, T_steps, x0_src.device, timesteps=None)
+     pipe._num_timesteps = len(timesteps)
+
+     x_t = uniinv(
+         pipe, timesteps, n_start, x0_src, src_prompt_embeds_all,
+         src_pooled_prompt_embeds_all, src_guidance_scale,
+     )
+
+     return x_t, x0_src, timesteps
+
+ @torch.no_grad()
+ def sd3_denoise(
+     pipe: StableDiffusion3Pipeline, timesteps: torch.Tensor, n_start: int,
+     x_t: torch.Tensor, prompt_embeds_all: torch.Tensor,
+     pooled_prompt_embeds_all: torch.Tensor, guidance_scale: float,
+ ) -> torch.Tensor:
+     """
+     Perform the denoising process for Stable Diffusion 3.
+
+     Args:
+         pipe (StableDiffusion3Pipeline): The Stable Diffusion 3 pipeline.
+         timesteps (torch.Tensor): The timesteps for the diffusion process.
+         n_start (int): The number of initial timesteps to skip.
+         x_t (torch.Tensor): The latent tensor at the starting timestep.
+         prompt_embeds_all (torch.Tensor): The text embeddings for the prompt.
+         pooled_prompt_embeds_all (torch.Tensor): The pooled text embeddings for the prompt.
+         guidance_scale (float): The guidance scale for classifier-free guidance.
+     Returns:
+         torch.Tensor: The denoised latent tensor.
+     """
+     f_xt = x_t.clone()
+     for i, t in enumerate(timesteps[n_start:]):
+         t_i = t / 1000
+         if i + 1 < len(timesteps[n_start:]):
+             t_im1 = (timesteps[n_start + i + 1]) / 1000
+         else:
+             t_im1 = torch.zeros_like(t_i).to(t_i.device)
+         dt = t_im1 - t_i
+
+         latent_model_input = torch.cat([f_xt, f_xt]) if pipe.do_classifier_free_guidance else (f_xt)
+         v_tar = calc_v_sd3(
+             pipe, latent_model_input, prompt_embeds_all,
+             pooled_prompt_embeds_all, guidance_scale, t,
+         )
+         f_xt = f_xt.to(torch.float32)
+         f_xt = f_xt + v_tar * dt
+         f_xt = f_xt.to(pipe.dtype)
+
+     return f_xt
+
+ @torch.no_grad()
+ def sd3_editing(
+     pipe: StableDiffusion3Pipeline, scheduler: FlowMatchEulerDiscreteScheduler,
+     T_steps: int, n_max: int, x0_src: torch.Tensor, src_prompt: str,
+     tar_prompt: str, negative_prompt: str, src_guidance_scale: float,
+     tar_guidance_scale: float, flowopt_iterations: int, eta: float,
+ ) -> Iterator[List[Tuple[Image.Image, str]]]:
+     """
+     Perform the editing process for Stable Diffusion 3 using FlowOpt.
+
+     Args:
+         pipe (StableDiffusion3Pipeline): The Stable Diffusion 3 pipeline.
+         scheduler (FlowMatchEulerDiscreteScheduler): The scheduler for the diffusion process.
+         T_steps (int): The total number of timesteps for the diffusion process.
+         n_max (int): The maximum number of timesteps to consider.
+         x0_src (torch.Tensor): The source latent tensor.
+         src_prompt (str): The source text prompt.
+         tar_prompt (str): The target text prompt for editing.
+         negative_prompt (str): The negative text prompt for classifier-free guidance.
+         src_guidance_scale (float): The guidance scale for the source prompt.
+         tar_guidance_scale (float): The guidance scale for the target prompt.
+         flowopt_iterations (int): The number of FlowOpt iterations to perform.
+         eta (float): The step size for the FlowOpt update.
+     Yields:
+         Iterator[List[Tuple[Image.Image, str]]]: A list of tuples containing the generated images and their corresponding iteration labels.
+     """
+     n_start = T_steps - n_max
+     x_t, x0_src, timesteps = initialization(
+         pipe, scheduler, T_steps, n_start, x0_src, src_prompt,
+         negative_prompt, src_guidance_scale,
+     )
+
+     pipe._guidance_scale = tar_guidance_scale
+     (
+         tar_prompt_embeds,
+         tar_negative_prompt_embeds,
+         tar_pooled_prompt_embeds,
+         tar_negative_pooled_prompt_embeds,
+     ) = pipe.encode_prompt(
+         prompt=tar_prompt,
+         prompt_2=None,
+         prompt_3=None,
+         negative_prompt=negative_prompt,
+         do_classifier_free_guidance=pipe.do_classifier_free_guidance,
+         device=pipe.device,
+     )
+
+     tar_prompt_embeds_all = torch.cat([tar_negative_prompt_embeds, tar_prompt_embeds], dim=0) if pipe.do_classifier_free_guidance else tar_prompt_embeds
+     tar_pooled_prompt_embeds_all = torch.cat([tar_negative_pooled_prompt_embeds, tar_pooled_prompt_embeds], dim=0) if pipe.do_classifier_free_guidance else tar_pooled_prompt_embeds
+
+     history = []
+     j_star = x0_src.clone().to(torch.float32) # y
+     for flowopt_iter in range(flowopt_iterations + 1):
+         f_xt = sd3_denoise(
+             pipe, timesteps, n_start, x_t, tar_prompt_embeds_all,
+             tar_pooled_prompt_embeds_all, tar_guidance_scale,
+         ) # Eq. (3)
+
+         if flowopt_iter < flowopt_iterations:
+             x_t = x_t.to(torch.float32)
+             x_t = x_t - eta * (f_xt - j_star) # Eq. (6) with c = c_tar
+             x_t = x_t.to(x0_src.dtype)
+
+         x0_flowopt = f_xt.clone()
+         x0_flowopt_denorm = (x0_flowopt / pipe.vae.config.scaling_factor) + pipe.vae.config.shift_factor
+         with torch.autocast("cuda"), torch.inference_mode():
+             x0_flowopt_image = pipe.vae.decode(x0_flowopt_denorm, return_dict=False)[0].clamp(-1, 1)
+         x0_flowopt_image_pil = pipe.image_processor.postprocess(x0_flowopt_image)[0]
+         history.append((x0_flowopt_image_pil, f"Iteration {flowopt_iter}"))
+         yield history
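
Both `uniinv` implementations (FLUX and SD3) follow the same predictor/corrector pattern: take a provisional step with the current velocity, re-evaluate the velocity at the landing point, take the actual step with that re-evaluated velocity, and reuse it (`next_v`) on the following step. Below is a toy sketch of that numerical scheme on a simple, known one-dimensional flow; it is purely illustrative and not tied to SD3, and the `velocity` field and variable names are assumptions introduced for the example.

```python
# Toy sketch of a UniInv-style inversion step: predict, re-evaluate the
# velocity at the predicted point, then step with the re-evaluated velocity.
import torch

def velocity(x: torch.Tensor, t: float) -> torch.Tensor:
    # hypothetical velocity field of the known flow x(t) = x0 * (1 + t)
    return x / (1.0 + t)

x = torch.tensor([1.0, 2.0])        # plays the role of x0_src
ts = torch.linspace(0.0, 1.0, 16)   # inversion moves from t = 0 toward t = 1
next_v = None
for t_cur, t_prev in zip(ts[:-1], ts[1:]):
    dt = (t_prev - t_cur).item()
    v = velocity(x, t_cur.item()) if next_v is None else next_v  # predictor velocity
    x_pred = x + v * dt                                          # provisional step
    v_next = velocity(x_pred, t_prev.item())                     # corrector velocity
    x = x + v_next * dt                                          # actual step, like `x_t = x_t + v_tar_next * dt`
    next_v = v_next                                              # reused on the next iteration

print(x)  # close to x0 * 2, the state of the known flow at t = 1
```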