wcy1122 committed on
Commit de27e62 · 1 Parent(s): 3afda02

initial commit
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,220 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[codz]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py.cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ # Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ # poetry.lock
+ # poetry.toml
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+ # pdm.lock
+ # pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # pixi
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+ # pixi.lock
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+ # in the .venv directory. It is recommended not to include this directory in version control.
+ .pixi
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # Redis
+ *.rdb
+ *.aof
+ *.pid
+
+ # RabbitMQ
+ mnesia/
+ rabbitmq/
+ rabbitmq-data/
+
+ # ActiveMQ
+ activemq-data/
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .envrc
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ # .idea/
+
+ # Abstra
+ # Abstra is an AI-powered process automation framework.
+ # Ignore directories containing user credentials, local state, and settings.
+ # Learn more at https://abstra.io/docs
+ .abstra/
+
+ # Visual Studio Code
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
+ # you could uncomment the following to ignore the entire vscode folder
+ # .vscode/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
+
+ # Marimo
+ marimo/_static/
+ marimo/_lsp/
+ __marimo__/
+
+ # Streamlit
+ .streamlit/secrets.toml
+
+ tmp_script.sh
+ .gradio/
+ gradio_tmp
README.md CHANGED
@@ -1,14 +1,18 @@
- ---
- title: DreamOmni2
- emoji: 📚
- colorFrom: purple
- colorTo: yellow
- sdk: gradio
- sdk_version: 5.49.1
- app_file: app.py
- pinned: false
- license: apache-2.0
- short_description: Multimodal Instruction-based Editing and Generation
- ---
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # DreamOmni2
+ This project is the official implementation of "DreamOmni2: Multimodal Instruction-based Editing and Generation".
+
+ ## Web Demo
+ ```
+ CUDA_VISIBLE_DEVICES=0 python web_edit.py \
+     --vlm_path PATH_TO_VLM \
+     --edit_lora_path PATH_TO_EDIT_LORA \
+     --server_name "0.0.0.0" \
+     --server_port 7860
+
+ CUDA_VISIBLE_DEVICES=1 python web_generate.py \
+     --vlm_path PATH_TO_VLM \
+     --gen_lora_path PATH_TO_GENERATION_LORA \
+     --server_name "0.0.0.0" \
+     --server_port 7861
+ ```
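The commands above refer to the standalone demo scripts; on this Space the same editing flow is driven by `app.py`, added below. The following is a minimal programmatic sketch of that flow, under a few assumptions: `diffusers`, `transformers`, and `huggingface_hub` are installed, the bundled `pipeline_flux_kontext.py` is importable, and the VLM prompt-rewriting step that `app.py` performs is skipped in favor of a hand-written prompt. Paths and the prompt are placeholders, not part of the original commit.

```python
# Minimal sketch mirroring the editing flow in app.py below; not an official CLI.
# Assumes diffusers, transformers, and huggingface_hub are installed and that
# pipeline_flux_kontext.py (added in this commit) is on the Python path.
import torch
from huggingface_hub import snapshot_download
from diffusers.utils import load_image

from pipeline_flux_kontext import FluxKontextPipeline

# Fetch only the edit LoRA from the model repo that app.py uses.
lora_dir = snapshot_download(repo_id="xiabs/DreamOmni2", allow_patterns=["edit_lora/**"])

pipe = FluxKontextPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-Kontext-dev", torch_dtype=torch.bfloat16
).to("cuda")
pipe.load_lora_weights(lora_dir, adapter_name="edit")  # same call app.py makes
pipe.set_adapters(["edit"], adapter_weights=[1])

# Two inputs: the image to edit and a reference image (placeholder paths).
src = load_image("edit_tests/src.jpg")
ref = load_image("edit_tests/ref.jpg")

# app.py snaps inputs to PREFERRED_KONTEXT_RESOLUTIONS and rewrites the
# instruction with the bundled VLM first; both steps are omitted here.
result = pipe(
    images=[src, ref],
    prompt="Make the woman from the second image stand on the road in the first image.",
    height=src.height,
    width=src.width,
    num_inference_steps=30,
    guidance_scale=3.5,
).images[0]
result.save("edit_result.png")
```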
app.py ADDED
@@ -0,0 +1,233 @@
+ import torch
+ from pipeline_flux_kontext import FluxKontextPipeline
+ from diffusers.utils import load_image
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+ from qwen_vl_utils import process_vision_info
+ import os
+ import re
+ from PIL import Image
+ import gradio as gr
+ import uuid
+ import argparse
+ import spaces  # added: required by the @spaces.GPU() decorators below
+ from huggingface_hub import snapshot_download  # added: used in _load_model_processor()
+
+
+ def _load_model_processor():
+
+     local_vlm_dir = snapshot_download(
+         repo_id="xiabs/DreamOmni2",
+         revision="main",
+         allow_patterns=["vlm-model/**"],
+     )
+     local_lora_dir = snapshot_download(
+         repo_id="xiabs/DreamOmni2",
+         revision="main",
+         allow_patterns=["edit_lora/**"],
+     )
+
+     print(f"Loading models from vlm_path: {local_vlm_dir}, edit_lora_path: {local_lora_dir}")
+     pipe = FluxKontextPipeline.from_pretrained(
+         "black-forest-labs/FLUX.1-Kontext-dev",
+         torch_dtype=torch.bfloat16
+     )
+     pipe.load_lora_weights(local_lora_dir, adapter_name="edit")
+     pipe.set_adapters(["edit"], adapter_weights=[1])
+
+     vlm_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+         local_vlm_dir,
+         torch_dtype="bfloat16",
+         device_map="cuda"
+     )
+     processor = AutoProcessor.from_pretrained(local_vlm_dir)
+     return vlm_model, processor, pipe
+
+
+ def _launch_demo(vlm_model, processor, pipe):
+
+     @spaces.GPU()
+     def infer_vlm(input_img_path, input_instruction, prefix):
+         if not vlm_model or not processor:
+             raise gr.Error("VLM Model not loaded. Cannot process prompt.")
+         tp = []
+         for path in input_img_path:
+             tp.append({"type": "image", "image": path})
+         tp.append({"type": "text", "text": input_instruction + prefix})
+         messages = [{"role": "user", "content": tp}]
+
+         text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+         image_inputs, video_inputs = process_vision_info(messages)
+         inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+         inputs = inputs.to("cuda")
+
+         generated_ids = vlm_model.generate(**inputs, do_sample=False, max_new_tokens=4096)
+         generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+         output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+         return output_text[0]
+
+     PREFERRED_KONTEXT_RESOLUTIONS = [
+         (672, 1568),
+         (688, 1504),
+         (720, 1456),
+         (752, 1392),
+         (800, 1328),
+         (832, 1248),
+         (880, 1184),
+         (944, 1104),
+         (1024, 1024),
+         (1104, 944),
+         (1184, 880),
+         (1248, 832),
+         (1328, 800),
+         (1392, 752),
+         (1456, 720),
+         (1504, 688),
+         (1568, 672),
+     ]
+
+     def find_closest_resolution(width, height, preferred_resolutions):
+         input_ratio = width / height
+         closest_resolution = min(
+             preferred_resolutions,
+             key=lambda res: abs((res[0] / res[1]) - input_ratio)
+         )
+         return closest_resolution
+
+     def extract_gen_content(text):
+         text = text[6:-7]
+         return text
+
+     @spaces.GPU()
+     def perform_edit(input_img_paths, input_instruction, output_path):
+         prefix = " It is editing task."
+         source_imgs = [load_image(path) for path in input_img_paths]
+         resized_imgs = []
+         for img in source_imgs:
+             target_resolution = find_closest_resolution(img.width, img.height, PREFERRED_KONTEXT_RESOLUTIONS)
+             resized_img = img.resize(target_resolution, Image.LANCZOS)
+             resized_imgs.append(resized_img)
+         prompt = infer_vlm(input_img_paths, input_instruction, prefix)
+         prompt = extract_gen_content(prompt)
+         print(f"Generated Prompt for VLM: {prompt}")
+
+         image = pipe(
+             images=resized_imgs,
+             height=resized_imgs[0].height,
+             width=resized_imgs[0].width,
+             prompt=prompt,
+             num_inference_steps=30,
+             guidance_scale=3.5,
+         ).images[0]
+         image.save(output_path)
+         print(f"Edit result saved to {output_path}")
+
+     def process_request(image_file_1, image_file_2, instruction):
+         # debugpy.listen(5678)
+         # print("Waiting for debugger attach...")
+         # debugpy.wait_for_client()
+         if not image_file_1 or not image_file_2:
+             raise gr.Error("Please upload both images.")
+         if not instruction:
+             raise gr.Error("Please provide an instruction.")
+         if not pipe or not vlm_model:
+             raise gr.Error("Models not loaded. Check the console for errors.")
+
+         output_path = f"/tmp/{uuid.uuid4()}.png"
+         input_img_paths = [image_file_1, image_file_2]  # file paths from the two gr.Image inputs (type="filepath")
+
+         perform_edit(input_img_paths, instruction, output_path)
+         return output_path
+
+     css = """
+     .text-center { text-align: center; }
+     .result-img img {
+         max-height: 60vh !important;
+         min-height: 30vh !important;
+         width: auto !important;
+         object-fit: contain;
+     }
+     .input-img img {
+         max-height: 30vh !important;
+         width: auto !important;
+         object-fit: contain;
+     }
+     """
+
+     with gr.Blocks(theme=gr.themes.Soft(), title="DreamOmni2", css=css) as demo:
+         gr.HTML(
+             """
+             <h1 style="text-align:center; font-size:48px; font-weight:bold; margin-bottom:20px;">
+                 DreamOmni2: Omni-purpose Image Generation and Editing
+             </h1>
+             """
+         )
+         gr.Markdown(
+             "Select a mode, upload two images, provide an instruction, and click 'Run'.",
+             elem_classes="text-center"
+         )
+         with gr.Row():
+             with gr.Column(scale=2):
+                 gr.Markdown("⬆️ Upload images. Click or drag to upload.")
+
+                 with gr.Row():
+                     image_uploader_1 = gr.Image(
+                         label="Img 1",
+                         type="filepath",
+                         interactive=True,
+                         elem_classes="input-img",
+                     )
+                     image_uploader_2 = gr.Image(
+                         label="Img 2",
+                         type="filepath",
+                         interactive=True,
+                         elem_classes="input-img",
+                     )
+
+                 instruction_text = gr.Textbox(
+                     label="Instruction",
+                     lines=2,
+                     placeholder="Input your instruction for generation or editing here...",
+                 )
+                 run_button = gr.Button("Run", variant="primary")
+
+             with gr.Column(scale=2):
+                 gr.Markdown(
+                     "✏️ **Editing Mode**: Modify an existing image using instructions and references.\n\n"
+                     "Tip: If the result is not what you expect, try clicking **Run** again. "
+                 )
+                 output_image = gr.Image(
+                     label="Result",
+                     type="filepath",
+                     elem_classes="result-img",
+                 )
+
+         # --- Examples (unchanged) ---
+         gr.Markdown("## Examples")
+
+         gr.Examples(
+             label="Editing Examples",
+             examples=[
+                 ["edit_tests/4/ref_0.jpg", "edit_tests/4/ref_1.jpg", "Replace the first image have the same image style as the second image.", "edit_tests/4/res.jpg"],
+                 ["edit_tests/5/ref_0.jpg", "edit_tests/5/ref_1.jpg", "Make the person in the first image have the same hairstyle as the person in the second image.", "edit_tests/5/res.jpg"],
+                 ["edit_tests/src.jpg", "edit_tests/ref.jpg", "Make the woman from the second image stand on the road in the first image.", "edit_tests/edi_res.png"],
+                 ["edit_tests/1/ref_0.jpg", "edit_tests/1/ref_1.jpg", "Replace the lantern in the first image with the dog in the second image.", "edit_tests/1/res.jpg"],
+                 ["edit_tests/2/ref_0.jpg", "edit_tests/2/ref_1.jpg", "Replace the suit in the first image with the clothes in the second image.", "edit_tests/2/res.jpg"],
+                 ["edit_tests/3/ref_0.jpg", "edit_tests/3/ref_1.jpg", "Make the first image has the same light condition as the second image.", "edit_tests/3/res.jpg"],
+                 ["edit_tests/6/ref_0.jpg", "edit_tests/6/ref_1.jpg", "Make the words in the first image have the same font as the words in the second image.", "edit_tests/6/res.jpg"],
+                 ["edit_tests/7/ref_0.jpg", "edit_tests/7/ref_1.jpg", "Make the car in the first image have the same pattern as the mouse in the second image.", "edit_tests/7/res.jpg"],
+                 ["edit_tests/8/ref_0.jpg", "edit_tests/8/ref_1.jpg", "Make the dress in the first image have the same pattern in the second image.", "edit_tests/8/res.jpg"],
+             ],
+             inputs=[image_uploader_1, image_uploader_2, instruction_text, output_image],
+             cache_examples=False,
+         )
+
+         run_button.click(
+             fn=process_request,
+             inputs=[image_uploader_1, image_uploader_2, instruction_text],
+             outputs=output_image
+         )
+
+     demo.launch()  # added: the original built `demo` but never started the server
+
+
+ if __name__ == "__main__":
+     vlm_model, processor, pipe = _load_model_processor()
+     print("Launching Gradio Demo...")
+     _launch_demo(vlm_model, processor, pipe)
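As a small, self-contained illustration of the resolution handling above: `perform_edit` snaps each input image to the preferred FLUX.1-Kontext resolution whose aspect ratio is closest to the input's before calling the pipeline. The helper below is copied from `app.py` (the same list also appears in `pipeline_flux_kontext.py`); the `__main__` lines are only a usage example.

```python
# Resolution bucketing as used by app.py's perform_edit: choose the preferred
# FLUX.1-Kontext resolution whose aspect ratio best matches the input image.
PREFERRED_KONTEXT_RESOLUTIONS = [
    (672, 1568), (688, 1504), (720, 1456), (752, 1392), (800, 1328),
    (832, 1248), (880, 1184), (944, 1104), (1024, 1024), (1104, 944),
    (1184, 880), (1248, 832), (1328, 800), (1392, 752), (1456, 720),
    (1504, 688), (1568, 672),
]

def find_closest_resolution(width, height, preferred_resolutions):
    input_ratio = width / height
    return min(preferred_resolutions, key=lambda res: abs((res[0] / res[1]) - input_ratio))

if __name__ == "__main__":
    print(find_closest_resolution(1920, 1080, PREFERRED_KONTEXT_RESOLUTIONS))  # (1392, 752)
    print(find_closest_resolution(1080, 1920, PREFERRED_KONTEXT_RESOLUTIONS))  # (752, 1392)
```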
edit_tests/1/ref_0.jpg ADDED

Git LFS Details

  • SHA256: 22698b9eee36955254029d0d84a8c3c3e13f8cb12d19cd361087a294f34fc18e
  • Pointer size: 130 Bytes
  • Size of remote file: 99.9 kB
edit_tests/1/ref_1.jpg ADDED

Git LFS Details

  • SHA256: fde235d7896c5175d335831b3c4124161b8d909ebbe1bf708922005db04c345a
  • Pointer size: 130 Bytes
  • Size of remote file: 27 kB
edit_tests/1/res.jpg ADDED

Git LFS Details

  • SHA256: 8bf7f1fb69bd052478d63643b0a71cc38ca0fe032508568c20dfcc0beca20a39
  • Pointer size: 131 Bytes
  • Size of remote file: 239 kB
edit_tests/2/ref_0.jpg ADDED

Git LFS Details

  • SHA256: d18bd9e3d7a15d9ed65660b491a6960ce1794fbcd20ebb9ec1c8a2277820f8f9
  • Pointer size: 130 Bytes
  • Size of remote file: 54.1 kB
edit_tests/2/ref_1.jpg ADDED

Git LFS Details

  • SHA256: 1c60313358e445081aab723047ca9e36f7e60c1a7e40e58db9ae6c3b46b87120
  • Pointer size: 130 Bytes
  • Size of remote file: 28.3 kB
edit_tests/2/res.jpg ADDED

Git LFS Details

  • SHA256: 0d40aceef14fca011243e808305f2c23693a6250cca1d6d9060d26b83cce2f2a
  • Pointer size: 130 Bytes
  • Size of remote file: 61.9 kB
edit_tests/3/ref_0.jpg ADDED

Git LFS Details

  • SHA256: 77d1b1d0f8d9177bcc44151384b41ee6fb3e4a6408049181d56bf261fa5892e9
  • Pointer size: 131 Bytes
  • Size of remote file: 102 kB
edit_tests/3/ref_1.jpg ADDED

Git LFS Details

  • SHA256: 3de52fbd95ff89eb97781eb88f85a897a39d516d41218bb8fc2a86cf2f005e1b
  • Pointer size: 130 Bytes
  • Size of remote file: 60.3 kB
edit_tests/3/res.jpg ADDED

Git LFS Details

  • SHA256: 61c6cad1b69a0d7665390bfe4b48dc6639d214bf49084e4f2df6795f8f091f35
  • Pointer size: 130 Bytes
  • Size of remote file: 47.8 kB
edit_tests/4/ref_0.jpg ADDED

Git LFS Details

  • SHA256: 17df467e2a56748929f7bf5cdbd3b3f41c3fe3e504e07eef183ac3c7af8f64d7
  • Pointer size: 130 Bytes
  • Size of remote file: 87.6 kB
edit_tests/4/ref_1.jpg ADDED

Git LFS Details

  • SHA256: 210c93974b8a9216320abe8ff34c03c74004cb8fafc8c691bb80291660559215
  • Pointer size: 131 Bytes
  • Size of remote file: 174 kB
edit_tests/4/res.jpg ADDED

Git LFS Details

  • SHA256: f0db24e49a7bb341eec00388110a4af08aefbb5ab560dfbcbd30e08c210bfd55
  • Pointer size: 130 Bytes
  • Size of remote file: 64.8 kB
edit_tests/5/ref_0.jpg ADDED

Git LFS Details

  • SHA256: 3421c8b427b5a8376bf16672ebcfe64f942a5a91fe6f5e579eb8e61bcfa10367
  • Pointer size: 131 Bytes
  • Size of remote file: 147 kB
edit_tests/5/ref_1.jpg ADDED

Git LFS Details

  • SHA256: 5bb7ab3bcdc17452fef64c311e0b4a56dcfe89ff37a7d64cfc63e766ce6baaf5
  • Pointer size: 130 Bytes
  • Size of remote file: 84 kB
edit_tests/5/res.jpg ADDED

Git LFS Details

  • SHA256: 2fcd5e9bda10daab54e6802774b810ba1956c8f0dc8dd4f0ee3b4d42c228005e
  • Pointer size: 130 Bytes
  • Size of remote file: 56.7 kB
edit_tests/6/ref_0.jpg ADDED

Git LFS Details

  • SHA256: b7dd6574ec93c156a31b152f557921002a0d4a33add61f905cb86d03f06b7f5e
  • Pointer size: 130 Bytes
  • Size of remote file: 57.1 kB
edit_tests/6/ref_1.jpg ADDED

Git LFS Details

  • SHA256: fd1d25c842ea30975b197e864338862efb6b81439da6929995fb2142f6adc49c
  • Pointer size: 130 Bytes
  • Size of remote file: 68.5 kB
edit_tests/6/res.jpg ADDED

Git LFS Details

  • SHA256: 04b49a1f3eab77008c4f31064599a8b96e8bcd9b8acd2e4687058e09a0962413
  • Pointer size: 131 Bytes
  • Size of remote file: 102 kB
edit_tests/7/ref_0.jpg ADDED

Git LFS Details

  • SHA256: 937bab58713aa2610839b6fe57c1b642aad23938dd769362739c6178a734bc5e
  • Pointer size: 131 Bytes
  • Size of remote file: 198 kB
edit_tests/7/ref_1.jpg ADDED

Git LFS Details

  • SHA256: 9a74d1289a992867620747bcb48c59c6c598100b319d8fe910a03519f3e7e370
  • Pointer size: 130 Bytes
  • Size of remote file: 86.6 kB
edit_tests/7/res.jpg ADDED

Git LFS Details

  • SHA256: 2ae24e25ef7606da785e2158abbbddae9d8077a1cbecf6c9dbf787675235ef7a
  • Pointer size: 131 Bytes
  • Size of remote file: 196 kB
edit_tests/8/ref_0.jpg ADDED

Git LFS Details

  • SHA256: 5d6d515d3eb11732c4de3d7989b6315a93b2f754be937b9b7631e2c22043bf61
  • Pointer size: 130 Bytes
  • Size of remote file: 78.3 kB
edit_tests/8/ref_1.jpg ADDED

Git LFS Details

  • SHA256: 810b2e27db7c7965d90bd76d83fc6f088862f9034fdb407972bd49c577845b57
  • Pointer size: 131 Bytes
  • Size of remote file: 144 kB
edit_tests/8/res.jpg ADDED

Git LFS Details

  • SHA256: 5adc6f7c8f1c5061909d367f5248be99117ae0f81164516c927a4f2a843ee691
  • Pointer size: 130 Bytes
  • Size of remote file: 47.1 kB
edit_tests/edi_res.png ADDED

Git LFS Details

  • SHA256: 3e5352a19b82623523b73e66a083689e3eb0b8fa738445892ea8af0ab733ed11
  • Pointer size: 132 Bytes
  • Size of remote file: 1.66 MB
edit_tests/ref.jpg ADDED

Git LFS Details

  • SHA256: 7e12827dd85e3b2bf39d11ad27121a09655deb60a500de469569335a3a60566b
  • Pointer size: 130 Bytes
  • Size of remote file: 70.2 kB
edit_tests/src.jpg ADDED

Git LFS Details

  • SHA256: 8b7231bdbf219b6313e95effbce355d11e2789c3a9fd9251d8723cd9a9900624
  • Pointer size: 131 Bytes
  • Size of remote file: 252 kB
gen_tests/gen_res.png ADDED

Git LFS Details

  • SHA256: 649f4c45658120fffdaac58478d0b88f211a365f4aac9fc678aa7ba82d4da371
  • Pointer size: 132 Bytes
  • Size of remote file: 1.22 MB
gen_tests/img1.jpg ADDED

Git LFS Details

  • SHA256: 2bfd72e3aa607a5cf05e3b52c30743921ff8862cb7dda26d14cef8c8180b4242
  • Pointer size: 131 Bytes
  • Size of remote file: 114 kB
gen_tests/img2.jpg ADDED

Git LFS Details

  • SHA256: 82b414fafd19c3ec7763281e359c8863b72e5bf36561920c94a516539a013a14
  • Pointer size: 130 Bytes
  • Size of remote file: 92.3 kB
pipeline_flux_kontext.py ADDED
@@ -0,0 +1,1151 @@
1
+ # Copyright 2025 Black Forest Labs and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Union
17
+
18
+ import numpy as np
19
+ import torch
20
+ from transformers import (
21
+ CLIPImageProcessor,
22
+ CLIPTextModel,
23
+ CLIPTokenizer,
24
+ CLIPVisionModelWithProjection,
25
+ T5EncoderModel,
26
+ T5TokenizerFast,
27
+ )
28
+
29
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
30
+ from diffusers.loaders import FluxIPAdapterMixin, FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
31
+ from diffusers.models import AutoencoderKL, FluxTransformer2DModel
32
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
33
+ from diffusers.utils import (
34
+ USE_PEFT_BACKEND,
35
+ is_torch_xla_available,
36
+ logging,
37
+ replace_example_docstring,
38
+ scale_lora_layers,
39
+ unscale_lora_layers,
40
+ )
41
+ from diffusers.utils.torch_utils import randn_tensor
42
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
43
+ from diffusers.pipelines.flux.pipeline_output import FluxPipelineOutput
44
+
45
+
46
+ if is_torch_xla_available():
47
+ import torch_xla.core.xla_model as xm
48
+
49
+ XLA_AVAILABLE = True
50
+ else:
51
+ XLA_AVAILABLE = False
52
+
53
+
54
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
55
+
56
+ EXAMPLE_DOC_STRING = """
57
+ Examples:
58
+ ```py
59
+ >>> import torch
60
+ >>> from diffusers import FluxKontextPipeline
61
+ >>> from diffusers.utils import load_image
62
+
63
+ >>> pipe = FluxKontextPipeline.from_pretrained(
64
+ ... "black-forest-labs/FLUX.1-Kontext-dev", torch_dtype=torch.bfloat16
65
+ ... )
66
+ >>> pipe.to("cuda")
67
+
68
+ >>> image = load_image(
69
+ ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/yarn-art-pikachu.png"
70
+ ... ).convert("RGB")
71
+ >>> prompt = "Make Pikachu hold a sign that says 'Black Forest Labs is awesome', yarn art style, detailed, vibrant colors"
72
+ >>> image = pipe(
73
+ ... image=image,
74
+ ... prompt=prompt,
75
+ ... guidance_scale=2.5,
76
+ ... generator=torch.Generator().manual_seed(42),
77
+ ... ).images[0]
78
+ >>> image.save("output.png")
79
+ ```
80
+ """
81
+
82
+ PREFERRED_KONTEXT_RESOLUTIONS = [
83
+ (672, 1568),
84
+ (688, 1504),
85
+ (720, 1456),
86
+ (752, 1392),
87
+ (800, 1328),
88
+ (832, 1248),
89
+ (880, 1184),
90
+ (944, 1104),
91
+ (1024, 1024),
92
+ (1104, 944),
93
+ (1184, 880),
94
+ (1248, 832),
95
+ (1328, 800),
96
+ (1392, 752),
97
+ (1456, 720),
98
+ (1504, 688),
99
+ (1568, 672),
100
+ ]
101
+
102
+
103
+ def calculate_shift(
104
+ image_seq_len,
105
+ base_seq_len: int = 256,
106
+ max_seq_len: int = 4096,
107
+ base_shift: float = 0.5,
108
+ max_shift: float = 1.15,
109
+ ):
110
+ m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
111
+ b = base_shift - m * base_seq_len
112
+ mu = image_seq_len * m + b
113
+ return mu
114
+
115
+
116
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
117
+ def retrieve_timesteps(
118
+ scheduler,
119
+ num_inference_steps: Optional[int] = None,
120
+ device: Optional[Union[str, torch.device]] = None,
121
+ timesteps: Optional[List[int]] = None,
122
+ sigmas: Optional[List[float]] = None,
123
+ **kwargs,
124
+ ):
125
+ r"""
126
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
127
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
128
+
129
+ Args:
130
+ scheduler (`SchedulerMixin`):
131
+ The scheduler to get timesteps from.
132
+ num_inference_steps (`int`):
133
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
134
+ must be `None`.
135
+ device (`str` or `torch.device`, *optional*):
136
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
137
+ timesteps (`List[int]`, *optional*):
138
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
139
+ `num_inference_steps` and `sigmas` must be `None`.
140
+ sigmas (`List[float]`, *optional*):
141
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
142
+ `num_inference_steps` and `timesteps` must be `None`.
143
+
144
+ Returns:
145
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
146
+ second element is the number of inference steps.
147
+ """
148
+ if timesteps is not None and sigmas is not None:
149
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
150
+ if timesteps is not None:
151
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
152
+ if not accepts_timesteps:
153
+ raise ValueError(
154
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
155
+ f" timestep schedules. Please check whether you are using the correct scheduler."
156
+ )
157
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
158
+ timesteps = scheduler.timesteps
159
+ num_inference_steps = len(timesteps)
160
+ elif sigmas is not None:
161
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
162
+ if not accept_sigmas:
163
+ raise ValueError(
164
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
165
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
166
+ )
167
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
168
+ timesteps = scheduler.timesteps
169
+ num_inference_steps = len(timesteps)
170
+ else:
171
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
172
+ timesteps = scheduler.timesteps
173
+ return timesteps, num_inference_steps
174
+
175
+
176
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
177
+ def retrieve_latents(
178
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
179
+ ):
180
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
181
+ return encoder_output.latent_dist.sample(generator)
182
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
183
+ return encoder_output.latent_dist.mode()
184
+ elif hasattr(encoder_output, "latents"):
185
+ return encoder_output.latents
186
+ else:
187
+ raise AttributeError("Could not access latents of provided encoder_output")
188
+
189
+
190
+ class FluxKontextPipeline(
191
+ DiffusionPipeline,
192
+ FluxLoraLoaderMixin,
193
+ FromSingleFileMixin,
194
+ TextualInversionLoaderMixin,
195
+ FluxIPAdapterMixin,
196
+ ):
197
+ r"""
198
+ The Flux Kontext pipeline for image-to-image and text-to-image generation.
199
+
200
+ Reference: https://bfl.ai/announcements/flux-1-kontext-dev
201
+
202
+ Args:
203
+ transformer ([`FluxTransformer2DModel`]):
204
+ Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
205
+ scheduler ([`FlowMatchEulerDiscreteScheduler`]):
206
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
207
+ vae ([`AutoencoderKL`]):
208
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
209
+ text_encoder ([`CLIPTextModel`]):
210
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
211
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
212
+ text_encoder_2 ([`T5EncoderModel`]):
213
+ [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
214
+ the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
215
+ tokenizer (`CLIPTokenizer`):
216
+ Tokenizer of class
217
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
218
+ tokenizer_2 (`T5TokenizerFast`):
219
+ Second Tokenizer of class
220
+ [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
221
+ """
222
+
223
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->transformer->vae"
224
+ _optional_components = ["image_encoder", "feature_extractor"]
225
+ _callback_tensor_inputs = ["latents", "prompt_embeds"]
226
+
227
+ def __init__(
228
+ self,
229
+ scheduler: FlowMatchEulerDiscreteScheduler,
230
+ vae: AutoencoderKL,
231
+ text_encoder: CLIPTextModel,
232
+ tokenizer: CLIPTokenizer,
233
+ text_encoder_2: T5EncoderModel,
234
+ tokenizer_2: T5TokenizerFast,
235
+ transformer: FluxTransformer2DModel,
236
+ image_encoder: CLIPVisionModelWithProjection = None,
237
+ feature_extractor: CLIPImageProcessor = None,
238
+ ):
239
+ super().__init__()
240
+
241
+ self.register_modules(
242
+ vae=vae,
243
+ text_encoder=text_encoder,
244
+ text_encoder_2=text_encoder_2,
245
+ tokenizer=tokenizer,
246
+ tokenizer_2=tokenizer_2,
247
+ transformer=transformer,
248
+ scheduler=scheduler,
249
+ image_encoder=image_encoder,
250
+ feature_extractor=feature_extractor,
251
+ )
252
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
253
+ # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
254
+ # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
255
+ self.latent_channels = self.vae.config.latent_channels if getattr(self, "vae", None) else 16
256
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
257
+ self.tokenizer_max_length = (
258
+ self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
259
+ )
260
+ self.default_sample_size = 128
261
+
262
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_t5_prompt_embeds
263
+ def _get_t5_prompt_embeds(
264
+ self,
265
+ prompt: Union[str, List[str]] = None,
266
+ num_images_per_prompt: int = 1,
267
+ max_sequence_length: int = 512,
268
+ device: Optional[torch.device] = None,
269
+ dtype: Optional[torch.dtype] = None,
270
+ ):
271
+ device = device or self._execution_device
272
+ dtype = dtype or self.text_encoder.dtype
273
+
274
+ prompt = [prompt] if isinstance(prompt, str) else prompt
275
+ batch_size = len(prompt)
276
+
277
+ if isinstance(self, TextualInversionLoaderMixin):
278
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer_2)
279
+
280
+ text_inputs = self.tokenizer_2(
281
+ prompt,
282
+ padding="max_length",
283
+ max_length=max_sequence_length,
284
+ truncation=True,
285
+ return_length=False,
286
+ return_overflowing_tokens=False,
287
+ return_tensors="pt",
288
+ )
289
+ text_input_ids = text_inputs.input_ids
290
+ untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
291
+
292
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
293
+ removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
294
+ logger.warning(
295
+ "The following part of your input was truncated because `max_sequence_length` is set to "
296
+ f" {max_sequence_length} tokens: {removed_text}"
297
+ )
298
+
299
+ prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
300
+
301
+ dtype = self.text_encoder_2.dtype
302
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
303
+
304
+ _, seq_len, _ = prompt_embeds.shape
305
+
306
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
307
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
308
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
309
+
310
+ return prompt_embeds
311
+
312
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_clip_prompt_embeds
313
+ def _get_clip_prompt_embeds(
314
+ self,
315
+ prompt: Union[str, List[str]],
316
+ num_images_per_prompt: int = 1,
317
+ device: Optional[torch.device] = None,
318
+ ):
319
+ device = device or self._execution_device
320
+
321
+ prompt = [prompt] if isinstance(prompt, str) else prompt
322
+ batch_size = len(prompt)
323
+
324
+ if isinstance(self, TextualInversionLoaderMixin):
325
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
326
+
327
+ text_inputs = self.tokenizer(
328
+ prompt,
329
+ padding="max_length",
330
+ max_length=self.tokenizer_max_length,
331
+ truncation=True,
332
+ return_overflowing_tokens=False,
333
+ return_length=False,
334
+ return_tensors="pt",
335
+ )
336
+
337
+ text_input_ids = text_inputs.input_ids
338
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
339
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
340
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
341
+ logger.warning(
342
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
343
+ f" {self.tokenizer_max_length} tokens: {removed_text}"
344
+ )
345
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)
346
+
347
+ # Use pooled output of CLIPTextModel
348
+ prompt_embeds = prompt_embeds.pooler_output
349
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
350
+
351
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
352
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
353
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
354
+
355
+ return prompt_embeds
356
+
357
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.encode_prompt
358
+ def encode_prompt(
359
+ self,
360
+ prompt: Union[str, List[str]],
361
+ prompt_2: Union[str, List[str]],
362
+ device: Optional[torch.device] = None,
363
+ num_images_per_prompt: int = 1,
364
+ prompt_embeds: Optional[torch.FloatTensor] = None,
365
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
366
+ max_sequence_length: int = 512,
367
+ lora_scale: Optional[float] = None,
368
+ ):
369
+ r"""
370
+
371
+ Args:
372
+ prompt (`str` or `List[str]`, *optional*):
373
+ prompt to be encoded
374
+ prompt_2 (`str` or `List[str]`, *optional*):
375
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
376
+ used in all text-encoders
377
+ device: (`torch.device`):
378
+ torch device
379
+ num_images_per_prompt (`int`):
380
+ number of images that should be generated per prompt
381
+ prompt_embeds (`torch.FloatTensor`, *optional*):
382
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
383
+ provided, text embeddings will be generated from `prompt` input argument.
384
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
385
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
386
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
387
+ lora_scale (`float`, *optional*):
388
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
389
+ """
390
+ device = device or self._execution_device
391
+
392
+ # set lora scale so that monkey patched LoRA
393
+ # function of text encoder can correctly access it
394
+ if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
395
+ self._lora_scale = lora_scale
396
+
397
+ # dynamically adjust the LoRA scale
398
+ if self.text_encoder is not None and USE_PEFT_BACKEND:
399
+ scale_lora_layers(self.text_encoder, lora_scale)
400
+ if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
401
+ scale_lora_layers(self.text_encoder_2, lora_scale)
402
+
403
+ prompt = [prompt] if isinstance(prompt, str) else prompt
404
+
405
+ if prompt_embeds is None:
406
+ prompt_2 = prompt_2 or prompt
407
+ prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
408
+
409
+ # We only use the pooled prompt output from the CLIPTextModel
410
+ pooled_prompt_embeds = self._get_clip_prompt_embeds(
411
+ prompt=prompt,
412
+ device=device,
413
+ num_images_per_prompt=num_images_per_prompt,
414
+ )
415
+ prompt_embeds = self._get_t5_prompt_embeds(
416
+ prompt=prompt_2,
417
+ num_images_per_prompt=num_images_per_prompt,
418
+ max_sequence_length=max_sequence_length,
419
+ device=device,
420
+ )
421
+
422
+ if self.text_encoder is not None:
423
+ if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
424
+ # Retrieve the original scale by scaling back the LoRA layers
425
+ unscale_lora_layers(self.text_encoder, lora_scale)
426
+
427
+ if self.text_encoder_2 is not None:
428
+ if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
429
+ # Retrieve the original scale by scaling back the LoRA layers
430
+ unscale_lora_layers(self.text_encoder_2, lora_scale)
431
+
432
+ dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
433
+ text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
434
+
435
+ return prompt_embeds, pooled_prompt_embeds, text_ids
436
+
437
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.encode_image
438
+ def encode_image(self, image, device, num_images_per_prompt):
439
+ dtype = next(self.image_encoder.parameters()).dtype
440
+
441
+ if not isinstance(image, torch.Tensor):
442
+ image = self.feature_extractor(image, return_tensors="pt").pixel_values
443
+
444
+ image = image.to(device=device, dtype=dtype)
445
+ image_embeds = self.image_encoder(image).image_embeds
446
+ image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
447
+ return image_embeds
448
+
449
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.prepare_ip_adapter_image_embeds
450
+ def prepare_ip_adapter_image_embeds(
451
+ self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
452
+ ):
453
+ image_embeds = []
454
+ if ip_adapter_image_embeds is None:
455
+ if not isinstance(ip_adapter_image, list):
456
+ ip_adapter_image = [ip_adapter_image]
457
+
458
+ if len(ip_adapter_image) != self.transformer.encoder_hid_proj.num_ip_adapters:
459
+ raise ValueError(
460
+ f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {self.transformer.encoder_hid_proj.num_ip_adapters} IP Adapters."
461
+ )
462
+
463
+ for single_ip_adapter_image in ip_adapter_image:
464
+ single_image_embeds = self.encode_image(single_ip_adapter_image, device, 1)
465
+ image_embeds.append(single_image_embeds[None, :])
466
+ else:
467
+ if not isinstance(ip_adapter_image_embeds, list):
468
+ ip_adapter_image_embeds = [ip_adapter_image_embeds]
469
+
470
+ if len(ip_adapter_image_embeds) != self.transformer.encoder_hid_proj.num_ip_adapters:
471
+ raise ValueError(
472
+ f"`ip_adapter_image_embeds` must have same length as the number of IP Adapters. Got {len(ip_adapter_image_embeds)} image embeds and {self.transformer.encoder_hid_proj.num_ip_adapters} IP Adapters."
473
+ )
474
+
475
+ for single_image_embeds in ip_adapter_image_embeds:
476
+ image_embeds.append(single_image_embeds)
477
+
478
+ ip_adapter_image_embeds = []
479
+ for single_image_embeds in image_embeds:
480
+ single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
481
+ single_image_embeds = single_image_embeds.to(device=device)
482
+ ip_adapter_image_embeds.append(single_image_embeds)
483
+
484
+ return ip_adapter_image_embeds
485
+
486
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.check_inputs
487
+ def check_inputs(
488
+ self,
489
+ prompt,
490
+ prompt_2,
491
+ height,
492
+ width,
493
+ negative_prompt=None,
494
+ negative_prompt_2=None,
495
+ prompt_embeds=None,
496
+ negative_prompt_embeds=None,
497
+ pooled_prompt_embeds=None,
498
+ negative_pooled_prompt_embeds=None,
499
+ callback_on_step_end_tensor_inputs=None,
500
+ max_sequence_length=None,
501
+ ):
502
+ if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
503
+ logger.warning(
504
+ f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
505
+ )
506
+
507
+ if callback_on_step_end_tensor_inputs is not None and not all(
508
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
509
+ ):
510
+ raise ValueError(
511
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
512
+ )
513
+
514
+ if prompt is not None and prompt_embeds is not None:
515
+ raise ValueError(
516
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
517
+ " only forward one of the two."
518
+ )
519
+ elif prompt_2 is not None and prompt_embeds is not None:
520
+ raise ValueError(
521
+ f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
522
+ " only forward one of the two."
523
+ )
524
+ elif prompt is None and prompt_embeds is None:
525
+ raise ValueError(
526
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
527
+ )
528
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
529
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
530
+ elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
531
+ raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
532
+
533
+ if negative_prompt is not None and negative_prompt_embeds is not None:
534
+ raise ValueError(
535
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
536
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
537
+ )
538
+ elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
539
+ raise ValueError(
540
+ f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
541
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
542
+ )
543
+
544
+ if prompt_embeds is not None and pooled_prompt_embeds is None:
545
+ raise ValueError(
546
+ "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
547
+ )
548
+ if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
549
+ raise ValueError(
550
+ "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
551
+ )
552
+
553
+ if max_sequence_length is not None and max_sequence_length > 512:
554
+ raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
555
+
556
+ @staticmethod
557
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._prepare_latent_image_ids
558
+ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
559
+ latent_image_ids = torch.zeros(height, width, 3)
560
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
561
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]
562
+
563
+ latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
564
+
565
+ latent_image_ids = latent_image_ids.reshape(
566
+ latent_image_id_height * latent_image_id_width, latent_image_id_channels
567
+ )
568
+
569
+ return latent_image_ids.to(device=device, dtype=dtype)
570
+
571
+ @staticmethod
572
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._pack_latents
573
+ def _pack_latents(latents, batch_size, num_channels_latents, height, width):
574
+ latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
575
+ latents = latents.permute(0, 2, 4, 1, 3, 5)
576
+ latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
577
+
578
+ return latents
579
+
580
+ @staticmethod
581
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._unpack_latents
582
+ def _unpack_latents(latents, height, width, vae_scale_factor):
583
+ batch_size, num_patches, channels = latents.shape
584
+
585
+ # VAE applies 8x compression on images but we must also account for packing which requires
586
+ # latent height and width to be divisible by 2.
587
+ height = 2 * (int(height) // (vae_scale_factor * 2))
588
+ width = 2 * (int(width) // (vae_scale_factor * 2))
589
+
590
+ latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
591
+ latents = latents.permute(0, 3, 1, 4, 2, 5)
592
+
593
+ latents = latents.reshape(batch_size, channels // (2 * 2), height, width)
594
+
595
+ return latents
596
+
597
+ def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
598
+ if isinstance(generator, list):
599
+ image_latents = [
600
+ retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i], sample_mode="argmax")
601
+ for i in range(image.shape[0])
602
+ ]
603
+ image_latents = torch.cat(image_latents, dim=0)
604
+ else:
605
+ image_latents = retrieve_latents(self.vae.encode(image), generator=generator, sample_mode="argmax")
606
+
607
+ image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
608
+
609
+ return image_latents
610
+
611
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.enable_vae_slicing
612
+ def enable_vae_slicing(self):
613
+ r"""
614
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
615
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
616
+ """
617
+ self.vae.enable_slicing()
618
+
619
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.disable_vae_slicing
620
+ def disable_vae_slicing(self):
621
+ r"""
622
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
623
+ computing decoding in one step.
624
+ """
625
+ self.vae.disable_slicing()
626
+
627
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.enable_vae_tiling
628
+ def enable_vae_tiling(self):
629
+ r"""
630
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
631
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
632
+ processing larger images.
633
+ """
634
+ self.vae.enable_tiling()
635
+
636
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.disable_vae_tiling
637
+ def disable_vae_tiling(self):
638
+ r"""
639
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
640
+ computing decoding in one step.
641
+ """
642
+ self.vae.disable_tiling()
643
+
644
+ def prepare_latents(
645
+ self,
646
+ images: Optional[torch.Tensor],
647
+ batch_size: int,
648
+ num_channels_latents: int,
649
+ height: int,
650
+ width: int,
651
+ dtype: torch.dtype,
652
+ device: torch.device,
653
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
654
+ latents: Optional[torch.Tensor] = None,
655
+ ):
656
+ if isinstance(generator, list) and len(generator) != batch_size:
657
+ raise ValueError(
658
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
659
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
660
+ )
661
+
662
+ # VAE applies 8x compression on images but we must also account for packing which requires
663
+ # latent height and width to be divisible by 2.
664
+ height = 2 * (int(height) // (self.vae_scale_factor * 2))
665
+ width = 2 * (int(width) // (self.vae_scale_factor * 2))
666
+ shape = (batch_size, num_channels_latents, height, width)
667
+ h_offset = 0
668
+ w_offset = 0
669
+ image_latents = image_ids = None
670
+ if images is not None:
671
+ tp_image_latents = []
672
+ tp_image_ids = []
673
+ for i, image in enumerate(images):
674
+ image = image.to(device=device, dtype=dtype)
675
+ if image.shape[1] != self.latent_channels:
676
+ image_latents = self._encode_vae_image(image=image, generator=generator)
677
+ else:
678
+ image_latents = image
679
+ if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
680
+ # expand init_latents for batch_size
681
+ additional_image_per_prompt = batch_size // image_latents.shape[0]
682
+ image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
683
+ elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
684
+ raise ValueError(
685
+ f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
686
+ )
687
+ else:
688
+ image_latents = torch.cat([image_latents], dim=0)
689
+
690
+ image_latent_height, image_latent_width = image_latents.shape[2:]
691
+ image_latents = self._pack_latents(
692
+ image_latents, batch_size, num_channels_latents, image_latent_height, image_latent_width
693
+ )
694
+ image_ids = self._prepare_latent_image_ids(
695
+ batch_size, image_latent_height // 2, image_latent_width // 2, device, dtype
696
+ )
697
+ # image ids are the same as latent ids with the first dimension set to 1 instead of 0
698
+ # image_ids[..., 0] = 0.9+i*0.1
699
+ image_ids[..., 0] = i+1
700
+ # image_ids[..., 1] += h_offset
701
+ image_ids[..., 2] += w_offset
702
+ tp_image_latents.append(image_latents)
703
+ tp_image_ids.append(image_ids)
704
+ h_offset += image_latent_height //2
705
+ w_offset += image_latent_width //2
706
+ image_latents = torch.cat(tp_image_latents, dim=1)
707
+ image_ids = torch.cat(tp_image_ids, dim=0)
708
+
709
+ latent_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
710
+
711
+ if latents is None:
712
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
713
+ latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
714
+ else:
715
+ latents = latents.to(device=device, dtype=dtype)
716
+
717
+ return latents, image_latents, latent_ids, image_ids
718
+
719
+ @property
720
+ def guidance_scale(self):
721
+ return self._guidance_scale
722
+
723
+ @property
724
+ def joint_attention_kwargs(self):
725
+ return self._joint_attention_kwargs
726
+
727
+ @property
728
+ def num_timesteps(self):
729
+ return self._num_timesteps
730
+
731
+ @property
732
+ def current_timestep(self):
733
+ return self._current_timestep
734
+
735
+ @property
736
+ def interrupt(self):
737
+ return self._interrupt
738
+
739
+ @torch.no_grad()
740
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
741
+ def __call__(
742
+ self,
743
+ images: Optional[List[PipelineImageInput]] = None,
744
+ prompt: Union[str, List[str]] = None,
745
+ prompt_2: Optional[Union[str, List[str]]] = None,
746
+ negative_prompt: Union[str, List[str]] = None,
747
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
748
+ true_cfg_scale: float = 1.0,
749
+ height: Optional[int] = None,
750
+ width: Optional[int] = None,
751
+ num_inference_steps: int = 28,
752
+ sigmas: Optional[List[float]] = None,
753
+ guidance_scale: float = 3.5,
754
+ num_images_per_prompt: Optional[int] = 1,
755
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
756
+ latents: Optional[torch.FloatTensor] = None,
757
+ prompt_embeds: Optional[torch.FloatTensor] = None,
758
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
759
+ ip_adapter_image: Optional[PipelineImageInput] = None,
760
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
761
+ negative_ip_adapter_image: Optional[PipelineImageInput] = None,
762
+ negative_ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
763
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
764
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
765
+ output_type: Optional[str] = "pil",
766
+ return_dict: bool = True,
767
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
768
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
769
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
770
+ max_sequence_length: int = 512,
771
+ max_area: int = 1024**2,
772
+ _auto_resize: bool = True,
773
+ ):
774
+ r"""
775
+ Function invoked when calling the pipeline for generation.
776
+
777
+ Args:
778
+ images (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`, *optional*):
779
+ The reference image(s), as PIL images, numpy arrays or tensors, used to condition the generation. For both
780
+ numpy arrays and pytorch tensors, the expected value range is between `[0, 1]`. If it's a tensor or a list
781
+ of tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
782
+ list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)`. Image latents can also be
783
+ passed as `images`; latents passed directly are not encoded again.
784
+ prompt (`str` or `List[str]`, *optional*):
785
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
786
+ instead.
787
+ prompt_2 (`str` or `List[str]`, *optional*):
788
+ The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt`
789
+ will be used instead.
790
+ negative_prompt (`str` or `List[str]`, *optional*):
791
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
792
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is
793
+ not greater than `1`).
794
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
795
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
796
+ `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
797
+ true_cfg_scale (`float`, *optional*, defaults to 1.0):
798
+ When > 1.0 and a `negative_prompt` is provided, true classifier-free guidance is enabled.
799
+ height (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
800
+ The height in pixels of the generated image. This is set to 1024 by default for the best results.
801
+ width (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
802
+ The width in pixels of the generated image. This is set to 1024 by default for the best results.
803
+ num_inference_steps (`int`, *optional*, defaults to 28):
804
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
805
+ expense of slower inference.
806
+ sigmas (`List[float]`, *optional*):
807
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
808
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
809
+ will be used.
810
+ guidance_scale (`float`, *optional*, defaults to 3.5):
811
+ Guidance scale as defined in [Classifier-Free Diffusion
812
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2
813
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
814
+ `guidance_scale > 1`. A higher guidance scale encourages the model to generate images that are closely linked to
815
+ the text `prompt`, usually at the expense of lower image quality.
816
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
817
+ The number of images to generate per prompt.
818
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
819
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
820
+ to make generation deterministic.
821
+ latents (`torch.FloatTensor`, *optional*):
822
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
823
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
824
+ tensor will be generated by sampling using the supplied random `generator`.
825
+ prompt_embeds (`torch.FloatTensor`, *optional*):
826
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
827
+ provided, text embeddings will be generated from `prompt` input argument.
828
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
829
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
830
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
831
+ ip_adapter_image (`PipelineImageInput`, *optional*):
832
+ Optional image input to work with IP Adapters.
833
+ ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
834
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
835
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
836
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
837
+ negative_ip_adapter_image (`PipelineImageInput`, *optional*):
838
+ Optional image input to work with IP Adapters.
839
+ negative_ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
840
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
841
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
842
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
843
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
844
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
845
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
846
+ argument.
847
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
848
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
849
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
850
+ input argument.
851
+ output_type (`str`, *optional*, defaults to `"pil"`):
852
+ The output format of the generated image. Choose between
853
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
854
+ return_dict (`bool`, *optional*, defaults to `True`):
855
+ Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
856
+ joint_attention_kwargs (`dict`, *optional*):
857
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
858
+ `self.processor` in
859
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
860
+ callback_on_step_end (`Callable`, *optional*):
861
+ A function that is called at the end of each denoising step during inference. The function is called
862
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
863
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
864
+ `callback_on_step_end_tensor_inputs`.
865
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
866
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
867
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
868
+ `._callback_tensor_inputs` attribute of your pipeline class.
869
+ max_sequence_length (`int` defaults to 512):
870
+ Maximum sequence length to use with the `prompt`.
871
+ max_area (`int`, defaults to `1024 ** 2`):
872
+ The maximum area of the generated image in pixels. The height and width will be adjusted to fit this
873
+ area while maintaining the aspect ratio.
874
+
875
+ Examples:
876
+
877
+ Returns:
878
+ [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
879
+ is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
880
+ images.
881
+ """
882
+
883
+ height = height or self.default_sample_size * self.vae_scale_factor
884
+ width = width or self.default_sample_size * self.vae_scale_factor
885
+
886
+ original_height, original_width = height, width
887
+ aspect_ratio = width / height
888
+ width = round((max_area * aspect_ratio) ** 0.5)
889
+ height = round((max_area / aspect_ratio) ** 0.5)
890
+
891
+ multiple_of = self.vae_scale_factor * 2
892
+ width = width // multiple_of * multiple_of
893
+ height = height // multiple_of * multiple_of
894
+
895
+ if height != original_height or width != original_width:
896
+ logger.warning(
897
+ f"Generation `height` and `width` have been adjusted to {height} and {width} to fit the model requirements."
898
+ )
899
+
900
+ # 1. Check inputs. Raise error if not correct
901
+ self.check_inputs(
902
+ prompt,
903
+ prompt_2,
904
+ height,
905
+ width,
906
+ negative_prompt=negative_prompt,
907
+ negative_prompt_2=negative_prompt_2,
908
+ prompt_embeds=prompt_embeds,
909
+ negative_prompt_embeds=negative_prompt_embeds,
910
+ pooled_prompt_embeds=pooled_prompt_embeds,
911
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
912
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
913
+ max_sequence_length=max_sequence_length,
914
+ )
915
+
916
+ self._guidance_scale = guidance_scale
917
+ self._joint_attention_kwargs = joint_attention_kwargs
918
+ self._current_timestep = None
919
+ self._interrupt = False
920
+
921
+ # 2. Define call parameters
922
+ if prompt is not None and isinstance(prompt, str):
923
+ batch_size = 1
924
+ elif prompt is not None and isinstance(prompt, list):
925
+ batch_size = len(prompt)
926
+ else:
927
+ batch_size = prompt_embeds.shape[0]
928
+
929
+ device = self._execution_device
930
+
931
+ lora_scale = (
932
+ self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
933
+ )
934
+ has_neg_prompt = negative_prompt is not None or (
935
+ negative_prompt_embeds is not None and negative_pooled_prompt_embeds is not None
936
+ )
937
+ do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
938
+ (
939
+ prompt_embeds,
940
+ pooled_prompt_embeds,
941
+ text_ids,
942
+ ) = self.encode_prompt(
943
+ prompt=prompt,
944
+ prompt_2=prompt_2,
945
+ prompt_embeds=prompt_embeds,
946
+ pooled_prompt_embeds=pooled_prompt_embeds,
947
+ device=device,
948
+ num_images_per_prompt=num_images_per_prompt,
949
+ max_sequence_length=max_sequence_length,
950
+ lora_scale=lora_scale,
951
+ )
952
+ if do_true_cfg:
953
+ (
954
+ negative_prompt_embeds,
955
+ negative_pooled_prompt_embeds,
956
+ negative_text_ids,
957
+ ) = self.encode_prompt(
958
+ prompt=negative_prompt,
959
+ prompt_2=negative_prompt_2,
960
+ prompt_embeds=negative_prompt_embeds,
961
+ pooled_prompt_embeds=negative_pooled_prompt_embeds,
962
+ device=device,
963
+ num_images_per_prompt=num_images_per_prompt,
964
+ max_sequence_length=max_sequence_length,
965
+ lora_scale=lora_scale,
966
+ )
967
+
968
+ # 3. Preprocess image
969
+ if images is not None and not (isinstance(images[0], torch.Tensor) and images[0].size(1) == self.latent_channels):
970
+ tp_images = []
971
+ for img in images:
972
+ image = img
973
+ image_height, image_width = self.image_processor.get_default_height_width(img)
974
+ aspect_ratio = image_width / image_height
975
+ if _auto_resize:
976
+ # Kontext is trained on specific resolutions, using one of them is recommended
977
+ _, image_width, image_height = min(
978
+ (abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_KONTEXT_RESOLUTIONS
979
+ )
980
+ image_width = image_width // multiple_of * multiple_of
981
+ image_height = image_height // multiple_of * multiple_of
982
+ image = self.image_processor.resize(image, image_height, image_width)
983
+ image = self.image_processor.preprocess(image, image_height, image_width)
984
+ tp_images.append(image)
985
+ images = tp_images
986
+
987
+ # 4. Prepare latent variables
988
+ num_channels_latents = self.transformer.config.in_channels // 4
989
+ latents, image_latents, latent_ids, image_ids = self.prepare_latents(
990
+ images,
991
+ batch_size * num_images_per_prompt,
992
+ num_channels_latents,
993
+ height,
994
+ width,
995
+ prompt_embeds.dtype,
996
+ device,
997
+ generator,
998
+ latents,
999
+ )
1000
+ if image_ids is not None:
1001
+ latent_ids = torch.cat([latent_ids, image_ids], dim=0) # dim 0 is sequence dimension
1002
+
1003
+ # 5. Prepare timesteps
1004
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
1005
+ image_seq_len = latents.shape[1]
1006
+ mu = calculate_shift(
1007
+ image_seq_len,
1008
+ self.scheduler.config.get("base_image_seq_len", 256),
1009
+ self.scheduler.config.get("max_image_seq_len", 4096),
1010
+ self.scheduler.config.get("base_shift", 0.5),
1011
+ self.scheduler.config.get("max_shift", 1.15),
1012
+ )
1013
+ timesteps, num_inference_steps = retrieve_timesteps(
1014
+ self.scheduler,
1015
+ num_inference_steps,
1016
+ device,
1017
+ sigmas=sigmas,
1018
+ mu=mu,
1019
+ )
1020
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
1021
+ self._num_timesteps = len(timesteps)
1022
+
1023
+ # handle guidance
1024
+ if self.transformer.config.guidance_embeds:
1025
+ guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
1026
+ guidance = guidance.expand(latents.shape[0])
1027
+ else:
1028
+ guidance = None
1029
+
1030
+ if (ip_adapter_image is not None or ip_adapter_image_embeds is not None) and (
1031
+ negative_ip_adapter_image is None and negative_ip_adapter_image_embeds is None
1032
+ ):
1033
+ negative_ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
1034
+ negative_ip_adapter_image = [negative_ip_adapter_image] * self.transformer.encoder_hid_proj.num_ip_adapters
1035
+
1036
+ elif (ip_adapter_image is None and ip_adapter_image_embeds is None) and (
1037
+ negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None
1038
+ ):
1039
+ ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
1040
+ ip_adapter_image = [ip_adapter_image] * self.transformer.encoder_hid_proj.num_ip_adapters
1041
+
1042
+ if self.joint_attention_kwargs is None:
1043
+ self._joint_attention_kwargs = {}
1044
+
1045
+ image_embeds = None
1046
+ negative_image_embeds = None
1047
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
1048
+ image_embeds = self.prepare_ip_adapter_image_embeds(
1049
+ ip_adapter_image,
1050
+ ip_adapter_image_embeds,
1051
+ device,
1052
+ batch_size * num_images_per_prompt,
1053
+ )
1054
+ if negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None:
1055
+ negative_image_embeds = self.prepare_ip_adapter_image_embeds(
1056
+ negative_ip_adapter_image,
1057
+ negative_ip_adapter_image_embeds,
1058
+ device,
1059
+ batch_size * num_images_per_prompt,
1060
+ )
1061
+
1062
+ # 6. Denoising loop
1063
+ # We set the index here to remove DtoH sync, helpful especially during compilation.
1064
+ # Check out more details here: https://github.com/huggingface/diffusers/pull/11696
1065
+ self.scheduler.set_begin_index(0)
1066
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
1067
+ for i, t in enumerate(timesteps):
1068
+ if self.interrupt:
1069
+ continue
1070
+
1071
+ self._current_timestep = t
1072
+ if image_embeds is not None:
1073
+ self._joint_attention_kwargs["ip_adapter_image_embeds"] = image_embeds
1074
+
1075
+ latent_model_input = latents
1076
+ if image_latents is not None:
1077
+ latent_model_input = torch.cat([latents, image_latents], dim=1)
1078
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
1079
+
1080
+ noise_pred = self.transformer(
1081
+ hidden_states=latent_model_input,
1082
+ timestep=timestep / 1000,
1083
+ guidance=guidance,
1084
+ pooled_projections=pooled_prompt_embeds,
1085
+ encoder_hidden_states=prompt_embeds,
1086
+ txt_ids=text_ids,
1087
+ img_ids=latent_ids,
1088
+ joint_attention_kwargs=self.joint_attention_kwargs,
1089
+ return_dict=False,
1090
+ )[0]
1091
+ noise_pred = noise_pred[:, : latents.size(1)]
1092
+
1093
+ if do_true_cfg:
1094
+ if negative_image_embeds is not None:
1095
+ self._joint_attention_kwargs["ip_adapter_image_embeds"] = negative_image_embeds
1096
+ neg_noise_pred = self.transformer(
1097
+ hidden_states=latent_model_input,
1098
+ timestep=timestep / 1000,
1099
+ guidance=guidance,
1100
+ pooled_projections=negative_pooled_prompt_embeds,
1101
+ encoder_hidden_states=negative_prompt_embeds,
1102
+ txt_ids=negative_text_ids,
1103
+ img_ids=latent_ids,
1104
+ joint_attention_kwargs=self.joint_attention_kwargs,
1105
+ return_dict=False,
1106
+ )[0]
1107
+ neg_noise_pred = neg_noise_pred[:, : latents.size(1)]
1108
+ noise_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)
1109
+
1110
+ # compute the previous noisy sample x_t -> x_t-1
1111
+ latents_dtype = latents.dtype
1112
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
1113
+
1114
+ if latents.dtype != latents_dtype:
1115
+ if torch.backends.mps.is_available():
1116
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
1117
+ latents = latents.to(latents_dtype)
1118
+
1119
+ if callback_on_step_end is not None:
1120
+ callback_kwargs = {}
1121
+ for k in callback_on_step_end_tensor_inputs:
1122
+ callback_kwargs[k] = locals()[k]
1123
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1124
+
1125
+ latents = callback_outputs.pop("latents", latents)
1126
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1127
+
1128
+ # call the callback, if provided
1129
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1130
+ progress_bar.update()
1131
+
1132
+ if XLA_AVAILABLE:
1133
+ xm.mark_step()
1134
+
1135
+ self._current_timestep = None
1136
+
1137
+ if output_type == "latent":
1138
+ image = latents
1139
+ else:
1140
+ latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
1141
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
1142
+ image = self.vae.decode(latents, return_dict=False)[0]
1143
+ image = self.image_processor.postprocess(image, output_type=output_type)
1144
+
1145
+ # Offload all models
1146
+ self.maybe_free_model_hooks()
1147
+
1148
+ if not return_dict:
1149
+ return (image,)
1150
+
1151
+ return FluxPipelineOutput(images=image)
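Note: a minimal sketch of how the multi-image `__call__` above is driven, mirroring the `pipe(...)` invocation in `web_edit.py` below (the reference-image paths are placeholders, LoRA loading is omitted, and the prompt is taken from the bundled editing examples):

import torch
from diffusers.utils import load_image
from pipeline_flux_kontext import FluxKontextPipeline

# Load the base FLUX.1-Kontext-dev weights and move the pipeline to the GPU.
pipe = FluxKontextPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-Kontext-dev", torch_dtype=torch.bfloat16
).to("cuda")

# Both reference images are packed into one conditioning sequence by prepare_latents();
# each image's ids get index i + 1 in the first coordinate so the transformer can tell them apart.
refs = [load_image("ref_0.jpg"), load_image("ref_1.jpg")]  # placeholder paths

result = pipe(
    images=refs,
    prompt="Replace the lantern in the first image with the dog in the second image.",
    height=1024,
    width=1024,
    num_inference_steps=30,
    guidance_scale=3.5,
).images[0]
result.save("result.png")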
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ timm
2
+ ujson
3
+ peft
4
+ datasets
5
+ transformers
6
+ opencv-python
7
+ qwen-vl-utils
8
+ lmdb
9
+ diffusers
10
+ numpy
11
+ gradio
script.sh ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ CUDA_VISIBLE_DEVICES=3 GRADIO_TEMP_DIR=gradio_tmp python web_edit.py \
2
+ --vlm_path /gpfs/bhpeng/generation/do2/vlm-model \
3
+ --edit_lora_path /gpfs/bhpeng/generation/do2/edit_lora \
4
+ --server_name "0.0.0.0" \
5
+ --server_port 7869
6
+
7
+
8
+ CUDA_VISIBLE_DEVICES=1 python web_generate.py \
9
+ --vlm_path /gpfs/bhpeng/generation/do2/vlm-model \
10
+ --gen_lora_path /gpfs/bhpeng/generation/do2/gen_lora \
11
+ --server_name "0.0.0.0" \
12
+ --server_port 7861
web_edit.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from pipeline_flux_kontext import FluxKontextPipeline
3
+ from diffusers.utils import load_image
4
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
5
+ from qwen_vl_utils import process_vision_info
6
+ import os
7
+ import re
8
+ from PIL import Image
9
+ import gradio as gr
10
+ import uuid
11
+ import argparse
12
+
13
+ def parse_args():
14
+ """Parses command-line arguments for model paths and server configuration."""
15
+ parser = argparse.ArgumentParser(description="Launch DreamOmni2 Editing Gradio Demo.")
16
+ parser.add_argument(
17
+ "--vlm_path",
18
+ type=str,
19
+ default="vlm-model",
20
+ help="Path to the Qwen2_5_VL VLM model directory."
21
+ )
22
+ parser.add_argument(
23
+ "--edit_lora_path",
24
+ type=str,
25
+ default="edit_lora",
26
+ help="Path to the FLUX.1-Kontext editing LoRA weights directory."
27
+ )
28
+ parser.add_argument(
29
+ "--server_name",
30
+ type=str,
31
+ default="0.0.0.0",
32
+ help="The server name (IP address) to host the Gradio demo."
33
+ )
34
+ parser.add_argument(
35
+ "--server_port",
36
+ type=int,
37
+ default=7860,
38
+ help="The port number to host the Gradio demo."
39
+ )
40
+ args = parser.parse_args()
41
+ return args
42
+
43
+ ARGS = parse_args()
44
+ vlm_path = ARGS.vlm_path
45
+ edit_lora_path = ARGS.edit_lora_path
46
+ server_name = ARGS.server_name
47
+ server_port = ARGS.server_port
48
+ device = "cuda"
49
+
50
+ def extract_gen_content(text):
51
+ text = text[6:-7]
52
+ return text
53
+
54
+ print(f"Loading models from vlm_path: {vlm_path}, edit_lora_path: {edit_lora_path}")
55
+
56
+ pipe = FluxKontextPipeline.from_pretrained(
57
+ "black-forest-labs/FLUX.1-Kontext-dev",
58
+ torch_dtype=torch.bfloat16
59
+ )
60
+ pipe.to(device)
61
+ pipe.load_lora_weights(edit_lora_path, adapter_name="edit")
62
+ pipe.set_adapters(["edit"], adapter_weights=[1])
63
+
64
+ vlm_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
65
+ vlm_path,
66
+ torch_dtype="bfloat16",
67
+ device_map="cuda"
68
+ )
69
+ processor = AutoProcessor.from_pretrained(vlm_path)
70
+
71
+
72
+ def infer_vlm(input_img_path, input_instruction, prefix):
73
+ if not vlm_model or not processor:
74
+ raise gr.Error("VLM Model not loaded. Cannot process prompt.")
75
+ tp = []
76
+ for path in input_img_path:
77
+ tp.append({"type": "image", "image": path})
78
+ tp.append({"type": "text", "text": input_instruction + prefix})
79
+ messages = [{"role": "user", "content": tp}]
80
+
81
+ text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
82
+ image_inputs, video_inputs = process_vision_info(messages)
83
+ inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
84
+ inputs = inputs.to("cuda")
85
+
86
+ generated_ids = vlm_model.generate(**inputs, do_sample=False, max_new_tokens=4096)
87
+ generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
88
+ output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
89
+ return output_text[0]
90
+
91
+ PREFERRED_KONTEXT_RESOLUTIONS = [
92
+ (672, 1568),
93
+ (688, 1504),
94
+ (720, 1456),
95
+ (752, 1392),
96
+ (800, 1328),
97
+ (832, 1248),
98
+ (880, 1184),
99
+ (944, 1104),
100
+ (1024, 1024),
101
+ (1104, 944),
102
+ (1184, 880),
103
+ (1248, 832),
104
+ (1328, 800),
105
+ (1392, 752),
106
+ (1456, 720),
107
+ (1504, 688),
108
+ (1568, 672),
109
+ ]
110
+ def find_closest_resolution(width, height, preferred_resolutions):
111
+ input_ratio = width / height
112
+ closest_resolution = min(
113
+ preferred_resolutions,
114
+ key=lambda res: abs((res[0] / res[1]) - input_ratio)
115
+ )
116
+ return closest_resolution
117
+
118
+ def perform_edit(input_img_paths, input_instruction, output_path):
119
+ prefix = " It is editing task."
120
+ source_imgs = [load_image(path) for path in input_img_paths]
121
+ resized_imgs = []
122
+ for img in source_imgs:
123
+ target_resolution = find_closest_resolution(img.width, img.height, PREFERRED_KONTEXT_RESOLUTIONS)
124
+ resized_img = img.resize(target_resolution, Image.LANCZOS)
125
+ resized_imgs.append(resized_img)
126
+ prompt = infer_vlm(input_img_paths, input_instruction, prefix)
127
+ prompt = extract_gen_content(prompt)
128
+ print(f"Prompt generated by VLM: {prompt}")
129
+
130
+ image = pipe(
131
+ images=resized_imgs,
132
+ height=resized_imgs[0].height,
133
+ width=resized_imgs[0].width,
134
+ prompt=prompt,
135
+ num_inference_steps=30,
136
+ guidance_scale=3.5,
137
+ ).images[0]
138
+ image.save(output_path)
139
+ print(f"Edit result saved to {output_path}")
140
+
141
+
142
+ def process_request(image_file_1, image_file_2, instruction):
143
+ # debugpy.listen(5678)
144
+ # print("Waiting for debugger attach...")
145
+ # debugpy.wait_for_client()
146
+ if not image_file_1 or not image_file_2:
147
+ raise gr.Error("Please upload both images.")
148
+ if not instruction:
149
+ raise gr.Error("Please provide an instruction.")
150
+ if not pipe or not vlm_model:
151
+ raise gr.Error("Models not loaded. Check the console for errors.")
152
+
153
+ output_path = f"/tmp/{uuid.uuid4()}.png"
154
+ input_img_paths = [image_file_1, image_file_2] # List of file paths from the two gr.Image inputs
155
+
156
+ perform_edit(input_img_paths, instruction, output_path)
157
+ return output_path
158
+
159
+
160
+ css = """
161
+ .text-center { text-align: center; }
162
+ .result-img img {
163
+ max-height: 60vh !important;
164
+ min-height: 30vh !important;
165
+ width: auto !important;
166
+ object-fit: contain;
167
+ }
168
+ .input-img img {
169
+ max-height: 30vh !important;
170
+ width: auto !important;
171
+ object-fit: contain;
172
+ }
173
+ """
174
+
175
+
176
+ with gr.Blocks(theme=gr.themes.Soft(), title="DreamOmni2", css=css) as demo:
177
+ gr.HTML(
178
+ """
179
+ <h1 style="text-align:center; font-size:48px; font-weight:bold; margin-bottom:20px;">
180
+ DreamOmni2: Omni-purpose Image Generation and Editing
181
+ </h1>
182
+ """
183
+ )
184
+ gr.Markdown(
185
+ "Select a mode, upload two images, provide an instruction, and click 'Run'.",
186
+ elem_classes="text-center"
187
+ )
188
+ with gr.Row():
189
+ with gr.Column(scale=2):
190
+ gr.Markdown("⬆️ Upload images. Click or drag to upload.")
191
+
192
+ with gr.Row():
193
+ image_uploader_1 = gr.Image(
194
+ label="Img 1",
195
+ type="filepath",
196
+ interactive=True,
197
+ elem_classes="input-img",
198
+ )
199
+ image_uploader_2 = gr.Image(
200
+ label="Img 2",
201
+ type="filepath",
202
+ interactive=True,
203
+ elem_classes="input-img",
204
+ )
205
+
206
+ instruction_text = gr.Textbox(
207
+ label="Instruction",
208
+ lines=2,
209
+ placeholder="Input your instruction for generation or editing here...",
210
+ )
211
+ run_button = gr.Button("Run", variant="primary")
212
+
213
+ with gr.Column(scale=2):
214
+ gr.Markdown(
215
+ "✏️ **Editing Mode**: Modify an existing image using instructions and references.\n\n"
216
+ "Tip: If the result is not what you expect, try clicking **Run** again. "
217
+ )
218
+ output_image = gr.Image(
219
+ label="Result",
220
+ type="filepath",
221
+ elem_classes="result-img",
222
+ )
223
+
224
+ # --- Examples (unchanged) ---
225
+ gr.Markdown("## Examples")
226
+
227
+ gr.Examples(
228
+ label="Editing Examples",
229
+ examples=[
230
+ ["edit_tests/4/ref_0.jpg", "edit_tests/4/ref_1.jpg", "Replace the first image have the same image style as the second image.","edit_tests/4/res.jpg"],
231
+ ["edit_tests/5/ref_0.jpg", "edit_tests/5/ref_1.jpg", "Make the person in the first image have the same hairstyle as the person in the second image.","edit_tests/5/res.jpg"],
232
+ ["edit_tests/src.jpg", "edit_tests/ref.jpg", "Make the woman from the second image stand on the road in the first image.","edit_tests/edi_res.png"],
233
+ ["edit_tests/1/ref_0.jpg", "edit_tests/1/ref_1.jpg", "Replace the lantern in the first image with the dog in the second image.","edit_tests/1/res.jpg"],
234
+ ["edit_tests/2/ref_0.jpg", "edit_tests/2/ref_1.jpg", "Replace the suit in the first image with the clothes in the second image.","edit_tests/2/res.jpg"],
235
+ ["edit_tests/3/ref_0.jpg", "edit_tests/3/ref_1.jpg", "Make the first image has the same light condition as the second image.","edit_tests/3/res.jpg"],
236
+ ["edit_tests/6/ref_0.jpg", "edit_tests/6/ref_1.jpg", "Make the words in the first image have the same font as the words in the second image.","edit_tests/6/res.jpg"],
237
+ ["edit_tests/7/ref_0.jpg", "edit_tests/7/ref_1.jpg", "Make the car in the first image have the same pattern as the mouse in the second image.","edit_tests/7/res.jpg"],
238
+ ["edit_tests/8/ref_0.jpg", "edit_tests/8/ref_1.jpg", "Make the dress in the first image have the same pattern in the second image.","edit_tests/8/res.jpg"],
239
+ ],
240
+ inputs=[image_uploader_1, image_uploader_2, instruction_text, output_image],
241
+ cache_examples=False,
242
+ )
243
+
244
+ run_button.click(
245
+ fn=process_request,
246
+ inputs=[image_uploader_1, image_uploader_2, instruction_text],
247
+ outputs=output_image
248
+ )
249
+
250
+ if __name__ == "__main__":
251
+ print("Launching Gradio Demo...")
252
+ demo.launch(server_name=server_name, server_port=server_port)
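Note: the resolution bucketing in `find_closest_resolution` above is pure aspect-ratio matching and can be sanity-checked without loading any model; a standalone sketch (the input sizes are arbitrary examples):

# Copy of the demo's bucketing logic for a quick offline check.
PREFERRED_KONTEXT_RESOLUTIONS = [
    (672, 1568), (688, 1504), (720, 1456), (752, 1392), (800, 1328),
    (832, 1248), (880, 1184), (944, 1104), (1024, 1024), (1104, 944),
    (1184, 880), (1248, 832), (1328, 800), (1392, 752), (1456, 720),
    (1504, 688), (1568, 672),
]

def find_closest_resolution(width, height, preferred_resolutions):
    # Pick the preferred bucket whose aspect ratio is closest to the input's.
    input_ratio = width / height
    return min(preferred_resolutions, key=lambda res: abs((res[0] / res[1]) - input_ratio))

print(find_closest_resolution(1920, 1080, PREFERRED_KONTEXT_RESOLUTIONS))  # -> (1392, 752)
print(find_closest_resolution(1080, 1920, PREFERRED_KONTEXT_RESOLUTIONS))  # -> (752, 1392)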
web_generate.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from pipeline_flux_kontext import FluxKontextPipeline
3
+ from diffusers.utils import load_image
4
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
5
+ from qwen_vl_utils import process_vision_info
6
+ import os
7
+ import re
8
+ from PIL import Image
9
+ import gradio as gr
10
+ import uuid
11
+ import argparse
12
+
13
+ def parse_args():
14
+ """Parses command-line arguments for model paths and server configuration."""
15
+ parser = argparse.ArgumentParser(description="Launch DreamOmni2 Generation Gradio Demo.")
16
+ parser.add_argument(
17
+ "--vlm_path",
18
+ type=str,
19
+ default="vlm-model",
20
+ help="Path to the Qwen2_5_VL VLM model directory."
21
+ )
22
+ parser.add_argument(
23
+ "--gen_lora_path",
24
+ type=str,
25
+ default="gen_lora",
26
+ help="Path to the FLUX.1-Kontext generation LoRA weights directory."
27
+ )
28
+ parser.add_argument(
29
+ "--server_name",
30
+ type=str,
31
+ default="0.0.0.0",
32
+ help="The server name (IP address) to host the Gradio demo."
33
+ )
34
+ parser.add_argument(
35
+ "--server_port",
36
+ type=int,
37
+ default=7860,
38
+ help="The port number to host the Gradio demo."
39
+ )
40
+ args = parser.parse_args()
41
+ return args
42
+
43
+ ARGS = parse_args()
44
+ vlm_path = ARGS.vlm_path
45
+ gen_lora_path = ARGS.gen_lora_path
46
+ server_name = ARGS.server_name
47
+ server_port = ARGS.server_port
48
+ device = "cuda"
49
+
50
+ def extract_gen_content(text):
51
+ text = text[6:-7]
52
+ return text
53
+
54
+ print(f"Loading models from vlm_path: {vlm_path}, gen_lora_path: {gen_lora_path}")
55
+
56
+ pipe = FluxKontextPipeline.from_pretrained(
57
+ "black-forest-labs/FLUX.1-Kontext-dev",
58
+ torch_dtype=torch.bfloat16
59
+ )
60
+ pipe.to(device)
61
+ pipe.load_lora_weights(gen_lora_path, adapter_name="generation")
62
+ pipe.set_adapters(["generation"], adapter_weights=[1])
63
+
64
+ vlm_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
65
+ vlm_path,
66
+ torch_dtype="bfloat16",
67
+ device_map="cuda"
68
+ )
69
+ processor = AutoProcessor.from_pretrained(vlm_path)
70
+
71
+
72
+ def infer_vlm(input_img_path, input_instruction, prefix):
73
+ if not vlm_model or not processor:
74
+ raise gr.Error("VLM Model not loaded. Cannot process prompt.")
75
+ tp = []
76
+ for path in input_img_path:
77
+ tp.append({"type": "image", "image": path})
78
+ tp.append({"type": "text", "text": input_instruction + prefix})
79
+ messages = [{"role": "user", "content": tp}]
80
+
81
+ text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
82
+ image_inputs, video_inputs = process_vision_info(messages)
83
+ inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
84
+ inputs = inputs.to("cuda")
85
+
86
+ generated_ids = vlm_model.generate(**inputs, do_sample=False, max_new_tokens=4096)
87
+ generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
88
+ output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
89
+ return output_text[0]
90
+
91
+
92
+ PREFERRED_KONTEXT_RESOLUTIONS = [
93
+ (672, 1568),
94
+ (688, 1504),
95
+ (720, 1456),
96
+ (752, 1392),
97
+ (800, 1328),
98
+ (832, 1248),
99
+ (880, 1184),
100
+ (944, 1104),
101
+ (1024, 1024),
102
+ (1104, 944),
103
+ (1184, 880),
104
+ (1248, 832),
105
+ (1328, 800),
106
+ (1392, 752),
107
+ (1456, 720),
108
+ (1504, 688),
109
+ (1568, 672),
110
+ ]
111
+ def find_closest_resolution(width, height, preferred_resolutions):
112
+ input_ratio = width / height
113
+ closest_resolution = min(
114
+ preferred_resolutions,
115
+ key=lambda res: abs((res[0] / res[1]) - input_ratio)
116
+ )
117
+ return closest_resolution
118
+
119
+ def perform_generation(input_img_paths, input_instruction, output_path, height=1024, width=1024):
120
+ prefix = " It is generation task."
121
+ source_imgs = [load_image(path) for path in input_img_paths]
122
+ resized_imgs = []
123
+ for img in source_imgs:
124
+ target_resolution = find_closest_resolution(img.width, img.height, PREFERRED_KONTEXT_RESOLUTIONS)
125
+ resized_img = img.resize(target_resolution, Image.LANCZOS)
126
+ resized_imgs.append(resized_img)
127
+ prompt = infer_vlm(input_img_paths, input_instruction, prefix)
128
+ prompt = extract_gen_content(prompt)
129
+ print(f"Prompt generated by VLM: {prompt}")
130
+
131
+ image = pipe(
132
+ images=resized_imgs,
133
+ height=height,
134
+ width=width,
135
+ prompt=prompt,
136
+ num_inference_steps=30,
137
+ guidance_scale=3.5,
138
+ ).images[0]
139
+
140
+ image.save(output_path)
141
+ print(f"Generation result saved to {output_path}")
142
+
143
+ # --- Gradio Interface Logic ---
144
+
145
+ def process_request(image_file_1, image_file_2, instruction):
146
+ # debugpy.listen(5678)
147
+ # print("Waiting for debugger attach...")
148
+ # debugpy.wait_for_client()
149
+ if not image_file_1 or not image_file_2:
150
+ raise gr.Error("Please upload both images.")
151
+ if not instruction:
152
+ raise gr.Error("Please provide an instruction.")
153
+ if not pipe or not vlm_model:
154
+ raise gr.Error("Models not loaded. Check the console for errors.")
155
+
156
+ output_path = f"/tmp/{uuid.uuid4()}.png"
157
+ input_img_paths = [image_file_1, image_file_2] # List of file paths from the two gr.Image inputs
158
+
159
+ perform_generation(input_img_paths, instruction, output_path)
160
+
161
+ return output_path
162
+
163
+
164
+ css = """
165
+ .text-center { text-align: center; }
166
+ .result-img img {
167
+ max-height: 60vh !important;
168
+ min-height: 30vh !important;
169
+ width: auto !important;
170
+ object-fit: contain;
171
+ }
172
+ .input-img img {
173
+ max-height: 30vh !important;
174
+ width: auto !important;
175
+ object-fit: contain;
176
+ }
177
+ """
178
+
179
+
180
+ with gr.Blocks(theme=gr.themes.Soft(), title="DreamOmni2", css=css) as demo:
181
+ gr.HTML(
182
+ """
183
+ <h1 style="text-align:center; font-size:48px; font-weight:bold; margin-bottom:20px;">
184
+ DreamOmni2: Omni-purpose Image Generation and Editing
185
+ </h1>
186
+ """
187
+ )
188
+ gr.Markdown(
189
+ "Select a mode, upload two images, provide an instruction, and click 'Run'.",
190
+ elem_classes="text-center"
191
+ )
192
+ with gr.Row():
193
+ with gr.Column(scale=2):
194
+ gr.Markdown("⬆️ Upload images. Click or drag to upload.")
195
+
196
+ with gr.Row():
197
+ image_uploader_1 = gr.Image(
198
+ label="Img 1",
199
+ type="filepath",
200
+ interactive=True,
201
+ elem_classes="input-img",
202
+ )
203
+ image_uploader_2 = gr.Image(
204
+ label="Img 2",
205
+ type="filepath",
206
+ interactive=True,
207
+ elem_classes="input-img",
208
+ )
209
+
210
+ instruction_text = gr.Textbox(
211
+ label="Instruction",
212
+ lines=2,
213
+ placeholder="Input your instruction for generation or editing here...",
214
+ )
215
+ run_button = gr.Button("Run", variant="primary")
216
+
217
+ with gr.Column(scale=2):
218
+ gr.Markdown("🖼️ **Generation Mode**: Create new scenes from reference images.\n\n"
219
+ "Tip: If the result is not what you expect, try clicking **Run** again. ")
220
+ output_image = gr.Image(
221
+ label="Result",
222
+ type="filepath",
223
+ elem_classes="result-img",
224
+ )
225
+
226
+ # --- Examples ---
227
+ gr.Markdown("## Examples")
228
+ gr.Examples(
229
+ label="Generation Examples",
230
+ examples=[
231
+ [
232
+ "gen_tests/img1.jpg",
233
+ "gen_tests/img2.jpg",
234
+ "In the scene, the character from the first image stands on the left, and the character from the second image stands on the right. They are shaking hands against the backdrop of a spaceship interior.",
235
+ "gen_tests/gen_res.png"
236
+ ]
237
+ ],
238
+ inputs=[image_uploader_1, image_uploader_2, instruction_text, output_image],
239
+ cache_examples=False,
240
+ )
241
+
242
+ run_button.click(
243
+ fn=process_request,
244
+ inputs=[image_uploader_1, image_uploader_2, instruction_text],
245
+ outputs=output_image
246
+ )
247
+
248
+ if __name__ == "__main__":
249
+
250
+ print("Launching Gradio Demo...")
251
+ demo.launch(server_name=server_name, server_port=server_port)
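Note: the generation path can also be exercised without the Gradio UI. Once the module-level setup in `web_generate.py` has loaded `pipe`, `vlm_model` and `processor` (for example by commenting out `demo.launch(...)` and running the script interactively), `perform_generation` can be called directly; a minimal sketch using the example assets listed above, with `out.png` as an arbitrary output name:

# Assumes the model-loading block at the top of web_generate.py has already executed.
perform_generation(
    input_img_paths=["gen_tests/img1.jpg", "gen_tests/img2.jpg"],
    input_instruction=(
        "In the scene, the character from the first image stands on the left, and the character "
        "from the second image stands on the right. They are shaking hands against the backdrop "
        "of a spaceship interior."
    ),
    output_path="out.png",
)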