# Spaces: wcy1122/DreamOmni2-Edit
# Running on Zero
# File size: 9,962 Bytes

import os
import re
from PIL import Image
import spaces
import gradio as gr
import uuid
import argparse
from huggingface_hub import login, snapshot_download

import torch
from dreamomni2.pipeline_dreamomni2 import DreamOmni2Pipeline
from diffusers.utils import load_image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from utils.vprocess import process_vision_info, resizeinput


def extract_gen_content(text):
    """Return *text* without its first 6 and last 7 characters.

    The slice is purely positional: nothing checks what those characters are.
    NOTE(review): presumably this strips a fixed wrapper that the VLM puts
    around its answer -- confirm the exact wrapper against the model output.
    Inputs shorter than 13 characters yield an empty string.
    """
    leading, trailing = 6, 7
    return text[leading:-trailing]

def _load_model_processor():
    """Download the DreamOmni2 weights and build every model the demo needs.

    Only the ``vlm-model`` and ``edit_lora`` folders of the ``xiabs/DreamOmni2``
    repository are fetched. The editing pipeline is created from
    ``black-forest-labs/FLUX.1-Kontext-dev`` in bfloat16 and gets the edit LoRA
    attached as adapter ``"edit"``; the Qwen2.5-VL model and its processor are
    loaded from the downloaded ``vlm-model`` folder.

    Returns:
        A ``(vlm_model, processor, pipe)`` tuple.
    """
    # Use the GPU when one is visible, otherwise fall back to CPU.
    target_device = "cuda" if torch.cuda.is_available() else "cpu"

    snapshot_root = snapshot_download(
        repo_id="xiabs/DreamOmni2",
        revision="main",
        allow_patterns=["vlm-model/**", "edit_lora/**"],
    )
    vlm_dir = os.path.join(snapshot_root, 'vlm-model')
    lora_dir = os.path.join(snapshot_root, 'edit_lora')
    print(f"Loading models from vlm_path: {vlm_dir}, edit_lora_path: {lora_dir}")

    # Editing pipeline: base model plus the "edit" LoRA as the only active adapter.
    pipe = DreamOmni2Pipeline.from_pretrained(
        "black-forest-labs/FLUX.1-Kontext-dev",
        torch_dtype=torch.bfloat16,
    )
    pipe = pipe.to(target_device)
    pipe.load_lora_weights(lora_dir, adapter_name="edit")
    pipe.set_adapters(["edit"], adapter_weights=[1])

    # Vision-language model that turns (images + instruction) into the edit prompt.
    vlm_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        vlm_dir,
        torch_dtype="bfloat16",
    )
    vlm_model = vlm_model.to(target_device)
    processor = AutoProcessor.from_pretrained(vlm_dir)

    return vlm_model, processor, pipe


def _launch_demo(vlm_model, processor, pipe):
    """Define the request handlers and the Gradio UI, then launch the demo.

    The nested handlers close over the three arguments instead of reading
    module-level globals.

    Args:
        vlm_model: Vision-language model used to produce the edit prompt.
        processor: Processor paired with ``vlm_model`` (chat template,
            tensor preparation, decoding).
        pipe: Image editing pipeline that renders the final result.
    """

    @spaces.GPU(duration=90)
    def infer_vlm(input_img_path, input_instruction, prefix):
        """Ask the VLM for a prompt given the input images and the instruction.

        Args:
            input_img_path: Iterable of image paths; each becomes one image
                entry of the single user message, in the given order.
            input_instruction: Instruction text typed by the user.
            prefix: Text appended directly after the instruction.

        Returns:
            The decoded text of the first (only) generated sequence, with the
            prompt tokens removed and special tokens skipped.

        Raises:
            gr.Error: If the VLM or its processor is missing.
        """
        if not (vlm_model and processor):
            raise gr.Error("VLM Model not loaded. Cannot process prompt.")

        # One user turn: all images first, then the instruction text.
        content = [{"type": "image", "image": img} for img in input_img_path]
        content.append({"type": "text", "text": input_instruction + prefix})
        messages = [{"role": "user", "content": content}]

        chat_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        model_inputs = processor(
            text=[chat_text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        model_inputs = model_inputs.to(device=vlm_model.device)

        # Greedy decoding (do_sample=False).
        sequences = vlm_model.generate(**model_inputs, do_sample=False, max_new_tokens=4096)

        # generate() returns prompt + completion; keep only the completion part.
        completions = [
            full_ids[len(prompt_ids):]
            for prompt_ids, full_ids in zip(model_inputs.input_ids, sequences)
        ]
        decoded = processor.batch_decode(
            completions,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        return decoded[0]

    # Table of resolution pairs, ordered from the narrowest to the widest
    # aspect ratio (first / second element), with the square 1024x1024 in the
    # middle. find_closest_resolution() below compares first/second against
    # width/height, i.e. it treats each pair as (width, height).
    # NOTE(review): nothing in the visible code reads this table -- confirm
    # whether it is still needed.
    PREFERRED_KONTEXT_RESOLUTIONS = [
        (672, 1568),
        (688, 1504),
        (720, 1456),
        (752, 1392),
        (800, 1328),
        (832, 1248),
        (880, 1184),
        (944, 1104),
        (1024, 1024),
        (1104, 944),
        (1184, 880),
        (1248, 832),
        (1328, 800),
        (1392, 752),
        (1456, 720),
        (1504, 688),
        (1568, 672),
    ]
    def find_closest_resolution(width, height, preferred_resolutions):
        """Pick the candidate whose aspect ratio is nearest to ``width / height``.

        Args:
            width: Input width.
            height: Input height; must be non-zero.
            preferred_resolutions: Non-empty iterable of indexable pairs; the
                ratio of a pair is ``pair[0] / pair[1]``.

        Returns:
            The first candidate with the smallest absolute ratio difference.
        """
        target_ratio = width / height

        def ratio_gap(candidate):
            # Distance between this candidate's aspect ratio and the input's.
            return abs((candidate[0] / candidate[1]) - target_ratio)

        return min(preferred_resolutions, key=ratio_gap)

    @spaces.GPU(duration=90)
    def perform_edit(input_img_paths, input_instruction, output_path):
        """Run the full edit: VLM prompt generation, then image generation.

        Args:
            input_img_paths: Paths of the input images. The first image's size
                (after ``resizeinput``) sets the output height and width.
            input_instruction: Instruction text typed by the user.
            output_path: Where the resulting image is saved.

        Returns:
            None. The result is written to ``output_path``.
        """
        prefix = " It is editing task."

        # Each input is loaded and passed through the project's resizeinput
        # helper (its exact resizing rule is not visible in this file).
        source_imgs = [resizeinput(load_image(img_path)) for img_path in input_img_paths]

        raw_answer = infer_vlm(input_img_paths, input_instruction, prefix)
        prompt = extract_gen_content(raw_answer)
        print(f"Generated Prompt for VLM: {prompt}")

        first_img = source_imgs[0]
        result = pipe(
            images=source_imgs,
            height=first_img.height,
            width=first_img.width,
            prompt=prompt,
            num_inference_steps=30,
            guidance_scale=3.5,
        )
        edited = result.images[0]
        edited.save(output_path)
        print(f"Edit result saved to {output_path}")

    @spaces.GPU(duration=90)
    def process_request(image_file_1, image_file_2, instruction):
        """Validate one UI request, run the edit and return the result path.

        Args:
            image_file_1: File path from the first ``gr.Image`` input (per the
                UI hint, the image to be edited).
            image_file_2: File path from the second ``gr.Image`` input.
            instruction: Instruction text from the textbox.

        Returns:
            Path of the PNG file written by ``perform_edit``.

        Raises:
            gr.Error: If an image or the instruction is missing, or if the
                models are not available.
        """
        # Local import: only this handler needs it, and it keeps the block
        # self-contained without touching the file-level imports.
        import tempfile

        if not image_file_1 or not image_file_2:
            raise gr.Error("Please upload both images.")
        # Treat a whitespace-only instruction like an empty one.
        if not instruction or not instruction.strip():
            raise gr.Error("Please provide an instruction.")
        if not pipe or not vlm_model:
            raise gr.Error("Models not loaded. Check the console for errors.")

        # Use the platform temp directory instead of a hard-coded "/tmp" so the
        # handler honours TMPDIR and also works where "/tmp" does not exist.
        output_path = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4()}.png")
        input_img_paths = [image_file_1, image_file_2]  # file paths from the two gr.Image inputs

        perform_edit(input_img_paths, instruction, output_path)
        return output_path

    # Custom CSS handed to gr.Blocks below. The class names are attached to
    # components through elem_classes: "text-center" on the intro Markdown,
    # "input-img" on the two upload widgets and "result-img" on the output.
    css = """
    .text-center { text-align: center; }
    .result-img img {
        max-height: 60vh !important; 
        min-height: 30vh !important;
        width: auto !important;      
        object-fit: contain;         
    }
    .input-img img {
        max-height: 30vh !important; 
        width: auto !important;      
        object-fit: contain;         
    }
    """


    # Build the page: title and hint, a two-column row (inputs on the left,
    # result on the right), the examples table, and the Run button wiring.
    with gr.Blocks(theme=gr.themes.Soft(), title="DreamOmni2", css=css) as demo:
        gr.HTML(
            """
            <h1 style="text-align:center; font-size:40px; font-weight:bold; margin-bottom:16px;">
                DreamOmni2: Multimodal Image Generation and Editing
            </h1>
            """
        )
        gr.Markdown(
            "Upload two images, provide an instruction, and click 'Run'.\n\n"
            "**Hint**: For editing tasks, due to the format settings of the training data, we need to place the image to be edited in the first position.",
            elem_classes="text-center"
        )
        with gr.Row():
            # Left column: the two image inputs, the instruction box and the Run button.
            with gr.Column(scale=2):
                gr.Markdown("⬆️ Upload images. Click or drag to upload. Edit image on the left.")
                
                with gr.Row():
                    # type="filepath": the handler receives paths, not image arrays.
                    image_uploader_1 = gr.Image(
                        label="Img 1",
                        type="filepath",
                        interactive=True,
                        elem_classes="input-img",
                    )
                    image_uploader_2 = gr.Image(
                        label="Img 2",
                        type="filepath",
                        interactive=True,
                        elem_classes="input-img",
                    )
                
                instruction_text = gr.Textbox(
                    label="Instruction",
                    lines=2,
                    placeholder="Input your instruction for generation or editing here...",
                )
                run_button = gr.Button("Run", variant="primary")

            # Right column: usage tip and the result image.
            with gr.Column(scale=2):
                gr.Markdown(
                    "✏️ **Editing Mode**: Modify an existing image using instructions and references.\n\n"
                    "Tip: If the result is not what you expect, try clicking **Run** again. "
                )
                output_image = gr.Image(
                    label="Result",
                    type="filepath",
                    elem_classes="result-img",
                )

        # --- Examples (unchanged) ---
        gr.Markdown("## Examples")

        # Each row has four entries matching the four components in `inputs`:
        # image 1, image 2, instruction, and a stored result image. The output
        # component is listed among the inputs so that the fourth entry can be
        # placed into it; cache_examples=False means no example is run ahead of time.
        gr.Examples(
            label="Editing Examples",
            examples=[
                ["example_input/edit_tests/4/ref_0.jpg", "example_input/edit_tests/4/ref_1.jpg", "Replace the first image have the same image style as the second image.","example_input/edit_tests/4/res.jpg"],
                ["example_input/edit_tests/5/ref_0.jpg", "example_input/edit_tests/5/ref_1.jpg", "Make the person in the first image have the same hairstyle as the person in the second image.","example_input/edit_tests/5/res.jpg"],
                ["example_input/edit_tests/src.jpg", "example_input/edit_tests/ref.jpg", "Make the woman from the second image stand on the road in the first image.","example_input/edit_tests/edi_res.png"],
                ["example_input/edit_tests/1/ref_0.jpg", "example_input/edit_tests/1/ref_1.jpg", "Replace the lantern in the first image with the dog in the second image.","example_input/edit_tests/1/res.jpg"],
                ["example_input/edit_tests/2/ref_0.jpg", "example_input/edit_tests/2/ref_1.jpg", "Replace the suit in the first image with the clothes in the second image.","example_input/edit_tests/2/res.jpg"],
                ["example_input/edit_tests/3/ref_0.jpg", "example_input/edit_tests/3/ref_1.jpg", "Make the first image has the same light condition as the second image.","example_input/edit_tests/3/res.jpg"],
                ["example_input/edit_tests/6/ref_0.jpg", "example_input/edit_tests/6/ref_1.jpg", "Make the words in the first image have the same font as the words in the second image.","example_input/edit_tests/6/res.jpg"],
                ["example_input/edit_tests/7/ref_0.jpg", "example_input/edit_tests/7/ref_1.jpg", "Make the car in the first image have the same pattern as the mouse in the second image.","example_input/edit_tests/7/res.jpg"],
                ["example_input/edit_tests/8/ref_0.jpg", "example_input/edit_tests/8/ref_1.jpg", "Make the dress in the first image have the same pattern in the second image.","example_input/edit_tests/8/res.jpg"],
            ],
            inputs=[image_uploader_1, image_uploader_2, instruction_text, output_image],
            cache_examples=False,
        )

        # Run button: send both image paths and the instruction to
        # process_request and show the returned file path in the result image.
        run_button.click(
            fn=process_request,
            inputs=[image_uploader_1, image_uploader_2, instruction_text],
            outputs=output_image
        )
    
    # Start the Gradio server with default settings.
    demo.launch()


if __name__ == "__main__":
    # Load every model once at start-up, then hand them to the UI closure.
    vlm_model, processor, pipe = _load_model_processor()
    _launch_demo(vlm_model=vlm_model, processor=processor, pipe=pipe)