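# Gradio demo for DreamOmni2 instruction-based image editing.
# Downloads the DreamOmni2 VLM and edit LoRA from the Hugging Face Hub,
# loads the FLUX.1-Kontext-dev base pipeline, and serves a two-image editing UI.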
import os
import re
from PIL import Image
import spaces
import gradio as gr
import uuid
import argparse
from huggingface_hub import login, snapshot_download
import torch

from dreamomni2.pipeline_dreamomni2 import DreamOmni2Pipeline
from diffusers.utils import load_image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from utils.vprocess import process_vision_info, resizeinput
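

# The VLM wraps its generated prompt in fixed-length start/end markers; slicing off
# the first 6 and last 7 characters strips them (consistent with <edit>...</edit>
# style tags, though the exact marker text is an assumption).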
def extract_gen_content(text):
    text = text[6:-7]
    return text


def _load_model_processor():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Fetch only the VLM and edit-LoRA folders of the DreamOmni2 repo.
    local_dir = snapshot_download(
        repo_id="xiabs/DreamOmni2",
        revision="main",
        allow_patterns=["vlm-model/**", "edit_lora/**"],
    )
    vlm_dir = os.path.join(local_dir, "vlm-model")
    lora_dir = os.path.join(local_dir, "edit_lora")
    print(f"Loading models from vlm_path: {vlm_dir}, edit_lora_path: {lora_dir}")

    # FLUX.1-Kontext-dev base pipeline with the DreamOmni2 edit LoRA applied.
    pipe = DreamOmni2Pipeline.from_pretrained(
        "black-forest-labs/FLUX.1-Kontext-dev",
        torch_dtype=torch.bfloat16,
    ).to(device)
    pipe.load_lora_weights(lora_dir, adapter_name="edit")
    pipe.set_adapters(["edit"], adapter_weights=[1])

    # Qwen2.5-VL-based prompt model shipped with DreamOmni2.
    vlm_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        vlm_dir,
        torch_dtype="bfloat16",
    ).to(device)
    processor = AutoProcessor.from_pretrained(vlm_dir)
    return vlm_model, processor, pipe


def _launch_demo(vlm_model, processor, pipe):
    def infer_vlm(input_img_path, input_instruction, prefix):
        if not vlm_model or not processor:
            raise gr.Error("VLM Model not loaded. Cannot process prompt.")
        # Build a single user turn: all input images followed by the instruction.
        tp = []
        for path in input_img_path:
            tp.append({"type": "image", "image": path})
        tp.append({"type": "text", "text": input_instruction + prefix})
        messages = [{"role": "user", "content": tp}]
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
        inputs = inputs.to(device=vlm_model.device)
        # Greedy decoding; only the newly generated tokens are decoded.
        generated_ids = vlm_model.generate(**inputs, do_sample=False, max_new_tokens=4096)
        generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
        output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        return output_text[0]

    PREFERRED_KONTEXT_RESOLUTIONS = [
        (672, 1568),
        (688, 1504),
        (720, 1456),
        (752, 1392),
        (800, 1328),
        (832, 1248),
        (880, 1184),
        (944, 1104),
        (1024, 1024),
        (1104, 944),
        (1184, 880),
        (1248, 832),
        (1328, 800),
        (1392, 752),
        (1456, 720),
        (1504, 688),
        (1568, 672),
    ]

    def find_closest_resolution(width, height, preferred_resolutions):
        input_ratio = width / height
        closest_resolution = min(
            preferred_resolutions,
            key=lambda res: abs((res[0] / res[1]) - input_ratio)
        )
        return closest_resolution
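
    # NOTE: find_closest_resolution and PREFERRED_KONTEXT_RESOLUTIONS are not wired
    # into perform_edit below, which renders at the (resized) first source image's
    # size. One plausible use (an assumption, not part of the original flow) would
    # be to snap the output to the nearest preferred Kontext resolution, e.g.:
    #     width, height = find_closest_resolution(img.width, img.height,
    #                                             PREFERRED_KONTEXT_RESOLUTIONS)
    #     image = pipe(images=source_imgs, width=width, height=height, ...)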

    def perform_edit(input_img_paths, input_instruction, output_path):
        prefix = " It is editing task."
        # Resize the source images to the model's working range before editing.
        source_imgs = []
        for path in input_img_paths:
            img = load_image(path)
            source_imgs.append(resizeinput(img))
        # Let the VLM rewrite the user instruction into the final editing prompt.
        prompt = infer_vlm(input_img_paths, input_instruction, prefix)
        prompt = extract_gen_content(prompt)
        print(f"Generated prompt from VLM: {prompt}")
        image = pipe(
            images=source_imgs,
            height=source_imgs[0].height,
            width=source_imgs[0].width,
            prompt=prompt,
            num_inference_steps=30,
            guidance_scale=3.5,
        ).images[0]
        image.save(output_path)
        print(f"Edit result saved to {output_path}")
    def process_request(image_file_1, image_file_2, instruction):
        if not image_file_1 or not image_file_2:
            raise gr.Error("Please upload both images.")
        if not instruction:
            raise gr.Error("Please provide an instruction.")
        if not pipe or not vlm_model:
            raise gr.Error("Models not loaded. Check the console for errors.")
        output_path = f"/tmp/{uuid.uuid4()}.png"
        input_img_paths = [image_file_1, image_file_2]  # file paths from the two gr.Image inputs
        perform_edit(input_img_paths, instruction, output_path)
        return output_path

    css = """
    .text-center { text-align: center; }
    .result-img img {
        max-height: 60vh !important;
        min-height: 30vh !important;
        width: auto !important;
        object-fit: contain;
    }
    .input-img img {
        max-height: 30vh !important;
        width: auto !important;
        object-fit: contain;
    }
    """

    with gr.Blocks(theme=gr.themes.Soft(), title="DreamOmni2", css=css) as demo:
        gr.HTML(
            """
            <h1 style="text-align:center; font-size:40px; font-weight:bold; margin-bottom:16px;">
                DreamOmni2: Multimodal Image Generation and Editing
            </h1>
            """
        )
        gr.Markdown(
            "Upload two images, provide an instruction, and click 'Run'.\n\n"
            "**Hint**: For editing tasks, the image to be edited must be placed first (Img 1); "
            "this ordering matches the format of the training data.",
            elem_classes="text-center"
        )
        with gr.Row():
            with gr.Column(scale=2):
                gr.Markdown("⬆️ Upload images. Click or drag to upload. Place the image to edit on the left.")
                with gr.Row():
                    image_uploader_1 = gr.Image(
                        label="Img 1",
                        type="filepath",
                        interactive=True,
                        elem_classes="input-img",
                    )
                    image_uploader_2 = gr.Image(
                        label="Img 2",
                        type="filepath",
                        interactive=True,
                        elem_classes="input-img",
                    )
                instruction_text = gr.Textbox(
                    label="Instruction",
                    lines=2,
                    placeholder="Input your instruction for generation or editing here...",
                )
                run_button = gr.Button("Run", variant="primary")
            with gr.Column(scale=2):
                gr.Markdown(
                    "✏️ **Editing Mode**: Modify an existing image using instructions and references.\n\n"
                    "Tip: If the result is not what you expect, try clicking **Run** again."
                )
                output_image = gr.Image(
                    label="Result",
                    type="filepath",
                    elem_classes="result-img",
                )

        # --- Examples ---
        gr.Markdown("## Examples")
        gr.Examples(
            label="Editing Examples",
            examples=[
                ["example_input/edit_tests/4/ref_0.jpg", "example_input/edit_tests/4/ref_1.jpg", "Make the first image have the same image style as the second image.", "example_input/edit_tests/4/res.jpg"],
                ["example_input/edit_tests/5/ref_0.jpg", "example_input/edit_tests/5/ref_1.jpg", "Make the person in the first image have the same hairstyle as the person in the second image.", "example_input/edit_tests/5/res.jpg"],
                ["example_input/edit_tests/src.jpg", "example_input/edit_tests/ref.jpg", "Make the woman from the second image stand on the road in the first image.", "example_input/edit_tests/edi_res.png"],
                ["example_input/edit_tests/1/ref_0.jpg", "example_input/edit_tests/1/ref_1.jpg", "Replace the lantern in the first image with the dog in the second image.", "example_input/edit_tests/1/res.jpg"],
                ["example_input/edit_tests/2/ref_0.jpg", "example_input/edit_tests/2/ref_1.jpg", "Replace the suit in the first image with the clothes in the second image.", "example_input/edit_tests/2/res.jpg"],
                ["example_input/edit_tests/3/ref_0.jpg", "example_input/edit_tests/3/ref_1.jpg", "Make the first image have the same light condition as the second image.", "example_input/edit_tests/3/res.jpg"],
                ["example_input/edit_tests/6/ref_0.jpg", "example_input/edit_tests/6/ref_1.jpg", "Make the words in the first image have the same font as the words in the second image.", "example_input/edit_tests/6/res.jpg"],
                ["example_input/edit_tests/7/ref_0.jpg", "example_input/edit_tests/7/ref_1.jpg", "Make the car in the first image have the same pattern as the mouse in the second image.", "example_input/edit_tests/7/res.jpg"],
                ["example_input/edit_tests/8/ref_0.jpg", "example_input/edit_tests/8/ref_1.jpg", "Make the dress in the first image have the same pattern as the dress in the second image.", "example_input/edit_tests/8/res.jpg"],
            ],
            inputs=[image_uploader_1, image_uploader_2, instruction_text, output_image],
            cache_examples=False,
        )

        run_button.click(
            fn=process_request,
            inputs=[image_uploader_1, image_uploader_2, instruction_text],
            outputs=output_image,
        )

    demo.launch()


if __name__ == "__main__":
    vlm_model, processor, pipe = _load_model_processor()
    _launch_demo(vlm_model, processor, pipe)
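
# Note: black-forest-labs/FLUX.1-Kontext-dev is a gated checkpoint, so running this
# demo requires Hugging Face credentials with access to it (for example an HF_TOKEN
# environment variable or Space secret); the unused `login` import hints at this,
# but the exact authentication setup is an assumption.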
