# server/localhosted models implementation (extended applications demo) import torch import lpips import gradio as gr import numpy as np from PIL import Image from dequantor import ( StableDiffusion3Pipeline, GGUFQuantizationConfig, SD3Transformer2DModel, QwenImageEditPlusPipeline, AutoencoderKLQwenImage, ) from transformers import ( T5EncoderModel, Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM, ) from nunchaku import ( NunchakuQwenImageTransformer2DModel, ) from gguf_connector.vrm import get_gpu_vram def launch_app(model_path1,model_path,dtype): # image recognition model MODEL_ID = "callgg/fastvlm-0.5b-bf16" IMAGE_TOKEN_INDEX = -200 tok = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained( MODEL_ID, dtype=torch.float16 if torch.cuda.is_available() else torch.float32, device_map="auto", trust_remote_code=True, ) def describe_image(img: Image.Image, prompt, num_tokens) -> str: if img is None: return "Please upload an image." messages = [{"role": "user", "content": f"\n{prompt}."}] rendered = tok.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) pre, post = rendered.split("", 1) pre_ids = tok(pre, return_tensors="pt", add_special_tokens=False).input_ids post_ids = tok(post, return_tensors="pt", add_special_tokens=False).input_ids img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype) input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1).to(model.device) attention_mask = torch.ones_like(input_ids, device=model.device) px = model.get_vision_tower().image_processor(images=img, return_tensors="pt")["pixel_values"] px = px.to(model.device, dtype=model.dtype) with torch.no_grad(): out = model.generate( inputs=input_ids, attention_mask=attention_mask, images=px, max_new_tokens=num_tokens ) return tok.decode(out[0], skip_special_tokens=True) sample1_prompts = ['describe this image in detail', 'describe what you see in few words', 'tell me the difference'] sample1_prompts = [[x] for x in sample1_prompts] # image generation model transformer1 = SD3Transformer2DModel.from_single_file( model_path1, quantization_config=GGUFQuantizationConfig(compute_dtype=dtype), torch_dtype=dtype, config="callgg/sd3-decoder", subfolder="transformer_2" ) text_encoder1 = T5EncoderModel.from_pretrained( "chatpig/t5-v1_1-xxl-encoder-fp32-gguf", gguf_file="t5xxl-encoder-fp32-q2_k.gguf", dtype=dtype ) pipeline = StableDiffusion3Pipeline.from_pretrained( "callgg/sd3-decoder", transformer=transformer1, text_encoder_3=text_encoder1, torch_dtype=dtype ) pipeline.enable_model_cpu_offload() # Inference function def generate_image2(prompt, num_steps, guidance): result = pipeline( prompt, height=1024, width=1024, num_inference_steps=num_steps, guidance_scale=guidance, ).images[0] return result sample_prompts2 = ['a cat in a hat', 'a pig in a hat', 'a raccoon in a hat', 'a dog walking with joy'] sample_prompts2 = [[x] for x in sample_prompts2] # image transformation model transformer = NunchakuQwenImageTransformer2DModel.from_pretrained( model_path ) text_encoder = Qwen2_5_VLForConditionalGeneration.from_pretrained( "callgg/qi-decoder", subfolder="text_encoder", dtype=dtype ) vae = AutoencoderKLQwenImage.from_pretrained( "callgg/qi-decoder", subfolder="vae", torch_dtype=dtype ) pipe = QwenImageEditPlusPipeline.from_pretrained( "callgg/image-edit-plus", transformer=transformer, text_encoder=text_encoder, vae=vae, torch_dtype=dtype ) if get_gpu_vram() > 18: pipe.enable_model_cpu_offload() else: transformer.set_offload( True, use_pin_memory=False, num_blocks_on_gpu=1 ) pipe._exclude_from_cpu_offload.append("transformer") pipe.enable_sequential_cpu_offload() def generate_image(prompt, img1, img2, img3, steps, guidance): images = [] for img in [img1, img2, img3]: if img is not None: if not isinstance(img, Image.Image): img = Image.open(img) images.append(img.convert("RGB")) if not images: return None inputs = { "image": images, "prompt": prompt, "true_cfg_scale": guidance, "negative_prompt": " ", "num_inference_steps": steps, "num_images_per_prompt": 1, } with torch.inference_mode(): output = pipe(**inputs) return output.images[0] sample_prompts = ['merge it', 'color it', 'use image 1 as background of image 2'] sample_prompts = [[x] for x in sample_prompts] # image discrimination model def compare_images(img1,img2): device = 'cuda' if torch.cuda.is_available() else 'cpu' lpips_model = lpips.LPIPS(net='squeeze').to(device) if img1 is None or img2 is None: return "Please upload both images." img1_np = np.array(img1).astype(np.float32) / 255.0 img2_np = np.array(img2).astype(np.float32) / 255.0 # convert to tensor in LPIPS format img1_tensor = lpips.im2tensor(img1_np).to(device) img2_tensor = lpips.im2tensor(img2_np).to(device) # compute LPIPS distance with torch.no_grad(): distance = lpips_model(img1_tensor, img2_tensor) score = distance.item() similarity = max(0.0, 1.0 - score*100) # normalize to positive similarity result_text = ( f"LPIPS Distance: {score:.4f}\n" f"Estimated Similarity: {similarity*100:.4f}%" ) return result_text # UI block = gr.Blocks(title="image studio").queue() with block: gr.Markdown("## Discriminator") with gr.Row(): img1 = gr.Image(type="pil", label="Image 1") img2 = gr.Image(type="pil", label="Image 2") compare_btn = gr.Button("Discriminate") output_box = gr.Textbox(label="Statistics", lines=2) compare_btn.click(compare_images, inputs=[img1,img2], outputs=output_box) gr.Markdown("## Descriptor") with gr.Row(): with gr.Column(): img_input = gr.Image(type="pil", label="Input Image") prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt here (or click Sample Prompt)", value="") quick_prompts = gr.Dataset(samples=sample1_prompts, label='Sample Prompt', samples_per_page=1000, components=[prompt]) quick_prompts.click(lambda x: x[0], inputs=[quick_prompts], outputs=prompt, show_progress=False, queue=False) btn = gr.Button("Describe") num_tokens = gr.Slider(minimum=64, maximum=1024, value=128, step=1, label="Output Token") with gr.Column(): output = gr.Textbox(label="Description", lines=5) btn.click(fn=describe_image, inputs=[img_input,prompt,num_tokens], outputs=output) gr.Markdown("## Generator") with gr.Row(): with gr.Column(): prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt here (or click Sample Prompt)", value="") quick_prompts = gr.Dataset(samples=sample_prompts2, label='Sample Prompt', samples_per_page=1000, components=[prompt]) quick_prompts.click(lambda x: x[0], inputs=[quick_prompts], outputs=prompt, show_progress=False, queue=False) submit_btn = gr.Button("Generate") num_steps = gr.Slider(minimum=4, maximum=100, value=8, step=1, label="Step") guidance = gr.Slider(minimum=1.0, maximum=10.0, value=2.5, step=0.1, label="Scale") with gr.Column(): output_image = gr.Image(type="pil", label="Output Image") submit_btn.click(fn=generate_image2, inputs=[prompt, num_steps, guidance], outputs=output_image) gr.Markdown("## Transformer") with gr.Row(): with gr.Column(): with gr.Row(): img1 = gr.Image(label="Image 1", type="pil") img2 = gr.Image(label="Image 2", type="pil") img3 = gr.Image(label="Image 3", type="pil") prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt here (or click Sample Prompt)", value="") quick_prompts = gr.Dataset(samples=sample_prompts, label='Sample Prompt', samples_per_page=1000, components=[prompt]) quick_prompts.click(lambda x: x[0], inputs=[quick_prompts], outputs=prompt, show_progress=False, queue=False) generate_btn = gr.Button("Transform") steps = gr.Slider(1, 50, value=4, step=1, label="Inference Steps", visible=False) guidance = gr.Slider(0.1, 10.0, value=1.0, step=0.1, label="Guidance Scale", visible=False) with gr.Column(): output_image = gr.Image(label="Output", type="pil") generate_btn.click( fn=generate_image, inputs=[prompt, img1, img2, img3, steps, guidance], outputs=output_image, ) block.launch() # detect your device and assign dtype accordingly device = "cuda" if torch.cuda.is_available() else "cpu" dtype = torch.bfloat16 if device == "cuda" else torch.float32 # load the model from cache; or pull it from huggingface repo if you don't have model_path1 = "https://huggingface.co/calcuis/sd3.5-lite-gguf/blob/main/sd3.5-8b-lite-mxfp4_moe.gguf" model_path = "https://huggingface.co/calcuis/sketch/blob/main/sketch-s9-20b-int4.safetensors" # launch the app; call the app function above launch_app(model_path1, model_path, dtype)