kevin1kevin1k committed
Commit 736a5b9 · verified · 1 Parent(s): fb2f0a7

Upload folder using huggingface_hub

Files changed (1)
  1. models/app.py +235 -1
models/app.py CHANGED
@@ -1 +1,235 @@
- # server/localhost models
+ # server/localhost models implementation
+ import torch
+ import lpips
+ import gradio as gr
+ import numpy as np
+ from PIL import Image
+ from dequantor import (
+     StableDiffusion3Pipeline,
+     GGUFQuantizationConfig,
+     SD3Transformer2DModel,
+     QwenImageEditPlusPipeline,
+     AutoencoderKLQwenImage,
+ )
+ from transformers import (
+     T5EncoderModel,
+     Qwen2_5_VLForConditionalGeneration,
+     AutoTokenizer,
+     AutoModelForCausalLM,
+ )
+ from nunchaku import (
+     NunchakuQwenImageTransformer2DModel,
+ )
+ from gguf_connector.vrm import get_gpu_vram
+
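+ # the app wires four local tools into one gradio UI: image recognition (FastVLM),
+ # image generation (SD3.5 GGUF), image transformation/editing (Qwen Image Edit Plus),
+ # and image discrimination (LPIPS perceptual similarity)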
+ def launch_app(model_path1, model_path, dtype):
+     # image recognition model
+     MODEL_ID = "callgg/fastvlm-0.5b-bf16"
+     IMAGE_TOKEN_INDEX = -200
+     tok = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+     model = AutoModelForCausalLM.from_pretrained(
+         MODEL_ID,
+         dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+         device_map="auto",
+         trust_remote_code=True,
+     )
+     def describe_image(img: Image.Image, prompt, num_tokens) -> str:
+         if img is None:
+             return "Please upload an image."
+         messages = [{"role": "user", "content": f"<image>\n{prompt}."}]
+         rendered = tok.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+         pre, post = rendered.split("<image>", 1)
+         pre_ids = tok(pre, return_tensors="pt", add_special_tokens=False).input_ids
+         post_ids = tok(post, return_tensors="pt", add_special_tokens=False).input_ids
+         # splice the image placeholder token (id -200) between the text segments;
+         # the model injects the vision features at that position during generate()
+         img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)
+         input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1).to(model.device)
+         attention_mask = torch.ones_like(input_ids, device=model.device)
+         px = model.get_vision_tower().image_processor(images=img, return_tensors="pt")["pixel_values"]
+         px = px.to(model.device, dtype=model.dtype)
+         with torch.no_grad():
+             out = model.generate(
+                 inputs=input_ids,
+                 attention_mask=attention_mask,
+                 images=px,
+                 max_new_tokens=num_tokens
+             )
+         return tok.decode(out[0], skip_special_tokens=True)
+     sample1_prompts = ['describe this image in detail',
+                        'describe what you see in few words',
+                        'tell me the difference']
+     sample1_prompts = [[x] for x in sample1_prompts]
+     # image generation model
+     transformer1 = SD3Transformer2DModel.from_single_file(
+         model_path1,
+         quantization_config=GGUFQuantizationConfig(compute_dtype=dtype),
+         torch_dtype=dtype,
+         config="callgg/sd3-decoder",
+         subfolder="transformer_2"
+     )
+     text_encoder1 = T5EncoderModel.from_pretrained(
+         "chatpig/t5-v1_1-xxl-encoder-fp32-gguf",
+         gguf_file="t5xxl-encoder-fp32-q2_k.gguf",
+         dtype=dtype
+     )
+     pipeline = StableDiffusion3Pipeline.from_pretrained(
+         "callgg/sd3-decoder",
+         transformer=transformer1,
+         text_encoder_3=text_encoder1,
+         torch_dtype=dtype
+     )
+     pipeline.enable_model_cpu_offload()
+     # Inference function
+     def generate_image2(prompt, num_steps, guidance):
+         result = pipeline(
+             prompt,
+             height=1024,
+             width=1024,
+             num_inference_steps=num_steps,
+             guidance_scale=guidance,
+         ).images[0]
+         return result
+     sample_prompts2 = ['a cat in a hat',
+                        'a pig in a hat',
+                        'a raccoon in a hat',
+                        'a dog walking with joy']
+     sample_prompts2 = [[x] for x in sample_prompts2]
+     # image transformation model
+     transformer = NunchakuQwenImageTransformer2DModel.from_pretrained(
+         model_path
+     )
+     text_encoder = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+         "callgg/qi-decoder",
+         subfolder="text_encoder",
+         dtype=dtype
+     )
+     vae = AutoencoderKLQwenImage.from_pretrained(
+         "callgg/qi-decoder",
+         subfolder="vae",
+         torch_dtype=dtype
+     )
+     pipe = QwenImageEditPlusPipeline.from_pretrained(
+         "callgg/image-edit-plus",
+         transformer=transformer,
+         text_encoder=text_encoder,
+         vae=vae,
+         torch_dtype=dtype
+     )
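+     # offload strategy: with more than 18 GB of VRAM, whole-pipeline model CPU offload
+     # is enough; otherwise keep a single transformer block on the GPU and fall back to
+     # sequential CPU offload for the remaining components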
+     if get_gpu_vram() > 18:
+         pipe.enable_model_cpu_offload()
+     else:
+         transformer.set_offload(
+             True, use_pin_memory=False, num_blocks_on_gpu=1
+         )
+         pipe._exclude_from_cpu_offload.append("transformer")
+         pipe.enable_sequential_cpu_offload()
+     def generate_image(prompt, img1, img2, img3, steps, guidance):
+         images = []
+         for img in [img1, img2, img3]:
+             if img is not None:
+                 if not isinstance(img, Image.Image):
+                     img = Image.open(img)
+                 images.append(img.convert("RGB"))
+         if not images:
+             return None
+         inputs = {
+             "image": images,
+             "prompt": prompt,
+             "true_cfg_scale": guidance,
+             "negative_prompt": " ",
+             "num_inference_steps": steps,
+             "num_images_per_prompt": 1,
+         }
+         with torch.inference_mode():
+             output = pipe(**inputs)
+         return output.images[0]
+     sample_prompts = ['merge it',
+                       'color it',
+                       'use image 1 as background of image 2']
+     sample_prompts = [[x] for x in sample_prompts]
+     # image discrimination model
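+     # LPIPS reports a perceptual distance between two images: lower means more similar;
+     # net='squeeze' selects the lightest (SqueezeNet) backbone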
+     def compare_images(img1, img2):
+         device = 'cuda' if torch.cuda.is_available() else 'cpu'
+         lpips_model = lpips.LPIPS(net='squeeze').to(device)
+         if img1 is None or img2 is None:
+             return "Please upload both images."
+         img1_np = np.array(img1).astype(np.float32) / 255.0
+         img2_np = np.array(img2).astype(np.float32) / 255.0
+         # convert to tensor in LPIPS format
+         img1_tensor = lpips.im2tensor(img1_np).to(device)
+         img2_tensor = lpips.im2tensor(img2_np).to(device)
+         # compute LPIPS distance
+         with torch.no_grad():
+             distance = lpips_model(img1_tensor, img2_tensor)
+         score = distance.item()
+         similarity = max(0.0, 1.0 - score*100) # normalize to positive similarity
+         result_text = (
+             f"LPIPS Distance: {score:.4f}\n"
+             f"Estimated Similarity: {similarity*100:.4f}%"
+         )
+         return result_text
+     # UI
+     block = gr.Blocks(title="image studio").queue()
+     with block:
+         gr.Markdown("## Discriminator")
+         with gr.Row():
+             img1 = gr.Image(type="pil", label="Image 1")
+             img2 = gr.Image(type="pil", label="Image 2")
+         compare_btn = gr.Button("Discriminate")
+         output_box = gr.Textbox(label="Statistics", lines=2)
+         compare_btn.click(compare_images, inputs=[img1, img2], outputs=output_box)
+         gr.Markdown("## Descriptor")
+         with gr.Row():
+             with gr.Column():
+                 img_input = gr.Image(type="pil", label="Input Image")
+                 prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt here (or click Sample Prompt)", value="")
+                 quick_prompts = gr.Dataset(samples=sample1_prompts, label='Sample Prompt', samples_per_page=1000, components=[prompt])
+                 quick_prompts.click(lambda x: x[0], inputs=[quick_prompts], outputs=prompt, show_progress=False, queue=False)
+                 btn = gr.Button("Describe")
+                 num_tokens = gr.Slider(minimum=64, maximum=1024, value=128, step=1, label="Output Token")
+             with gr.Column():
+                 output = gr.Textbox(label="Description", lines=5)
+         btn.click(fn=describe_image, inputs=[img_input, prompt, num_tokens], outputs=output)
+         gr.Markdown("## Generator")
+         with gr.Row():
+             with gr.Column():
+                 prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt here (or click Sample Prompt)", value="")
+                 quick_prompts = gr.Dataset(samples=sample_prompts2, label='Sample Prompt', samples_per_page=1000, components=[prompt])
+                 quick_prompts.click(lambda x: x[0], inputs=[quick_prompts], outputs=prompt, show_progress=False, queue=False)
+                 submit_btn = gr.Button("Generate")
+                 num_steps = gr.Slider(minimum=4, maximum=100, value=8, step=1, label="Step")
+                 guidance = gr.Slider(minimum=1.0, maximum=10.0, value=2.5, step=0.1, label="Scale")
+             with gr.Column():
+                 output_image = gr.Image(type="pil", label="Output Image")
+         submit_btn.click(fn=generate_image2, inputs=[prompt, num_steps, guidance], outputs=output_image)
+         gr.Markdown("## Transformer")
+         with gr.Row():
+             with gr.Column():
+                 with gr.Row():
+                     img1 = gr.Image(label="Image 1", type="pil")
+                     img2 = gr.Image(label="Image 2", type="pil")
+                     img3 = gr.Image(label="Image 3", type="pil")
+                 prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt here (or click Sample Prompt)", value="")
+                 quick_prompts = gr.Dataset(samples=sample_prompts, label='Sample Prompt', samples_per_page=1000, components=[prompt])
+                 quick_prompts.click(lambda x: x[0], inputs=[quick_prompts], outputs=prompt, show_progress=False, queue=False)
+                 generate_btn = gr.Button("Transform")
+                 steps = gr.Slider(1, 50, value=4, step=1, label="Inference Steps", visible=False)
+                 guidance = gr.Slider(0.1, 10.0, value=1.0, step=0.1, label="Guidance Scale", visible=False)
+             with gr.Column():
+                 output_image = gr.Image(label="Output", type="pil")
+         generate_btn.click(
+             fn=generate_image,
+             inputs=[prompt, img1, img2, img3, steps, guidance],
+             outputs=output_image,
+         )
+     block.launch()
+
+ # detect the available device and assign dtype accordingly
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ dtype = torch.bfloat16 if device == "cuda" else torch.float32
+
+ # load the models from cache, or pull them from the huggingface repos if you don't have them locally
+ model_path1 = "https://huggingface.co/calcuis/sd3.5-lite-gguf/blob/main/sd3.5-8b-lite-mxfp4_moe.gguf"
+ model_path = "https://huggingface.co/calcuis/sketch/blob/main/sketch-s9-20b-int4.safetensors"
+
+ # launch the app by calling the function defined above
+ launch_app(model_path1, model_path, dtype)
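
For a fully offline run, the two hub URLs above can be swapped for local file paths. A minimal sketch, assuming both loaders also accept plain file paths and that the files have already been downloaded; "./models" is a hypothetical directory and the file names simply mirror the URLs above:

model_path1 = "./models/sd3.5-8b-lite-mxfp4_moe.gguf"         # hypothetical local copy of the SD3.5 lite GGUF transformer
model_path = "./models/sketch-s9-20b-int4.safetensors"        # hypothetical local copy of the nunchaku Qwen edit transformer
launch_app(model_path1, model_path, dtype)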