Spaces
Commit 53cf806
Parent(s): 1dc205a
add image2depth and depth2image
Browse files:
- app.py (+34 -23)
- visual_foundation_models.py (+109 -77)
app.py
CHANGED
@@ -52,18 +52,19 @@ import gradio as gr
 
 
 def cut_dialogue_history(history_memory, keep_last_n_words=400):
+    if history_memory is None or len(history_memory) == 0:
+        return history_memory
     tokens = history_memory.split()
     n_tokens = len(tokens)
+    print(f"history_memory:{history_memory}, n_tokens: {n_tokens}")
     if n_tokens < keep_last_n_words:
         return history_memory
+    paragraphs = history_memory.split('\n')
+    last_n_tokens = n_tokens
+    while last_n_tokens >= keep_last_n_words:
+        last_n_tokens -= len(paragraphs[0].split(' '))
+        paragraphs = paragraphs[1:]
+    return '\n' + '\n'.join(paragraphs)
 
 
 class ConversationBot:
@@ -74,7 +75,6 @@ class ConversationBot:
             raise ValueError("You have to load ImageCaptioning as a basic function for VisualChatGPT")
 
         self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
         self.models = dict()
         for class_name, device in load_dict.items():
             self.models[class_name] = globals()[class_name](device=device)
@@ -86,7 +86,6 @@ class ConversationBot:
                 func = getattr(instance, e)
                 self.tools.append(Tool(name=func.name, description=func.description, func=func))
 
     def run_text(self, text, state):
         self.agent.memory.buffer = cut_dialogue_history(self.agent.memory.buffer, keep_last_n_words=500)
         res = self.agent({"input": text})
@@ -98,7 +97,7 @@ class ConversationBot:
         return state, state
 
     def run_image(self, image, state, txt):
+        image_filename = os.path.join('image', f"{str(uuid.uuid4())[:8]}.png")
         print("======>Auto Resize Image...")
         img = Image.open(image.name)
         width, height = img.size
@@ -111,17 +110,13 @@ class ConversationBot:
         img.save(image_filename, "PNG")
         print(f"Resize image form {width}x{height} to {width_new}x{height_new}")
         description = self.models['ImageCaptioning'].inference(image_filename)
+        Human_prompt = f'\nHuman: provide a figure named {image_filename}. The description is: {description}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
         AI_prompt = "Received. "
         self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
         state = state + [(f"*{image_filename}*", AI_prompt)]
         print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n"
               f"Current Memory: {self.agent.memory.buffer}")
+        return state, state, f'{txt} {image_filename} '
 
     def init_agent(self, openai_api_key):
         self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
@@ -136,17 +131,25 @@ class ConversationBot:
 
         return gr.update(visible = True)
 
+bot = ConversationBot({'Text2Image': 'cuda:0',
+                       'ImageCaptioning': 'cuda:0',
                        'ImageEditing': 'cuda:0',
                        'VisualQuestionAnswering': 'cuda:0',
+                       'Image2Canny': 'cpu',
+                       'CannyText2Image': 'cuda:0',
+                       'InstructPix2Pix': 'cuda:0',
+                       'Image2Depth': 'cpu',
+                       'DepthText2Image': 'cuda:0',
+                       })
 
 with gr.Blocks(css="#chatbot {overflow:auto; height:500px;}") as demo:
     with gr.Row():
         gr.Markdown("<h3><center>Visual ChatGPT</center></h3>")
+        gr.Markdown(
+            """This is a demo to the work [Visual ChatGPT: Talking, Drawing and Editing with Visual Foundation Models](https://github.com/microsoft/visual-chatgpt).<br>
+            This space connects ChatGPT and a series of Visual Foundation Models to enable sending and receiving images during chatting.<br>
+            """
+        )
 
     with gr.Row():
         openai_api_key_textbox = gr.Textbox(
@@ -177,10 +180,18 @@ with gr.Blocks(css="#chatbot {overflow:auto; height:500px;}") as demo:
                 "Can you use this canny image to generate an oil painting of a dog",
                 "Make it like water-color painting",
                 "What is the background color",
+                "Describe this image",
+                "please detect the depth of this image",
+                "Can you use this depth image to generate a cute dog",
+            ],
             inputs=txt
         )
 
+    gr.HTML('''<br><br><br><center>You can duplicate this Space to skip the queue:
+            <a href="https://huggingface.co/spaces/microsoft/visual_chatgpt?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a><br>
+            </center>''')
+
+
 
     openai_api_key_textbox.submit(bot.init_agent, [openai_api_key_textbox], [input_raws])
    txt.submit(bot.run_text, [txt, state], [chatbot, state])
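
For reference, a minimal, self-contained sketch of how the reworked cut_dialogue_history trims the agent's memory buffer. The function body is copied from the app.py hunk above; the sample buffer and the driver lines at the bottom are made-up assumptions for illustration only.

def cut_dialogue_history(history_memory, keep_last_n_words=400):
    # Copied from the hunk above: drop whole leading paragraphs until
    # fewer than keep_last_n_words tokens remain in the buffer.
    if history_memory is None or len(history_memory) == 0:
        return history_memory
    tokens = history_memory.split()
    n_tokens = len(tokens)
    print(f"history_memory:{history_memory}, n_tokens: {n_tokens}")
    if n_tokens < keep_last_n_words:
        return history_memory
    paragraphs = history_memory.split('\n')
    last_n_tokens = n_tokens
    while last_n_tokens >= keep_last_n_words:
        last_n_tokens -= len(paragraphs[0].split(' '))
        paragraphs = paragraphs[1:]
    return '\n' + '\n'.join(paragraphs)

# Hypothetical buffer of 200 short dialogue turns, only for illustration.
buffer = '\n'.join(f"Human: question {i}\nAI: answer {i}" for i in range(200))
trimmed = cut_dialogue_history(buffer, keep_last_n_words=500)
print(len(buffer.split()), len(trimmed.split()))  # the trimmed buffer keeps only the most recent paragraphs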
visual_foundation_models.py
CHANGED
@@ -44,7 +44,7 @@ def get_new_image_name(org_img_name, func_name="update"):
 
 class MaskFormer:
     def __init__(self, device):
+        print(f"Initializing MaskFormer to {device}")
         self.device = device
         self.processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
         self.model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined").to(device)
@@ -74,24 +74,27 @@ class MaskFormer:
 
 class ImageEditing:
     def __init__(self, device):
+        print(f"Initializing ImageEditing to {device}")
         self.device = device
         self.mask_former = MaskFormer(device=self.device)
+        self.revision = 'fp16' if 'cuda' in device else None
+        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
+        self.inpaint = StableDiffusionInpaintPipeline.from_pretrained(
+            "runwayml/stable-diffusion-inpainting", revision=self.revision, torch_dtype=self.torch_dtype).to(device)
 
     @prompts(name="Remove Something From The Photo",
              description="useful when you want to remove and object or something from the photo "
                          "from its description or location. "
+                         "The input to this tool should be a comma separated string of two, "
                          "representing the image_path and the object need to be removed. ")
     def inference_remove(self, inputs):
+        image_path, to_be_removed_txt = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         return self.inference_replace(f"{image_path},{to_be_removed_txt},background")
 
     @prompts(name="Replace Something From The Photo",
              description="useful when you want to replace an object from the object description or "
                          "location with another object from its description. "
+                         "The input to this tool should be a comma separated string of three, "
                          "representing the image_path, the object to be replaced, the object to be replaced with ")
     def inference_replace(self, inputs):
         image_path, to_be_replaced_txt, replace_with_txt = inputs.split(",")
@@ -111,16 +114,18 @@ class ImageEditing:
 
 class InstructPix2Pix:
     def __init__(self, device):
+        print(f"Initializing InstructPix2Pix to {device}")
         self.device = device
+        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
+        self.pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained("timbrooks/instruct-pix2pix",
+                                                                           safety_checker=None,
+                                                                           torch_dtype=self.torch_dtype).to(device)
         self.pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(self.pipe.scheduler.config)
 
     @prompts(name="Instruct Image Using Text",
              description="useful when you want to the style of the image to be like the text. "
                          "like: make it look like a painting. or make it like a robot. "
+                         "The input to this tool should be a comma separated string of two, "
                          "representing the image_path and the text. ")
     def inference(self, inputs):
         """Change style of image."""
@@ -163,17 +168,18 @@ class Text2Image:
 
 class ImageCaptioning:
     def __init__(self, device):
+        print(f"Initializing ImageCaptioning to {device}")
         self.device = device
+        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
         self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
         self.model = BlipForConditionalGeneration.from_pretrained(
+            "Salesforce/blip-image-captioning-base", torch_dtype=self.torch_dtype).to(self.device)
 
     @prompts(name="Get Photo Description",
              description="useful when you want to know what is inside the photo. receives image_path as input. "
                          "The input to this tool should be a string, representing the image_path. ")
     def inference(self, image_path):
+        inputs = self.processor(Image.open(image_path), return_tensors="pt").to(self.device, self.torch_dtype)
         out = self.model.generate(**inputs)
         captions = self.processor.decode(out[0], skip_special_tokens=True)
         print(f"\nProcessed ImageCaptioning, Input Image: {image_path}, Output Text: {captions}")
@@ -206,29 +212,32 @@ class Image2Canny:
 
 class CannyText2Image:
     def __init__(self, device):
+        print(f"Initializing CannyText2Image to {device}")
+        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
+        self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-canny",
+                                                          torch_dtype=self.torch_dtype)
         self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
+            torch_dtype=self.torch_dtype)
         self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
         self.pipe.to(device)
         self.seed = -1
         self.a_prompt = 'best quality, extremely detailed'
         self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
+                        'fewer digits, cropped, worst quality, low quality'
 
     @prompts(name="Generate Image Condition On Canny Image",
+             description="useful when you want to generate a new real image from both the user description and a canny image."
                          " like: generate a real image of a object or something from this canny image,"
                          " or generate a new real image of a object or something from this edge image. "
+                         "The input to this tool should be a comma separated string of two, "
                          "representing the image_path and the user description. ")
     def inference(self, inputs):
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         self.seed = random.randint(0, 65535)
         seed_everything(self.seed)
+        prompt = f'{instruct_text}, {self.a_prompt}'
         image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
                           guidance_scale=9.0).images[0]
         updated_image_path = get_new_image_name(image_path, func_name="canny2image")
@@ -246,7 +255,7 @@ class Image2Line:
     @prompts(name="Line Detection On Image",
              description="useful when you want to detect the straight line of the image. "
                          "like: detect the straight lines of this image, or straight line detection on image, "
+                         "or perform straight line detection on this image, or detect the straight line image of this image. "
                          "The input to this tool should be a string, representing the image_path")
     def inference(self, inputs):
         image = Image.open(inputs)
@@ -259,31 +268,34 @@ class Image2Line:
 
 class LineText2Image:
     def __init__(self, device):
+        print(f"Initializing LineText2Image to {device}")
+        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
+        self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-mlsd",
+                                                          torch_dtype=self.torch_dtype)
         self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
+            torch_dtype=self.torch_dtype
         )
         self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
         self.pipe.to(device)
         self.seed = -1
         self.a_prompt = 'best quality, extremely detailed'
         self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
+                        'fewer digits, cropped, worst quality, low quality'
 
     @prompts(name="Generate Image Condition On Line Image",
+             description="useful when you want to generate a new real image from both the user description "
                          "and a straight line image. "
                          "like: generate a real image of a object or something from this straight line image, "
                          "or generate a new real image of a object or something from this straight lines. "
+                         "The input to this tool should be a comma separated string of two, "
                          "representing the image_path and the user description. ")
     def inference(self, inputs):
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         self.seed = random.randint(0, 65535)
         seed_everything(self.seed)
+        prompt = f'{instruct_text}, {self.a_prompt}'
         image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
                           guidance_scale=9.0).images[0]
         updated_image_path = get_new_image_name(image_path, func_name="line2image")
@@ -301,7 +313,7 @@ class Image2Hed:
     @prompts(name="Hed Detection On Image",
              description="useful when you want to detect the soft hed boundary of the image. "
                          "like: detect the soft hed boundary of this image, or hed boundary detection on image, "
+                         "or perform hed boundary detection on this image, or detect soft hed boundary image of this image. "
                          "The input to this tool should be a string, representing the image_path")
     def inference(self, inputs):
         image = Image.open(inputs)
@@ -314,31 +326,34 @@ class Image2Hed:
 
 class HedText2Image:
     def __init__(self, device):
+        print(f"Initializing HedText2Image to {device}")
+        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
+        self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-hed",
+                                                          torch_dtype=self.torch_dtype)
         self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
+            torch_dtype=self.torch_dtype
         )
         self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
         self.pipe.to(device)
         self.seed = -1
         self.a_prompt = 'best quality, extremely detailed'
         self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
+                        'fewer digits, cropped, worst quality, low quality'
 
     @prompts(name="Generate Image Condition On Soft Hed Boundary Image",
+             description="useful when you want to generate a new real image from both the user description "
                          "and a soft hed boundary image. "
                          "like: generate a real image of a object or something from this soft hed boundary image, "
                          "or generate a new real image of a object or something from this hed boundary. "
+                         "The input to this tool should be a comma separated string of two, "
                          "representing the image_path and the user description")
     def inference(self, inputs):
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         self.seed = random.randint(0, 65535)
         seed_everything(self.seed)
+        prompt = f'{instruct_text}, {self.a_prompt}'
         image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
                           guidance_scale=9.0).images[0]
         updated_image_path = get_new_image_name(image_path, func_name="hed2image")
@@ -369,29 +384,32 @@ class Image2Scribble:
 
 class ScribbleText2Image:
     def __init__(self, device):
+        print(f"Initializing ScribbleText2Image to {device}")
+        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
+        self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-scribble",
+                                                          torch_dtype=self.torch_dtype)
         self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
+            torch_dtype=self.torch_dtype
        )
         self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
         self.pipe.to(device)
         self.seed = -1
         self.a_prompt = 'best quality, extremely detailed'
         self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
+                        'fewer digits, cropped, worst quality, low quality'
 
     @prompts(name="Generate Image Condition On Sketch Image",
+             description="useful when you want to generate a new real image from both the user description and "
                          "a scribble image or a sketch image. "
+                         "The input to this tool should be a comma separated string of two, "
                          "representing the image_path and the user description")
     def inference(self, inputs):
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         self.seed = random.randint(0, 65535)
         seed_everything(self.seed)
+        prompt = f'{instruct_text}, {self.a_prompt}'
         image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
                           guidance_scale=9.0).images[0]
         updated_image_path = get_new_image_name(image_path, func_name="scribble2image")
@@ -421,10 +439,13 @@ class Image2Pose:
 
 class PoseText2Image:
     def __init__(self, device):
+        print(f"Initializing PoseText2Image to {device}")
+        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
+        self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-openpose",
+                                                          torch_dtype=self.torch_dtype)
         self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
+            torch_dtype=self.torch_dtype)
         self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
         self.pipe.to(device)
         self.num_inference_steps = 20
@@ -432,21 +453,21 @@ class PoseText2Image:
         self.unconditional_guidance_scale = 9.0
         self.a_prompt = 'best quality, extremely detailed'
         self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
+                        ' fewer digits, cropped, worst quality, low quality'
 
     @prompts(name="Generate Image Condition On Pose Image",
+             description="useful when you want to generate a new real image from both the user description "
                          "and a human pose image. "
                          "like: generate a real image of a human from this human pose image, "
                          "or generate a new real image of a human from this pose. "
+                         "The input to this tool should be a comma separated string of two, "
                          "representing the image_path and the user description")
     def inference(self, inputs):
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         self.seed = random.randint(0, 65535)
         seed_everything(self.seed)
+        prompt = f'{instruct_text}, {self.a_prompt}'
         image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
                           guidance_scale=9.0).images[0]
         updated_image_path = get_new_image_name(image_path, func_name="pose2image")
@@ -503,7 +524,7 @@ class Image2Seg:
     @prompts(name="Segmentation On Image",
              description="useful when you want to detect segmentations of the image. "
                          "like: segment this image, or generate segmentations on this image, "
+                         "or perform segmentation on this image. "
                          "The input to this tool should be a string, representing the image_path")
     def inference(self, inputs):
         image = Image.open(inputs)
@@ -525,29 +546,32 @@ class Image2Seg:
 
 class SegText2Image:
     def __init__(self, device):
+        print(f"Initializing SegText2Image to {device}")
+        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
+        self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-seg",
+                                                          torch_dtype=self.torch_dtype)
         self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
+            torch_dtype=self.torch_dtype)
         self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
         self.pipe.to(device)
         self.seed = -1
         self.a_prompt = 'best quality, extremely detailed'
         self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
+                        ' fewer digits, cropped, worst quality, low quality'
 
     @prompts(name="Generate Image Condition On Segmentations",
+             description="useful when you want to generate a new real image from both the user description and segmentations. "
                          "like: generate a real image of a object or something from this segmentation image, "
                          "or generate a new real image of a object or something from these segmentations. "
+                         "The input to this tool should be a comma separated string of two, "
                          "representing the image_path and the user description")
     def inference(self, inputs):
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         self.seed = random.randint(0, 65535)
         seed_everything(self.seed)
+        prompt = f'{instruct_text}, {self.a_prompt}'
         image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
                           guidance_scale=9.0).images[0]
         updated_image_path = get_new_image_name(image_path, func_name="segment2image")
@@ -581,29 +605,32 @@ class Image2Depth:
 
 class DepthText2Image:
     def __init__(self, device):
+        print(f"Initializing DepthText2Image to {device}")
+        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
+        self.controlnet = ControlNetModel.from_pretrained(
+            "fusing/stable-diffusion-v1-5-controlnet-depth", torch_dtype=self.torch_dtype)
         self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
+            torch_dtype=self.torch_dtype)
         self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
         self.pipe.to(device)
         self.seed = -1
         self.a_prompt = 'best quality, extremely detailed'
         self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
+                        ' fewer digits, cropped, worst quality, low quality'
 
     @prompts(name="Generate Image Condition On Depth",
+             description="useful when you want to generate a new real image from both the user description and depth image. "
                          "like: generate a real image of a object or something from this depth image, "
                          "or generate a new real image of a object or something from the depth map. "
+                         "The input to this tool should be a comma separated string of two, "
                          "representing the image_path and the user description")
     def inference(self, inputs):
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         self.seed = random.randint(0, 65535)
         seed_everything(self.seed)
+        prompt = f'{instruct_text}, {self.a_prompt}'
         image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
                           guidance_scale=9.0).images[0]
         updated_image_path = get_new_image_name(image_path, func_name="depth2image")
@@ -649,29 +676,32 @@ class Image2Normal:
 
 class NormalText2Image:
     def __init__(self, device):
+        print(f"Initializing NormalText2Image to {device}")
+        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
+        self.controlnet = ControlNetModel.from_pretrained(
+            "fusing/stable-diffusion-v1-5-controlnet-normal", torch_dtype=self.torch_dtype)
         self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None,
+            torch_dtype=self.torch_dtype)
         self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
         self.pipe.to(device)
         self.seed = -1
         self.a_prompt = 'best quality, extremely detailed'
         self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
+                        ' fewer digits, cropped, worst quality, low quality'
 
     @prompts(name="Generate Image Condition On Normal Map",
+             description="useful when you want to generate a new real image from both the user description and normal map. "
                          "like: generate a real image of a object or something from this normal map, "
                          "or generate a new real image of a object or something from the normal map. "
+                         "The input to this tool should be a comma separated string of two, "
                          "representing the image_path and the user description")
     def inference(self, inputs):
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         self.seed = random.randint(0, 65535)
         seed_everything(self.seed)
+        prompt = f'{instruct_text}, {self.a_prompt}'
         image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
                           guidance_scale=9.0).images[0]
         updated_image_path = get_new_image_name(image_path, func_name="normal2image")
@@ -683,19 +713,21 @@ class NormalText2Image:
 
 class VisualQuestionAnswering:
     def __init__(self, device):
+        print(f"Initializing VisualQuestionAnswering to {device}")
+        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
         self.device = device
         self.processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+        self.model = BlipForQuestionAnswering.from_pretrained(
+            "Salesforce/blip-vqa-base", torch_dtype=self.torch_dtype).to(self.device)
 
     @prompts(name="Answer Question About The Image",
              description="useful when you need an answer for a question based on an image. "
                          "like: what is the background color of the last image, how many cats in this figure, what is in this figure. "
+                         "The input to this tool should be a comma separated string of two, representing the image_path and the question")
     def inference(self, inputs):
+        image_path, question = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         raw_image = Image.open(image_path).convert('RGB')
+        inputs = self.processor(raw_image, question, return_tensors="pt").to(self.device, self.torch_dtype)
         out = self.model.generate(**inputs)
         answer = self.processor.decode(out[0], skip_special_tokens=True)
         print(f"\nProcessed VisualQuestionAnswering, Input Image: {image_path}, Input Question: {question}, "
|