import random

import cv2
import numpy as np
import torchvision.transforms as T
from PIL import Image, ImageDraw, ImageFilter
from torch.utils.data import Dataset


class Subject200KDataset(Dataset):
    def __init__(
        self,
        base_dataset,
        condition_size: int = 512,
        target_size: int = 512,
        image_size: int = 512,
        padding: int = 0,
        condition_type: str = "subject",
        drop_text_prob: float = 0.1,
        drop_image_prob: float = 0.1,
        return_pil_image: bool = False,
    ):
        self.base_dataset = base_dataset
        self.condition_size = condition_size
        self.target_size = target_size
        self.image_size = image_size
        self.padding = padding
        self.condition_type = condition_type
        self.drop_text_prob = drop_text_prob
        self.drop_image_prob = drop_image_prob
        self.return_pil_image = return_pil_image
        self.to_tensor = T.ToTensor()

    def __len__(self):
        # Each base item is a side-by-side image pair, so it yields two samples.
        return len(self.base_dataset) * 2

    def __getitem__(self, idx):
        # If target is 0, the left image is the target and the right image is
        # the condition; odd indices swap the roles.
        target = idx % 2
        item = self.base_dataset[idx // 2]

        # Crop the paired image into its left and right halves
        image = item["image"]
        left_img = image.crop(
            (
                self.padding,
                self.padding,
                self.image_size + self.padding,
                self.image_size + self.padding,
            )
        )
        right_img = image.crop(
            (
                self.image_size + self.padding * 2,
                self.padding,
                self.image_size * 2 + self.padding * 2,
                self.image_size + self.padding,
            )
        )

        # Pick the target and condition images
        target_image, condition_img = (
            (left_img, right_img) if target == 0 else (right_img, left_img)
        )

        # Resize both images
        condition_img = condition_img.resize(
            (self.condition_size, self.condition_size)
        ).convert("RGB")
        target_image = target_image.resize(
            (self.target_size, self.target_size)
        ).convert("RGB")

        # Get the matching description
        description = item["description"][
            "description_0" if target == 0 else "description_1"
        ]

        # Randomly drop text or image
        drop_text = random.random() < self.drop_text_prob
        drop_image = random.random() < self.drop_image_prob
        if drop_text:
            description = ""
        if drop_image:
            condition_img = Image.new(
                "RGB", (self.condition_size, self.condition_size), (0, 0, 0)
            )

        return {
            "image": self.to_tensor(target_image),
            "condition": self.to_tensor(condition_img),
            "condition_type": self.condition_type,
            "description": description,
            # 16 is the downscale factor of the image, so a 512-px condition
            # shifts the position grid by 512 // 16 = 32.
            "position_delta": np.array([0, -self.condition_size // 16]),
            **({"pil_image": image} if self.return_pil_image else {}),
        }
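

# A minimal usage sketch, not from the original file. The Hugging Face dataset
# id "Yuanshi/Subjects200K" is an assumption for illustration; the only real
# requirement is that base items expose the "image" and "description" fields
# indexed by __getitem__ above.
def _demo_subject200k_dataset():
    from datasets import load_dataset  # assumed dependency for this demo

    base = load_dataset("Yuanshi/Subjects200K", split="train")
    dataset = Subject200KDataset(base, condition_size=512, target_size=512)
    sample = dataset[0]  # even index: left crop is the target
    print(sample["image"].shape)      # torch.Size([3, 512, 512])
    print(sample["condition"].shape)  # torch.Size([3, 512, 512])
    print(sample["position_delta"])   # [  0 -32], since 512 // 16 == 32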


class ImageConditionDataset(Dataset):
    def __init__(
        self,
        base_dataset,
        condition_size: int = 512,
        target_size: int = 512,
        condition_type: str = "canny",
        drop_text_prob: float = 0.1,
        drop_image_prob: float = 0.1,
        return_pil_image: bool = False,
        position_scale=1.0,
    ):
        self.base_dataset = base_dataset
        self.condition_size = condition_size
        self.target_size = target_size
        self.condition_type = condition_type
        self.drop_text_prob = drop_text_prob
        self.drop_image_prob = drop_image_prob
        self.return_pil_image = return_pil_image
        self.position_scale = position_scale
        self.to_tensor = T.ToTensor()

    def __len__(self):
        return len(self.base_dataset)

    @property
    def depth_pipe(self):
        # Lazily construct the depth-estimation pipeline on first access. The
        # @property decorator is required because __getitem__ invokes
        # self.depth_pipe(image) as if it were the pipeline object itself.
        if not hasattr(self, "_depth_pipe"):
            from transformers import pipeline

            self._depth_pipe = pipeline(
                task="depth-estimation",
                model="LiheYoung/depth-anything-small-hf",
                device="cpu",
            )
        return self._depth_pipe

    def _get_canny_edge(self, img):
        # Scale the longer side down to condition_size before edge detection
        resize_ratio = self.condition_size / max(img.size)
        img = img.resize(
            (int(img.size[0] * resize_ratio), int(img.size[1] * resize_ratio))
        )
        img_np = np.array(img)
        img_gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
        edges = cv2.Canny(img_gray, 100, 200)
        return Image.fromarray(edges).convert("RGB")
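
    # A worked example of the resize above (illustrative numbers, not from the
    # original file): a 1024x768 input with condition_size=512 gives
    # resize_ratio = 512 / 1024 = 0.5, so Canny runs on a 512x384 grayscale
    # image; 100/200 are OpenCV's usual low/high hysteresis thresholds.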

    def __getitem__(self, idx):
        image = self.base_dataset[idx]["jpg"]
        image = image.resize((self.target_size, self.target_size)).convert("RGB")
        description = self.base_dataset[idx]["json"]["prompt"]

        # Note: random.random() < 1 is always True, so the downscaled branch
        # below is effectively dead code and the configured position_scale is
        # always used.
        enable_scale = random.random() < 1
        if not enable_scale:
            condition_size = int(self.condition_size * self.position_scale)
            position_scale = 1.0
        else:
            condition_size = self.condition_size
            position_scale = self.position_scale

        # Get the condition image
        position_delta = np.array([0, 0])
        if self.condition_type == "canny":
            condition_img = self._get_canny_edge(image)
        elif self.condition_type == "coloring":
            condition_img = (
                image.resize((condition_size, condition_size))
                .convert("L")
                .convert("RGB")
            )
        elif self.condition_type == "deblurring":
            blur_radius = random.randint(1, 10)
            condition_img = (
                image.convert("RGB")
                .filter(ImageFilter.GaussianBlur(blur_radius))
                .resize((condition_size, condition_size))
                .convert("RGB")
            )
        elif self.condition_type == "depth":
            condition_img = self.depth_pipe(image)["depth"].convert("RGB")
            condition_img = condition_img.resize((condition_size, condition_size))
        elif self.condition_type == "depth_pred":
            # Inverted task: the RGB image is the condition and the predicted
            # depth map becomes the generation target.
            condition_img = image
            image = self.depth_pipe(condition_img)["depth"].convert("RGB")
            description = f"[depth] {description}"
        elif self.condition_type == "fill":
            condition_img = image.resize((condition_size, condition_size)).convert(
                "RGB"
            )
            w, h = image.size
            x1, x2 = sorted([random.randint(0, w), random.randint(0, w)])
            y1, y2 = sorted([random.randint(0, h), random.randint(0, h)])
            mask = Image.new("L", image.size, 0)
            draw = ImageDraw.Draw(mask)
            draw.rectangle([x1, y1, x2, y2], fill=255)
            if random.random() > 0.5:
                mask = Image.eval(mask, lambda a: 255 - a)
            # Note: this composite overwrites the resized image above, so the
            # condition keeps the full target resolution, not condition_size.
            condition_img = Image.composite(
                image, Image.new("RGB", image.size, (0, 0, 0)), mask
            )
        elif self.condition_type == "sr":
            condition_img = image.resize((condition_size, condition_size)).convert(
                "RGB"
            )
            position_delta = np.array([0, -condition_size // 16])
        else:
            raise ValueError(f"Condition type {self.condition_type} not implemented")

        # Randomly drop text or image
        drop_text = random.random() < self.drop_text_prob
        drop_image = random.random() < self.drop_image_prob
        if drop_text:
            description = ""
        if drop_image:
            condition_img = Image.new(
                "RGB", (condition_size, condition_size), (0, 0, 0)
            )

        return {
            "image": self.to_tensor(image),
            "condition": self.to_tensor(condition_img),
            "condition_type": self.condition_type,
            "description": description,
            "position_delta": position_delta,
            **({"pil_image": [image, condition_img]} if self.return_pil_image else {}),
            **({"position_scale": position_scale} if position_scale != 1.0 else {}),
        }
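

# A minimal usage sketch, not from the original file: the base dataset is
# assumed to yield webdataset-style items with a "jpg" PIL image and a "json"
# dict carrying a "prompt", which is exactly what __getitem__ above reads.
def _demo_image_condition_dataset():
    class _FakeBase:
        # Tiny in-memory stand-in so the sketch runs without any downloads.
        def __len__(self):
            return 1

        def __getitem__(self, idx):
            return {
                "jpg": Image.new("RGB", (768, 768), (128, 64, 32)),
                "json": {"prompt": "a toy prompt"},
            }

    dataset = ImageConditionDataset(_FakeBase(), condition_type="canny")
    sample = dataset[0]
    print(sample["image"].shape)      # torch.Size([3, 512, 512])
    print(sample["condition"].shape)  # torch.Size([3, 512, 512])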


class CartoonDataset(Dataset):
    def __init__(
        self,
        base_dataset,
        condition_size: int = 1024,
        target_size: int = 1024,
        image_size: int = 1024,
        padding: int = 0,
        condition_type: str = "cartoon",
        drop_text_prob: float = 0.1,
        drop_image_prob: float = 0.1,
        return_pil_image: bool = False,
    ):
        self.base_dataset = base_dataset
        self.condition_size = condition_size
        self.target_size = target_size
        self.image_size = image_size
        self.padding = padding
        self.condition_type = condition_type
        self.drop_text_prob = drop_text_prob
        self.drop_image_prob = drop_image_prob
        self.return_pil_image = return_pil_image
        self.to_tensor = T.ToTensor()

    def __len__(self):
        return len(self.base_dataset)

    def __getitem__(self, idx):
        data = self.base_dataset[idx]
        condition_img = data["condition"]
        target_image = data["target"]

        # Map the item's first tag to a descriptive phrase
        tag = data["tags"][0]
        target_description = data["target_description"]
        tag_phrases = {
            "lion": "lion like animal",
            "bear": "bear like animal",
            "gorilla": "gorilla like animal",
            "dog": "dog like animal",
            "elephant": "elephant like animal",
            "eagle": "eagle like bird",
            "tiger": "tiger like animal",
            "owl": "owl like bird",
            "woman": "woman",
            "parrot": "parrot like bird",
            "mouse": "mouse like animal",
            "man": "man",
            "pigeon": "pigeon like bird",
            "girl": "girl",
            "panda": "panda like animal",
            "crocodile": "crocodile like animal",
            "rabbit": "rabbit like animal",
            "boy": "boy",
            "monkey": "monkey like animal",
            "cat": "cat like animal",
        }

        # Resize the images
        condition_img = condition_img.resize(
            (self.condition_size, self.condition_size)
        ).convert("RGB")
        target_image = target_image.resize(
            (self.target_size, self.target_size)
        ).convert("RGB")

        # Use the item's own description if present; otherwise build one
        description = data.get(
            "description",
            f"Photo of a {tag_phrases[tag]} cartoon character in a white "
            f"background. Character is facing "
            f"{target_description['facing_direction']}. "
            f"Character pose is {target_description['pose']}.",
        )

        # Randomly drop text or image
        drop_text = random.random() < self.drop_text_prob
        drop_image = random.random() < self.drop_image_prob
        if drop_text:
            description = ""
        if drop_image:
            condition_img = Image.new(
                "RGB", (self.condition_size, self.condition_size), (0, 0, 0)
            )

        return {
            "image": self.to_tensor(target_image),
            "condition": self.to_tensor(condition_img),
            "condition_type": self.condition_type,
            "description": description,
            # 16 is the downscale factor of the image
            "position_delta": np.array([0, -16]),
        }
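

# A minimal usage sketch, not from the original file: base items are assumed
# to provide "condition"/"target" PIL images, a "tags" list whose first entry
# is a key in the phrase table above, and a "target_description" dict with
# "facing_direction" and "pose" fields.
def _demo_cartoon_dataset():
    fake_item = {
        "condition": Image.new("RGB", (1024, 1024), (255, 255, 255)),
        "target": Image.new("RGB", (1024, 1024), (200, 200, 200)),
        "tags": ["cat"],
        "target_description": {"facing_direction": "left", "pose": "sitting"},
    }
    dataset = CartoonDataset([fake_item])
    sample = dataset[0]
    print(sample["description"])  # built from the phrase table unless dropped
    print(sample["image"].shape, sample["condition"].shape)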