Spaces:

Salesforce
/

EDICT

Runtime error

EDICT / edict_functions.py

bram-w

safety check

93d448d over 2 years ago

42.6 kB

	import torch
	from transformers import CLIPModel, CLIPTextModel, CLIPTokenizer
	from omegaconf import OmegaConf
	import math
	import imageio
	from PIL import Image
	import torchvision
	import torch.nn.functional as F
	import torch
	import numpy as np
	from PIL import Image
	import time
	import datetime
	import torch
	import sys
	import os
	from torchvision import datasets
	import pickle



	# StableDiffusion P2P implementation originally from https://github.com/bloc97/CrossAttentionControl
	use_half_prec = True
	if use_half_prec:
	from my_half_diffusers import AutoencoderKL, UNet2DConditionModel
	from my_half_diffusers.schedulers.scheduling_utils import SchedulerOutput
	from my_half_diffusers import LMSDiscreteScheduler, PNDMScheduler, DDPMScheduler, DDIMScheduler
	else:
	from my_diffusers import AutoencoderKL, UNet2DConditionModel
	from my_diffusers.schedulers.scheduling_utils import SchedulerOutput
	from my_diffusers import LMSDiscreteScheduler, PNDMScheduler, DDPMScheduler, DDIMScheduler
	torch_dtype = torch.float16 if use_half_prec else torch.float64
	np_dtype = np.float16 if use_half_prec else np.float64



	import random
	from tqdm.auto import tqdm
	from torch import autocast
	from difflib import SequenceMatcher

	# Build our CLIP model
	model_path_clip = "openai/clip-vit-large-patch14"
	clip_tokenizer = CLIPTokenizer.from_pretrained(model_path_clip)
	clip_model = CLIPModel.from_pretrained(model_path_clip, torch_dtype=torch_dtype)
	clip = clip_model.text_model


	# Getting our HF Auth token
	auth_token = os.environ.get('auth_token')
	if auth_token is None:
	with open('hf_auth', 'r') as f:
	auth_token = f.readlines()[0].strip()
	model_path_diffusion = "CompVis/stable-diffusion-v1-4"
	# Build our SD model
	unet = UNet2DConditionModel.from_pretrained(model_path_diffusion, subfolder="unet", use_auth_token=auth_token, revision="fp16", torch_dtype=torch_dtype)
	vae = AutoencoderKL.from_pretrained(model_path_diffusion, subfolder="vae", use_auth_token=auth_token, revision="fp16", torch_dtype=torch_dtype)

	# Push to devices w/ double precision
	device = 'cuda'
	if use_half_prec:
	unet.to(device)
	vae.to(device)
	clip.to(device)
	else:
	unet.double().to(device)
	vae.double().to(device)
	clip.double().to(device)
	print("Loaded all models")

	from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
	from transformers import AutoFeatureExtractor
	# load safety model
	safety_model_id = "CompVis/stable-diffusion-safety-checker"
	safety_feature_extractor = AutoFeatureExtractor.from_pretrained(safety_model_id)
	safety_checker = StableDiffusionSafetyChecker.from_pretrained(safety_model_id)
	def load_replacement(x):
	try:
	hwc = x.shape
	y = Image.open("assets/rick.jpeg").convert("RGB").resize((hwc[1], hwc[0]))
	y = (np.array(y)/255.0).astype(x.dtype)
	assert y.shape == x.shape
	return y
	except Exception:
	return x
	def check_safety(x_image):
	safety_checker_input = safety_feature_extractor(numpy_to_pil(x_image), return_tensors="pt")
	x_checked_image, has_nsfw_concept = safety_checker(images=x_image, clip_input=safety_checker_input.pixel_values)
	assert x_checked_image.shape[0] == len(has_nsfw_concept)
	for i in range(len(has_nsfw_concept)):
	if has_nsfw_concept[i]:
	# x_checked_image[i] = load_replacement(x_checked_image[i])
	x_checked_image[i] *= 0 # load_replacement(x_checked_image[i])
	return x_checked_image, has_nsfw_concept


	def EDICT_editing(im_path,
	base_prompt,
	edit_prompt,
	use_p2p=False,
	steps=50,
	mix_weight=0.93,
	init_image_strength=0.8,
	guidance_scale=3,
	run_baseline=False,
	width=512, height=512):
	"""
	Main call of our research, performs editing with either EDICT or DDIM

	Args:
	im_path: path to image to run on
	base_prompt: conditional prompt to deterministically noise with
	edit_prompt: desired text conditoining
	steps: ddim steps
	mix_weight: Weight of mixing layers.
	Higher means more consistent generations but divergence in inversion
	Lower means opposite
	This is fairly tuned and can get good results
	init_image_strength: Editing strength. Higher = more dramatic edit.
	Typically [0.6, 0.9] is good range.
	Definitely tunable per-image/maybe best results are at a different value
	guidance_scale: classifier-free guidance scale
	3 I've found is the best for both our method and basic DDIM inversion
	Higher can result in more distorted results
	run_baseline:
	VERY IMPORTANT
	True is EDICT, False is DDIM
	Output:
	PAIR of Images (tuple)
	If run_baseline=True then [0] will be edit and [1] will be original
	If run_baseline=False then they will be two nearly identical edited versions
	"""
	# Resize/center crop to 512x512 (Can do higher res. if desired)
	if isinstance(im_path, str):
	orig_im = load_im_into_format_from_path(im_path)
	elif Image.isImageType(im_path):
	width, height = im_path.size


	# add max dim for sake of memory
	max_dim = max(width, height)
	if max_dim > 1024:
	factor = 1024 / max_dim
	width *= factor
	height *= factor
	width = int(width)
	height = int(height)
	im_path = im_path.resize((width, height))

	min_dim = min(width, height)
	if min_dim < 512:
	factor = 512 / min_dim
	width *= factor
	height *= factor
	width = int(width)
	height = int(height)
	im_path = im_path.resize((width, height))

	width = width - (width%64)
	height = height - (height%64)

	orig_im = im_path # general_crop(im_path, width, height)
	else:
	orig_im = im_path

	# compute latent pair (second one will be original latent if run_baseline=True)
	latents = coupled_stablediffusion(base_prompt,
	reverse=True,
	init_image=orig_im,
	init_image_strength=init_image_strength,
	steps=steps,
	mix_weight=mix_weight,
	guidance_scale=guidance_scale,
	run_baseline=run_baseline,
	width=width, height=height)
	# Denoise intermediate state with new conditioning
	gen = coupled_stablediffusion(edit_prompt if (not use_p2p) else base_prompt,
	None if (not use_p2p) else edit_prompt,
	fixed_starting_latent=latents,
	init_image_strength=init_image_strength,
	steps=steps,
	mix_weight=mix_weight,
	guidance_scale=guidance_scale,
	run_baseline=run_baseline,
	width=width, height=height)

	return gen


	def img2img_editing(im_path,
	edit_prompt,
	steps=50,
	init_image_strength=0.7,
	guidance_scale=3):
	"""
	Basic SDEdit/img2img, given an image add some noise and denoise with prompt
	"""
	orig_im = load_im_into_format_from_path(im_path)

	return baseline_stablediffusion(edit_prompt,
	init_image_strength=init_image_strength,
	steps=steps,
	init_image=orig_im,
	guidance_scale=guidance_scale)


	def center_crop(im):
	width, height = im.size # Get dimensions
	min_dim = min(width, height)
	left = (width - min_dim)/2
	top = (height - min_dim)/2
	right = (width + min_dim)/2
	bottom = (height + min_dim)/2

	# Crop the center of the image
	im = im.crop((left, top, right, bottom))
	return im



	def general_crop(im, target_w, target_h):
	width, height = im.size # Get dimensions
	min_dim = min(width, height)
	left = target_w / 2 # (width - min_dim)/2
	top = target_h / 2 # (height - min_dim)/2
	right = width - (target_w / 2) # (width + min_dim)/2
	bottom = height - (target_h / 2) # (height + min_dim)/2

	# Crop the center of the image
	im = im.crop((left, top, right, bottom))
	return im



	def load_im_into_format_from_path(im_path):
	return center_crop(Image.open(im_path)).resize((512,512))


	#### P2P STUFF ####
	def init_attention_weights(weight_tuples):
	tokens_length = clip_tokenizer.model_max_length
	weights = torch.ones(tokens_length)

	for i, w in weight_tuples:
	if i < tokens_length and i >= 0:
	weights[i] = w


	for name, module in unet.named_modules():
	module_name = type(module).__name__
	if module_name == "CrossAttention" and "attn2" in name:
	module.last_attn_slice_weights = weights.to(device)
	if module_name == "CrossAttention" and "attn1" in name:
	module.last_attn_slice_weights = None


	def init_attention_edit(tokens, tokens_edit):
	tokens_length = clip_tokenizer.model_max_length
	mask = torch.zeros(tokens_length)
	indices_target = torch.arange(tokens_length, dtype=torch.long)
	indices = torch.zeros(tokens_length, dtype=torch.long)

	tokens = tokens.input_ids.numpy()[0]
	tokens_edit = tokens_edit.input_ids.numpy()[0]

	for name, a0, a1, b0, b1 in SequenceMatcher(None, tokens, tokens_edit).get_opcodes():
	if b0 < tokens_length:
	if name == "equal" or (name == "replace" and a1-a0 == b1-b0):
	mask[b0:b1] = 1
	indices[b0:b1] = indices_target[a0:a1]

	for name, module in unet.named_modules():
	module_name = type(module).__name__
	if module_name == "CrossAttention" and "attn2" in name:
	module.last_attn_slice_mask = mask.to(device)
	module.last_attn_slice_indices = indices.to(device)
	if module_name == "CrossAttention" and "attn1" in name:
	module.last_attn_slice_mask = None
	module.last_attn_slice_indices = None


	def init_attention_func():
	def new_attention(self, query, key, value, sequence_length, dim):
	batch_size_attention = query.shape[0]
	hidden_states = torch.zeros(
	(batch_size_attention, sequence_length, dim // self.heads), device=query.device, dtype=query.dtype
	)
	slice_size = self._slice_size if self._slice_size is not None else hidden_states.shape[0]
	for i in range(hidden_states.shape[0] // slice_size):
	start_idx = i * slice_size
	end_idx = (i + 1) * slice_size
	attn_slice = (
	torch.einsum("b i d, b j d -> b i j", query[start_idx:end_idx], key[start_idx:end_idx]) * self.scale
	)
	attn_slice = attn_slice.softmax(dim=-1)

	if self.use_last_attn_slice:
	if self.last_attn_slice_mask is not None:
	new_attn_slice = torch.index_select(self.last_attn_slice, -1, self.last_attn_slice_indices)
	attn_slice = attn_slice * (1 - self.last_attn_slice_mask) + new_attn_slice * self.last_attn_slice_mask
	else:
	attn_slice = self.last_attn_slice

	self.use_last_attn_slice = False

	if self.save_last_attn_slice:
	self.last_attn_slice = attn_slice
	self.save_last_attn_slice = False

	if self.use_last_attn_weights and self.last_attn_slice_weights is not None:
	attn_slice = attn_slice * self.last_attn_slice_weights
	self.use_last_attn_weights = False

	attn_slice = torch.einsum("b i j, b j d -> b i d", attn_slice, value[start_idx:end_idx])

	hidden_states[start_idx:end_idx] = attn_slice

	# reshape hidden_states
	hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
	return hidden_states

	for name, module in unet.named_modules():
	module_name = type(module).__name__
	if module_name == "CrossAttention":
	module.last_attn_slice = None
	module.use_last_attn_slice = False
	module.use_last_attn_weights = False
	module.save_last_attn_slice = False
	module._attention = new_attention.__get__(module, type(module))

	def use_last_tokens_attention(use=True):
	for name, module in unet.named_modules():
	module_name = type(module).__name__
	if module_name == "CrossAttention" and "attn2" in name:
	module.use_last_attn_slice = use

	def use_last_tokens_attention_weights(use=True):
	for name, module in unet.named_modules():
	module_name = type(module).__name__
	if module_name == "CrossAttention" and "attn2" in name:
	module.use_last_attn_weights = use

	def use_last_self_attention(use=True):
	for name, module in unet.named_modules():
	module_name = type(module).__name__
	if module_name == "CrossAttention" and "attn1" in name:
	module.use_last_attn_slice = use

	def save_last_tokens_attention(save=True):
	for name, module in unet.named_modules():
	module_name = type(module).__name__
	if module_name == "CrossAttention" and "attn2" in name:
	module.save_last_attn_slice = save

	def save_last_self_attention(save=True):
	for name, module in unet.named_modules():
	module_name = type(module).__name__
	if module_name == "CrossAttention" and "attn1" in name:
	module.save_last_attn_slice = save
	####################################


	##### BASELINE ALGORITHM, ONLY USED NOW FOR SDEDIT ####3

	@torch.no_grad()
	def baseline_stablediffusion(prompt="",
	prompt_edit=None,
	null_prompt='',
	prompt_edit_token_weights=[],
	prompt_edit_tokens_start=0.0,
	prompt_edit_tokens_end=1.0,
	prompt_edit_spatial_start=0.0,
	prompt_edit_spatial_end=1.0,
	clip_start=0.0,
	clip_end=1.0,
	guidance_scale=7,
	steps=50,
	seed=1,
	width=512, height=512,
	init_image=None, init_image_strength=0.5,
	fixed_starting_latent = None,
	prev_image= None,
	grid=None,
	clip_guidance=None,
	clip_guidance_scale=1,
	num_cutouts=4,
	cut_power=1,
	scheduler_str='lms',
	return_latent=False,
	one_pass=False,
	normalize_noise_pred=False):
	width = width - width % 64
	height = height - height % 64

	#If seed is None, randomly select seed from 0 to 2^32-1
	if seed is None: seed = random.randrange(2**32 - 1)
	generator = torch.cuda.manual_seed(seed)

	#Set inference timesteps to scheduler
	scheduler_dict = {'ddim':DDIMScheduler,
	'lms':LMSDiscreteScheduler,
	'pndm':PNDMScheduler,
	'ddpm':DDPMScheduler}
	scheduler_call = scheduler_dict[scheduler_str]
	if scheduler_str == 'ddim':
	scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012,
	beta_schedule="scaled_linear",
	clip_sample=False, set_alpha_to_one=False)
	else:
	scheduler = scheduler_call(beta_schedule="scaled_linear",
	num_train_timesteps=1000)

	scheduler.set_timesteps(steps)
	if prev_image is not None:
	prev_scheduler = LMSDiscreteScheduler(beta_start=0.00085,
	beta_end=0.012,
	beta_schedule="scaled_linear",
	num_train_timesteps=1000)
	prev_scheduler.set_timesteps(steps)

	#Preprocess image if it exists (img2img)
	if init_image is not None:
	init_image = init_image.resize((width, height), resample=Image.Resampling.LANCZOS)
	init_image = np.array(init_image).astype(np_dtype) / 255.0 * 2.0 - 1.0
	init_image = torch.from_numpy(init_image[np.newaxis, ...].transpose(0, 3, 1, 2))

	#If there is alpha channel, composite alpha for white, as the diffusion model does not support alpha channel
	if init_image.shape[1] > 3:
	init_image = init_image[:, :3] * init_image[:, 3:] + (1 - init_image[:, 3:])

	#Move image to GPU
	init_image = init_image.to(device)

	#Encode image
	with autocast(device):
	init_latent = vae.encode(init_image).latent_dist.sample(generator=generator) * 0.18215

	t_start = steps - int(steps * init_image_strength)

	else:
	init_latent = torch.zeros((1, unet.in_channels, height // 8, width // 8), device=device)
	t_start = 0

	#Generate random normal noise
	if fixed_starting_latent is None:
	noise = torch.randn(init_latent.shape, generator=generator, device=device, dtype=unet.dtype)
	if scheduler_str == 'ddim':
	if init_image is not None:
	raise notImplementedError
	latent = scheduler.add_noise(init_latent, noise,
	1000 - int(1000 * init_image_strength)).to(device)
	else:
	latent = noise
	else:
	latent = scheduler.add_noise(init_latent, noise,
	t_start).to(device)
	else:
	latent = fixed_starting_latent
	t_start = steps - int(steps * init_image_strength)

	if prev_image is not None:
	#Resize and prev_image for numpy b h w c -> torch b c h w
	prev_image = prev_image.resize((width, height), resample=Image.Resampling.LANCZOS)
	prev_image = np.array(prev_image).astype(np_dtype) / 255.0 * 2.0 - 1.0
	prev_image = torch.from_numpy(prev_image[np.newaxis, ...].transpose(0, 3, 1, 2))

	#If there is alpha channel, composite alpha for white, as the diffusion model does not support alpha channel
	if prev_image.shape[1] > 3:
	prev_image = prev_image[:, :3] * prev_image[:, 3:] + (1 - prev_image[:, 3:])

	#Move image to GPU
	prev_image = prev_image.to(device)

	#Encode image
	with autocast(device):
	prev_init_latent = vae.encode(prev_image).latent_dist.sample(generator=generator) * 0.18215

	t_start = steps - int(steps * init_image_strength)

	prev_latent = prev_scheduler.add_noise(prev_init_latent, noise, t_start).to(device)
	else:
	prev_latent = None


	#Process clip
	with autocast(device):
	tokens_unconditional = clip_tokenizer(null_prompt, padding="max_length", max_length=clip_tokenizer.model_max_length, truncation=True, return_tensors="pt", return_overflowing_tokens=True)
	embedding_unconditional = clip(tokens_unconditional.input_ids.to(device)).last_hidden_state

	tokens_conditional = clip_tokenizer(prompt, padding="max_length", max_length=clip_tokenizer.model_max_length, truncation=True, return_tensors="pt", return_overflowing_tokens=True)
	embedding_conditional = clip(tokens_conditional.input_ids.to(device)).last_hidden_state

	#Process prompt editing
	assert not ((prompt_edit is not None) and (prev_image is not None))
	if prompt_edit is not None:
	tokens_conditional_edit = clip_tokenizer(prompt_edit, padding="max_length", max_length=clip_tokenizer.model_max_length, truncation=True, return_tensors="pt", return_overflowing_tokens=True)
	embedding_conditional_edit = clip(tokens_conditional_edit.input_ids.to(device)).last_hidden_state
	init_attention_edit(tokens_conditional, tokens_conditional_edit)
	elif prev_image is not None:
	init_attention_edit(tokens_conditional, tokens_conditional)


	init_attention_func()
	init_attention_weights(prompt_edit_token_weights)

	timesteps = scheduler.timesteps[t_start:]
	# print(timesteps)

	assert isinstance(guidance_scale, int)
	num_cycles = 1 # guidance_scale + 1

	last_noise_preds = None
	for i, t in tqdm(enumerate(timesteps), total=len(timesteps)):
	t_index = t_start + i

	latent_model_input = latent
	if scheduler_str=='lms':
	sigma = scheduler.sigmas[t_index] # last is first and first is last
	latent_model_input = (latent_model_input / ((sigma2 + 1) 0.5)).to(unet.dtype)
	else:
	assert scheduler_str in ['ddim', 'pndm', 'ddpm']

	#Predict the unconditional noise residual

	if len(t.shape) == 0:
	t = t[None].to(unet.device)
	noise_pred_uncond = unet(latent_model_input, t, encoder_hidden_states=embedding_unconditional,
	).sample

	if prev_latent is not None:
	prev_latent_model_input = prev_latent
	prev_latent_model_input = (prev_latent_model_input / ((sigma2 + 1) 0.5)).to(unet.dtype)
	prev_noise_pred_uncond = unet(prev_latent_model_input, t,
	encoder_hidden_states=embedding_unconditional,
	).sample
	# noise_pred_uncond = unet(latent_model_input, t,
	# encoder_hidden_states=embedding_unconditional)['sample']

	#Prepare the Cross-Attention layers
	if prompt_edit is not None or prev_latent is not None:
	save_last_tokens_attention()
	save_last_self_attention()
	else:
	#Use weights on non-edited prompt when edit is None
	use_last_tokens_attention_weights()

	#Predict the conditional noise residual and save the cross-attention layer activations
	if prev_latent is not None:
	raise NotImplementedError # I totally lost track of what this is
	prev_noise_pred_cond = unet(prev_latent_model_input, t, encoder_hidden_states=embedding_conditional,
	).sample
	else:
	noise_pred_cond = unet(latent_model_input, t, encoder_hidden_states=embedding_conditional,
	).sample

	#Edit the Cross-Attention layer activations
	t_scale = t / scheduler.num_train_timesteps
	if prompt_edit is not None or prev_latent is not None:
	if t_scale >= prompt_edit_tokens_start and t_scale <= prompt_edit_tokens_end:
	use_last_tokens_attention()
	if t_scale >= prompt_edit_spatial_start and t_scale <= prompt_edit_spatial_end:
	use_last_self_attention()

	#Use weights on edited prompt
	use_last_tokens_attention_weights()

	#Predict the edited conditional noise residual using the cross-attention masks
	if prompt_edit is not None:
	noise_pred_cond = unet(latent_model_input, t,
	encoder_hidden_states=embedding_conditional_edit).sample

	#Perform guidance
	# if i%(num_cycles)==0: # cycle_i+1==num_cycles:
	"""
	if cycle_i+1==num_cycles:
	noise_pred = noise_pred_uncond
	else:
	noise_pred = noise_pred_cond - noise_pred_uncond

	"""
	if last_noise_preds is not None:
	# print( (last_noise_preds[0]noise_pred_uncond).sum(), (last_noise_preds[1]noise_pred_cond).sum())
	# print(F.cosine_similarity(last_noise_preds[0].flatten(), noise_pred_uncond.flatten(), dim=0),
	# F.cosine_similarity(last_noise_preds[1].flatten(), noise_pred_cond.flatten(), dim=0))
	last_grad= last_noise_preds[1] - last_noise_preds[0]
	new_grad = noise_pred_cond - noise_pred_uncond
	# print( F.cosine_similarity(last_grad.flatten(), new_grad.flatten(), dim=0))
	last_noise_preds = (noise_pred_uncond, noise_pred_cond)

	use_cond_guidance = True
	if use_cond_guidance:
	noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
	else:
	noise_pred = noise_pred_uncond
	if clip_guidance is not None and t_scale >= clip_start and t_scale <= clip_end:
	noise_pred, latent = new_cond_fn(latent, t, t_index,
	embedding_conditional, noise_pred,clip_guidance,
	clip_guidance_scale,
	num_cutouts,
	scheduler, unet,use_cutouts=True,
	cut_power=cut_power)
	if normalize_noise_pred:
	noise_pred = noise_pred * noise_pred_uncond.norm() / noise_pred.norm()
	if scheduler_str == 'ddim':
	latent = forward_step(scheduler, noise_pred,
	t,
	latent).prev_sample
	else:
	latent = scheduler.step(noise_pred,
	t_index,
	latent).prev_sample

	if prev_latent is not None:
	prev_noise_pred = prev_noise_pred_uncond + guidance_scale * (prev_noise_pred_cond - prev_noise_pred_uncond)
	prev_latent = prev_scheduler.step(prev_noise_pred, t_index, prev_latent).prev_sample
	if one_pass: break

	#scale and decode the image latents with vae
	if return_latent: return latent
	latent = latent / 0.18215
	image = vae.decode(latent.to(vae.dtype)).sample

	image = (image / 2 + 0.5).clamp(0, 1)
	image = image.cpu().permute(0, 2, 3, 1).numpy()

	image, _ = check_safety(image)

	image = (image[0] * 255).round().astype("uint8")
	return Image.fromarray(image)
	####################################

	#### HELPER FUNCTIONS FOR OUR METHOD #####

	def get_alpha_and_beta(t, scheduler):
	# want to run this for both current and previous timnestep
	if t.dtype==torch.long:
	alpha = scheduler.alphas_cumprod[t]
	return alpha, 1-alpha

	if t<0:
	return scheduler.final_alpha_cumprod, 1 - scheduler.final_alpha_cumprod


	low = t.floor().long()
	high = t.ceil().long()
	rem = t - low

	low_alpha = scheduler.alphas_cumprod[low]
	high_alpha = scheduler.alphas_cumprod[high]
	interpolated_alpha = low_alpha * rem + high_alpha * (1-rem)
	interpolated_beta = 1 - interpolated_alpha
	return interpolated_alpha, interpolated_beta


	# A DDIM forward step function
	def forward_step(
	self,
	model_output,
	timestep: int,
	sample,
	eta: float = 0.0,
	use_clipped_model_output: bool = False,
	generator=None,
	return_dict: bool = True,
	use_double=False,
	) :
	if self.num_inference_steps is None:
	raise ValueError(
	"Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
	)

	prev_timestep = timestep - self.config.num_train_timesteps / self.num_inference_steps

	if timestep > self.timesteps.max():
	raise NotImplementedError("Need to double check what the overflow is")

	alpha_prod_t, beta_prod_t = get_alpha_and_beta(timestep, self)
	alpha_prod_t_prev, _ = get_alpha_and_beta(prev_timestep, self)


	alpha_quotient = ((alpha_prod_t / alpha_prod_t_prev)**0.5)
	first_term = (1./alpha_quotient) * sample
	second_term = (1./alpha_quotient) * (beta_prod_t ** 0.5) * model_output
	third_term = ((1 - alpha_prod_t_prev)*0.5) model_output
	return first_term - second_term + third_term

	# A DDIM reverse step function, the inverse of above
	def reverse_step(
	self,
	model_output,
	timestep: int,
	sample,
	eta: float = 0.0,
	use_clipped_model_output: bool = False,
	generator=None,
	return_dict: bool = True,
	use_double=False,
	) :
	if self.num_inference_steps is None:
	raise ValueError(
	"Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
	)

	prev_timestep = timestep - self.config.num_train_timesteps / self.num_inference_steps

	if timestep > self.timesteps.max():
	raise NotImplementedError
	else:
	alpha_prod_t = self.alphas_cumprod[timestep]

	alpha_prod_t, beta_prod_t = get_alpha_and_beta(timestep, self)
	alpha_prod_t_prev, _ = get_alpha_and_beta(prev_timestep, self)

	alpha_quotient = ((alpha_prod_t / alpha_prod_t_prev)**0.5)

	first_term = alpha_quotient * sample
	second_term = ((beta_prod_t)*0.5) model_output
	third_term = alpha_quotient * ((1 - alpha_prod_t_prev)*0.5) model_output
	return first_term + second_term - third_term




	@torch.no_grad()
	def latent_to_image(latent):
	image = vae.decode(latent.to(vae.dtype)/0.18215).sample
	image = prep_image_for_return(image)
	return image

	def prep_image_for_return(image):
	image = (image / 2 + 0.5).clamp(0, 1)
	image = image.cpu().permute(0, 2, 3, 1).numpy()
	image = (image[0] * 255).round().astype("uint8")
	image = Image.fromarray(image)
	return image

	#############################

	##### MAIN EDICT FUNCTION #######
	# Use EDICT_editing to perform calls

	@torch.no_grad()
	def coupled_stablediffusion(prompt="",
	prompt_edit=None,
	null_prompt='',
	prompt_edit_token_weights=[],
	prompt_edit_tokens_start=0.0,
	prompt_edit_tokens_end=1.0,
	prompt_edit_spatial_start=0.0,
	prompt_edit_spatial_end=1.0,
	guidance_scale=7.0, steps=50,
	seed=1, width=512, height=512,
	init_image=None, init_image_strength=1.0,
	run_baseline=False,
	use_lms=False,
	leapfrog_steps=True,
	reverse=False,
	return_latents=False,
	fixed_starting_latent=None,
	beta_schedule='scaled_linear',
	mix_weight=0.93):
	#If seed is None, randomly select seed from 0 to 2^32-1
	if seed is None: seed = random.randrange(2**32 - 1)
	generator = torch.cuda.manual_seed(seed)

	def image_to_latent(im):
	if isinstance(im, torch.Tensor):
	# assume it's the latent
	# used to avoid clipping new generation before inversion
	init_latent = im.to(device)
	else:
	#Resize and transpose for numpy b h w c -> torch b c h w
	im = im.resize((width, height), resample=Image.Resampling.LANCZOS)
	im = np.array(im).astype(np_dtype) / 255.0 * 2.0 - 1.0
	# check if black and white
	if len(im.shape) < 3:
	im = np.stack([im for _ in range(3)], axis=2) # putting at end b/c channels

	im = torch.from_numpy(im[np.newaxis, ...].transpose(0, 3, 1, 2))

	#If there is alpha channel, composite alpha for white, as the diffusion model does not support alpha channel
	if im.shape[1] > 3:
	im = im[:, :3] * im[:, 3:] + (1 - im[:, 3:])

	#Move image to GPU
	im = im.to(device)
	#Encode image
	if use_half_prec:
	init_latent = vae.encode(im).latent_dist.sample(generator=generator) * 0.18215
	else:
	with autocast(device):
	init_latent = vae.encode(im).latent_dist.sample(generator=generator) * 0.18215
	return init_latent
	assert not use_lms, "Can't invert LMS the same as DDIM"
	if run_baseline: leapfrog_steps=False
	#Change size to multiple of 64 to prevent size mismatches inside model
	width = width - width % 64
	height = height - height % 64


	#Preprocess image if it exists (img2img)
	if init_image is not None:
	assert reverse # want to be performing deterministic noising
	# can take either pair (output of generative process) or single image
	if isinstance(init_image, list):
	if isinstance(init_image[0], torch.Tensor):
	init_latent = [t.clone() for t in init_image]
	else:
	init_latent = [image_to_latent(im) for im in init_image]
	else:
	init_latent = image_to_latent(init_image)
	# this is t_start for forward, t_end for reverse
	t_limit = steps - int(steps * init_image_strength)
	else:
	assert not reverse, 'Need image to reverse from'
	init_latent = torch.zeros((1, unet.in_channels, height // 8, width // 8), device=device)
	t_limit = 0

	if reverse:
	latent = init_latent
	else:
	#Generate random normal noise
	noise = torch.randn(init_latent.shape,
	generator=generator,
	device=device,
	dtype=torch_dtype)
	if fixed_starting_latent is None:
	latent = noise
	else:
	if isinstance(fixed_starting_latent, list):
	latent = [l.clone() for l in fixed_starting_latent]
	else:
	latent = fixed_starting_latent.clone()
	t_limit = steps - int(steps * init_image_strength)
	if isinstance(latent, list): # initializing from pair of images
	latent_pair = latent
	else: # initializing from noise
	latent_pair = [latent.clone(), latent.clone()]


	if steps==0:
	if init_image is not None:
	return image_to_latent(init_image)
	else:
	image = vae.decode(latent.to(vae.dtype) / 0.18215).sample
	return prep_image_for_return(image)

	#Set inference timesteps to scheduler
	schedulers = []
	for i in range(2):
	# num_raw_timesteps = max(1000, steps)
	scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012,
	beta_schedule=beta_schedule,
	num_train_timesteps=1000,
	clip_sample=False,
	set_alpha_to_one=False)
	scheduler.set_timesteps(steps)
	schedulers.append(scheduler)

	with autocast(device):
	# CLIP Text Embeddings
	tokens_unconditional = clip_tokenizer(null_prompt, padding="max_length",
	max_length=clip_tokenizer.model_max_length,
	truncation=True, return_tensors="pt",
	return_overflowing_tokens=True)
	embedding_unconditional = clip(tokens_unconditional.input_ids.to(device)).last_hidden_state

	tokens_conditional = clip_tokenizer(prompt, padding="max_length",
	max_length=clip_tokenizer.model_max_length,
	truncation=True, return_tensors="pt",
	return_overflowing_tokens=True)
	embedding_conditional = clip(tokens_conditional.input_ids.to(device)).last_hidden_state

	#Process prompt editing (if running Prompt-to-Prompt)
	if prompt_edit is not None:
	tokens_conditional_edit = clip_tokenizer(prompt_edit, padding="max_length",
	max_length=clip_tokenizer.model_max_length,
	truncation=True, return_tensors="pt",
	return_overflowing_tokens=True)
	embedding_conditional_edit = clip(tokens_conditional_edit.input_ids.to(device)).last_hidden_state

	init_attention_edit(tokens_conditional, tokens_conditional_edit)

	init_attention_func()
	init_attention_weights(prompt_edit_token_weights)

	timesteps = schedulers[0].timesteps[t_limit:]
	if reverse: timesteps = timesteps.flip(0)

	for i, t in tqdm(enumerate(timesteps), total=len(timesteps)):
	t_scale = t / schedulers[0].num_train_timesteps

	if (reverse) and (not run_baseline):
	# Reverse mixing layer
	new_latents = [l.clone() for l in latent_pair]
	new_latents[1] = (new_latents[1].clone() - (1-mix_weight)*new_latents[0].clone()) / mix_weight
	new_latents[0] = (new_latents[0].clone() - (1-mix_weight)*new_latents[1].clone()) / mix_weight
	latent_pair = new_latents

	# alternate EDICT steps
	for latent_i in range(2):
	if run_baseline and latent_i==1: continue # just have one sequence for baseline
	# this modifies latent_pair[i] while using
	# latent_pair[(i+1)%2]
	if reverse and (not run_baseline):
	if leapfrog_steps:
	# what i would be from going other way
	orig_i = len(timesteps) - (i+1)
	offset = (orig_i+1) % 2
	latent_i = (latent_i + offset) % 2
	else:
	# Do 1 then 0
	latent_i = (latent_i+1)%2
	else:
	if leapfrog_steps:
	offset = i%2
	latent_i = (latent_i + offset) % 2

	latent_j = ((latent_i+1) % 2) if not run_baseline else latent_i

	latent_model_input = latent_pair[latent_j]
	latent_base = latent_pair[latent_i]

	#Predict the unconditional noise residual
	noise_pred_uncond = unet(latent_model_input, t,
	encoder_hidden_states=embedding_unconditional).sample

	#Prepare the Cross-Attention layers
	if prompt_edit is not None:
	save_last_tokens_attention()
	save_last_self_attention()
	else:
	#Use weights on non-edited prompt when edit is None
	use_last_tokens_attention_weights()

	#Predict the conditional noise residual and save the cross-attention layer activations
	noise_pred_cond = unet(latent_model_input, t,
	encoder_hidden_states=embedding_conditional).sample

	#Edit the Cross-Attention layer activations
	if prompt_edit is not None:
	t_scale = t / schedulers[0].num_train_timesteps
	if t_scale >= prompt_edit_tokens_start and t_scale <= prompt_edit_tokens_end:
	use_last_tokens_attention()
	if t_scale >= prompt_edit_spatial_start and t_scale <= prompt_edit_spatial_end:
	use_last_self_attention()

	#Use weights on edited prompt
	use_last_tokens_attention_weights()

	#Predict the edited conditional noise residual using the cross-attention masks
	noise_pred_cond = unet(latent_model_input,
	t,
	encoder_hidden_states=embedding_conditional_edit).sample

	#Perform guidance
	grad = (noise_pred_cond - noise_pred_uncond)
	noise_pred = noise_pred_uncond + guidance_scale * grad


	step_call = reverse_step if reverse else forward_step
	new_latent = step_call(schedulers[latent_i],
	noise_pred,
	t,
	latent_base)# .prev_sample
	new_latent = new_latent.to(latent_base.dtype)

	latent_pair[latent_i] = new_latent

	if (not reverse) and (not run_baseline):
	# Mixing layer (contraction) during generative process
	new_latents = [l.clone() for l in latent_pair]
	new_latents[0] = (mix_weightnew_latents[0] + (1-mix_weight)new_latents[1]).clone()
	new_latents[1] = ((1-mix_weight)new_latents[0] + (mix_weight)new_latents[1]).clone()
	latent_pair = new_latents

	#scale and decode the image latents with vae, can return latents instead of images
	if reverse or return_latents:
	results = [latent_pair]
	return results if len(results)>1 else results[0]

	# decode latents to iamges
	images = []
	for latent_i in range(2):
	latent = latent_pair[latent_i] / 0.18215
	image = vae.decode(latent.to(vae.dtype)).sample
	images.append(image)

	# Return images
	return_arr = []
	for image in images:
	image = prep_image_for_return(image)
	return_arr.append(image)
	results = [return_arr]
	return results if len(results)>1 else results[0]