Spaces:

blanchon
/

TiM

Sleeping

App Files Files Community

TiM / tim /schedulers /transition.py

blanchon

Update

3ed0796 about 1 month ago

raw

history blame contribute delete

12 kB

	from typing import Callable
	import torch
	import torch.nn.functional as F
	from copy import deepcopy
	from .transports import Transport
	from tim.models.utils.funcs import expand_t_like_x


	def mean_flat(x):
	    """
	    Average ``x`` over every dimension except the leading (batch) one.

	    Returns a tensor with one value per entry along dimension 0.
	    """
	    non_batch_dims = list(range(1, x.dim()))
	    return x.mean(dim=non_batch_dims)


	def sum_flat(x):
	    """
	    Sum ``x`` over every dimension except the leading (batch) one.

	    Returns a tensor with one value per entry along dimension 0.
	    """
	    non_batch_dims = list(range(1, x.dim()))
	    return x.sum(dim=non_batch_dims)


	class TransitionSchedule:
	def __init__(
	self,
	transport: Transport,
	diffusion_ratio: float = 0.0,
	consistency_ratio: float = 0.0,
	derivative_type: str = "dde",
	differential_epsilon: float = 0.005,
	weight_t_and_r: bool = True,
	weight_time_type: str = "constant",
	weight_time_tangent: bool = False,
	weight_time_sigmoid: bool = False,
	):
	self.transport = transport
	self.diffusion_ratio = diffusion_ratio
	self.consistency_ratio = consistency_ratio
	self.derivative_type = derivative_type
	self.differential_epsilon = differential_epsilon
	self.weight_t_and_r = weight_t_and_r
	self.weight_time_type = weight_time_type
	self.weight_time_tangent = weight_time_tangent
	self.weight_time_sigmoid = weight_time_sigmoid

	def sample_t_and_r(self, batch_size, dtype, device):
	t_1 = self.transport.sample_t(batch_size=batch_size, dtype=dtype, device=device)
	t_2 = self.transport.sample_t(batch_size=batch_size, dtype=dtype, device=device)
	# t is the larger one, and r is the smaller one
	t = torch.maximum(t_1, t_2)
	r = torch.minimum(t_1, t_2)
	# some samples with t=r, corresponding to diffusion training
	n_diffusion = round(self.diffusion_ratio * len(t))
	r[:n_diffusion] = t[:n_diffusion]
	# some samples with r=0, corresponding to consistency training
	n_consistency = round(self.consistency_ratio * len(t))
	if n_consistency != 0:
	r[-n_consistency:] = self.transport.T_min
	return t, r, n_diffusion

	def prepare_input(self, batch_size, x, z):
	# sample timestep according to log-normal distribution of sigmas following EDM
	t, r, n_diffusion = self.sample_t_and_r(
	batch_size=batch_size, dtype=x.dtype, device=x.device
	)
	# reshape (B, ) -> (B, 1, 1, 1)
	t, r = expand_t_like_x(t, x), expand_t_like_x(r, x)
	# prepere inputs
	alpha_t, sigma_t, d_alpha_t, d_sigma_t = self.transport.interpolant(t)
	x_t = alpha_t * x + sigma_t * z
	v_t = d_alpha_t * x + d_sigma_t * z
	return x_t, v_t, t, r, n_diffusion

	def model_forward(self, model, x_t, t, r, model_kwargs, rng_state):
	# model_input
	t_input = self.transport.c_noise(t.flatten())
	r_input = self.transport.c_noise(r.flatten())
	# model_output
	torch.cuda.set_rng_state(rng_state)
	model_output = model(x_t, t_input, r_input, **model_kwargs)
	return model_output

	@torch.no_grad()
	def jvp_derivative(
	self, model, x_t, v_t, t, r, model_kwargs, rng_state, n_diffusion
	):
	if n_diffusion == x_t.size(0):
	return 0
	_dF_dv_dt = torch.zeros_like(x_t)
	# only calculate the dF_dv_dt when t!=r
	x_t, v_t, t, r = (
	x_t[n_diffusion:],
	v_t[n_diffusion:],
	t[n_diffusion:],
	r[n_diffusion:],
	)
	for k, v in model_kwargs.items():
	if type(v) == torch.Tensor:
	model_kwargs[k] = model_kwargs[k][n_diffusion:]
	model_kwargs["return_zs"] = False

	def model_jvp(x_t, t, r):
	model_kwargs["attn_type"] = "vanilla_attn"
	model_kwargs["jvp"] = True
	t_input = self.transport.c_noise(t.flatten())
	r_input = self.transport.c_noise(r.flatten())
	return model(x_t, t_input, r_input, **model_kwargs)

	torch.cuda.set_rng_state(rng_state)
	F_pred, dF_dv_dt = torch.func.jvp(
	lambda x_t, t, r: model_jvp(x_t, t, r),
	(x_t, t, r),
	(v_t, torch.ones_like(t), torch.zeros_like(r)),
	)
	_dF_dv_dt[n_diffusion:] = dF_dv_dt
	return _dF_dv_dt

	@torch.no_grad()
	def dde_derivative(self, model, x, z, t, r, model_kwargs, rng_state, n_diffusion):
	if n_diffusion == x.size(0):
	return 0
	_dF_dv_dt = torch.zeros_like(x)
	# only calculate the dF_dv_dt when t!=r
	x, z, t, r = x[n_diffusion:], z[n_diffusion:], t[n_diffusion:], r[n_diffusion:]
	for k, v in model_kwargs.items():
	if type(v) == torch.Tensor:
	model_kwargs[k] = model_kwargs[k][n_diffusion:]
	model_kwargs["return_zs"] = False
	model_kwargs["jvp"] = True

	def xfunc(t):
	alpha_t, sigma_t, _, _ = self.transport.interpolant(t)
	x_t = alpha_t * x + sigma_t * z
	return self.model_forward(model, x_t, t, r, model_kwargs, rng_state)

	epsilon = self.differential_epsilon
	fc1_dt = 1 / (2 * epsilon)
	dF_dv_dt = xfunc(t + epsilon) * fc1_dt - xfunc(t - epsilon) * fc1_dt
	_dF_dv_dt[n_diffusion:] = dF_dv_dt
	return _dF_dv_dt

	def get_enhanced_target(self, model, x_t, t, model_kwargs, null_kwargs, rng_state):
	with torch.no_grad():
	t_input = self.transport.c_noise(t.flatten())
	if self.transport.w_cond > 0:
	F_t_cond = self.model_forward(
	model, x_t, t_input, t_input, model_kwargs, rng_state
	)
	else:
	F_t_cond = 0
	F_t_uncond = self.model_forward(
	model, x_t, t_input, t_input, null_kwargs, rng_state
	)
	return F_t_cond, F_t_uncond

	def time_weighting(self, t, r, n_diffusion):
	if self.weight_time_tangent:
	t, r = torch.tan(t), torch.tan(r)
	elif self.weight_time_sigmoid:
	t, r = t / (1 - t), r / (1 - r)
	if self.weight_t_and_r:
	delta_t = (t - r).flatten()
	else:
	delta_t = t.flatten()
	if self.weight_time_type == "constant":
	weight = torch.ones_like(delta_t)
	elif self.weight_time_type == "reciprocal":
	weight = 1 / (delta_t + self.transport.sigma_d)
	elif self.weight_time_type == "sqrt":
	weight = 1 / (delta_t + self.transport.sigma_d).sqrt()
	elif self.weight_time_type == "square":
	weight = 1 / (delta_t + self.transport.sigma_d) ** 2
	elif self.weight_time_type == "Soft-Min-SNR":
	weight = 1 / (delta_t2 + self.transport.sigma_d2)
	else:
	raise NotImplementedError
	weight[:n_diffusion] = 1.0
	return weight

	def adaptive_weighting(self, loss, eps=10e-6):
	weight = 1 / (loss.detach() + eps)
	return weight

	def __call__(
	self,
	model,
	ema_model,
	unwrapped_model,
	batch_size,
	x,
	z,
	model_kwargs,
	use_dir_loss=False,
	h_target=None,
	ema_kwargs={},
	null_kwargs={},
	):
	# prepare model input
	x_t, v_t, t, r, n_diffusion = self.prepare_input(batch_size, x, z)

	rng_state = torch.cuda.get_rng_state()
	# get prediction
	F_pred, h_proj = self.model_forward(model, x_t, t, r, model_kwargs, rng_state)
	# get target
	if self.derivative_type == "jvp":
	dF_dv_dt = self.jvp_derivative(
	unwrapped_model, x_t, v_t, t, r, model_kwargs, rng_state, n_diffusion
	)
	else:
	dF_dv_dt = self.dde_derivative(
	unwrapped_model, x, z, t, r, model_kwargs, rng_state, n_diffusion
	)

	if self.transport.enhance_target:
	F_t_cond, F_t_uncond = self.get_enhanced_target(
	ema_model, x_t, t, ema_kwargs, null_kwargs, rng_state
	)
	enhance_target = True
	else:
	F_t_cond, F_t_uncond, enhance_target = 0, 0, False
	F_target = self.transport.target(
	x_t, v_t, x, z, t, r, dF_dv_dt, F_t_cond, F_t_uncond, enhance_target
	)
	denoising_loss = mean_flat((F_pred - F_target) ** 2)
	denoising_loss = torch.nan_to_num(
	denoising_loss, nan=0, posinf=1e5, neginf=-1e5
	)

	if use_dir_loss:
	directional_loss = mean_flat(
	1 - F.cosine_similarity(F_pred, F_target, dim=1)
	)
	directional_loss = torch.nan_to_num(
	directional_loss, nan=0, posinf=1e5, neginf=-1e5
	)
	denoising_loss += directional_loss

	weight = self.time_weighting(t, r, n_diffusion) * self.adaptive_weighting(
	denoising_loss
	)
	weighted_loss = weight * denoising_loss
	weighted_loss = weighted_loss.mean()

	proj_loss = mean_flat(1 - torch.cosine_similarity(h_proj, h_target, dim=-1))
	proj_loss = torch.nan_to_num(proj_loss, nan=0, posinf=1e5, neginf=-1e5)
	proj_loss = proj_loss.mean()

	loss_dict = dict(
	weighted_loss=weighted_loss.detach().item(),
	denoising_loss=denoising_loss.mean().detach().item(),
	proj_loss=proj_loss.detach().item(),
	)

	return weighted_loss, proj_loss, loss_dict

	def forward_with_cfg(
	self, model, x_t, t, r, y, y_null, cfg_scale, cfg_low, cfg_high
	):
	apply_cfg = cfg_scale > 1.0 and t > cfg_low and t < cfg_high
	if apply_cfg:
	x_cur = torch.cat([x_t] * 2, dim=0)
	y_cur = torch.cat([y, y_null], dim=0)
	else:
	x_cur = x_t
	y_cur = y
	t_cur = torch.ones(x_cur.size(0)).to(x_cur) * self.transport.c_noise(t)
	r_cur = torch.ones(x_cur.size(0)).to(x_cur) * self.transport.c_noise(r)
	F_pred = model(x_cur, t_cur, r_cur, y_cur)
	if apply_cfg:
	F_cond, F_uncond = F_pred.chunk(2)
	F_pred = F_uncond + cfg_scale * (F_cond - F_uncond)
	return F_pred

	@torch.no_grad()
	def sample(
	self,
	model,
	y,
	y_null,
	z,
	T_max,
	T_min=0.0,
	num_steps=4,
	cfg_scale=1.0,
	cfg_low=0.0,
	cfg_high=1.0,
	stochasticity_ratio=0.0,
	sample_type: str = "transition", # 'transition', diffusion
	step_callback: Callable[[int], None] \| None = None,
	):
	_dtype = z.dtype
	t_steps = torch.linspace(T_max, T_min, num_steps + 1, dtype=torch.float64).to(z)
	cfg_low = cfg_low * T_max
	cfg_high = cfg_high * T_max

	x_cur = deepcopy(z).to(torch.float64)
	samples = [z]
	for i, (t_cur, t_next) in enumerate(zip(t_steps[:-1], t_steps[1:])):
	# x_{N} -> x_{N-1} -> ... -> x_{n+1} -> x_{n} -> x_{n-1} -> ... -> x_{1} -> x_{0}
	if sample_type == "transition":
	_t_next = t_next
	elif sample_type == "ddiffusion":
	_t_next = t_cur
	else:
	raise
	F_pred = self.forward_with_cfg(
	model,
	x_cur.to(_dtype),
	t_cur,
	_t_next,
	y,
	y_null,
	cfg_scale,
	cfg_low,
	cfg_high,
	).to(torch.float64)
	if stochasticity_ratio > 0.0 and t_cur < T_max and _t_next > T_min:
	s_ratio = stochasticity_ratio
	else:
	s_ratio = 0.0
	x_next = self.transport.from_x_t_to_x_r(
	x_cur, t_cur, t_next, F_pred, s_ratio
	)
	samples.append(x_next)
	x_cur = x_next
	if step_callback is not None:
	step_callback(i)

	return torch.stack(samples, dim=0).to(torch.float32)