from abc import ABC, abstractmethod
from dataclasses import dataclass
from functools import partial
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from shap_e.models.generation.perceiver import SimplePerceiver
from shap_e.models.generation.transformer import Transformer
from shap_e.models.nn.camera import DifferentiableProjectiveCamera
from shap_e.models.nn.encoding import (
    MultiviewPointCloudEmbedding,
    MultiviewPoseEmbedding,
    PosEmbLinear,
)
from shap_e.models.nn.ops import PointSetEmbedding
from shap_e.rendering.point_cloud import PointCloud
from shap_e.rendering.view_data import ProjectiveCamera
from shap_e.util.collections import AttrDict

from .base import ChannelsEncoder


class TransformerChannelsEncoder(ChannelsEncoder, ABC):
    """
    Encode point clouds using a transformer model with an extra output
    token used to extract a latent vector.
    """

    def __init__(
        self,
        *,
        device: torch.device,
        dtype: torch.dtype,
        param_shapes: Dict[str, Tuple[int]],
        params_proj: Dict[str, Any],
        d_latent: int = 512,
        latent_bottleneck: Optional[Dict[str, Any]] = None,
        latent_warp: Optional[Dict[str, Any]] = None,
        n_ctx: int = 1024,
        width: int = 512,
        layers: int = 12,
        heads: int = 8,
        init_scale: float = 0.25,
        latent_scale: float = 1.0,
    ):
        super().__init__(
            device=device,
            param_shapes=param_shapes,
            params_proj=params_proj,
            d_latent=d_latent,
            latent_bottleneck=latent_bottleneck,
            latent_warp=latent_warp,
        )
        self.width = width
        self.device = device
        self.dtype = dtype
        self.n_ctx = n_ctx
        self.backbone = Transformer(
            device=device,
            dtype=dtype,
            n_ctx=n_ctx + self.latent_ctx,
            width=width,
            layers=layers,
            heads=heads,
            init_scale=init_scale,
        )
        self.ln_pre = nn.LayerNorm(width, device=device, dtype=dtype)
        self.ln_post = nn.LayerNorm(width, device=device, dtype=dtype)
        self.register_parameter(
            "output_tokens",
            nn.Parameter(torch.randn(self.latent_ctx, width, device=device, dtype=dtype)),
        )
        self.output_proj = nn.Linear(width, d_latent, device=device, dtype=dtype)
        self.latent_scale = latent_scale

    @abstractmethod
    def encode_input(self, batch: AttrDict, options: Optional[AttrDict] = None) -> torch.Tensor:
        pass

    def encode_to_channels(
        self, batch: AttrDict, options: Optional[AttrDict] = None
    ) -> torch.Tensor:
        h = self.encode_input(batch, options=options)
        h = torch.cat([h, self.output_tokens[None].repeat(len(h), 1, 1)], dim=1)
        h = self.ln_pre(h)
        h = self.backbone(h)
        h = h[:, -self.latent_ctx :]
        h = self.ln_post(h)
        h = self.output_proj(h)
        return h
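
    # Editorial sketch (symbolic shapes, not code from the original module): how the
    # learned output tokens are appended and the latent block is read back out in
    # encode_to_channels above:
    #
    #   h = encode_input(batch)              # [B, n_ctx, width]
    #   h = cat([h, output_tokens], dim=1)   # [B, n_ctx + latent_ctx, width]
    #   h = backbone(ln_pre(h))              # [B, n_ctx + latent_ctx, width]
    #   latents = output_proj(               # [B, latent_ctx, d_latent]
    #       ln_post(h[:, -latent_ctx:]))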


class PerceiverChannelsEncoder(ChannelsEncoder, ABC):
    """
    Encode point clouds using a perceiver model with an extra output
    token used to extract a latent vector.
    """

    def __init__(
        self,
        *,
        device: torch.device,
        dtype: torch.dtype,
        param_shapes: Dict[str, Tuple[int]],
        params_proj: Dict[str, Any],
        min_unrolls: int,
        max_unrolls: int,
        d_latent: int = 512,
        latent_bottleneck: Optional[Dict[str, Any]] = None,
        latent_warp: Optional[Dict[str, Any]] = None,
        width: int = 512,
        layers: int = 12,
        xattn_layers: int = 1,
        heads: int = 8,
        init_scale: float = 0.25,
        # Training hparams
        inner_batch_size: Union[int, List[int]] = 1,
        data_ctx: int = 1,
    ):
        super().__init__(
            device=device,
            param_shapes=param_shapes,
            params_proj=params_proj,
            d_latent=d_latent,
            latent_bottleneck=latent_bottleneck,
            latent_warp=latent_warp,
        )
        self.width = width
        self.device = device
        self.dtype = dtype

        if isinstance(inner_batch_size, int):
            inner_batch_size = [inner_batch_size]
        self.inner_batch_size = inner_batch_size
        self.data_ctx = data_ctx
        self.min_unrolls = min_unrolls
        self.max_unrolls = max_unrolls

        encoder_fn = lambda inner_batch_size: SimplePerceiver(
            device=device,
            dtype=dtype,
            n_ctx=self.data_ctx + self.latent_ctx,
            n_data=inner_batch_size,
            width=width,
            layers=xattn_layers,
            heads=heads,
            init_scale=init_scale,
        )
        self.encoder = (
            encoder_fn(self.inner_batch_size[0])
            if len(self.inner_batch_size) == 1
            else nn.ModuleList([encoder_fn(inner_bsz) for inner_bsz in self.inner_batch_size])
        )
        self.processor = Transformer(
            device=device,
            dtype=dtype,
            n_ctx=self.data_ctx + self.latent_ctx,
            layers=layers - xattn_layers,
            width=width,
            heads=heads,
            init_scale=init_scale,
        )
        self.ln_pre = nn.LayerNorm(width, device=device, dtype=dtype)
        self.ln_post = nn.LayerNorm(width, device=device, dtype=dtype)
        self.register_parameter(
            "output_tokens",
            nn.Parameter(torch.randn(self.latent_ctx, width, device=device, dtype=dtype)),
        )
        self.output_proj = nn.Linear(width, d_latent, device=device, dtype=dtype)

    @abstractmethod
    def get_h_and_iterator(
        self, batch: AttrDict, options: Optional[AttrDict] = None
    ) -> Tuple[torch.Tensor, Iterable[Union[torch.Tensor, Tuple]]]:
        """
        :return: a tuple of (
            the initial output tokens of size [batch_size, data_ctx + latent_ctx, width],
            an iterator over the given data
        )
        """

    def encode_to_channels(
        self, batch: AttrDict, options: Optional[AttrDict] = None
    ) -> torch.Tensor:
        h, it = self.get_h_and_iterator(batch, options=options)
        n_unrolls = self.get_n_unrolls()

        for _ in range(n_unrolls):
            data = next(it)
            if isinstance(data, tuple):
                for data_i, encoder_i in zip(data, self.encoder):
                    h = encoder_i(h, data_i)
            else:
                h = self.encoder(h, data)
            h = self.processor(h)

        h = self.output_proj(self.ln_post(h[:, -self.latent_ctx :]))
        return h

    def get_n_unrolls(self):
        if self.training:
            n_unrolls = torch.randint(
                self.min_unrolls, self.max_unrolls + 1, size=(), device=self.device
            )
            dist.broadcast(n_unrolls, 0)
            n_unrolls = n_unrolls.item()
        else:
            n_unrolls = self.max_unrolls
        return n_unrolls
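
    # Editorial note (sketch, not original code): each unroll in encode_to_channels
    # pulls one inner batch from the data iterator, cross-attends the
    # [data_ctx + latent_ctx] query tokens to it, then refines them with the
    # self-attention processor. Broadcasting n_unrolls with dist.broadcast assumes
    # torch.distributed has been initialized (as in multi-GPU training); at eval
    # time the loop simply runs max_unrolls times.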


@dataclass
class DatasetIterator:

    embs: torch.Tensor  # [batch_size, dataset_size, *shape]
    batch_size: int

    def __iter__(self):
        self._reset()
        return self

    def __next__(self):
        _outer_batch_size, dataset_size, *_shape = self.embs.shape
        while True:
            start = self.idx
            self.idx += self.batch_size
            end = self.idx
            if end <= dataset_size:
                break
            self._reset()
        return self.embs[:, start:end]

    def _reset(self):
        self._shuffle()
        self.idx = 0  # pylint: disable=attribute-defined-outside-init

    def _shuffle(self):
        outer_batch_size, dataset_size, *shape = self.embs.shape
        idx = torch.stack(
            [
                torch.randperm(dataset_size, device=self.embs.device)
                for _ in range(outer_batch_size)
            ],
            dim=0,
        )
        idx = idx.view(outer_batch_size, dataset_size, *([1] * len(shape)))
        idx = torch.broadcast_to(idx, self.embs.shape)
        self.embs = torch.gather(self.embs, 1, idx)
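

def _dataset_iterator_example() -> torch.Tensor:
    """
    Editorial sketch (not used by the model code): DatasetIterator shuffles along
    dim=1 independently for each outer batch element and yields fixed-size inner
    batches, reshuffling whenever the dataset dimension is exhausted.
    """
    embs = torch.randn(2, 10, 4)  # [outer_batch_size, dataset_size, feature]
    it = iter(DatasetIterator(embs, batch_size=3))
    chunk = next(it)  # a [2, 3, 4] slice drawn from the shuffled dataset dimension
    assert chunk.shape == (2, 3, 4)
    return chunk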


class PointCloudTransformerChannelsEncoder(TransformerChannelsEncoder):
    """
    Encode point clouds using a transformer model with an extra output
    token used to extract a latent vector.
    """

    def __init__(
        self,
        *,
        input_channels: int = 6,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.input_channels = input_channels
        self.input_proj = nn.Linear(
            input_channels, self.width, device=self.device, dtype=self.dtype
        )

    def encode_input(self, batch: AttrDict, options: Optional[AttrDict] = None) -> torch.Tensor:
        _ = options
        points = batch.points
        h = self.input_proj(points.permute(0, 2, 1))  # NCL -> NLC
        return h


class PointCloudPerceiverChannelsEncoder(PerceiverChannelsEncoder):
    """
    Encode point clouds using a perceiver model with an extra output
    token used to extract a latent vector.
    """

    def __init__(
        self,
        *,
        cross_attention_dataset: str = "pcl",
        fps_method: str = "fps",
        # point cloud hyperparameters
        input_channels: int = 6,
        pos_emb: Optional[str] = None,
        # multiview hyperparameters
        image_size: int = 256,
        patch_size: int = 32,
        pose_dropout: float = 0.0,
        use_depth: bool = False,
        max_depth: float = 5.0,
        # point conv hyperparameters
        pointconv_radius: float = 0.5,
        pointconv_samples: int = 32,
        pointconv_hidden: Optional[List[int]] = None,
        pointconv_patch_size: int = 1,
        pointconv_stride: int = 1,
        pointconv_padding_mode: str = "zeros",
        use_pointconv: bool = False,
        # other hyperparameters
        **kwargs,
    ):
        super().__init__(**kwargs)
        assert cross_attention_dataset in (
            "pcl",
            "multiview",
            "dense_pose_multiview",
            "multiview_pcl",
            "pcl_and_multiview_pcl",
            "incorrect_multiview_pcl",
            "pcl_and_incorrect_multiview_pcl",
        )
        assert fps_method in ("fps", "first")
        self.cross_attention_dataset = cross_attention_dataset
        self.fps_method = fps_method
        self.input_channels = input_channels
        self.input_proj = PosEmbLinear(
            pos_emb,
            input_channels,
            self.width,
            device=self.device,
            dtype=self.dtype,
        )
        self.use_pointconv = use_pointconv
        if use_pointconv:
            if pointconv_hidden is None:
                pointconv_hidden = [self.width]
            self.point_conv = PointSetEmbedding(
                n_point=self.data_ctx,
                radius=pointconv_radius,
                n_sample=pointconv_samples,
                d_input=self.input_proj.weight.shape[0],
                d_hidden=pointconv_hidden,
                patch_size=pointconv_patch_size,
                stride=pointconv_stride,
                padding_mode=pointconv_padding_mode,
                fps_method=fps_method,
                device=self.device,
                dtype=self.dtype,
            )
        if self.cross_attention_dataset == "multiview":
            self.image_size = image_size
            self.patch_size = patch_size
            self.pose_dropout = pose_dropout
            self.use_depth = use_depth
            self.max_depth = max_depth
            pos_ctx = (image_size // patch_size) ** 2
            self.register_parameter(
                "pos_emb",
                nn.Parameter(
                    torch.randn(
                        pos_ctx * self.inner_batch_size[0],
                        self.width,
                        device=self.device,
                        dtype=self.dtype,
                    )
                ),
            )
            self.patch_emb = nn.Conv2d(
                in_channels=3 if not use_depth else 4,
                out_channels=self.width,
                kernel_size=patch_size,
                stride=patch_size,
                device=self.device,
                dtype=self.dtype,
            )
            self.camera_emb = nn.Sequential(
                nn.Linear(
                    3 * 4 + 1, self.width, device=self.device, dtype=self.dtype
                ),  # input size is for origin+x+y+z+fov
                nn.GELU(),
                nn.Linear(self.width, 2 * self.width, device=self.device, dtype=self.dtype),
            )
        elif self.cross_attention_dataset == "dense_pose_multiview":
            # The number of output features is halved, because a patch_size of
            # 32 ends up with a large patch_emb weight.
            self.view_pose_width = self.width // 2
            self.image_size = image_size
            self.patch_size = patch_size
            self.use_depth = use_depth
            self.max_depth = max_depth
            self.mv_pose_embed = MultiviewPoseEmbedding(
                posemb_version="nerf",
                n_channels=4 if self.use_depth else 3,
                out_features=self.view_pose_width,
                device=self.device,
                dtype=self.dtype,
            )
            pos_ctx = (image_size // patch_size) ** 2
            # Positional embedding is unnecessary because pose information is baked into each pixel
            self.patch_emb = nn.Conv2d(
                in_channels=self.view_pose_width,
                out_channels=self.width,
                kernel_size=patch_size,
                stride=patch_size,
                device=self.device,
                dtype=self.dtype,
            )
        elif (
            self.cross_attention_dataset == "multiview_pcl"
            or self.cross_attention_dataset == "incorrect_multiview_pcl"
        ):
            self.view_pose_width = self.width // 2
            self.image_size = image_size
            self.patch_size = patch_size
            self.max_depth = max_depth
            assert use_depth
            self.mv_pcl_embed = MultiviewPointCloudEmbedding(
                posemb_version="nerf",
                n_channels=3,
                out_features=self.view_pose_width,
                device=self.device,
                dtype=self.dtype,
            )
            self.patch_emb = nn.Conv2d(
                in_channels=self.view_pose_width,
                out_channels=self.width,
                kernel_size=patch_size,
                stride=patch_size,
                device=self.device,
                dtype=self.dtype,
            )
        elif (
            self.cross_attention_dataset == "pcl_and_multiview_pcl"
            or self.cross_attention_dataset == "pcl_and_incorrect_multiview_pcl"
        ):
            self.view_pose_width = self.width // 2
            self.image_size = image_size
            self.patch_size = patch_size
            self.max_depth = max_depth
            assert use_depth
            self.mv_pcl_embed = MultiviewPointCloudEmbedding(
                posemb_version="nerf",
                n_channels=3,
                out_features=self.view_pose_width,
                device=self.device,
                dtype=self.dtype,
            )
            self.patch_emb = nn.Conv2d(
                in_channels=self.view_pose_width,
                out_channels=self.width,
                kernel_size=patch_size,
                stride=patch_size,
                device=self.device,
                dtype=self.dtype,
            )

    def get_h_and_iterator(
        self, batch: AttrDict, options: Optional[AttrDict] = None
    ) -> Tuple[torch.Tensor, Iterable]:
        """
        :return: a tuple of (
            the initial output tokens of size [batch_size, data_ctx + latent_ctx, width],
            an iterator over the given data
        )
        """
        options = AttrDict() if options is None else options

        # Build the initial query embeddings
        points = batch.points.permute(0, 2, 1)  # NCL -> NLC
        if self.use_pointconv:
            points = self.input_proj(points).permute(0, 2, 1)  # NLC -> NCL
            xyz = batch.points[:, :3]
            data_tokens = self.point_conv(xyz, points).permute(0, 2, 1)  # NCL -> NLC
        else:
            fps_samples = self.sample_pcl_fps(points)
            data_tokens = self.input_proj(fps_samples)
        batch_size = points.shape[0]
        latent_tokens = self.output_tokens.unsqueeze(0).repeat(batch_size, 1, 1)
        h = self.ln_pre(torch.cat([data_tokens, latent_tokens], dim=1))
        assert h.shape == (batch_size, self.data_ctx + self.latent_ctx, self.width)

        # Build the dataset embedding iterator
        dataset_fn = {
            "pcl": self.get_pcl_dataset,
            "multiview": self.get_multiview_dataset,
            "dense_pose_multiview": self.get_dense_pose_multiview_dataset,
            "pcl_and_multiview_pcl": self.get_pcl_and_multiview_pcl_dataset,
            "multiview_pcl": self.get_multiview_pcl_dataset,
        }[self.cross_attention_dataset]
        it = dataset_fn(batch, options=options)

        return h, it
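
    # Editorial sketch of the query construction above (symbolic shapes, not code
    # from the original module):
    #
    #   batch.points          [B, C, N]              raw cloud, channels-first
    #   data_tokens           [B, data_ctx, width]   via point conv or FPS + input_proj
    #   latent_tokens         [B, latent_ctx, width] learned output tokens
    #   h = ln_pre(cat(...))  [B, data_ctx + latent_ctx, width]
    #
    # The returned iterator then streams the cross-attention context selected by
    # cross_attention_dataset (point-cloud tokens, view patches, or both).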

    def sample_pcl_fps(self, points: torch.Tensor) -> torch.Tensor:
        return sample_pcl_fps(points, data_ctx=self.data_ctx, method=self.fps_method)

    def get_pcl_dataset(
        self,
        batch: AttrDict,
        options: Optional[AttrDict[str, Any]] = None,
        inner_batch_size: Optional[int] = None,
    ) -> Iterable:
        _ = options
        if inner_batch_size is None:
            inner_batch_size = self.inner_batch_size[0]
        points = batch.points.permute(0, 2, 1)  # NCL -> NLC
        dataset_emb = self.input_proj(points)
        assert dataset_emb.shape[1] >= inner_batch_size
        return iter(DatasetIterator(dataset_emb, batch_size=inner_batch_size))

    def get_multiview_dataset(
        self,
        batch: AttrDict,
        options: Optional[AttrDict] = None,
        inner_batch_size: Optional[int] = None,
    ) -> Iterable:
        _ = options

        if inner_batch_size is None:
            inner_batch_size = self.inner_batch_size[0]

        dataset_emb = self.encode_views(batch)
        batch_size, num_views, n_patches, width = dataset_emb.shape

        assert num_views >= inner_batch_size

        it = iter(DatasetIterator(dataset_emb, batch_size=inner_batch_size))

        def gen():
            while True:
                examples = next(it)
                assert examples.shape == (batch_size, inner_batch_size, n_patches, self.width)
                views = examples.reshape(batch_size, -1, width) + self.pos_emb
                yield views

        return gen()

    def get_dense_pose_multiview_dataset(
        self,
        batch: AttrDict,
        options: Optional[AttrDict] = None,
        inner_batch_size: Optional[int] = None,
    ) -> Iterable:
        _ = options

        if inner_batch_size is None:
            inner_batch_size = self.inner_batch_size[0]

        dataset_emb = self.encode_dense_pose_views(batch)
        batch_size, num_views, n_patches, width = dataset_emb.shape

        assert num_views >= inner_batch_size

        it = iter(DatasetIterator(dataset_emb, batch_size=inner_batch_size))

        def gen():
            while True:
                examples = next(it)
                assert examples.shape == (batch_size, inner_batch_size, n_patches, self.width)
                views = examples.reshape(batch_size, -1, width)
                yield views

        return gen()

    def get_pcl_and_multiview_pcl_dataset(
        self,
        batch: AttrDict,
        options: Optional[AttrDict] = None,
        use_distance: bool = True,
    ) -> Iterable:
        _ = options

        pcl_it = self.get_pcl_dataset(
            batch, options=options, inner_batch_size=self.inner_batch_size[0]
        )
        multiview_pcl_emb = self.encode_multiview_pcl(batch, use_distance=use_distance)
        batch_size, num_views, n_patches, width = multiview_pcl_emb.shape

        assert num_views >= self.inner_batch_size[1]

        multiview_pcl_it = iter(
            DatasetIterator(multiview_pcl_emb, batch_size=self.inner_batch_size[1])
        )

        def gen():
            while True:
                pcl = next(pcl_it)
                multiview_pcl = next(multiview_pcl_it)
                assert multiview_pcl.shape == (
                    batch_size,
                    self.inner_batch_size[1],
                    n_patches,
                    self.width,
                )
                yield pcl, multiview_pcl.reshape(batch_size, -1, width)

        return gen()

    def get_multiview_pcl_dataset(
        self,
        batch: AttrDict,
        options: Optional[AttrDict] = None,
        inner_batch_size: Optional[int] = None,
        use_distance: bool = True,
    ) -> Iterable:
        _ = options

        if inner_batch_size is None:
            inner_batch_size = self.inner_batch_size[0]

        multiview_pcl_emb = self.encode_multiview_pcl(batch, use_distance=use_distance)
        batch_size, num_views, n_patches, width = multiview_pcl_emb.shape

        assert num_views >= inner_batch_size

        multiview_pcl_it = iter(DatasetIterator(multiview_pcl_emb, batch_size=inner_batch_size))

        def gen():
            while True:
                multiview_pcl = next(multiview_pcl_it)
                assert multiview_pcl.shape == (
                    batch_size,
                    inner_batch_size,
                    n_patches,
                    self.width,
                )
                yield multiview_pcl.reshape(batch_size, -1, width)

        return gen()

    def encode_views(self, batch: AttrDict) -> torch.Tensor:
        """
        :return: [batch_size, num_views, n_patches, width]
        """
        all_views = self.views_to_tensor(batch.views).to(self.device)
        if self.use_depth:
            all_views = torch.cat([all_views, self.depths_to_tensor(batch.depths)], dim=2)
        all_cameras = self.cameras_to_tensor(batch.cameras).to(self.device)

        batch_size, num_views, _, _, _ = all_views.shape

        views_proj = self.patch_emb(
            all_views.reshape([batch_size * num_views, *all_views.shape[2:]])
        )
        views_proj = (
            views_proj.reshape([batch_size, num_views, self.width, -1])
            .permute(0, 1, 3, 2)
            .contiguous()
        )  # [batch_size x num_views x n_patches x width]

        # [batch_size, num_views, 1, 2 * width]
        camera_proj = self.camera_emb(all_cameras).reshape(
            [batch_size, num_views, 1, self.width * 2]
        )
        pose_dropout = self.pose_dropout if self.training else 0.0
        mask = torch.rand(batch_size, 1, 1, 1, device=views_proj.device) >= pose_dropout
        camera_proj = torch.where(mask, camera_proj, torch.zeros_like(camera_proj))
        scale, shift = camera_proj.chunk(2, dim=3)
        views_proj = views_proj * (scale + 1.0) + shift
        return views_proj
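
    # Editorial note (sketch, not original code): the camera conditioning above is a
    # FiLM-style modulation. camera_emb maps each 13-dim camera vector to 2 * width
    # features, split into (scale, shift), and the patch tokens are transformed as
    #
    #   views_proj = views_proj * (scale + 1.0) + shift
    #
    # During training, pose_dropout zeroes the modulation for a random subset of
    # examples, so the encoder also learns to work without camera information.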

    def encode_dense_pose_views(self, batch: AttrDict) -> torch.Tensor:
        """
        :return: [batch_size, num_views, n_patches, width]
        """
        all_views = self.views_to_tensor(batch.views).to(self.device)
        if self.use_depth:
            depths = self.depths_to_tensor(batch.depths)
            all_views = torch.cat([all_views, depths], dim=2)

        dense_poses, _ = self.dense_pose_cameras_to_tensor(batch.cameras)
        dense_poses = dense_poses.permute(0, 1, 4, 5, 2, 3)
        position, direction = dense_poses[:, :, 0], dense_poses[:, :, 1]
        all_view_poses = self.mv_pose_embed(all_views, position, direction)

        batch_size, num_views, _, _, _ = all_view_poses.shape

        views_proj = self.patch_emb(
            all_view_poses.reshape([batch_size * num_views, *all_view_poses.shape[2:]])
        )
        views_proj = (
            views_proj.reshape([batch_size, num_views, self.width, -1])
            .permute(0, 1, 3, 2)
            .contiguous()
        )  # [batch_size x num_views x n_patches x width]

        return views_proj

    def encode_multiview_pcl(self, batch: AttrDict, use_distance: bool = True) -> torch.Tensor:
        """
        :return: [batch_size, num_views, n_patches, width]
        """
        all_views = self.views_to_tensor(batch.views).to(self.device)
        depths = self.raw_depths_to_tensor(batch.depths)
        all_view_alphas = self.view_alphas_to_tensor(batch.view_alphas).to(self.device)
        mask = all_view_alphas >= 0.999

        dense_poses, camera_z = self.dense_pose_cameras_to_tensor(batch.cameras)
        dense_poses = dense_poses.permute(0, 1, 4, 5, 2, 3)
        origin, direction = dense_poses[:, :, 0], dense_poses[:, :, 1]
        if use_distance:
            ray_depth_factor = torch.sum(direction * camera_z[..., None, None], dim=2, keepdim=True)
            depths = depths / ray_depth_factor
        position = origin + depths * direction
        all_view_poses = self.mv_pcl_embed(all_views, origin, position, mask)

        batch_size, num_views, _, _, _ = all_view_poses.shape

        views_proj = self.patch_emb(
            all_view_poses.reshape([batch_size * num_views, *all_view_poses.shape[2:]])
        )
        views_proj = (
            views_proj.reshape([batch_size, num_views, self.width, -1])
            .permute(0, 1, 3, 2)
            .contiguous()
        )  # [batch_size x num_views x n_patches x width]

        return views_proj

    def views_to_tensor(self, views: Union[torch.Tensor, List[List[Image.Image]]]) -> torch.Tensor:
        """
        Returns a [batch x num_views x 3 x size x size] tensor in the range [-1, 1].
        """
        if isinstance(views, torch.Tensor):
            return views

        tensor_batch = []
        num_views = len(views[0])
        for inner_list in views:
            assert len(inner_list) == num_views
            inner_batch = []
            for img in inner_list:
                img = img.resize((self.image_size,) * 2).convert("RGB")
                inner_batch.append(
                    torch.from_numpy(np.array(img)).to(device=self.device, dtype=torch.float32)
                    / 127.5
                    - 1
                )
            tensor_batch.append(torch.stack(inner_batch, dim=0))
        return torch.stack(tensor_batch, dim=0).permute(0, 1, 4, 2, 3)

    def depths_to_tensor(
        self, depths: Union[torch.Tensor, List[List[Image.Image]]]
    ) -> torch.Tensor:
        """
        Returns a [batch x num_views x 1 x size x size] tensor in the range [-1, 1].
        """
        if isinstance(depths, torch.Tensor):
            return depths

        tensor_batch = []
        num_views = len(depths[0])
        for inner_list in depths:
            assert len(inner_list) == num_views
            inner_batch = []
            for arr in inner_list:
                tensor = torch.from_numpy(arr).clamp(max=self.max_depth) / self.max_depth
                tensor = tensor * 2 - 1
                tensor = F.interpolate(
                    tensor[None, None],
                    (self.image_size,) * 2,
                    mode="nearest",
                )
                inner_batch.append(tensor.to(device=self.device, dtype=torch.float32))
            tensor_batch.append(torch.cat(inner_batch, dim=0))
        return torch.stack(tensor_batch, dim=0)

    def view_alphas_to_tensor(
        self, view_alphas: Union[torch.Tensor, List[List[Image.Image]]]
    ) -> torch.Tensor:
        """
        Returns a [batch x num_views x 1 x size x size] tensor in the range [0, 1].
        """
        if isinstance(view_alphas, torch.Tensor):
            return view_alphas

        tensor_batch = []
        num_views = len(view_alphas[0])
        for inner_list in view_alphas:
            assert len(inner_list) == num_views
            inner_batch = []
            for img in inner_list:
                tensor = (
                    torch.from_numpy(np.array(img)).to(device=self.device, dtype=torch.float32)
                    / 255.0
                )
                tensor = F.interpolate(
                    tensor[None, None],
                    (self.image_size,) * 2,
                    mode="nearest",
                )
                inner_batch.append(tensor)
            tensor_batch.append(torch.cat(inner_batch, dim=0))
        return torch.stack(tensor_batch, dim=0)

    def raw_depths_to_tensor(
        self, depths: Union[torch.Tensor, List[List[Image.Image]]]
    ) -> torch.Tensor:
        """
        Returns a [batch x num_views x 1 x size x size] tensor.
        """
        if isinstance(depths, torch.Tensor):
            return depths

        tensor_batch = []
        num_views = len(depths[0])
        for inner_list in depths:
            assert len(inner_list) == num_views
            inner_batch = []
            for arr in inner_list:
                tensor = torch.from_numpy(arr).clamp(max=self.max_depth)
                tensor = F.interpolate(
                    tensor[None, None],
                    (self.image_size,) * 2,
                    mode="nearest",
                )
                inner_batch.append(tensor.to(device=self.device, dtype=torch.float32))
            tensor_batch.append(torch.cat(inner_batch, dim=0))
        return torch.stack(tensor_batch, dim=0)

    def cameras_to_tensor(
        self, cameras: Union[torch.Tensor, List[List[ProjectiveCamera]]]
    ) -> torch.Tensor:
        """
        Returns a [batch x num_views x 3*4+1] tensor of camera information.
        """
        if isinstance(cameras, torch.Tensor):
            return cameras
        outer_batch = []
        for inner_list in cameras:
            inner_batch = []
            for camera in inner_list:
                inner_batch.append(
                    np.array(
                        [
                            *camera.x,
                            *camera.y,
                            *camera.z,
                            *camera.origin,
                            camera.x_fov,
                        ]
                    )
                )
            outer_batch.append(np.stack(inner_batch, axis=0))
        return torch.from_numpy(np.stack(outer_batch, axis=0)).float()
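
    # Editorial note (sketch): each camera is flattened into 3 * 4 + 1 = 13 values,
    # laid out as [x (3), y (3), z (3), origin (3), x_fov (1)]. This matches the
    # input size of camera_emb in the "multiview" branch of __init__.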

    def dense_pose_cameras_to_tensor(
        self, cameras: Union[torch.Tensor, List[List[ProjectiveCamera]]]
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Returns a tuple of (rays, z_directions) where
            - rays: [batch, num_views, height, width, 2, 3] tensor of camera information.
            - z_directions: [batch, num_views, 3] tensor of camera z directions.
        """
        if isinstance(cameras, torch.Tensor):
            raise NotImplementedError

        for inner_list in cameras:
            assert len(inner_list) == len(cameras[0])

        camera = cameras[0][0]
        flat_camera = DifferentiableProjectiveCamera(
            origin=torch.from_numpy(
                np.stack(
                    [cam.origin for inner_list in cameras for cam in inner_list],
                    axis=0,
                )
            ).to(self.device),
            x=torch.from_numpy(
                np.stack(
                    [cam.x for inner_list in cameras for cam in inner_list],
                    axis=0,
                )
            ).to(self.device),
            y=torch.from_numpy(
                np.stack(
                    [cam.y for inner_list in cameras for cam in inner_list],
                    axis=0,
                )
            ).to(self.device),
            z=torch.from_numpy(
                np.stack(
                    [cam.z for inner_list in cameras for cam in inner_list],
                    axis=0,
                )
            ).to(self.device),
            width=camera.width,
            height=camera.height,
            x_fov=camera.x_fov,
            y_fov=camera.y_fov,
        )
        batch_size = len(cameras) * len(cameras[0])
        coords = (
            flat_camera.image_coords()
            .to(flat_camera.origin.device)
            .unsqueeze(0)
            .repeat(batch_size, 1, 1)
        )
        rays = flat_camera.camera_rays(coords)
        return (
            rays.view(len(cameras), len(cameras[0]), camera.height, camera.width, 2, 3).to(
                self.device
            ),
            flat_camera.z.view(len(cameras), len(cameras[0]), 3).to(self.device),
        )
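
    # Editorial note (sketch): the rays tensor packs one (origin, direction) pair per
    # pixel, which is why callers permute it to [batch, views, 2, 3, height, width]
    # and then split it as dense_poses[:, :, 0] (origins) and dense_poses[:, :, 1]
    # (directions).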


def sample_pcl_fps(points: torch.Tensor, data_ctx: int, method: str = "fps") -> torch.Tensor:
    """
    Run farthest-point sampling on a batch of point clouds.

    :param points: batch of shape [N x num_points x n_channels].
    :param data_ctx: subsample count.
    :param method: either 'fps' or 'first'. Using 'first' assumes that the
        points are already sorted according to FPS sampling.
    :return: batch of shape [N x min(num_points, data_ctx) x n_channels].
    """
    n_points = points.shape[1]
    if n_points == data_ctx:
        return points
    if method == "first":
        return points[:, :data_ctx]
    elif method == "fps":
        batch = points.cpu().split(1, dim=0)
        fps = [sample_fps(x, n_samples=data_ctx) for x in batch]
        return torch.cat(fps, dim=0).to(points.device)
    else:
        raise ValueError(f"unsupported farthest-point sampling method: {method}")


def sample_fps(example: torch.Tensor, n_samples: int) -> torch.Tensor:
    """
    :param example: [1, n_points, 3 + n_channels]
    :return: [1, n_samples, 3 + n_channels]
    """
    points = example.cpu().squeeze(0).numpy()
    coords, raw_channels = points[:, :3], points[:, 3:]
    n_points, n_channels = raw_channels.shape
    assert n_samples <= n_points
    channels = {str(idx): raw_channels[:, idx] for idx in range(n_channels)}
    max_points = min(32768, n_points)
    fps_pcl = (
        PointCloud(coords=coords, channels=channels)
        .random_sample(max_points)
        .farthest_point_sample(n_samples)
    )
    fps_channels = np.stack([fps_pcl.channels[str(idx)] for idx in range(n_channels)], axis=1)
    fps = np.concatenate([fps_pcl.coords, fps_channels], axis=1)
    fps = torch.from_numpy(fps).unsqueeze(0)
    assert fps.shape == (1, n_samples, 3 + n_channels)
    return fps
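

def _sample_pcl_fps_example() -> torch.Tensor:
    """
    Editorial sketch (not part of the original module): subsample a batch of
    6-channel point clouds (e.g. xyz + RGB) down to a fixed context size. Using
    method="first" simply truncates and assumes the clouds are already FPS-ordered;
    method="fps" would route each example through sample_fps / PointCloud instead.
    """
    pts = torch.rand(2, 4096, 6)  # [batch, n_points, 3 + n_channels]
    sub = sample_pcl_fps(pts, data_ctx=1024, method="first")
    assert sub.shape == (2, 1024, 6)
    return sub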