# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
| """Interface for different embedders for modalities.""" | |
| import abc | |
| import numpy as np | |
| import tensorflow as tf | |
| import preprocessing | |
| from tensorflow.contrib.slim.nets import resnet_v2 | |
| slim = tf.contrib.slim | |


class Embedder(object):
  """Represents the embedder for different modalities.

  Modalities can be semantic segmentation, depth channel, object detection and
  so on, each of which requires its own specific embedder.
  """

  __metaclass__ = abc.ABCMeta

  def build(self, observation):
    """Builds the model to embed the observation modality.

    Args:
      observation: tensor that contains the raw observation from the modality.

    Returns:
      Embedding tensor for the given observation tensor.
    """
    raise NotImplementedError(
        'Needs to be implemented as part of the Embedder interface')


class DetectionBoxEmbedder(Embedder):
  """Represents the model that encodes the detection boxes from images."""

  def __init__(self, rnn_state_size, scope=None):
    self._rnn_state_size = rnn_state_size
    self._scope = scope

  def build(self, observations):
    """Builds the model to embed object detection observations.

    Args:
      observations: a tuple of (dets, det_num).
        dets is a tensor of BxTxLxE that has the detection boxes in all the
        images of the batch. B is the batch size, T is the maximum length of
        the episode, L is the maximum number of detections per image in the
        batch and E is the size of each detection embedding.
        det_num is a tensor of BxT that contains the number of detected boxes
        in each image of each sequence in the batch.

    Returns:
      For each image in the batch, the accumulated embedding of all the
      detection boxes in that image.
    """
    with tf.variable_scope(self._scope, default_name=''):
      shape = observations[0].shape
      dets = tf.reshape(observations[0], [-1, shape[-2], shape[-1]])
      det_num = tf.reshape(observations[1], [-1])
      lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self._rnn_state_size)
      batch_size = tf.shape(dets)[0]
      lstm_outputs, _ = tf.nn.dynamic_rnn(
          cell=lstm_cell,
          inputs=dets,
          sequence_length=det_num,
          initial_state=lstm_cell.zero_state(batch_size, dtype=tf.float32),
          dtype=tf.float32)
      # Gather the last state of each sequence in the batch.
      batch_range = tf.range(batch_size)
      indices = tf.stack([batch_range, det_num - 1], axis=1)
      last_lstm_outputs = tf.gather_nd(lstm_outputs, indices)
      last_lstm_outputs = tf.reshape(last_lstm_outputs,
                                     [-1, shape[1], self._rnn_state_size])
      return last_lstm_outputs
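

# A minimal usage sketch for DetectionBoxEmbedder (illustrative only; the
# shapes and the rnn_state_size below are assumptions, not values taken from
# a training configuration).
def _example_detection_box_embedder():
  # Batch of 2 episodes, each up to 5 steps, at most 10 detections per image,
  # each detection described by an 8-dimensional embedding.
  dets = tf.zeros([2, 5, 10, 8], dtype=tf.float32)
  det_num = tf.ones([2, 5], dtype=tf.int32)
  embedder = DetectionBoxEmbedder(rnn_state_size=16, scope='det_embedder')
  # Returns a tensor of shape [2, 5, 16]: one accumulated embedding per image.
  return embedder.build((dets, det_num))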


class ResNet(Embedder):
  """Residual net embedder for image data."""

  def __init__(self, params, *args, **kwargs):
    super(ResNet, self).__init__(*args, **kwargs)
    self._params = params
    self._extra_train_ops = []

  def build(self, images):
    shape = images.get_shape().as_list()
    if len(shape) == 5:
      images = tf.reshape(images,
                          [shape[0] * shape[1], shape[2], shape[3], shape[4]])
    embedding = self._build_model(images)
    if len(shape) == 5:
      embedding = tf.reshape(embedding, [shape[0], shape[1], -1])
    return embedding

  def extra_train_ops(self):
    return self._extra_train_ops

  def _build_model(self, images):
    """Builds the model."""
    # Convert images to floats and normalize them.
    images = tf.to_float(images)
    bs = images.get_shape().as_list()[0]
    images = [
        tf.image.per_image_standardization(tf.squeeze(i))
        for i in tf.split(images, bs)
    ]
    images = tf.concat([tf.expand_dims(i, axis=0) for i in images], axis=0)

    with tf.variable_scope('init'):
      x = self._conv('init_conv', images, 3, 3, 16, self._stride_arr(1))

    strides = [1, 2, 2]
    activate_before_residual = [True, False, False]
    if self._params.use_bottleneck:
      res_func = self._bottleneck_residual
      filters = [16, 64, 128, 256]
    else:
      res_func = self._residual
      filters = [16, 16, 32, 128]

    with tf.variable_scope('unit_1_0'):
      x = res_func(x, filters[0], filters[1], self._stride_arr(strides[0]),
                   activate_before_residual[0])
    for i in xrange(1, self._params.num_residual_units):
      with tf.variable_scope('unit_1_%d' % i):
        x = res_func(x, filters[1], filters[1], self._stride_arr(1), False)

    with tf.variable_scope('unit_2_0'):
      x = res_func(x, filters[1], filters[2], self._stride_arr(strides[1]),
                   activate_before_residual[1])
    for i in xrange(1, self._params.num_residual_units):
      with tf.variable_scope('unit_2_%d' % i):
        x = res_func(x, filters[2], filters[2], self._stride_arr(1), False)

    with tf.variable_scope('unit_3_0'):
      x = res_func(x, filters[2], filters[3], self._stride_arr(strides[2]),
                   activate_before_residual[2])
    for i in xrange(1, self._params.num_residual_units):
      with tf.variable_scope('unit_3_%d' % i):
        x = res_func(x, filters[3], filters[3], self._stride_arr(1), False)

    with tf.variable_scope('unit_last'):
      x = self._batch_norm('final_bn', x)
      x = self._relu(x, self._params.relu_leakiness)

    with tf.variable_scope('pool_logit'):
      x = self._global_avg_pooling(x)

    return x

  def _stride_arr(self, stride):
    return [1, stride, stride, 1]

  def _batch_norm(self, name, x):
    """Batch norm implementation."""
    with tf.variable_scope(name):
      params_shape = [x.shape[-1]]
      beta = tf.get_variable(
          'beta',
          params_shape,
          tf.float32,
          initializer=tf.constant_initializer(0.0, tf.float32))
      gamma = tf.get_variable(
          'gamma',
          params_shape,
          tf.float32,
          initializer=tf.constant_initializer(1.0, tf.float32))
      if self._params.is_train:
        mean, variance = tf.nn.moments(x, [0, 1, 2], name='moments')
        moving_mean = tf.get_variable(
            'moving_mean',
            params_shape,
            tf.float32,
            initializer=tf.constant_initializer(0.0, tf.float32),
            trainable=False)
        moving_variance = tf.get_variable(
            'moving_variance',
            params_shape,
            tf.float32,
            initializer=tf.constant_initializer(1.0, tf.float32),
            trainable=False)
        self._extra_train_ops.append(
            moving_averages.assign_moving_average(moving_mean, mean, 0.9))
        self._extra_train_ops.append(
            moving_averages.assign_moving_average(moving_variance, variance,
                                                  0.9))
      else:
        mean = tf.get_variable(
            'moving_mean',
            params_shape,
            tf.float32,
            initializer=tf.constant_initializer(0.0, tf.float32),
            trainable=False)
        variance = tf.get_variable(
            'moving_variance',
            params_shape,
            tf.float32,
            initializer=tf.constant_initializer(1.0, tf.float32),
            trainable=False)
        tf.summary.histogram(mean.op.name, mean)
        tf.summary.histogram(variance.op.name, variance)
      # Epsilon used to be 1e-5. Maybe 0.001 solves NaN problem in deeper net.
      y = tf.nn.batch_normalization(x, mean, variance, beta, gamma, 0.001)
      y.set_shape(x.shape)
      return y

  def _residual(self,
                x,
                in_filter,
                out_filter,
                stride,
                activate_before_residual=False):
    """Residual unit with 2 sub layers."""
    if activate_before_residual:
      with tf.variable_scope('shared_activation'):
        x = self._batch_norm('init_bn', x)
        x = self._relu(x, self._params.relu_leakiness)
        orig_x = x
    else:
      with tf.variable_scope('residual_only_activation'):
        orig_x = x
        x = self._batch_norm('init_bn', x)
        x = self._relu(x, self._params.relu_leakiness)

    with tf.variable_scope('sub1'):
      x = self._conv('conv1', x, 3, in_filter, out_filter, stride)

    with tf.variable_scope('sub2'):
      x = self._batch_norm('bn2', x)
      x = self._relu(x, self._params.relu_leakiness)
      x = self._conv('conv2', x, 3, out_filter, out_filter, [1, 1, 1, 1])

    with tf.variable_scope('sub_add'):
      if in_filter != out_filter:
        orig_x = tf.nn.avg_pool(orig_x, stride, stride, 'VALID')
        orig_x = tf.pad(
            orig_x, [[0, 0], [0, 0], [0, 0], [(out_filter - in_filter) // 2,
                                              (out_filter - in_filter) // 2]])
      x += orig_x

    return x

  def _bottleneck_residual(self,
                           x,
                           in_filter,
                           out_filter,
                           stride,
                           activate_before_residual=False):
    """A residual convolutional layer with a bottleneck.

    The layer is a composite of three convolutional layers with a ReLU
    nonlinearity and batch normalization after each linear convolution. The
    depth of the second and third layers is out_filter / 4 (hence the
    bottleneck).

    Args:
      x: a rank-4 float Tensor representing the input to the layer.
      in_filter: a python integer representing the depth of the input.
      out_filter: a python integer representing the depth of the output.
      stride: a python integer denoting the stride of the layer applied before
        the first convolution.
      activate_before_residual: a python boolean. If True, then a ReLU is
        applied as a first operation on the input x before everything else.

    Returns:
      A rank-4 Tensor with the same batch size as the input, width and height
      equal to those of the input divided by stride, and depth equal to
      out_filter.
    """
    if activate_before_residual:
      with tf.variable_scope('common_bn_relu'):
        x = self._batch_norm('init_bn', x)
        x = self._relu(x, self._params.relu_leakiness)
        orig_x = x
    else:
      with tf.variable_scope('residual_bn_relu'):
        orig_x = x
        x = self._batch_norm('init_bn', x)
        x = self._relu(x, self._params.relu_leakiness)

    with tf.variable_scope('sub1'):
      x = self._conv('conv1', x, 1, in_filter, out_filter / 4, stride)

    with tf.variable_scope('sub2'):
      x = self._batch_norm('bn2', x)
      x = self._relu(x, self._params.relu_leakiness)
      x = self._conv('conv2', x, 3, out_filter / 4, out_filter / 4,
                     [1, 1, 1, 1])

    with tf.variable_scope('sub3'):
      x = self._batch_norm('bn3', x)
      x = self._relu(x, self._params.relu_leakiness)
      x = self._conv('conv3', x, 1, out_filter / 4, out_filter, [1, 1, 1, 1])

    with tf.variable_scope('sub_add'):
      if in_filter != out_filter:
        orig_x = self._conv('project', orig_x, 1, in_filter, out_filter,
                            stride)
      x += orig_x

    return x

  def _decay(self):
    """L2 weight decay loss over the convolution kernels."""
    costs = []
    for var in tf.trainable_variables():
      if var.op.name.find(r'DW') > 0:
        costs.append(tf.nn.l2_loss(var))
    return tf.multiply(self._params.weight_decay_rate, tf.add_n(costs))

  def _conv(self, name, x, filter_size, in_filters, out_filters, strides):
    """Convolution."""
    with tf.variable_scope(name):
      n = filter_size * filter_size * out_filters
      kernel = tf.get_variable(
          'DW', [filter_size, filter_size, in_filters, out_filters],
          tf.float32,
          initializer=tf.random_normal_initializer(stddev=np.sqrt(2.0 / n)))
      return tf.nn.conv2d(x, kernel, strides, padding='SAME')

  def _relu(self, x, leakiness=0.0):
    return tf.where(tf.less(x, 0.0), leakiness * x, x, name='leaky_relu')

  def _fully_connected(self, x, out_dim):
    x = tf.reshape(x, [self._params.batch_size, -1])
    w = tf.get_variable(
        'DW', [x.get_shape()[1], out_dim],
        initializer=tf.uniform_unit_scaling_initializer(factor=1.0))
    b = tf.get_variable(
        'biases', [out_dim], initializer=tf.constant_initializer())
    return tf.nn.xw_plus_b(x, w, b)

  def _global_avg_pooling(self, x):
    assert x.get_shape().ndims == 4
    return tf.reduce_mean(x, [1, 2])
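

# A minimal sketch of how ResNet might be wired up. The HParams values below
# are placeholders, not settings from this repo; only the attribute names
# (use_bottleneck, num_residual_units, relu_leakiness, is_train) are the ones
# this class actually reads.
def _example_resnet_usage():
  params = tf.contrib.training.HParams(
      use_bottleneck=False,
      num_residual_units=2,
      relu_leakiness=0.1,
      is_train=True)
  embedder = ResNet(params)
  # NxTxHxWxC input; the embedder folds N and T together before the convnet.
  images = tf.zeros([4, 3, 32, 32, 3], dtype=tf.float32)
  with tf.variable_scope('resnet_embedder'):
    embedding = embedder.build(images)  # Shape [4, 3, embedding_dim].
  # When training, the moving-average updates collected by _batch_norm should
  # be run together with the optimizer step, e.g.:
  #   train_op = tf.group(optimizer_step, *embedder.extra_train_ops())
  return embedding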


class MLPEmbedder(Embedder):
  """Embedder of vectorial data.

  The net is a multi-layer perceptron, with ReLU nonlinearities in all layers
  except the last one.
  """

  def __init__(self, layers, *args, **kwargs):
    """Constructs MLPEmbedder.

    Args:
      layers: a list of python integers representing layer sizes.
      *args: arguments for super constructor.
      **kwargs: keyword arguments for super constructor.
    """
    super(MLPEmbedder, self).__init__(*args, **kwargs)
    self._layers = layers

  def build(self, features):
    shape = features.get_shape().as_list()
    if len(shape) == 3:
      features = tf.reshape(features, [shape[0] * shape[1], shape[2]])

    x = features
    for i, dim in enumerate(self._layers):
      with tf.variable_scope('layer_%i' % i):
        x = self._fully_connected(x, dim)
        if i < len(self._layers) - 1:
          x = self._relu(x)

    if len(shape) == 3:
      x = tf.reshape(x, shape[:-1] + [self._layers[-1]])
    return x

  def _fully_connected(self, x, out_dim):
    w = tf.get_variable(
        'DW', [x.get_shape()[1], out_dim],
        initializer=tf.variance_scaling_initializer(distribution='uniform'))
    b = tf.get_variable(
        'biases', [out_dim], initializer=tf.constant_initializer())
    return tf.nn.xw_plus_b(x, w, b)

  def _relu(self, x, leakiness=0.0):
    return tf.where(tf.less(x, 0.0), leakiness * x, x, name='leaky_relu')
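

# A minimal sketch for MLPEmbedder (illustrative; the layer sizes and input
# shape are made up). The input can be rank 2 (NxD) or rank 3 (NxTxD); in the
# rank-3 case the time dimension is folded into the batch and restored at the
# end.
def _example_mlp_embedder():
  features = tf.zeros([4, 5, 32], dtype=tf.float32)  # NxTxD.
  embedder = MLPEmbedder(layers=[64, 16])
  with tf.variable_scope('mlp_embedder'):
    return embedder.build(features)  # Shape [4, 5, 16].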


class SmallNetworkEmbedder(Embedder):
  """Embedder for image-like observations.

  The network consists of multiple conv layers and a fully connected layer at
  the end. The number of conv layers and their parameters are configured from
  params.
  """

  def __init__(self, params, *args, **kwargs):
    """Constructs the small network.

    Args:
      params: params should be of tf.hparams type. params needs to have lists
        of conv_sizes, conv_strides and conv_channels. The lengths of these
        lists should be equal to each other and to the number of conv layers
        in the network. In addition, params needs a boolean variable named
        to_one_hot which indicates whether the input should be converted to
        one hot or not. The size of the fully connected layer is specified by
        params.embedding_size.
      *args: the rest of the parameters.
      **kwargs: the rest of the parameters.

    Raises:
      ValueError: if the lengths of params.conv_strides, params.conv_sizes,
        and params.conv_channels are not equal.
    """
    super(SmallNetworkEmbedder, self).__init__(*args, **kwargs)
    self._params = params

    if len(self._params.conv_sizes) != len(self._params.conv_strides):
      raise ValueError(
          'Conv sizes and strides should have the same length: {} != {}'.format(
              len(self._params.conv_sizes), len(self._params.conv_strides)))

    if len(self._params.conv_sizes) != len(self._params.conv_channels):
      raise ValueError(
          'Conv sizes and channels should have the same length: {} != {}'.format(
              len(self._params.conv_sizes), len(self._params.conv_channels)))

  def build(self, images):
    """Builds the embedder with the given specification.

    Args:
      images: a tensor that contains the input images which has the shape of
        NxTxHxWxC where N is the batch size, T is the maximum length of the
        sequence, H and W are the height and width of the images and C is the
        number of channels.

    Returns:
      A tensor that is the embedding of the images.
    """
    shape = images.get_shape().as_list()
    images = tf.reshape(images,
                        [shape[0] * shape[1], shape[2], shape[3], shape[4]])

    with slim.arg_scope(
        [slim.conv2d, slim.fully_connected],
        activation_fn=tf.nn.relu,
        weights_regularizer=slim.l2_regularizer(
            self._params.weight_decay_rate),
        biases_initializer=tf.zeros_initializer()):
      with slim.arg_scope([slim.conv2d], padding='SAME'):
        # Convert the image to one hot if needed.
        if self._params.to_one_hot:
          net = tf.one_hot(
              tf.squeeze(tf.to_int32(images), axis=[-1]),
              self._params.one_hot_length)
        else:
          net = images

        p = self._params
        # Add conv layers with the specified configurations.
        for conv_id, kernel_stride_channel in enumerate(
            zip(p.conv_sizes, p.conv_strides, p.conv_channels)):
          kernel_size, stride, channels = kernel_stride_channel
          net = slim.conv2d(
              net,
              channels, [kernel_size, kernel_size],
              stride,
              scope='conv_{}'.format(conv_id + 1))

        net = slim.flatten(net)
        net = slim.fully_connected(net, self._params.embedding_size,
                                   scope='fc')
        output = tf.reshape(net, [shape[0], shape[1], -1])
        return output
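

# A minimal sketch for SmallNetworkEmbedder (illustrative; the HParams values
# below are placeholders chosen only to satisfy the attributes this class
# reads, not settings from this repo).
def _example_small_network_embedder():
  params = tf.contrib.training.HParams(
      conv_sizes=[3, 3],
      conv_strides=[2, 2],
      conv_channels=[16, 32],
      to_one_hot=False,
      one_hot_length=0,
      embedding_size=64,
      weight_decay_rate=1e-4)
  embedder = SmallNetworkEmbedder(params)
  images = tf.zeros([2, 4, 16, 16, 3], dtype=tf.float32)  # NxTxHxWxC.
  with tf.variable_scope('small_embedder'):
    return embedder.build(images)  # Shape [2, 4, 64].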


class ResNet50Embedder(Embedder):
  """Uses ResNet50 to embed input images."""

  def build(self, images):
    """Builds a ResNet50 embedder for the input images.

    It assumes that the range of the pixel values in the images tensor is
    [0, 255] and should be castable to tf.uint8.

    Args:
      images: a tensor that contains the input images which has the shape of
        NxTxHxWx3 where N is the batch size, T is the maximum length of the
        sequence, H and W are the height and width of the images and 3 is the
        number of channels.

    Returns:
      The embedding of the input images with the shape of NxTxL where L is the
      embedding size of the output.

    Raises:
      ValueError: if the shape of the input does not agree with the expected
        shape explained in the Args section.
    """
    shape = images.get_shape().as_list()
    if len(shape) != 5:
      raise ValueError(
          'The tensor shape should have 5 elements, {} is provided'.format(
              len(shape)))
    if shape[4] != 3:
      raise ValueError('Three channels are expected for the input image')

    images = tf.cast(images, tf.uint8)
    images = tf.reshape(images,
                        [shape[0] * shape[1], shape[2], shape[3], shape[4]])
    with slim.arg_scope(resnet_v2.resnet_arg_scope()):

      def preprocess_fn(x):
        x = tf.expand_dims(x, 0)
        x = tf.image.resize_bilinear(x, [299, 299], align_corners=False)
        return tf.squeeze(x, [0])

      images = tf.map_fn(preprocess_fn, images, dtype=tf.float32)
      net, _ = resnet_v2.resnet_v2_50(
          images, is_training=False, global_pool=True)
      output = tf.reshape(net, [shape[0], shape[1], -1])
      return output
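

# A minimal sketch for ResNet50Embedder (illustrative only). The ResNet50
# variables created here would normally be restored from a pre-trained
# checkpoint before the embeddings are useful; the checkpoint path and restore
# logic live outside this module.
def _example_resnet50_embedder():
  # Pixel values are expected in [0, 255].
  images = tf.zeros([1, 2, 224, 224, 3], dtype=tf.float32)
  embedder = ResNet50Embedder()
  return embedder.build(images)  # Shape [1, 2, 2048].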


class IdentityEmbedder(Embedder):
  """This embedder just returns the input as the output.

  Used for modalities where the embedding of the modality is the same as the
  modality itself. For example, it can be used for a one_hot goal.
  """

  def build(self, images):
    return images