# Lint as: python2, python3
# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
| r"""Provides DeepLab model definition and helper functions. | |
| DeepLab is a deep learning system for semantic image segmentation with | |
| the following features: | |
| (1) Atrous convolution to explicitly control the resolution at which | |
| feature responses are computed within Deep Convolutional Neural Networks. | |
| (2) Atrous spatial pyramid pooling (ASPP) to robustly segment objects at | |
| multiple scales with filters at multiple sampling rates and effective | |
| fields-of-views. | |
| (3) ASPP module augmented with image-level feature and batch normalization. | |
| (4) A simple yet effective decoder module to recover the object boundaries. | |
| See the following papers for more details: | |
| "Encoder-Decoder with Atrous Separable Convolution for Semantic Image | |
| Segmentation" | |
| Liang-Chieh Chen, Yukun Zhu, George Papandreou, Florian Schroff, Hartwig Adam. | |
| (https://arxiv.org/abs/1802.02611) | |
| "Rethinking Atrous Convolution for Semantic Image Segmentation," | |
| Liang-Chieh Chen, George Papandreou, Florian Schroff, Hartwig Adam | |
| (https://arxiv.org/abs/1706.05587) | |
| "DeepLab: Semantic Image Segmentation with Deep Convolutional Nets, | |
| Atrous Convolution, and Fully Connected CRFs", | |
| Liang-Chieh Chen*, George Papandreou*, Iasonas Kokkinos, Kevin Murphy, | |
| Alan L Yuille (* equal contribution) | |
| (https://arxiv.org/abs/1606.00915) | |
| "Semantic Image Segmentation with Deep Convolutional Nets and Fully Connected | |
| CRFs" | |
| Liang-Chieh Chen*, George Papandreou*, Iasonas Kokkinos, Kevin Murphy, | |
| Alan L. Yuille (* equal contribution) | |
| (https://arxiv.org/abs/1412.7062) | |
| """ | |

import tensorflow as tf
from tensorflow.contrib import slim as contrib_slim

from deeplab.core import dense_prediction_cell
from deeplab.core import feature_extractor
from deeplab.core import utils

slim = contrib_slim

LOGITS_SCOPE_NAME = 'logits'
MERGED_LOGITS_SCOPE = 'merged_logits'
IMAGE_POOLING_SCOPE = 'image_pooling'
ASPP_SCOPE = 'aspp'
CONCAT_PROJECTION_SCOPE = 'concat_projection'
DECODER_SCOPE = 'decoder'
META_ARCHITECTURE_SCOPE = 'meta_architecture'

PROB_SUFFIX = '_prob'

_resize_bilinear = utils.resize_bilinear
scale_dimension = utils.scale_dimension
split_separable_conv2d = utils.split_separable_conv2d
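
# A quick note on `scale_dimension` (implemented in `deeplab.core.utils`):
# it scales a spatial dimension while preserving the `stride * k + 1`
# alignment that DeepLab crop sizes follow. Illustrative example (the value
# assumes the usual `(dim - 1) * scale + 1` rounding):
#
#   scale_dimension(513, 1.0 / 16)  # -> 33, feature size at output_stride 16.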


def get_extra_layer_scopes(last_layers_contain_logits_only=False):
  """Gets the scopes for extra layers.

  Args:
    last_layers_contain_logits_only: Boolean, True if only considering logits
      as the last layer (i.e., exclude ASPP module, decoder module and so on).

  Returns:
    A list of scopes for extra layers.
  """
  if last_layers_contain_logits_only:
    return [LOGITS_SCOPE_NAME]
  else:
    return [
        LOGITS_SCOPE_NAME,
        IMAGE_POOLING_SCOPE,
        ASPP_SCOPE,
        CONCAT_PROJECTION_SCOPE,
        DECODER_SCOPE,
        META_ARCHITECTURE_SCOPE,
    ]
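
# Illustrative usage sketch (not part of the original file): the returned
# scopes are typically used to look up the variables of the last layers,
# e.g. when applying a larger learning rate to them during fine-tuning.
# Assumes a TF1-style graph:
#
#   last_layer_scopes = get_extra_layer_scopes()
#   last_layer_vars = []
#   for scope in last_layer_scopes:
#     last_layer_vars += tf.get_collection(
#         tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)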


def predict_labels_multi_scale(images,
                               model_options,
                               eval_scales=(1.0,),
                               add_flipped_images=False):
  """Predicts segmentation labels.

  Args:
    images: A tensor of size [batch, height, width, channels].
    model_options: A ModelOptions instance to configure models.
    eval_scales: The scales to resize images for evaluation.
    add_flipped_images: Add flipped images for evaluation or not.

  Returns:
    A dictionary with keys specifying the output_type (e.g., semantic
      prediction) and values storing Tensors representing predictions (argmax
      over channels). Each prediction has size [batch, height, width].
  """
  outputs_to_predictions = {
      output: []
      for output in model_options.outputs_to_num_classes
  }

  for i, image_scale in enumerate(eval_scales):
    with tf.variable_scope(tf.get_variable_scope(), reuse=True if i else None):
      outputs_to_scales_to_logits = multi_scale_logits(
          images,
          model_options=model_options,
          image_pyramid=[image_scale],
          is_training=False,
          fine_tune_batch_norm=False)

    if add_flipped_images:
      with tf.variable_scope(tf.get_variable_scope(), reuse=True):
        outputs_to_scales_to_logits_reversed = multi_scale_logits(
            tf.reverse_v2(images, [2]),
            model_options=model_options,
            image_pyramid=[image_scale],
            is_training=False,
            fine_tune_batch_norm=False)

    for output in sorted(outputs_to_scales_to_logits):
      scales_to_logits = outputs_to_scales_to_logits[output]
      logits = _resize_bilinear(
          scales_to_logits[MERGED_LOGITS_SCOPE],
          tf.shape(images)[1:3],
          scales_to_logits[MERGED_LOGITS_SCOPE].dtype)
      outputs_to_predictions[output].append(
          tf.expand_dims(tf.nn.softmax(logits), 4))

      if add_flipped_images:
        scales_to_logits_reversed = (
            outputs_to_scales_to_logits_reversed[output])
        logits_reversed = _resize_bilinear(
            tf.reverse_v2(scales_to_logits_reversed[MERGED_LOGITS_SCOPE], [2]),
            tf.shape(images)[1:3],
            scales_to_logits_reversed[MERGED_LOGITS_SCOPE].dtype)
        outputs_to_predictions[output].append(
            tf.expand_dims(tf.nn.softmax(logits_reversed), 4))

  for output in sorted(outputs_to_predictions):
    predictions = outputs_to_predictions[output]
    # Compute average prediction across different scales and flipped images.
    predictions = tf.reduce_mean(tf.concat(predictions, 4), axis=4)
    outputs_to_predictions[output] = tf.argmax(predictions, 3)
    outputs_to_predictions[output + PROB_SUFFIX] = tf.nn.softmax(predictions)

  return outputs_to_predictions
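
# Illustrative usage sketch (assumes `model_options` is a configured
# ModelOptions instance, e.g. from deeplab's `common.ModelOptions`):
# multi-scale inference with left-right flipped inputs averaged in.
#
#   predictions = predict_labels_multi_scale(
#       images,
#       model_options,
#       eval_scales=(0.5, 0.75, 1.0, 1.25),
#       add_flipped_images=True)
#   semantic = predictions['semantic']               # [batch, height, width]
#   semantic_probs = predictions['semantic' + PROB_SUFFIX]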


def predict_labels(images, model_options, image_pyramid=None):
  """Predicts segmentation labels.

  Args:
    images: A tensor of size [batch, height, width, channels].
    model_options: A ModelOptions instance to configure models.
    image_pyramid: Input image scales for multi-scale feature extraction.

  Returns:
    A dictionary with keys specifying the output_type (e.g., semantic
      prediction) and values storing Tensors representing predictions (argmax
      over channels). Each prediction has size [batch, height, width].
  """
  outputs_to_scales_to_logits = multi_scale_logits(
      images,
      model_options=model_options,
      image_pyramid=image_pyramid,
      is_training=False,
      fine_tune_batch_norm=False)

  predictions = {}
  for output in sorted(outputs_to_scales_to_logits):
    scales_to_logits = outputs_to_scales_to_logits[output]
    logits = scales_to_logits[MERGED_LOGITS_SCOPE]
    # There are two ways to obtain the final prediction results: (1) bilinear
    # upsampling the logits followed by argmax, or (2) argmax followed by
    # nearest neighbor upsampling. The second option may introduce the
    # "blocking effect" but is computationally efficient.
    if model_options.prediction_with_upsampled_logits:
      logits = _resize_bilinear(logits,
                                tf.shape(images)[1:3],
                                scales_to_logits[MERGED_LOGITS_SCOPE].dtype)
      predictions[output] = tf.argmax(logits, 3)
      predictions[output + PROB_SUFFIX] = tf.nn.softmax(logits)
    else:
      argmax_results = tf.argmax(logits, 3)
      argmax_results = tf.image.resize_nearest_neighbor(
          tf.expand_dims(argmax_results, 3),
          tf.shape(images)[1:3],
          align_corners=True,
          name='resize_prediction')
      predictions[output] = tf.squeeze(argmax_results, 3)
      predictions[output + PROB_SUFFIX] = tf.image.resize_bilinear(
          tf.nn.softmax(logits),
          tf.shape(images)[1:3],
          align_corners=True,
          name='resize_prob')
  return predictions
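
# Illustrative usage sketch for single-scale inference (assumes the
# conventional 'semantic' output_type key):
#
#   predictions = predict_labels(images, model_options)
#   semantic = predictions['semantic']  # argmax labels, [batch, height, width]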


def multi_scale_logits(images,
                       model_options,
                       image_pyramid,
                       weight_decay=0.0001,
                       is_training=False,
                       fine_tune_batch_norm=False,
                       nas_training_hyper_parameters=None):
  """Gets the logits for multi-scale inputs.

  The returned logits are all downsampled (due to max-pooling layers)
  for both training and evaluation.

  Args:
    images: A tensor of size [batch, height, width, channels].
    model_options: A ModelOptions instance to configure models.
    image_pyramid: Input image scales for multi-scale feature extraction.
    weight_decay: The weight decay for model variables.
    is_training: Is training or not.
    fine_tune_batch_norm: Fine-tune the batch norm parameters or not.
    nas_training_hyper_parameters: A dictionary storing hyper-parameters for
      training nas models. Its keys are:
      - `drop_path_keep_prob`: Probability to keep each path in the cell when
        training.
      - `total_training_steps`: Total training steps to help drop path
        probability calculation.

  Returns:
    outputs_to_scales_to_logits: A map of maps from output_type (e.g.,
      semantic prediction) to a dictionary of multi-scale logits names to
      logits. For each output_type, the dictionary has keys which
      correspond to the scales and values which correspond to the logits.
      For example, if `scales` equals [1.0, 1.5], then the keys would
      include 'merged_logits', 'logits_1.00' and 'logits_1.50'.

  Raises:
    ValueError: If model_options doesn't specify crop_size and its
      add_image_level_feature = True, since add_image_level_feature requires
      crop_size information.
  """
  # Setup default values.
  if not image_pyramid:
    image_pyramid = [1.0]
  crop_height = (
      model_options.crop_size[0]
      if model_options.crop_size else tf.shape(images)[1])
  crop_width = (
      model_options.crop_size[1]
      if model_options.crop_size else tf.shape(images)[2])
  if model_options.image_pooling_crop_size:
    image_pooling_crop_height = model_options.image_pooling_crop_size[0]
    image_pooling_crop_width = model_options.image_pooling_crop_size[1]

  # Compute the height, width for the output logits.
  if model_options.decoder_output_stride:
    logits_output_stride = min(model_options.decoder_output_stride)
  else:
    logits_output_stride = model_options.output_stride

  logits_height = scale_dimension(
      crop_height,
      max(1.0, max(image_pyramid)) / logits_output_stride)
  logits_width = scale_dimension(
      crop_width,
      max(1.0, max(image_pyramid)) / logits_output_stride)

  # Compute the logits for each scale in the image pyramid.
  outputs_to_scales_to_logits = {
      k: {}
      for k in model_options.outputs_to_num_classes
  }

  num_channels = images.get_shape().as_list()[-1]

  for image_scale in image_pyramid:
    if image_scale != 1.0:
      scaled_height = scale_dimension(crop_height, image_scale)
      scaled_width = scale_dimension(crop_width, image_scale)
      scaled_crop_size = [scaled_height, scaled_width]
      scaled_images = _resize_bilinear(images, scaled_crop_size, images.dtype)
      if model_options.crop_size:
        scaled_images.set_shape(
            [None, scaled_height, scaled_width, num_channels])
      # Adjust image_pooling_crop_size accordingly.
      scaled_image_pooling_crop_size = None
      if model_options.image_pooling_crop_size:
        scaled_image_pooling_crop_size = [
            scale_dimension(image_pooling_crop_height, image_scale),
            scale_dimension(image_pooling_crop_width, image_scale)]
    else:
      scaled_crop_size = model_options.crop_size
      scaled_images = images
      scaled_image_pooling_crop_size = model_options.image_pooling_crop_size

    updated_options = model_options._replace(
        crop_size=scaled_crop_size,
        image_pooling_crop_size=scaled_image_pooling_crop_size)
    outputs_to_logits = _get_logits(
        scaled_images,
        updated_options,
        weight_decay=weight_decay,
        reuse=tf.AUTO_REUSE,
        is_training=is_training,
        fine_tune_batch_norm=fine_tune_batch_norm,
        nas_training_hyper_parameters=nas_training_hyper_parameters)

    # Resize the logits to have the same dimension before merging.
    for output in sorted(outputs_to_logits):
      outputs_to_logits[output] = _resize_bilinear(
          outputs_to_logits[output], [logits_height, logits_width],
          outputs_to_logits[output].dtype)

    # Return when only one input scale.
    if len(image_pyramid) == 1:
      for output in sorted(model_options.outputs_to_num_classes):
        outputs_to_scales_to_logits[output][
            MERGED_LOGITS_SCOPE] = outputs_to_logits[output]
      return outputs_to_scales_to_logits

    # Save logits to the output map.
    for output in sorted(model_options.outputs_to_num_classes):
      outputs_to_scales_to_logits[output][
          'logits_%.2f' % image_scale] = outputs_to_logits[output]

  # Merge the logits from all the multi-scale inputs.
  for output in sorted(model_options.outputs_to_num_classes):
    # Concatenate the multi-scale logits for each output type.
    all_logits = [
        tf.expand_dims(logits, axis=4)
        for logits in outputs_to_scales_to_logits[output].values()
    ]
    all_logits = tf.concat(all_logits, 4)
    merge_fn = (
        tf.reduce_max
        if model_options.merge_method == 'max' else tf.reduce_mean)
    outputs_to_scales_to_logits[output][MERGED_LOGITS_SCOPE] = merge_fn(
        all_logits, axis=4)

  return outputs_to_scales_to_logits
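
# Illustrative example of the returned structure: with
# image_pyramid=[0.5, 1.0], each output type maps to the keys
# 'logits_0.50', 'logits_1.00' and 'merged_logits' (assuming the
# conventional 'semantic' output_type key):
#
#   outputs = multi_scale_logits(
#       images, model_options, image_pyramid=[0.5, 1.0], is_training=True)
#   merged = outputs['semantic'][MERGED_LOGITS_SCOPE]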


def extract_features(images,
                     model_options,
                     weight_decay=0.0001,
                     reuse=None,
                     is_training=False,
                     fine_tune_batch_norm=False,
                     nas_training_hyper_parameters=None):
  """Extracts features by the particular model_variant.

  Args:
    images: A tensor of size [batch, height, width, channels].
    model_options: A ModelOptions instance to configure models.
    weight_decay: The weight decay for model variables.
    reuse: Reuse the model variables or not.
    is_training: Is training or not.
    fine_tune_batch_norm: Fine-tune the batch norm parameters or not.
    nas_training_hyper_parameters: A dictionary storing hyper-parameters for
      training nas models. Its keys are:
      - `drop_path_keep_prob`: Probability to keep each path in the cell when
        training.
      - `total_training_steps`: Total training steps to help drop path
        probability calculation.

  Returns:
    concat_logits: A tensor of size [batch, feature_height, feature_width,
      feature_channels], where feature_height/feature_width are determined
      by the images height/width and output_stride.
    end_points: A dictionary from components of the network to the
      corresponding activation.
  """
  features, end_points = feature_extractor.extract_features(
      images,
      output_stride=model_options.output_stride,
      multi_grid=model_options.multi_grid,
      model_variant=model_options.model_variant,
      depth_multiplier=model_options.depth_multiplier,
      divisible_by=model_options.divisible_by,
      weight_decay=weight_decay,
      reuse=reuse,
      is_training=is_training,
      preprocessed_images_dtype=model_options.preprocessed_images_dtype,
      fine_tune_batch_norm=fine_tune_batch_norm,
      nas_architecture_options=model_options.nas_architecture_options,
      nas_training_hyper_parameters=nas_training_hyper_parameters,
      use_bounded_activation=model_options.use_bounded_activation)

  if not model_options.aspp_with_batch_norm:
    return features, end_points
  else:
    if model_options.dense_prediction_cell_config is not None:
      tf.logging.info('Using dense prediction cell config.')
      dense_prediction_layer = dense_prediction_cell.DensePredictionCell(
          config=model_options.dense_prediction_cell_config,
          hparams={
              'conv_rate_multiplier': 16 // model_options.output_stride,
          })
      concat_logits = dense_prediction_layer.build_cell(
          features,
          output_stride=model_options.output_stride,
          crop_size=model_options.crop_size,
          image_pooling_crop_size=model_options.image_pooling_crop_size,
          weight_decay=weight_decay,
          reuse=reuse,
          is_training=is_training,
          fine_tune_batch_norm=fine_tune_batch_norm)
      return concat_logits, end_points
    else:
      # The following code employs the DeepLabv3 ASPP module. Note that we
      # could express the ASPP module as one particular dense prediction
      # cell architecture. We do not do so but leave the following code
      # for backward compatibility.
      batch_norm_params = utils.get_batch_norm_params(
          decay=0.9997,
          epsilon=1e-5,
          scale=True,
          is_training=(is_training and fine_tune_batch_norm),
          sync_batch_norm_method=model_options.sync_batch_norm_method)
      batch_norm = utils.get_batch_norm_fn(
          model_options.sync_batch_norm_method)
      activation_fn = (
          tf.nn.relu6 if model_options.use_bounded_activation else tf.nn.relu)
      with slim.arg_scope(
          [slim.conv2d, slim.separable_conv2d],
          weights_regularizer=slim.l2_regularizer(weight_decay),
          activation_fn=activation_fn,
          normalizer_fn=batch_norm,
          padding='SAME',
          stride=1,
          reuse=reuse):
        with slim.arg_scope([batch_norm], **batch_norm_params):
          depth = model_options.aspp_convs_filters
          branch_logits = []

          if model_options.add_image_level_feature:
            if model_options.crop_size is not None:
              image_pooling_crop_size = model_options.image_pooling_crop_size
              # If image_pooling_crop_size is not specified, use crop_size.
              if image_pooling_crop_size is None:
                image_pooling_crop_size = model_options.crop_size
              pool_height = scale_dimension(
                  image_pooling_crop_size[0],
                  1. / model_options.output_stride)
              pool_width = scale_dimension(
                  image_pooling_crop_size[1],
                  1. / model_options.output_stride)
              image_feature = slim.avg_pool2d(
                  features, [pool_height, pool_width],
                  model_options.image_pooling_stride, padding='VALID')
              resize_height = scale_dimension(
                  model_options.crop_size[0],
                  1. / model_options.output_stride)
              resize_width = scale_dimension(
                  model_options.crop_size[1],
                  1. / model_options.output_stride)
            else:
              # If crop_size is None, we simply do global pooling.
              pool_height = tf.shape(features)[1]
              pool_width = tf.shape(features)[2]
              image_feature = tf.reduce_mean(
                  features, axis=[1, 2], keepdims=True)
              resize_height = pool_height
              resize_width = pool_width
            image_feature_activation_fn = tf.nn.relu
            image_feature_normalizer_fn = batch_norm
            if model_options.aspp_with_squeeze_and_excitation:
              image_feature_activation_fn = tf.nn.sigmoid
              if model_options.image_se_uses_qsigmoid:
                image_feature_activation_fn = utils.q_sigmoid
              image_feature_normalizer_fn = None
            image_feature = slim.conv2d(
                image_feature, depth, 1,
                activation_fn=image_feature_activation_fn,
                normalizer_fn=image_feature_normalizer_fn,
                scope=IMAGE_POOLING_SCOPE)
            image_feature = _resize_bilinear(
                image_feature,
                [resize_height, resize_width],
                image_feature.dtype)
            # Set shape for resize_height/resize_width if they are not Tensor.
            if isinstance(resize_height, tf.Tensor):
              resize_height = None
            if isinstance(resize_width, tf.Tensor):
              resize_width = None
            image_feature.set_shape([None, resize_height, resize_width, depth])
            if not model_options.aspp_with_squeeze_and_excitation:
              branch_logits.append(image_feature)

          # Employ a 1x1 convolution.
          branch_logits.append(slim.conv2d(features, depth, 1,
                                           scope=ASPP_SCOPE + str(0)))

          if model_options.atrous_rates:
            # Employ 3x3 convolutions with different atrous rates.
            for i, rate in enumerate(model_options.atrous_rates, 1):
              scope = ASPP_SCOPE + str(i)
              if model_options.aspp_with_separable_conv:
                aspp_features = split_separable_conv2d(
                    features,
                    filters=depth,
                    rate=rate,
                    weight_decay=weight_decay,
                    scope=scope)
              else:
                aspp_features = slim.conv2d(
                    features, depth, 3, rate=rate, scope=scope)
              branch_logits.append(aspp_features)

          # Merge branch logits.
          concat_logits = tf.concat(branch_logits, 3)
          if model_options.aspp_with_concat_projection:
            concat_logits = slim.conv2d(
                concat_logits, depth, 1, scope=CONCAT_PROJECTION_SCOPE)
            concat_logits = slim.dropout(
                concat_logits,
                keep_prob=0.9,
                is_training=is_training,
                scope=CONCAT_PROJECTION_SCOPE + '_dropout')
          if (model_options.add_image_level_feature and
              model_options.aspp_with_squeeze_and_excitation):
            concat_logits *= image_feature

          return concat_logits, end_points
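
# Illustrative usage sketch: `extract_features` is the backbone-plus-ASPP
# stage shared by training and inference; `_get_logits` below calls it
# before the optional decoder.
#
#   features, end_points = extract_features(
#       images, model_options, is_training=True, fine_tune_batch_norm=True)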


def _get_logits(images,
                model_options,
                weight_decay=0.0001,
                reuse=None,
                is_training=False,
                fine_tune_batch_norm=False,
                nas_training_hyper_parameters=None):
  """Gets the logits by atrous/image spatial pyramid pooling.

  Args:
    images: A tensor of size [batch, height, width, channels].
    model_options: A ModelOptions instance to configure models.
    weight_decay: The weight decay for model variables.
    reuse: Reuse the model variables or not.
    is_training: Is training or not.
    fine_tune_batch_norm: Fine-tune the batch norm parameters or not.
    nas_training_hyper_parameters: A dictionary storing hyper-parameters for
      training nas models. Its keys are:
      - `drop_path_keep_prob`: Probability to keep each path in the cell when
        training.
      - `total_training_steps`: Total training steps to help drop path
        probability calculation.

  Returns:
    outputs_to_logits: A map from output_type to logits.
  """
  features, end_points = extract_features(
      images,
      model_options,
      weight_decay=weight_decay,
      reuse=reuse,
      is_training=is_training,
      fine_tune_batch_norm=fine_tune_batch_norm,
      nas_training_hyper_parameters=nas_training_hyper_parameters)

  if model_options.decoder_output_stride:
    crop_size = model_options.crop_size
    if crop_size is None:
      crop_size = [tf.shape(images)[1], tf.shape(images)[2]]
    features = refine_by_decoder(
        features,
        end_points,
        crop_size=crop_size,
        decoder_output_stride=model_options.decoder_output_stride,
        decoder_use_separable_conv=model_options.decoder_use_separable_conv,
        decoder_use_sum_merge=model_options.decoder_use_sum_merge,
        decoder_filters=model_options.decoder_filters,
        decoder_output_is_logits=model_options.decoder_output_is_logits,
        model_variant=model_options.model_variant,
        weight_decay=weight_decay,
        reuse=reuse,
        is_training=is_training,
        fine_tune_batch_norm=fine_tune_batch_norm,
        use_bounded_activation=model_options.use_bounded_activation)

  outputs_to_logits = {}
  for output in sorted(model_options.outputs_to_num_classes):
    if model_options.decoder_output_is_logits:
      outputs_to_logits[output] = tf.identity(features,
                                              name=output)
    else:
      outputs_to_logits[output] = get_branch_logits(
          features,
          model_options.outputs_to_num_classes[output],
          model_options.atrous_rates,
          aspp_with_batch_norm=model_options.aspp_with_batch_norm,
          kernel_size=model_options.logits_kernel_size,
          weight_decay=weight_decay,
          reuse=reuse,
          scope_suffix=output)

  return outputs_to_logits


def refine_by_decoder(features,
                      end_points,
                      crop_size=None,
                      decoder_output_stride=None,
                      decoder_use_separable_conv=False,
                      decoder_use_sum_merge=False,
                      decoder_filters=256,
                      decoder_output_is_logits=False,
                      model_variant=None,
                      weight_decay=0.0001,
                      reuse=None,
                      is_training=False,
                      fine_tune_batch_norm=False,
                      use_bounded_activation=False,
                      sync_batch_norm_method='None'):
  """Adds the decoder to obtain sharper segmentation results.

  Args:
    features: A tensor of size [batch, features_height, features_width,
      features_channels].
    end_points: A dictionary from components of the network to the
      corresponding activation.
    crop_size: A tuple [crop_height, crop_width] specifying whole patch crop
      size.
    decoder_output_stride: A list of integers specifying the output stride of
      low-level features used in the decoder module.
    decoder_use_separable_conv: Employ separable convolution for decoder or
      not.
    decoder_use_sum_merge: Boolean, decoder uses simple sum merge or not.
    decoder_filters: Integer, decoder filter size.
    decoder_output_is_logits: Boolean, using decoder output as logits or not.
    model_variant: Model variant for feature extraction.
    weight_decay: The weight decay for model variables.
    reuse: Reuse the model variables or not.
    is_training: Is training or not.
    fine_tune_batch_norm: Fine-tune the batch norm parameters or not.
    use_bounded_activation: Whether or not to use bounded activations. Bounded
      activations better lend themselves to quantized inference.
    sync_batch_norm_method: String, method used to sync batch norm. Currently
      only support `None` (no sync batch norm) and `tpu` (use tpu code to
      sync batch norm).

  Returns:
    Decoder output with size [batch, decoder_height, decoder_width,
      decoder_channels].

  Raises:
    ValueError: If crop_size is None.
  """
  if crop_size is None:
    raise ValueError('crop_size must be provided when using decoder.')
  batch_norm_params = utils.get_batch_norm_params(
      decay=0.9997,
      epsilon=1e-5,
      scale=True,
      is_training=(is_training and fine_tune_batch_norm),
      sync_batch_norm_method=sync_batch_norm_method)
  batch_norm = utils.get_batch_norm_fn(sync_batch_norm_method)
  decoder_depth = decoder_filters
  projected_filters = 48
  if decoder_use_sum_merge:
    # When using sum merge, the projected filters must be equal to decoder
    # filters.
    projected_filters = decoder_filters
  if decoder_output_is_logits:
    # Overwrite the setting when decoder output is logits.
    activation_fn = None
    normalizer_fn = None
    conv2d_kernel = 1
    # Use original conv instead of separable conv.
    decoder_use_separable_conv = False
  else:
    # Default setting when decoder output is not logits.
    activation_fn = tf.nn.relu6 if use_bounded_activation else tf.nn.relu
    normalizer_fn = batch_norm
    conv2d_kernel = 3
  with slim.arg_scope(
      [slim.conv2d, slim.separable_conv2d],
      weights_regularizer=slim.l2_regularizer(weight_decay),
      activation_fn=activation_fn,
      normalizer_fn=normalizer_fn,
      padding='SAME',
      stride=1,
      reuse=reuse):
    with slim.arg_scope([batch_norm], **batch_norm_params):
      with tf.variable_scope(DECODER_SCOPE, DECODER_SCOPE, [features]):
        decoder_features = features
        decoder_stage = 0
        scope_suffix = ''
        for output_stride in decoder_output_stride:
          feature_list = feature_extractor.networks_to_feature_maps[
              model_variant][
                  feature_extractor.DECODER_END_POINTS][output_stride]
          # If only one decoder stage, we do not change the scope name in
          # order for backward compatibility.
          if decoder_stage:
            scope_suffix = '_{}'.format(decoder_stage)
          for i, name in enumerate(feature_list):
            decoder_features_list = [decoder_features]
            # MobileNet and NAS variants use different naming conventions.
            if ('mobilenet' in model_variant or
                model_variant.startswith('mnas') or
                model_variant.startswith('nas')):
              feature_name = name
            else:
              feature_name = '{}/{}'.format(
                  feature_extractor.name_scope[model_variant], name)
            decoder_features_list.append(
                slim.conv2d(
                    end_points[feature_name],
                    projected_filters,
                    1,
                    scope='feature_projection' + str(i) + scope_suffix))
            # Determine the output size.
            decoder_height = scale_dimension(crop_size[0], 1.0 / output_stride)
            decoder_width = scale_dimension(crop_size[1], 1.0 / output_stride)
            # Resize to decoder_height/decoder_width.
            for j, feature in enumerate(decoder_features_list):
              decoder_features_list[j] = _resize_bilinear(
                  feature, [decoder_height, decoder_width], feature.dtype)
              h = (None if isinstance(decoder_height, tf.Tensor)
                   else decoder_height)
              w = (None if isinstance(decoder_width, tf.Tensor)
                   else decoder_width)
              decoder_features_list[j].set_shape([None, h, w, None])
            if decoder_use_sum_merge:
              decoder_features = _decoder_with_sum_merge(
                  decoder_features_list,
                  decoder_depth,
                  conv2d_kernel=conv2d_kernel,
                  decoder_use_separable_conv=decoder_use_separable_conv,
                  weight_decay=weight_decay,
                  scope_suffix=scope_suffix)
            else:
              if not decoder_use_separable_conv:
                scope_suffix = str(i) + scope_suffix
              decoder_features = _decoder_with_concat_merge(
                  decoder_features_list,
                  decoder_depth,
                  decoder_use_separable_conv=decoder_use_separable_conv,
                  weight_decay=weight_decay,
                  scope_suffix=scope_suffix)
          decoder_stage += 1
        return decoder_features
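
# Illustrative usage sketch (assumes an 'xception_65' backbone and the
# standard single DeepLabv3+ decoder stage at output stride 4):
#
#   refined = refine_by_decoder(
#       features,
#       end_points,
#       crop_size=[513, 513],
#       decoder_output_stride=[4],
#       decoder_use_separable_conv=True,
#       model_variant='xception_65',
#       is_training=True)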


def _decoder_with_sum_merge(decoder_features_list,
                            decoder_depth,
                            conv2d_kernel=3,
                            decoder_use_separable_conv=True,
                            weight_decay=0.0001,
                            scope_suffix=''):
  """Decoder with sum to merge features.

  Args:
    decoder_features_list: A list of decoder features.
    decoder_depth: Integer, the filters used in the convolution.
    conv2d_kernel: Integer, the convolution kernel size.
    decoder_use_separable_conv: Boolean, use separable conv or not.
    weight_decay: Weight decay for the model variables.
    scope_suffix: String, used in the scope suffix.

  Returns:
    Decoder features merged with sum.

  Raises:
    RuntimeError: If decoder_features_list does not have length 2.
  """
  if len(decoder_features_list) != 2:
    raise RuntimeError('Expected decoder_features_list to have length 2.')
  # Only apply one convolution when decoder uses sum merge.
  if decoder_use_separable_conv:
    decoder_features = split_separable_conv2d(
        decoder_features_list[0],
        filters=decoder_depth,
        rate=1,
        weight_decay=weight_decay,
        scope='decoder_split_sep_conv0' + scope_suffix) + decoder_features_list[1]
  else:
    decoder_features = slim.conv2d(
        decoder_features_list[0],
        decoder_depth,
        conv2d_kernel,
        scope='decoder_conv0' + scope_suffix) + decoder_features_list[1]
  return decoder_features


def _decoder_with_concat_merge(decoder_features_list,
                               decoder_depth,
                               decoder_use_separable_conv=True,
                               weight_decay=0.0001,
                               scope_suffix=''):
  """Decoder with concatenation to merge features.

  This decoder method applies two convolutions to smooth the features obtained
  by concatenating the input decoder_features_list.

  This decoder module is proposed in the DeepLabv3+ paper.

  Args:
    decoder_features_list: A list of decoder features.
    decoder_depth: Integer, the filters used in the convolution.
    decoder_use_separable_conv: Boolean, use separable conv or not.
    weight_decay: Weight decay for the model variables.
    scope_suffix: String, used in the scope suffix.

  Returns:
    Decoder features merged with concatenation.
  """
  if decoder_use_separable_conv:
    decoder_features = split_separable_conv2d(
        tf.concat(decoder_features_list, 3),
        filters=decoder_depth,
        rate=1,
        weight_decay=weight_decay,
        scope='decoder_conv0' + scope_suffix)
    decoder_features = split_separable_conv2d(
        decoder_features,
        filters=decoder_depth,
        rate=1,
        weight_decay=weight_decay,
        scope='decoder_conv1' + scope_suffix)
  else:
    num_convs = 2
    decoder_features = slim.repeat(
        tf.concat(decoder_features_list, 3),
        num_convs,
        slim.conv2d,
        decoder_depth,
        3,
        scope='decoder_conv' + scope_suffix)
  return decoder_features


def get_branch_logits(features,
                      num_classes,
                      atrous_rates=None,
                      aspp_with_batch_norm=False,
                      kernel_size=1,
                      weight_decay=0.0001,
                      reuse=None,
                      scope_suffix=''):
  """Gets the logits from each model's branch.

  The underlying model is branched out in the last layer when atrous
  spatial pyramid pooling is employed, and all branches are sum-merged
  to form the final logits.

  Args:
    features: A float tensor of shape [batch, height, width, channels].
    num_classes: Number of classes to predict.
    atrous_rates: A list of atrous convolution rates for the last layer.
    aspp_with_batch_norm: Use batch normalization layers for ASPP.
    kernel_size: Kernel size for convolution.
    weight_decay: Weight decay for the model variables.
    reuse: Reuse model variables or not.
    scope_suffix: Scope suffix for the model variables.

  Returns:
    Merged logits with shape [batch, height, width, num_classes].

  Raises:
    ValueError: Upon invalid input kernel_size value.
  """
  # When using batch normalization with ASPP, ASPP has been applied before
  # in extract_features, and thus we simply apply 1x1 convolution here.
  if aspp_with_batch_norm or atrous_rates is None:
    if kernel_size != 1:
      raise ValueError('Kernel size must be 1 when atrous_rates is None or '
                       'using aspp_with_batch_norm. Got %d.' % kernel_size)
    atrous_rates = [1]

  with slim.arg_scope(
      [slim.conv2d],
      weights_regularizer=slim.l2_regularizer(weight_decay),
      weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
      reuse=reuse):
    with tf.variable_scope(LOGITS_SCOPE_NAME, LOGITS_SCOPE_NAME, [features]):
      branch_logits = []
      for i, rate in enumerate(atrous_rates):
        scope = scope_suffix
        if i:
          scope += '_%d' % i

        branch_logits.append(
            slim.conv2d(
                features,
                num_classes,
                kernel_size=kernel_size,
                rate=rate,
                activation_fn=None,
                normalizer_fn=None,
                scope=scope))

      return tf.add_n(branch_logits)
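
# Illustrative usage sketch: when ASPP batch norm is disabled, the logits
# branch itself applies the atrous rates and sum-merges the branches
# (the DeepLabv2-style prediction head with 3x3 kernels):
#
#   logits = get_branch_logits(
#       features,
#       num_classes=21,
#       atrous_rates=[6, 12, 18, 24],
#       aspp_with_batch_norm=False,
#       kernel_size=3)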