# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
| """Provides flags that are common to scripts. | |
| Common flags from train/vis_video.py are collected in this script. | |
| """ | |
| import tensorflow as tf | |
| from deeplab import common | |
| flags = tf.app.flags | |
flags.DEFINE_enum(
    'classification_loss', 'softmax_with_attention',
    ['softmax', 'triplet', 'softmax_with_attention'],
    'Type of loss function used for classifying pixels, can be either softmax, '
    'softmax_with_attention, or triplet.')

flags.DEFINE_integer('k_nearest_neighbors', 1,
                     'The number of nearest neighbors to use.')
flags.DEFINE_integer('embedding_dimension', 100, 'The dimension used for the '
                     'learned embedding.')
flags.DEFINE_boolean('use_softmax_feedback', True,
                     'Whether to give the softmax predictions of the last '
                     'frame as additional input to the segmentation head.')
flags.DEFINE_boolean('sample_adjacent_and_consistent_query_frames', True,
                     'If true, the query frames (all but the first frame, '
                     'which is the reference frame) will be sampled such '
                     'that they are adjacent video frames and have the same '
                     'crop coordinates and flip augmentation. Note that if '
                     'use_softmax_feedback is True, this option will '
                     'automatically be activated.')
flags.DEFINE_integer('embedding_seg_feature_dimension', 256,
                     'The dimensionality used in the segmentation head layers.')

flags.DEFINE_integer('embedding_seg_n_layers', 4,
                     'The number of layers in the segmentation head.')

flags.DEFINE_integer('embedding_seg_kernel_size', 7,
                     'The kernel size used in the segmentation head.')

flags.DEFINE_multi_integer('embedding_seg_atrous_rates', [],
                           'The atrous rates to use for the segmentation head.')

flags.DEFINE_boolean('normalize_nearest_neighbor_distances', True,
                     'Whether to normalize the nearest neighbor distances '
                     'to [0, 1] using sigmoid, scale and shift.')

flags.DEFINE_boolean('also_attend_to_previous_frame', True,
                     'Whether to also use nearest neighbor attention with '
                     'respect to the previous frame.')
flags.DEFINE_bool('use_local_previous_frame_attention', True,
                  'Whether to restrict the previous frame attention to a local '
                  'search window. Only has an effect if '
                  'also_attend_to_previous_frame is True.')
flags.DEFINE_integer('previous_frame_attention_window_size', 15,
                     'The window size used for local previous frame attention, '
                     'if use_local_previous_frame_attention is True.')

flags.DEFINE_boolean('use_first_frame_matching', True,
                     'Whether to extract features by matching to the reference '
                     'frame. This should always be true except for ablation '
                     'experiments.')
FLAGS = flags.FLAGS
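
# Example (illustrative sketch, not part of the original file): a training or
# evaluation script that imports this module reads the flag values only after
# tf.app.run() has parsed the command line. `main` below is a hypothetical
# entry point, not a function defined in this codebase:
#
#   def main(unused_argv):
#     if FLAGS.classification_loss == 'softmax_with_attention':
#       embedding_dim = FLAGS.embedding_dimension  # Defaults to 100.
#
#   if __name__ == '__main__':
#     tf.app.run(main)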
# Constants

# Perform semantic segmentation predictions.
OUTPUT_TYPE = common.OUTPUT_TYPE

# Semantic segmentation item names.
LABELS_CLASS = common.LABELS_CLASS
IMAGE = common.IMAGE
HEIGHT = common.HEIGHT
WIDTH = common.WIDTH
IMAGE_NAME = common.IMAGE_NAME
SOURCE_ID = 'source_id'
VIDEO_ID = 'video_id'
LABEL = common.LABEL
ORIGINAL_IMAGE = common.ORIGINAL_IMAGE
PRECEDING_FRAME_LABEL = 'preceding_frame_label'

# Test set name.
TEST_SET = common.TEST_SET

# Internal constants.
OBJECT_LABEL = 'object_label'
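
# Illustrative sketch (an assumption, not from the original file): the
# constants above name the keys under which the tensors of a decoded video
# sample are stored. A parsed sample could then be organized as a dictionary
# such as the following, where the right-hand sides are hypothetical
# placeholders:
#
#   sample = {
#       IMAGE: frame,                        # [height, width, 3] frame.
#       LABEL: segmentation,                 # [height, width, 1] object ids.
#       PRECEDING_FRAME_LABEL: prev_labels,  # Labels of the preceding frame.
#       VIDEO_ID: video_name,
#       IMAGE_NAME: frame_name,
#   }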

class VideoModelOptions(common.ModelOptions):
  """Internal version of immutable class to hold model options."""

  def __new__(cls,
              outputs_to_num_classes,
              crop_size=None,
              atrous_rates=None,
              output_stride=8):
    """Constructor to set default values.

    Args:
      outputs_to_num_classes: A dictionary from output type to the number of
        classes. For example, for the task of semantic segmentation with 21
        semantic classes, we would have
        outputs_to_num_classes['semantic'] = 21.
      crop_size: A tuple [crop_height, crop_width].
      atrous_rates: A list of atrous convolution rates for ASPP.
      output_stride: The ratio of input to output spatial resolution.

    Returns:
      A new VideoModelOptions instance.
    """
    self = super(VideoModelOptions, cls).__new__(
        cls,
        outputs_to_num_classes,
        crop_size,
        atrous_rates,
        output_stride)
    # Add internal flags.
    self.classification_loss = FLAGS.classification_loss
    return self
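
# Example (hedged sketch, not in the original file): constructing options for
# a 21-class semantic segmentation output, mirroring the docstring above. The
# crop size and atrous rates are made-up illustrative values:
#
#   model_options = VideoModelOptions(
#       outputs_to_num_classes={'semantic': 21},
#       crop_size=[465, 465],
#       atrous_rates=[6, 12, 18],
#       output_stride=8)
#   # model_options.classification_loss now holds FLAGS.classification_loss,
#   # e.g. 'softmax_with_attention'.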

def parse_decoder_output_stride():
  """Parses decoder output stride.

  FEELVOS assumes decoder_output_stride = 4. Thus, this function is created for
  this particular purpose.

  Returns:
    An integer specifying the decoder_output_stride.

  Raises:
    ValueError: If decoder_output_stride is None or contains more than one
      element.
  """
  if FLAGS.decoder_output_stride:
    decoder_output_stride = [
        int(x) for x in FLAGS.decoder_output_stride]
    if len(decoder_output_stride) != 1:
      raise ValueError('Expected decoder_output_stride to have exactly one '
                       'element.')
    decoder_output_stride = decoder_output_stride[0]
  else:
    raise ValueError('Expected flag decoder_output_stride not to be None.')
  return decoder_output_stride
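
# Example (hedged sketch, not in the original file): FEELVOS expects the
# decoder_output_stride flag, defined by the imported deeplab.common module,
# to hold exactly one value, typically 4:
#
#   FLAGS.decoder_output_stride = [4]
#   parse_decoder_output_stride()  # Returns 4.
#
#   FLAGS.decoder_output_stride = None
#   parse_decoder_output_stride()  # Raises ValueError.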