# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Layers for VatxtModel."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# Dependency imports

from six.moves import xrange
import tensorflow as tf

K = tf.keras


def cl_logits_subgraph(layer_sizes, input_size, num_classes, keep_prob=1.):
  """Construct multiple ReLU layers with dropout and a linear layer."""
  subgraph = K.models.Sequential(name='cl_logits')
  for i, layer_size in enumerate(layer_sizes):
    if i == 0:
      subgraph.add(
          K.layers.Dense(layer_size, activation='relu', input_dim=input_size))
    else:
      subgraph.add(K.layers.Dense(layer_size, activation='relu'))

    if keep_prob < 1.:
      subgraph.add(K.layers.Dropout(1. - keep_prob))
  subgraph.add(K.layers.Dense(1 if num_classes == 2 else num_classes))
  return subgraph
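

# Illustrative usage sketch (not part of the original module): building the
# classifier head for a hypothetical binary task. The layer sizes, input size,
# and the `lstm_features` tensor are assumptions for the example only.
#
#   cl_logits = cl_logits_subgraph(
#       layer_sizes=[30], input_size=1024, num_classes=2, keep_prob=0.5)
#   # Dense layers apply to the last dimension, so features shaped
#   # (batch_size, timesteps, 1024) yield logits shaped
#   # (batch_size, timesteps, 1) in the binary case.
#   logits = cl_logits(lstm_features)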


class Embedding(K.layers.Layer):
  """Embedding layer with frequency-based normalization and dropout."""

  def __init__(self,
               vocab_size,
               embedding_dim,
               normalize=False,
               vocab_freqs=None,
               keep_prob=1.,
               **kwargs):
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.normalized = normalize
    self.keep_prob = keep_prob

    if normalize:
      assert vocab_freqs is not None
      self.vocab_freqs = tf.constant(
          vocab_freqs, dtype=tf.float32, shape=(vocab_size, 1))

    super(Embedding, self).__init__(**kwargs)

  def build(self, input_shape):
    with tf.device('/cpu:0'):
      self.var = self.add_weight(
          shape=(self.vocab_size, self.embedding_dim),
          initializer=tf.random_uniform_initializer(-1., 1.),
          name='embedding',
          dtype=tf.float32)

    if self.normalized:
      self.var = self._normalize(self.var)

    super(Embedding, self).build(input_shape)

  def call(self, x):
    embedded = tf.nn.embedding_lookup(self.var, x)
    if self.keep_prob < 1.:
      shape = embedded.get_shape().as_list()

      # Use the same dropout mask at each timestep by specifying noise_shape.
      # This slightly improves performance.
      # Please see https://arxiv.org/abs/1512.05287 for the theoretical
      # explanation.
      embedded = tf.nn.dropout(
          embedded, self.keep_prob, noise_shape=(shape[0], 1, shape[2]))
    return embedded

  def _normalize(self, emb):
    weights = self.vocab_freqs / tf.reduce_sum(self.vocab_freqs)
    mean = tf.reduce_sum(weights * emb, 0, keep_dims=True)
    var = tf.reduce_sum(weights * tf.pow(emb - mean, 2.), 0, keep_dims=True)
    stddev = tf.sqrt(1e-6 + var)
    return (emb - mean) / stddev
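

# Illustrative usage sketch (not part of the original module): embedding a
# batch of token ids with frequency-weighted normalization. The vocabulary
# size, dimensions, and the `freqs` list of per-token counts are hypothetical.
#
#   embedding_layer = Embedding(
#       vocab_size=50000, embedding_dim=256, normalize=True,
#       vocab_freqs=freqs, keep_prob=0.5)
#   token_ids = tf.placeholder(tf.int32, shape=(64, 400))
#   embedded = embedding_layer(token_ids)  # shape (64, 400, 256)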


class LSTM(object):
  """LSTM layer using dynamic_rnn.

  Exposes variables in `trainable_weights` property.
  """

  def __init__(self, cell_size, num_layers=1, keep_prob=1., name='LSTM'):
    self.cell_size = cell_size
    self.num_layers = num_layers
    self.keep_prob = keep_prob
    self.reuse = None
    self.trainable_weights = None
    self.name = name

  def __call__(self, x, initial_state, seq_length):
    with tf.variable_scope(self.name, reuse=self.reuse) as vs:
      cell = tf.contrib.rnn.MultiRNNCell([
          tf.contrib.rnn.BasicLSTMCell(
              self.cell_size,
              forget_bias=0.0,
              reuse=tf.get_variable_scope().reuse)
          for _ in xrange(self.num_layers)
      ])

      # shape(x) = (batch_size, num_timesteps, embedding_dim)
      lstm_out, next_state = tf.nn.dynamic_rnn(
          cell, x, initial_state=initial_state, sequence_length=seq_length)
      # shape(lstm_out) = (batch_size, timesteps, cell_size)

      if self.keep_prob < 1.:
        lstm_out = tf.nn.dropout(lstm_out, self.keep_prob)

      if self.reuse is None:
        self.trainable_weights = vs.global_variables()

    self.reuse = True

    return lstm_out, next_state
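

# Illustrative usage sketch (not part of the original module): running the
# wrapper over an embedded batch. The sizes, the `embedded` tensor, and the
# way the initial state is built here are assumptions; `initial_state` must
# match the MultiRNNCell state structure (a tuple of LSTMStateTuples).
#
#   lstm = LSTM(cell_size=1024, num_layers=1, keep_prob=0.5)
#   state_cell = tf.contrib.rnn.MultiRNNCell(
#       [tf.contrib.rnn.BasicLSTMCell(1024, forget_bias=0.0)])
#   initial_state = state_cell.zero_state(64, tf.float32)
#   seq_length = tf.fill([64], 400)
#   lstm_out, next_state = lstm(embedded, initial_state, seq_length)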


class SoftmaxLoss(K.layers.Layer):
  """Softmax xentropy loss with candidate sampling."""

  def __init__(self,
               vocab_size,
               num_candidate_samples=-1,
               vocab_freqs=None,
               **kwargs):
    self.vocab_size = vocab_size
    self.num_candidate_samples = num_candidate_samples
    self.vocab_freqs = vocab_freqs
    super(SoftmaxLoss, self).__init__(**kwargs)
    self.multiclass_dense_layer = K.layers.Dense(self.vocab_size)

  def build(self, input_shape):
    input_shape = input_shape[0].as_list()
    with tf.device('/cpu:0'):
      self.lin_w = self.add_weight(
          shape=(input_shape[-1], self.vocab_size),
          name='lm_lin_w',
          initializer=K.initializers.glorot_uniform())
      self.lin_b = self.add_weight(
          shape=(self.vocab_size,),
          name='lm_lin_b',
          initializer=K.initializers.glorot_uniform())
      self.multiclass_dense_layer.build(input_shape)

    super(SoftmaxLoss, self).build(input_shape)

  def call(self, inputs):
    x, labels, weights = inputs
    if self.num_candidate_samples > -1:
      assert self.vocab_freqs is not None
      labels_reshaped = tf.reshape(labels, [-1])
      labels_reshaped = tf.expand_dims(labels_reshaped, -1)
      sampled = tf.nn.fixed_unigram_candidate_sampler(
          true_classes=labels_reshaped,
          num_true=1,
          num_sampled=self.num_candidate_samples,
          unique=True,
          range_max=self.vocab_size,
          unigrams=self.vocab_freqs)
      inputs_reshaped = tf.reshape(x, [-1, int(x.get_shape()[2])])

      lm_loss = tf.nn.sampled_softmax_loss(
          weights=tf.transpose(self.lin_w),
          biases=self.lin_b,
          labels=labels_reshaped,
          inputs=inputs_reshaped,
          num_sampled=self.num_candidate_samples,
          num_classes=self.vocab_size,
          sampled_values=sampled)
      lm_loss = tf.reshape(
          lm_loss,
          [int(x.get_shape()[0]), int(x.get_shape()[1])])
    else:
      logits = self.multiclass_dense_layer(x)
      lm_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
          logits=logits, labels=labels)

    lm_loss = tf.identity(
        tf.reduce_sum(lm_loss * weights) / _num_labels(weights),
        name='lm_xentropy_loss')
    return lm_loss
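

# Illustrative usage sketch (not part of the original module): language-model
# loss over LSTM outputs with candidate sampling. `vocab_freqs`, the shapes,
# and the tensor names are hypothetical.
#
#   lm_loss_layer = SoftmaxLoss(
#       vocab_size=50000, num_candidate_samples=1024, vocab_freqs=vocab_freqs)
#   # shape(lstm_out) = (batch_size, timesteps, cell_size)
#   # shape(next_tokens) = shape(weights) = (batch_size, timesteps)
#   lm_loss = lm_loss_layer([lstm_out, next_tokens, weights])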


def classification_loss(logits, labels, weights):
  """Computes cross entropy loss between logits and labels.

  Args:
    logits: 2-D [timesteps*batch_size, m] float tensor, where m=1 if
      num_classes=2, otherwise m=num_classes.
    labels: 1-D [timesteps*batch_size] integer tensor.
    weights: 1-D [timesteps*batch_size] float tensor.

  Returns:
    Loss scalar of type float.
  """
  inner_dim = logits.get_shape().as_list()[-1]
  with tf.name_scope('classifier_loss'):
    # Logistic loss
    if inner_dim == 1:
      loss = tf.nn.sigmoid_cross_entropy_with_logits(
          logits=tf.squeeze(logits, -1), labels=tf.cast(labels, tf.float32))
    # Softmax loss
    else:
      loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
          logits=logits, labels=labels)

    num_lab = _num_labels(weights)
    tf.summary.scalar('num_labels', num_lab)
    return tf.identity(
        tf.reduce_sum(weights * loss) / num_lab,
        name='classification_xentropy')
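

# Illustrative usage sketch (not part of the original module): classifier loss
# on top of the logits subgraph. The tensor names are hypothetical; `labels`
# is an integer tensor and `weights` masks out unlabeled positions.
#
#   logits = cl_logits(lstm_out)
#   cl_loss = classification_loss(logits, labels, weights)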


def accuracy(logits, targets, weights):
  """Computes prediction accuracy.

  Args:
    logits: 2-D classifier logits [timesteps*batch_size, num_classes]
    targets: 1-D [timesteps*batch_size] integer tensor.
    weights: 1-D [timesteps*batch_size] float tensor.

  Returns:
    Accuracy: float scalar.
  """
  with tf.name_scope('accuracy'):
    eq = tf.cast(tf.equal(predictions(logits), targets), tf.float32)
    return tf.identity(
        tf.reduce_sum(weights * eq) / _num_labels(weights), name='accuracy')


def predictions(logits):
  """Class prediction from logits."""
  inner_dim = logits.get_shape().as_list()[-1]
  with tf.name_scope('predictions'):
    # For binary classification
    if inner_dim == 1:
      pred = tf.cast(tf.greater(tf.squeeze(logits, -1), 0.), tf.int64)
    # For multi-class classification
    else:
      pred = tf.argmax(logits, 2)
    return pred
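

# Illustrative usage sketch (not part of the original module): evaluation from
# the same classifier logits. The tensor names are hypothetical; `predictions`
# covers both the single-logit binary case and the multi-class case.
#
#   preds = predictions(logits)
#   acc = accuracy(logits, targets, weights)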


def _num_labels(weights):
  """Number of 1's in weights. Returns 1. if 0."""
  num_labels = tf.reduce_sum(weights)
  num_labels = tf.where(tf.equal(num_labels, 0.), 1., num_labels)
  return num_labels


def optimize(loss,
             global_step,
             max_grad_norm,
             lr,
             lr_decay,
             sync_replicas=False,
             replicas_to_aggregate=1,
             task_id=0):
  """Builds optimization graph.

  * Creates an optimizer, and optionally wraps with SyncReplicasOptimizer
  * Computes, clips, and applies gradients
  * Maintains moving averages for all trainable variables
  * Summarizes variables and gradients

  Args:
    loss: scalar loss to minimize.
    global_step: integer scalar Variable.
    max_grad_norm: float scalar. Grads will be clipped to this value.
    lr: float scalar, learning rate.
    lr_decay: float scalar, learning rate decay rate.
    sync_replicas: bool, whether to use SyncReplicasOptimizer.
    replicas_to_aggregate: int, number of replicas to aggregate when using
      SyncReplicasOptimizer.
    task_id: int, id of the current task; used to ensure proper initialization
      of SyncReplicasOptimizer.

  Returns:
    train_op
  """
  with tf.name_scope('optimization'):
    # Compute gradients.
    tvars = tf.trainable_variables()
    grads = tf.gradients(
        loss,
        tvars,
        aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)

    # Clip non-embedding grads
    non_embedding_grads_and_vars = [(g, v) for (g, v) in zip(grads, tvars)
                                    if 'embedding' not in v.op.name]
    embedding_grads_and_vars = [(g, v) for (g, v) in zip(grads, tvars)
                                if 'embedding' in v.op.name]

    ne_grads, ne_vars = zip(*non_embedding_grads_and_vars)
    ne_grads, _ = tf.clip_by_global_norm(ne_grads, max_grad_norm)
    non_embedding_grads_and_vars = zip(ne_grads, ne_vars)

    grads_and_vars = embedding_grads_and_vars + list(
        non_embedding_grads_and_vars)

    # Summarize
    _summarize_vars_and_grads(grads_and_vars)

    # Decaying learning rate
    lr = tf.train.exponential_decay(
        lr, global_step, 1, lr_decay, staircase=True)
    tf.summary.scalar('learning_rate', lr)
    opt = tf.train.AdamOptimizer(lr)

    # Track the moving averages of all trainable variables.
    variable_averages = tf.train.ExponentialMovingAverage(0.999, global_step)

    # Apply gradients
    if sync_replicas:
      opt = tf.train.SyncReplicasOptimizer(
          opt,
          replicas_to_aggregate,
          variable_averages=variable_averages,
          variables_to_average=tvars,
          total_num_replicas=replicas_to_aggregate)
      apply_gradient_op = opt.apply_gradients(
          grads_and_vars, global_step=global_step)
      with tf.control_dependencies([apply_gradient_op]):
        train_op = tf.no_op(name='train_op')

      # Initialization ops
      tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                           opt.get_chief_queue_runner())
      if task_id == 0:  # Chief task
        local_init_op = opt.chief_init_op
        tf.add_to_collection('chief_init_op', opt.get_init_tokens_op())
      else:
        local_init_op = opt.local_step_init_op
      tf.add_to_collection('local_init_op', local_init_op)
      tf.add_to_collection('ready_for_local_init_op',
                           opt.ready_for_local_init_op)
    else:
      # Non-sync optimizer
      apply_gradient_op = opt.apply_gradients(grads_and_vars, global_step)
      with tf.control_dependencies([apply_gradient_op]):
        train_op = variable_averages.apply(tvars)

  return train_op
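

# Illustrative usage sketch (not part of the original module): building the
# training op from a combined loss. The hyperparameter values and the
# `total_loss` tensor are hypothetical.
#
#   global_step = tf.train.get_or_create_global_step()
#   train_op = optimize(
#       loss=total_loss, global_step=global_step, max_grad_norm=1.0,
#       lr=0.001, lr_decay=0.9999)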


def _summarize_vars_and_grads(grads_and_vars):
  tf.logging.info('Trainable variables:')
  tf.logging.info('-' * 60)
  for grad, var in grads_and_vars:
    tf.logging.info(var)

    def tag(name, v=var):
      return v.op.name + '_' + name

    # Variable summary
    mean = tf.reduce_mean(var)
    tf.summary.scalar(tag('mean'), mean)
    with tf.name_scope(tag('stddev')):
      stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
    tf.summary.scalar(tag('stddev'), stddev)
    tf.summary.scalar(tag('max'), tf.reduce_max(var))
    tf.summary.scalar(tag('min'), tf.reduce_min(var))
    tf.summary.histogram(tag('histogram'), var)

    # Gradient summary
    if grad is not None:
      if isinstance(grad, tf.IndexedSlices):
        grad_values = grad.values
      else:
        grad_values = grad

      tf.summary.histogram(tag('gradient'), grad_values)
      tf.summary.scalar(tag('gradient_norm'), tf.global_norm([grad_values]))
    else:
      tf.logging.info('Var %s has no gradient', var.op.name)