# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ==============================================================================
import numpy as np
import tensorflow as tf

from utils import linear, log_sum_exp
class Poisson(object):
  """Poisson distribution.

  Computes the log probability under the model.
  """

  def __init__(self, log_rates):
    """Create Poisson distributions with log_rates parameters.

    Args:
      log_rates: a tensor-like list of log rates underlying the Poisson dist.
    """
    self.logr = log_rates

  def logp(self, bin_counts):
    """Compute the log probability for the counts in the bin, under the model.

    Args:
      bin_counts: array-like integer counts

    Returns:
      The log-probability under the Poisson models for each element of
      bin_counts.
    """
    k = tf.to_float(bin_counts)
    # log poisson(k, r) = log(r^k * e^(-r) / k!) = k log(r) - r - log k!
    # log poisson(k, r=exp(x)) = k * x - exp(x) - lgamma(k + 1)
    return k * self.logr - tf.exp(self.logr) - tf.lgamma(k + 1)
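
# Example (sketch, not part of the library): evaluating Poisson.logp on a small
# batch of spike counts in TF1 graph mode. The counts and rates below are made
# up for illustration.
#
#   counts = tf.constant([[0., 1., 3.], [2., 0., 5.]])
#   log_rates = tf.log(tf.constant([[1.0, 2.0, 4.0], [1.0, 2.0, 4.0]]))
#   log_probs = Poisson(log_rates).logp(counts)   # same shape as counts
#   with tf.Session() as sess:
#     print(sess.run(log_probs))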
def diag_gaussian_log_likelihood(z, mu=0.0, logvar=0.0):
  """Log-likelihood under a Gaussian distribution with diagonal covariance.

  Returns the log-likelihood for each dimension.  One should sum the
  results for the log-likelihood under the full multidimensional model.

  Args:
    z: The value to compute the log-likelihood for.
    mu: The mean of the Gaussian.
    logvar: The log variance of the Gaussian.

  Returns:
    The log-likelihood under the Gaussian model.
  """
  return -0.5 * (logvar + np.log(2 * np.pi) +
                 tf.square((z - mu) / tf.exp(0.5 * logvar)))
def gaussian_pos_log_likelihood(unused_mean, logvar, noise):
  """Gaussian log-likelihood function for a posterior in a VAE.

  Note: This function is specialized for a posterior distribution that has the
  form of z = mean + sigma * noise.

  Args:
    unused_mean: ignore
    logvar: The log variance of the distribution.
    noise: The noise used in the sampling of the posterior.

  Returns:
    The log-likelihood under the Gaussian model.
  """
  # ln N(z; mean, sigma) = - ln(sigma) - 0.5 ln 2pi - noise^2 / 2
  return -0.5 * (logvar + np.log(2 * np.pi) + tf.square(noise))
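
# Sanity-check sketch (not part of the library): for a reparameterized sample
# z = mean + exp(0.5 * logvar) * noise, the two functions above should agree,
# since (z - mean) / exp(0.5 * logvar) reduces back to noise. Values are
# arbitrary.
#
#   mean = tf.constant([0.5, -1.0])
#   logvar = tf.constant([0.0, 0.7])
#   noise = tf.random_normal(tf.shape(logvar))
#   z = mean + tf.exp(0.5 * logvar) * noise
#   ll_a = diag_gaussian_log_likelihood(z, mean, logvar)
#   ll_b = gaussian_pos_log_likelihood(mean, logvar, noise)
#   # ll_a and ll_b match up to floating-point error.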
class Gaussian(object):
  """Base class for Gaussian distribution classes."""
  pass
class DiagonalGaussian(Gaussian):
  """Diagonal Gaussian with different constant mean and variances in each
  dimension.
  """

  def __init__(self, batch_size, z_size, mean, logvar):
    """Create a diagonal Gaussian distribution.

    Args:
      batch_size: The size of the batch, i.e. 0th dim in 2D tensor of samples.
      z_size: The dimension of the distribution, i.e. 1st dim in 2D tensor.
      mean: The N-D mean of the distribution.
      logvar: The N-D log variance of the diagonal distribution.
    """
    size__xz = [None, z_size]
    self.mean = mean            # bxn already
    self.logvar = logvar        # bxn already
    self.noise = noise = tf.random_normal(tf.shape(logvar))
    self.sample = mean + tf.exp(0.5 * logvar) * noise
    mean.set_shape(size__xz)
    logvar.set_shape(size__xz)
    self.sample.set_shape(size__xz)
  def logp(self, z=None):
    """Compute the log-likelihood under the distribution.

    Args:
      z (optional): value to compute likelihood for, if None, use sample.

    Returns:
      The log-likelihood of z under the model.
    """
    if z is None:
      z = self.sample

    # This is needed to make sure that the gradients are simple.
    # The value of the function shouldn't change.
    if z == self.sample:
      return gaussian_pos_log_likelihood(self.mean, self.logvar, self.noise)

    return diag_gaussian_log_likelihood(z, self.mean, self.logvar)
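
# Usage sketch (illustrative only): wrapping externally computed means and
# log-variances in a DiagonalGaussian and scoring its own reparameterized
# sample. Shapes are placeholders.
#
#   mean_bxn = tf.zeros([16, 8])
#   logvar_bxn = tf.zeros([16, 8])
#   dist = DiagonalGaussian(batch_size=16, z_size=8,
#                           mean=mean_bxn, logvar=logvar_bxn)
#   ll_bxn = dist.logp()                  # per-dimension log-likelihood
#   ll_b = tf.reduce_sum(ll_bxn, [1])     # sum over dimensions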
class LearnableDiagonalGaussian(Gaussian):
  """Diagonal Gaussian whose mean and variance are learned parameters."""

  def __init__(self, batch_size, z_size, name, mean_init=0.0,
               var_init=1.0, var_min=0.0, var_max=1000000.0):
    """Create a learnable diagonal Gaussian distribution.

    Args:
      batch_size: The size of the batch, i.e. 0th dim in 2D tensor of samples.
      z_size: The dimension of the distribution, i.e. 1st dim in 2D tensor.
      name: prefix name for the mean and log-variance TF variables.
      mean_init (optional): The N-D mean initialization of the distribution.
      var_init (optional): The N-D variance initialization of the diagonal
        distribution.
      var_min (optional): The minimum value the learned variance can take in
        any dimension.
      var_max (optional): The maximum value the learned variance can take in
        any dimension.
    """
    size_1xn = [1, z_size]
    size__xn = [None, z_size]
    size_bx1 = tf.stack([batch_size, 1])
    assert var_init > 0.0, "Problems"
    assert var_max >= var_min, "Problems"
    assert var_init >= var_min, "Problems"
    assert var_max >= var_init, "Problems"

    z_mean_1xn = tf.get_variable(name=name+"/mean", shape=size_1xn,
                                 initializer=tf.constant_initializer(mean_init))
    self.mean_bxn = mean_bxn = tf.tile(z_mean_1xn, size_bx1)
    mean_bxn.set_shape(size__xn)  # tile loses shape

    log_var_init = np.log(var_init)
    if var_max > var_min:
      var_is_trainable = True
    else:
      var_is_trainable = False

    z_logvar_1xn = \
        tf.get_variable(name=(name+"/logvar"), shape=size_1xn,
                        initializer=tf.constant_initializer(log_var_init),
                        trainable=var_is_trainable)

    if var_is_trainable:
      z_logit_var_1xn = tf.exp(z_logvar_1xn)
      z_var_1xn = tf.nn.sigmoid(z_logit_var_1xn) * (var_max - var_min) + var_min
      z_logvar_1xn = tf.log(z_var_1xn)

    logvar_bxn = tf.tile(z_logvar_1xn, size_bx1)
    self.logvar_bxn = logvar_bxn
    self.noise_bxn = noise_bxn = tf.random_normal(tf.shape(logvar_bxn))
    self.sample_bxn = mean_bxn + tf.exp(0.5 * logvar_bxn) * noise_bxn
  def logp(self, z=None):
    """Compute the log-likelihood under the distribution.

    Args:
      z (optional): value to compute likelihood for, if None, use sample.

    Returns:
      The log-likelihood of z under the model.
    """
    if z is None:
      z = self.sample

    # This is needed to make sure that the gradients are simple.
    # The value of the function shouldn't change.
    if z == self.sample_bxn:
      return gaussian_pos_log_likelihood(self.mean_bxn, self.logvar_bxn,
                                         self.noise_bxn)

    return diag_gaussian_log_likelihood(z, self.mean_bxn, self.logvar_bxn)
  @property
  def mean(self):
    return self.mean_bxn

  @property
  def logvar(self):
    return self.logvar_bxn

  @property
  def sample(self):
    return self.sample_bxn
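
# Usage sketch (illustrative, not part of the library): a learnable prior over
# an 8-dimensional code, built inside a variable scope. The hyperparameter
# values below are placeholders, not recommended settings.
#
#   with tf.variable_scope("prior"):
#     prior = LearnableDiagonalGaussian(batch_size=16, z_size=8, name="z0",
#                                       mean_init=0.0, var_init=0.1,
#                                       var_min=0.01, var_max=10.0)
#   prior_logp_bxn = prior.logp(prior.sample)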
class DiagonalGaussianFromInput(Gaussian):
  """Diagonal Gaussian whose mean and variance are conditioned on other
  variables.

  Note: the parameters to convert from input to the learned mean and log
  variance are held in this class.
  """

  def __init__(self, x_bxu, z_size, name, var_min=0.0):
    """Create an input-dependent diagonal Gaussian distribution.

    Args:
      x_bxu: The input tensor from which the mean and variance are computed,
        via a linear transformation of x.  I.e.
          mu = Wx + b, log(var) = Mx + c
      z_size: The size of the distribution.
      name: The name to prefix to learned variables.
      var_min (optional): Minimal variance allowed.  This is an additional
        way to control the amount of information getting through the
        stochastic layer.
    """
    size_bxn = tf.stack([tf.shape(x_bxu)[0], z_size])
    self.mean_bxn = mean_bxn = linear(x_bxu, z_size, name=(name+"/mean"))
    logvar_bxn = linear(x_bxu, z_size, name=(name+"/logvar"))
    if var_min > 0.0:
      logvar_bxn = tf.log(tf.exp(logvar_bxn) + var_min)
    self.logvar_bxn = logvar_bxn

    self.noise_bxn = noise_bxn = tf.random_normal(size_bxn)
    self.noise_bxn.set_shape([None, z_size])
    self.sample_bxn = mean_bxn + tf.exp(0.5 * logvar_bxn) * noise_bxn
  def logp(self, z=None):
    """Compute the log-likelihood under the distribution.

    Args:
      z (optional): value to compute likelihood for, if None, use sample.

    Returns:
      The log-likelihood of z under the model.
    """
    if z is None:
      z = self.sample

    # This is needed to make sure that the gradients are simple.
    # The value of the function shouldn't change.
    if z == self.sample_bxn:
      return gaussian_pos_log_likelihood(self.mean_bxn,
                                         self.logvar_bxn, self.noise_bxn)

    return diag_gaussian_log_likelihood(z, self.mean_bxn, self.logvar_bxn)
  @property
  def mean(self):
    return self.mean_bxn

  @property
  def logvar(self):
    return self.logvar_bxn

  @property
  def sample(self):
    return self.sample_bxn
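
# Usage sketch (illustrative only): an approximate posterior whose mean and
# log-variance are linear readouts of an encoder output. `linear` is the
# helper imported from utils above; shapes are placeholders.
#
#   enc_bxu = tf.zeros([16, 32])                 # stand-in for encoder output
#   posterior = DiagonalGaussianFromInput(enc_bxu, z_size=8,
#                                         name="posterior_z", var_min=1e-4)
#   z_bxn = posterior.sample                     # reparameterized sample
#   logq_bxn = posterior.logp(z_bxn)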
class GaussianProcess:
  """Base class for Gaussian processes."""
  pass
class LearnableAutoRegressive1Prior(GaussianProcess):
  """AR(1) model where autocorrelation and process variance are learned
  parameters.  Assumed zero mean.
  """

  def __init__(self, batch_size, z_size,
               autocorrelation_taus, noise_variances,
               do_train_prior_ar_atau, do_train_prior_ar_nvar,
               num_steps, name):
    """Create a learnable autoregressive (1) process.

    Args:
      batch_size: The size of the batch, i.e. 0th dim in 2D tensor of samples.
      z_size: The dimension of the distribution, i.e. 1st dim in 2D tensor.
      autocorrelation_taus: The autocorrelation time constant of the AR(1)
        process.  A value of 0 is uncorrelated gaussian noise.
      noise_variances: The variance of the additive noise, *not* the process
        variance.
      do_train_prior_ar_atau: Train or leave as constant, the autocorrelation?
      do_train_prior_ar_nvar: Train or leave as constant, the noise variance?
      num_steps: Number of steps to run the process.
      name: The name to prefix to learned TF variables.
    """

    # Note the use of the plural in all of these quantities.  This is intended
    # to mark that even though a sample z_t from the posterior is thought of as
    # a single sample of a multidimensional gaussian, the prior is actually
    # thought of as U AR(1) processes, where U is the dimension of the inferred
    # input.
    size_bx1 = tf.stack([batch_size, 1])
    size__xu = [None, z_size]
    # process variance, the variance at time t over all instantiations of AR(1)
    # with these parameters.
    log_evar_inits_1xu = tf.expand_dims(tf.log(noise_variances), 0)
    self.logevars_1xu = logevars_1xu = \
        tf.Variable(log_evar_inits_1xu, name=name+"/logevars", dtype=tf.float32,
                    trainable=do_train_prior_ar_nvar)
    self.logevars_bxu = logevars_bxu = tf.tile(logevars_1xu, size_bx1)
    logevars_bxu.set_shape(size__xu)  # tile loses shape

    # \tau, which is the autocorrelation time constant of the AR(1) process
    log_atau_inits_1xu = tf.expand_dims(tf.log(autocorrelation_taus), 0)
    self.logataus_1xu = logataus_1xu = \
        tf.Variable(log_atau_inits_1xu, name=name+"/logatau", dtype=tf.float32,
                    trainable=do_train_prior_ar_atau)

    # phi in x_t = \mu + phi x_tm1 + \eps
    # phi = exp(-1/tau)
    # phi = exp(-1/exp(logtau))
    # phi = exp(-exp(-logtau))
    phis_1xu = tf.exp(-tf.exp(-logataus_1xu))
    self.phis_bxu = phis_bxu = tf.tile(phis_1xu, size_bx1)
    phis_bxu.set_shape(size__xu)

    # process noise
    # pvar = evar / (1 - phi^2)
    # logpvar = log( exp(logevar) / (1 - phi^2) )
    # logpvar = logevar - log(1 - phi^2)
    # logpvar = logevar - (log(1 - phi) + log(1 + phi))
    self.logpvars_1xu = \
        logevars_1xu - tf.log(1.0 - phis_1xu) - tf.log(1.0 + phis_1xu)
    self.logpvars_bxu = logpvars_bxu = tf.tile(self.logpvars_1xu, size_bx1)
    logpvars_bxu.set_shape(size__xu)

    # process mean (zero but included for completeness)
    self.pmeans_bxu = pmeans_bxu = tf.zeros_like(phis_bxu)

    # For sampling from the prior during de-novo generation.
    self.means_t = means_t = [None] * num_steps
    self.logvars_t = logvars_t = [None] * num_steps
    self.samples_t = samples_t = [None] * num_steps
    self.gaussians_t = gaussians_t = [None] * num_steps
    sample_bxu = tf.zeros_like(phis_bxu)
    for t in range(num_steps):
      # process variance used here to make process completely stationary
      if t == 0:
        logvar_pt_bxu = self.logpvars_bxu
      else:
        logvar_pt_bxu = self.logevars_bxu

      z_mean_pt_bxu = pmeans_bxu + phis_bxu * sample_bxu
      gaussians_t[t] = DiagonalGaussian(batch_size, z_size,
                                        mean=z_mean_pt_bxu,
                                        logvar=logvar_pt_bxu)
      sample_bxu = gaussians_t[t].sample
      samples_t[t] = sample_bxu
      logvars_t[t] = logvar_pt_bxu
      means_t[t] = z_mean_pt_bxu
  def logp_t(self, z_t_bxu, z_tm1_bxu=None):
    """Compute the log-likelihood under the distribution for a given time t,
    not the whole sequence.

    Args:
      z_t_bxu: sample to compute likelihood for at time t.
      z_tm1_bxu (optional): sample to condition the probability of z_t on.

    Returns:
      The log-likelihood of z_t under the model at time t, i.e.
        p(z_t | z_tm1_bxu) = N(phis * z_tm1_bxu, evars)
    """
    if z_tm1_bxu is None:
      return diag_gaussian_log_likelihood(z_t_bxu, self.pmeans_bxu,
                                          self.logpvars_bxu)
    else:
      means_t_bxu = self.pmeans_bxu + self.phis_bxu * z_tm1_bxu
      logp_tgtm1_bxu = diag_gaussian_log_likelihood(z_t_bxu,
                                                    means_t_bxu,
                                                    self.logevars_bxu)
      return logp_tgtm1_bxu
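
# Worked example (illustrative numbers): with autocorrelation tau = 10 steps
# and noise variance evar = 0.1, the quantities built in __init__ are roughly
#   phi  = exp(-1/tau)        = exp(-0.1)          ~= 0.905
#   pvar = evar / (1 - phi^2) = 0.1 / (1 - 0.819)  ~= 0.552
# so the t=0 marginal uses the stationary variance pvar, while every later
# step conditions on z_{t-1} and uses the noise variance evar.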
class KLCost_GaussianGaussian(object):
  """log p(x|z) + KL(q||p) terms for Gaussian posterior and Gaussian prior.
  See eqn 10 and Appendix B in the VAE paper for the latter term,
  http://arxiv.org/abs/1312.6114

  The log p(x|z) term is the reconstruction error under the model.
  The KL term represents the penalty for passing information from the encoder
  to the decoder.
  To sample KL(q||p), we simply sample
        ln q - ln p
  by drawing samples from q and averaging.
  """

  def __init__(self, zs, prior_zs):
    """Create a lower bound in three parts, normalized reconstruction
    cost, normalized KL divergence cost, and their sum.

    E_q[ln p(z_i | z_{i+1}) / q(z_i | x)]
       \int q(z) ln p(z) dz = - 0.5 ln(2pi) - 0.5 \sum (ln(sigma_p^2) + \
          sigma_q^2 / sigma_p^2 + (mean_p - mean_q)^2 / sigma_p^2)

       \int q(z) ln q(z) dz = - 0.5 ln(2pi) - 0.5 \sum (ln(sigma_q^2) + 1)

    Args:
      zs: posterior z ~ q(z|x)
      prior_zs: prior zs
    """
    # L = -KL + log p(x|z), to maximize bound on likelihood
    # -L = KL - log p(x|z), to minimize bound on NLL
    # so 'KL cost' is positive KL divergence
    kl_b = 0.0
    for z, prior_z in zip(zs, prior_zs):
      assert isinstance(z, Gaussian)
      assert isinstance(prior_z, Gaussian)
      # ln(2pi) terms cancel
      kl_b += 0.5 * tf.reduce_sum(
          prior_z.logvar - z.logvar
          + tf.exp(z.logvar - prior_z.logvar)
          + tf.square((z.mean - prior_z.mean) / tf.exp(0.5 * prior_z.logvar))
          - 1.0, [1])

    self.kl_cost_b = kl_b
    self.kl_cost = tf.reduce_mean(kl_b)
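
# Consistency note (sketch, not part of the library): for univariate Gaussians
# q = N(mq, sq^2) and p = N(mp, sp^2), the per-dimension summand above is the
# standard closed form
#   KL(q||p) = 0.5 * (log(sp^2/sq^2) + (sq^2 + (mq - mp)^2) / sp^2 - 1)
# e.g. q = N(1, 1), p = N(0, 4) gives
#   KL = 0.5 * (log 4 + (1 + 1)/4 - 1) ~= 0.443 nats.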
class KLCost_GaussianGaussianProcessSampled(object):
  """log p(x|z) + KL(q||p) terms for Gaussian posterior and Gaussian process
  prior via sampling.

  The log p(x|z) term is the reconstruction error under the model.
  The KL term represents the penalty for passing information from the encoder
  to the decoder.
  To sample KL(q||p), we simply sample
        ln q - ln p
  by drawing samples from q and averaging.
  """

  def __init__(self, post_zs, prior_z_process):
    """Create a lower bound in three parts, normalized reconstruction
    cost, normalized KL divergence cost, and their sum.

    Args:
      post_zs: posterior z ~ q(z|x)
      prior_z_process: prior AR(1) process
    """
    assert len(post_zs) > 1, "GP is for time, need more than 1 time step."
    assert isinstance(prior_z_process, GaussianProcess), "Must use GP."

    # L = -KL + log p(x|z), to maximize bound on likelihood
    # -L = KL - log p(x|z), to minimize bound on NLL
    # so 'KL cost' is positive KL divergence
    z0_bxu = post_zs[0].sample
    logq_bxu = post_zs[0].logp(z0_bxu)
    logp_bxu = prior_z_process.logp_t(z0_bxu)
    z_tm1_bxu = z0_bxu
    for z_t in post_zs[1:]:
      # posterior is independent in time, prior is not
      z_t_bxu = z_t.sample
      logq_bxu += z_t.logp(z_t_bxu)
      logp_bxu += prior_z_process.logp_t(z_t_bxu, z_tm1_bxu)
      z_tm1_bxu = z_t_bxu

    kl_bxu = logq_bxu - logp_bxu
    kl_b = tf.reduce_sum(kl_bxu, [1])
    self.kl_cost_b = kl_b
    self.kl_cost = tf.reduce_mean(kl_b)
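
# End-to-end usage sketch (illustrative only): combining per-timestep
# approximate posteriors with the learnable AR(1) prior. `encoder_outputs` is
# a hypothetical list of T tensors of shape [B, enc_dim]; all sizes and
# hyperparameters are placeholders.
#
#   T, B, U = 20, 16, 8
#   prior = LearnableAutoRegressive1Prior(
#       batch_size=B, z_size=U,
#       autocorrelation_taus=10.0 * np.ones(U, dtype=np.float32),
#       noise_variances=0.1 * np.ones(U, dtype=np.float32),
#       do_train_prior_ar_atau=True, do_train_prior_ar_nvar=True,
#       num_steps=T, name="ar1_prior")
#   post_zs = [DiagonalGaussianFromInput(enc_bxu, U, name="post_%d" % t)
#              for t, enc_bxu in enumerate(encoder_outputs)]
#   kl = KLCost_GaussianGaussianProcessSampled(post_zs, prior)
#   kl_cost = kl.kl_cost   # scalar: KL averaged over the batch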