# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
| """Model.""" | |
| from __future__ import absolute_import | |
| from __future__ import division | |
| from __future__ import print_function | |
| import functools | |
| import sonnet as snt | |
| import tensorflow as tf | |
| import numpy as np | |
| import math | |
| SQUARED_OBSERVATION = "squared" | |
| ABS_OBSERVATION = "abs" | |
| STANDARD_OBSERVATION = "standard" | |
| OBSERVATION_TYPES = [SQUARED_OBSERVATION, ABS_OBSERVATION, STANDARD_OBSERVATION] | |
| ROUND_TRANSITION = "round" | |
| STANDARD_TRANSITION = "standard" | |
| TRANSITION_TYPES = [ROUND_TRANSITION, STANDARD_TRANSITION] | |
class Q(object):

  def __init__(self,
               state_size,
               num_timesteps,
               sigma_min=1e-5,
               dtype=tf.float32,
               random_seed=None,
               init_mu0_to_zero=False,
               graph_collection_name="Q_VARS"):
    self.sigma_min = sigma_min
    self.dtype = dtype
    self.graph_collection_name = graph_collection_name
    initializers = []
    for t in xrange(num_timesteps):
      if t == 0 and init_mu0_to_zero:
        initializers.append(
            {"w": tf.zeros_initializer, "b": tf.zeros_initializer})
      else:
        initializers.append(
            {"w": tf.random_uniform_initializer(seed=random_seed),
             "b": tf.zeros_initializer})

    def custom_getter(getter, *args, **kwargs):
      out = getter(*args, **kwargs)
      ref = tf.get_collection_ref(self.graph_collection_name)
      if out not in ref:
        ref.append(out)
      return out

    self.mus = [
        snt.Linear(output_size=state_size,
                   initializers=initializers[t],
                   name="q_mu_%d" % t,
                   custom_getter=custom_getter)
        for t in xrange(num_timesteps)
    ]
    self.sigmas = [
        tf.get_variable(
            shape=[state_size],
            dtype=self.dtype,
            name="q_sigma_%d" % (t + 1),
            collections=[tf.GraphKeys.GLOBAL_VARIABLES, graph_collection_name],
            initializer=tf.random_uniform_initializer(seed=random_seed))
        for t in xrange(num_timesteps)
    ]

  def q_zt(self, observation, prev_state, t):
    batch_size = tf.shape(prev_state)[0]
    q_mu = self.mus[t](tf.concat([observation, prev_state], axis=1))
    q_sigma = tf.maximum(tf.nn.softplus(self.sigmas[t]), self.sigma_min)
    q_sigma = tf.tile(q_sigma[tf.newaxis, :], [batch_size, 1])
    q_zt = tf.contrib.distributions.Normal(loc=q_mu, scale=tf.sqrt(q_sigma))
    return q_zt

  def summarize_weights(self):
    for t, sigma in enumerate(self.sigmas):
      tf.summary.scalar("q_sigma/%d" % t, sigma[0])
    for t, f in enumerate(self.mus):
      tf.summary.scalar("q_mu/b_%d" % t, f.b[0])
      tf.summary.scalar("q_mu/w_obs_%d" % t, f.w[0, 0])
      if t != 0:
        tf.summary.scalar("q_mu/w_prev_state_%d" % t, f.w[1, 0])

class PreviousStateQ(Q):

  def q_zt(self, unused_observation, prev_state, t):
    batch_size = tf.shape(prev_state)[0]
    q_mu = self.mus[t](prev_state)
    q_sigma = tf.maximum(tf.nn.softplus(self.sigmas[t]), self.sigma_min)
    q_sigma = tf.tile(q_sigma[tf.newaxis, :], [batch_size, 1])
    q_zt = tf.contrib.distributions.Normal(loc=q_mu, scale=tf.sqrt(q_sigma))
    return q_zt

  def summarize_weights(self):
    for t, sigma in enumerate(self.sigmas):
      tf.summary.scalar("q_sigma/%d" % t, sigma[0])
    for t, f in enumerate(self.mus):
      tf.summary.scalar("q_mu/b_%d" % t, f.b[0])
      tf.summary.scalar("q_mu/w_prev_state_%d" % t, f.w[0, 0])


class ObservationQ(Q):

  def q_zt(self, observation, prev_state, t):
    batch_size = tf.shape(prev_state)[0]
    q_mu = self.mus[t](observation)
    q_sigma = tf.maximum(tf.nn.softplus(self.sigmas[t]), self.sigma_min)
    q_sigma = tf.tile(q_sigma[tf.newaxis, :], [batch_size, 1])
    q_zt = tf.contrib.distributions.Normal(loc=q_mu, scale=tf.sqrt(q_sigma))
    return q_zt

  def summarize_weights(self):
    for t, sigma in enumerate(self.sigmas):
      tf.summary.scalar("q_sigma/%d" % t, sigma[0])
    for t, f in enumerate(self.mus):
      tf.summary.scalar("q_mu/b_%d" % t, f.b[0])
      tf.summary.scalar("q_mu/w_obs_%d" % t, f.w[0, 0])

class SimpleMeanQ(object):

  def __init__(self,
               state_size,
               num_timesteps,
               sigma_min=1e-5,
               dtype=tf.float32,
               random_seed=None,
               init_mu0_to_zero=False,
               graph_collection_name="Q_VARS"):
    self.sigma_min = sigma_min
    self.dtype = dtype
    self.graph_collection_name = graph_collection_name
    initializers = []
    for t in xrange(num_timesteps):
      if t == 0 and init_mu0_to_zero:
        initializers.append(tf.zeros_initializer)
      else:
        initializers.append(tf.random_uniform_initializer(seed=random_seed))
    self.mus = [
        tf.get_variable(
            shape=[state_size],
            dtype=self.dtype,
            name="q_mu_%d" % (t + 1),
            collections=[tf.GraphKeys.GLOBAL_VARIABLES, graph_collection_name],
            initializer=initializers[t])
        for t in xrange(num_timesteps)
    ]
    self.sigmas = [
        tf.get_variable(
            shape=[state_size],
            dtype=self.dtype,
            name="q_sigma_%d" % (t + 1),
            collections=[tf.GraphKeys.GLOBAL_VARIABLES, graph_collection_name],
            initializer=tf.random_uniform_initializer(seed=random_seed))
        for t in xrange(num_timesteps)
    ]

  def q_zt(self, unused_observation, prev_state, t):
    batch_size = tf.shape(prev_state)[0]
    q_mu = tf.tile(self.mus[t][tf.newaxis, :], [batch_size, 1])
    q_sigma = tf.maximum(tf.nn.softplus(self.sigmas[t]), self.sigma_min)
    q_sigma = tf.tile(q_sigma[tf.newaxis, :], [batch_size, 1])
    q_zt = tf.contrib.distributions.Normal(loc=q_mu, scale=tf.sqrt(q_sigma))
    return q_zt

  def summarize_weights(self):
    for t, sigma in enumerate(self.sigmas):
      tf.summary.scalar("q_sigma/%d" % t, sigma[0])
    for t, f in enumerate(self.mus):
      tf.summary.scalar("q_mu/%d" % t, f[0])

class R(object):

  def __init__(self,
               state_size,
               num_timesteps,
               sigma_min=1e-5,
               dtype=tf.float32,
               sigma_init=1.,
               random_seed=None,
               graph_collection_name="R_VARS"):
    self.dtype = dtype
    self.sigma_min = sigma_min
    initializers = {"w": tf.truncated_normal_initializer(seed=random_seed),
                    "b": tf.zeros_initializer}
    self.graph_collection_name = graph_collection_name

    def custom_getter(getter, *args, **kwargs):
      out = getter(*args, **kwargs)
      ref = tf.get_collection_ref(self.graph_collection_name)
      if out not in ref:
        ref.append(out)
      return out

    self.mus = [
        snt.Linear(output_size=state_size,
                   initializers=initializers,
                   name="r_mu_%d" % t,
                   custom_getter=custom_getter)
        for t in xrange(num_timesteps)
    ]
    self.sigmas = [
        tf.get_variable(
            shape=[state_size],
            dtype=self.dtype,
            name="r_sigma_%d" % (t + 1),
            collections=[tf.GraphKeys.GLOBAL_VARIABLES, graph_collection_name],
            # initializer=tf.random_uniform_initializer(seed=random_seed, maxval=100))
            initializer=tf.constant_initializer(sigma_init))
        for t in xrange(num_timesteps)
    ]

  def r_xn(self, z_t, t):
    batch_size = tf.shape(z_t)[0]
    r_mu = self.mus[t](z_t)
    r_sigma = tf.maximum(tf.nn.softplus(self.sigmas[t]), self.sigma_min)
    r_sigma = tf.tile(r_sigma[tf.newaxis, :], [batch_size, 1])
    return tf.contrib.distributions.Normal(
        loc=r_mu, scale=tf.sqrt(r_sigma))
  def summarize_weights(self):
    for t in range(len(self.mus) - 1):
      # self.mus[t] is an snt.Linear module and cannot be indexed directly;
      # summarize its bias instead, mirroring Q.summarize_weights.
      tf.summary.scalar("r_mu/%d" % t, self.mus[t].b[0])
      tf.summary.scalar("r_sigma/%d" % t, self.sigmas[t][0])

class P(object):

  def __init__(self,
               state_size,
               num_timesteps,
               sigma_min=1e-5,
               variance=1.0,
               dtype=tf.float32,
               random_seed=None,
               trainable=True,
               init_bs_to_zero=False,
               graph_collection_name="P_VARS"):
    self.state_size = state_size
    self.num_timesteps = num_timesteps
    self.sigma_min = sigma_min
    self.dtype = dtype
    self.variance = variance
    self.graph_collection_name = graph_collection_name
    if init_bs_to_zero:
      initializers = [tf.zeros_initializer for _ in xrange(num_timesteps)]
    else:
      initializers = [tf.random_uniform_initializer(seed=random_seed)
                      for _ in xrange(num_timesteps)]
    self.bs = [
        tf.get_variable(
            shape=[state_size],
            dtype=self.dtype,
            name="p_b_%d" % (t + 1),
            initializer=initializers[t],
            collections=[tf.GraphKeys.GLOBAL_VARIABLES, graph_collection_name],
            trainable=trainable) for t in xrange(num_timesteps)
    ]
    self.Bs = tf.cumsum(self.bs, reverse=True, axis=0)

  def posterior(self, observation, prev_state, t):
    """Computes the true posterior p(z_t | z_{t-1}, z_n)."""
    # bs[0] is really b_1.
    # Bs[i] is the sum of b_k for k from i+1 to n.
    mu = observation - self.Bs[t]
    if t > 0:
      mu += (prev_state + self.bs[t - 1]) * float(self.num_timesteps - t)
    mu /= float(self.num_timesteps - t + 1)
    sigma = tf.ones_like(mu) * self.variance * (
        float(self.num_timesteps - t) / float(self.num_timesteps - t + 1))
    return tf.contrib.distributions.Normal(loc=mu, scale=tf.sqrt(sigma))
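  # Derivation of the formula above: under this model z_t = z_{t-1} + b_t + eps
  # with eps ~ N(0, variance), so p(z_t | z_{t-1}) = N(z_{t-1} + b_t, variance)
  # and p(z_n | z_t) = N(z_t + B_t, (n - t) * variance), where B_t = Bs[t] is
  # the sum of b_{t+1}, ..., b_n and n = num_timesteps. Multiplying the two
  # Gaussians gives
  #   mean = ((z_n - B_t) + (n - t) * (z_{t-1} + b_t)) / (n - t + 1)
  #   var  = variance * (n - t) / (n - t + 1),
  # which is what `posterior` computes; at t = 0 the prior mean of z_0 is zero,
  # so the prev_state term drops out.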
  def lookahead(self, state, t):
    """Computes the true lookahead distribution p(z_n | z_t)."""
    mu = state + self.Bs[t]
    sigma = tf.ones_like(state) * self.variance * float(self.num_timesteps - t)
    return tf.contrib.distributions.Normal(loc=mu, scale=tf.sqrt(sigma))

  def likelihood(self, observation):
    batch_size = tf.shape(observation)[0]
    mu = tf.tile(tf.reduce_sum(self.bs, axis=0)[tf.newaxis, :], [batch_size, 1])
    sigma = tf.ones_like(mu) * self.variance * (self.num_timesteps + 1)
    dist = tf.contrib.distributions.Normal(loc=mu, scale=tf.sqrt(sigma))
    # Average over the batch and take the sum over the state size.
    return tf.reduce_mean(tf.reduce_sum(dist.log_prob(observation), axis=1))

  def p_zt(self, prev_state, t):
    """Computes the model p(z_t | z_{t-1})."""
    batch_size = tf.shape(prev_state)[0]
    if t > 0:
      z_mu_p = prev_state + self.bs[t - 1]
    else:  # p(z_0) is Normal(0, 1).
      z_mu_p = tf.zeros([batch_size, self.state_size], dtype=self.dtype)
    p_zt = tf.contrib.distributions.Normal(
        loc=z_mu_p, scale=tf.sqrt(tf.ones_like(z_mu_p) * self.variance))
    return p_zt

  def generative(self, unused_observation, z_nm1):
    """Computes the model's generative distribution p(z_n | z_{n-1})."""
    generative_p_mu = z_nm1 + self.bs[-1]
    return tf.contrib.distributions.Normal(
        loc=generative_p_mu,
        scale=tf.sqrt(tf.ones_like(generative_p_mu) * self.variance))

class ShortChainNonlinearP(object):

  def __init__(self,
               state_size,
               num_timesteps,
               sigma_min=1e-5,
               variance=1.0,
               observation_variance=1.0,
               transition_type=STANDARD_TRANSITION,
               transition_dist=tf.contrib.distributions.Normal,
               dtype=tf.float32,
               random_seed=None):
    self.state_size = state_size
    self.num_timesteps = num_timesteps
    self.sigma_min = sigma_min
    self.dtype = dtype
    self.variance = variance
    self.observation_variance = observation_variance
    self.transition_type = transition_type
    self.transition_dist = transition_dist

  def p_zt(self, prev_state, t):
    """Computes the model p(z_t | z_{t-1})."""
    batch_size = tf.shape(prev_state)[0]
    if t > 0:
      if self.transition_type == ROUND_TRANSITION:
        loc = tf.round(prev_state)
        tf.logging.info("p(z_%d | z_%d) ~ N(round(z_%d), %0.1f)" %
                        (t, t - 1, t - 1, self.variance))
      elif self.transition_type == STANDARD_TRANSITION:
        loc = prev_state
        tf.logging.info("p(z_%d | z_%d) ~ N(z_%d, %0.1f)" %
                        (t, t - 1, t - 1, self.variance))
    else:  # p(z_0) is Normal(0, 1).
      loc = tf.zeros([batch_size, self.state_size], dtype=self.dtype)
      tf.logging.info("p(z_0) ~ N(0,%0.1f)" % self.variance)
    p_zt = self.transition_dist(
        loc=loc,
        scale=tf.sqrt(tf.ones_like(loc) * self.variance))
    return p_zt

  def generative(self, unused_obs, z_ni):
    """Computes the model's generative distribution p(x_i | z_{ni})."""
    if self.transition_type == ROUND_TRANSITION:
      loc = tf.round(z_ni)
    elif self.transition_type == STANDARD_TRANSITION:
      loc = z_ni
    generative_sigma_sq = tf.ones_like(loc) * self.observation_variance
    return self.transition_dist(
        loc=loc, scale=tf.sqrt(generative_sigma_sq))

class BimodalPriorP(object):

  def __init__(self,
               state_size,
               num_timesteps,
               mixing_coeff=0.5,
               prior_mode_mean=1,
               sigma_min=1e-5,
               variance=1.0,
               dtype=tf.float32,
               random_seed=None,
               trainable=True,
               init_bs_to_zero=False,
               graph_collection_name="P_VARS"):
    self.state_size = state_size
    self.num_timesteps = num_timesteps
    self.sigma_min = sigma_min
    self.dtype = dtype
    self.variance = variance
    self.mixing_coeff = mixing_coeff
    self.prior_mode_mean = prior_mode_mean
    if init_bs_to_zero:
      initializers = [tf.zeros_initializer for _ in xrange(num_timesteps)]
    else:
      initializers = [tf.random_uniform_initializer(seed=random_seed)
                      for _ in xrange(num_timesteps)]
    self.bs = [
        tf.get_variable(
            shape=[state_size],
            dtype=self.dtype,
            name="b_%d" % (t + 1),
            initializer=initializers[t],
            collections=[tf.GraphKeys.GLOBAL_VARIABLES, graph_collection_name],
            trainable=trainable) for t in xrange(num_timesteps)
    ]
    self.Bs = tf.cumsum(self.bs, reverse=True, axis=0)

  def posterior(self, observation, prev_state, t):
    """Computes the true posterior p(z_t | z_{t-1}, z_n)."""
    # NOTE: This is currently wrong, but fixing it would require a refactoring
    # of summarize_q, as kl is not defined for a mixture.
    # bs[0] is really b_1.
    # Bs[i] is the sum of b_k for k from i+1 to n.
    mu = observation - self.Bs[t]
    if t > 0:
      mu += (prev_state + self.bs[t - 1]) * float(self.num_timesteps - t)
    mu /= float(self.num_timesteps - t + 1)
    sigma = tf.ones_like(mu) * self.variance * (
        float(self.num_timesteps - t) / float(self.num_timesteps - t + 1))
    return tf.contrib.distributions.Normal(loc=mu, scale=tf.sqrt(sigma))

  def lookahead(self, state, t):
    """Computes the true lookahead distribution p(z_n | z_t)."""
    mu = state + self.Bs[t]
    sigma = tf.ones_like(state) * self.variance * float(self.num_timesteps - t)
    return tf.contrib.distributions.Normal(loc=mu, scale=tf.sqrt(sigma))

  def likelihood(self, observation):
    batch_size = tf.shape(observation)[0]
    sum_of_bs = tf.tile(
        tf.reduce_sum(self.bs, axis=0)[tf.newaxis, :], [batch_size, 1])
    sigma = tf.ones_like(sum_of_bs) * self.variance * (self.num_timesteps + 1)
    mu_pos = (tf.ones([batch_size, self.state_size], dtype=self.dtype) *
              self.prior_mode_mean) + sum_of_bs
    mu_neg = (tf.ones([batch_size, self.state_size], dtype=self.dtype) *
              -self.prior_mode_mean) + sum_of_bs
    zn_pos = tf.contrib.distributions.Normal(
        loc=mu_pos,
        scale=tf.sqrt(sigma))
    zn_neg = tf.contrib.distributions.Normal(
        loc=mu_neg,
        scale=tf.sqrt(sigma))
    mode_probs = tf.convert_to_tensor(
        [self.mixing_coeff, 1 - self.mixing_coeff], dtype=tf.float64)
    mode_probs = tf.tile(mode_probs[tf.newaxis, tf.newaxis, :],
                         [batch_size, 1, 1])
    mode_selection_dist = tf.contrib.distributions.Categorical(probs=mode_probs)
    zn_dist = tf.contrib.distributions.Mixture(
        cat=mode_selection_dist,
        components=[zn_pos, zn_neg],
        validate_args=True)
    # Average over the batch and take the sum over the state size.
    return tf.reduce_mean(tf.reduce_sum(zn_dist.log_prob(observation), axis=1))

  def p_zt(self, prev_state, t):
    """Computes the model p(z_t | z_{t-1})."""
    batch_size = tf.shape(prev_state)[0]
    if t > 0:
      z_mu_p = prev_state + self.bs[t - 1]
      p_zt = tf.contrib.distributions.Normal(
          loc=z_mu_p, scale=tf.sqrt(tf.ones_like(z_mu_p) * self.variance))
      return p_zt
    else:  # p(z_0) is a mixture of two Normals.
      mu_pos = (tf.ones([batch_size, self.state_size], dtype=self.dtype) *
                self.prior_mode_mean)
      mu_neg = (tf.ones([batch_size, self.state_size], dtype=self.dtype) *
                -self.prior_mode_mean)
      z0_pos = tf.contrib.distributions.Normal(
          loc=mu_pos,
          scale=tf.sqrt(tf.ones_like(mu_pos) * self.variance))
      z0_neg = tf.contrib.distributions.Normal(
          loc=mu_neg,
          scale=tf.sqrt(tf.ones_like(mu_neg) * self.variance))
      mode_probs = tf.convert_to_tensor(
          [self.mixing_coeff, 1 - self.mixing_coeff], dtype=tf.float64)
      mode_probs = tf.tile(mode_probs[tf.newaxis, tf.newaxis, :],
                           [batch_size, 1, 1])
      mode_selection_dist = tf.contrib.distributions.Categorical(
          probs=mode_probs)
      z0_dist = tf.contrib.distributions.Mixture(
          cat=mode_selection_dist,
          components=[z0_pos, z0_neg],
          validate_args=False)
      return z0_dist

  def generative(self, unused_observation, z_nm1):
    """Computes the model's generative distribution p(z_n | z_{n-1})."""
    generative_p_mu = z_nm1 + self.bs[-1]
    return tf.contrib.distributions.Normal(
        loc=generative_p_mu,
        scale=tf.sqrt(tf.ones_like(generative_p_mu) * self.variance))

class Model(object):

  def __init__(self,
               p,
               q,
               r,
               state_size,
               num_timesteps,
               dtype=tf.float32):
    self.p = p
    self.q = q
    self.r = r
    self.state_size = state_size
    self.num_timesteps = num_timesteps
    self.dtype = dtype

  def zero_state(self, batch_size):
    return tf.zeros([batch_size, self.state_size], dtype=self.dtype)

  def __call__(self, prev_state, observation, t):
    # Compute the q distribution over z, q(z_t | z_n, z_{t-1}).
    q_zt = self.q.q_zt(observation, prev_state, t)
    # Compute the p distribution over z, p(z_t | z_{t-1}).
    p_zt = self.p.p_zt(prev_state, t)
    # Sample from q.
    zt = q_zt.sample()
    r_xn = self.r.r_xn(zt, t)
    # Calculate the logprobs and sum over the state size.
    log_q_zt = tf.reduce_sum(q_zt.log_prob(zt), axis=1)
    log_p_zt = tf.reduce_sum(p_zt.log_prob(zt), axis=1)
    log_r_xn = tf.reduce_sum(r_xn.log_prob(observation), axis=1)
    # If we're at the last timestep, also calculate the logprob of the
    # observation.
    if t == self.num_timesteps - 1:
      generative_dist = self.p.generative(observation, zt)
      log_p_x_given_z = tf.reduce_sum(
          generative_dist.log_prob(observation), axis=1)
    else:
      log_p_x_given_z = tf.zeros_like(log_q_zt)
    return (zt, log_q_zt, log_p_zt, log_p_x_given_z, log_r_xn)

def create(state_size,
           num_timesteps,
           sigma_min=1e-5,
           r_sigma_init=1,
           variance=1.0,
           mixing_coeff=0.5,
           prior_mode_mean=1.0,
           dtype=tf.float32,
           random_seed=None,
           train_p=True,
           p_type="unimodal",
           q_type="normal",
           observation_variance=1.0,
           transition_type=STANDARD_TRANSITION,
           use_bs=True):
  if p_type == "unimodal":
    p = P(state_size,
          num_timesteps,
          sigma_min=sigma_min,
          variance=variance,
          dtype=dtype,
          random_seed=random_seed,
          trainable=train_p,
          init_bs_to_zero=not use_bs)
  elif p_type == "bimodal":
    p = BimodalPriorP(
        state_size,
        num_timesteps,
        mixing_coeff=mixing_coeff,
        prior_mode_mean=prior_mode_mean,
        sigma_min=sigma_min,
        variance=variance,
        dtype=dtype,
        random_seed=random_seed,
        trainable=train_p,
        init_bs_to_zero=not use_bs)
  elif "nonlinear" in p_type:
    if "cauchy" in p_type:
      trans_dist = tf.contrib.distributions.Cauchy
    else:
      trans_dist = tf.contrib.distributions.Normal
    p = ShortChainNonlinearP(
        state_size,
        num_timesteps,
        sigma_min=sigma_min,
        variance=variance,
        observation_variance=observation_variance,
        transition_type=transition_type,
        transition_dist=trans_dist,
        dtype=dtype,
        random_seed=random_seed)
  if q_type == "normal":
    q_class = Q
  elif q_type == "simple_mean":
    q_class = SimpleMeanQ
  elif q_type == "prev_state":
    q_class = PreviousStateQ
  elif q_type == "observation":
    q_class = ObservationQ
  q = q_class(state_size,
              num_timesteps,
              sigma_min=sigma_min,
              dtype=dtype,
              random_seed=random_seed,
              init_mu0_to_zero=not use_bs)
  r = R(state_size,
        num_timesteps,
        sigma_min=sigma_min,
        sigma_init=r_sigma_init,
        dtype=dtype,
        random_seed=random_seed)
  model = Model(p, q, r, state_size, num_timesteps, dtype=dtype)
  return model

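# Illustrative sketch: a hypothetical helper (not referenced elsewhere in this
# module) showing one way a model built by `create` can be unrolled. It follows
# Model.__call__ above, which returns the sampled latent plus per-timestep
# log-density terms; how those terms are combined into a bound is left to the
# caller.
def _example_unroll(model, observation):
  """Steps `model` over all timesteps and collects its log-density outputs."""
  state = model.zero_state(tf.shape(observation)[0])
  outputs = []
  for t in range(model.num_timesteps):
    state, log_q_zt, log_p_zt, log_p_x_given_z, log_r_xn = model(
        state, observation, t)
    outputs.append((log_q_zt, log_p_zt, log_p_x_given_z, log_r_xn))
  return outputs
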
class BackwardsModel(object):

  def __init__(self,
               state_size,
               num_timesteps,
               sigma_min=1e-5,
               dtype=tf.float32):
    self.state_size = state_size
    self.num_timesteps = num_timesteps
    self.sigma_min = sigma_min
    self.dtype = dtype
    self.bs = [
        tf.get_variable(
            shape=[state_size],
            dtype=self.dtype,
            name="b_%d" % (t + 1),
            initializer=tf.zeros_initializer) for t in xrange(num_timesteps)
    ]
    self.Bs = tf.cumsum(self.bs, reverse=True, axis=0)
    self.q_mus = [
        snt.Linear(output_size=state_size) for _ in xrange(num_timesteps)
    ]
    self.q_sigmas = [
        tf.get_variable(
            shape=[state_size],
            dtype=self.dtype,
            name="q_sigma_%d" % (t + 1),
            initializer=tf.zeros_initializer) for t in xrange(num_timesteps)
    ]
    self.r_mus = [
        tf.get_variable(
            shape=[state_size],
            dtype=self.dtype,
            name="r_mu_%d" % (t + 1),
            initializer=tf.zeros_initializer) for t in xrange(num_timesteps)
    ]
    self.r_sigmas = [
        tf.get_variable(
            shape=[state_size],
            dtype=self.dtype,
            name="r_sigma_%d" % (t + 1),
            initializer=tf.zeros_initializer) for t in xrange(num_timesteps)
    ]

  def zero_state(self, batch_size):
    return tf.zeros([batch_size, self.state_size], dtype=self.dtype)

  def posterior(self, unused_observation, prev_state, unused_t):
    # TODO(dieterichl): Correct this.
    return tf.contrib.distributions.Normal(
        loc=tf.zeros_like(prev_state), scale=tf.zeros_like(prev_state))

  def lookahead(self, state, unused_t):
    # TODO(dieterichl): Correct this.
    return tf.contrib.distributions.Normal(
        loc=tf.zeros_like(state), scale=tf.zeros_like(state))

  def q_zt(self, observation, next_state, t):
    """Computes the variational posterior q(z_t | z_{t+1}, z_n)."""
    t_backwards = self.num_timesteps - t - 1
    batch_size = tf.shape(next_state)[0]
    q_mu = self.q_mus[t_backwards](
        tf.concat([observation, next_state], axis=1))
    q_sigma = tf.maximum(
        tf.nn.softplus(self.q_sigmas[t_backwards]), self.sigma_min)
    q_sigma = tf.tile(q_sigma[tf.newaxis, :], [batch_size, 1])
    q_zt = tf.contrib.distributions.Normal(loc=q_mu, scale=tf.sqrt(q_sigma))
    return q_zt

  def p_zt(self, prev_state, t):
    """Computes the model p(z_{t+1} | z_t)."""
    t_backwards = self.num_timesteps - t - 1
    z_mu_p = prev_state + self.bs[t_backwards]
    p_zt = tf.contrib.distributions.Normal(
        loc=z_mu_p, scale=tf.ones_like(z_mu_p))
    return p_zt

  def generative(self, unused_observation, z_nm1):
    """Computes the model's generative distribution p(z_n | z_{n-1})."""
    generative_p_mu = z_nm1 + self.bs[-1]
    return tf.contrib.distributions.Normal(
        loc=generative_p_mu, scale=tf.ones_like(generative_p_mu))

  def r(self, z_t, t):
    t_backwards = self.num_timesteps - t - 1
    batch_size = tf.shape(z_t)[0]
    r_mu = tf.tile(self.r_mus[t_backwards][tf.newaxis, :], [batch_size, 1])
    r_sigma = tf.maximum(
        tf.nn.softplus(self.r_sigmas[t_backwards]), self.sigma_min)
    r_sigma = tf.tile(r_sigma[tf.newaxis, :], [batch_size, 1])
    return tf.contrib.distributions.Normal(loc=r_mu, scale=tf.sqrt(r_sigma))

  def likelihood(self, observation):
    batch_size = tf.shape(observation)[0]
    mu = tf.tile(tf.reduce_sum(self.bs, axis=0)[tf.newaxis, :], [batch_size, 1])
    sigma = tf.ones_like(mu) * (self.num_timesteps + 1)
    dist = tf.contrib.distributions.Normal(loc=mu, scale=tf.sqrt(sigma))
    # Average over the batch and take the sum over the state size.
    return tf.reduce_mean(tf.reduce_sum(dist.log_prob(observation), axis=1))

  def __call__(self, next_state, observation, t):
    # next_state is z_{t+1}.
    # Compute the q distribution over z, q(z_t | z_n, z_{t+1}).
    q_zt = self.q_zt(observation, next_state, t)
    # Sample from q.
    zt = q_zt.sample()
    # Compute the p distribution over z, p(z_{t+1} | z_t).
    p_zt = self.p_zt(zt, t)
    # Compute log p(z_{t+1} | z_t).
    if t == 0:
      log_p_zt = p_zt.log_prob(observation)
    else:
      log_p_zt = p_zt.log_prob(next_state)
    # Compute the r prior over z_t.
    r_zt = self.r(zt, t)
    log_r_zt = r_zt.log_prob(zt)
    # Compute the proposal density at z_t.
    log_q_zt = q_zt.log_prob(zt)
    # If we're at the last timestep, also calculate the logprob of the
    # observation.
    if t == self.num_timesteps - 1:
      p_z0_dist = tf.contrib.distributions.Normal(
          loc=tf.zeros_like(zt), scale=tf.ones_like(zt))
      z0_log_prob = p_z0_dist.log_prob(zt)
    else:
      z0_log_prob = tf.zeros_like(log_q_zt)
    return (zt, log_q_zt, log_p_zt, z0_log_prob, log_r_zt)

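# Note: BackwardsModel filters in reverse. Its __call__ takes z_{t+1} and
# proposes z_t, with parameters indexed via
# t_backwards = num_timesteps - t - 1, so t = 0 scores the observation under
# the final forward transition and t = num_timesteps - 1 scores the sample
# under the N(0, 1) prior on z_0.
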
class LongChainP(object):

  def __init__(self,
               state_size,
               num_obs,
               steps_per_obs,
               sigma_min=1e-5,
               variance=1.0,
               observation_variance=1.0,
               observation_type=STANDARD_OBSERVATION,
               transition_type=STANDARD_TRANSITION,
               dtype=tf.float32,
               random_seed=None):
    self.state_size = state_size
    self.steps_per_obs = steps_per_obs
    self.num_obs = num_obs
    self.num_timesteps = steps_per_obs * num_obs + 1
    self.sigma_min = sigma_min
    self.dtype = dtype
    self.variance = variance
    self.observation_variance = observation_variance
    self.observation_type = observation_type
    self.transition_type = transition_type

  def likelihood(self, observations):
    """Computes the model's true likelihood of the observations.

    Args:
      observations: A [batch_size, m, state_size] Tensor representing each of
        the m observations.
    Returns:
      logprob: The true likelihood of the observations given the model.
    """
    raise ValueError("Likelihood is not defined for long-chain models")
    # batch_size = tf.shape(observations)[0]
    # mu = tf.zeros([batch_size, self.state_size, self.num_obs],
    #               dtype=self.dtype)
    # sigma = np.fromfunction(
    #     lambda i, j: 1 + self.steps_per_obs*np.minimum(i+1, j+1),
    #     [self.num_obs, self.num_obs])
    # sigma += np.eye(self.num_obs)
    # sigma = tf.convert_to_tensor(sigma * self.variance, dtype=self.dtype)
    # sigma = tf.tile(sigma[tf.newaxis, tf.newaxis, ...],
    #                 [batch_size, self.state_size, 1, 1])
    # dist = tf.contrib.distributions.MultivariateNormalFullCovariance(
    #     loc=mu,
    #     covariance_matrix=sigma)
    # Average over the batch and take the sum over the state size.
    # return tf.reduce_mean(tf.reduce_sum(dist.log_prob(observations), axis=1))

  def p_zt(self, prev_state, t):
    """Computes the model p(z_t | z_{t-1})."""
    batch_size = tf.shape(prev_state)[0]
    if t > 0:
      if self.transition_type == ROUND_TRANSITION:
        loc = tf.round(prev_state)
        tf.logging.info("p(z_%d | z_%d) ~ N(round(z_%d), %0.1f)" %
                        (t, t - 1, t - 1, self.variance))
      elif self.transition_type == STANDARD_TRANSITION:
        loc = prev_state
        tf.logging.info("p(z_%d | z_%d) ~ N(z_%d, %0.1f)" %
                        (t, t - 1, t - 1, self.variance))
    else:  # p(z_0) is Normal(0, 1).
      loc = tf.zeros([batch_size, self.state_size], dtype=self.dtype)
      tf.logging.info("p(z_0) ~ N(0,%0.1f)" % self.variance)
    p_zt = tf.contrib.distributions.Normal(
        loc=loc,
        scale=tf.sqrt(tf.ones_like(loc) * self.variance))
    return p_zt

  def generative(self, z_ni, t):
    """Computes the model's generative distribution p(x_i | z_{ni})."""
    if self.observation_type == SQUARED_OBSERVATION:
      generative_mu = tf.square(z_ni)
      tf.logging.info("p(x_%d | z_%d) ~ N(z_%d^2, %0.1f)" %
                      (t, t, t, self.variance))
    elif self.observation_type == ABS_OBSERVATION:
      generative_mu = tf.abs(z_ni)
      tf.logging.info("p(x_%d | z_%d) ~ N(|z_%d|, %0.1f)" %
                      (t, t, t, self.variance))
    elif self.observation_type == STANDARD_OBSERVATION:
      generative_mu = z_ni
      tf.logging.info("p(x_%d | z_%d) ~ N(z_%d, %0.1f)" %
                      (t, t, t, self.variance))
    generative_sigma_sq = (tf.ones_like(generative_mu) *
                           self.observation_variance)
    return tf.contrib.distributions.Normal(
        loc=generative_mu, scale=tf.sqrt(generative_sigma_sq))

class LongChainQ(object):

  def __init__(self,
               state_size,
               num_obs,
               steps_per_obs,
               sigma_min=1e-5,
               dtype=tf.float32,
               random_seed=None):
    self.state_size = state_size
    self.sigma_min = sigma_min
    self.dtype = dtype
    self.steps_per_obs = steps_per_obs
    self.num_obs = num_obs
    self.num_timesteps = num_obs * steps_per_obs + 1
    initializers = {
        "w": tf.random_uniform_initializer(seed=random_seed),
        "b": tf.zeros_initializer
    }
    self.mus = [
        snt.Linear(output_size=state_size, initializers=initializers)
        for t in xrange(self.num_timesteps)
    ]
    self.sigmas = [
        tf.get_variable(
            shape=[state_size],
            dtype=self.dtype,
            name="q_sigma_%d" % (t + 1),
            initializer=tf.random_uniform_initializer(seed=random_seed))
        for t in xrange(self.num_timesteps)
    ]

  def first_relevant_obs_index(self, t):
    return int(max((t - 1) / self.steps_per_obs, 0))

  def q_zt(self, observations, prev_state, t):
    """Computes a distribution over z_t.

    Args:
      observations: a [batch_size, num_observations, state_size] Tensor.
      prev_state: a [batch_size, state_size] Tensor.
      t: The current timestep, an int Tensor.
    """
    # Filter out unneeded past observations.
    first_relevant_obs_index = int(
        math.floor(max(t - 1, 0) / self.steps_per_obs))
    num_relevant_observations = self.num_obs - first_relevant_obs_index
    observations = observations[:, first_relevant_obs_index:, :]
    batch_size = tf.shape(prev_state)[0]
    # Concatenate the prev state and observations along the second axis (the
    # one that is not the batch or state size axis), and then flatten it to
    # [batch_size, (num_relevant_observations + 1) * state_size] to feed it
    # into the linear layer.
    q_input = tf.concat([observations, prev_state[:, tf.newaxis, :]], axis=1)
    q_input = tf.reshape(
        q_input,
        [batch_size, (num_relevant_observations + 1) * self.state_size])
    q_mu = self.mus[t](q_input)
    q_sigma = tf.maximum(tf.nn.softplus(self.sigmas[t]), self.sigma_min)
    q_sigma = tf.tile(q_sigma[tf.newaxis, :], [batch_size, 1])
    q_zt = tf.contrib.distributions.Normal(loc=q_mu, scale=tf.sqrt(q_sigma))
    tf.logging.info(
        "q(z_{t} | z_{tm1}, x_{obsf}:{obst}) ~ N(Linear([z_{tm1},"
        "x_{obsf}:{obst}]), sigma_{t})".format(
            **{"t": t,
               "tm1": t - 1,
               "obsf": (first_relevant_obs_index + 1) * self.steps_per_obs,
               "obst": self.steps_per_obs * self.num_obs}))
    return q_zt

  def summarize_weights(self):
    pass

class LongChainR(object):

  def __init__(self,
               state_size,
               num_obs,
               steps_per_obs,
               sigma_min=1e-5,
               dtype=tf.float32,
               random_seed=None):
    self.state_size = state_size
    self.dtype = dtype
    self.sigma_min = sigma_min
    self.steps_per_obs = steps_per_obs
    self.num_obs = num_obs
    self.num_timesteps = num_obs * steps_per_obs + 1
    self.sigmas = [
        tf.get_variable(
            shape=[self.num_future_obs(t)],
            dtype=self.dtype,
            name="r_sigma_%d" % (t + 1),
            # initializer=tf.random_uniform_initializer(seed=random_seed, maxval=100))
            initializer=tf.constant_initializer(1.0))
        for t in range(self.num_timesteps)
    ]

  def first_future_obs_index(self, t):
    return int(math.floor(t / self.steps_per_obs))

  def num_future_obs(self, t):
    return int(self.num_obs - self.first_future_obs_index(t))

  def r_xn(self, z_t, t):
    """Computes a distribution over future observations given the current state.

    The indexing in these messages is 1-indexed and inclusive. This is
    consistent with the latex documents.

    Args:
      z_t: [batch_size, state_size] Tensor
      t: Current timestep
    """
    tf.logging.info(
        "r(x_{start}:{end} | z_{t}) ~ N(z_{t}, sigma_{t})".format(
            **{"t": t,
               "start":
                   (self.first_future_obs_index(t) + 1) * self.steps_per_obs,
               "end": self.num_timesteps - 1}))
    batch_size = tf.shape(z_t)[0]
    # The mean for all future observations is the same.
    # This tiling results in a [batch_size, num_future_obs, state_size] Tensor.
    r_mu = tf.tile(z_t[:, tf.newaxis, :], [1, self.num_future_obs(t), 1])
    # Compute the variance.
    r_sigma = tf.maximum(tf.nn.softplus(self.sigmas[t]), self.sigma_min)
    # The variance is the same across all state dimensions, so we only have to
    # tile sigma out to [batch_size, num_future_obs, state_size].
    r_sigma = tf.tile(r_sigma[tf.newaxis, :, tf.newaxis],
                      [batch_size, 1, self.state_size])
    return tf.contrib.distributions.Normal(
        loc=r_mu, scale=tf.sqrt(r_sigma))

  def summarize_weights(self):
    pass

class LongChainModel(object):

  def __init__(self,
               p,
               q,
               r,
               state_size,
               num_obs,
               steps_per_obs,
               dtype=tf.float32,
               disable_r=False):
    self.p = p
    self.q = q
    self.r = r
    self.disable_r = disable_r
    self.state_size = state_size
    self.num_obs = num_obs
    self.steps_per_obs = steps_per_obs
    self.num_timesteps = steps_per_obs * num_obs + 1
    self.dtype = dtype

  def zero_state(self, batch_size):
    return tf.zeros([batch_size, self.state_size], dtype=self.dtype)

  def next_obs_ind(self, t):
    return int(math.floor(max(t - 1, 0) / self.steps_per_obs))

  def __call__(self, prev_state, observations, t):
    """Computes the importance weight for the model system.

    Args:
      prev_state: [batch_size, state_size] Tensor
      observations: [batch_size, num_observations, state_size] Tensor
    """
    # Compute the q distribution over z, q(z_t | z_n, z_{t-1}).
    q_zt = self.q.q_zt(observations, prev_state, t)
    # Compute the p distribution over z, p(z_t | z_{t-1}).
    p_zt = self.p.p_zt(prev_state, t)
    # Sample from q and evaluate the logprobs, summing over the state size.
    zt = q_zt.sample()
    log_q_zt = tf.reduce_sum(q_zt.log_prob(zt), axis=1)
    log_p_zt = tf.reduce_sum(p_zt.log_prob(zt), axis=1)
    if not self.disable_r and t < self.num_timesteps - 1:
      # Score the remaining observations using r.
      r_xn = self.r.r_xn(zt, t)
      log_r_xn = r_xn.log_prob(observations[:, self.next_obs_ind(t + 1):, :])
      # Sum over the state size and observations, leaving the batch index.
      log_r_xn = tf.reduce_sum(log_r_xn, axis=[1, 2])
    else:
      log_r_xn = tf.zeros_like(log_p_zt)
    if t != 0 and t % self.steps_per_obs == 0:
      generative_dist = self.p.generative(zt, t)
      log_p_x_given_z = generative_dist.log_prob(
          observations[:, self.next_obs_ind(t), :])
      log_p_x_given_z = tf.reduce_sum(log_p_x_given_z, axis=1)
    else:
      log_p_x_given_z = tf.zeros_like(log_q_zt)
    return (zt, log_q_zt, log_p_zt, log_p_x_given_z, log_r_xn)

def create(state_size,
           num_obs,
           steps_per_obs,
           sigma_min=1e-5,
           variance=1.0,
           observation_variance=1.0,
           observation_type=STANDARD_OBSERVATION,
           transition_type=STANDARD_TRANSITION,
           dtype=tf.float32,
           random_seed=None,
           disable_r=False):
  p = LongChainP(
      state_size,
      num_obs,
      steps_per_obs,
      sigma_min=sigma_min,
      variance=variance,
      observation_variance=observation_variance,
      observation_type=observation_type,
      transition_type=transition_type,
      dtype=dtype,
      random_seed=random_seed)
  q = LongChainQ(
      state_size,
      num_obs,
      steps_per_obs,
      sigma_min=sigma_min,
      dtype=dtype,
      random_seed=random_seed)
  r = LongChainR(
      state_size,
      num_obs,
      steps_per_obs,
      sigma_min=sigma_min,
      dtype=dtype,
      random_seed=random_seed)
  model = LongChainModel(
      p, q, r, state_size, num_obs, steps_per_obs,
      dtype=dtype,
      disable_r=disable_r)
  return model

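# Note: the long-chain model consumes `observations` of shape
# [batch_size, num_obs, state_size] and runs for steps_per_obs * num_obs + 1
# timesteps. LongChainModel.__call__ above only scores an observation when t is
# a nonzero multiple of steps_per_obs, and uses r to score the observations
# that have not yet been reached.
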
class RTilde(object):

  def __init__(self,
               state_size,
               num_timesteps,
               sigma_min=1e-5,
               dtype=tf.float32,
               random_seed=None,
               graph_collection_name="R_TILDE_VARS"):
    self.dtype = dtype
    self.sigma_min = sigma_min
    initializers = {"w": tf.truncated_normal_initializer(seed=random_seed),
                    "b": tf.zeros_initializer}
    self.graph_collection_name = graph_collection_name

    def custom_getter(getter, *args, **kwargs):
      out = getter(*args, **kwargs)
      ref = tf.get_collection_ref(self.graph_collection_name)
      if out not in ref:
        ref.append(out)
      return out

    self.fns = [
        snt.Linear(output_size=2 * state_size,
                   initializers=initializers,
                   name="r_tilde_%d" % t,
                   custom_getter=custom_getter)
        for t in xrange(num_timesteps)
    ]

  def r_zt(self, z_t, observation, t):
    # out = self.fns[t](tf.stop_gradient(tf.concat([z_t, observation], axis=1)))
    out = self.fns[t](tf.concat([z_t, observation], axis=1))
    mu, raw_sigma_sq = tf.split(out, 2, axis=1)
    sigma_sq = tf.maximum(tf.nn.softplus(raw_sigma_sq), self.sigma_min)
    return mu, sigma_sq

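# Note: unlike R above, RTilde does not return a distribution object; r_zt
# returns the raw mean and variance so that TDModel below can pass them back to
# the caller alongside p_ztplus1.
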
class TDModel(object):

  def __init__(self,
               p,
               q,
               r_tilde,
               state_size,
               num_timesteps,
               dtype=tf.float32,
               disable_r=False):
    self.p = p
    self.q = q
    self.r_tilde = r_tilde
    self.disable_r = disable_r
    self.state_size = state_size
    self.num_timesteps = num_timesteps
    self.dtype = dtype

  def zero_state(self, batch_size):
    return tf.zeros([batch_size, self.state_size], dtype=self.dtype)
  def __call__(self, prev_state, observation, t):
    """Computes the importance weight for the model system.

    Args:
      prev_state: [batch_size, state_size] Tensor
      observation: [batch_size, state_size] Tensor
      t: Current timestep
    """
    # Compute the q distribution over z, q(z_t | z_n, z_{t-1}).
    q_zt = self.q.q_zt(observation, prev_state, t)
    # Compute the p distribution over z, p(z_t | z_{t-1}).
    p_zt = self.p.p_zt(prev_state, t)
    # Sample from q and evaluate the logprobs, summing over the state size.
    zt = q_zt.sample()
    # If it isn't the last timestep, compute the distribution over the next z.
    if t < self.num_timesteps - 1:
      p_ztplus1 = self.p.p_zt(zt, t + 1)
    else:
      p_ztplus1 = None
    log_q_zt = tf.reduce_sum(q_zt.log_prob(zt), axis=1)
    log_p_zt = tf.reduce_sum(p_zt.log_prob(zt), axis=1)
    if not self.disable_r and t < self.num_timesteps - 1:
      # Score the remaining observations using r.
      r_tilde_mu, r_tilde_sigma_sq = self.r_tilde.r_zt(zt, observation, t + 1)
    else:
      r_tilde_mu = None
      r_tilde_sigma_sq = None
    if t == self.num_timesteps - 1:
      generative_dist = self.p.generative(observation, zt)
      log_p_x_given_z = tf.reduce_sum(
          generative_dist.log_prob(observation), axis=1)
    else:
      log_p_x_given_z = tf.zeros_like(log_q_zt)
    return (zt, log_q_zt, log_p_zt, log_p_x_given_z,
            r_tilde_mu, r_tilde_sigma_sq, p_ztplus1)

def create(state_size,
           num_timesteps,
           sigma_min=1e-5,
           variance=1.0,
           dtype=tf.float32,
           random_seed=None,
           train_p=True,
           p_type="unimodal",
           q_type="normal",
           mixing_coeff=0.5,
           prior_mode_mean=1.0,
           observation_variance=1.0,
           transition_type=STANDARD_TRANSITION,
           use_bs=True):
  if p_type == "unimodal":
    p = P(state_size,
          num_timesteps,
          sigma_min=sigma_min,
          variance=variance,
          dtype=dtype,
          random_seed=random_seed,
          trainable=train_p,
          init_bs_to_zero=not use_bs)
  elif p_type == "bimodal":
    p = BimodalPriorP(
        state_size,
        num_timesteps,
        mixing_coeff=mixing_coeff,
        prior_mode_mean=prior_mode_mean,
        sigma_min=sigma_min,
        variance=variance,
        dtype=dtype,
        random_seed=random_seed,
        trainable=train_p,
        init_bs_to_zero=not use_bs)
  elif "nonlinear" in p_type:
    if "cauchy" in p_type:
      trans_dist = tf.contrib.distributions.Cauchy
    else:
      trans_dist = tf.contrib.distributions.Normal
    p = ShortChainNonlinearP(
        state_size,
        num_timesteps,
        sigma_min=sigma_min,
        variance=variance,
        observation_variance=observation_variance,
        transition_type=transition_type,
        transition_dist=trans_dist,
        dtype=dtype,
        random_seed=random_seed)
  if q_type == "normal":
    q_class = Q
  elif q_type == "simple_mean":
    q_class = SimpleMeanQ
  elif q_type == "prev_state":
    q_class = PreviousStateQ
  elif q_type == "observation":
    q_class = ObservationQ
  q = q_class(state_size,
              num_timesteps,
              sigma_min=sigma_min,
              dtype=dtype,
              random_seed=random_seed,
              init_mu0_to_zero=not use_bs)
  r_tilde = RTilde(
      state_size,
      num_timesteps,
      sigma_min=sigma_min,
      dtype=dtype,
      random_seed=random_seed)
  model = TDModel(p, q, r_tilde, state_size, num_timesteps, dtype=dtype)
  return model