# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
| """Model is responsible for setting up Tensorflow graph. | |
| Creates policy and value networks. Also sets up all optimization | |
| ops, including gradient ops, trust region ops, and value optimizers. | |
| """ | |

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf


class Model(object):

  def __init__(self, env_spec, global_step,
               target_network_lag=0.95,
               sample_from='online',
               get_policy=None,
               get_baseline=None,
               get_objective=None,
               get_trust_region_p_opt=None,
               get_value_opt=None):
    self.env_spec = env_spec
    self.global_step = global_step
    self.inc_global_step = self.global_step.assign_add(1)
    self.target_network_lag = target_network_lag
    self.sample_from = sample_from
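
    # The get_* arguments are assumed to be zero-argument factory callables
    # (e.g. functools.partial closures built by the surrounding trainer) that
    # construct the policy, baseline, objective, and optional optimizers.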
    self.policy = get_policy()
    self.baseline = get_baseline()
    self.objective = get_objective()
    self.baseline.eps_lambda = self.objective.eps_lambda  # TODO: do this better
    self.trust_region_policy_opt = get_trust_region_p_opt()
    self.value_opt = get_value_opt()

  def setup_placeholders(self):
    """Create the TensorFlow placeholders."""
    # summary placeholders
    self.avg_episode_reward = tf.placeholder(
        tf.float32, [], 'avg_episode_reward')
    self.greedy_episode_reward = tf.placeholder(
        tf.float32, [], 'greedy_episode_reward')

    # sampling placeholders
    self.internal_state = tf.placeholder(tf.float32,
                                         [None, self.policy.rnn_state_dim],
                                         'internal_state')

    self.single_observation = []
    for i, (obs_dim, obs_type) in enumerate(self.env_spec.obs_dims_and_types):
      if self.env_spec.is_discrete(obs_type):
        self.single_observation.append(
            tf.placeholder(tf.int32, [None], 'obs%d' % i))
      elif self.env_spec.is_box(obs_type):
        self.single_observation.append(
            tf.placeholder(tf.float32, [None, obs_dim], 'obs%d' % i))
      else:
        assert False

    self.single_action = []
    for i, (action_dim, action_type) in \
        enumerate(self.env_spec.act_dims_and_types):
      if self.env_spec.is_discrete(action_type):
        self.single_action.append(
            tf.placeholder(tf.int32, [None], 'act%d' % i))
      elif self.env_spec.is_box(action_type):
        self.single_action.append(
            tf.placeholder(tf.float32, [None, action_dim], 'act%d' % i))
      else:
        assert False
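
    # The training placeholders below are time-major: observations and
    # actions have shape [time, batch, ...] and (per the assertions in
    # train_step) carry one more step than rewards/pads; the extra final
    # step supplies the bootstrap value values[-1, :] * (1 - terminated)
    # used in setup().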

    # training placeholders
    self.observations = []
    for i, (obs_dim, obs_type) in enumerate(self.env_spec.obs_dims_and_types):
      if self.env_spec.is_discrete(obs_type):
        self.observations.append(
            tf.placeholder(tf.int32, [None, None], 'all_obs%d' % i))
      else:
        self.observations.append(
            tf.placeholder(tf.float32, [None, None, obs_dim], 'all_obs%d' % i))

    self.actions = []
    self.other_logits = []
    for i, (action_dim, action_type) in \
        enumerate(self.env_spec.act_dims_and_types):
      if self.env_spec.is_discrete(action_type):
        self.actions.append(
            tf.placeholder(tf.int32, [None, None], 'all_act%d' % i))
      if self.env_spec.is_box(action_type):
        self.actions.append(
            tf.placeholder(tf.float32, [None, None, action_dim],
                           'all_act%d' % i))
      self.other_logits.append(
          tf.placeholder(tf.float32, [None, None, None],
                         'other_logits%d' % i))

    self.rewards = tf.placeholder(tf.float32, [None, None], 'rewards')
    self.terminated = tf.placeholder(tf.float32, [None], 'terminated')
    self.pads = tf.placeholder(tf.float32, [None, None], 'pads')
    self.prev_log_probs = tf.placeholder(tf.float32, [None, None],
                                         'prev_log_probs')
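
    # pads marks padded timesteps, masked out via (1 - pads) in the KL
    # averages below (and presumably inside the objective); terminated flags
    # episodes that ended, which zeroes the bootstrap value via
    # values[-1, :] * (1 - terminated); prev_log_probs and other_logits carry
    # the pre-update policy's log-probs and logits for the trust-region KL
    # terms (see trust_region_step).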

  def setup(self, train=True):
    """Set up the TensorFlow graph."""
    self.setup_placeholders()

    tf.summary.scalar('avg_episode_reward', self.avg_episode_reward)
    tf.summary.scalar('greedy_episode_reward', self.greedy_episode_reward)

    with tf.variable_scope('model', reuse=None):
      # policy network
      with tf.variable_scope('policy_net'):
        (self.policy_internal_states, self.logits, self.log_probs,
         self.entropies, self.self_kls) = \
            self.policy.multi_step(self.observations,
                                   self.internal_state,
                                   self.actions)
        self.out_log_probs = sum(self.log_probs)
        self.kl = self.policy.calculate_kl(self.other_logits, self.logits)
        self.avg_kl = (tf.reduce_sum(sum(self.kl)[:-1] * (1 - self.pads)) /
                       tf.reduce_sum(1 - self.pads))

      # value network
      with tf.variable_scope('value_net'):
        (self.values,
         self.regression_input,
         self.regression_weight) = self.baseline.get_values(
             self.observations, self.actions,
             self.policy_internal_states, self.logits)
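      # Note: get_values also returns the regression features
      # (regression_input, regression_weight) that the optional value
      # optimizer consumes in its setup() further below.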

      # target policy network
      with tf.variable_scope('target_policy_net'):
        (self.target_policy_internal_states,
         self.target_logits, self.target_log_probs,
         _, _) = \
            self.policy.multi_step(self.observations,
                                   self.internal_state,
                                   self.actions)

      # target value network
      with tf.variable_scope('target_value_net'):
        (self.target_values, _, _) = self.baseline.get_values(
            self.observations, self.actions,
            self.target_policy_internal_states, self.target_logits)

      # construct copy op online --> target
      all_vars = tf.trainable_variables()
      online_vars = [p for p in all_vars if
                     '/policy_net' in p.name or '/value_net' in p.name]
      target_vars = [p for p in all_vars if
                     'target_policy_net' in p.name or 'target_value_net' in p.name]
      online_vars.sort(key=lambda p: p.name)
      target_vars.sort(key=lambda p: p.name)

      aa = self.target_network_lag
      self.copy_op = tf.group(*[
          target_p.assign(aa * target_p + (1 - aa) * online_p)
          for online_p, target_p in zip(online_vars, target_vars)])
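      # copy_op is a Polyak-style averaging update,
      #   target <- lag * target + (1 - lag) * online,
      # so with target_network_lag close to 1 the target networks track the
      # online networks slowly. Nothing in this file runs it; the caller is
      # expected to sess.run() it as part of its training loop.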

      if train:
        # evaluate objective
        (self.loss, self.raw_loss, self.regression_target,
         self.gradient_ops, self.summary) = self.objective.get(
             self.rewards, self.pads,
             self.values[:-1, :],
             self.values[-1, :] * (1 - self.terminated),
             self.log_probs, self.prev_log_probs, self.target_log_probs,
             self.entropies, self.logits, self.target_values[:-1, :],
             self.target_values[-1, :] * (1 - self.terminated))

        self.regression_target = tf.reshape(self.regression_target, [-1])

        self.policy_vars = [
            v for v in tf.trainable_variables()
            if '/policy_net' in v.name]
        self.value_vars = [
            v for v in tf.trainable_variables()
            if '/value_net' in v.name]

        # trust region optimizer
        if self.trust_region_policy_opt is not None:
          with tf.variable_scope('trust_region_policy', reuse=None):
            avg_self_kl = (
                tf.reduce_sum(sum(self.self_kls) * (1 - self.pads)) /
                tf.reduce_sum(1 - self.pads))

            self.trust_region_policy_opt.setup(
                self.policy_vars, self.raw_loss, avg_self_kl,
                self.avg_kl)

        # value optimizer
        if self.value_opt is not None:
          with tf.variable_scope('trust_region_value', reuse=None):
            self.value_opt.setup(
                self.value_vars,
                tf.reshape(self.values[:-1, :], [-1]),
                self.regression_target,
                tf.reshape(self.pads, [-1]),
                self.regression_input, self.regression_weight)
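        # When present, the value optimizer fits the (flattened) value
        # network outputs directly to regression_target on the unpadded
        # timesteps, using the regression_input / regression_weight features;
        # it is driven by fit_values() below, separately from gradient_ops.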

    # we re-use variables for the sampling operations
    with tf.variable_scope('model', reuse=True):
      scope = ('target_policy_net' if self.sample_from == 'target'
               else 'policy_net')
      with tf.variable_scope(scope):
        self.next_internal_state, self.sampled_actions = \
            self.policy.sample_step(self.single_observation,
                                    self.internal_state,
                                    self.single_action)
        self.greedy_next_internal_state, self.greedy_sampled_actions = \
            self.policy.sample_step(self.single_observation,
                                    self.internal_state,
                                    self.single_action,
                                    greedy=True)

  def sample_step(self, sess,
                  single_observation, internal_state, single_action,
                  greedy=False):
    """Sample batch of steps from policy."""
    if greedy:
      outputs = [self.greedy_next_internal_state, self.greedy_sampled_actions]
    else:
      outputs = [self.next_internal_state, self.sampled_actions]

    feed_dict = {self.internal_state: internal_state}
    for action_place, action in zip(self.single_action, single_action):
      feed_dict[action_place] = action
    for obs_place, obs in zip(self.single_observation, single_observation):
      feed_dict[obs_place] = obs

    return sess.run(outputs, feed_dict=feed_dict)

  def train_step(self, sess,
                 observations, internal_state, actions,
                 rewards, terminated, pads,
                 avg_episode_reward=0, greedy_episode_reward=0):
    """Train network using standard gradient descent."""
    outputs = [self.raw_loss, self.gradient_ops, self.summary]
    feed_dict = {self.internal_state: internal_state,
                 self.rewards: rewards,
                 self.terminated: terminated,
                 self.pads: pads,
                 self.avg_episode_reward: avg_episode_reward,
                 self.greedy_episode_reward: greedy_episode_reward}

    time_len = None
    for action_place, action in zip(self.actions, actions):
      if time_len is None:
        time_len = len(action)
      assert time_len == len(action)
      feed_dict[action_place] = action
    for obs_place, obs in zip(self.observations, observations):
      assert time_len == len(obs)
      feed_dict[obs_place] = obs
    assert len(rewards) == time_len - 1

    return sess.run(outputs, feed_dict=feed_dict)

  def trust_region_step(self, sess,
                        observations, internal_state, actions,
                        rewards, terminated, pads,
                        avg_episode_reward=0,
                        greedy_episode_reward=0):
    """Train policy using trust region step."""
    feed_dict = {self.internal_state: internal_state,
                 self.rewards: rewards,
                 self.terminated: terminated,
                 self.pads: pads,
                 self.avg_episode_reward: avg_episode_reward,
                 self.greedy_episode_reward: greedy_episode_reward}

    for action_place, action in zip(self.actions, actions):
      feed_dict[action_place] = action
    for obs_place, obs in zip(self.observations, observations):
      feed_dict[obs_place] = obs

    (prev_log_probs, prev_logits) = sess.run(
        [self.out_log_probs, self.logits], feed_dict=feed_dict)
    feed_dict[self.prev_log_probs] = prev_log_probs
    for other_logit, prev_logit in zip(self.other_logits, prev_logits):
      feed_dict[other_logit] = prev_logit

    # fit policy
    self.trust_region_policy_opt.optimize(sess, feed_dict)

    ret = sess.run([self.raw_loss, self.summary], feed_dict=feed_dict)
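    # Mirror train_step's [raw_loss, gradient_ops, summary] output structure,
    # with None in place of gradient_ops since the trust-region optimizer has
    # already applied its update above.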
    ret = [ret[0], None, ret[1]]
    return ret

  def fit_values(self, sess,
                 observations, internal_state, actions,
                 rewards, terminated, pads):
    """Train value network using value-specific optimizer."""
    feed_dict = {self.internal_state: internal_state,
                 self.rewards: rewards,
                 self.terminated: terminated,
                 self.pads: pads}
    for action_place, action in zip(self.actions, actions):
      feed_dict[action_place] = action
    for obs_place, obs in zip(self.observations, observations):
      feed_dict[obs_place] = obs

    # fit values
    if self.value_opt is None:
      raise ValueError('Specific value optimizer does not exist')
    self.value_opt.optimize(sess, feed_dict)
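

# A minimal usage sketch (illustrative only; the real wiring lives in the
# trainer/controller that owns this Model). `env_spec` and the `make_*`
# factories below are hypothetical stand-ins for objects constructed
# elsewhere in the codebase:
#
#   global_step = tf.Variable(0, trainable=False, name='global_step')
#   model = Model(env_spec, global_step,
#                 get_policy=make_policy,
#                 get_baseline=make_baseline,
#                 get_objective=make_objective,
#                 get_trust_region_p_opt=make_trust_region_opt,
#                 get_value_opt=make_value_opt)
#   model.setup(train=True)
#
#   with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     # collect time-major trajectories with model.sample_step(...), then
#     # take a gradient step and lag the target networks:
#     loss, _, summary = model.train_step(
#         sess, observations, initial_internal_state, actions,
#         rewards, terminated, pads)
#     sess.run(model.copy_op)
#     sess.run(model.inc_global_step)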