# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A DDPG/NAF agent. | |
Implements the Deep Deterministic Policy Gradient (DDPG) algorithm from | |
"Continuous control with deep reinforcement learning" - Lilicrap et al. | |
https://arxiv.org/abs/1509.02971, and the Normalized Advantage Functions (NAF) | |
algorithm "Continuous Deep Q-Learning with Model-based Acceleration" - Gu et al. | |
https://arxiv.org/pdf/1603.00748. | |
""" | |

import tensorflow as tf
import gin.tf

from utils import utils
from agents import ddpg_networks as networks

slim = tf.contrib.slim


class DdpgAgent(object):
  """An RL agent that learns using the DDPG algorithm.

  Example usage:

    def critic_net(states, actions):
      ...
    def actor_net(states, num_action_dims):
      ...

    Given a tensorflow environment tf_env,
    (of type learning.deepmind.rl.environments.tensorflow.python.tfpyenvironment)

    obs_spec = tf_env.observation_spec()
    action_spec = tf_env.action_spec()

    ddpg_agent = agent.DdpgAgent(obs_spec,
                                 action_spec,
                                 actor_net=actor_net,
                                 critic_net=critic_net)

    we can perform actions on the environment as follows:

    state = tf_env.observations()[0]
    action = ddpg_agent.actor_net(tf.expand_dims(state, 0))[0, :]
    transition_type, reward, discount = tf_env.step([action])

    Train:

    critic_loss = ddpg_agent.critic_loss(states, actions, rewards, discounts,
                                         next_states)
    actor_loss = ddpg_agent.actor_loss(states)

    critic_train_op = slim.learning.create_train_op(
        critic_loss,
        critic_optimizer,
        variables_to_train=ddpg_agent.get_trainable_critic_vars(),
    )
    actor_train_op = slim.learning.create_train_op(
        actor_loss,
        actor_optimizer,
        variables_to_train=ddpg_agent.get_trainable_actor_vars(),
    )
  """

  ACTOR_NET_SCOPE = 'actor_net'
  CRITIC_NET_SCOPE = 'critic_net'
  TARGET_ACTOR_NET_SCOPE = 'target_actor_net'
  TARGET_CRITIC_NET_SCOPE = 'target_critic_net'

  def __init__(self,
               observation_spec,
               action_spec,
               actor_net=networks.actor_net,
               critic_net=networks.critic_net,
               td_errors_loss=tf.losses.huber_loss,
               dqda_clipping=0.,
               actions_regularizer=0.,
               target_q_clipping=None,
               residual_phi=0.0,
               debug_summaries=False):
    """Constructs a DDPG agent.

    Args:
      observation_spec: A TensorSpec defining the observations.
      action_spec: A BoundedTensorSpec defining the actions.
      actor_net: A callable that creates the actor network. Must take the
        following arguments: states, num_actions. Please see
        networks.actor_net for an example.
      critic_net: A callable that creates the critic network. Must take the
        following arguments: states, actions. Please see networks.critic_net
        for an example.
      td_errors_loss: A callable defining the loss function for the critic
        TD error.
      dqda_clipping: (float) clips the gradient dqda element-wise between
        [-dqda_clipping, dqda_clipping]. Does not perform clipping if
        dqda_clipping == 0.
      actions_regularizer: A scalar; when positive, penalizes the norm of the
        actions. This can prevent saturation of actions for the actor_loss.
      target_q_clipping: (tuple of floats) clips target q values within
        (low, high) values when computing the critic loss.
      residual_phi: (float) [0.0, 1.0] Residual algorithm parameter that
        interpolates between Q-learning and the residual gradient algorithm.
        http://www.leemon.com/papers/1995b.pdf
      debug_summaries: If True, add summaries to help debug behavior.
    Raises:
      ValueError: If `dqda_clipping` is < 0.
    """
    self._observation_spec = observation_spec[0]
    self._action_spec = action_spec[0]
    self._state_shape = tf.TensorShape([None]).concatenate(
        self._observation_spec.shape)
    self._action_shape = tf.TensorShape([None]).concatenate(
        self._action_spec.shape)
    self._num_action_dims = self._action_spec.shape.num_elements()
    self._scope = tf.get_variable_scope().name
    self._actor_net = tf.make_template(
        self.ACTOR_NET_SCOPE, actor_net, create_scope_now_=True)
    self._critic_net = tf.make_template(
        self.CRITIC_NET_SCOPE, critic_net, create_scope_now_=True)
    self._target_actor_net = tf.make_template(
        self.TARGET_ACTOR_NET_SCOPE, actor_net, create_scope_now_=True)
    self._target_critic_net = tf.make_template(
        self.TARGET_CRITIC_NET_SCOPE, critic_net, create_scope_now_=True)
    self._td_errors_loss = td_errors_loss
    if dqda_clipping < 0:
      raise ValueError('dqda_clipping must be >= 0.')
    self._dqda_clipping = dqda_clipping
    self._actions_regularizer = actions_regularizer
    self._target_q_clipping = target_q_clipping
    self._residual_phi = residual_phi
    self._debug_summaries = debug_summaries

  def _batch_state(self, state):
    """Converts a state to a batched state.

    Args:
      state: A [num_state_dims] state tensor, or a list/tuple whose first
        element is such a tensor.
    Returns:
      A [1, num_state_dims] state tensor.
    """
    if isinstance(state, (tuple, list)):
      state = state[0]
    if state.get_shape().ndims == 1:
      state = tf.expand_dims(state, 0)
    return state

  def action(self, state):
    """Returns the next action for the state.

    Args:
      state: A [num_state_dims] tensor representing a state.
    Returns:
      A [num_action_dims] tensor representing the action.
    """
    return self.actor_net(self._batch_state(state), stop_gradients=True)[0, :]

  def sample_action(self, state, stddev=1.0):
    """Returns the action for the state with additive Gaussian noise.

    Args:
      state: A [num_state_dims] tensor representing a state.
      stddev: Standard deviation of the additive Gaussian exploration noise.
    Returns:
      A [num_action_dims] action tensor, clipped to the action spec.
    """
    agent_action = self.action(state)
    agent_action += tf.random_normal(tf.shape(agent_action)) * stddev
    return utils.clip_to_spec(agent_action, self._action_spec)
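
  # Illustrative usage, reusing the names from the class docstring example
  # (the stddev value is an arbitrary example, not a recommendation):
  #   action = ddpg_agent.sample_action(state, stddev=0.1)
  #   transition_type, reward, discount = tf_env.step([action])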

  def actor_net(self, states, stop_gradients=False):
    """Returns the output of the actor network.

    Args:
      states: A [batch_size, num_state_dims] tensor representing a batch
        of states.
      stop_gradients: (boolean) if true, gradients cannot be propagated
        through this operation.
    Returns:
      A [batch_size, num_action_dims] tensor of actions.
    Raises:
      ValueError: If `states` does not have the expected dimensions.
    """
    self._validate_states(states)
    actions = self._actor_net(states, self._action_spec)
    if stop_gradients:
      actions = tf.stop_gradient(actions)
    return actions

  def critic_net(self, states, actions, for_critic_loss=False):
    """Returns the output of the critic network.

    Args:
      states: A [batch_size, num_state_dims] tensor representing a batch
        of states.
      actions: A [batch_size, num_action_dims] tensor representing a batch
        of actions.
      for_critic_loss: (boolean) Passed through to the critic network;
        indicates whether the output is used to compute the critic loss.
    Returns:
      q values: A [batch_size] tensor of q values.
    Raises:
      ValueError: If `states` or `actions` do not have the expected dimensions.
    """
    self._validate_states(states)
    self._validate_actions(actions)
    return self._critic_net(states, actions,
                            for_critic_loss=for_critic_loss)

  def target_actor_net(self, states):
    """Returns the output of the target actor network.

    The target network is used to compute stable targets for training.

    Args:
      states: A [batch_size, num_state_dims] tensor representing a batch
        of states.
    Returns:
      A [batch_size, num_action_dims] tensor of actions.
    Raises:
      ValueError: If `states` does not have the expected dimensions.
    """
    self._validate_states(states)
    actions = self._target_actor_net(states, self._action_spec)
    return tf.stop_gradient(actions)

  def target_critic_net(self, states, actions, for_critic_loss=False):
    """Returns the output of the target critic network.

    The target network is used to compute stable targets for training.

    Args:
      states: A [batch_size, num_state_dims] tensor representing a batch
        of states.
      actions: A [batch_size, num_action_dims] tensor representing a batch
        of actions.
      for_critic_loss: (boolean) Passed through to the critic network;
        indicates whether the output is used to compute the critic loss.
    Returns:
      q values: A [batch_size] tensor of q values.
    Raises:
      ValueError: If `states` or `actions` do not have the expected dimensions.
    """
    self._validate_states(states)
    self._validate_actions(actions)
    return tf.stop_gradient(
        self._target_critic_net(states, actions,
                                for_critic_loss=for_critic_loss))

  def value_net(self, states, for_critic_loss=False):
    """Returns the output of the critic evaluated with the actor.

    Args:
      states: A [batch_size, num_state_dims] tensor representing a batch
        of states.
    Returns:
      q values: A [batch_size] tensor of q values.
    """
    actions = self.actor_net(states)
    return self.critic_net(states, actions,
                           for_critic_loss=for_critic_loss)

  def target_value_net(self, states, for_critic_loss=False):
    """Returns the output of the target critic evaluated with the target actor.

    Args:
      states: A [batch_size, num_state_dims] tensor representing a batch
        of states.
    Returns:
      q values: A [batch_size] tensor of q values.
    """
    target_actions = self.target_actor_net(states)
    return self.target_critic_net(states, target_actions,
                                  for_critic_loss=for_critic_loss)

  def critic_loss(self, states, actions, rewards, discounts,
                  next_states):
    """Computes a loss for training the critic network.

    The loss is the td_errors_loss (Huber loss by default) between the
    critic's Q-value predictions and one-step TD targets computed from the
    target networks.

    Args:
      states: A [batch_size, num_state_dims] tensor representing a batch
        of states.
      actions: A [batch_size, num_action_dims] tensor representing a batch
        of actions.
      rewards: A [batch_size, ...] tensor representing a batch of rewards,
        broadcastable to the critic net output.
      discounts: A [batch_size, ...] tensor representing a batch of discounts,
        broadcastable to the critic net output.
      next_states: A [batch_size, num_state_dims] tensor representing a batch
        of next states.
    Returns:
      A rank-0 tensor representing the critic loss.
    Raises:
      ValueError: If any of the inputs do not have the expected dimensions, or
        if their batch_sizes do not match.
    """
    self._validate_states(states)
    self._validate_actions(actions)
    self._validate_states(next_states)
    target_q_values = self.target_value_net(next_states, for_critic_loss=True)
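    # One-step TD target: reward + discount * Q_target(next_state,
    # pi_target(next_state)); the per-sample discounts tensor plays the
    # role of gamma.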
    td_targets = target_q_values * discounts + rewards
    if self._target_q_clipping is not None:
      td_targets = tf.clip_by_value(td_targets, self._target_q_clipping[0],
                                    self._target_q_clipping[1])
    q_values = self.critic_net(states, actions, for_critic_loss=True)
    td_errors = td_targets - q_values
    if self._debug_summaries:
      gen_debug_td_error_summaries(
          target_q_values, q_values, td_targets, td_errors)
    loss = self._td_errors_loss(td_targets, q_values)
    if self._residual_phi > 0.0:  # compute residual gradient loss
      residual_q_values = self.value_net(next_states, for_critic_loss=True)
      residual_td_targets = residual_q_values * discounts + rewards
      if self._target_q_clipping is not None:
        residual_td_targets = tf.clip_by_value(residual_td_targets,
                                               self._target_q_clipping[0],
                                               self._target_q_clipping[1])
      residual_td_errors = residual_td_targets - q_values
      residual_loss = self._td_errors_loss(
          residual_td_targets, residual_q_values)
      loss = (loss * (1.0 - self._residual_phi) +
              residual_loss * self._residual_phi)
    return loss

  def actor_loss(self, states):
    """Computes a loss for training the actor network.

    Note that the output does not represent an actual loss. It is called a
    loss only in the sense that its gradient w.r.t. the actor network weights
    is the correct gradient for training the actor network,
    i.e. dloss/dweights = (dq/da) * (da/dweights),
    which is the gradient used in Algorithm 1 of Lillicrap et al.

    Args:
      states: A [batch_size, num_state_dims] tensor representing a batch
        of states.
    Returns:
      A rank-0 tensor representing the actor loss.
    Raises:
      ValueError: If `states` does not have the expected dimensions.
    """
    self._validate_states(states)
    actions = self.actor_net(states, stop_gradients=False)
    critic_values = self.critic_net(states, actions)
    q_values = self.critic_function(critic_values, states)
    dqda = tf.gradients([q_values], [actions])[0]
    dqda_unclipped = dqda
    if self._dqda_clipping > 0:
      dqda = tf.clip_by_value(dqda, -self._dqda_clipping, self._dqda_clipping)
    actions_norm = tf.norm(actions)
    if self._debug_summaries:
      with tf.name_scope('dqda'):
        tf.summary.scalar('actions_norm', actions_norm)
        tf.summary.histogram('dqda', dqda)
        tf.summary.histogram('dqda_unclipped', dqda_unclipped)
        tf.summary.histogram('actions', actions)
        for a in range(self._num_action_dims):
          tf.summary.histogram('dqda_unclipped_%d' % a, dqda_unclipped[:, a])
          tf.summary.histogram('dqda_%d' % a, dqda[:, a])
    actions_norm *= self._actions_regularizer
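    # Surrogate loss: with labels stop_gradient(dqda + actions) and
    # predictions actions, the squared-error gradient w.r.t. actions is
    # proportional to -dqda, so minimizing this loss moves the actions (and,
    # via the chain rule, the actor weights) in the direction that increases
    # the critic's Q values, as described in the docstring above.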
    return slim.losses.mean_squared_error(tf.stop_gradient(dqda + actions),
                                          actions,
                                          scope='actor_loss') + actions_norm

  def critic_function(self, critic_values, states, weights=None):
    """Computes q values based on critic_net outputs, states, and weights.

    Args:
      critic_values: A tf.float32 [batch_size, ...] tensor representing
        outputs from the critic net.
      states: A [batch_size, num_state_dims] tensor representing a batch
        of states.
      weights: A list or Numpy array or tensor with a shape broadcastable to
        `critic_values`.
    Returns:
      A tf.float32 [batch_size] tensor representing q values.
    """
    del states  # unused args
    if weights is not None:
      weights = tf.convert_to_tensor(weights, dtype=critic_values.dtype)
      critic_values *= weights
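    # Collapse any non-batch dimensions so the result is a [batch_size]
    # vector of scalar q values.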
    if critic_values.shape.ndims > 1:
      critic_values = tf.reduce_sum(critic_values,
                                    list(range(1, critic_values.shape.ndims)))
    critic_values.shape.assert_has_rank(1)
    return critic_values

  def update_targets(self, tau=1.0):
    """Performs a soft update of the target network parameters.

    For each weight w_s in the actor/critic networks, and its corresponding
    weight w_t in the target actor/critic networks, a soft update is:
    w_t = (1 - tau) * w_t + tau * w_s

    Args:
      tau: A float scalar in [0, 1]
    Returns:
      An operation that performs a soft update of the target network
      parameters.
    Raises:
      ValueError: If `tau` is not in [0, 1].
    """
    if tau < 0 or tau > 1:
      raise ValueError('Input `tau` should be in [0, 1].')
    update_actor = utils.soft_variables_update(
        slim.get_trainable_variables(
            utils.join_scope(self._scope, self.ACTOR_NET_SCOPE)),
        slim.get_trainable_variables(
            utils.join_scope(self._scope, self.TARGET_ACTOR_NET_SCOPE)),
        tau)
    update_critic = utils.soft_variables_update(
        slim.get_trainable_variables(
            utils.join_scope(self._scope, self.CRITIC_NET_SCOPE)),
        slim.get_trainable_variables(
            utils.join_scope(self._scope, self.TARGET_CRITIC_NET_SCOPE)),
        tau)
    return tf.group(update_actor, update_critic, name='update_targets')
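
  # Illustrative usage: tau=1.0 copies the online weights into the targets
  # (e.g. once at initialization), while a small tau such as 0.005 yields the
  # slowly-moving targets typically used during training:
  #   init_targets_op = ddpg_agent.update_targets(tau=1.0)
  #   soft_update_op = ddpg_agent.update_targets(tau=0.005)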

  def get_trainable_critic_vars(self):
    """Returns a list of trainable variables in the critic network.

    Returns:
      A list of trainable variables in the critic network.
    """
    return slim.get_trainable_variables(
        utils.join_scope(self._scope, self.CRITIC_NET_SCOPE))

  def get_trainable_actor_vars(self):
    """Returns a list of trainable variables in the actor network.

    Returns:
      A list of trainable variables in the actor network.
    """
    return slim.get_trainable_variables(
        utils.join_scope(self._scope, self.ACTOR_NET_SCOPE))

  def get_critic_vars(self):
    """Returns a list of all variables in the critic network.

    Returns:
      A list of all variables in the critic network.
    """
    return slim.get_model_variables(
        utils.join_scope(self._scope, self.CRITIC_NET_SCOPE))

  def get_actor_vars(self):
    """Returns a list of all variables in the actor network.

    Returns:
      A list of all variables in the actor network.
    """
    return slim.get_model_variables(
        utils.join_scope(self._scope, self.ACTOR_NET_SCOPE))

  def _validate_states(self, states):
    """Raises a value error if `states` does not have the expected shape.

    Args:
      states: A tensor.
    Raises:
      ValueError: If states.shape or states.dtype are not compatible with
        observation_spec.
    """
    states.shape.assert_is_compatible_with(self._state_shape)
    if not states.dtype.is_compatible_with(self._observation_spec.dtype):
      raise ValueError('states.dtype={} is not compatible with'
                       ' observation_spec.dtype={}'.format(
                           states.dtype, self._observation_spec.dtype))

  def _validate_actions(self, actions):
    """Raises a value error if `actions` does not have the expected shape.

    Args:
      actions: A tensor.
    Raises:
      ValueError: If actions.shape or actions.dtype are not compatible with
        action_spec.
    """
    actions.shape.assert_is_compatible_with(self._action_shape)
    if not actions.dtype.is_compatible_with(self._action_spec.dtype):
      raise ValueError('actions.dtype={} is not compatible with'
                       ' action_spec.dtype={}'.format(
                           actions.dtype, self._action_spec.dtype))


class TD3Agent(DdpgAgent):
  """An RL agent that learns using the TD3 algorithm.

  TD3 extends DDPG with a pair of critic networks; the target value used in
  the critic loss is the minimum of the two target critics, evaluated at a
  noise-perturbed target action (target policy smoothing).
  """

  ACTOR_NET_SCOPE = 'actor_net'
  CRITIC_NET_SCOPE = 'critic_net'
  CRITIC_NET2_SCOPE = 'critic_net2'
  TARGET_ACTOR_NET_SCOPE = 'target_actor_net'
  TARGET_CRITIC_NET_SCOPE = 'target_critic_net'
  TARGET_CRITIC_NET2_SCOPE = 'target_critic_net2'

  def __init__(self,
               observation_spec,
               action_spec,
               actor_net=networks.actor_net,
               critic_net=networks.critic_net,
               td_errors_loss=tf.losses.huber_loss,
               dqda_clipping=0.,
               actions_regularizer=0.,
               target_q_clipping=None,
               residual_phi=0.0,
               debug_summaries=False):
    """Constructs a TD3 agent.

    Args:
      observation_spec: A TensorSpec defining the observations.
      action_spec: A BoundedTensorSpec defining the actions.
      actor_net: A callable that creates the actor network. Must take the
        following arguments: states, num_actions. Please see
        networks.actor_net for an example.
      critic_net: A callable that creates the critic network. Must take the
        following arguments: states, actions. Please see networks.critic_net
        for an example.
      td_errors_loss: A callable defining the loss function for the critic
        TD error.
      dqda_clipping: (float) clips the gradient dqda element-wise between
        [-dqda_clipping, dqda_clipping]. Does not perform clipping if
        dqda_clipping == 0.
      actions_regularizer: A scalar; when positive, penalizes the norm of the
        actions. This can prevent saturation of actions for the actor_loss.
      target_q_clipping: (tuple of floats) clips target q values within
        (low, high) values when computing the critic loss.
      residual_phi: (float) [0.0, 1.0] Residual algorithm parameter that
        interpolates between Q-learning and the residual gradient algorithm.
        http://www.leemon.com/papers/1995b.pdf
      debug_summaries: If True, add summaries to help debug behavior.
    Raises:
      ValueError: If `dqda_clipping` is < 0.
    """
    self._observation_spec = observation_spec[0]
    self._action_spec = action_spec[0]
    self._state_shape = tf.TensorShape([None]).concatenate(
        self._observation_spec.shape)
    self._action_shape = tf.TensorShape([None]).concatenate(
        self._action_spec.shape)
    self._num_action_dims = self._action_spec.shape.num_elements()
    self._scope = tf.get_variable_scope().name
    self._actor_net = tf.make_template(
        self.ACTOR_NET_SCOPE, actor_net, create_scope_now_=True)
    self._critic_net = tf.make_template(
        self.CRITIC_NET_SCOPE, critic_net, create_scope_now_=True)
    self._critic_net2 = tf.make_template(
        self.CRITIC_NET2_SCOPE, critic_net, create_scope_now_=True)
    self._target_actor_net = tf.make_template(
        self.TARGET_ACTOR_NET_SCOPE, actor_net, create_scope_now_=True)
    self._target_critic_net = tf.make_template(
        self.TARGET_CRITIC_NET_SCOPE, critic_net, create_scope_now_=True)
    self._target_critic_net2 = tf.make_template(
        self.TARGET_CRITIC_NET2_SCOPE, critic_net, create_scope_now_=True)
    self._td_errors_loss = td_errors_loss
    if dqda_clipping < 0:
      raise ValueError('dqda_clipping must be >= 0.')
    self._dqda_clipping = dqda_clipping
    self._actions_regularizer = actions_regularizer
    self._target_q_clipping = target_q_clipping
    self._residual_phi = residual_phi
    self._debug_summaries = debug_summaries

  def get_trainable_critic_vars(self):
    """Returns a list of trainable variables in the critic network.

    NOTE: This gets the vars of both critic networks.

    Returns:
      A list of trainable variables in the critic network.
    """
    return (
        slim.get_trainable_variables(
            utils.join_scope(self._scope, self.CRITIC_NET_SCOPE)))

  def critic_net(self, states, actions, for_critic_loss=False):
    """Returns the output of the critic network(s).

    Args:
      states: A [batch_size, num_state_dims] tensor representing a batch
        of states.
      actions: A [batch_size, num_action_dims] tensor representing a batch
        of actions.
      for_critic_loss: (boolean) If True, returns the outputs of both critic
        networks; otherwise only the first critic's output is returned.
    Returns:
      q values: A [batch_size] tensor of q values, or a pair of such tensors
        when `for_critic_loss` is True.
    Raises:
      ValueError: If `states` or `actions` do not have the expected dimensions.
    """
    self._validate_states(states)
    self._validate_actions(actions)
    values1 = self._critic_net(states, actions,
                               for_critic_loss=for_critic_loss)
    values2 = self._critic_net2(states, actions,
                                for_critic_loss=for_critic_loss)
    if for_critic_loss:
      return values1, values2
    return values1

  def target_critic_net(self, states, actions, for_critic_loss=False):
    """Returns the output of the target critic network(s).

    The target network is used to compute stable targets for training.

    Args:
      states: A [batch_size, num_state_dims] tensor representing a batch
        of states.
      actions: A [batch_size, num_action_dims] tensor representing a batch
        of actions.
      for_critic_loss: (boolean) If True, returns the outputs of both target
        critic networks; otherwise only the first target critic's output is
        returned.
    Returns:
      q values: A [batch_size] tensor of q values, or a pair of such tensors
        when `for_critic_loss` is True.
    Raises:
      ValueError: If `states` or `actions` do not have the expected dimensions.
    """
    self._validate_states(states)
    self._validate_actions(actions)
    values1 = tf.stop_gradient(
        self._target_critic_net(states, actions,
                                for_critic_loss=for_critic_loss))
    values2 = tf.stop_gradient(
        self._target_critic_net2(states, actions,
                                 for_critic_loss=for_critic_loss))
    if for_critic_loss:
      return values1, values2
    return values1

  def value_net(self, states, for_critic_loss=False):
    """Returns the output of the critic evaluated with the actor.

    Args:
      states: A [batch_size, num_state_dims] tensor representing a batch
        of states.
    Returns:
      q values: A [batch_size] tensor of q values.
    """
    actions = self.actor_net(states)
    return self.critic_net(states, actions,
                           for_critic_loss=for_critic_loss)

  def target_value_net(self, states, for_critic_loss=False):
    """Returns the output of the target critic evaluated with the target actor.

    Args:
      states: A [batch_size, num_state_dims] tensor representing a batch
        of states.
    Returns:
      q values: A pair of identical [batch_size] tensors holding the minimum
        of the two target critics' q values.
    """
    target_actions = self.target_actor_net(states)
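    # Target policy smoothing (TD3): perturb the target action with clipped
    # Gaussian noise, then take the element-wise minimum of the two target
    # critics to reduce Q-value overestimation.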
    noise = tf.clip_by_value(
        tf.random_normal(tf.shape(target_actions), stddev=0.2), -0.5, 0.5)
    values1, values2 = self.target_critic_net(
        states, target_actions + noise,
        for_critic_loss=for_critic_loss)
    values = tf.minimum(values1, values2)
    return values, values

  def update_targets(self, tau=1.0):
    """Performs a soft update of the target network parameters.

    For each weight w_s in the actor/critic networks, and its corresponding
    weight w_t in the target actor/critic networks, a soft update is:
    w_t = (1 - tau) * w_t + tau * w_s

    Args:
      tau: A float scalar in [0, 1]
    Returns:
      An operation that performs a soft update of the target network
      parameters.
    Raises:
      ValueError: If `tau` is not in [0, 1].
    """
    if tau < 0 or tau > 1:
      raise ValueError('Input `tau` should be in [0, 1].')
    update_actor = utils.soft_variables_update(
        slim.get_trainable_variables(
            utils.join_scope(self._scope, self.ACTOR_NET_SCOPE)),
        slim.get_trainable_variables(
            utils.join_scope(self._scope, self.TARGET_ACTOR_NET_SCOPE)),
        tau)
    # NOTE: This updates both critic networks.
    update_critic = utils.soft_variables_update(
        slim.get_trainable_variables(
            utils.join_scope(self._scope, self.CRITIC_NET_SCOPE)),
        slim.get_trainable_variables(
            utils.join_scope(self._scope, self.TARGET_CRITIC_NET_SCOPE)),
        tau)
    return tf.group(update_actor, update_critic, name='update_targets')


def gen_debug_td_error_summaries(
    target_q_values, q_values, td_targets, td_errors):
  """Generates debug summaries for the critic given a set of batch samples.

  Args:
    target_q_values: set of predicted next state values.
    q_values: current predicted values from the critic network.
    td_targets: discounted target_q_values with the immediate reward added.
    td_errors: the difference between td_targets and q_values.
  """
  with tf.name_scope('td_errors'):
    tf.summary.histogram('td_targets', td_targets)
    tf.summary.histogram('q_values', q_values)
    tf.summary.histogram('target_q_values', target_q_values)
    tf.summary.histogram('td_errors', td_errors)

    with tf.name_scope('td_targets'):
      tf.summary.scalar('mean', tf.reduce_mean(td_targets))
      tf.summary.scalar('max', tf.reduce_max(td_targets))
      tf.summary.scalar('min', tf.reduce_min(td_targets))

    with tf.name_scope('q_values'):
      tf.summary.scalar('mean', tf.reduce_mean(q_values))
      tf.summary.scalar('max', tf.reduce_max(q_values))
      tf.summary.scalar('min', tf.reduce_min(q_values))

    with tf.name_scope('target_q_values'):
      tf.summary.scalar('mean', tf.reduce_mean(target_q_values))
      tf.summary.scalar('max', tf.reduce_max(target_q_values))
      tf.summary.scalar('min', tf.reduce_min(target_q_values))

    with tf.name_scope('td_errors'):
      tf.summary.scalar('mean', tf.reduce_mean(td_errors))
      tf.summary.scalar('max', tf.reduce_max(td_errors))
      tf.summary.scalar('min', tf.reduce_min(td_errors))
      tf.summary.scalar('mean_abs', tf.reduce_mean(tf.abs(td_errors)))