# Copyright 2017 Google, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities and helper functions.""" | |
from __future__ import absolute_import | |
from __future__ import division | |
from __future__ import print_function | |
import numpy as np | |
import tensorflow as tf | |


def make_finite(t, replacement):
  """Replaces non-finite tensor values with the replacement value."""
  return tf.where(tf.is_finite(t), t, replacement)
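

# Minimal usage sketch for `make_finite` (hypothetical helper and values).
# In the TF1-style `tf.where` used above, the replacement is assumed to share
# t's shape, so a `tf.zeros_like(t)` replacement is passed here.
def _example_make_finite():
  t = tf.constant([1., np.inf, np.nan])
  return make_finite(t, tf.zeros_like(t))  # -> [1., 0., 0.]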


def asinh(x):
  """Computes the inverse hyperbolic sine function (in tensorflow)."""
  return tf.log(x + tf.sqrt(1. + x ** 2))
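

# Minimal sketch of how `asinh` behaves (hypothetical helper and values): it
# is roughly linear near zero and grows only logarithmically for large inputs,
# which is why rms_scaling below uses it to squash scaled gradients.
def _example_asinh():
  x = tf.constant([0., 1., 1000.])
  return asinh(x)  # approximately [0., 0.881, 7.601]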


def affine(inputs, output_size, scope="Affine", scale=0.1, vec_mean=0.,
           include_bias=True, bias_init=0., random_seed=None):
  """Computes an affine function of the inputs.

  Creates or recalls tensorflow variables "Matrix" and "Bias"
  to generate an affine operation on the input.

  If the inputs are a list of tensors, they are concatenated together.

  Initial weights for the matrix are drawn from a Gaussian with zero
  mean and standard deviation that is the given scale divided by the
  square root of the input dimension. Initial weights for the bias are
  set to zero.

  Args:
    inputs: List of tensors with shape (batch_size, input_size)
    output_size: Size (dimension) of the output
    scope: Variable scope for these parameters (default: "Affine")
    scale: Initial weight scale for the matrix parameters (default: 0.1);
      this constant is divided by the sqrt of the input size to get the
      std. deviation of the initial weights
    vec_mean: The mean for the random initializer
    include_bias: Whether to include the bias term
    bias_init: The initial value for the bias (default: 0.)
    random_seed: Random seed for random initializers. (Default: None)

  Returns:
    output: Tensor with shape (batch_size, output_size)
  """
  # Concatenate the input arguments.
  x = tf.concat(inputs, 1)

  with tf.variable_scope(scope):
    input_size = x.get_shape().as_list()[1]
    sigma = scale / np.sqrt(input_size)
    rand_init = tf.random_normal_initializer(mean=vec_mean, stddev=sigma,
                                             seed=random_seed)

    matrix = tf.get_variable("Matrix", [input_size, output_size],
                             dtype=tf.float32, initializer=rand_init)

    if include_bias:
      bias = tf.get_variable("Bias", [output_size], dtype=tf.float32,
                             initializer=tf.constant_initializer(bias_init,
                                                                 tf.float32))
    else:
      bias = 0.

    output = tf.matmul(x, matrix) + bias

  return output
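

# Minimal usage sketch for `affine` (hypothetical helper, TF1 graph-mode).
# The batch size, feature sizes, and scope name below are arbitrary
# assumptions, not values used elsewhere in this module.
def _example_affine():
  x = tf.random_normal([8, 4])  # (batch_size=8, input_size=4)
  y = affine([x], output_size=2, scope="ExampleAffine")
  return y  # Tensor with shape (8, 2)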


def project(inputs, weights, bias=0., activation=tf.identity):
  """Computes an affine or linear projection of the inputs.

  Projects the inputs onto the given weight vector and (optionally)
  adds a bias and passes the result through an activation function.

  Args:
    inputs: matrix of inputs with shape [batch_size, dim]
    weights: weight matrix with shape [dim, output_dim]
    bias: bias vector with shape [output_dim] (default: 0)
    activation: nonlinear activation function (default: tf.identity)

  Returns:
    outputs: an op which computes activation(inputs @ weights + bias)
  """
  return activation(tf.matmul(inputs, weights) + bias)
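

# Minimal usage sketch for `project` (hypothetical helper and shapes): a
# linear map from 4 features to 2 outputs followed by a ReLU.
def _example_project():
  inputs = tf.random_normal([8, 4])
  weights = tf.random_normal([4, 2])
  bias = tf.zeros([2])
  return project(inputs, weights, bias=bias, activation=tf.nn.relu)  # (8, 2)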


def new_mean_squared(grad_vec, decay, ms):
  """Calculates the new accumulated mean squared of the gradient.

  Args:
    grad_vec: the vector for the current gradient
    decay: the decay term
    ms: the previous mean_squared value

  Returns:
    the new mean_squared value
  """
  decay_size = decay.get_shape().num_elements()
  decay_check_ops = [
      tf.assert_less_equal(decay, 1., summarize=decay_size),
      tf.assert_greater_equal(decay, 0., summarize=decay_size)]

  with tf.control_dependencies(decay_check_ops):
    grad_squared = tf.square(grad_vec)

  # If the previous mean_squared is the 0 vector, don't use the decay and just
  # return the full grad_squared. This should only happen on the first timestep.
  decay = tf.cond(tf.reduce_all(tf.equal(ms, 0.)),
                  lambda: tf.zeros_like(decay, dtype=tf.float32), lambda: decay)

  # Update the running average of squared gradients.
  epsilon = 1e-12
  return (1. - decay) * (grad_squared + epsilon) + decay * ms
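

# Minimal sketch of the recurrence above (hypothetical helper and values).
# With decay d, the update is ms_new = (1 - d) * (g**2 + 1e-12) + d * ms,
# except on the first step (ms == 0), where ms_new is simply g**2 + 1e-12.
def _example_new_mean_squared():
  grad_vec = tf.constant([[0.5], [2.0]])
  decay = tf.constant(0.9)
  ms = tf.zeros_like(grad_vec)  # first step, so the decay term is ignored
  return new_mean_squared(grad_vec, decay, ms)  # approximately [[0.25], [4.0]]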


def rms_scaling(gradient, decay, ms, update_ms=True):
  """Vectorizes and scales a tensor of gradients.

  Args:
    gradient: the current gradient
    decay: the current decay value.
    ms: the previous mean squared value
    update_ms: Whether to update the mean squared value (default: True)

  Returns:
    The scaled gradient, and the updated ms value if update_ms is True
    (the previous ms value otherwise).
  """
  # Vectorize the gradients and compute the squared gradients.
  grad_vec = tf.reshape(gradient, [-1, 1])

  if update_ms:
    ms = new_mean_squared(grad_vec, decay, ms)

  # Scale the current gradients by the RMS, squashed by the asinh function.
  scaled_gradient = asinh(grad_vec / tf.sqrt(ms + 1e-16))

  return scaled_gradient, ms
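

# Minimal usage sketch for `rms_scaling` (hypothetical helper and shapes): the
# gradient is flattened to a column vector, divided by the root of the running
# mean squared value, and squashed with asinh.
def _example_rms_scaling():
  gradient = tf.random_normal([3, 4])
  decay = tf.constant(0.9)
  ms = tf.zeros([12, 1])  # matches the flattened gradient shape
  scaled_grad, new_ms = rms_scaling(gradient, decay, ms)
  return scaled_grad, new_ms  # both have shape (12, 1)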


def accumulate_sparse_gradients(grad):
  """Accumulates repeated indices of a sparse gradient update.

  Args:
    grad: a tf.IndexedSlices gradient

  Returns:
    grad_indices: unique indices
    grad_values: gradient values corresponding to the indices
  """
  grad_indices, grad_segments = tf.unique(grad.indices)
  grad_values = tf.unsorted_segment_sum(grad.values, grad_segments,
                                        tf.shape(grad_indices)[0])
  return grad_indices, grad_values
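

# Minimal sketch for `accumulate_sparse_gradients` (hypothetical helper and
# values): two updates to row 0 are summed into a single slice for the unique
# index 0.
def _example_accumulate_sparse_gradients():
  grad = tf.IndexedSlices(
      values=tf.constant([[1., 1.], [2., 2.], [3., 3.]]),
      indices=tf.constant([0, 3, 0]),
      dense_shape=tf.constant([5, 2]))
  grad_indices, grad_values = accumulate_sparse_gradients(grad)
  return grad_indices, grad_values  # [0, 3] and [[4., 4.], [2., 2.]]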


def slice_tensor(dense_tensor, indices, head_dims):
  """Extracts slices from a partially flattened dense tensor.

  indices is assumed to index into the first dimension of head_dims.
  dense_tensor is assumed to have a shape [D_0, D_1, ...] such that
  prod(head_dims) == D_0. This function will extract slices along the
  first dimension of head_dims.

  Example:

    Consider a tensor with shape head_dims = [100, 2] and a dense_tensor with
    shape [200, 3]. Note that the first dimension of dense_tensor equals the
    product of head_dims. This function will reshape dense_tensor such that
    its shape is now [100, 2, 3] (i.e. the first dimension became head_dims)
    and then slice it along the first dimension. After slicing, the slices
    will have their initial dimensions flattened just as they were in
    dense_tensor (e.g. if there are 4 indices, the return value will have a
    shape of [8, 3]).

  Args:
    dense_tensor: a N-D dense tensor. Shape: [D_0, D_1, ...]
    indices: a 1-D integer tensor. Shape: [K]
    head_dims: True dimensions of the dense_tensor's first dimension.

  Returns:
    Extracted slices. Shape: [K * prod(head_dims[1:]), D_1, ...]
  """
  tail_dims = tf.shape(dense_tensor)[1:]
  dense_tensor = tf.reshape(dense_tensor,
                            tf.concat([head_dims, tail_dims], 0))

  slices = tf.gather(dense_tensor, indices)
  # NOTE(siege): This kills the shape annotation.
  return tf.reshape(slices, tf.concat([[-1], tail_dims], 0))
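

# Minimal sketch mirroring the docstring example above (hypothetical helper
# and values): a [200, 3] tensor is viewed as [100, 2, 3] via
# head_dims = [100, 2], and the 4 requested leading indices come back
# flattened to shape (8, 3).
def _example_slice_tensor():
  dense_tensor = tf.random_normal([200, 3])
  head_dims = tf.constant([100, 2])
  indices = tf.constant([0, 5, 17, 99])
  return slice_tensor(dense_tensor, indices, head_dims)  # shape (8, 3)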


def stack_tensor(slices, indices, dense_tensor, head_dims):
  """Reconstitutes a tensor from slices and corresponding indices.

  This is an inverse operation to slice_tensor. Missing slices are set to 0.

  Args:
    slices: a tensor. Shape [K, D_1, ...]
    indices: a 1-D integer tensor. Shape: [K]
    dense_tensor: the original tensor the slices were taken
      from. Shape: [D_0, D_1, ...]
    head_dims: True dimensions of the dense_tensor's first dimension.

  Returns:
    Reconstituted tensor. Shape: [D_0, D_1, ...]
  """
  # NOTE(siege): This cast shouldn't be necessary.
  indices = tf.cast(indices, tf.int32)
  tail_dims = tf.shape(dense_tensor)[1:]
  dense_shape = tf.concat([head_dims, tail_dims], 0)

  slices = tf.reshape(slices, tf.concat([[-1], dense_shape[1:]], 0))
  indices = tf.expand_dims(indices, -1)

  return tf.reshape(tf.scatter_nd(indices, slices, dense_shape),
                    tf.shape(dense_tensor))
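

# Minimal sketch showing `stack_tensor` as the inverse of `slice_tensor`
# (hypothetical helper and values): rows not covered by `indices` come back
# as zeros.
def _example_stack_tensor():
  dense_tensor = tf.random_normal([200, 3])
  head_dims = tf.constant([100, 2])
  indices = tf.constant([0, 5, 17, 99])
  slices = slice_tensor(dense_tensor, indices, head_dims)  # shape (8, 3)
  return stack_tensor(slices, indices, dense_tensor, head_dims)  # (200, 3)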


def update_slices(slices, indices, dense_tensor, head_dims):
  """Reconstitutes a tensor from slices and corresponding indices.

  Like stack_tensor, but instead of setting missing slices to 0, sets them to
  what they were in the original tensor. The return value is reshaped to be
  the same as dense_tensor.

  Args:
    slices: a tensor. Shape [K, D_1, ...]
    indices: a 1-D integer tensor. Shape: [K]
    dense_tensor: the original tensor the slices were taken
      from. Shape: [D_0, D_1, ...]
    head_dims: True dimensions of the dense_tensor's first dimension.

  Returns:
    Reconstituted tensor. Shape: [D_0, D_1, ...]
  """
  # NOTE(siege): This cast shouldn't be necessary.
  indices = tf.cast(indices, tf.int32)

  tail_dims = tf.shape(dense_tensor)[1:]
  dense_shape = tf.concat([head_dims, tail_dims], 0)

  update_mask_vals = tf.fill(tf.shape(indices), 1)
  reshaped_indices = tf.expand_dims(indices, -1)
  update_mask = tf.equal(
      tf.scatter_nd(reshaped_indices, update_mask_vals, head_dims[:1]), 1)

  reshaped_dense_slices = tf.reshape(
      stack_tensor(slices, indices, dense_tensor, head_dims), dense_shape)
  reshaped_dense_tensor = tf.reshape(dense_tensor, dense_shape)

  return tf.reshape(
      tf.where(update_mask, reshaped_dense_slices, reshaped_dense_tensor),
      tf.shape(dense_tensor))
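

# Minimal sketch for `update_slices` (hypothetical helper and values): rows
# addressed by `indices` are overwritten with `slices`; every other row keeps
# its value from `dense_tensor`.
def _example_update_slices():
  dense_tensor = tf.random_normal([200, 3])
  head_dims = tf.constant([100, 2])
  indices = tf.constant([0, 5, 17, 99])
  new_slices = tf.zeros([8, 3])  # replacement values for the 4 indices
  # The result has shape (200, 3), matching dense_tensor.
  return update_slices(new_slices, indices, dense_tensor, head_dims)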