# Lint as: python2, python3
# Copyright 2019 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Augments slim.conv2d with optional Weight Standardization (WS).

WS is a normalization method to accelerate micro-batch training. When used with
Group Normalization and trained with 1 image/GPU, WS is able to match or
outperform the performance of BN trained with large batch sizes.

[1] Siyuan Qiao, Huiyu Wang, Chenxi Liu, Wei Shen, Alan Yuille
    Weight Standardization. arXiv:1903.10520
[2] Lei Huang, Xianglong Liu, Yang Liu, Bo Lang, Dacheng Tao
    Centered Weight Normalization in Accelerating Training of Deep Neural
    Networks. ICCV 2017
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from tensorflow.contrib import framework as contrib_framework
from tensorflow.contrib import layers as contrib_layers
from tensorflow.contrib.layers.python.layers import layers
from tensorflow.contrib.layers.python.layers import utils


class Conv2D(tf.keras.layers.Conv2D, tf.layers.Layer):
  """2D convolution layer (e.g. spatial convolution over images).

  This layer creates a convolution kernel that is convolved (actually
  cross-correlated) with the layer input to produce a tensor of outputs. If
  `use_bias` is True (and a `bias_initializer` is provided), a bias vector is
  created and added to the outputs. Finally, if `activation` is not `None`, it
  is applied to the outputs as well.
  """

  def __init__(self,
               filters,
               kernel_size,
               strides=(1, 1),
               padding='valid',
               data_format='channels_last',
               dilation_rate=(1, 1),
               activation=None,
               use_bias=True,
               kernel_initializer=None,
               bias_initializer=tf.zeros_initializer(),
               kernel_regularizer=None,
               bias_regularizer=None,
               use_weight_standardization=False,
               activity_regularizer=None,
               kernel_constraint=None,
               bias_constraint=None,
               trainable=True,
               name=None,
               **kwargs):
    """Constructs the 2D convolution layer.

    Args:
      filters: Integer, the dimensionality of the output space (i.e. the number
        of filters in the convolution).
      kernel_size: An integer or tuple/list of 2 integers, specifying the
        height and width of the 2D convolution window. Can be a single integer
        to specify the same value for all spatial dimensions.
      strides: An integer or tuple/list of 2 integers, specifying the strides
        of the convolution along the height and width. Can be a single integer
        to specify the same value for all spatial dimensions. Specifying any
        stride value != 1 is incompatible with specifying any `dilation_rate`
        value != 1.
      padding: One of `"valid"` or `"same"` (case-insensitive).
      data_format: A string, one of `channels_last` (default) or
        `channels_first`. The ordering of the dimensions in the inputs.
        `channels_last` corresponds to inputs with shape `(batch, height,
        width, channels)` while `channels_first` corresponds to inputs with
        shape `(batch, channels, height, width)`.
      dilation_rate: An integer or tuple/list of 2 integers, specifying the
        dilation rate to use for dilated convolution. Can be a single integer
        to specify the same value for all spatial dimensions. Currently,
        specifying any `dilation_rate` value != 1 is incompatible with
        specifying any stride value != 1.
      activation: Activation function. Set it to None to maintain a linear
        activation.
      use_bias: Boolean, whether the layer uses a bias.
      kernel_initializer: An initializer for the convolution kernel.
      bias_initializer: An initializer for the bias vector. If None, the
        default initializer will be used.
      kernel_regularizer: Optional regularizer for the convolution kernel.
      bias_regularizer: Optional regularizer for the bias vector.
      use_weight_standardization: Boolean, whether the layer uses weight
        standardization.
      activity_regularizer: Optional regularizer function for the output.
      kernel_constraint: Optional projection function to be applied to the
        kernel after being updated by an `Optimizer` (e.g. used to implement
        norm constraints or value constraints for layer weights). The function
        must take as input the unprojected variable and must return the
        projected variable (which must have the same shape). Constraints are
        not safe to use when doing asynchronous distributed training.
      bias_constraint: Optional projection function to be applied to the bias
        after being updated by an `Optimizer`.
      trainable: Boolean, if `True` also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
      name: A string, the name of the layer.
      **kwargs: Arbitrary keyword arguments passed to tf.keras.layers.Conv2D.
    """
    super(Conv2D, self).__init__(
        filters=filters,
        kernel_size=kernel_size,
        strides=strides,
        padding=padding,
        data_format=data_format,
        dilation_rate=dilation_rate,
        activation=activation,
        use_bias=use_bias,
        kernel_initializer=kernel_initializer,
        bias_initializer=bias_initializer,
        kernel_regularizer=kernel_regularizer,
        bias_regularizer=bias_regularizer,
        activity_regularizer=activity_regularizer,
        kernel_constraint=kernel_constraint,
        bias_constraint=bias_constraint,
        trainable=trainable,
        name=name,
        **kwargs)
    self.use_weight_standardization = use_weight_standardization

  def call(self, inputs):
    if self.use_weight_standardization:
      # Standardize the kernel over its (height, width, in_channels) axes so
      # that each output filter has zero mean and unit variance.
      mean, var = tf.nn.moments(self.kernel, [0, 1, 2], keep_dims=True)
      kernel = (self.kernel - mean) / tf.sqrt(var + 1e-5)
      outputs = self._convolution_op(inputs, kernel)
    else:
      outputs = self._convolution_op(inputs, self.kernel)

    if self.use_bias:
      if self.data_format == 'channels_first':
        if self.rank == 1:
          # tf.nn.bias_add does not accept a 1D input tensor.
          bias = tf.reshape(self.bias, (1, self.filters, 1))
          outputs += bias
        else:
          outputs = tf.nn.bias_add(outputs, self.bias, data_format='NCHW')
      else:
        outputs = tf.nn.bias_add(outputs, self.bias, data_format='NHWC')

    if self.activation is not None:
      return self.activation(outputs)
    return outputs


@contrib_framework.add_arg_scope
def conv2d(inputs,
           num_outputs,
           kernel_size,
           stride=1,
           padding='SAME',
           data_format=None,
           rate=1,
           activation_fn=tf.nn.relu,
           normalizer_fn=None,
           normalizer_params=None,
           weights_initializer=contrib_layers.xavier_initializer(),
           weights_regularizer=None,
           biases_initializer=tf.zeros_initializer(),
           biases_regularizer=None,
           use_weight_standardization=False,
           reuse=None,
           variables_collections=None,
           outputs_collections=None,
           trainable=True,
           scope=None):
  """Adds a 2D convolution followed by an optional batch_norm layer.
  `convolution` creates a variable called `weights`, representing the
  convolutional kernel, that is convolved (actually cross-correlated) with the
  `inputs` to produce a `Tensor` of activations. If a `normalizer_fn` is
  provided (such as `batch_norm`), it is then applied. Otherwise, if
  `normalizer_fn` is None and a `biases_initializer` is provided then a
  `biases` variable is created and added to the activations. Finally, if
  `activation_fn` is not `None`, it is applied to the activations as well.

  Performs atrous convolution with input stride/dilation rate equal to `rate`
  if a value > 1 is specified for any dimension of `rate`. In this case
  `stride` values != 1 are not supported.

  Args:
    inputs: A Tensor of rank N+2 of shape `[batch_size] + input_spatial_shape +
      [in_channels]` if data_format does not start with "NC" (default), or
      `[batch_size, in_channels] + input_spatial_shape` if data_format starts
      with "NC".
    num_outputs: Integer, the number of output filters.
    kernel_size: A sequence of N positive integers specifying the spatial
      dimensions of the filters. Can be a single integer to specify the same
      value for all spatial dimensions.
    stride: A sequence of N positive integers specifying the stride at which to
      compute output. Can be a single integer to specify the same value for all
      spatial dimensions. Specifying any `stride` value != 1 is incompatible
      with specifying any `rate` value != 1.
    padding: One of `"VALID"` or `"SAME"`.
    data_format: A string or None. Specifies whether the channel dimension of
      the `input` and output is the last dimension (default, or if
      `data_format` does not start with "NC"), or the second dimension (if
      `data_format` starts with "NC"). For N=1, the valid values are "NWC"
      (default) and "NCW". For N=2, the valid values are "NHWC" (default) and
      "NCHW". For N=3, the valid values are "NDHWC" (default) and "NCDHW".
    rate: A sequence of N positive integers specifying the dilation rate to use
      for atrous convolution. Can be a single integer to specify the same value
      for all spatial dimensions. Specifying any `rate` value != 1 is
      incompatible with specifying any `stride` value != 1.
    activation_fn: Activation function. The default value is a ReLU function.
      Explicitly set it to None to skip it and maintain a linear activation.
    normalizer_fn: Normalization function to use instead of `biases`. If
      `normalizer_fn` is provided then `biases_initializer` and
      `biases_regularizer` are ignored and `biases` are not created nor added.
      Default is None, meaning no normalizer function is used.
    normalizer_params: Normalization function parameters.
    weights_initializer: An initializer for the weights.
    weights_regularizer: Optional regularizer for the weights.
    biases_initializer: An initializer for the biases. If None, the biases are
      skipped.
    biases_regularizer: Optional regularizer for the biases.
    use_weight_standardization: Boolean, whether the layer uses weight
      standardization.
    reuse: Whether or not the layer and its variables should be reused. To be
      able to reuse the layer, scope must be given.
    variables_collections: Optional list of collections for all the variables
      or a dictionary containing a different list of collections per variable.
    outputs_collections: Collection to add the outputs.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    scope: Optional scope for `variable_scope`.

  Returns:
    A tensor representing the output of the operation.

  Raises:
    ValueError: If `data_format` is invalid.
    ValueError: If both `rate` and `stride` are not uniformly 1.
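
  Example:
    A minimal sketch (the placeholder shape and scope name are illustrative,
    not prescribed by this module) pairing weight standardization with Group
    Normalization, the combination studied in [1] of the module docstring:

      images = tf.placeholder(tf.float32, [None, 224, 224, 3])
      net = conv2d(
          images,
          num_outputs=64,
          kernel_size=3,
          normalizer_fn=contrib_layers.group_norm,
          use_weight_standardization=True,
          scope='conv1')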
""" if data_format not in [None, 'NWC', 'NCW', 'NHWC', 'NCHW', 'NDHWC', 'NCDHW']: raise ValueError('Invalid data_format: %r' % (data_format,)) # pylint: disable=protected-access layer_variable_getter = layers._build_variable_getter({ 'bias': 'biases', 'kernel': 'weights' }) # pylint: enable=protected-access with tf.variable_scope( scope, 'Conv', [inputs], reuse=reuse, custom_getter=layer_variable_getter) as sc: inputs = tf.convert_to_tensor(inputs) input_rank = inputs.get_shape().ndims if input_rank != 4: raise ValueError('Convolution expects input with rank %d, got %d' % (4, input_rank)) data_format = ('channels_first' if data_format and data_format.startswith('NC') else 'channels_last') layer = Conv2D( filters=num_outputs, kernel_size=kernel_size, strides=stride, padding=padding, data_format=data_format, dilation_rate=rate, activation=None, use_bias=not normalizer_fn and biases_initializer, kernel_initializer=weights_initializer, bias_initializer=biases_initializer, kernel_regularizer=weights_regularizer, bias_regularizer=biases_regularizer, use_weight_standardization=use_weight_standardization, activity_regularizer=None, trainable=trainable, name=sc.name, dtype=inputs.dtype.base_dtype, _scope=sc, _reuse=reuse) outputs = layer.apply(inputs) # Add variables to collections. # pylint: disable=protected-access layers._add_variable_to_collections(layer.kernel, variables_collections, 'weights') if layer.use_bias: layers._add_variable_to_collections(layer.bias, variables_collections, 'biases') # pylint: enable=protected-access if normalizer_fn is not None: normalizer_params = normalizer_params or {} outputs = normalizer_fn(outputs, **normalizer_params) if activation_fn is not None: outputs = activation_fn(outputs) return utils.collect_named_outputs(outputs_collections, sc.name, outputs) def conv2d_same(inputs, num_outputs, kernel_size, stride, rate=1, scope=None): """Strided 2-D convolution with 'SAME' padding. When stride > 1, then we do explicit zero-padding, followed by conv2d with 'VALID' padding. Note that net = conv2d_same(inputs, num_outputs, 3, stride=stride) is equivalent to net = conv2d(inputs, num_outputs, 3, stride=1, padding='SAME') net = subsample(net, factor=stride) whereas net = conv2d(inputs, num_outputs, 3, stride=stride, padding='SAME') is different when the input's height or width is even, which is why we add the current function. For more details, see ResnetUtilsTest.testConv2DSameEven(). Args: inputs: A 4-D tensor of size [batch, height_in, width_in, channels]. num_outputs: An integer, the number of output filters. kernel_size: An int with the kernel_size of the filters. stride: An integer, the output stride. rate: An integer, rate for atrous convolution. scope: Scope. Returns: output: A 4-D tensor of size [batch, height_out, width_out, channels] with the convolution output. """ if stride == 1: return conv2d( inputs, num_outputs, kernel_size, stride=1, rate=rate, padding='SAME', scope=scope) else: kernel_size_effective = kernel_size + (kernel_size - 1) * (rate - 1) pad_total = kernel_size_effective - 1 pad_beg = pad_total // 2 pad_end = pad_total - pad_beg inputs = tf.pad(inputs, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]]) return conv2d( inputs, num_outputs, kernel_size, stride=stride, rate=rate, padding='VALID', scope=scope)