Commit 018af47 (keras-team#8785)

TF convnets perf improvement: fused BN + native bias_add for NCHW.

* TF convnets perf improvement: fused BN + native bias_add for NCHW.
* Add docstrings
* Skip some theano tests
* Fix typo

1 parent 3897383 · commit 018af47
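
For context, the speedup comes from replacing composite TensorFlow graphs with single fused kernels. Below is a minimal TF 1.x sketch of the two batch-norm paths this commit switches between; it is not part of the commit, and the shapes and names are illustrative only.

import tensorflow as tf

x = tf.placeholder(tf.float32, shape=(None, 3, 32, 32))  # NCHW layout
gamma = tf.ones([3])   # per-channel scale
beta = tf.zeros([3])   # per-channel offset

# Unfused path: batch statistics and normalization run as separate
# kernels, with explicit broadcasting over (N, H, W).
mean, var = tf.nn.moments(x, axes=[0, 2, 3], keep_dims=True)
unfused = tf.nn.batch_normalization(
    x, mean, var,
    offset=tf.reshape(beta, (1, 3, 1, 1)),
    scale=tf.reshape(gamma, (1, 3, 1, 1)),
    variance_epsilon=1e-3)

# Fused path: a single kernel computes the statistics and normalizes.
# The NCHW variant is cuDNN-backed and generally needs GPU support,
# which is why the diff below guards it with _has_nchw_support().
fused, batch_mean, batch_var = tf.nn.fused_batch_norm(
    x, gamma, beta, epsilon=1e-3, data_format='NCHW')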

File tree: 2 files changed, +181 −32 lines

keras/backend/tensorflow_backend.py (128 additions, 29 deletions)
@@ -1684,9 +1684,9 @@ def cos(x):
     return tf.cos(x)
 
 
-def normalize_batch_in_training(x, gamma, beta,
-                                reduction_axes, epsilon=1e-3):
-    """Computes mean and std for batch then apply batch_normalization on batch.
+def _regular_normalize_batch_in_training(x, gamma, beta,
+                                         reduction_axes, epsilon=1e-3):
+    """Non-fused version of `normalize_batch_in_training`.
 
     # Arguments
         x: Input tensor or variable.
@@ -1701,36 +1701,131 @@ def normalize_batch_in_training(x, gamma, beta,
     """
     mean, var = tf.nn.moments(x, reduction_axes,
                               shift=None, name=None, keep_dims=False)
-    if sorted(reduction_axes) == list(range(ndim(x)))[:-1]:
-        normed = tf.nn.batch_normalization(x, mean, var,
-                                           beta, gamma,
-                                           epsilon)
-    else:
-        # need broadcasting
-        target_shape = []
-        for axis in range(ndim(x)):
-            if axis in reduction_axes:
-                target_shape.append(1)
-            else:
-                target_shape.append(tf.shape(x)[axis])
-        target_shape = tf.stack(target_shape)
+    normed = tf.nn.batch_normalization(x, mean, var,
+                                       beta, gamma,
+                                       epsilon)
+    return normed, mean, var
 
-        broadcast_mean = tf.reshape(mean, target_shape)
-        broadcast_var = tf.reshape(var, target_shape)
-        if gamma is None:
-            broadcast_gamma = None
-        else:
-            broadcast_gamma = tf.reshape(gamma, target_shape)
-        if beta is None:
-            broadcast_beta = None
+
+def _broadcast_normalize_batch_in_training(x, gamma, beta,
+                                           reduction_axes, epsilon=1e-3):
+    """Non-fused, broadcast version of `normalize_batch_in_training`.
+
+    # Arguments
+        x: Input tensor or variable.
+        gamma: Tensor by which to scale the input.
+        beta: Tensor with which to center the input.
+        reduction_axes: iterable of integers,
+            axes over which to normalize.
+        epsilon: Fuzz factor.
+
+    # Returns
+        A tuple length of 3, `(normalized_tensor, mean, variance)`.
+    """
+    mean, var = tf.nn.moments(x, reduction_axes,
+                              shift=None, name=None, keep_dims=False)
+    target_shape = []
+    for axis in range(ndim(x)):
+        if axis in reduction_axes:
+            target_shape.append(1)
         else:
-            broadcast_beta = tf.reshape(beta, target_shape)
-        normed = tf.nn.batch_normalization(x, broadcast_mean, broadcast_var,
-                                           broadcast_beta, broadcast_gamma,
-                                           epsilon)
+            target_shape.append(tf.shape(x)[axis])
+    target_shape = tf.stack(target_shape)
+
+    broadcast_mean = tf.reshape(mean, target_shape)
+    broadcast_var = tf.reshape(var, target_shape)
+    if gamma is None:
+        broadcast_gamma = None
+    else:
+        broadcast_gamma = tf.reshape(gamma, target_shape)
+    if beta is None:
+        broadcast_beta = None
+    else:
+        broadcast_beta = tf.reshape(beta, target_shape)
+
+    normed = tf.nn.batch_normalization(
+        x,
+        broadcast_mean,
+        broadcast_var,
+        broadcast_beta,
+        broadcast_gamma,
+        epsilon)
     return normed, mean, var
 
 
+def _fused_normalize_batch_in_training(x, gamma, beta, reduction_axes,
+                                       epsilon=1e-3):
+    """Fused version of `normalize_batch_in_training`.
+
+    # Arguments
+        x: Input tensor or variable.
+        gamma: Tensor by which to scale the input.
+        beta: Tensor with which to center the input.
+        reduction_axes: iterable of integers,
+            axes over which to normalize.
+        epsilon: Fuzz factor.
+
+    # Returns
+        A tuple length of 3, `(normalized_tensor, mean, variance)`.
+    """
+    if list(reduction_axes) == [0, 1, 2]:
+        normalization_axis = 3
+        tf_data_format = 'NHWC'
+    else:
+        normalization_axis = 1
+        tf_data_format = 'NCHW'
+
+    if gamma is None:
+        gamma = tf.constant(1.0,
+                            dtype=x.dtype,
+                            shape=[x.get_shape()[normalization_axis]])
+    if beta is None:
+        beta = tf.constant(0.0,
+                           dtype=x.dtype,
+                           shape=[x.get_shape()[normalization_axis]])
+
+    return tf.nn.fused_batch_norm(
+        x,
+        gamma,
+        beta,
+        epsilon=epsilon,
+        data_format=tf_data_format)
+
+
+def normalize_batch_in_training(x, gamma, beta,
+                                reduction_axes, epsilon=1e-3):
+    """Computes mean and std for batch then apply batch_normalization on batch.
+
+    # Arguments
+        x: Input tensor or variable.
+        gamma: Tensor by which to scale the input.
+        beta: Tensor with which to center the input.
+        reduction_axes: iterable of integers,
+            axes over which to normalize.
+        epsilon: Fuzz factor.
+
+    # Returns
+        A tuple length of 3, `(normalized_tensor, mean, variance)`.
+    """
+    if ndim(x) == 4 and list(reduction_axes) in [[0, 1, 2], [0, 2, 3]]:
+        if not _has_nchw_support() and list(reduction_axes) == [0, 2, 3]:
+            return _broadcast_normalize_batch_in_training(x, gamma, beta,
+                                                          reduction_axes,
+                                                          epsilon=epsilon)
+        return _fused_normalize_batch_in_training(
+            x, gamma, beta, reduction_axes,
+            epsilon=epsilon)
+    else:
+        if sorted(reduction_axes) == list(range(ndim(x)))[:-1]:
+            return _regular_normalize_batch_in_training(x, gamma, beta,
+                                                        reduction_axes,
+                                                        epsilon=epsilon)
+        else:
+            return _broadcast_normalize_batch_in_training(x, gamma, beta,
+                                                          reduction_axes,
+                                                          epsilon=epsilon)
+
+
 def batch_normalization(x, mean, var, beta, gamma, epsilon=1e-3):
     """Applies batch normalization on x given mean, var, beta and gamma.
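
As a usage sketch of the dispatch just added (assuming the TensorFlow backend; shapes here are illustrative): reducing a 4D tensor over [0, 2, 3] corresponds to BatchNormalization(axis=1) on channels_first data, which now routes to the fused NCHW kernel when _has_nchw_support() is true and to the broadcast fallback otherwise.

from keras import backend as K

x = K.placeholder(shape=(None, 3, 32, 32))  # channels_first input
gamma = K.ones((3,))   # scale, one per channel
beta = K.zeros((3,))   # offset, one per channel

# ndim(x) == 4 and reduction_axes == [0, 2, 3], so this call hits
# _fused_normalize_batch_in_training when NCHW is supported,
# _broadcast_normalize_batch_in_training otherwise.
normed, mean, var = K.normalize_batch_in_training(
    x, gamma, beta, reduction_axes=[0, 2, 3])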
@@ -3573,7 +3668,11 @@ def bias_add(x, bias, data_format=None):
     elif ndim(x) == 4:
         if data_format == 'channels_first':
             if len(bias_shape) == 1:
-                x += reshape(bias, (1, bias_shape[0], 1, 1))
+                if _has_nchw_support():
+                    x = tf.nn.bias_add(x, bias,
+                                       data_format='NCHW')
+                else:
+                    x += reshape(bias, (1, bias_shape[0], 1, 1))
             else:
                 x += reshape(bias, (1, bias_shape[2]) + bias_shape[:2])
         elif data_format == 'channels_last':
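
The bias_add change follows the same pattern; below is a sketch (not from the commit) of the two equivalent formulations in TF 1.x, with illustrative shapes.

import tensorflow as tf

x = tf.placeholder(tf.float32, shape=(None, 3, 32, 32))  # NCHW
bias = tf.zeros([3])  # one bias per channel

# Native op: the bias is broadcast along the channel axis inside a
# single kernel.
y_native = tf.nn.bias_add(x, bias, data_format='NCHW')

# Fallback kept for setups without NCHW support: explicit reshape to
# (1, C, 1, 1) followed by a broadcast add.
y_fallback = x + tf.reshape(bias, (1, 3, 1, 1))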

tests/keras/layers/normalization_test.py (53 additions, 3 deletions)
@@ -3,6 +3,7 @@
 from numpy.testing import assert_allclose
 
 from keras.layers import Input
+from keras import regularizers
 from keras.utils.test_utils import layer_test, keras_test
 from keras.layers import normalization
 from keras.models import Sequential, Model
@@ -16,23 +17,35 @@
 
 @keras_test
 def test_basic_batchnorm():
-    from keras import regularizers
     layer_test(normalization.BatchNormalization,
                kwargs={'momentum': 0.9,
                        'epsilon': 0.1,
                        'gamma_regularizer': regularizers.l2(0.01),
                        'beta_regularizer': regularizers.l2(0.01)},
                input_shape=(3, 4, 2))
+    layer_test(normalization.BatchNormalization,
+               kwargs={'momentum': 0.9,
+                       'epsilon': 0.1,
+                       'axis': 1},
+               input_shape=(3, 4, 2))
     layer_test(normalization.BatchNormalization,
                kwargs={'gamma_initializer': 'ones',
                        'beta_initializer': 'ones',
                        'moving_mean_initializer': 'zeros',
                        'moving_variance_initializer': 'ones'},
-               input_shape=(3, 4, 2))
+               input_shape=(3, 4, 2, 4))
+    if K.backend() != 'theano':
+        layer_test(normalization.BatchNormalization,
+                   kwargs={'momentum': 0.9,
+                           'epsilon': 0.1,
+                           'axis': 1,
+                           'scale': False,
+                           'center': False},
+                   input_shape=(3, 4, 2, 4))
 
 
 @keras_test
-def test_batchnorm_correctness():
+def test_batchnorm_correctness_1d():
     model = Sequential()
     norm = normalization.BatchNormalization(input_shape=(10,), momentum=0.8)
     model.add(norm)
@@ -49,6 +62,24 @@ def test_batchnorm_correctness():
     assert_allclose(out.std(), 1.0, atol=1e-1)
 
 
+@keras_test
+def test_batchnorm_correctness_2d():
+    model = Sequential()
+    norm = normalization.BatchNormalization(axis=1, input_shape=(10, 6), momentum=0.8)
+    model.add(norm)
+    model.compile(loss='mse', optimizer='sgd')
+
+    # centered on 5.0, variance 10.0
+    x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 10, 6))
+    model.fit(x, x, epochs=4, verbose=0)
+    out = model.predict(x)
+    out -= np.reshape(K.eval(norm.beta), (1, 10, 1))
+    out /= np.reshape(K.eval(norm.gamma), (1, 10, 1))
+
+    assert_allclose(out.mean(axis=(0, 2)), 0.0, atol=1e-1)
+    assert_allclose(out.std(axis=(0, 2)), 1.0, atol=1e-1)
+
+
 @keras_test
 def test_batchnorm_training_argument():
     bn1 = normalization.BatchNormalization(input_shape=(10,))
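
The correctness tests all check the same invariant; here is a condensed NumPy sketch of what test_batchnorm_correctness_2d asserts (standalone, not part of the test file).

import numpy as np

# Input centered on 5.0 with std 10.0; the feature axis is 1.
x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 10, 6))

# A converged BatchNormalization(axis=1) should output data whose
# per-feature statistics, reduced over axes (0, 2), are standard normal.
mean = x.mean(axis=(0, 2), keepdims=True)
std = x.std(axis=(0, 2), keepdims=True)
out = (x - mean) / std

assert np.allclose(out.mean(axis=(0, 2)), 0.0, atol=1e-1)
assert np.allclose(out.std(axis=(0, 2)), 1.0, atol=1e-1)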
@@ -106,6 +137,25 @@ def test_batchnorm_convnet():
     assert_allclose(np.std(out, axis=(0, 2, 3)), 1.0, atol=1e-1)
 
 
+@keras_test
+@pytest.mark.skipif((K.backend() == 'theano'),
+                    reason='Bug with theano backend')
+def test_batchnorm_convnet_no_center_no_scale():
+    model = Sequential()
+    norm = normalization.BatchNormalization(axis=-1, center=False, scale=False,
+                                            input_shape=(3, 4, 4), momentum=0.8)
+    model.add(norm)
+    model.compile(loss='mse', optimizer='sgd')
+
+    # centered on 5.0, variance 10.0
+    x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 3, 4, 4))
+    model.fit(x, x, epochs=4, verbose=0)
+    out = model.predict(x)
+
+    assert_allclose(np.mean(out, axis=(0, 2, 3)), 0.0, atol=1e-1)
+    assert_allclose(np.std(out, axis=(0, 2, 3)), 1.0, atol=1e-1)
+
+
 @keras_test
 def test_shared_batchnorm():
     '''Test that a BN layer can be shared
