
Commit 98cde20

Merge pull request haoyuhu#4 from HaoyuHu/fp16
PR: FP16 Support
2 parents 24434dc + 323e167 commit 98cde20

6 files changed: +1104 additions, -1003 deletions


README.md

Lines changed: 12 additions & 1 deletion
@@ -1,6 +1,6 @@
 # bert-multi-gpu
 
-Feel free to fine tune large BERT models with large batch size easily. Multi-GPU are supported.
+Feel free to fine tune large BERT models with large batch size easily. Multi-GPU and FP16 are supported.
 
 ## Dependencies
 
@@ -11,6 +11,15 @@ Feel free to fine tune large BERT models with large batch size easily. Multi-GPU
 
 
 
+## Features
+
+- CPU/GPU/TPU Support
+- **Multi-GPU Support**: [`tf.distribute.MirroredStrategy`](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy) is used to achieve multi-GPU support for this project; it mirrors variables across multiple devices and machines. The maximum batch_size for each GPU is almost the same as in [bert](https://github.com/google-research/bert/blob/master/README.md#out-of-memory-issues), so the **global batch_size** depends on how many GPUs are used.
+- **FP16 Support**: [FP16](https://en.wikipedia.org/wiki/Half-precision_floating-point_format) allows you to use a larger batch_size. Training speed increases by 70~100% on Volta GPUs, but may be slower on Pascal GPUs ([REF1](https://github.com/tensorflow/tensorflow/issues/15585#issuecomment-361769151), [REF2](https://github.com/HaoyuHu/bert-multi-gpu/issues/1#issuecomment-493363383)).
+- **SavedModel Export**
+
+
+
 ## Usage
 
 List some optional parameters below:
@@ -29,6 +38,7 @@ List some optional parameters below:
 - `num_train_epochs`: Train epoch number.
 - `use_gpu`: Use GPU or not.
 - `num_gpu_cores`: Total number of GPU cores to use, only used if `use_gpu` is True.
+- `use_fp16`: Use [`FP16`](https://en.wikipedia.org/wiki/Half-precision_floating-point_format) or not.
 - `output_dir`: **Checkpoints** and **SavedModel(.pb) files** will be saved in this directory.
 
 ```shell
@@ -49,6 +59,7 @@ python run_custom_classifier.py \
   --num_train_epochs=3.0 \
   --use_gpu=true \
   --num_gpu_cores=3 \
+  --use_fp16=true \
   --output_dir=/cfs/outputs/bert-large-uncased-qqp
 ```
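As background for the **Multi-GPU Support** bullet above, here is a minimal sketch of how `MirroredStrategy` is typically attached to a TF 1.x Estimator. The flag values, the contrib constructor arguments (which vary across 1.x releases), and the `RunConfig` wiring are illustrative assumptions, not this repository's exact code.

```python
import tensorflow as tf

# Hypothetical values standing in for the command-line flags shown above.
num_gpu_cores = 3
per_gpu_batch_size = 24

# MirroredStrategy keeps a synchronized copy of every variable on each GPU and
# runs one model replica per device, so the effective (global) batch size
# grows with the number of GPUs.
strategy = tf.contrib.distribute.MirroredStrategy(num_gpus=num_gpu_cores)
run_config = tf.estimator.RunConfig(train_distribute=strategy)

global_batch_size = per_gpu_batch_size * num_gpu_cores  # 72 in this example

# estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)
```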

custom_optimization.py

Lines changed: 25 additions & 5 deletions
@@ -28,7 +28,7 @@
 from tensorflow.python.ops import resource_variable_ops
 
 
-def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps):
+def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, fp16=False):
   """Creates an optimizer training op."""
   global_step = tf.train.get_or_create_global_step()
 
@@ -70,19 +70,39 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps):
       epsilon=1e-6,
       exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
 
+  # REF: https://github.com/tensorflow/tensorflow/issues/25080
+  # if fp16:
+  #   loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(
+  #     init_loss_scale=2 ** 32,
+  #     incr_every_n_steps=1000,
+  #     decr_every_n_nan_or_inf=2,
+  #     decr_ratio=0.5)
+  #   optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager)
+
   tvars = tf.trainable_variables()
-  grads = tf.gradients(loss, tvars)
+  gvs = optimizer.compute_gradients(loss, tvars)
+  gvs = [(g, v) for g, v in gvs if g is not None]
+  grads, tvars = list(zip(*gvs))
+  if fp16:
+    all_finite = tf.reduce_all([tf.reduce_all(tf.is_finite(g)) for g in grads])
+  else:
+    all_finite = tf.constant(True, dtype=tf.bool)
 
   # This is how the model was pre-trained.
-  (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
+  (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0,
+                                      use_norm=tf.cond(
+                                          all_finite,
+                                          lambda: tf.global_norm(grads),
+                                          lambda: tf.constant(1.0)))
 
   train_op = optimizer.apply_gradients(
       zip(grads, tvars), global_step=global_step)
 
   # Normally the global step update is done inside of `apply_gradients`.
   # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
   # a different optimizer, you should probably take this line out.
-  new_global_step = global_step + 1
+  new_global_step = tf.cond(all_finite, lambda: global_step + 1, lambda: global_step)
+  new_global_step = tf.identity(new_global_step, name='update_step')
   train_op = tf.group(train_op, [global_step.assign(new_global_step)])
   return train_op
 
@@ -101,7 +121,7 @@ def __init__(self,
     """Constructs a AdamWeightDecayOptimizer."""
     super(AdamWeightDecayOptimizer, self).__init__(False, name)
 
-    self.learning_rate = learning_rate
+    self.learning_rate = tf.identity(learning_rate, name='learning_rate')
     self.weight_decay_rate = weight_decay_rate
     self.beta_1 = beta_1
     self.beta_2 = beta_2
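For orientation, below is a minimal sketch of how the new `fp16` argument might be threaded into `create_optimizer` from an Estimator `model_fn`. With FP16, an overflowing step yields inf/NaN gradients; the `all_finite` guard added above avoids computing a non-finite global norm in that case and keeps the global step from advancing on such a step. The `model_fn` shape, the stand-in network, and the `params` keys are assumptions for illustration, not the repository's actual run script.

```python
import tensorflow as tf

import custom_optimization


def model_fn(features, labels, mode, params):
    # A tiny stand-in network; the real BERT modeling code lives elsewhere.
    logits = tf.layers.dense(tf.cast(features["x"], tf.float32), units=2)
    total_loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))

    train_op = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        # fp16=True enables the finite-gradient check added in this commit:
        # non-finite gradients do not feed the clip norm or bump global_step.
        train_op = custom_optimization.create_optimizer(
            loss=total_loss,
            init_lr=params["learning_rate"],
            num_train_steps=params["num_train_steps"],
            num_warmup_steps=params["num_warmup_steps"],
            fp16=params["use_fp16"])

    return tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, train_op=train_op)
```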

gpu_environment.py

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+# coding=utf-8
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tensorflow as tf
+
+
+def float32_variable_storage_getter(getter, name, shape=None, dtype=None,
+                                    initializer=None, regularizer=None,
+                                    trainable=True,
+                                    *args, **kwargs):
+    """Custom variable getter that forces trainable variables to be stored in
+    float32 precision and then casts them to the training precision.
+    """
+    storage_dtype = tf.float32 if trainable else dtype
+    variable = getter(name, shape, dtype=storage_dtype,
+                      initializer=initializer, regularizer=regularizer,
+                      trainable=trainable,
+                      *args, **kwargs)
+    if trainable and dtype != tf.float32:
+        variable = tf.cast(variable, dtype)
+    return variable
+
+
+def get_custom_getter(compute_type):
+    return float32_variable_storage_getter if compute_type == tf.float16 else None
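The getter above only takes effect once it is attached to a variable scope. Below is a minimal sketch of the usual wiring, with a hypothetical `build_logits` helper standing in for the repository's modeling code (which this commit does not show): variables created inside the scope are stored as float32 master copies, while activations and the variables as read are cast to the requested compute type.

```python
import tensorflow as tf

from gpu_environment import get_custom_getter


def build_logits(embeddings, use_fp16):
    # Compute in float16 when requested, but keep the master copies of all
    # trainable variables in float32; the custom getter casts them to the
    # compute dtype each time they are created/read inside this scope.
    compute_type = tf.float16 if use_fp16 else tf.float32
    with tf.variable_scope("classifier",
                           custom_getter=get_custom_getter(compute_type)):
        hidden = tf.layers.dense(tf.cast(embeddings, compute_type), units=768,
                                 activation=tf.nn.relu)
        logits = tf.layers.dense(hidden, units=2)
    # Cast back to float32 before the loss so softmax/log stay numerically stable.
    return tf.cast(logits, tf.float32)
```

Keeping float32 master weights while computing in float16 is the standard mixed-precision pattern: gradient updates accumulate into the full-precision copies, which avoids losing small updates to half-precision rounding.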
