Skip to content

Commit 28863de

Browse files
authored
Add distribution strategy to keras benchmark (tensorflow#5188)
* Add distribution strategy to keras benchmark * Fix comments * Fix lints
1 parent 6a0dda1 commit 28863de

File tree

2 files changed

+60
-19
lines changed

2 files changed

+60
-19
lines changed

official/keras_application_models/README.md

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,19 @@ Synthetic dataset is used for the benchmark.
1919
Two custom callbacks are provided for model benchmarking: ExamplesPerSecondCallback and LoggingMetricCallback. For each callback, `epoch_based` and `batch_based` options are available to set the benchmark level. Check [model_callbacks.py](model_callbacks.py) for more details.
2020

2121
## Running Code
22-
To benchmark a model, use `--model` to specify the model name, and issue the following command:
22+
To benchmark a model, use `--model` to specify the model name. To perform the benchmark with eager execution, issue the following command:
2323
```
24-
python benchmark_main.py --model=resnet
24+
python benchmark_main.py --model resnet50 --eager
2525
```
26+
Note that if eager execution is enabled, only one GPU is utilized even if multiple GPUs are provided and `multi_gpu_model` is used.
27+
28+
29+
To use distribution strategy in the benchmark, run the following:
30+
```
31+
python benchmark_main.py --model resnet50 --dist_strat
32+
```
33+
Only one of the `--eager` and `--dist_strat` arguments can be set at a time, as DistributionStrategy is not currently supported with eager execution.
34+
2635
Arguments:
2736
* `--model`: The model to benchmark. The name must be one of the keys of `MODELS` in [benchmark_main.py](benchmark_main.py).
2837
* `--callbacks`: To specify a list of callbacks.

official/keras_application_models/benchmark_main.py

Lines changed: 49 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from official.keras_application_models import model_callbacks
2929
from official.utils.flags import core as flags_core
3030
from official.utils.logs import logger
31+
from official.utils.misc import distribution_utils
3132

3233
# Define a dictionary that maps model names to their model classes inside Keras
3334
MODELS = {
@@ -41,9 +42,8 @@
4142
"densenet121": tf.keras.applications.DenseNet121,
4243
"densenet169": tf.keras.applications.DenseNet169,
4344
"densenet201": tf.keras.applications.DenseNet201,
44-
# TODO(b/80431378)
45-
# "nasnetlarge": tf.keras.applications.NASNetLarge,
46-
# "nasnetmobile": tf.keras.applications.NASNetMobile,
45+
"nasnetlarge": tf.keras.applications.NASNetLarge,
46+
"nasnetmobile": tf.keras.applications.NASNetMobile,
4747
}
4848

4949

@@ -76,28 +76,39 @@ def run_keras_model_benchmark(_):
7676
else:
7777
raise ValueError("Only synthetic dataset is supported!")
7878

79-
# If run with multiple GPUs
80-
# If eager execution is enabled, only one GPU is utilized even if multiple
81-
# GPUs are provided.
8279
num_gpus = flags_core.get_num_gpus(FLAGS)
83-
if num_gpus > 1:
80+
81+
distribution = None
82+
# Use distribution strategy
83+
if FLAGS.dist_strat:
84+
distribution = distribution_utils.get_distribution_strategy(
85+
num_gpus=num_gpus)
86+
elif num_gpus > 1:
87+
# Run with multi_gpu_model
88+
# If eager execution is enabled, only one GPU is utilized even if multiple
89+
# GPUs are provided.
8490
if FLAGS.eager:
8591
tf.logging.warning(
8692
"{} GPUs are provided, but only one GPU is utilized as "
8793
"eager execution is enabled.".format(num_gpus))
8894
model = tf.keras.utils.multi_gpu_model(model, gpus=num_gpus)
8995

96+
# Adam optimizer and some other optimizers don't work well with
97+
# distribution strategy (b/113076709)
98+
# Use GradientDescentOptimizer here
99+
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
90100
model.compile(loss="categorical_crossentropy",
91-
optimizer=tf.train.AdamOptimizer(),
92-
metrics=["accuracy"])
101+
optimizer=optimizer,
102+
metrics=["accuracy"],
103+
distribute=distribution)
93104

94105
# Create benchmark logger for benchmark logging
95106
run_params = {
96107
"batch_size": FLAGS.batch_size,
97108
"synthetic_data": FLAGS.use_synthetic_data,
98109
"train_epochs": FLAGS.train_epochs,
99-
"num_train_images": FLAGS.num_images,
100-
"num_eval_images": FLAGS.num_images,
110+
"num_train_images": FLAGS.num_train_images,
111+
"num_eval_images": FLAGS.num_eval_images,
101112
}
102113

103114
benchmark_logger = logger.get_benchmark_logger()
@@ -118,8 +129,8 @@ def run_keras_model_benchmark(_):
118129
epochs=FLAGS.train_epochs,
119130
callbacks=callbacks,
120131
validation_data=val_dataset,
121-
steps_per_epoch=int(np.ceil(FLAGS.num_images / FLAGS.batch_size)),
122-
validation_steps=int(np.ceil(FLAGS.num_images / FLAGS.batch_size))
132+
steps_per_epoch=int(np.ceil(FLAGS.num_train_images / FLAGS.batch_size)),
133+
validation_steps=int(np.ceil(FLAGS.num_eval_images / FLAGS.batch_size))
123134
)
124135

125136
tf.logging.info("Logging the evaluation results...")
@@ -128,7 +139,7 @@ def run_keras_model_benchmark(_):
128139
"accuracy": history.history["val_acc"][epoch],
129140
"loss": history.history["val_loss"][epoch],
130141
tf.GraphKeys.GLOBAL_STEP: (epoch + 1) * np.ceil(
131-
FLAGS.num_images/FLAGS.batch_size)
142+
FLAGS.num_eval_images/FLAGS.batch_size)
132143
}
133144
benchmark_logger.log_evaluation_result(eval_results)
134145

@@ -157,17 +168,29 @@ def define_keras_benchmark_flags():
157168
"Model to be benchmarked."))
158169

159170
flags.DEFINE_integer(
160-
name="num_images", default=1000,
171+
name="num_train_images", default=1000,
172+
help=flags_core.help_wrap(
173+
"The number of synthetic images for training. The default value is "
174+
"1000."))
175+
176+
flags.DEFINE_integer(
177+
name="num_eval_images", default=50,
161178
help=flags_core.help_wrap(
162-
"The number of synthetic images for training and evaluation. The "
163-
"default value is 1000."))
179+
"The number of synthetic images for evaluation. The default value is "
180+
"50."))
164181

165182
flags.DEFINE_boolean(
166183
name="eager", default=False, help=flags_core.help_wrap(
167184
"To enable eager execution. Note that if eager execution is enabled, "
168185
"only one GPU is utilized even if multiple GPUs are provided and "
169186
"multi_gpu_model is used."))
170187

188+
flags.DEFINE_boolean(
189+
name="dist_strat", default=False, help=flags_core.help_wrap(
190+
"To enable distribution strategy for model training and evaluation. "
191+
"Number of GPUs used for distribution strategy can be set by the "
192+
"argument --num_gpus."))
193+
171194
flags.DEFINE_list(
172195
name="callbacks",
173196
default=["ExamplesPerSecondCallback", "LoggingMetricCallback"],
@@ -176,6 +199,15 @@ def define_keras_benchmark_flags():
176199
"callbacks. For example: `--callbacks ExamplesPerSecondCallback,"
177200
"LoggingMetricCallback`"))
178201

202+
@flags.multi_flags_validator(
203+
["eager", "dist_strat"],
204+
message="Both --eager and --dist_strat were set. Only one can be "
205+
"defined, as DistributionStrategy is not supported in Eager "
206+
"execution currently.")
207+
# pylint: disable=unused-variable
208+
def _check_eager_dist_strat(flag_dict):
209+
return not(flag_dict["eager"] and flag_dict["dist_strat"])
210+
179211

180212
def main(_):
181213
with logger.benchmark_context(FLAGS):

0 commit comments

Comments
 (0)