Add distribution strategy to keras benchmark (tensorflow#5188)

yhliang2018 · web-flow · commit 28863de1c832 · 2018-08-29T09:36:58.000-07:00
* Add distribution strategy to keras benchmark

* Fix comments

* Fix lints
diff --git a/official/keras_application_models/README.md b/official/keras_application_models/README.md
@@ -19,10 +19,19 @@ Synthetic dataset is used for the benchmark.
 Two custom callbacks are provided for model benchmarking: ExamplesPerSecondCallback and LoggingMetricCallback. For each callback, `epoch_based` and `batch_based` options are available to set the benchmark level. Check [model_callbacks.py](model_callbacks.py) for more details.
 
 ## Running Code
-To benchmark a model, use `--model` to specify the model name, and issue the following command:
+To benchmark a model, use `--model` to specify the model name. To perform the benchmark with eager execution, issue the following command:
 ```
-python benchmark_main.py --model=resnet
+python benchmark_main.py --model resnet50 --eager
 ```
+Note that, if eager execution is enabled, only one GPU is utilized even if multiple GPUs are provided and multi_gpu_model is used.
+
+
+To use distribution strategy in the benchmark, run the following:
+```
+python benchmark_main.py --model resnet50 --dist_strat
+```
+Only one of the `--eager` and `--dist_strat` arguments can be set at a time, as DistributionStrategy does not currently support eager execution.
+
 Arguments:
   * `--model`: Which model to be benchmarked. The model name is defined as the keys of `MODELS` in [benchmark_main.py](benchmark_main.py).
   * `--callbacks`: To specify a list of callbacks.
diff --git a/official/keras_application_models/benchmark_main.py b/official/keras_application_models/benchmark_main.py
@@ -28,6 +28,7 @@
 from official.keras_application_models import model_callbacks
 from official.utils.flags import core as flags_core
 from official.utils.logs import logger
+from official.utils.misc import distribution_utils
 
 # Define a dictionary that maps model names to their model classes inside Keras
 MODELS = {
@@ -41,9 +42,8 @@
     "densenet121": tf.keras.applications.DenseNet121,
     "densenet169": tf.keras.applications.DenseNet169,
     "densenet201": tf.keras.applications.DenseNet201,
-    # TODO(b/80431378)
-    # "nasnetlarge": tf.keras.applications.NASNetLarge,
-    # "nasnetmobile": tf.keras.applications.NASNetMobile,
+    "nasnetlarge": tf.keras.applications.NASNetLarge,
+    "nasnetmobile": tf.keras.applications.NASNetMobile,
 }
 
 
@@ -76,28 +76,39 @@ def run_keras_model_benchmark(_):
   else:
     raise ValueError("Only synthetic dataset is supported!")
 
-  # If run with multiple GPUs
-  # If eager execution is enabled, only one GPU is utilized even if multiple
-  # GPUs are provided.
   num_gpus = flags_core.get_num_gpus(FLAGS)
-  if num_gpus > 1:
+
+  distribution = None
+  # Use distribution strategy
+  if FLAGS.dist_strat:
+    distribution = distribution_utils.get_distribution_strategy(
+        num_gpus=num_gpus)
+  elif num_gpus > 1:
+    # Run with multi_gpu_model
+    # If eager execution is enabled, only one GPU is utilized even if multiple
+    # GPUs are provided.
     if FLAGS.eager:
       tf.logging.warning(
           "{} GPUs are provided, but only one GPU is utilized as "
           "eager execution is enabled.".format(num_gpus))
     model = tf.keras.utils.multi_gpu_model(model, gpus=num_gpus)
 
+  # The Adam optimizer and some other optimizers don't work well with
+  # distribution strategy (b/113076709), so use GradientDescentOptimizer
+  # here.
+  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
   model.compile(loss="categorical_crossentropy",
-                optimizer=tf.train.AdamOptimizer(),
-                metrics=["accuracy"])
+                optimizer=optimizer,
+                metrics=["accuracy"],
+                distribute=distribution)
 
   # Create benchmark logger for benchmark logging
   run_params = {
       "batch_size": FLAGS.batch_size,
       "synthetic_data": FLAGS.use_synthetic_data,
       "train_epochs": FLAGS.train_epochs,
-      "num_train_images": FLAGS.num_images,
-      "num_eval_images": FLAGS.num_images,
+      "num_train_images": FLAGS.num_train_images,
+      "num_eval_images": FLAGS.num_eval_images,
   }
 
   benchmark_logger = logger.get_benchmark_logger()
@@ -118,8 +129,8 @@ def run_keras_model_benchmark(_):
       epochs=FLAGS.train_epochs,
       callbacks=callbacks,
       validation_data=val_dataset,
-      steps_per_epoch=int(np.ceil(FLAGS.num_images / FLAGS.batch_size)),
-      validation_steps=int(np.ceil(FLAGS.num_images / FLAGS.batch_size))
+      steps_per_epoch=int(np.ceil(FLAGS.num_train_images / FLAGS.batch_size)),
+      validation_steps=int(np.ceil(FLAGS.num_eval_images / FLAGS.batch_size))
   )
 
   tf.logging.info("Logging the evaluation results...")
@@ -128,7 +139,7 @@ def run_keras_model_benchmark(_):
         "accuracy": history.history["val_acc"][epoch],
         "loss": history.history["val_loss"][epoch],
         tf.GraphKeys.GLOBAL_STEP: (epoch + 1) * np.ceil(
-            FLAGS.num_images/FLAGS.batch_size)
+            FLAGS.num_eval_images/FLAGS.batch_size)
     }
     benchmark_logger.log_evaluation_result(eval_results)
 
@@ -157,17 +168,29 @@ def define_keras_benchmark_flags():
           "Model to be benchmarked."))
 
   flags.DEFINE_integer(
-      name="num_images", default=1000,
+      name="num_train_images", default=1000,
+      help=flags_core.help_wrap(
+          "The number of synthetic images for training. The default value is "
+          "1000."))
+
+  flags.DEFINE_integer(
+      name="num_eval_images", default=50,
       help=flags_core.help_wrap(
-          "The number of synthetic images for training and evaluation. The "
-          "default value is 1000."))
+          "The number of synthetic images for evaluation. The default value is "
+          "50."))
 
   flags.DEFINE_boolean(
       name="eager", default=False, help=flags_core.help_wrap(
           "To enable eager execution. Note that if eager execution is enabled, "
           "only one GPU is utilized even if multiple GPUs are provided and "
           "multi_gpu_model is used."))
 
+  flags.DEFINE_boolean(
+      name="dist_strat", default=False, help=flags_core.help_wrap(
+          "To enable distribution strategy for model training and evaluation. "
+          "Number of GPUs used for distribution strategy can be set by the "
+          "argument --num_gpus."))
+
   flags.DEFINE_list(
       name="callbacks",
       default=["ExamplesPerSecondCallback", "LoggingMetricCallback"],
@@ -176,6 +199,15 @@ def define_keras_benchmark_flags():
           "callbacks. For example: `--callbacks ExamplesPerSecondCallback,"
           "LoggingMetricCallback`"))
 
+  @flags.multi_flags_validator(
+      ["eager", "dist_strat"],
+      message="Both --eager and --dist_strat were set. Only one can be "
+              "defined, as DistributionStrategy is not supported in Eager "
+              "execution currently.")
+  # pylint: disable=unused-variable
+  def _check_eager_dist_strat(flag_dict):
+    return not(flag_dict["eager"] and flag_dict["dist_strat"])
+
 
 def main(_):
   with logger.benchmark_context(FLAGS):