From c395175433ec94694bcf3043082aea1d6497080a Mon Sep 17 00:00:00 2001 From: Susnato Dhar Date: Tue, 14 Mar 2023 19:42:28 +0530 Subject: [PATCH 01/17] script added --- .../automated_trainval_script.py | 184 ++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 examples/glue_benchmark/automated_trainval_script.py diff --git a/examples/glue_benchmark/automated_trainval_script.py b/examples/glue_benchmark/automated_trainval_script.py new file mode 100644 index 0000000000..283d902e2a --- /dev/null +++ b/examples/glue_benchmark/automated_trainval_script.py @@ -0,0 +1,184 @@ +import os +import keras_nlp +import tensorflow as tf +from absl import flags, app +from tensorflow import keras +import tensorflow_datasets as tfds + +seed = 42 +os.environ["PYTHONHASHSEED"] = str(seed) +tf.random.set_seed(seed) + + +flags.DEFINE_string("model", None, "The Model you want to train and evaluate.") +flags.DEFINE_string("preset", None, "The model preset(eg. For bert it is 'bert_base_en', 'bert_tiny_en_uncased')") +flags.DEFINE_string("task", "stsb", "The task you want the model to train on.") +flags.DEFINE_float("learning_rate", 0.005, "The learning_rate for the optimizer.") +flags.DEFINE_integer("epochs", 2, "No of Epochs.") +flags.DEFINE_integer("batch_size", 8, "Batch Size.") + + +FLAGS = flags.FLAGS + + +def load_data(task_name): + """ + Load GLUE dataset. + Load GLUE dataset, and convert the dictionary format to (features, label), + where features is a tuple of all input sentences. + """ + if task_name in ("cola", "sst2"): + feature_names = ("sentence",) + elif task_name in ("mrpc", "stsb", "rte", "wnli"): + feature_names = ("sentence1", "sentence2") + elif task_name in ("mnli", "mnli_matched", "mnli_mismatched", "ax"): + feature_names = ("premise", "hypothesis") + elif task_name in "qnli": + feature_names = ("question", "sentence") + elif task_name in "qqp": + feature_names = ("question1", "question2") + else: + raise ValueError(f"Unknown task_name {task_name}.") + + test_suffix = "" + if task_name in ("mnli", "mnli_matched"): + # For "mnli", just run default to "mnli_matched". + task_name = "mnli" + test_suffix = "_matched" + elif task_name in ("mnli_mismatched"): + task_name = "mnli" + test_suffix = "_mismatched" + + def split_features(x): + # GLUE comes with dictonary data, we convert it to a uniform format + # (features, label), where features is a tuple consisting of all + # features. + features = tuple([x[name] for name in feature_names]) + label = x["label"] + return (features, label) + + if task_name == "ax": + # AX is trained and evaluated on MNLI, and has its own test split. + train_ds, validation_ds = tfds.load( + "glue/mnli", + split=["train", "validation_matched"], + ) + test_ds = tfds.load( + "glue/ax", + split="test", + ) + else: + train_ds, test_ds, validation_ds = tfds.load( + f"glue/{task_name}", + split=["train", "test" + test_suffix, "validation" + test_suffix], + ) + + # Extract out the index order of test dataset. 
+ idx_order = test_ds.map(lambda data: data["idx"]) + + train_ds = train_ds.map(split_features, num_parallel_calls=tf.data.AUTOTUNE) + test_ds = test_ds.map(split_features, num_parallel_calls=tf.data.AUTOTUNE) + validation_ds = validation_ds.map( + split_features, + num_parallel_calls=tf.data.AUTOTUNE + ) + return train_ds, test_ds, validation_ds, idx_order + +def create_model(model, preset, task): + # output_dim + if task in ("cola", "sst2", "mrpc", "qqp", "rte", "qnli", "wnli"): + output_dim = 2 + elif task in ("mnli", "mnli_matched", "mnli_mismatched", "ax"): + output_dim = 3 + elif task in ("stsb"): + output_dim = 1 + else: + raise ValueError(f"Task not supported! Please choose a task from {('cola', 'sst2', 'mrpc', 'qqp', 'rte', 'qnli', 'wnli', 'mnli', 'mnli_matched', 'mnli_mismatched', 'ax', 'stsb')}") + + # select backbone + backbone_dict = {"bert":keras_nlp.models.BertBackbone, + "albert":keras_nlp.models.AlbertBackbone, + "deberta":keras_nlp.models.DebertaV3Backbone, + "distil-bert":keras_nlp.models.DistilBertBackbone, + "roberta":keras_nlp.models.RobertaBackbone, + "xlm-roberta":keras_nlp.models.XLMRobertaBackbone, + "f_net":keras_nlp.models.FNetBackbone, + } + if model not in list(backbone_dict.keys()): + raise ValueError(f"Model is either not an Encoder based model(eg. Bert, Albert) or " + f"not supported at this moment! Please select a model from here - {tuple(backbone_dict.keys())}") + + # Build the model + backbone = backbone_dict[model].from_preset(preset) + # If the model has pooled_output + if len(backbone.outputs)>1: + output = keras.layers.Dense(output_dim)(backbone.output["pooled_output"]) + elif len(backbone.outputs)==1: + output = keras.layers.Dense(output_dim)(backbone.output) + model = keras.models.Model(backbone.inputs, output) + + return model + +def preprocess_data(dataset, model, preset): + """Run `proprocess_fn` on input dataset then batch & prefetch.""" + + preprocessor_dict = { + "bert":keras_nlp.models.BertPreprocessor, + "albert":keras_nlp.models.AlbertPreprocessor, + "deberta":keras_nlp.models.DebertaV3Preprocessor, + "distil-bert":keras_nlp.models.DistilBertPreprocessor, + "roberta":keras_nlp.models.RobertaPreprocessor, + "xlm-roberta":keras_nlp.models.XLMRobertaPreprocessor, + "f_net":keras_nlp.models.FNetPreprocessor, + } + if model not in list(preprocessor_dict.keys()): + raise ValueError(f"Model does not have a preprocessor class. This class is required for preprocessing " + f"of the data before feeding it to the model! 
Please select a model from here - {tuple(preprocessor_dict.keys())}") + + preprocessor = preprocessor_dict[model].from_preset(preset) + def preprocess_fn(feature, label): + return preprocessor(feature), label + + return dataset.map(preprocess_fn).batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + +def main(_): + print(tf.__version__) + print("GPU available : ", tf.test.is_gpu_available()) + + print("="*120) + print(f"MODEL : {FLAGS.model} PRESET : {FLAGS.preset} DATASET : {FLAGS.task}") + print("="*120) + + # Load the model + model = create_model(model=FLAGS.model, + preset=FLAGS.preset, + task=FLAGS.task) + # Add loss and optimizer + loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True) + metrics = [keras.metrics.SparseCategoricalAccuracy()] + if FLAGS.task == "stsb": + loss = keras.losses.MeanSquaredError() + metrics = [keras.metrics.MeanSquaredError()] + + # Load datasets + train_ds, test_ds, validation_ds, idx_order = load_data(FLAGS.task) + train_ds = preprocess_data(dataset=train_ds, model=FLAGS.model, preset=FLAGS.preset) + validation_ds = preprocess_data(dataset=validation_ds, model=FLAGS.model, preset=FLAGS.preset) + + lr = tf.keras.optimizers.schedules.PolynomialDecay( + FLAGS.learning_rate, + decay_steps=train_ds.cardinality() * FLAGS.epochs, + end_learning_rate=0.0) + optimizer = tf.keras.optimizers.experimental.AdamW(lr, weight_decay=0.01, global_clipnorm=1.0) + optimizer.exclude_from_weight_decay(var_names=["LayerNorm", "layer_norm", "bias"]) + model.compile(optimizer=optimizer, loss=loss, metrics=metrics) + + # Start training + model.fit(train_ds, validation_data=validation_ds, epochs=FLAGS.epochs) + + +if __name__=="__main__": + flags.mark_flag_as_required("model") + flags.mark_flag_as_required("preset") + app.run(main) + From 40c4abbf16999764b5a406d7d49b414e1c003936 Mon Sep 17 00:00:00 2001 From: Susnato Dhar Date: Tue, 14 Mar 2023 19:44:56 +0530 Subject: [PATCH 02/17] . --- .../automated_trainval_script.py | 140 ++++++++++++------ 1 file changed, 94 insertions(+), 46 deletions(-) diff --git a/examples/glue_benchmark/automated_trainval_script.py b/examples/glue_benchmark/automated_trainval_script.py index 283d902e2a..8cb62336f0 100644 --- a/examples/glue_benchmark/automated_trainval_script.py +++ b/examples/glue_benchmark/automated_trainval_script.py @@ -1,9 +1,25 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import os -import keras_nlp + import tensorflow as tf -from absl import flags, app -from tensorflow import keras import tensorflow_datasets as tfds +from absl import app +from absl import flags +from tensorflow import keras + +import keras_nlp seed = 42 os.environ["PYTHONHASHSEED"] = str(seed) @@ -11,9 +27,15 @@ flags.DEFINE_string("model", None, "The Model you want to train and evaluate.") -flags.DEFINE_string("preset", None, "The model preset(eg. For bert it is 'bert_base_en', 'bert_tiny_en_uncased')") +flags.DEFINE_string( + "preset", + None, + "The model preset(eg. 
For bert it is 'bert_base_en', 'bert_tiny_en_uncased')", +) flags.DEFINE_string("task", "stsb", "The task you want the model to train on.") -flags.DEFINE_float("learning_rate", 0.005, "The learning_rate for the optimizer.") +flags.DEFINE_float( + "learning_rate", 0.005, "The learning_rate for the optimizer." +) flags.DEFINE_integer("epochs", 2, "No of Epochs.") flags.DEFINE_integer("batch_size", 8, "Batch Size.") @@ -79,11 +101,11 @@ def split_features(x): train_ds = train_ds.map(split_features, num_parallel_calls=tf.data.AUTOTUNE) test_ds = test_ds.map(split_features, num_parallel_calls=tf.data.AUTOTUNE) validation_ds = validation_ds.map( - split_features, - num_parallel_calls=tf.data.AUTOTUNE + split_features, num_parallel_calls=tf.data.AUTOTUNE ) return train_ds, test_ds, validation_ds, idx_order + def create_model(model, preset, task): # output_dim if task in ("cola", "sst2", "mrpc", "qqp", "rte", "qnli", "wnli"): @@ -93,66 +115,84 @@ def create_model(model, preset, task): elif task in ("stsb"): output_dim = 1 else: - raise ValueError(f"Task not supported! Please choose a task from {('cola', 'sst2', 'mrpc', 'qqp', 'rte', 'qnli', 'wnli', 'mnli', 'mnli_matched', 'mnli_mismatched', 'ax', 'stsb')}") + raise ValueError( + f"Task not supported! Please choose a task from {('cola', 'sst2', 'mrpc', 'qqp', 'rte', 'qnli', 'wnli', 'mnli', 'mnli_matched', 'mnli_mismatched', 'ax', 'stsb')}" + ) # select backbone - backbone_dict = {"bert":keras_nlp.models.BertBackbone, - "albert":keras_nlp.models.AlbertBackbone, - "deberta":keras_nlp.models.DebertaV3Backbone, - "distil-bert":keras_nlp.models.DistilBertBackbone, - "roberta":keras_nlp.models.RobertaBackbone, - "xlm-roberta":keras_nlp.models.XLMRobertaBackbone, - "f_net":keras_nlp.models.FNetBackbone, - } + backbone_dict = { + "bert": keras_nlp.models.BertBackbone, + "albert": keras_nlp.models.AlbertBackbone, + "deberta": keras_nlp.models.DebertaV3Backbone, + "distil-bert": keras_nlp.models.DistilBertBackbone, + "roberta": keras_nlp.models.RobertaBackbone, + "xlm-roberta": keras_nlp.models.XLMRobertaBackbone, + "f_net": keras_nlp.models.FNetBackbone, + } if model not in list(backbone_dict.keys()): - raise ValueError(f"Model is either not an Encoder based model(eg. Bert, Albert) or " - f"not supported at this moment! Please select a model from here - {tuple(backbone_dict.keys())}") + raise ValueError( + f"Model is either not an Encoder based model(eg. Bert, Albert) or " + f"not supported at this moment! 
Please select a model from here - {tuple(backbone_dict.keys())}" + ) # Build the model backbone = backbone_dict[model].from_preset(preset) # If the model has pooled_output - if len(backbone.outputs)>1: - output = keras.layers.Dense(output_dim)(backbone.output["pooled_output"]) - elif len(backbone.outputs)==1: + if len(backbone.outputs) > 1: + output = keras.layers.Dense(output_dim)( + backbone.output["pooled_output"] + ) + elif len(backbone.outputs) == 1: output = keras.layers.Dense(output_dim)(backbone.output) model = keras.models.Model(backbone.inputs, output) return model + def preprocess_data(dataset, model, preset): """Run `proprocess_fn` on input dataset then batch & prefetch.""" preprocessor_dict = { - "bert":keras_nlp.models.BertPreprocessor, - "albert":keras_nlp.models.AlbertPreprocessor, - "deberta":keras_nlp.models.DebertaV3Preprocessor, - "distil-bert":keras_nlp.models.DistilBertPreprocessor, - "roberta":keras_nlp.models.RobertaPreprocessor, - "xlm-roberta":keras_nlp.models.XLMRobertaPreprocessor, - "f_net":keras_nlp.models.FNetPreprocessor, + "bert": keras_nlp.models.BertPreprocessor, + "albert": keras_nlp.models.AlbertPreprocessor, + "deberta": keras_nlp.models.DebertaV3Preprocessor, + "distil-bert": keras_nlp.models.DistilBertPreprocessor, + "roberta": keras_nlp.models.RobertaPreprocessor, + "xlm-roberta": keras_nlp.models.XLMRobertaPreprocessor, + "f_net": keras_nlp.models.FNetPreprocessor, } if model not in list(preprocessor_dict.keys()): - raise ValueError(f"Model does not have a preprocessor class. This class is required for preprocessing " - f"of the data before feeding it to the model! Please select a model from here - {tuple(preprocessor_dict.keys())}") + raise ValueError( + f"Model does not have a preprocessor class. This class is required for preprocessing " + f"of the data before feeding it to the model! 
Please select a model from here - {tuple(preprocessor_dict.keys())}" + ) preprocessor = preprocessor_dict[model].from_preset(preset) + def preprocess_fn(feature, label): return preprocessor(feature), label - return dataset.map(preprocess_fn).batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + return ( + dataset.map(preprocess_fn) + .batch(FLAGS.batch_size) + .prefetch(tf.data.AUTOTUNE) + ) + def main(_): print(tf.__version__) print("GPU available : ", tf.test.is_gpu_available()) - print("="*120) - print(f"MODEL : {FLAGS.model} PRESET : {FLAGS.preset} DATASET : {FLAGS.task}") - print("="*120) + print("=" * 120) + print( + f"MODEL : {FLAGS.model} PRESET : {FLAGS.preset} DATASET : {FLAGS.task}" + ) + print("=" * 120) # Load the model - model = create_model(model=FLAGS.model, - preset=FLAGS.preset, - task=FLAGS.task) + model = create_model( + model=FLAGS.model, preset=FLAGS.preset, task=FLAGS.task + ) # Add loss and optimizer loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True) metrics = [keras.metrics.SparseCategoricalAccuracy()] @@ -162,23 +202,31 @@ def main(_): # Load datasets train_ds, test_ds, validation_ds, idx_order = load_data(FLAGS.task) - train_ds = preprocess_data(dataset=train_ds, model=FLAGS.model, preset=FLAGS.preset) - validation_ds = preprocess_data(dataset=validation_ds, model=FLAGS.model, preset=FLAGS.preset) + train_ds = preprocess_data( + dataset=train_ds, model=FLAGS.model, preset=FLAGS.preset + ) + validation_ds = preprocess_data( + dataset=validation_ds, model=FLAGS.model, preset=FLAGS.preset + ) lr = tf.keras.optimizers.schedules.PolynomialDecay( - FLAGS.learning_rate, - decay_steps=train_ds.cardinality() * FLAGS.epochs, - end_learning_rate=0.0) - optimizer = tf.keras.optimizers.experimental.AdamW(lr, weight_decay=0.01, global_clipnorm=1.0) - optimizer.exclude_from_weight_decay(var_names=["LayerNorm", "layer_norm", "bias"]) + FLAGS.learning_rate, + decay_steps=train_ds.cardinality() * FLAGS.epochs, + end_learning_rate=0.0, + ) + optimizer = tf.keras.optimizers.experimental.AdamW( + lr, weight_decay=0.01, global_clipnorm=1.0 + ) + optimizer.exclude_from_weight_decay( + var_names=["LayerNorm", "layer_norm", "bias"] + ) model.compile(optimizer=optimizer, loss=loss, metrics=metrics) # Start training model.fit(train_ds, validation_data=validation_ds, epochs=FLAGS.epochs) -if __name__=="__main__": +if __name__ == "__main__": flags.mark_flag_as_required("model") flags.mark_flag_as_required("preset") app.run(main) - From 6c68a527d6be98147926ae6605a19b4857d6c819 Mon Sep 17 00:00:00 2001 From: Susnato Dhar Date: Wed, 15 Mar 2023 20:10:40 +0530 Subject: [PATCH 03/17] glue_mrpc.py added --- .../automated_trainval_script.py | 232 ------------------ examples/glue_benchmark/glue_mrpc.py | 194 +++++++++++++++ 2 files changed, 194 insertions(+), 232 deletions(-) delete mode 100644 examples/glue_benchmark/automated_trainval_script.py create mode 100644 examples/glue_benchmark/glue_mrpc.py diff --git a/examples/glue_benchmark/automated_trainval_script.py b/examples/glue_benchmark/automated_trainval_script.py deleted file mode 100644 index 8cb62336f0..0000000000 --- a/examples/glue_benchmark/automated_trainval_script.py +++ /dev/null @@ -1,232 +0,0 @@ -# Copyright 2023 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os - -import tensorflow as tf -import tensorflow_datasets as tfds -from absl import app -from absl import flags -from tensorflow import keras - -import keras_nlp - -seed = 42 -os.environ["PYTHONHASHSEED"] = str(seed) -tf.random.set_seed(seed) - - -flags.DEFINE_string("model", None, "The Model you want to train and evaluate.") -flags.DEFINE_string( - "preset", - None, - "The model preset(eg. For bert it is 'bert_base_en', 'bert_tiny_en_uncased')", -) -flags.DEFINE_string("task", "stsb", "The task you want the model to train on.") -flags.DEFINE_float( - "learning_rate", 0.005, "The learning_rate for the optimizer." -) -flags.DEFINE_integer("epochs", 2, "No of Epochs.") -flags.DEFINE_integer("batch_size", 8, "Batch Size.") - - -FLAGS = flags.FLAGS - - -def load_data(task_name): - """ - Load GLUE dataset. - Load GLUE dataset, and convert the dictionary format to (features, label), - where features is a tuple of all input sentences. - """ - if task_name in ("cola", "sst2"): - feature_names = ("sentence",) - elif task_name in ("mrpc", "stsb", "rte", "wnli"): - feature_names = ("sentence1", "sentence2") - elif task_name in ("mnli", "mnli_matched", "mnli_mismatched", "ax"): - feature_names = ("premise", "hypothesis") - elif task_name in "qnli": - feature_names = ("question", "sentence") - elif task_name in "qqp": - feature_names = ("question1", "question2") - else: - raise ValueError(f"Unknown task_name {task_name}.") - - test_suffix = "" - if task_name in ("mnli", "mnli_matched"): - # For "mnli", just run default to "mnli_matched". - task_name = "mnli" - test_suffix = "_matched" - elif task_name in ("mnli_mismatched"): - task_name = "mnli" - test_suffix = "_mismatched" - - def split_features(x): - # GLUE comes with dictonary data, we convert it to a uniform format - # (features, label), where features is a tuple consisting of all - # features. - features = tuple([x[name] for name in feature_names]) - label = x["label"] - return (features, label) - - if task_name == "ax": - # AX is trained and evaluated on MNLI, and has its own test split. - train_ds, validation_ds = tfds.load( - "glue/mnli", - split=["train", "validation_matched"], - ) - test_ds = tfds.load( - "glue/ax", - split="test", - ) - else: - train_ds, test_ds, validation_ds = tfds.load( - f"glue/{task_name}", - split=["train", "test" + test_suffix, "validation" + test_suffix], - ) - - # Extract out the index order of test dataset. - idx_order = test_ds.map(lambda data: data["idx"]) - - train_ds = train_ds.map(split_features, num_parallel_calls=tf.data.AUTOTUNE) - test_ds = test_ds.map(split_features, num_parallel_calls=tf.data.AUTOTUNE) - validation_ds = validation_ds.map( - split_features, num_parallel_calls=tf.data.AUTOTUNE - ) - return train_ds, test_ds, validation_ds, idx_order - - -def create_model(model, preset, task): - # output_dim - if task in ("cola", "sst2", "mrpc", "qqp", "rte", "qnli", "wnli"): - output_dim = 2 - elif task in ("mnli", "mnli_matched", "mnli_mismatched", "ax"): - output_dim = 3 - elif task in ("stsb"): - output_dim = 1 - else: - raise ValueError( - f"Task not supported! 
Please choose a task from {('cola', 'sst2', 'mrpc', 'qqp', 'rte', 'qnli', 'wnli', 'mnli', 'mnli_matched', 'mnli_mismatched', 'ax', 'stsb')}" - ) - - # select backbone - backbone_dict = { - "bert": keras_nlp.models.BertBackbone, - "albert": keras_nlp.models.AlbertBackbone, - "deberta": keras_nlp.models.DebertaV3Backbone, - "distil-bert": keras_nlp.models.DistilBertBackbone, - "roberta": keras_nlp.models.RobertaBackbone, - "xlm-roberta": keras_nlp.models.XLMRobertaBackbone, - "f_net": keras_nlp.models.FNetBackbone, - } - if model not in list(backbone_dict.keys()): - raise ValueError( - f"Model is either not an Encoder based model(eg. Bert, Albert) or " - f"not supported at this moment! Please select a model from here - {tuple(backbone_dict.keys())}" - ) - - # Build the model - backbone = backbone_dict[model].from_preset(preset) - # If the model has pooled_output - if len(backbone.outputs) > 1: - output = keras.layers.Dense(output_dim)( - backbone.output["pooled_output"] - ) - elif len(backbone.outputs) == 1: - output = keras.layers.Dense(output_dim)(backbone.output) - model = keras.models.Model(backbone.inputs, output) - - return model - - -def preprocess_data(dataset, model, preset): - """Run `proprocess_fn` on input dataset then batch & prefetch.""" - - preprocessor_dict = { - "bert": keras_nlp.models.BertPreprocessor, - "albert": keras_nlp.models.AlbertPreprocessor, - "deberta": keras_nlp.models.DebertaV3Preprocessor, - "distil-bert": keras_nlp.models.DistilBertPreprocessor, - "roberta": keras_nlp.models.RobertaPreprocessor, - "xlm-roberta": keras_nlp.models.XLMRobertaPreprocessor, - "f_net": keras_nlp.models.FNetPreprocessor, - } - if model not in list(preprocessor_dict.keys()): - raise ValueError( - f"Model does not have a preprocessor class. This class is required for preprocessing " - f"of the data before feeding it to the model! 
Please select a model from here - {tuple(preprocessor_dict.keys())}" - ) - - preprocessor = preprocessor_dict[model].from_preset(preset) - - def preprocess_fn(feature, label): - return preprocessor(feature), label - - return ( - dataset.map(preprocess_fn) - .batch(FLAGS.batch_size) - .prefetch(tf.data.AUTOTUNE) - ) - - -def main(_): - print(tf.__version__) - print("GPU available : ", tf.test.is_gpu_available()) - - print("=" * 120) - print( - f"MODEL : {FLAGS.model} PRESET : {FLAGS.preset} DATASET : {FLAGS.task}" - ) - print("=" * 120) - - # Load the model - model = create_model( - model=FLAGS.model, preset=FLAGS.preset, task=FLAGS.task - ) - # Add loss and optimizer - loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metrics = [keras.metrics.SparseCategoricalAccuracy()] - if FLAGS.task == "stsb": - loss = keras.losses.MeanSquaredError() - metrics = [keras.metrics.MeanSquaredError()] - - # Load datasets - train_ds, test_ds, validation_ds, idx_order = load_data(FLAGS.task) - train_ds = preprocess_data( - dataset=train_ds, model=FLAGS.model, preset=FLAGS.preset - ) - validation_ds = preprocess_data( - dataset=validation_ds, model=FLAGS.model, preset=FLAGS.preset - ) - - lr = tf.keras.optimizers.schedules.PolynomialDecay( - FLAGS.learning_rate, - decay_steps=train_ds.cardinality() * FLAGS.epochs, - end_learning_rate=0.0, - ) - optimizer = tf.keras.optimizers.experimental.AdamW( - lr, weight_decay=0.01, global_clipnorm=1.0 - ) - optimizer.exclude_from_weight_decay( - var_names=["LayerNorm", "layer_norm", "bias"] - ) - model.compile(optimizer=optimizer, loss=loss, metrics=metrics) - - # Start training - model.fit(train_ds, validation_data=validation_ds, epochs=FLAGS.epochs) - - -if __name__ == "__main__": - flags.mark_flag_as_required("model") - flags.mark_flag_as_required("preset") - app.run(main) diff --git a/examples/glue_benchmark/glue_mrpc.py b/examples/glue_benchmark/glue_mrpc.py new file mode 100644 index 0000000000..76106165d9 --- /dev/null +++ b/examples/glue_benchmark/glue_mrpc.py @@ -0,0 +1,194 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import inspect +import os + +import tensorflow as tf +import tensorflow_datasets as tfds +from absl import app +from absl import flags +from tensorflow import keras + +import keras_nlp + +seed = 42 +os.environ["PYTHONHASHSEED"] = str(seed) +tf.random.set_seed(seed) + +flags.DEFINE_string("model", None, "The Model you want to train and evaluate.") +flags.DEFINE_bool( + "freeze_backbone", + True, + "If you want to freeze the backbone and only train the downstream layers or not.", +) +flags.DEFINE_string( + "preset", + None, + "The model preset(eg. For bert it is 'bert_base_en', 'bert_tiny_en_uncased')", +) +flags.DEFINE_float( + "learning_rate", 0.005, "The learning_rate for the optimizer." +) +flags.DEFINE_integer("epochs", 2, "No of Epochs.") +flags.DEFINE_integer("batch_size", 8, "Batch Size.") + + +FLAGS = flags.FLAGS + + +def load_data(): + """ + Load GLUE dataset. 
+ Load GLUE dataset, and convert the dictionary format to (features, label), + where features is a tuple of all input sentences. + """ + + feature_names = ("sentence1", "sentence2") + + def split_features(x): + # GLUE comes with dictonary data, we convert it to a uniform format + # (features, label), where features is a tuple consisting of all + # features. + features = tuple([x[name] for name in feature_names]) + label = x["label"] + return (features, label) + + train_ds, test_ds, validation_ds = tfds.load( + "glue/mrpc", + split=["train", "test", "validation"], + ) + + train_ds = train_ds.map(split_features, num_parallel_calls=tf.data.AUTOTUNE) + test_ds = test_ds.map(split_features, num_parallel_calls=tf.data.AUTOTUNE) + validation_ds = validation_ds.map( + split_features, num_parallel_calls=tf.data.AUTOTUNE + ) + return train_ds, test_ds, validation_ds + + +def load_model_and_preprocessor(model, preset): + for name, symbol in keras_nlp.models.__dict__.items(): + if inspect.isclass(symbol) and issubclass(symbol, keras.Model): + if model and name != model: + continue + if not hasattr(symbol, "from_preset"): + continue + for _preset in symbol.presets: + if preset and _preset != preset: + continue + model = symbol.from_preset(preset) + print(f"Using model {name} with preset {preset}") + preprocessor = keras_nlp.models.__dict__[ + name.replace("Backbone", "Preprocessor") + ].from_preset(preset) + + return model, preprocessor + + raise ValueError(f"Model {model} or preset {preset} not found.") + + +def create_model(model, preset): + # output_dim + output_dim = 2 + # select backbone + + if "backbone" not in FLAGS.model.lower(): + backbones = [ + i + for i in keras_nlp.models.__dict__ + if "Backbone" in i + and i not in ["OPTBackbone", "GPT2Backbone", "WhisperBackbone"] + ] + raise ValueError( + f"Please enter a Backbone, from this list - {backbones}, the script will take care of the downstream layers." 
+ ) + + # Build the model + backbone, preprocessor = load_model_and_preprocessor( + model=model, preset=preset + ) + # Freeze + if FLAGS.freeze_backbone: + backbone.trainable = False + else: + backbone.trainable = True + # If the model has pooled_output + if len(backbone.outputs) > 1: + output = keras.layers.Dense(output_dim)( + backbone.output["pooled_output"] + ) + elif len(backbone.outputs) == 1: + output = keras.layers.Dense(output_dim)(backbone.output) + model = keras.models.Model(backbone.inputs, output) + + return model, preprocessor + + +def preprocess_data(dataset, preprocessor): + """Run `proprocess_fn` on input dataset then batch & prefetch.""" + + def preprocess_fn(feature, label): + return preprocessor(feature), label + + return ( + dataset.map(preprocess_fn) + .batch(FLAGS.batch_size) + .prefetch(tf.data.AUTOTUNE) + ) + + +def main(_): + print(tf.__version__) + print("GPU available : ", tf.test.is_gpu_available()) + + print("=" * 120) + print( + f"MODEL : {FLAGS.model} PRESET : {FLAGS.preset} DATASET : glue/mrpc" + ) + print("=" * 120) + + # Load the model + model, preprocessor = create_model(model=FLAGS.model, preset=FLAGS.preset) + # Add loss and optimizer + loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True) + metrics = [keras.metrics.SparseCategoricalAccuracy()] + + # Load datasets + train_ds, test_ds, validation_ds = load_data() + train_ds = preprocess_data(dataset=train_ds, preprocessor=preprocessor) + validation_ds = preprocess_data( + dataset=validation_ds, preprocessor=preprocessor + ) + + lr = tf.keras.optimizers.schedules.PolynomialDecay( + FLAGS.learning_rate, + decay_steps=train_ds.cardinality() * FLAGS.epochs, + end_learning_rate=0.0, + ) + optimizer = tf.keras.optimizers.experimental.AdamW( + lr, weight_decay=0.01, global_clipnorm=1.0 + ) + optimizer.exclude_from_weight_decay( + var_names=["LayerNorm", "layer_norm", "bias"] + ) + model.compile(optimizer=optimizer, loss=loss, metrics=metrics) + + # Start training + model.fit(train_ds, validation_data=validation_ds, epochs=FLAGS.epochs) + + +if __name__ == "__main__": + flags.mark_flag_as_required("model") + flags.mark_flag_as_required("preset") + app.run(main) From bc422234b0fb9346568a307354c281ead4846274 Mon Sep 17 00:00:00 2001 From: Susnato Dhar Date: Wed, 15 Mar 2023 20:12:00 +0530 Subject: [PATCH 04/17] nit --- examples/glue_benchmark/glue_mrpc.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/glue_benchmark/glue_mrpc.py b/examples/glue_benchmark/glue_mrpc.py index 76106165d9..78ca666c5b 100644 --- a/examples/glue_benchmark/glue_mrpc.py +++ b/examples/glue_benchmark/glue_mrpc.py @@ -49,8 +49,7 @@ def load_data(): """ - Load GLUE dataset. - Load GLUE dataset, and convert the dictionary format to (features, label), + Load GLUE/MRPC dataset, and convert the dictionary format to (features, label), where features is a tuple of all input sentences. """ From 1a1242699947359db69ef527972ae54ff3b249cb Mon Sep 17 00:00:00 2001 From: Susnato Dhar Date: Thu, 16 Mar 2023 00:22:43 +0530 Subject: [PATCH 05/17] . --- examples/glue_benchmark/glue_mrpc.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/examples/glue_benchmark/glue_mrpc.py b/examples/glue_benchmark/glue_mrpc.py index 78ca666c5b..2b626e05dc 100644 --- a/examples/glue_benchmark/glue_mrpc.py +++ b/examples/glue_benchmark/glue_mrpc.py @@ -13,6 +13,7 @@ # limitations under the License. 
import inspect import os +import time import tensorflow as tf import tensorflow_datasets as tfds @@ -169,6 +170,7 @@ def main(_): validation_ds = preprocess_data( dataset=validation_ds, preprocessor=preprocessor ) + print("GLUE/MRPC Dataset Loaded!") lr = tf.keras.optimizers.schedules.PolynomialDecay( FLAGS.learning_rate, @@ -184,7 +186,16 @@ def main(_): model.compile(optimizer=optimizer, loss=loss, metrics=metrics) # Start training + print("Starting Training...") + + st = time.time() model.fit(train_ds, validation_data=validation_ds, epochs=FLAGS.epochs) + et = time.time() + + print("Training Finished!") + print( + f"Training took :: {(et-st):.4f} seconds, or {((et-st)/60):.2f} minutes, or {((et-st)/3600):.2f} hours!" + ) if __name__ == "__main__": From 08aa45f1ecebdc9ff8b03caec3bf7df9d744482e Mon Sep 17 00:00:00 2001 From: Susnato Dhar Date: Fri, 17 Mar 2023 20:56:28 +0530 Subject: [PATCH 06/17] renamed glue_mrpc->glue and changes --- examples/glue_benchmark/glue.py | 362 +++++++++------------------ examples/glue_benchmark/glue_mrpc.py | 204 --------------- 2 files changed, 123 insertions(+), 443 deletions(-) delete mode 100644 examples/glue_benchmark/glue_mrpc.py diff --git a/examples/glue_benchmark/glue.py b/examples/glue_benchmark/glue.py index 358b61221e..eaa18cfe36 100644 --- a/examples/glue_benchmark/glue.py +++ b/examples/glue_benchmark/glue.py @@ -11,10 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import csv +import inspect import os +import time -import numpy as np import tensorflow as tf import tensorflow_datasets as tfds from absl import app @@ -23,86 +23,44 @@ import keras_nlp -FLAGS = flags.FLAGS +seed = 42 +os.environ["PYTHONHASHSEED"] = str(seed) +tf.random.set_seed(seed) + flags.DEFINE_string( - "task_name", + "task", "mrpc", - "The name of the GLUE task to finetune on.", -) - -flags.DEFINE_integer( - "batch_size", - 32, - "Batch size of data.", -) - -flags.DEFINE_integer( - "epochs", - 2, - "Number of epochs to run finetuning.", + "The name of the test that the model is going to be trained and evaluated on.", ) - -flags.DEFINE_float( - "learning_rate", - 5e-5, - "Learning rate", +flags.DEFINE_string("model", None, "The Model you want to train and evaluate.") +flags.DEFINE_bool( + "freeze_backbone", + True, + "If you want to freeze the backbone and only train the downstream layers or not.", ) - flags.DEFINE_string( - "tpu_name", + "preset", None, - "The name of TPU to connect to. If None, no TPU will be used. If you only " - "have one TPU, use `local`", + "The model preset(eg. For bert it is 'bert_base_en', 'bert_tiny_en_uncased')", ) - -flags.DEFINE_string( - "submission_directory", - None, - "The directory to save the glue submission file.", +flags.DEFINE_float( + "learning_rate", 0.005, "The learning_rate for the optimizer." ) +flags.DEFINE_integer("epochs", 2, "No of Epochs.") +flags.DEFINE_integer("batch_size", 8, "Batch Size.") -flags.DEFINE_string( - "load_finetuning_model", - None, - "The path to load the finetuning model. If None, the model is trained.", -) -flags.DEFINE_string( - "save_finetuning_model", - None, - "The path to save the finetuning model. If None, the model is not saved.", -) +FLAGS = flags.FLAGS -def load_data(task_name): +def load_data(): """ - Load GLUE dataset. 
- - Load GLUE dataset, and convert the dictionary format to (features, label), + Load GLUE/MRPC dataset, and convert the dictionary format to (features, label), where features is a tuple of all input sentences. """ - if task_name in ("cola", "sst2"): - feature_names = ("sentence",) - elif task_name in ("mrpc", "stsb", "rte", "wnli"): - feature_names = ("sentence1", "sentence2") - elif task_name in ("mnli", "mnli_matched", "mnli_mismatched", "ax"): - feature_names = ("premise", "hypothesis") - elif task_name in "qnli": - feature_names = ("question", "sentence") - elif task_name in "qqp": - feature_names = ("question1", "question2") - else: - raise ValueError(f"Unknown task_name {task_name}.") - - test_suffix = "" - if task_name in ("mnli", "mnli_matched"): - # For "mnli", just run default to "mnli_matched". - task_name = "mnli" - test_suffix = "_matched" - elif task_name in ("mnli_mismatched"): - task_name = "mnli" - test_suffix = "_mismatched" + + feature_names = ("sentence1", "sentence2") def split_features(x): # GLUE comes with dictonary data, we convert it to a uniform format @@ -112,35 +70,62 @@ def split_features(x): label = x["label"] return (features, label) - if task_name == "ax": - # AX is trained and evaluated on MNLI, and has its own test split. - train_ds, validation_ds = tfds.load( - "glue/mnli", - split=["train", "validation_matched"], - ) - test_ds = tfds.load( - "glue/ax", - split="test", - ) - else: - train_ds, test_ds, validation_ds = tfds.load( - f"glue/{task_name}", - split=["train", "test" + test_suffix, "validation" + test_suffix], - ) - - # Extract out the index order of test dataset. - idx_order = test_ds.map(lambda data: data["idx"]) + train_ds, test_ds, validation_ds = tfds.load( + "glue/mrpc", + split=["train", "test", "validation"], + ) train_ds = train_ds.map(split_features, num_parallel_calls=tf.data.AUTOTUNE) test_ds = test_ds.map(split_features, num_parallel_calls=tf.data.AUTOTUNE) validation_ds = validation_ds.map( split_features, num_parallel_calls=tf.data.AUTOTUNE ) - return train_ds, test_ds, validation_ds, idx_order + return train_ds, test_ds, validation_ds + + +def load_model_and_preprocessor(model, preset, num_classes): + for name, symbol in keras_nlp.models.__dict__.items(): + if inspect.isclass(symbol) and issubclass(symbol, keras.Model): + if model and name != model: + continue + if not hasattr(symbol, "from_preset"): + continue + for _preset in symbol.presets: + if preset and _preset != preset: + continue + if "Backbone" in name: + model = keras_nlp.models.__dict__[ + name.replace("Backbone", "Classifier") + ]( + backbone=symbol.from_preset(preset), + num_classes=num_classes, + preprocessor=None, + ) + preprocessor = keras_nlp.models.__dict__[ + name.replace("Backbone", "Preprocessor") + ].from_preset(preset) + elif "Classifier" in name: + model = symbol.from_preset( + preset=preset, + num_classes=num_classes, + preprocessor=None, + ) + preprocessor = keras_nlp.models.__dict__[ + name.replace("Classifier", "Preprocessor") + ].from_preset(preset) + + print(f"Using model {name} with preset {preset}") + return model, preprocessor + + raise ValueError(f"Model {model} or preset {preset} not found.") + + +def preprocess_data(dataset, preprocessor): + """Run `proprocess_fn` on input dataset then batch & prefetch.""" + def preprocess_fn(feature, label): + return preprocessor(feature), label -def preprocess_data(preprocess_fn, dataset): - """Run `proprocess_fn` on input dataset then batch & prefetch.""" return ( dataset.map(preprocess_fn) 
.batch(FLAGS.batch_size) @@ -148,164 +133,63 @@ def preprocess_data(preprocess_fn, dataset): ) -def generate_submission_files(finetuning_model, test_ds, idx_order): - """Generate GLUE leaderboard submission files.""" - filenames = { - "cola": "CoLA.tsv", - "sst2": "SST-2.tsv", - "mrpc": "MRPC.tsv", - "qqp": "QQP.tsv", - "stsb": "STS-B.tsv", - "mnli_matched": "MNLI-m.tsv", - "mnli_mismatched": "MNLI-mm.tsv", - "qnli": "QNLI.tsv", - "rte": "RTE.tsv", - "wnli": "WNLI.tsv", - "ax": "AX.tsv", - } - - labelnames = { - "mnli_matched": ["entailment", "neutral", "contradiction"], - "mnli_mismatched": ["entailment", "neutral", "contradiction"], - "ax": ["entailment", "neutral", "contradiction"], - "qnli": ["entailment", "not_entailment"], - "rte": ["entailment", "not_entailment"], - } - if not os.path.exists(FLAGS.submission_directory): - os.makedirs(FLAGS.submission_directory) - filename = FLAGS.submission_directory + "/" + filenames[FLAGS.task_name] - labelname = labelnames.get(FLAGS.task_name) - - predictions = finetuning_model.predict(test_ds) - if FLAGS.task_name == "stsb": - predictions = np.squeeze(predictions) - else: - predictions = np.argmax(predictions, -1) - - # Map the predictions to the right index order. - idx_order = list(idx_order.as_numpy_iterator()) - contents = ["" for _ in idx_order] - for idx, pred in zip(idx_order, predictions): - if labelname: - pred_value = labelname[int(pred)] - else: - pred_value = pred - if FLAGS.task_name == "stsb": - pred_value = min(pred_value, 5) - pred_value = max(pred_value, 0) - pred_value = f"{pred_value:.3f}" - contents[idx] = pred_value - - with tf.io.gfile.GFile(filename, "w") as f: - # GLUE requires a format of index + tab + prediction. - writer = csv.writer(f, delimiter="\t") - # Write the required headline for GLUE. - writer.writerow(["index", "prediction"]) - - for idx, value in enumerate(contents): - writer.writerow([idx, value]) - - -def connect_to_tpu(tpu_name): - resolver = tf.distribute.cluster_resolver.TPUClusterResolver.connect( - tpu=tpu_name +def main(_): + # checking task version (erroring out other testes except "mrpc") + if FLAGS.task != "mrpc": + raise ValueError("task - mrpc is only supported currently.") + + print(tf.__version__) + print("GPU available : ", tf.test.is_gpu_available()) + + print("=" * 120) + print( + f"MODEL : {FLAGS.model} | PRESET : {FLAGS.preset} | DATASET : glue/mrpc | batch_size : {FLAGS.batch_size} | epochs : {FLAGS.epochs}" ) - return tf.distribute.TPUStrategy(resolver) + print("=" * 120) + # Load the model + model, preprocessor = load_model_and_preprocessor( + model=FLAGS.model, preset=FLAGS.preset, num_classes=2 + ) + # Add loss and optimizer + loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True) + metrics = [keras.metrics.SparseCategoricalAccuracy()] + + # Load datasets + train_ds, test_ds, validation_ds = load_data() + train_ds = preprocess_data(dataset=train_ds, preprocessor=preprocessor) + validation_ds = preprocess_data( + dataset=validation_ds, preprocessor=preprocessor + ) + print("GLUE/MRPC Dataset Loaded!") -def main(_): - if FLAGS.tpu_name: - strategy = connect_to_tpu(FLAGS.tpu_name) - policy = keras.mixed_precision.Policy("mixed_bfloat16") - else: - # Use default strategy if not using TPU. 
- strategy = tf.distribute.get_strategy() - policy = keras.mixed_precision.Policy("mixed_float16") - keras.mixed_precision.set_global_policy(policy) - - train_ds, test_ds, val_ds, idx_order = load_data(FLAGS.task_name) - # ----- Custom code block starts ----- - bert_preprocessor = keras_nlp.models.BertPreprocessor.from_preset( - "bert_base_en_uncased" + lr = tf.keras.optimizers.schedules.PolynomialDecay( + FLAGS.learning_rate, + decay_steps=train_ds.cardinality() * FLAGS.epochs, + end_learning_rate=0.0, + ) + optimizer = tf.keras.optimizers.experimental.AdamW( + lr, weight_decay=0.01, global_clipnorm=1.0 + ) + optimizer.exclude_from_weight_decay( + var_names=["LayerNorm", "layer_norm", "bias"] ) + model.compile(optimizer=optimizer, loss=loss, metrics=metrics) - # Users should change this function to implement the preprocessing required - # by the model. - def preprocess_fn(feature, label): - return bert_preprocessor(feature), label - - # ----- Custom code block ends ----- - - train_ds = preprocess_data(preprocess_fn, train_ds) - val_ds = preprocess_data(preprocess_fn, val_ds) - test_ds = preprocess_data(preprocess_fn, test_ds) - - if FLAGS.load_finetuning_model: - with strategy.scope(): - finetuning_model = tf.keras.models.load_model( - FLAGS.load_finetuning_model - ) - else: - with strategy.scope(): - loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metrics = [keras.metrics.SparseCategoricalAccuracy()] - if FLAGS.task_name == "stsb": - num_classes = 1 - loss = keras.losses.MeanSquaredError() - metrics = [keras.metrics.MeanSquaredError()] - elif FLAGS.task_name in ( - "mnli", - "mnli_mismatched", - "mnli_matched", - "ax", - ): - num_classes = 3 - else: - num_classes = 2 - - # ----- Custom code block starts ----- - # Users should change this `BertClassifier` to your own classifier. - # Commonly the classifier is simply your model + several dense layers, - # please refer to "Make the Finetuning Model" section in README for - # detailed instructions. - bert_model = keras_nlp.models.BertBackbone.from_preset( - "bert_base_en_uncased" - ) - finetuning_model = keras_nlp.models.BertClassifier( - backbone=bert_model, - num_classes=num_classes, - ) - # ----- Custom code block ends ----- - lr = tf.keras.optimizers.schedules.PolynomialDecay( - FLAGS.learning_rate, - decay_steps=train_ds.cardinality() * FLAGS.epochs, - end_learning_rate=0.0, - ) - optimizer = tf.keras.optimizers.experimental.AdamW( - lr, weight_decay=0.01, global_clipnorm=1.0 - ) - optimizer.exclude_from_weight_decay( - var_names=["LayerNorm", "layer_norm", "bias"] - ) - finetuning_model.compile( - optimizer=optimizer, - loss=loss, - metrics=metrics, - ) - - finetuning_model.fit( - train_ds, - validation_data=val_ds, - epochs=FLAGS.epochs, - ) - with strategy.scope(): - if FLAGS.submission_directory: - generate_submission_files(finetuning_model, test_ds, idx_order) - if FLAGS.save_finetuning_model: - # Don't need to save the optimizer. - finetuning_model.optimizer = None - finetuning_model.save(FLAGS.save_finetuning_model) + # Start training + print("Starting Training...") + + st = time.time() + model.fit(train_ds, validation_data=validation_ds, epochs=FLAGS.epochs) + et = time.time() + + print("Training Finished!") + print( + f"Training took :: {(et-st):.4f} seconds, or {((et-st)/60):.2f} minutes, or {((et-st)/3600):.2f} hours!" 
+ ) if __name__ == "__main__": + flags.mark_flag_as_required("model") + flags.mark_flag_as_required("preset") app.run(main) diff --git a/examples/glue_benchmark/glue_mrpc.py b/examples/glue_benchmark/glue_mrpc.py deleted file mode 100644 index 2b626e05dc..0000000000 --- a/examples/glue_benchmark/glue_mrpc.py +++ /dev/null @@ -1,204 +0,0 @@ -# Copyright 2023 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import inspect -import os -import time - -import tensorflow as tf -import tensorflow_datasets as tfds -from absl import app -from absl import flags -from tensorflow import keras - -import keras_nlp - -seed = 42 -os.environ["PYTHONHASHSEED"] = str(seed) -tf.random.set_seed(seed) - -flags.DEFINE_string("model", None, "The Model you want to train and evaluate.") -flags.DEFINE_bool( - "freeze_backbone", - True, - "If you want to freeze the backbone and only train the downstream layers or not.", -) -flags.DEFINE_string( - "preset", - None, - "The model preset(eg. For bert it is 'bert_base_en', 'bert_tiny_en_uncased')", -) -flags.DEFINE_float( - "learning_rate", 0.005, "The learning_rate for the optimizer." -) -flags.DEFINE_integer("epochs", 2, "No of Epochs.") -flags.DEFINE_integer("batch_size", 8, "Batch Size.") - - -FLAGS = flags.FLAGS - - -def load_data(): - """ - Load GLUE/MRPC dataset, and convert the dictionary format to (features, label), - where features is a tuple of all input sentences. - """ - - feature_names = ("sentence1", "sentence2") - - def split_features(x): - # GLUE comes with dictonary data, we convert it to a uniform format - # (features, label), where features is a tuple consisting of all - # features. 
- features = tuple([x[name] for name in feature_names]) - label = x["label"] - return (features, label) - - train_ds, test_ds, validation_ds = tfds.load( - "glue/mrpc", - split=["train", "test", "validation"], - ) - - train_ds = train_ds.map(split_features, num_parallel_calls=tf.data.AUTOTUNE) - test_ds = test_ds.map(split_features, num_parallel_calls=tf.data.AUTOTUNE) - validation_ds = validation_ds.map( - split_features, num_parallel_calls=tf.data.AUTOTUNE - ) - return train_ds, test_ds, validation_ds - - -def load_model_and_preprocessor(model, preset): - for name, symbol in keras_nlp.models.__dict__.items(): - if inspect.isclass(symbol) and issubclass(symbol, keras.Model): - if model and name != model: - continue - if not hasattr(symbol, "from_preset"): - continue - for _preset in symbol.presets: - if preset and _preset != preset: - continue - model = symbol.from_preset(preset) - print(f"Using model {name} with preset {preset}") - preprocessor = keras_nlp.models.__dict__[ - name.replace("Backbone", "Preprocessor") - ].from_preset(preset) - - return model, preprocessor - - raise ValueError(f"Model {model} or preset {preset} not found.") - - -def create_model(model, preset): - # output_dim - output_dim = 2 - # select backbone - - if "backbone" not in FLAGS.model.lower(): - backbones = [ - i - for i in keras_nlp.models.__dict__ - if "Backbone" in i - and i not in ["OPTBackbone", "GPT2Backbone", "WhisperBackbone"] - ] - raise ValueError( - f"Please enter a Backbone, from this list - {backbones}, the script will take care of the downstream layers." - ) - - # Build the model - backbone, preprocessor = load_model_and_preprocessor( - model=model, preset=preset - ) - # Freeze - if FLAGS.freeze_backbone: - backbone.trainable = False - else: - backbone.trainable = True - # If the model has pooled_output - if len(backbone.outputs) > 1: - output = keras.layers.Dense(output_dim)( - backbone.output["pooled_output"] - ) - elif len(backbone.outputs) == 1: - output = keras.layers.Dense(output_dim)(backbone.output) - model = keras.models.Model(backbone.inputs, output) - - return model, preprocessor - - -def preprocess_data(dataset, preprocessor): - """Run `proprocess_fn` on input dataset then batch & prefetch.""" - - def preprocess_fn(feature, label): - return preprocessor(feature), label - - return ( - dataset.map(preprocess_fn) - .batch(FLAGS.batch_size) - .prefetch(tf.data.AUTOTUNE) - ) - - -def main(_): - print(tf.__version__) - print("GPU available : ", tf.test.is_gpu_available()) - - print("=" * 120) - print( - f"MODEL : {FLAGS.model} PRESET : {FLAGS.preset} DATASET : glue/mrpc" - ) - print("=" * 120) - - # Load the model - model, preprocessor = create_model(model=FLAGS.model, preset=FLAGS.preset) - # Add loss and optimizer - loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metrics = [keras.metrics.SparseCategoricalAccuracy()] - - # Load datasets - train_ds, test_ds, validation_ds = load_data() - train_ds = preprocess_data(dataset=train_ds, preprocessor=preprocessor) - validation_ds = preprocess_data( - dataset=validation_ds, preprocessor=preprocessor - ) - print("GLUE/MRPC Dataset Loaded!") - - lr = tf.keras.optimizers.schedules.PolynomialDecay( - FLAGS.learning_rate, - decay_steps=train_ds.cardinality() * FLAGS.epochs, - end_learning_rate=0.0, - ) - optimizer = tf.keras.optimizers.experimental.AdamW( - lr, weight_decay=0.01, global_clipnorm=1.0 - ) - optimizer.exclude_from_weight_decay( - var_names=["LayerNorm", "layer_norm", "bias"] - ) - model.compile(optimizer=optimizer, 
loss=loss, metrics=metrics) - - # Start training - print("Starting Training...") - - st = time.time() - model.fit(train_ds, validation_data=validation_ds, epochs=FLAGS.epochs) - et = time.time() - - print("Training Finished!") - print( - f"Training took :: {(et-st):.4f} seconds, or {((et-st)/60):.2f} minutes, or {((et-st)/3600):.2f} hours!" - ) - - -if __name__ == "__main__": - flags.mark_flag_as_required("model") - flags.mark_flag_as_required("preset") - app.run(main) From d62de3cbb9c7491652444e8a7fa0ecc85191d696 Mon Sep 17 00:00:00 2001 From: Susnato Dhar Date: Fri, 17 Mar 2023 21:05:53 +0530 Subject: [PATCH 07/17] . --- examples/glue_benchmark/glue.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/examples/glue_benchmark/glue.py b/examples/glue_benchmark/glue.py index eaa18cfe36..4a63379dbf 100644 --- a/examples/glue_benchmark/glue.py +++ b/examples/glue_benchmark/glue.py @@ -31,14 +31,9 @@ flags.DEFINE_string( "task", "mrpc", - "The name of the test that the model is going to be trained and evaluated on.", + "The name of the GLUE task to finetune on.", ) flags.DEFINE_string("model", None, "The Model you want to train and evaluate.") -flags.DEFINE_bool( - "freeze_backbone", - True, - "If you want to freeze the backbone and only train the downstream layers or not.", -) flags.DEFINE_string( "preset", None, From 3c12c6060ea226384992d57a12f7ed29a3e52b8a Mon Sep 17 00:00:00 2001 From: Susnato Dhar Date: Fri, 17 Mar 2023 21:11:46 +0530 Subject: [PATCH 08/17] . --- examples/glue_benchmark/glue.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/glue_benchmark/glue.py b/examples/glue_benchmark/glue.py index 4a63379dbf..ef92ab5f6e 100644 --- a/examples/glue_benchmark/glue.py +++ b/examples/glue_benchmark/glue.py @@ -11,9 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +# DISCLAIMER:This script only supports GLUE/mrpc (for now). # + import inspect import os import time +import warnings import tensorflow as tf import tensorflow_datasets as tfds @@ -129,6 +133,7 @@ def preprocess_fn(feature, label): def main(_): + warnings.warn("DISCLAIMER:This script only supports GLUE/mrpc (for now).") # checking task version (erroring out other testes except "mrpc") if FLAGS.task != "mrpc": raise ValueError("task - mrpc is only supported currently.") From 2c4fdc6c5beb57d9655bb2d21fee9c909956da81 Mon Sep 17 00:00:00 2001 From: Susnato Dhar Date: Thu, 23 Mar 2023 10:53:56 +0530 Subject: [PATCH 09/17] comments --- examples/glue_benchmark/glue.py | 110 +++++++++++++------------------- 1 file changed, 46 insertions(+), 64 deletions(-) diff --git a/examples/glue_benchmark/glue.py b/examples/glue_benchmark/glue.py index ef92ab5f6e..31b1fb6f84 100644 --- a/examples/glue_benchmark/glue.py +++ b/examples/glue_benchmark/glue.py @@ -12,7 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. -# DISCLAIMER:This script only supports GLUE/mrpc (for now). # +""" GLUE benchmark script to test model performance. + +To run the script, use this command: +``` +python3 glue.py --model BertClassifier \ + --preset bert_base_en \ + --epochs 5 \ + --batch_size 16 \ + --learning_rate 0.001 \ + --mixed_precision_policy mixed_float16 +``` + +Disclaimer: This script only supports GLUE/mrpc (for now). 
+""" import inspect import os @@ -23,12 +36,12 @@ import tensorflow_datasets as tfds from absl import app from absl import flags +from absl import logging from tensorflow import keras import keras_nlp seed = 42 -os.environ["PYTHONHASHSEED"] = str(seed) tf.random.set_seed(seed) @@ -37,7 +50,7 @@ "mrpc", "The name of the GLUE task to finetune on.", ) -flags.DEFINE_string("model", None, "The Model you want to train and evaluate.") +flags.DEFINE_string("model", None, "The name of the classifier such as BertClassifier.") flags.DEFINE_string( "preset", None, @@ -46,6 +59,11 @@ flags.DEFINE_float( "learning_rate", 0.005, "The learning_rate for the optimizer." ) +flags.DEFINE_string( + "mixed_precision_policy", + "mixed_float16", + "The global precision policy to use. E.g. 'mixed_float16' or 'float32'.", +) flags.DEFINE_integer("epochs", 2, "No of Epochs.") flags.DEFINE_integer("batch_size", 8, "Batch Size.") @@ -79,76 +97,40 @@ def split_features(x): validation_ds = validation_ds.map( split_features, num_parallel_calls=tf.data.AUTOTUNE ) + return train_ds, test_ds, validation_ds -def load_model_and_preprocessor(model, preset, num_classes): +def load_model(model, preset, num_classes): for name, symbol in keras_nlp.models.__dict__.items(): if inspect.isclass(symbol) and issubclass(symbol, keras.Model): if model and name != model: continue if not hasattr(symbol, "from_preset"): continue - for _preset in symbol.presets: - if preset and _preset != preset: + for preset in symbol.presets: + if preset and preset != preset: continue - if "Backbone" in name: - model = keras_nlp.models.__dict__[ - name.replace("Backbone", "Classifier") - ]( - backbone=symbol.from_preset(preset), - num_classes=num_classes, - preprocessor=None, - ) - preprocessor = keras_nlp.models.__dict__[ - name.replace("Backbone", "Preprocessor") - ].from_preset(preset) - elif "Classifier" in name: - model = symbol.from_preset( - preset=preset, - num_classes=num_classes, - preprocessor=None, - ) - preprocessor = keras_nlp.models.__dict__[ - name.replace("Classifier", "Preprocessor") - ].from_preset(preset) - - print(f"Using model {name} with preset {preset}") - return model, preprocessor + model = symbol.from_preset(preset, num_classes=num_classes) + logging.info(f"\nUsing model {name} with preset {preset}\n") + return model raise ValueError(f"Model {model} or preset {preset} not found.") -def preprocess_data(dataset, preprocessor): - """Run `proprocess_fn` on input dataset then batch & prefetch.""" - - def preprocess_fn(feature, label): - return preprocessor(feature), label - - return ( - dataset.map(preprocess_fn) - .batch(FLAGS.batch_size) - .prefetch(tf.data.AUTOTUNE) - ) - - def main(_): - warnings.warn("DISCLAIMER:This script only supports GLUE/mrpc (for now).") + keras.mixed_precision.set_global_policy(FLAGS.mixed_precision_policy) + # checking task version (erroring out other testes except "mrpc") if FLAGS.task != "mrpc": - raise ValueError("task - mrpc is only supported currently.") - - print(tf.__version__) - print("GPU available : ", tf.test.is_gpu_available()) + raise ValueError(f"For now this script only supports mrpc, but received {FLAGS.task}") - print("=" * 120) - print( - f"MODEL : {FLAGS.model} | PRESET : {FLAGS.preset} | DATASET : glue/mrpc | batch_size : {FLAGS.batch_size} | epochs : {FLAGS.epochs}" + logging.info( + f"\nMODEL : {FLAGS.model} | PRESET : {FLAGS.preset} | DATASET : glue/mrpc | batch_size : {FLAGS.batch_size} | epochs : {FLAGS.epochs}\n" ) - print("=" * 120) # Load the model - model, preprocessor = 
load_model_and_preprocessor( + model = load_model( model=FLAGS.model, preset=FLAGS.preset, num_classes=2 ) # Add loss and optimizer @@ -157,11 +139,9 @@ def main(_): # Load datasets train_ds, test_ds, validation_ds = load_data() - train_ds = preprocess_data(dataset=train_ds, preprocessor=preprocessor) - validation_ds = preprocess_data( - dataset=validation_ds, preprocessor=preprocessor - ) - print("GLUE/MRPC Dataset Loaded!") + train_ds = train_ds.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + test_ds = test_ds.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + validation_ds = validation_ds.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) lr = tf.keras.optimizers.schedules.PolynomialDecay( FLAGS.learning_rate, @@ -169,7 +149,7 @@ def main(_): end_learning_rate=0.0, ) optimizer = tf.keras.optimizers.experimental.AdamW( - lr, weight_decay=0.01, global_clipnorm=1.0 + lr, weight_decay=0.01 ) optimizer.exclude_from_weight_decay( var_names=["LayerNorm", "layer_norm", "bias"] @@ -177,16 +157,18 @@ def main(_): model.compile(optimizer=optimizer, loss=loss, metrics=metrics) # Start training - print("Starting Training...") + logging.info("Starting Training...") st = time.time() - model.fit(train_ds, validation_data=validation_ds, epochs=FLAGS.epochs) - et = time.time() + history = model.fit(train_ds, validation_data=validation_ds, epochs=FLAGS.epochs) + wall_time = time.time() - st - print("Training Finished!") - print( - f"Training took :: {(et-st):.4f} seconds, or {((et-st)/60):.2f} minutes, or {((et-st)/3600):.2f} hours!" + logging.info("Training Finished!") + logging.info( + f"Wall Time :: {wall_time:.4f} seconds." ) + logging.info(f"Validation Accuracy :: {history.history['val_sparse_categorical_accuracy'][-1]:.4f}") + logging.info(f"examples_per_second :: {(FLAGS.epochs*FLAGS.batch_size*(len(train_ds)+len(validation_ds)))/wall_time:.4f}") if __name__ == "__main__": From 0183fd3eefd7ab997fa8dc1d37fe6d32f25654ca Mon Sep 17 00:00:00 2001 From: Susnato Dhar Date: Thu, 23 Mar 2023 10:56:17 +0530 Subject: [PATCH 10/17] nits --- examples/glue_benchmark/glue.py | 34 ++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/examples/glue_benchmark/glue.py b/examples/glue_benchmark/glue.py index 31b1fb6f84..02854d7c01 100644 --- a/examples/glue_benchmark/glue.py +++ b/examples/glue_benchmark/glue.py @@ -28,9 +28,7 @@ """ import inspect -import os import time -import warnings import tensorflow as tf import tensorflow_datasets as tfds @@ -50,7 +48,9 @@ "mrpc", "The name of the GLUE task to finetune on.", ) -flags.DEFINE_string("model", None, "The name of the classifier such as BertClassifier.") +flags.DEFINE_string( + "model", None, "The name of the classifier such as BertClassifier." 
+) flags.DEFINE_string( "preset", None, @@ -123,16 +123,16 @@ def main(_): # checking task version (erroring out other testes except "mrpc") if FLAGS.task != "mrpc": - raise ValueError(f"For now this script only supports mrpc, but received {FLAGS.task}") + raise ValueError( + f"For now this script only supports mrpc, but received {FLAGS.task}" + ) logging.info( f"\nMODEL : {FLAGS.model} | PRESET : {FLAGS.preset} | DATASET : glue/mrpc | batch_size : {FLAGS.batch_size} | epochs : {FLAGS.epochs}\n" ) # Load the model - model = load_model( - model=FLAGS.model, preset=FLAGS.preset, num_classes=2 - ) + model = load_model(model=FLAGS.model, preset=FLAGS.preset, num_classes=2) # Add loss and optimizer loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True) metrics = [keras.metrics.SparseCategoricalAccuracy()] @@ -141,16 +141,16 @@ def main(_): train_ds, test_ds, validation_ds = load_data() train_ds = train_ds.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) test_ds = test_ds.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) - validation_ds = validation_ds.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + validation_ds = validation_ds.batch(FLAGS.batch_size).prefetch( + tf.data.AUTOTUNE + ) lr = tf.keras.optimizers.schedules.PolynomialDecay( FLAGS.learning_rate, decay_steps=train_ds.cardinality() * FLAGS.epochs, end_learning_rate=0.0, ) - optimizer = tf.keras.optimizers.experimental.AdamW( - lr, weight_decay=0.01 - ) + optimizer = tf.keras.optimizers.experimental.AdamW(lr, weight_decay=0.01) optimizer.exclude_from_weight_decay( var_names=["LayerNorm", "layer_norm", "bias"] ) @@ -160,15 +160,19 @@ def main(_): logging.info("Starting Training...") st = time.time() - history = model.fit(train_ds, validation_data=validation_ds, epochs=FLAGS.epochs) + history = model.fit( + train_ds, validation_data=validation_ds, epochs=FLAGS.epochs + ) wall_time = time.time() - st logging.info("Training Finished!") + logging.info(f"Wall Time :: {wall_time:.4f} seconds.") + logging.info( + f"Validation Accuracy :: {history.history['val_sparse_categorical_accuracy'][-1]:.4f}" + ) logging.info( - f"Wall Time :: {wall_time:.4f} seconds." + f"examples_per_second :: {(FLAGS.epochs*FLAGS.batch_size*(len(train_ds)+len(validation_ds)))/wall_time:.4f}" ) - logging.info(f"Validation Accuracy :: {history.history['val_sparse_categorical_accuracy'][-1]:.4f}") - logging.info(f"examples_per_second :: {(FLAGS.epochs*FLAGS.batch_size*(len(train_ds)+len(validation_ds)))/wall_time:.4f}") if __name__ == "__main__": From 9ad5640d26bec99c44aa4735ee564bea99e7d189 Mon Sep 17 00:00:00 2001 From: Susnato Dhar Date: Tue, 28 Mar 2023 07:48:21 +0530 Subject: [PATCH 11/17] glue.py pushed to keras_nlp/benchmarks/glue.py --- examples/glue_benchmark/glue.py | 353 ++++++++++++++++++++++---------- keras_nlp/benchmarks/glue.py | 181 ++++++++++++++++ 2 files changed, 422 insertions(+), 112 deletions(-) create mode 100644 keras_nlp/benchmarks/glue.py diff --git a/examples/glue_benchmark/glue.py b/examples/glue_benchmark/glue.py index 02854d7c01..7301044cca 100644 --- a/examples/glue_benchmark/glue.py +++ b/examples/glue_benchmark/glue.py @@ -11,73 +11,97 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import csv +import os -""" GLUE benchmark script to test model performance. 
- -To run the script, use this command: -``` -python3 glue.py --model BertClassifier \ - --preset bert_base_en \ - --epochs 5 \ - --batch_size 16 \ - --learning_rate 0.001 \ - --mixed_precision_policy mixed_float16 -``` - -Disclaimer: This script only supports GLUE/mrpc (for now). -""" - -import inspect -import time - +import numpy as np import tensorflow as tf import tensorflow_datasets as tfds from absl import app from absl import flags -from absl import logging from tensorflow import keras import keras_nlp -seed = 42 -tf.random.set_seed(seed) - +FLAGS = flags.FLAGS flags.DEFINE_string( - "task", + "task_name", "mrpc", "The name of the GLUE task to finetune on.", ) -flags.DEFINE_string( - "model", None, "The name of the classifier such as BertClassifier." + +flags.DEFINE_integer( + "batch_size", + 32, + "Batch size of data.", ) -flags.DEFINE_string( - "preset", - None, - "The model preset(eg. For bert it is 'bert_base_en', 'bert_tiny_en_uncased')", + +flags.DEFINE_integer( + "epochs", + 2, + "Number of epochs to run finetuning.", ) + flags.DEFINE_float( - "learning_rate", 0.005, "The learning_rate for the optimizer." + "learning_rate", + 5e-5, + "Learning rate", ) + flags.DEFINE_string( - "mixed_precision_policy", - "mixed_float16", - "The global precision policy to use. E.g. 'mixed_float16' or 'float32'.", + "tpu_name", + None, + "The name of TPU to connect to. If None, no TPU will be used. If you only " + "have one TPU, use `local`", ) -flags.DEFINE_integer("epochs", 2, "No of Epochs.") -flags.DEFINE_integer("batch_size", 8, "Batch Size.") +flags.DEFINE_string( + "submission_directory", + None, + "The directory to save the glue submission file.", +) -FLAGS = flags.FLAGS +flags.DEFINE_string( + "load_finetuning_model", + None, + "The path to load the finetuning model. If None, the model is trained.", +) + +flags.DEFINE_string( + "save_finetuning_model", + None, + "The path to save the finetuning model. If None, the model is not saved.", +) -def load_data(): +def load_data(task_name): """ - Load GLUE/MRPC dataset, and convert the dictionary format to (features, label), + Load GLUE dataset. + Load GLUE dataset, and convert the dictionary format to (features, label), where features is a tuple of all input sentences. """ - - feature_names = ("sentence1", "sentence2") + if task_name in ("cola", "sst2"): + feature_names = ("sentence",) + elif task_name in ("mrpc", "stsb", "rte", "wnli"): + feature_names = ("sentence1", "sentence2") + elif task_name in ("mnli", "mnli_matched", "mnli_mismatched", "ax"): + feature_names = ("premise", "hypothesis") + elif task_name in "qnli": + feature_names = ("question", "sentence") + elif task_name in "qqp": + feature_names = ("question1", "question2") + else: + raise ValueError(f"Unknown task_name {task_name}.") + + test_suffix = "" + if task_name in ("mnli", "mnli_matched"): + # For "mnli", just run default to "mnli_matched". + task_name = "mnli" + test_suffix = "_matched" + elif task_name in ("mnli_mismatched"): + task_name = "mnli" + test_suffix = "_mismatched" def split_features(x): # GLUE comes with dictonary data, we convert it to a uniform format @@ -87,95 +111,200 @@ def split_features(x): label = x["label"] return (features, label) - train_ds, test_ds, validation_ds = tfds.load( - "glue/mrpc", - split=["train", "test", "validation"], - ) + if task_name == "ax": + # AX is trained and evaluated on MNLI, and has its own test split. 
+ train_ds, validation_ds = tfds.load( + "glue/mnli", + split=["train", "validation_matched"], + ) + test_ds = tfds.load( + "glue/ax", + split="test", + ) + else: + train_ds, test_ds, validation_ds = tfds.load( + f"glue/{task_name}", + split=["train", "test" + test_suffix, "validation" + test_suffix], + ) + + # Extract out the index order of test dataset. + idx_order = test_ds.map(lambda data: data["idx"]) train_ds = train_ds.map(split_features, num_parallel_calls=tf.data.AUTOTUNE) test_ds = test_ds.map(split_features, num_parallel_calls=tf.data.AUTOTUNE) validation_ds = validation_ds.map( split_features, num_parallel_calls=tf.data.AUTOTUNE ) + return train_ds, test_ds, validation_ds, idx_order - return train_ds, test_ds, validation_ds - - -def load_model(model, preset, num_classes): - for name, symbol in keras_nlp.models.__dict__.items(): - if inspect.isclass(symbol) and issubclass(symbol, keras.Model): - if model and name != model: - continue - if not hasattr(symbol, "from_preset"): - continue - for preset in symbol.presets: - if preset and preset != preset: - continue - model = symbol.from_preset(preset, num_classes=num_classes) - logging.info(f"\nUsing model {name} with preset {preset}\n") - return model - - raise ValueError(f"Model {model} or preset {preset} not found.") - -def main(_): - keras.mixed_precision.set_global_policy(FLAGS.mixed_precision_policy) - - # checking task version (erroring out other testes except "mrpc") - if FLAGS.task != "mrpc": - raise ValueError( - f"For now this script only supports mrpc, but received {FLAGS.task}" - ) - - logging.info( - f"\nMODEL : {FLAGS.model} | PRESET : {FLAGS.preset} | DATASET : glue/mrpc | batch_size : {FLAGS.batch_size} | epochs : {FLAGS.epochs}\n" +def preprocess_data(preprocess_fn, dataset): + """Run `proprocess_fn` on input dataset then batch & prefetch.""" + return ( + dataset.map(preprocess_fn) + .batch(FLAGS.batch_size) + .prefetch(tf.data.AUTOTUNE) ) - # Load the model - model = load_model(model=FLAGS.model, preset=FLAGS.preset, num_classes=2) - # Add loss and optimizer - loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metrics = [keras.metrics.SparseCategoricalAccuracy()] - - # Load datasets - train_ds, test_ds, validation_ds = load_data() - train_ds = train_ds.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) - test_ds = test_ds.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) - validation_ds = validation_ds.batch(FLAGS.batch_size).prefetch( - tf.data.AUTOTUNE - ) - lr = tf.keras.optimizers.schedules.PolynomialDecay( - FLAGS.learning_rate, - decay_steps=train_ds.cardinality() * FLAGS.epochs, - end_learning_rate=0.0, +def generate_submission_files(finetuning_model, test_ds, idx_order): + """Generate GLUE leaderboard submission files.""" + filenames = { + "cola": "CoLA.tsv", + "sst2": "SST-2.tsv", + "mrpc": "MRPC.tsv", + "qqp": "QQP.tsv", + "stsb": "STS-B.tsv", + "mnli_matched": "MNLI-m.tsv", + "mnli_mismatched": "MNLI-mm.tsv", + "qnli": "QNLI.tsv", + "rte": "RTE.tsv", + "wnli": "WNLI.tsv", + "ax": "AX.tsv", + } + + labelnames = { + "mnli_matched": ["entailment", "neutral", "contradiction"], + "mnli_mismatched": ["entailment", "neutral", "contradiction"], + "ax": ["entailment", "neutral", "contradiction"], + "qnli": ["entailment", "not_entailment"], + "rte": ["entailment", "not_entailment"], + } + if not os.path.exists(FLAGS.submission_directory): + os.makedirs(FLAGS.submission_directory) + filename = FLAGS.submission_directory + "/" + filenames[FLAGS.task_name] + labelname = 
labelnames.get(FLAGS.task_name) + + predictions = finetuning_model.predict(test_ds) + if FLAGS.task_name == "stsb": + predictions = np.squeeze(predictions) + else: + predictions = np.argmax(predictions, -1) + + # Map the predictions to the right index order. + idx_order = list(idx_order.as_numpy_iterator()) + contents = ["" for _ in idx_order] + for idx, pred in zip(idx_order, predictions): + if labelname: + pred_value = labelname[int(pred)] + else: + pred_value = pred + if FLAGS.task_name == "stsb": + pred_value = min(pred_value, 5) + pred_value = max(pred_value, 0) + pred_value = f"{pred_value:.3f}" + contents[idx] = pred_value + + with tf.io.gfile.GFile(filename, "w") as f: + # GLUE requires a format of index + tab + prediction. + writer = csv.writer(f, delimiter="\t") + # Write the required headline for GLUE. + writer.writerow(["index", "prediction"]) + + for idx, value in enumerate(contents): + writer.writerow([idx, value]) + + +def connect_to_tpu(tpu_name): + resolver = tf.distribute.cluster_resolver.TPUClusterResolver.connect( + tpu=tpu_name ) - optimizer = tf.keras.optimizers.experimental.AdamW(lr, weight_decay=0.01) - optimizer.exclude_from_weight_decay( - var_names=["LayerNorm", "layer_norm", "bias"] - ) - model.compile(optimizer=optimizer, loss=loss, metrics=metrics) + return tf.distribute.TPUStrategy(resolver) - # Start training - logging.info("Starting Training...") - st = time.time() - history = model.fit( - train_ds, validation_data=validation_ds, epochs=FLAGS.epochs +def main(_): + if FLAGS.tpu_name: + strategy = connect_to_tpu(FLAGS.tpu_name) + policy = keras.mixed_precision.Policy("mixed_bfloat16") + else: + # Use default strategy if not using TPU. + strategy = tf.distribute.get_strategy() + policy = keras.mixed_precision.Policy("mixed_float16") + keras.mixed_precision.set_global_policy(policy) + + train_ds, test_ds, val_ds, idx_order = load_data(FLAGS.task_name) + # ----- Custom code block starts ----- + bert_preprocessor = keras_nlp.models.BertPreprocessor.from_preset( + "bert_base_en_uncased" ) - wall_time = time.time() - st - logging.info("Training Finished!") - logging.info(f"Wall Time :: {wall_time:.4f} seconds.") - logging.info( - f"Validation Accuracy :: {history.history['val_sparse_categorical_accuracy'][-1]:.4f}" - ) - logging.info( - f"examples_per_second :: {(FLAGS.epochs*FLAGS.batch_size*(len(train_ds)+len(validation_ds)))/wall_time:.4f}" - ) + # Users should change this function to implement the preprocessing required + # by the model. + def preprocess_fn(feature, label): + return bert_preprocessor(feature), label + + # ----- Custom code block ends ----- + + train_ds = preprocess_data(preprocess_fn, train_ds) + val_ds = preprocess_data(preprocess_fn, val_ds) + test_ds = preprocess_data(preprocess_fn, test_ds) + + if FLAGS.load_finetuning_model: + with strategy.scope(): + finetuning_model = tf.keras.models.load_model( + FLAGS.load_finetuning_model + ) + else: + with strategy.scope(): + loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True) + metrics = [keras.metrics.SparseCategoricalAccuracy()] + if FLAGS.task_name == "stsb": + num_classes = 1 + loss = keras.losses.MeanSquaredError() + metrics = [keras.metrics.MeanSquaredError()] + elif FLAGS.task_name in ( + "mnli", + "mnli_mismatched", + "mnli_matched", + "ax", + ): + num_classes = 3 + else: + num_classes = 2 + + # ----- Custom code block starts ----- + # Users should change this `BertClassifier` to your own classifier. 
+ # Commonly the classifier is simply your model + several dense layers, + # please refer to "Make the Finetuning Model" section in README for + # detailed instructions. + bert_model = keras_nlp.models.BertBackbone.from_preset( + "bert_base_en_uncased" + ) + finetuning_model = keras_nlp.models.BertClassifier( + backbone=bert_model, + num_classes=num_classes, + ) + # ----- Custom code block ends ----- + lr = tf.keras.optimizers.schedules.PolynomialDecay( + FLAGS.learning_rate, + decay_steps=train_ds.cardinality() * FLAGS.epochs, + end_learning_rate=0.0, + ) + optimizer = tf.keras.optimizers.experimental.AdamW( + lr, weight_decay=0.01, global_clipnorm=1.0 + ) + optimizer.exclude_from_weight_decay( + var_names=["LayerNorm", "layer_norm", "bias"] + ) + finetuning_model.compile( + optimizer=optimizer, + loss=loss, + metrics=metrics, + ) + + finetuning_model.fit( + train_ds, + validation_data=val_ds, + epochs=FLAGS.epochs, + ) + with strategy.scope(): + if FLAGS.submission_directory: + generate_submission_files(finetuning_model, test_ds, idx_order) + if FLAGS.save_finetuning_model: + # Don't need to save the optimizer. + finetuning_model.optimizer = None + finetuning_model.save(FLAGS.save_finetuning_model) if __name__ == "__main__": - flags.mark_flag_as_required("model") - flags.mark_flag_as_required("preset") app.run(main) diff --git a/keras_nlp/benchmarks/glue.py b/keras_nlp/benchmarks/glue.py new file mode 100644 index 0000000000..02854d7c01 --- /dev/null +++ b/keras_nlp/benchmarks/glue.py @@ -0,0 +1,181 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" GLUE benchmark script to test model performance. + +To run the script, use this command: +``` +python3 glue.py --model BertClassifier \ + --preset bert_base_en \ + --epochs 5 \ + --batch_size 16 \ + --learning_rate 0.001 \ + --mixed_precision_policy mixed_float16 +``` + +Disclaimer: This script only supports GLUE/mrpc (for now). +""" + +import inspect +import time + +import tensorflow as tf +import tensorflow_datasets as tfds +from absl import app +from absl import flags +from absl import logging +from tensorflow import keras + +import keras_nlp + +seed = 42 +tf.random.set_seed(seed) + + +flags.DEFINE_string( + "task", + "mrpc", + "The name of the GLUE task to finetune on.", +) +flags.DEFINE_string( + "model", None, "The name of the classifier such as BertClassifier." +) +flags.DEFINE_string( + "preset", + None, + "The model preset(eg. For bert it is 'bert_base_en', 'bert_tiny_en_uncased')", +) +flags.DEFINE_float( + "learning_rate", 0.005, "The learning_rate for the optimizer." +) +flags.DEFINE_string( + "mixed_precision_policy", + "mixed_float16", + "The global precision policy to use. E.g. 
'mixed_float16' or 'float32'.", +) +flags.DEFINE_integer("epochs", 2, "No of Epochs.") +flags.DEFINE_integer("batch_size", 8, "Batch Size.") + + +FLAGS = flags.FLAGS + + +def load_data(): + """ + Load GLUE/MRPC dataset, and convert the dictionary format to (features, label), + where features is a tuple of all input sentences. + """ + + feature_names = ("sentence1", "sentence2") + + def split_features(x): + # GLUE comes with dictonary data, we convert it to a uniform format + # (features, label), where features is a tuple consisting of all + # features. + features = tuple([x[name] for name in feature_names]) + label = x["label"] + return (features, label) + + train_ds, test_ds, validation_ds = tfds.load( + "glue/mrpc", + split=["train", "test", "validation"], + ) + + train_ds = train_ds.map(split_features, num_parallel_calls=tf.data.AUTOTUNE) + test_ds = test_ds.map(split_features, num_parallel_calls=tf.data.AUTOTUNE) + validation_ds = validation_ds.map( + split_features, num_parallel_calls=tf.data.AUTOTUNE + ) + + return train_ds, test_ds, validation_ds + + +def load_model(model, preset, num_classes): + for name, symbol in keras_nlp.models.__dict__.items(): + if inspect.isclass(symbol) and issubclass(symbol, keras.Model): + if model and name != model: + continue + if not hasattr(symbol, "from_preset"): + continue + for preset in symbol.presets: + if preset and preset != preset: + continue + model = symbol.from_preset(preset, num_classes=num_classes) + logging.info(f"\nUsing model {name} with preset {preset}\n") + return model + + raise ValueError(f"Model {model} or preset {preset} not found.") + + +def main(_): + keras.mixed_precision.set_global_policy(FLAGS.mixed_precision_policy) + + # checking task version (erroring out other testes except "mrpc") + if FLAGS.task != "mrpc": + raise ValueError( + f"For now this script only supports mrpc, but received {FLAGS.task}" + ) + + logging.info( + f"\nMODEL : {FLAGS.model} | PRESET : {FLAGS.preset} | DATASET : glue/mrpc | batch_size : {FLAGS.batch_size} | epochs : {FLAGS.epochs}\n" + ) + + # Load the model + model = load_model(model=FLAGS.model, preset=FLAGS.preset, num_classes=2) + # Add loss and optimizer + loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True) + metrics = [keras.metrics.SparseCategoricalAccuracy()] + + # Load datasets + train_ds, test_ds, validation_ds = load_data() + train_ds = train_ds.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + test_ds = test_ds.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) + validation_ds = validation_ds.batch(FLAGS.batch_size).prefetch( + tf.data.AUTOTUNE + ) + + lr = tf.keras.optimizers.schedules.PolynomialDecay( + FLAGS.learning_rate, + decay_steps=train_ds.cardinality() * FLAGS.epochs, + end_learning_rate=0.0, + ) + optimizer = tf.keras.optimizers.experimental.AdamW(lr, weight_decay=0.01) + optimizer.exclude_from_weight_decay( + var_names=["LayerNorm", "layer_norm", "bias"] + ) + model.compile(optimizer=optimizer, loss=loss, metrics=metrics) + + # Start training + logging.info("Starting Training...") + + st = time.time() + history = model.fit( + train_ds, validation_data=validation_ds, epochs=FLAGS.epochs + ) + wall_time = time.time() - st + + logging.info("Training Finished!") + logging.info(f"Wall Time :: {wall_time:.4f} seconds.") + logging.info( + f"Validation Accuracy :: {history.history['val_sparse_categorical_accuracy'][-1]:.4f}" + ) + logging.info( + f"examples_per_second :: {(FLAGS.epochs*FLAGS.batch_size*(len(train_ds)+len(validation_ds)))/wall_time:.4f}" + ) + + +if 
__name__ == "__main__": + flags.mark_flag_as_required("model") + flags.mark_flag_as_required("preset") + app.run(main) From 40191abde5dcd3f74629d2afdb7038bf1acee94a Mon Sep 17 00:00:00 2001 From: Susnato Dhar Date: Tue, 28 Mar 2023 07:50:44 +0530 Subject: [PATCH 12/17] nits --- examples/glue_benchmark/glue.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/glue_benchmark/glue.py b/examples/glue_benchmark/glue.py index 7301044cca..358b61221e 100644 --- a/examples/glue_benchmark/glue.py +++ b/examples/glue_benchmark/glue.py @@ -78,6 +78,7 @@ def load_data(task_name): """ Load GLUE dataset. + Load GLUE dataset, and convert the dictionary format to (features, label), where features is a tuple of all input sentences. """ From b377f29f718d634d2b1c2df8a735aeb2d349170f Mon Sep 17 00:00:00 2001 From: chenmoneygithub Date: Tue, 28 Mar 2023 10:57:01 +0800 Subject: [PATCH 13/17] fix some style --- keras_nlp/benchmarks/glue.py | 66 +++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/keras_nlp/benchmarks/glue.py b/keras_nlp/benchmarks/glue.py index 02854d7c01..43caf9a44c 100644 --- a/keras_nlp/benchmarks/glue.py +++ b/keras_nlp/benchmarks/glue.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" GLUE benchmark script to test model performance. +"""GLUE benchmark script to test model performance. To run the script, use this command: ``` @@ -54,7 +54,7 @@ flags.DEFINE_string( "preset", None, - "The model preset(eg. For bert it is 'bert_base_en', 'bert_tiny_en_uncased')", + "The model preset, e.g., 'bert_base_en_uncased' for `BertClassifier`", ) flags.DEFINE_float( "learning_rate", 0.005, "The learning_rate for the optimizer." @@ -62,9 +62,9 @@ flags.DEFINE_string( "mixed_precision_policy", "mixed_float16", - "The global precision policy to use. E.g. 'mixed_float16' or 'float32'.", + "The global precision policy to use, e.g., 'mixed_float16' or 'float32'.", ) -flags.DEFINE_integer("epochs", 2, "No of Epochs.") +flags.DEFINE_integer("epochs", 2, "The number of epochs.") flags.DEFINE_integer("batch_size", 8, "Batch Size.") @@ -72,11 +72,11 @@ def load_data(): - """ - Load GLUE/MRPC dataset, and convert the dictionary format to (features, label), - where features is a tuple of all input sentences. - """ + """Load data. + Load GLUE/MRPC dataset, and convert the dictionary format to + (features, label), where `features` is a tuple of all input sentences. + """ feature_names = ("sentence1", "sentence2") def split_features(x): @@ -97,7 +97,6 @@ def split_features(x): validation_ds = validation_ds.map( split_features, num_parallel_calls=tf.data.AUTOTUNE ) - return train_ds, test_ds, validation_ds @@ -121,23 +120,25 @@ def load_model(model, preset, num_classes): def main(_): keras.mixed_precision.set_global_policy(FLAGS.mixed_precision_policy) - # checking task version (erroring out other testes except "mrpc") + # Check task is supported. + # TODO(chenmoneygithub): Add support for other glue tasks. if FLAGS.task != "mrpc": raise ValueError( - f"For now this script only supports mrpc, but received {FLAGS.task}" + f"For now only mrpc is supported, but received {FLAGS.task}." 
) logging.info( - f"\nMODEL : {FLAGS.model} | PRESET : {FLAGS.preset} | DATASET : glue/mrpc | batch_size : {FLAGS.batch_size} | epochs : {FLAGS.epochs}\n" + "Benchmarking configs...\n" + "=========================\n" + f"MODEL: {FLAGS.model}\n" + f"PRESET: {FLAGS.preset}\n" + f"TASK: glue/{FLAGS.task}\n" + f"BATCH_SIZE: {FLAGS.batch_size}\n" + f"EPOCHS: {FLAGS.epochs}\n" + "=========================\n" ) - # Load the model - model = load_model(model=FLAGS.model, preset=FLAGS.preset, num_classes=2) - # Add loss and optimizer - loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metrics = [keras.metrics.SparseCategoricalAccuracy()] - - # Load datasets + # Load datasets. train_ds, test_ds, validation_ds = load_data() train_ds = train_ds.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) test_ds = test_ds.batch(FLAGS.batch_size).prefetch(tf.data.AUTOTUNE) @@ -145,6 +146,12 @@ def main(_): tf.data.AUTOTUNE ) + # Load the model. + model = load_model(model=FLAGS.model, preset=FLAGS.preset, num_classes=2) + # Set loss and metrics. + loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True) + metrics = [keras.metrics.SparseCategoricalAccuracy()] + # Configure optimizer. lr = tf.keras.optimizers.schedules.PolynomialDecay( FLAGS.learning_rate, decay_steps=train_ds.cardinality() * FLAGS.epochs, @@ -156,23 +163,26 @@ def main(_): ) model.compile(optimizer=optimizer, loss=loss, metrics=metrics) - # Start training + # Start training. logging.info("Starting Training...") st = time.time() history = model.fit( - train_ds, validation_data=validation_ds, epochs=FLAGS.epochs + train_ds, + validation_data=validation_ds, + epochs=FLAGS.epochs, ) + wall_time = time.time() - st + validation_accuracy = history.history["val_sparse_categorical_accuracy"][-1] + examples_per_second = ( + FLAGS.epochs * FLAGS.batch_size * (len(train_ds) + len(validation_ds)) + ) / wall_time logging.info("Training Finished!") - logging.info(f"Wall Time :: {wall_time:.4f} seconds.") - logging.info( - f"Validation Accuracy :: {history.history['val_sparse_categorical_accuracy'][-1]:.4f}" - ) - logging.info( - f"examples_per_second :: {(FLAGS.epochs*FLAGS.batch_size*(len(train_ds)+len(validation_ds)))/wall_time:.4f}" - ) + logging.info(f"Wall Time: {wall_time:.4f} seconds.") + logging.info(f"Validation Accuracy: {validation_accuracy:.4f}") + logging.info(f"examples_per_second: {examples_per_second:.4f}") if __name__ == "__main__": From aca34d94532f742606de6f1204762e4325191746 Mon Sep 17 00:00:00 2001 From: chenmoneygithub Date: Tue, 28 Mar 2023 13:11:11 +0800 Subject: [PATCH 14/17] Rename to avoid flag conflict --- keras_nlp/benchmarks/{glue.py => glue_test.py} | 3 ++- .../{sentiment_analysis.py => sentiment_analysis_test.py} | 0 .../benchmarks/{text_generation.py => text_generation_test.py} | 0 3 files changed, 2 insertions(+), 1 deletion(-) rename keras_nlp/benchmarks/{glue.py => glue_test.py} (98%) rename keras_nlp/benchmarks/{sentiment_analysis.py => sentiment_analysis_test.py} (100%) rename keras_nlp/benchmarks/{text_generation.py => text_generation_test.py} (100%) diff --git a/keras_nlp/benchmarks/glue.py b/keras_nlp/benchmarks/glue_test.py similarity index 98% rename from keras_nlp/benchmarks/glue.py rename to keras_nlp/benchmarks/glue_test.py index 43caf9a44c..50ba796862 100644 --- a/keras_nlp/benchmarks/glue.py +++ b/keras_nlp/benchmarks/glue_test.py @@ -169,8 +169,9 @@ def main(_): st = time.time() history = model.fit( train_ds, - validation_data=validation_ds, + validation_data=validation_ds.take(2), 
epochs=FLAGS.epochs, + steps_per_epoch=2, ) wall_time = time.time() - st diff --git a/keras_nlp/benchmarks/sentiment_analysis.py b/keras_nlp/benchmarks/sentiment_analysis_test.py similarity index 100% rename from keras_nlp/benchmarks/sentiment_analysis.py rename to keras_nlp/benchmarks/sentiment_analysis_test.py diff --git a/keras_nlp/benchmarks/text_generation.py b/keras_nlp/benchmarks/text_generation_test.py similarity index 100% rename from keras_nlp/benchmarks/text_generation.py rename to keras_nlp/benchmarks/text_generation_test.py From 2507cd917e7163f9c914f09367b71c0629417c27 Mon Sep 17 00:00:00 2001 From: chenmoneygithub Date: Tue, 28 Mar 2023 13:33:12 +0800 Subject: [PATCH 15/17] try avoiding flag issue --- keras_nlp/benchmarks/{glue_test.py => glue.py} | 0 keras_nlp/benchmarks/{__init__.py => no__init__.py} | 0 .../{sentiment_analysis_test.py => sentiment_analysis.py} | 0 .../benchmarks/{text_generation_test.py => text_generation.py} | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename keras_nlp/benchmarks/{glue_test.py => glue.py} (100%) rename keras_nlp/benchmarks/{__init__.py => no__init__.py} (100%) rename keras_nlp/benchmarks/{sentiment_analysis_test.py => sentiment_analysis.py} (100%) rename keras_nlp/benchmarks/{text_generation_test.py => text_generation.py} (100%) diff --git a/keras_nlp/benchmarks/glue_test.py b/keras_nlp/benchmarks/glue.py similarity index 100% rename from keras_nlp/benchmarks/glue_test.py rename to keras_nlp/benchmarks/glue.py diff --git a/keras_nlp/benchmarks/__init__.py b/keras_nlp/benchmarks/no__init__.py similarity index 100% rename from keras_nlp/benchmarks/__init__.py rename to keras_nlp/benchmarks/no__init__.py diff --git a/keras_nlp/benchmarks/sentiment_analysis_test.py b/keras_nlp/benchmarks/sentiment_analysis.py similarity index 100% rename from keras_nlp/benchmarks/sentiment_analysis_test.py rename to keras_nlp/benchmarks/sentiment_analysis.py diff --git a/keras_nlp/benchmarks/text_generation_test.py b/keras_nlp/benchmarks/text_generation.py similarity index 100% rename from keras_nlp/benchmarks/text_generation_test.py rename to keras_nlp/benchmarks/text_generation.py From 585339e3dd5e511c1f0977bb5d1f25d27a9ccfa2 Mon Sep 17 00:00:00 2001 From: chenmoneygithub Date: Tue, 28 Mar 2023 13:50:46 +0800 Subject: [PATCH 16/17] remove __init__ since it's not a module --- keras_nlp/benchmarks/no__init__.py | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 keras_nlp/benchmarks/no__init__.py diff --git a/keras_nlp/benchmarks/no__init__.py b/keras_nlp/benchmarks/no__init__.py deleted file mode 100644 index ba0c2545e4..0000000000 --- a/keras_nlp/benchmarks/no__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2023 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
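
A note on the `load_model` helper added to benchmarks/glue.py in the patches above: the inner loop reuses the name `preset` for its loop variable, so the `preset != preset` guard can never reject a preset and the helper always builds the first preset registered on the matching class, regardless of the `--preset` flag. A minimal corrected sketch, assuming the same keras_nlp reflection approach and that `--model` names a classifier class such as `BertClassifier`, could look like this:

```
import inspect

from tensorflow import keras

import keras_nlp


def load_model(model, preset, num_classes):
    """Look up a classifier by class name and build it from a preset."""
    for name, symbol in keras_nlp.models.__dict__.items():
        if not (inspect.isclass(symbol) and issubclass(symbol, keras.Model)):
            continue
        if model and name != model:
            continue
        if not hasattr(symbol, "from_preset"):
            continue
        # Use a distinct loop variable so it does not shadow the `preset`
        # argument; the shadowed version only compares the variable to itself.
        for candidate in symbol.presets:
            if preset and candidate != preset:
                continue
            return symbol.from_preset(candidate, num_classes=num_classes)
    raise ValueError(f"Model {model} or preset {preset} not found.")
```
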
From 10f46eb8b8942f7751f039bc258b5353dbfd50df Mon Sep 17 00:00:00 2001 From: chenmoneygithub Date: Wed, 29 Mar 2023 09:02:46 +0800 Subject: [PATCH 17/17] move benchmarks out --- {keras_nlp/benchmarks => benchmarks}/README.md | 0 {keras_nlp/benchmarks => benchmarks}/glue.py | 3 +-- {keras_nlp/benchmarks => benchmarks}/sentiment_analysis.py | 0 {keras_nlp/benchmarks => benchmarks}/text_generation.py | 0 4 files changed, 1 insertion(+), 2 deletions(-) rename {keras_nlp/benchmarks => benchmarks}/README.md (100%) rename {keras_nlp/benchmarks => benchmarks}/glue.py (98%) rename {keras_nlp/benchmarks => benchmarks}/sentiment_analysis.py (100%) rename {keras_nlp/benchmarks => benchmarks}/text_generation.py (100%) diff --git a/keras_nlp/benchmarks/README.md b/benchmarks/README.md similarity index 100% rename from keras_nlp/benchmarks/README.md rename to benchmarks/README.md diff --git a/keras_nlp/benchmarks/glue.py b/benchmarks/glue.py similarity index 98% rename from keras_nlp/benchmarks/glue.py rename to benchmarks/glue.py index 50ba796862..43caf9a44c 100644 --- a/keras_nlp/benchmarks/glue.py +++ b/benchmarks/glue.py @@ -169,9 +169,8 @@ def main(_): st = time.time() history = model.fit( train_ds, - validation_data=validation_ds.take(2), + validation_data=validation_ds, epochs=FLAGS.epochs, - steps_per_epoch=2, ) wall_time = time.time() - st diff --git a/keras_nlp/benchmarks/sentiment_analysis.py b/benchmarks/sentiment_analysis.py similarity index 100% rename from keras_nlp/benchmarks/sentiment_analysis.py rename to benchmarks/sentiment_analysis.py diff --git a/keras_nlp/benchmarks/text_generation.py b/benchmarks/text_generation.py similarity index 100% rename from keras_nlp/benchmarks/text_generation.py rename to benchmarks/text_generation.py
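
For reference, the two "Custom code block" sections in examples/glue_benchmark/glue.py are the only parts a user is expected to edit when benchmarking a different architecture; the dataset loading, submission-file generation, TPU strategy, and optimizer setup stay unchanged. The sketch below is illustrative only: it mirrors the BERT blocks from the diff above and assumes the RoBERTa preset name "roberta_base_en" is available in the installed keras_nlp release.

```
import keras_nlp

# ----- Custom code block starts -----
# Preprocessor: turns the (sentence1, sentence2) feature tuples produced by
# load_data() into token ids. "roberta_base_en" is an assumed preset name;
# check keras_nlp.models.RobertaPreprocessor.presets for available names.
roberta_preprocessor = keras_nlp.models.RobertaPreprocessor.from_preset(
    "roberta_base_en"
)


def preprocess_fn(feature, label):
    return roberta_preprocessor(feature), label


# ----- Custom code block ends -----

# ----- Custom code block starts -----
# Finetuning model: a pretrained backbone plus a classification head.
roberta_backbone = keras_nlp.models.RobertaBackbone.from_preset(
    "roberta_base_en"
)
finetuning_model = keras_nlp.models.RobertaClassifier(
    backbone=roberta_backbone,
    num_classes=2,  # 2 for mrpc; 3 for mnli/ax; 1 (with MSE loss) for stsb
)
# ----- Custom code block ends -----
```
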