Support ONNX runtime profiling.

psyhtest · psyhtest · commit 81ed5cc713b8 · 2020-05-14T17:15:09.000+01:00
diff --git a/v0.7/language/bert/.gitignore b/v0.7/language/bert/.gitignore
@@ -1,2 +1,3 @@
 build/
 eval_features.pickle
+onnxruntime_profile__*.json
diff --git a/v0.7/language/bert/Makefile b/v0.7/language/bert/Makefile
@@ -169,3 +169,4 @@ evaluate:
 clean:
 	@rm -rf ${BUILD_DIR}
 	@rm -f  ${FEATURE_CACHE}
+	@rm -f  onnxruntime_profile__*.json
diff --git a/v0.7/language/bert/onnxruntime_SUT.py b/v0.7/language/bert/onnxruntime_SUT.py
@@ -27,34 +27,39 @@
 from squad_QSL import get_squad_QSL
 
 class BERT_ONNXRuntime_SUT():
-    def __init__(self, quantized):
+    def __init__(self, args):
+        self.profile = args.profile
+        self.options = onnxruntime.SessionOptions()
+        self.options.enable_profiling = args.profile
+
         print("Loading ONNX model...")
-        self.quantized = quantized
-        if not quantized:
-            model_path = "build/data/bert_tf_v1_1_large_fp32_384_v2/model.onnx"
-        else:
+        self.quantized = args.quantized
+        if self.quantized:
             model_path = "build/data/bert_tf_v1_1_large_fp32_384_v2/bert_large_v1_1_fake_quant.onnx"
-        self.sess = onnxruntime.InferenceSession(model_path)
+        else:
+            model_path = "build/data/bert_tf_v1_1_large_fp32_384_v2/model.onnx"
+        self.sess = onnxruntime.InferenceSession(model_path, self.options)
 
         print("Constructing SUT...")
         self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries, self.process_latencies)
-        self.qsl = get_squad_QSL()
         print("Finished constructing SUT.")
 
+        self.qsl = get_squad_QSL()
+
     def issue_queries(self, query_samples):
         for i in range(len(query_samples)):
             eval_features = self.qsl.get_features(query_samples[i].index)
-            if not self.quantized:
+            if self.quantized:
                 fd = {
                     "input_ids": np.array(eval_features.input_ids).astype(np.int64)[np.newaxis, :],
-                    "input_mask": np.array(eval_features.input_mask).astype(np.int64)[np.newaxis, :],
-                    "segment_ids": np.array(eval_features.segment_ids).astype(np.int64)[np.newaxis, :]
+                    "attention_mask": np.array(eval_features.input_mask).astype(np.int64)[np.newaxis, :],
+                    "token_type_ids": np.array(eval_features.segment_ids).astype(np.int64)[np.newaxis, :]
                 }
             else:
                 fd = {
                     "input_ids": np.array(eval_features.input_ids).astype(np.int64)[np.newaxis, :],
-                    "attention_mask": np.array(eval_features.input_mask).astype(np.int64)[np.newaxis, :],
-                    "token_type_ids": np.array(eval_features.segment_ids).astype(np.int64)[np.newaxis, :]
+                    "input_mask": np.array(eval_features.input_mask).astype(np.int64)[np.newaxis, :],
+                    "segment_ids": np.array(eval_features.segment_ids).astype(np.int64)[np.newaxis, :]
                 }
             scores = self.sess.run([o.name for o in self.sess.get_outputs()], fd)
             output = np.stack(scores, axis=-1)[0]
@@ -71,8 +76,9 @@ def process_latencies(self, latencies_ns):
         pass
 
     def __del__(self):
-        lg.DestroySUT(self.sut)
+        if self.profile:
+            print("ONNX runtime profile dumped to: '{}'".format(self.sess.end_profiling()))
         print("Finished destroying SUT.")
 
-def get_onnxruntime_sut(quantized=False):
-    return BERT_ONNXRuntime_SUT(quantized)
+def get_onnxruntime_sut(args):
+    return BERT_ONNXRuntime_SUT(args)
diff --git a/v0.7/language/bert/pytorch_SUT.py b/v0.7/language/bert/pytorch_SUT.py
@@ -53,9 +53,10 @@ def __init__(self):
 
         print("Constructing SUT...")
         self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries, self.process_latencies)
-        self.qsl = get_squad_QSL()
         print("Finished constructing SUT.")
 
+        self.qsl = get_squad_QSL()
+
     def issue_queries(self, query_samples):
         with torch.no_grad():
             for i in range(len(query_samples)):
@@ -77,7 +78,6 @@ def process_latencies(self, latencies_ns):
         pass
 
     def __del__(self):
-        lg.DestroySUT(self.sut)
         print("Finished destroying SUT.")
 
 def get_pytorch_sut():
diff --git a/v0.7/language/bert/run.py b/v0.7/language/bert/run.py
@@ -22,12 +22,15 @@
 import mlperf_loadgen as lg
 import subprocess
 
+from squad_QSL import get_squad_QSL
+
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--backend", choices=["tf","pytorch","onnxruntime"], default="tf", help="Backend")
     parser.add_argument("--scenario", choices=["SingleStream", "Offline", "Server", "MultiStream"], default="Offline", help="Scenario")
     parser.add_argument("--accuracy", action="store_true", help="enable accuracy pass")
     parser.add_argument("--quantized", action="store_true", help="use quantized model (only valid for onnxruntime backend)")
+    parser.add_argument("--profile", action="store_true", help="enable profiling (only valid for onnxruntime backend)")
     parser.add_argument("--mlperf_conf", default="build/mlperf.conf", help="mlperf rules config")
     parser.add_argument("--user_conf", default="user.conf", help="mlperf rules config")
     args = parser.parse_args()
@@ -45,15 +48,17 @@ def main():
 
     if args.backend == "pytorch":
         assert not args.quantized, "Quantized model is only supported by onnxruntime backend!"
+        assert not args.profile, "Profiling is only supported by onnxruntime backend!"
         from pytorch_SUT import get_pytorch_sut
         sut = get_pytorch_sut()
     elif args.backend == "tf":
         assert not args.quantized, "Quantized model is only supported by onnxruntime backend!"
+        assert not args.profile, "Profiling is only supported by onnxruntime backend!"
         from tf_SUT import get_tf_sut
         sut = get_tf_sut()
     elif args.backend == "onnxruntime":
         from onnxruntime_SUT import get_onnxruntime_sut
-        sut = get_onnxruntime_sut(args.quantized)
+        sut = get_onnxruntime_sut(args)
     else:
         raise ValueError("Unknown backend: {:}".format(args.backend))
 
@@ -76,7 +81,7 @@ def main():
     log_settings = lg.LogSettings()
     log_settings.log_output = log_output_settings
 
-    print("Running Loadgen test...")
+    print("Running LoadGen test...")
     lg.StartTestWithLogSettings(sut.sut, sut.qsl.qsl, settings, log_settings)
 
     if args.accuracy:
@@ -85,5 +90,11 @@ def main():
 
     print("Done!")
 
+    print("Destroying SUT...")
+    lg.DestroySUT(sut.sut)
+
+    print("Destroying QSL...")
+    lg.DestroyQSL(sut.qsl.qsl)
+
 if __name__ == "__main__":
     main()
diff --git a/v0.7/language/bert/squad_QSL.py b/v0.7/language/bert/squad_QSL.py
@@ -84,7 +84,6 @@ def get_features(self, sample_id):
         return self.eval_features[sample_id]
 
     def __del__(self):
-        lg.DestroyQSL(self.qsl)
         print("Finished destroying QSL.")
 
 def get_squad_QSL():
diff --git a/v0.7/language/bert/tf_SUT.py b/v0.7/language/bert/tf_SUT.py
@@ -46,9 +46,10 @@ def __init__(self, batch_size=8):
 
         print("Constructing SUT...")
         self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries, self.process_latencies)
-        self.qsl = get_squad_QSL()
         print("Finished constructing SUT.")
 
+        self.qsl = get_squad_QSL()
+
     def issue_queries(self, query_samples):
         input_ids = np.zeros((len(query_samples), 1, 384), dtype=np.int32)
         input_mask = np.zeros((len(query_samples), 1, 384), dtype=np.int32)
@@ -81,7 +82,6 @@ def process_latencies(self, latencies_ns):
         pass
 
     def __del__(self):
-        lg.DestroySUT(self.sut)
         print("Finished destroying SUT.")
 
     def create_model(self, bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings):

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1,2 +1,3 @@` |
| `1` | `1` | `build/` |
| `2` | `2` | `eval_features.pickle` |
|  | `3` | `+onnxruntime_profile__*.json` |