Commit 2cfbd09 (1 parent: 0535fd1)

[BYOC][ETHOSN] Fix tests for new module API (apache#6560)

* [BYOC][ETHOSN] Fix tests for new module API

  Some of the downstream variants of our tests had been broken by a recent
  change to the API of build. This both fixes that and refactors a couple of
  tests so that they will run entirely in upstream CI and we won't see this
  sort of failure again.

  Change-Id: I841266eef0e2e89cc76e0526fc6cd3fc8d1326d8

* Only run mobilenet

  Change-Id: Ie41c6d2c13c4473ecaa5c50c33d2c1589c742796

* Improve docs

  Change-Id: I2c8bde44278e4cbc9cea5c5cbd4bb3c316ec37ae

* More docs

  Change-Id: Ia9973915eecea647689535cc1e6eef9228111324

File tree: 4 files changed (107 additions, 37 deletions)


src/runtime/contrib/ethosn/ethosn_device.cc (1 addition, 2 deletions)

@@ -174,7 +174,6 @@ bool Inference(tvm::runtime::TVMArgs args, sl::CompiledNetwork* network,
  * it's called.
  */
 
-#include <tvm/runtime/ndarray.h>
 #include <tvm/runtime/registry.h>
 
 namespace tvm {
@@ -188,7 +187,7 @@ std::vector<tvm::runtime::NDArray> test_outputs;
 TVM_REGISTER_GLOBAL("relay.ethos-n.test.infra.inference_result")
     .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) {
       test_outputs.clear();
-      for (int argc = 1; argc < args.size(); argc++) {
+      for (int argc = 0; argc < args.size(); argc++) {
         const DLTensor* tensor = args[argc];
         auto shape = std::vector<int64_t>(tensor->shape, tensor->shape + tensor->ndim);
         test_outputs.emplace_back(tvm::runtime::NDArray::Empty(shape, tensor->dtype, tensor->ctx));
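For reference, a hedged Python-side sketch of how this registered packed function is looked up; the lookup call mirrors the one in infrastructure.py below, while the commented invocation and the output_ndarrays name are illustrative only:

import tvm

# Look up the C++ global registered above. Passing True (allow_missing)
# returns None when the Ethos-N test infrastructure is not compiled in,
# which is how the Python helpers guard the call.
func = tvm.get_global_func("relay.ethos-n.test.infra.inference_result", True)
if func is not None:
    # After this change every positional argument is treated as an output
    # tensor; previously argument 0 carried a checksum and was skipped.
    # func(*output_ndarrays)  # illustrative call; output_ndarrays is hypothetical
    pass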

tests/python/contrib/test_ethosn/infrastructure.py (66 additions, 6 deletions)

@@ -43,11 +43,34 @@ def get_real_image(im_height, im_width):
 
 
 def assert_lib_hash(lib, golden):
+    """Check that the Ethos-N runtime modules in a library hash to the same values
+    as given by the golden hash(es).
+
+    If there's only one Ethos-N module, the golden hash may be provided as a str.
+    If there's multiple, a set of golden hashes should be provided to correspond
+    with each Ethos-N module that is expected.
+
+    This function is used to ensure that no change is made which alters the output
+    of a compilation. If such a change is made deliberately (eg. to fix a bug) then
+    the golden hash should be updated after verifying on hardware that the behaviour
+    is still correct.
+
+    This method is used because of the lack of hardware availability in upstream CI.
+    """
+    # Convert str hash into a set of hashes
+    if isinstance(golden, str):
+        golden = {golden}
+
     temp = util.tempdir()
     path = temp.relpath("lib.cmm")
-    lib.imported_modules[1].save(path)
-    lib_hash = md5(open(path, "rb").read()).hexdigest()
-    assert lib_hash == golden, "Expected hash: {} Got hash: {}".format(golden, lib_hash)
+    hash_set = set()
+    for mod in lib.imported_modules:
+        if mod.type_key == "ethos-n":
+            mod.save(path)
+            lib_hash = md5(open(path, "rb").read()).hexdigest()
+            hash_set.add(lib_hash)
+
+    assert hash_set == golden, "Expected hash: {} Got hash: {}".format(golden, hash_set)
 
 
 def make_module(func, params):
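As a minimal, self-contained illustration of the comparison logic introduced here, with in-memory byte strings standing in for the saved Ethos-N command-stream files and a purely illustrative golden value:

from hashlib import md5

def hashes_of(blobs):
    # Hash each stand-in "module" the way assert_lib_hash hashes each saved
    # Ethos-N module, collecting the digests into a set.
    return {md5(blob).hexdigest() for blob in blobs}

golden = "900150983cd24fb0d6963f7d28e17f72"  # md5(b"abc"), illustrative only
if isinstance(golden, str):  # a single str golden is promoted to a set
    golden = {golden}

assert hashes_of([b"abc"]) == golden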
@@ -102,6 +125,21 @@ def visit_call(self, call):
 
 
 def build(mod, params, npu=True, expected_host_ops=0, npu_partitions=1):
+    """Build a network with or without Ethos-N offloading.
+
+    Parameters
+    ----------
+    mod : IRModule
+        The Relay module to build.
+    params : dict of str to NDArray
+        The weights to build with.
+    npu : bool, optional
+        Whether to build with Ethos-N offloading.
+    expected_host_ops : int, optional
+        The number of ops expected to remain on the host.
+    npu_partitions : int, optional
+        The number of Ethos-N partitions expected.
+    """
     relay.backend.compile_engine.get().clear()
     with tvm.transform.PassContext(
         opt_level=3, config={"relay.ext.ethos-n.options": {"variant": 0}}
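A hedged usage sketch of this helper, mirroring the calls the network tests make later in this change; mod and params are assumed to already exist (e.g. from tei.make_module or a TFLite import) and the hash value is illustrative:

# mod (IRModule) and params (dict of str to NDArray) are assumed to exist.
m = tei.build(mod, params, npu=True, expected_host_ops=3, npu_partitions=1)
# The returned module exposes the compiled library for golden-hash checking.
tei.assert_lib_hash(m.get_lib(), "81637c89339201a07dc96e3b5dbf836a")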
@@ -133,6 +171,28 @@ def build(mod, params, npu=True, expected_host_ops=0, npu_partitions=1):
 
 
 def run(lib, inputs, outputs, npu=True):
+    """Run a module with specified inputs.
+
+    Parameters
+    ----------
+    lib : runtime.Module
+        The runtime module.
+    inputs : dict of str to NDArray
+        The input dictionary.
+    outputs : int
+        The expected number of outputs.
+    npu : bool
+        Whether or not any part of the lib is offloaded to Ethos-N.
+        If it's false (i.e. it's all running on the CPU), we set
+        the mocked result equal to the output so that a subsequent
+        mocked run on the NPU returns the same value.
+
+    Returns
+    -------
+    out : list of NDArray
+        The results.
+
+    """
     # Export and load lib to confirm this works
     lib_name = "mod.so"
     temp = util.tempdir()
@@ -144,7 +204,7 @@ def run(lib, inputs, outputs, npu=True):
     module.run()
     out = [module.get_output(i) for i in range(outputs)]
     if not npu:
-        inference_result(0, out)
+        inference_result(out)
     return out
 
 
@@ -171,12 +231,12 @@ def verify(answers, atol, rtol=1e-07, verify_saturation=True):
         tvm.testing.assert_allclose(outs[0].asnumpy(), outs[1].asnumpy(), rtol=rtol, atol=atol)
 
 
-def inference_result(checksum, outputs):
+def inference_result(outputs):
     """Set the expected results of an Ethos inference, if the testing
     infrastructure is available. This assumes that the entire graph
     was offloaded to the neural processor."""
    if tvm.get_global_func("relay.ethos-n.test.infra.inference_result", True):
-        return _infrastructure.inference_result(checksum, *outputs)
+        return _infrastructure.inference_result(*outputs)
     return False
 
 
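The run/inference_result pair implements the mocking flow described in run()'s docstring: a CPU run records its outputs as the expected inference result, so a later mocked NPU run reproduces them and the runtime flow can be exercised without hardware. A standalone sketch of that pattern, using hypothetical stand-ins rather than the real TVM runtime:

_mocked_outputs = []  # stands in for the test_outputs buffer kept by the C++ side

def inference_result(outputs):
    # Record the expected results of a (mocked) Ethos-N inference.
    _mocked_outputs[:] = list(outputs)
    return True

def run(compute, npu):
    if not npu:
        out = compute()           # genuine CPU execution
        inference_result(out)     # stash the result as the mocked NPU answer
        return out
    return list(_mocked_outputs)  # a mocked NPU run replays the stashed result

cpu_out = run(lambda: [1, 2, 3], npu=False)
npu_out = run(lambda: None, npu=True)
assert cpu_out == npu_out  # the flow is testable without hardware, not accurate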

tests/python/contrib/test_ethosn/test_networks.py (34 additions, 20 deletions)

@@ -23,7 +23,7 @@
 pytest.importorskip("tensorflow")
 
 from tvm import relay
-from tvm.relay.op.contrib.ethosn import ethosn_available, Available
+from tvm.relay.op.contrib.ethosn import ethosn_available
 from tvm.contrib import download
 import tvm.relay.testing.tf as tf_testing
 import tflite.Model
@@ -58,10 +58,36 @@ def _test_image_network(
     input_dict,
     compile_hash,
     output_count,
-    run=True,
     host_ops=0,
     npu_partitions=1,
+    run=False,
 ):
+    """Test an image network.
+
+    Parameters
+    ----------
+    model_url : str
+        The URL to the model.
+    model_sub_path : str
+        The name of the model file.
+    input_dict : dict
+        The input dict.
+    compile_hash : str, set
+        The compile hash(es) to check the compilation output against.
+    output_count : int
+        The expected number of outputs.
+    host_ops : int
+        The expected number of host operators.
+    npu_partitions : int
+        The expected number of Ethos-N partitions.
+    run : bool
+        Whether or not to try running the network. If hardware isn't
+        available, the run will still take place but with a mocked
+        inference function, so the results will be incorrect. This is
+        therefore just to test the runtime flow is working rather than
+        to check the correctness/accuracy.
+
+    """
     if not ethosn_available():
         return
 
@@ -78,24 +104,16 @@ def get_model():
         )
         return _get_tflite_model(model_path, input_dict, "uint8")
 
-    outputs = []
     inputs = {}
     for input_name in input_dict:
         input_shape = input_dict[input_name]
         inputs[input_name] = tei.get_real_image(input_shape[1], input_shape[2])
 
-    for npu in [False, True]:
-        mod, params = get_model()
-        graph, lib, params = tei.build(
-            mod, params, npu=npu, expected_host_ops=host_ops, npu_partitions=npu_partitions
-        )
-        if npu:
-            tei.assert_lib_hash(lib, compile_hash)
-        if run:
-            outputs.append(tei.run(graph, lib, params, inputs, output_count, npu=npu))
-
+    mod, params = get_model()
+    m = tei.build(mod, params, npu=True, expected_host_ops=host_ops, npu_partitions=npu_partitions)
+    tei.assert_lib_hash(m.get_lib(), compile_hash)
     if run:
-        tei.verify(outputs, 1, verify_saturation=False)
+        tei.run(m, inputs, output_count, npu=True)
 
 
 def test_mobilenet_v1():
@@ -104,17 +122,16 @@ def test_mobilenet_v1():
     # codegen, which could come about from either a change in Support Library
     # version or a change in the Ethos-N codegen. To update this requires running
     # on hardware that isn't available in CI.
-    hw = ethosn_available()
     _test_image_network(
         model_url="https://storage.googleapis.com/download.tensorflow.org/"
         "models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz",
         model_sub_path="mobilenet_v1_1.0_224_quant.tflite",
         input_dict={"input": (1, 224, 224, 3)},
         compile_hash="81637c89339201a07dc96e3b5dbf836a",
         output_count=1,
-        run=(hw == Available.SW_AND_HW),
         host_ops=3,
         npu_partitions=1,
+        run=True,
     )
 
 
@@ -131,7 +148,6 @@ def test_inception_v3():
         input_dict={"input": (1, 299, 299, 3)},
         compile_hash="de0e175af610ebd45ccb03d170dc9664",
         output_count=1,
-        run=False,
         host_ops=0,
         npu_partitions=1,
     )
@@ -150,7 +166,6 @@ def test_inception_v4():
         input_dict={"input": (1, 299, 299, 3)},
         compile_hash="06bf6cb56344f3904bcb108e54edfe87",
         output_count=1,
-        run=False,
         host_ops=3,
         npu_partitions=1,
     )
@@ -167,9 +182,8 @@ def test_ssd_mobilenet_v1():
         "models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip",
         model_sub_path="detect.tflite",
         input_dict={"normalized_input_image_tensor": (1, 300, 300, 3)},
-        compile_hash="6211d96103880b016baa85e638abddef",
+        compile_hash={"29aec6b184b09454b4323271aadf89b1", "6211d96103880b016baa85e638abddef"},
         output_count=4,
-        run=False,
         host_ops=28,
         npu_partitions=2,
     )

tests/python/contrib/test_ethosn/test_topologies.py (6 additions, 9 deletions)

@@ -80,7 +80,7 @@ def test_multiple_command_streams():
     simple graph which creates two Ethos-N partitions and checks the result
     against an 'all-CPU' run through TVM.
     """
-    if ethosn_available() != Available.SW_AND_HW:
+    if not ethosn_available():
         return
 
     def get_model():
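The new guard relies on ethosn_available() returning a value that is falsy only when Ethos-N support is entirely absent. A hedged sketch of that truthiness pattern follows; SW_AND_HW appears in the removed line above, but the other member names, the values, and the __bool__ implementation are assumptions rather than something taken from this diff:

from enum import Enum

class Available(Enum):
    # Illustrative availability states; UNAVAILABLE is falsy so that
    # "if not ethosn_available():" works as a simple guard.
    UNAVAILABLE = 0
    SW_ONLY = 1
    SW_AND_HW = 2

    def __bool__(self):
        return self != Available.UNAVAILABLE

assert not Available.UNAVAILABLE
assert Available.SW_ONLY and Available.SW_AND_HW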
@@ -100,14 +100,11 @@ def get_model():
     np.random.seed(0)
     outputs = []
     inputs = {"x": tvm.nd.array(np.random.randint(0, high=256, size=(1, 4, 4, 4), dtype="uint8"))}
-    for npu in [False, True]:
-        model = get_model()
-        mod = tei.make_module(model, {})
-        outputs.append(
-            tei.build_and_run(mod, inputs, 1, {}, npu=npu, expected_host_ops=1, npu_partitions=2)
-        )
-
-    tei.verify(outputs, 0)
+    model = get_model()
+    mod = tei.make_module(model, {})
+    outputs.append(
+        tei.build_and_run(mod, inputs, 1, {}, npu=True, expected_host_ops=1, npu_partitions=2)
+    )
 
 
 def test_output_order():
