Commit 636ca6b

andyly authored and tensorflower-gardener committed
Update TPU sharding identification pass to support checking for sharding attributes from tf.TPUPartitionedInput/tf.TPUPartitionedOutput ops.

When XLA SPMD is enabled, these ops are generated and hold pre-partitioned inputs/outputs. The computation inputs and outputs should take on these shardings; otherwise, sharding should be set to replicate sharding.

PiperOrigin-RevId: 348698987
Change-Id: If075108d6753d09018509862572be87247fdce95
1 parent 75d14ad commit 636ca6b
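
For illustration, the pattern this change teaches the pass to recognize looks roughly like the MLIR below. This is a minimal sketch adapted from the new test cases in this commit; the function names @example and @computation are placeholders, not part of the change. A tf.TPUPartitionedInput op carrying an _XlaSharding attribute feeds a tf_device.cluster_func marked with use_spmd_for_xla_partitioning = true, and the pass is expected to copy that sharding into input_sharding_configuration while assigning replicate sharding ("") to the remaining input and to the output.

// Minimal sketch, adapted from the new tests in this commit.
// @example and @computation are placeholder names; the IR is illustrative,
// not taken verbatim from the diff.
func @example(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> tensor<*xi32> {
  // Pre-partitioned input carrying an explicit sharding.
  %0 = "tf.TPUPartitionedInput"(%arg0) {_XlaSharding = "\01\02\03", partition_dim = -1 : i64} : (tensor<*xi32>) -> tensor<*xi32>
  // Expected after the pass: input_sharding_configuration = ["\01\02\03", ""]
  // and output_sharding_configuration = [""] attached to this op.
  %1 = "tf_device.cluster_func"(%0, %arg1) {func = @computation, use_spmd_for_xla_partitioning = true} : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32>
  return %1 : tensor<*xi32>
}

func @computation(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> tensor<*xi32> {
  return %arg0 : tensor<*xi32>
}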

File tree: 2 files changed (+217 lines, -58 lines)


tensorflow/compiler/mlir/tensorflow/tests/tpu_sharding_identification.mlir

Lines changed: 72 additions & 4 deletions
@@ -21,7 +21,7 @@ func @empty_func() {
 // gets default maximal(0) sharding configuration.
 // CHECK-LABEL: func @check_default_sharding_for_block_arg_inputs_outputs
 func @check_default_sharding_for_block_arg_inputs_outputs(%arg0: tensor<*xi32>) {
-  "tf_device.cluster_func"(%arg0) {func = @func_without_sharding, step_marker_location = ""} : (tensor<*xi32>) -> ()
+  "tf_device.cluster_func"(%arg0) {func = @func_without_sharding, step_marker_location = ""} : (tensor<*xi32>) -> tensor<*xi32>
   // CHECK: input_sharding_configuration
   // CHECK-SAME: ["\08\01\1A\01\01\22\01\00"]
   // CHECK: output_sharding_configuration
@@ -42,7 +42,7 @@ func @func_without_sharding(%arg0: tensor<*xi32>) -> tensor<*xi32> {
 // default maximal(0) sharding configuration.
 // CHECK-LABEL: func @check_default_sharding_for_inputs_outputs
 func @check_default_sharding_for_inputs_outputs(%arg0: tensor<*xi32>) {
-  "tf_device.cluster_func"(%arg0) {func = @func_without_sharding, step_marker_location = ""} : (tensor<*xi32>) -> ()
+  "tf_device.cluster_func"(%arg0) {func = @func_without_sharding, step_marker_location = ""} : (tensor<*xi32>) -> tensor<*xi32>
   // CHECK: input_sharding_configuration
   // CHECK-SAME: ["\08\01\1A\01\01\22\01\00"]
   // CHECK: output_sharding_configuration
@@ -63,7 +63,7 @@ func @func_without_sharding(%arg0: tensor<*xi32>) -> tensor<*xi32> {
 // Tests with a input arg connected to XlaSharding op.
 // CHECK-LABEL: func @check_sharding_for_input_correctly_identified
 func @check_sharding_for_input_correctly_identified(%arg0: tensor<*xi32>) {
-  "tf_device.cluster_func"(%arg0) {func = @inputs_with_sharding_func, step_marker_location = ""} : (tensor<*xi32>) -> ()
+  "tf_device.cluster_func"(%arg0) {func = @inputs_with_sharding_func, step_marker_location = ""} : (tensor<*xi32>) -> tensor<*xi32>
   // CHECK: input_sharding_configuration
   // CHECK-SAME: ["\01\02\03"]
   // CHECK: output_sharding_configuration
@@ -90,7 +90,7 @@ func @check_sharding_for_multiple_inputs_outputs(%arg0: tensor<*xi32>, %arg1: te
   // CHECK-SAME: ["\01\02\03", "\04\05\06"]
   // CHECK: output_sharding_configuration
   // CHECK-SAME: ["\0A\0B\0C", "\0D\0E\0F"]
-  return
+  return
 }
 
 // CHECK-LABEL: func @func_with_sharding
@@ -252,3 +252,71 @@ func @func_body(%arg0: tensor<*xi32>)-> tensor<*xi32> {
   %1 = "tf.Identity"(%0) : (tensor<*xi32>) -> (tensor<*xi32>)
   return %1 : tensor<*xi32>
 }
+
+// -----
+
+// Tests partitioned data inputs/outputs are set correctly (via XLA SPMD) is
+// enabled. Non replicated inputs/outputs should have shardings set to be
+// replicate sharding ("").
+
+// CHECK-LABEL: func @partitioned_input_output
+func @partitioned_input_output(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi32>) {
+  %0 = "tf.TPUPartitionedInput"(%arg0) {_XlaSharding = "\01\02\03", partition_dim = -1 : i64} : (tensor<*xi32>) -> tensor<*xi32>
+  // CHECK: tf_device.cluster_func
+  // CHECK-SAME: input_sharding_configuration = ["\01\02\03", ""]
+  // CHECK-SAME: output_sharding_configuration = ["", "\04\05\06"]
+  %1:2 = "tf_device.cluster_func"(%0, %arg1) {func = @cluster_func, use_spmd_for_xla_partitioning = true} : (tensor<*xi32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi32>)
+  %2 = "tf.TPUPartitionedOutput"(%1#1) {_XlaSharding = "\04\05\06", partition_dim = -1 : i64} : (tensor<*xi32>) -> tensor<*xi32>
+  return %1#0, %2 : tensor<*xi32>, tensor<*xi32>
+}
+
+// CHECK-LABEL: func @cluster_func
+// CHECK-SAME: ({{.+}}: tensor<*xi32> {mhlo.sharding = "\01\02\03"}, {{.+}}: tensor<*xi32> {mhlo.sharding = ""})
+// CHECK-SAME: -> (tensor<*xi32> {mhlo.sharding = ""}, tensor<*xi32> {mhlo.sharding = "\04\05\06"})
+func @cluster_func(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi32>) {
+  return %arg0, %arg1 : tensor<*xi32>, tensor<*xi32>
+}
+
+// -----
+
+// Tests partitioned variables (via XLA SPMD) propagates shardings correctly.
+
+// CHECK-LABEL: func @partitioned_variable
+func @partitioned_variable(%arg0: tensor<!tf.resource<tensor<*xf32>>>) {
+  %0 = "tf.TPUPartitionedInput"(%arg0) {_XlaSharding = "\01\02\03", partition_dim = -1 : i64} : (tensor<!tf.resource<tensor<*xf32>>>) -> tensor<!tf.resource<tensor<*xf32>>>
+  %1 = "tf.ReadVariableOp"(%0) : (tensor<!tf.resource<tensor<*xf32>>>) -> tensor<*xf32>
+  // CHECK: tf_device.cluster_func
+  // CHECK-SAME: input_sharding_configuration = ["\01\02\03"]
+  // CHECK-SAME: output_sharding_configuration = []
+  "tf_device.cluster_func"(%1) {func = @cluster_func, use_spmd_for_xla_partitioning = true} : (tensor<*xf32>) -> ()
+  return
+}
+
+// CHECK-LABEL: func @cluster_func
+// CHECK-SAME: ({{.+}}: tensor<*xf32> {mhlo.sharding = "\01\02\03"})
+func @cluster_func(%arg0: tensor<*xf32>) {
+  return
+}
+
+// -----
+
+// Tests partitioned inputs/outputs with no sharding (via XLA SPMD) defaults to
+// replicate sharding ("").
+
+// CHECK-LABEL: func @partitioned_input_output
+func @partitioned_input_output(%arg0: tensor<*xi32>) -> tensor<*xi32> {
+  %0 = "tf.TPUPartitionedInput"(%arg0) {partition_dim = -1 : i64} : (tensor<*xi32>) -> tensor<*xi32>
+  // CHECK: tf_device.cluster_func
+  // CHECK-SAME: input_sharding_configuration = [""]
+  // CHECK-SAME: output_sharding_configuration = [""]
+  %1 = "tf_device.cluster_func"(%0) {func = @cluster_func, use_spmd_for_xla_partitioning = true} : (tensor<*xi32>) -> tensor<*xi32>
+  %2 = "tf.TPUPartitionedOutput"(%1) {partition_dim = -1 : i64} : (tensor<*xi32>) -> tensor<*xi32>
+  return %2 : tensor<*xi32>
+}
+
+// CHECK-LABEL: func @cluster_func
+// CHECK-SAME: ({{.+}}: tensor<*xi32> {mhlo.sharding = ""})
+// CHECK-SAME: -> (tensor<*xi32> {mhlo.sharding = ""})
+func @cluster_func(%arg0: tensor<*xi32>) -> tensor<*xi32> {
+  return %arg0 : tensor<*xi32>
+}
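
Note that the CHECK-LABEL/CHECK-SAME directives in these tests are exercised by lit and FileCheck through a RUN line at the top of the test file, which this hunk does not show. For a pass test like this it would typically look something like the line below; the exact tool flags are an assumption based on similar MLIR pass tests, not something visible in this diff.

// RUN: tf-opt %s -split-input-file -tf-tpu-sharding-identification | FileCheck %s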

tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc

Lines changed: 145 additions & 54 deletions
@@ -18,12 +18,14 @@ limitations under the License.
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Casting.h"
 #include "mlir/IR/Attributes.h"  // from @llvm-project
 #include "mlir/IR/Block.h"  // from @llvm-project
 #include "mlir/IR/Builders.h"  // from @llvm-project
+#include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "mlir/IR/Operation.h"  // from @llvm-project
 #include "mlir/IR/Value.h"  // from @llvm-project
@@ -40,24 +42,43 @@ namespace TFTPU {
 namespace {
 
 constexpr char kShardingAttr[] = "mhlo.sharding";
+constexpr char kReplicateSharding[] = "";
 
 struct TPUShardingIdentificationPass
     : public PassWrapper<TPUShardingIdentificationPass,
                          OperationPass<ModuleOp>> {
   void runOnOperation() override;
 };
 
-// Finds XlaSharding op connected to an argument value. If value is a resource
-// type then XlaSharding op will be connected to a ReadVariable op. XlaSharding
-// op may be direct user of inputs but it may also be followed by an Identity op
-// and, in the case where bfloat16 type is used, Cast op may be added right
-// after the input.
+// Returns XLA sharding from TPUPartitionedInput op connected to a
+// `tf_device.cluster_func` operand value. If value is a resource type then
+// TPUPartitionedInput op will be connected to a ReadVariable op that feeds into
+// a `tf_device.cluster_func`.
+llvm::Optional<llvm::StringRef> GetXlaShardingFromOperand(Value value) {
+  Value value_to_visit = value;
+  if (auto read_var = llvm::dyn_cast_or_null<TF::ReadVariableOp>(
+          value_to_visit.getDefiningOp()))
+    value_to_visit = read_var.resource();
+
+  if (auto partitioned_input =
+          llvm::dyn_cast_or_null<TF::TPUPartitionedInputOp>(
+              value_to_visit.getDefiningOp()))
+    return partitioned_input._XlaSharding();
+
+  return llvm::None;
+}
+
+// Returns XLA sharding from a XlaSharding op connected to an argument value. If
+// value is a resource type then XlaSharding op will be connected to a
+// ReadVariable op. XlaSharding op may be direct user of inputs but it may also
+// be followed by an Identity op and, in the case where bfloat16 type is used,
+// Cast op may be added right after the input.
 //
 // TODO(hongjunchoi): Add logic to parse XlaSharding op inside control flow (If,
 // Case, While) ops and Caller return values.
 // TODO(hongjunchoi): Consider explicitly checking op patterns to detect sharded
 // inputs.
-llvm::Optional<llvm::StringRef> GetXlaShardingFromArg(const Value& value) {
+llvm::Optional<llvm::StringRef> GetXlaShardingFromArg(Value value) {
   llvm::SmallPtrSet<Value, 4> visited_values;
   llvm::SmallVector<Value, 4> values_to_visit{value};
   while (!values_to_visit.empty()) {
@@ -90,22 +111,29 @@ llvm::Optional<llvm::StringRef> GetXlaShardingFromArg(const Value& value) {
   return llvm::None;
 }
 
-// Walks the graph from the arguments of the `cluster_func_op` and extracts
-// sharding configurations for all inputs by parsing XlaSharding op connected to
-// the arguments. If argument to the `cluster_func_op` directly feeds into
-// another function call op, then recursively walk the function definition to
-// find the connected XlaSharding op.
+// Extracts sharding configurations for all inputs by parsing XlaSharding/
+// TPUPartitionedInput op connected to the operands/arguments. If argument to
+// the `cluster_func` directly feeds into another function call op, then
+// recursively walk the function definition to find the connected XlaSharding
+// op.
 void IdentifyXlaShardingForComputationInputs(
-    StringRef logical_core_0_sharding, tf_device::ClusterFuncOp cluster_func_op,
-    FuncOp cluster_function, Builder* builder) {
+    StringRef logical_core_0_sharding, bool use_spmd,
+    tf_device::ClusterFuncOp cluster_func, FuncOp func, Builder* builder) {
   // Look up function definition from module.
-  Block& cluster_function_block = cluster_function.front();
+  Block& function_block = func.front();
 
-  llvm::SmallVector<llvm::StringRef, 8> sharding_for_args(
-      cluster_function_block.getNumArguments(), logical_core_0_sharding);
+  llvm::SmallVector<llvm::StringRef, 8> sharding_for_args;
+  sharding_for_args.reserve(function_block.getNumArguments());
 
+  // Iterate through operands of `cluster_func`.
+  // The computation operand can either be:
+  //   1) a TPUPartitionedInput Op if the input has a non-resource type;
+  //   2) a ReadVariableOp else.
+  //
+  // Replicate sharding is used if `use_spmd` is set.
+  //
   // Iterate through input arguments to the entry block of
-  // tf_device.ClusterFunc. For input ops, look for following XlaSharding ops.
+  // tf_device.ClusterFunc. For input ops, look for XlaSharding ops.
   // XlaSharding ops can:
   //   1) Directly follow the input argument if input argument has non-resource
   //      types.
@@ -114,36 +142,70 @@ void IdentifyXlaShardingForComputationInputs(
   //
   // Sharding configurations are added to the tf_device.ClusterFunc as an
   // attribute and the function as an argument attribute.
-  for (auto& arg : cluster_function_block.getArguments()) {
-    auto arg_sharding = GetXlaShardingFromArg(arg);
+  for (auto operand_and_arg :
+       llvm::zip(cluster_func.operands(), function_block.getArguments())) {
+    Value operand = std::get<0>(operand_and_arg);
+    BlockArgument arg = std::get<1>(operand_and_arg);
     const int index = arg.getArgNumber();
 
+    if (auto operand_sharding = GetXlaShardingFromOperand(operand)) {
+      sharding_for_args.push_back(operand_sharding.getValue());
+      func.setArgAttr(index, kShardingAttr,
+                      builder->getStringAttr(operand_sharding.getValue()));
+      continue;
+    }
+
+    if (use_spmd) {
+      // If XLA SPMD is enabled, host variables or non-variable per-replica
+      // inputs should take on replicate sharding, unless another sharding is
+      // set via a TPUPartitionedInput op.
+      sharding_for_args.push_back(kReplicateSharding);
+      func.setArgAttr(index, kShardingAttr,
+                      builder->getStringAttr(kReplicateSharding));
+      continue;
+    }
+
+    auto arg_sharding = GetXlaShardingFromArg(arg);
    if (arg_sharding) {
-      sharding_for_args[index] = arg_sharding.getValue();
-      cluster_function.setArgAttr(
-          index, kShardingAttr,
-          builder->getStringAttr(arg_sharding.getValue()));
-    } else {
-      cluster_function.setArgAttr(
-          index, kShardingAttr,
-          builder->getStringAttr(logical_core_0_sharding));
+      sharding_for_args.push_back(arg_sharding.getValue());
+      func.setArgAttr(index, kShardingAttr,
+                      builder->getStringAttr(arg_sharding.getValue()));
+      continue;
    }
+
+    // Default to maximal sharding core 0 if no sharding is present.
+    sharding_for_args.push_back(logical_core_0_sharding);
+    func.setArgAttr(index, kShardingAttr,
+                    builder->getStringAttr(logical_core_0_sharding));
  }
 
-  cluster_func_op->setAttr(tensorflow::kInputShardingAttr,
-                           builder->getStrArrayAttr(sharding_for_args));
+  cluster_func->setAttr(tensorflow::kInputShardingAttr,
+                        builder->getStrArrayAttr(sharding_for_args));
 }
 
-// Finds XlaSharding op connected to a result value. XlaSharding op may be
-// direct user of inputs but it may also be followed by an Identity op and, in
-// the case where bfloat16 type is used, Cast op may be added right after the
-// input.
+// Returns XLA sharding from TPUPartitionedOutput op connected to a
+// `tf_device.cluster_func` result value.
+llvm::Optional<llvm::StringRef> GetXlaShardingFromResult(Value value) {
+  if (!value.hasOneUse()) return llvm::None;
+
+  Operation* user = *value.getUsers().begin();
+  if (auto partitioned_output =
+          llvm::dyn_cast<TF::TPUPartitionedOutputOp>(user))
+    return partitioned_output._XlaSharding();
+
+  return llvm::None;
+}
+
+// Returns XLA sharding from XlaSharding op connected to a result value.
+// XlaSharding op may be direct user of inputs but it may also be followed by an
+// Identity op and, in the case where bfloat16 type is used, Cast op may be
+// added right after the input.
 //
 // TODO(hongjunchoi): Add logic to parse XlaSharding op inside control flow (If,
 // Case, While) ops and Caller argument values.
 // TODO(hongjunchoi): Consider explicitly checking op patterns to detect sharded
 // inputs.
-llvm::Optional<StringRef> GetXlaShardingFromRetval(const Value& value) {
+llvm::Optional<StringRef> GetXlaShardingFromRetval(Value value) {
   llvm::SmallPtrSet<Value, 4> visited_values;
   Value value_to_visit = value;
   while (value_to_visit) {
@@ -172,34 +234,58 @@ llvm::Optional<StringRef> GetXlaShardingFromRetval(const Value& value) {
   return llvm::None;
 }
 
-// Parses XlaSharding op directly connected from the outputs of the
-// `cluster_func` and extract sharding configurations for outputs.
+// Extracts sharding configurations for all outputs by parsing XlaSharding/
+// TPUPartitionedOutput op connected to the retvals/results.
 void IdentifyXlaShardingForComputationOutputs(
-    StringRef logical_core_0_sharding, FuncOp func,
-    tf_device::ClusterFuncOp cluster_func, Builder* builder) {
-  // By default return values from logical core 0 is used if no sharding
-  // configuration is defined.
+    StringRef logical_core_0_sharding, bool use_spmd,
+    tf_device::ClusterFuncOp cluster_func, FuncOp func, Builder* builder) {
   Block& function_block = func.front();
   Operation* terminator = function_block.getTerminator();
-  llvm::SmallVector<llvm::StringRef, 8> sharding_for_rets(
-      terminator->getNumOperands(), logical_core_0_sharding);
+  llvm::SmallVector<llvm::StringRef, 8> sharding_for_rets;
+  sharding_for_rets.reserve(terminator->getNumOperands());
 
+  // Iterate through results of `cluster_func`. For output ops, look for
+  // TPUPartitionedOutput ops.
+  //
+  // Replicate sharding is used if `use_spmd` is set.
+  //
   // Iterate through operands of the terminator. If the preceding op is
   // XlaShardingOp, then the provided sharding configuration is added to the
   // tf_device.ClusterFunc as an attribute and the function as a result
   // attribute.
-  for (auto& ret : terminator->getOpOperands()) {
-    auto ret_sharding = GetXlaShardingFromRetval(ret.get());
-    const int index = ret.getOperandNumber();
+  for (auto result_and_retval :
+       llvm::zip(cluster_func.results(), terminator->getOpOperands())) {
+    Value result = std::get<0>(result_and_retval);
+    OpOperand& retval = std::get<1>(result_and_retval);
+    const int index = retval.getOperandNumber();
+
+    if (auto result_sharding = GetXlaShardingFromResult(result)) {
+      sharding_for_rets.push_back(result_sharding.getValue());
+      func.setResultAttr(index, kShardingAttr,
+                         builder->getStringAttr(result_sharding.getValue()));
+      continue;
+    }
 
-    if (ret_sharding) {
-      sharding_for_rets[index] = ret_sharding.getValue();
+    if (use_spmd) {
+      // If XLA SPMD is enabled, outputs all should have replicate sharding,
+      // unless another sharding is set via a TPUPartitionedOutput op.
+      sharding_for_rets.push_back(kReplicateSharding);
      func.setResultAttr(index, kShardingAttr,
-                         builder->getStringAttr(ret_sharding.getValue()));
-    } else {
+                         builder->getStringAttr(kReplicateSharding));
+      continue;
+    }
+
+    if (auto retval_sharding = GetXlaShardingFromRetval(retval.get())) {
+      sharding_for_rets.push_back(retval_sharding.getValue());
      func.setResultAttr(index, kShardingAttr,
-                         builder->getStringAttr(logical_core_0_sharding));
+                         builder->getStringAttr(retval_sharding.getValue()));
+      continue;
    }
+
+    // Default to maximal sharding core 0 if no sharding is present.
+    sharding_for_rets.push_back(logical_core_0_sharding);
+    func.setResultAttr(index, kShardingAttr,
+                       builder->getStringAttr(logical_core_0_sharding));
  }
 
  cluster_func->setAttr(tensorflow::kOutputShardingAttr,
@@ -219,11 +305,16 @@ void IdentifyXlaShardingForTPUComputation(
   const std::string logical_core_0_sharding =
       xla::sharding_builder::AssignDevice(0).SerializeAsString();
 
-  IdentifyXlaShardingForComputationInputs(logical_core_0_sharding, cluster_func,
-                                          func, builder);
+  bool use_spmd = false;
+  if (auto use_spmd_attr =
+          cluster_func.getAttrOfType<BoolAttr>("use_spmd_for_xla_partitioning"))
+    use_spmd = use_spmd_attr.getValue();
+
+  IdentifyXlaShardingForComputationInputs(logical_core_0_sharding, use_spmd,
+                                          cluster_func, func, builder);
 
-  IdentifyXlaShardingForComputationOutputs(logical_core_0_sharding, func,
-                                           cluster_func, builder);
+  IdentifyXlaShardingForComputationOutputs(logical_core_0_sharding, use_spmd,
+                                           cluster_func, func, builder);
 }
 
 void TPUShardingIdentificationPass::runOnOperation() {
