
Commit c3d40fd

Hao Lu authored and facebook-github-bot committed
[ATen] Use expect_contiguous in layer_norm (#58067)
Summary:
Pull Request resolved: #58067

- Use expect_contiguous in layer_norm to avoid unnecessary refcount bumps when the tensors are already contiguous
- Clean up some leftovers from the hacky wrapper removal: use c10::MaybeOwned<Tensor> for the bias tensors
- Skip the dispatcher for at::empty in the layer_norm impl in Static Runtime

Test Plan: CI

Reviewed By: swolchok

Differential Revision: D28214298

fbshipit-source-id: 73150fa62d5c18f41a2264f8e56bbe5e377ad045
1 parent c790fd2 commit c3d40fd
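
For readers unfamiliar with the pattern, here is a minimal illustrative sketch (not code from this commit; the function name is hypothetical) of how c10::MaybeOwned<Tensor>, at::borrow_from_optional_tensor, and Tensor::expect_contiguous fit together, as used throughout the diffs below:

// Illustrative sketch only; `layer_norm_borrow_sketch` is a hypothetical
// free function, not part of ATen.
#include <ATen/ATen.h>
#include <c10/util/MaybeOwned.h>

void layer_norm_borrow_sketch(
    const at::Tensor& input,
    const c10::optional<at::Tensor>& weight_opt,
    const c10::optional<at::Tensor>& bias_opt) {
  // Borrow from the optionals instead of materializing copies: no refcount
  // bump when a tensor is present, an owned empty Tensor when it is not.
  c10::MaybeOwned<at::Tensor> weight =
      at::borrow_from_optional_tensor(weight_opt);
  c10::MaybeOwned<at::Tensor> bias =
      at::borrow_from_optional_tensor(bias_opt);

  // expect_contiguous() also returns c10::MaybeOwned<Tensor>: it borrows
  // `input` when it is already contiguous and only materializes (and owns)
  // a contiguous copy otherwise.
  c10::MaybeOwned<at::Tensor> X = input.expect_contiguous();

  // The handle is dereferenced (*X, X->options()) rather than used as a
  // plain Tensor, which is why the diffs below add the * and -> accesses.
  const at::Tensor& w = *weight;
  const at::Tensor& b = *bias;
  at::Tensor mean = at::empty({1}, X->options());
  (void)w;
  (void)b;
  (void)mean;
}

Because the borrowed case never touches the refcount, callers pay only for a pointer copy when the inputs are already contiguous, which is the common case this commit optimizes for.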

File tree

6 files changed (+202 −183 lines)


aten/src/ATen/native/cuda/layer_norm_kernel.cu

+81 −74
@@ -424,32 +424,36 @@ void LayerNormBackwardKernelImpl(
 
 std::tuple<Tensor, Tensor, Tensor> layer_norm_cuda(
     const Tensor& input,
-    IntArrayRef normalized_shape, const c10::optional<Tensor>& weight_opt /* optional */, const c10::optional<Tensor>& bias_opt /* optional */,
+    IntArrayRef normalized_shape,
+    const c10::optional<Tensor>& weight_opt /* optional */,
+    const c10::optional<Tensor>& bias_opt /* optional */,
     double eps) {
   // See [Note: hacky wrapper removal for optional tensor]
-  c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
+  c10::MaybeOwned<Tensor> weight_maybe_owned =
+      at::borrow_from_optional_tensor(weight_opt);
   const Tensor& weight = *weight_maybe_owned;
-  const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();});
-
+  c10::MaybeOwned<Tensor> bias_maybe_owned =
+      at::borrow_from_optional_tensor(bias_opt);
+  const Tensor& bias = *bias_maybe_owned;
 
-  auto inputs = _prepare_layer_norm_inputs(input, normalized_shape, weight, bias);
-  auto X = std::get<0>(inputs);
-  auto gamma = std::get<1>(inputs);
-  auto beta = std::get<2>(inputs);
-  auto M = std::get<3>(inputs);
-  auto N = std::get<4>(inputs);
+  auto M_N = _check_layer_norm_inputs(input, normalized_shape, weight, bias);
+  auto M = M_N.first;
+  auto N = M_N.second;
+  auto X = input.expect_contiguous();
+  auto gamma = weight.expect_contiguous();
+  auto beta = bias.expect_contiguous();
 
   Tensor Y = at::native::empty_like(
-      X,
+      *X,
       c10::nullopt /* dtype */,
       c10::nullopt /* layout */,
       c10::nullopt /* device */,
       c10::nullopt /* pin_memory */,
       LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-  Tensor mean = at::empty({M}, X.options());
-  Tensor rstd = at::empty({M}, X.options());
+  Tensor mean = at::empty({M}, X->options());
+  Tensor rstd = at::empty({M}, X->options());
   if (M > 0) {
-    LayerNormKernelImpl(X, gamma, beta, M, N, eps, &Y, &mean, &rstd);
+    LayerNormKernelImpl(*X, *gamma, *beta, M, N, eps, &Y, &mean, &rstd);
 
     const auto input_shape = input.sizes();
     const size_t axis = input.dim() - normalized_shape.size();
@@ -473,73 +477,76 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_backward_cuda(
     const Tensor& input,
     IntArrayRef normalized_shape,
     const Tensor& mean,
-    const Tensor& rstd, const c10::optional<Tensor>& weight_opt /* optional */, const c10::optional<Tensor>& bias_opt /* optional */,
+    const Tensor& rstd,
+    const c10::optional<Tensor>& weight_opt /* optional */,
+    const c10::optional<Tensor>& bias_opt /* optional */,
     std::array<bool, 3> grad_input_mask) {
   // See [Note: hacky wrapper removal for optional tensor]
-  c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
+  c10::MaybeOwned<Tensor> weight_maybe_owned =
+      at::borrow_from_optional_tensor(weight_opt);
   const Tensor& weight = *weight_maybe_owned;
-  const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();});
-
+  c10::MaybeOwned<Tensor> bias_maybe_owned =
+      at::borrow_from_optional_tensor(bias_opt);
+  const Tensor& bias = *bias_maybe_owned;
 
-  auto inputs = _prepare_layer_norm_inputs(input, normalized_shape, weight, bias);
-  auto X = std::get<0>(inputs);
-  auto gamma = std::get<1>(inputs);
-  auto beta = std::get<2>(inputs);
-  auto M = std::get<3>(inputs);
-  auto N = std::get<4>(inputs);
+  auto M_N = _check_layer_norm_inputs(input, normalized_shape, weight, bias);
+  auto M = M_N.first;
+  auto N = M_N.second;
+  auto X = input.expect_contiguous();
+  auto gamma = weight.expect_contiguous();
+  auto beta = bias.expect_contiguous();
 
-  Tensor dX;
-  Tensor dgamma;
-  Tensor dbeta;
-  if (grad_input_mask[0]) {
-    dX = at::native::empty_like(
-        X,
-        c10::nullopt /* dtype */,
-        c10::nullopt /* layout */,
-        c10::nullopt /* device */,
-        c10::nullopt /* pin_memory */,
-        LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-  }
-  if (grad_input_mask[1]) {
-    dgamma = M > 0 ? at::native::empty_like(
-                         gamma,
-                         c10::nullopt /* dtype */,
-                         c10::nullopt /* layout */,
-                         c10::nullopt /* device */,
-                         c10::nullopt /* pin_memory */,
-                         LEGACY_CONTIGUOUS_MEMORY_FORMAT)
-                   : at::native::zeros_like(
-                         gamma,
-                         c10::nullopt /* dtype */,
-                         c10::nullopt /* layout */,
-                         c10::nullopt /* device */,
-                         c10::nullopt /* pin_memory */,
-                         LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-  }
-  if (grad_input_mask[2]) {
-    dbeta = M > 0 ? at::native::empty_like(
-                        beta,
-                        c10::nullopt /* dtype */,
-                        c10::nullopt /* layout */,
-                        c10::nullopt /* device */,
-                        c10::nullopt /* pin_memory */,
-                        LEGACY_CONTIGUOUS_MEMORY_FORMAT)
-                  : at::native::zeros_like(
-                        beta,
-                        c10::nullopt /* dtype */,
-                        c10::nullopt /* layout */,
-                        c10::nullopt /* device */,
-                        c10::nullopt /* pin_memory */,
-                        LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-  }
-  if (M > 0) {
-    LayerNormBackwardKernelImpl(
-        dY, X, mean, rstd, gamma, M, N, &dX, &dgamma, &dbeta);
-  }
-  return std::make_tuple(std::move(dX), std::move(dgamma), std::move(dbeta));
+  Tensor dX;
+  Tensor dgamma;
+  Tensor dbeta;
+  if (grad_input_mask[0]) {
+    dX = at::native::empty_like(
+        *X,
+        c10::nullopt /* dtype */,
+        c10::nullopt /* layout */,
+        c10::nullopt /* device */,
+        c10::nullopt /* pin_memory */,
+        LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+  }
+  if (grad_input_mask[1]) {
+    dgamma = M > 0 ? at::native::empty_like(
+                         *gamma,
+                         c10::nullopt /* dtype */,
+                         c10::nullopt /* layout */,
+                         c10::nullopt /* device */,
+                         c10::nullopt /* pin_memory */,
+                         LEGACY_CONTIGUOUS_MEMORY_FORMAT)
+                   : at::native::zeros_like(
+                         *gamma,
+                         c10::nullopt /* dtype */,
+                         c10::nullopt /* layout */,
+                         c10::nullopt /* device */,
+                         c10::nullopt /* pin_memory */,
+                         LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+  }
+  if (grad_input_mask[2]) {
+    dbeta = M > 0 ? at::native::empty_like(
+                        *beta,
+                        c10::nullopt /* dtype */,
+                        c10::nullopt /* layout */,
+                        c10::nullopt /* device */,
+                        c10::nullopt /* pin_memory */,
+                        LEGACY_CONTIGUOUS_MEMORY_FORMAT)
+                  : at::native::zeros_like(
+                        *beta,
+                        c10::nullopt /* dtype */,
+                        c10::nullopt /* layout */,
+                        c10::nullopt /* device */,
+                        c10::nullopt /* pin_memory */,
+                        LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+  }
+  if (M > 0) {
+    LayerNormBackwardKernelImpl(
+        dY, *X, mean, rstd, *gamma, M, N, &dX, &dgamma, &dbeta);
+  }
+  return std::make_tuple(std::move(dX), std::move(dgamma), std::move(dbeta));
 }
 
-
 REGISTER_DISPATCH(LayerNormKernel, &LayerNormKernelImpl);
 REGISTER_DISPATCH(LayerNormBackwardKernel, &LayerNormBackwardKernelImpl);
 

aten/src/ATen/native/layer_norm.cpp

+85 −81
@@ -60,24 +60,24 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_cpu(
   const Tensor& bias = *bias_maybe_owned;
 
 
-  auto inputs = _prepare_layer_norm_inputs(input, normalized_shape, weight, bias);
-  auto X = std::get<0>(inputs);
-  auto gamma = std::get<1>(inputs);
-  auto beta = std::get<2>(inputs);
-  auto M = std::get<3>(inputs);
-  auto N = std::get<4>(inputs);
+  auto M_N = _check_layer_norm_inputs(input, normalized_shape, weight, bias);
+  auto M = M_N.first;
+  auto N = M_N.second;
+  auto X = input.expect_contiguous();
+  auto gamma = weight.expect_contiguous();
+  auto beta = bias.expect_contiguous();
 
   Tensor Y = at::native::empty_like(
-      X,
+      *X,
       c10::nullopt /* dtype */,
       c10::nullopt /* layout */,
       c10::nullopt /* device */,
       c10::nullopt /* pin_memory */,
       at::MemoryFormat::Contiguous);
-  Tensor mean = at::empty({M}, X.options());
-  Tensor rstd = at::empty({M}, X.options());
+  Tensor mean = at::empty({M}, X->options());
+  Tensor rstd = at::empty({M}, X->options());
 
-  layer_norm_cpu_out(Y, mean, rstd, X, normalized_shape, gamma, beta, eps, M, N);
+  layer_norm_cpu_out(Y, mean, rstd, *X, normalized_shape, *gamma, *beta, eps, M, N);
   return std::make_tuple(std::move(Y), std::move(mean), std::move(rstd));
 }
 
@@ -86,70 +86,74 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_backward_cpu(
     const Tensor& input,
     IntArrayRef normalized_shape,
     const Tensor& mean,
-    const Tensor& rstd, const c10::optional<Tensor>& weight_opt /* optional */, const c10::optional<Tensor>& bias_opt /* optional */,
+    const Tensor& rstd,
+    const c10::optional<Tensor>& weight_opt /* optional */,
+    const c10::optional<Tensor>& bias_opt /* optional */,
     std::array<bool, 3> grad_input_mask) {
   // See [Note: hacky wrapper removal for optional tensor]
-  c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
+  c10::MaybeOwned<Tensor> weight_maybe_owned =
+      at::borrow_from_optional_tensor(weight_opt);
   const Tensor& weight = *weight_maybe_owned;
-  const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();});
-
-
-  auto inputs = _prepare_layer_norm_inputs(input, normalized_shape, weight, bias);
-  auto X = std::get<0>(inputs);
-  auto gamma = std::get<1>(inputs);
-  auto beta = std::get<2>(inputs);
-  auto M = std::get<3>(inputs);
-  auto N = std::get<4>(inputs);
-
-  Tensor dX;
-  Tensor dgamma;
-  Tensor dbeta;
-  if (grad_input_mask[0]) {
-    dX = at::native::empty_like(
-        X,
-        c10::nullopt /* dtype */,
-        c10::nullopt /* layout */,
-        c10::nullopt /* device */,
-        c10::nullopt /* pin_memory */,
-        at::MemoryFormat::Contiguous);
-  }
-  if (grad_input_mask[1]) {
-    dgamma = M > 0 ? at::native::empty_like(
-                         gamma,
-                         c10::nullopt /* dtype */,
-                         c10::nullopt /* layout */,
-                         c10::nullopt /* device */,
-                         c10::nullopt /* pin_memory */,
-                         at::MemoryFormat::Contiguous)
-                   : at::native::zeros_like(
-                         gamma,
-                         c10::nullopt /* dtype */,
-                         c10::nullopt /* layout */,
-                         c10::nullopt /* device */,
-                         c10::nullopt /* pin_memory */,
-                         at::MemoryFormat::Contiguous);
-  }
-  if (grad_input_mask[2]) {
-    dbeta = M > 0 ? at::native::empty_like(
-                        beta,
-                        c10::nullopt /* dtype */,
-                        c10::nullopt /* layout */,
-                        c10::nullopt /* device */,
-                        c10::nullopt /* pin_memory */,
-                        at::MemoryFormat::Contiguous)
-                  : at::native::zeros_like(
-                        beta,
-                        c10::nullopt /* dtype */,
-                        c10::nullopt /* layout */,
-                        c10::nullopt /* device */,
-                        c10::nullopt /* pin_memory */,
-                        at::MemoryFormat::Contiguous);
-  }
-  if (M > 0) {
-    LayerNormBackwardKernel(
-        kCPU, dY, X, mean, rstd, gamma, M, N, &dX, &dgamma, &dbeta);
-  }
-  return std::make_tuple(std::move(dX), std::move(dgamma), std::move(dbeta));
+  c10::MaybeOwned<Tensor> bias_maybe_owned =
+      at::borrow_from_optional_tensor(bias_opt);
+  const Tensor& bias = *bias_maybe_owned;
+
+  auto M_N = _check_layer_norm_inputs(input, normalized_shape, weight, bias);
+  auto M = M_N.first;
+  auto N = M_N.second;
+  auto X = input.expect_contiguous();
+  auto gamma = weight.expect_contiguous();
+  auto beta = bias.expect_contiguous();
+
+  Tensor dX;
+  Tensor dgamma;
+  Tensor dbeta;
+  if (grad_input_mask[0]) {
+    dX = at::native::empty_like(
+        *X,
+        c10::nullopt /* dtype */,
+        c10::nullopt /* layout */,
+        c10::nullopt /* device */,
+        c10::nullopt /* pin_memory */,
+        at::MemoryFormat::Contiguous);
+  }
+  if (grad_input_mask[1]) {
+    dgamma = M > 0 ? at::native::empty_like(
+                         *gamma,
+                         c10::nullopt /* dtype */,
+                         c10::nullopt /* layout */,
+                         c10::nullopt /* device */,
+                         c10::nullopt /* pin_memory */,
+                         at::MemoryFormat::Contiguous)
+                   : at::native::zeros_like(
+                         *gamma,
+                         c10::nullopt /* dtype */,
+                         c10::nullopt /* layout */,
+                         c10::nullopt /* device */,
+                         c10::nullopt /* pin_memory */,
+                         at::MemoryFormat::Contiguous);
+  }
+  if (grad_input_mask[2]) {
+    dbeta = M > 0 ? at::native::empty_like(
+                        *beta,
+                        c10::nullopt /* dtype */,
+                        c10::nullopt /* layout */,
+                        c10::nullopt /* device */,
+                        c10::nullopt /* pin_memory */,
+                        at::MemoryFormat::Contiguous)
+                  : at::native::zeros_like(
+                        *beta,
+                        c10::nullopt /* dtype */,
+                        c10::nullopt /* layout */,
+                        c10::nullopt /* device */,
+                        c10::nullopt /* pin_memory */,
+                        at::MemoryFormat::Contiguous);
+  }
+  if (M > 0) {
+    LayerNormBackwardKernel(
+        kCPU, dY, *X, mean, rstd, *gamma, M, N, &dX, &dgamma, &dbeta);
+  }
+  return std::make_tuple(std::move(dX), std::move(dgamma), std::move(dbeta));
 }
 
 Tensor layer_norm(
@@ -160,7 +164,8 @@ Tensor layer_norm(
   // See [Note: hacky wrapper removal for optional tensor]
   c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
   const Tensor& weight = *weight_maybe_owned;
-  const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();});
+  c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
+  const Tensor& bias = *bias_maybe_owned;
 
 
   return std::get<0>(at::native_layer_norm(input, normalized_shape, weight, bias, eps));
@@ -179,15 +184,14 @@ std::tuple<Tensor, Tensor, Tensor> math_native_layer_norm(
   // See [Note: hacky wrapper removal for optional tensor]
   c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
   const Tensor& weight = *weight_maybe_owned;
-  const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();});
-
-  auto inputs = _prepare_layer_norm_inputs(input, normalized_shape, weight, bias);
-  auto X = std::get<0>(inputs);
-  auto gamma = std::get<1>(inputs);
-  auto beta = std::get<2>(inputs);
-  auto M = std::get<3>(inputs);
-  // NOLINTNEXTLINE(clang-diagnostic-unused-variable,clang-analyzer-deadcode.DeadStores)
-  auto N = std::get<4>(inputs);
+  c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
+  const Tensor& bias = *bias_maybe_owned;
+
+  auto M_N = _check_layer_norm_inputs(input, normalized_shape, weight, bias);
+  auto M = M_N.first;
+  auto X = input.expect_contiguous();
+  auto gamma = weight.expect_contiguous();
+
   auto input_shape = input.sizes();
   const auto input_ndim = input.dim();
   const int normalized_ndim = normalized_shape.size();
