
Commit bfe5ad2

xwang233 authored and facebook-github-bot committed
[Linalg] Add a runtime switch to let pytorch prefer a backend impl in linalg functions on GPU (#67980)
Summary: Per title. This PR introduces a global flag that lets PyTorch prefer one of the many backend implementations when calling linear algebra functions on GPU.

Usage:

```python
torch.backends.cuda.preferred_linalg_library('cusolver')
```

Available options (str): `'default'`, `'cusolver'`, `'magma'`.

Issue #63992 inspired me to write this PR. No heuristic is perfect on all devices, library versions, matrix shapes, workloads, etc. We can obtain better performance if we can conveniently switch linear algebra backends at runtime. Performance of linear algebra operators after this PR should be no worse than before. The flag is set to **`'default'`** by default, which keeps everything the same as before this PR. The implementation of this PR basically follows that of #67790.

Pull Request resolved: #67980

Reviewed By: mruberry

Differential Revision: D32849457

Pulled By: ngimel

fbshipit-source-id: 679fee7744a03af057995aef06316306073010a6
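A quick usage sketch (hedged: `torch.linalg.inv` is just one representative routine that reaches these GPU code paths, and the assumption that a bare call returns the current preference reflects the documented Python binding, not code shown in this diff):

```python
import torch

# Query the current preference. Per this PR the process-wide default is
# 'default', which keeps the pre-existing cusolver/magma heuristics.
# (Assumption: calling with no argument only reads the setting.)
prev = torch.backends.cuda.preferred_linalg_library()

# Prefer cuSOLVER for subsequent linalg calls on CUDA tensors.
torch.backends.cuda.preferred_linalg_library('cusolver')

a = torch.randn(64, 64, device='cuda')
torch.linalg.inv(a)  # routed to the cuSOLVER implementation where one exists

# Restore the previous preference.
torch.backends.cuda.preferred_linalg_library(prev)
```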
1 parent 9663e08 · commit bfe5ad2

10 files changed (+303, −43 lines)


aten/src/ATen/Context.cpp (+15)

```diff
@@ -147,6 +147,21 @@ void Context::setAllowTF32CuBLAS(bool b) {
   allow_tf32_cublas = b;
 }
 
+at::LinalgBackend Context::linalgPreferredBackend() const {
+  return linalg_preferred_backend;
+}
+
+void Context::setLinalgPreferredBackend(at::LinalgBackend b) {
+  linalg_preferred_backend = b;
+  if (b != at::LinalgBackend::Default) {
+    TORCH_WARN_ONCE(
+      "torch.backends.cuda.preferred_linalg_library is an experimental feature. "
+      "If you see any error or unexpected behavior when this flag is set "
+      "please file an issue on GitHub."
+    );
+  }
+}
+
 bool Context::allowFP16ReductionCuBLAS() const {
   return allow_fp16_reduction_cublas;
 }
```
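Note the `TORCH_WARN_ONCE` in the setter: the first time a process selects a non-default backend, an experimental-feature warning fires, and only that once. A hedged sketch of how this looks from Python (assuming, as is typical for `TORCH_WARN_ONCE`, that it surfaces as a `UserWarning`):

```python
import warnings
import torch

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    torch.backends.cuda.preferred_linalg_library('magma')     # first non-default set: warns
    torch.backends.cuda.preferred_linalg_library('cusolver')  # gated once in C++: silent

# Expect a single experimental-feature warning per process.
print([str(w.message) for w in caught])
```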

aten/src/ATen/Context.h (+5)

```diff
@@ -3,6 +3,7 @@
 #include <ATen/core/ATenGeneral.h>
 #include <ATen/core/Generator.h>
 #include <ATen/CPUGeneratorImpl.h>
+#include <ATen/LinalgBackend.h>
 #include <ATen/core/LegacyTypeDispatch.h>
 #include <ATen/core/DeprecatedTypeProperties.h>
 #include <ATen/detail/CUDAHooksInterface.h>
@@ -128,6 +129,9 @@ class TORCH_API Context {
   bool deterministicCuDNN() const;
   void setDeterministicCuDNN(bool);
 
+  at::LinalgBackend linalgPreferredBackend() const;
+  void setLinalgPreferredBackend(at::LinalgBackend);
+
   // Note [Enabling Deterministic Operations]
   // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   // Operations in PyTorch that normally act nondeterministically, but have an alternate
@@ -249,6 +253,7 @@ class TORCH_API Context {
   bool allow_tf32_cublas = true;
   bool allow_fp16_reduction_cublas = true;
   bool enabled_mkldnn = true;
+  at::LinalgBackend linalg_preferred_backend = at::LinalgBackend::Default;
 #ifdef C10_MOBILE
   bool release_original_weights = true;
 #else
```

aten/src/ATen/LinalgBackend.h (+31, new file)

```diff
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <c10/util/Exception.h>
+
+#include <ostream>
+#include <string>
+
+namespace at {
+
+enum class LinalgBackend : int8_t { Default, Cusolver, Magma };
+
+inline std::string LinalgBackendToString(at::LinalgBackend backend) {
+  switch (backend) {
+    case LinalgBackend::Default:
+      return "at::LinalgBackend::Default";
+    case LinalgBackend::Cusolver:
+      return "at::LinalgBackend::Cusolver";
+    case LinalgBackend::Magma:
+      return "at::LinalgBackend::Magma";
+    default:
+      TORCH_CHECK(false, "Unknown linalg backend");
+  }
+}
+
+inline std::ostream& operator<<(
+    std::ostream& stream,
+    at::LinalgBackend backend) {
+  return stream << LinalgBackendToString(backend);
+}
+
+} // namespace at
```

aten/src/ATen/native/cuda/BatchLinearAlgebra.cpp (+140, −42)

```diff
@@ -1471,10 +1471,18 @@ Tensor _inverse_helper_cuda_legacy(const Tensor& self) {
 
 Tensor _inverse_helper_cuda(const Tensor& self) {
 #ifdef USE_CUSOLVER
-  if ((self.dim() == 2) || (/* self.dim() > 2 && */ batchCount(self) <= 2) || !use_magma_) {
-    return _inverse_helper_cuda_lib(self);    // cusolver or cublas
-  } else {
-    return _inverse_helper_cuda_legacy(self); // magma-cuda
+  auto preferred_backend = at::globalContext().linalgPreferredBackend();
+  switch (preferred_backend) {
+    case at::LinalgBackend::Cusolver:
+      return _inverse_helper_cuda_lib(self);    // cusolver or cublas
+    case at::LinalgBackend::Magma:
+      return _inverse_helper_cuda_legacy(self); // magma-cuda
+    default:
+      if (batchCount(self) <= 2 || !use_magma_) {
+        return _inverse_helper_cuda_lib(self);    // cusolver or cublas
+      } else {
+        return _inverse_helper_cuda_legacy(self); // magma-cuda
+      }
   }
 #else
   return _inverse_helper_cuda_legacy(self); // magma-cuda
@@ -1503,10 +1511,18 @@ Tensor& _linalg_inv_out_helper_cuda(Tensor &result, Tensor& infos_lu, Tensor& infos_getri) {
 // This function calculates the inverse matrix in-place
 // result should be in column major order and contain matrices to invert
 #ifdef USE_CUSOLVER
-  if ((result.dim() == 2) || (/* result.dim() > 2 && */ batchCount(result) <= 2) || !use_magma_) {
-    return _linalg_inv_out_helper_cuda_lib(result, infos_lu, infos_getri);    // cusolver or cublas
-  } else {
-    return _linalg_inv_out_helper_cuda_legacy(result, infos_lu, infos_getri); // magma-cuda
+  auto preferred_backend = at::globalContext().linalgPreferredBackend();
+  switch (preferred_backend) {
+    case at::LinalgBackend::Cusolver:
+      return _linalg_inv_out_helper_cuda_lib(result, infos_lu, infos_getri);    // cusolver or cublas
+    case at::LinalgBackend::Magma:
+      return _linalg_inv_out_helper_cuda_legacy(result, infos_lu, infos_getri); // magma-cuda
+    default:
+      if (batchCount(result) <= 2 || !use_magma_) {
+        return _linalg_inv_out_helper_cuda_lib(result, infos_lu, infos_getri);    // cusolver or cublas
+      } else {
+        return _linalg_inv_out_helper_cuda_legacy(result, infos_lu, infos_getri); // magma-cuda
+      }
   }
 #else
   return _linalg_inv_out_helper_cuda_legacy(result, infos_lu, infos_getri); // magma-cuda
@@ -1600,10 +1616,18 @@ Tensor _cholesky_solve_helper_cuda_magma(const Tensor& self, const Tensor& A, bool upper) {
 // Batched cholesky_solve is dispatched to magma.
 Tensor _cholesky_solve_helper_cuda(const Tensor& self, const Tensor& A, bool upper) {
 #ifdef USE_CUSOLVER
-  if (batchCount(self) == 1 || !use_magma_) {
-    return _cholesky_solve_helper_cuda_cusolver(self, A, upper);
-  } else {
-    return _cholesky_solve_helper_cuda_magma(self, A, upper);
+  auto preferred_backend = at::globalContext().linalgPreferredBackend();
+  switch (preferred_backend) {
+    case at::LinalgBackend::Cusolver:
+      return _cholesky_solve_helper_cuda_cusolver(self, A, upper);
+    case at::LinalgBackend::Magma:
+      return _cholesky_solve_helper_cuda_magma(self, A, upper);
+    default:
+      if (batchCount(self) == 1 || !use_magma_) {
+        return _cholesky_solve_helper_cuda_cusolver(self, A, upper);
+      } else {
+        return _cholesky_solve_helper_cuda_magma(self, A, upper);
+      }
   }
 #else
   return _cholesky_solve_helper_cuda_magma(self, A, upper);
@@ -1706,10 +1730,20 @@ void cholesky_helper_magma(const Tensor& input, bool upper, const Tensor& info) {
 
 static void cholesky_kernel(const Tensor& input, const Tensor& info, bool upper) {
 #ifdef USE_CUSOLVER
-  if (batchCount(input) == 1 || !use_magma_ || use_cusolver_potrf_batched_) {
-    cholesky_helper_cusolver(input, upper, info);
-  } else {
-    cholesky_helper_magma(input, upper, info);
+  auto preferred_backend = at::globalContext().linalgPreferredBackend();
+  switch (preferred_backend) {
+    case at::LinalgBackend::Cusolver:
+      cholesky_helper_cusolver(input, upper, info);
+      break;
+    case at::LinalgBackend::Magma:
+      cholesky_helper_magma(input, upper, info);
+      break;
+    default:
+      if (batchCount(input) == 1 || !use_magma_ || use_cusolver_potrf_batched_) {
+        cholesky_helper_cusolver(input, upper, info);
+      } else {
+        cholesky_helper_magma(input, upper, info);
+      }
   }
 #else
   cholesky_helper_magma(input, upper, info);
@@ -1777,10 +1811,19 @@ Tensor& cholesky_inverse_kernel_impl(Tensor &result, Tensor& infos, bool upper) {
 // result should be in column major order and contain matrices to invert
 // the content of result is overwritten by 'apply_cholesky_inverse'
 #ifdef USE_CUSOLVER
-  if (batchCount(result) == 1 || !use_magma_) {
-    return cholesky_inverse_kernel_impl_cusolver(result, infos, upper);
-  } else {
-    return cholesky_inverse_kernel_impl_magma(result, infos, upper);
+  auto preferred_backend = at::globalContext().linalgPreferredBackend();
+  switch (preferred_backend) {
+    case at::LinalgBackend::Cusolver:
+      return cholesky_inverse_kernel_impl_cusolver(result, infos, upper);
+    case at::LinalgBackend::Magma:
+      return cholesky_inverse_kernel_impl_magma(result, infos, upper);
+    default:
+      if (batchCount(result) == 1 ||
+          !use_magma_) {
+        return cholesky_inverse_kernel_impl_cusolver(result, infos, upper);
+      } else {
+        return cholesky_inverse_kernel_impl_magma(result, infos, upper);
+      }
   }
 #else
   return cholesky_inverse_kernel_impl_magma(result, infos, upper);
@@ -1944,20 +1987,39 @@ static void lu_batched_magma(const Tensor& input, const Tensor& pivots, const Tensor& infos, bool compute_pivots) {
 static void apply_lu(const Tensor& input, const Tensor& pivots, const Tensor& infos, bool compute_pivots) {
   int64_t batch_size = batchCount(input);
 #ifdef USE_CUSOLVER
-  // Use a heuristic to determine that cusolver is faster than MAGMA for the following sizes.
-  auto m = input.size(-2);
-  // exclude complex128 since nan_to_num_ does not work with it.
-  if ((batch_size == 1 || (batch_size <= 8 && m <= 16) || !use_magma_ ) && !input.is_complex()) {
-    lu_looped_cusolver(input, pivots, infos, compute_pivots);
+  auto preferred_backend = at::globalContext().linalgPreferredBackend();
+  switch (preferred_backend) {
+    case at::LinalgBackend::Cusolver:
+      lu_looped_cusolver(input, pivots, infos, compute_pivots);
+      break;
+    case at::LinalgBackend::Magma:
+      if (batch_size == 1) {
+        lu_looped_magma(input, pivots, infos, compute_pivots);
+      } else {
+        lu_batched_magma(input, pivots, infos, compute_pivots);
+      }
+      break;
+    default:
+      // Use a heuristic to determine that cusolver is faster than MAGMA for the following sizes.
+      auto m = input.size(-2);
+      // exclude complex128 since nan_to_num_ does not work with it.
+      if ((batch_size == 1 ||
+           (batch_size <= 8 && m <= 16) ||
+           !use_magma_)
+          && !input.is_complex()) {
+        lu_looped_cusolver(input, pivots, infos, compute_pivots);
+      } else {
+        lu_batched_magma(input, pivots, infos, compute_pivots);
+      }
   }
 #else
   if (batch_size == 1) {
     lu_looped_magma(input, pivots, infos, compute_pivots);
   }
-#endif // USE_CUSOLVER
   else {
     lu_batched_magma(input, pivots, infos, compute_pivots);
   }
+#endif // USE_CUSOLVER
 }
 
 REGISTER_CUDA_DISPATCH(lu_stub, &apply_lu);
@@ -2064,12 +2126,12 @@ Tensor& orgqr_kernel_impl(Tensor& result, const Tensor& tau) {
 // See discussions in https://github.com/pytorch/pytorch/pull/51348 for comparison of cuSOLVER-MAGMA
 // and Windows failure.
 // For reference here is the MAGMA-based implementation: https://gist.github.com/IvanYashchuk/2db50002c9d3c1462ff769e6410ad983
-#if defined(USE_CUSOLVER)
-  return orgqr_helper_cusolver(result, tau); // cusolver
-#else
-  TORCH_CHECK(false, "Calling torch.orgqr on a CUDA tensor requires compiling ",
-    "PyTorch with cuSOLVER. Please use PyTorch built with cuSOLVER support.");
-#endif
+  #if defined(USE_CUSOLVER)
+    return orgqr_helper_cusolver(result, tau); // cusolver
+  #else
+    TORCH_CHECK(false, "Calling torch.orgqr on a CUDA tensor requires compiling ",
+      "PyTorch with cuSOLVER. Please use PyTorch built with cuSOLVER support.");
+  #endif
 }
 
 REGISTER_CUDA_DISPATCH(orgqr_stub, &orgqr_kernel_impl);
@@ -2136,7 +2198,14 @@ void geqrf_magma(const Tensor& input, const Tensor& tau) {
 // This is a backend library dispatching helper function for calling looped batch implementation
 void geqrf_looped(const Tensor& input, const Tensor& tau) {
 #if defined(USE_CUSOLVER)
-  return geqrf_cusolver(input, tau);
+  auto preferred_backend = at::globalContext().linalgPreferredBackend();
+  switch (preferred_backend) {
+    case at::LinalgBackend::Magma:
+      return geqrf_magma(input, tau);
+    case at::LinalgBackend::Cusolver:
+    default:
+      return geqrf_cusolver(input, tau);
+  }
 #else
   return geqrf_magma(input, tau);
 #endif
@@ -2273,9 +2342,16 @@ std::tuple<Tensor, Tensor> linalg_qr_helper_magma(const Tensor& self, c10::string_view mode) {
 
 std::tuple<Tensor, Tensor> _linalg_qr_helper_cuda(const Tensor& input, c10::string_view mode) {
 #if defined(USE_CUSOLVER)
-  // _linalg_qr_helper_default is a generic function that is implemented using
-  // geqrf_stub and orgqr_stub. It dispatches to cuSOLVER for CUDA inputs if USE_CUSOLVER is defined
-  return _linalg_qr_helper_default(input, mode);
+  auto preferred_backend = at::globalContext().linalgPreferredBackend();
+  switch (preferred_backend) {
+    case at::LinalgBackend::Magma:
+      return linalg_qr_helper_magma(input, mode);
+    case at::LinalgBackend::Cusolver:
+    default:
+      // _linalg_qr_helper_default is a generic function that is implemented using
+      // geqrf_stub and orgqr_stub. It dispatches to cuSOLVER for CUDA inputs if USE_CUSOLVER is defined
+      return _linalg_qr_helper_default(input, mode);
+  }
 #else
   return linalg_qr_helper_magma(input, mode);
 #endif
@@ -2432,7 +2508,15 @@ void linalg_eigh_magma(const Tensor& eigenvalues, const Tensor& eigenvectors, const Tensor& infos, bool upper, bool compute_eigenvectors) {
 
 void linalg_eigh_kernel(const Tensor& eigenvalues, const Tensor& eigenvectors, const Tensor& infos, bool upper, bool compute_eigenvectors) {
 #if defined(USE_CUSOLVER)
-  linalg_eigh_cusolver(eigenvalues, eigenvectors, infos, upper, compute_eigenvectors);
+  auto preferred_backend = at::globalContext().linalgPreferredBackend();
+  switch (preferred_backend) {
+    case at::LinalgBackend::Magma:
+      linalg_eigh_magma(eigenvalues, eigenvectors, infos, upper, compute_eigenvectors);
+      break;
+    case at::LinalgBackend::Cusolver:
+    default:
+      linalg_eigh_cusolver(eigenvalues, eigenvectors, infos, upper, compute_eigenvectors);
+  }
 #else
   linalg_eigh_magma(eigenvalues, eigenvectors, infos, upper, compute_eigenvectors);
 #endif
@@ -2731,7 +2815,14 @@ std::tuple<Tensor, Tensor, Tensor> _svd_helper_cuda_legacy(const Tensor& self, bool some, bool compute_uv) {
 
 std::tuple<Tensor, Tensor, Tensor> _svd_helper_cuda(const Tensor& self, bool some, bool compute_uv) {
 #ifdef USE_CUSOLVER
-  return _svd_helper_cuda_lib(self, some, compute_uv);
+  auto preferred_backend = at::globalContext().linalgPreferredBackend();
+  switch (preferred_backend) {
+    case at::LinalgBackend::Magma:
+      return _svd_helper_cuda_legacy(self, some, compute_uv);
+    case at::LinalgBackend::Cusolver:
+    default:
+      return _svd_helper_cuda_lib(self, some, compute_uv);
+  }
 #else
   return _svd_helper_cuda_legacy(self, some, compute_uv);
 #endif
@@ -3046,10 +3137,17 @@ void linalg_lstsq_gels(const Tensor& A, const Tensor& B, const Tensor& /*infos*/) {
 
 void gels_looped(const Tensor& a, Tensor& b, Tensor& infos) {
 #if defined(USE_CUSOLVER)
-  // linalg_lstsq_gels is a generic function that is implemented using
-  // geqrf_stub, ormqr_stub, and triangular_solve_stub
-  // It dispatches to cuSOLVER for CUDA inputs if USE_CUSOLVER is defined
-  return linalg_lstsq_gels(a, b, infos);
+  auto preferred_backend = at::globalContext().linalgPreferredBackend();
+  switch (preferred_backend) {
+    case at::LinalgBackend::Magma:
+      return gels_magma(a, b, infos);
+    case at::LinalgBackend::Cusolver:
+    default:
+      // linalg_lstsq_gels is a generic function that is implemented using
+      // geqrf_stub, ormqr_stub, and triangular_solve_stub
+      // It dispatches to cuSOLVER for CUDA inputs if USE_CUSOLVER is defined
+      return linalg_lstsq_gels(a, b, infos);
+  }
 #else
   return gels_magma(a, b, infos);
 #endif
```
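As the summary argues, no fixed heuristic wins on every device, library version, and shape, so the practical way to use this switch is to measure. A hedged sketch of timing the two backends for one workload and keeping the faster one (the harness, shapes, and iteration counts are ours, not part of this PR):

```python
import time
import torch

def time_backend(backend, fn, warmup=3, iters=10):
    # Set the process-wide preference, then take a rough wall-clock average of fn().
    torch.backends.cuda.preferred_linalg_library(backend)
    for _ in range(warmup):
        fn()
    torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(iters):
        fn()
    torch.cuda.synchronize()
    return (time.perf_counter() - start) / iters

# A batch of well-conditioned matrices; matrix inverse is one of the ops rewired above.
a = torch.randn(8, 256, 256, device='cuda') + 256 * torch.eye(256, device='cuda')
for backend in ('cusolver', 'magma'):
    ms = time_backend(backend, lambda: torch.linalg.inv(a)) * 1e3
    print(f'{backend}: {ms:.3f} ms')
```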

docs/source/backends.rst (+2)

```diff
@@ -45,6 +45,8 @@ torch.backends.cuda
 
     Clears the cuFFT plan cache.
 
+.. autofunction:: torch.backends.cuda.preferred_linalg_library
+
 
 torch.backends.cudnn
 ^^^^^^^^^^^^^^^^^^^^
```
