cuda: Dynamically load libcudart.so

yexiang-aws · yexiang-aws · commit 791a48d0412c · 2025-10-20T10:20:47.000-07:00
Change the --enable-cudart-dynamic configure option so that, when enabled,
the plugin is not linked against the CUDA runtime library and loads
libcudart.so with dlopen() at run time instead. Enhance entry point detection
for cross-version compatibility and fix functional test linking with a
separate CUDA_RUNTIME_LIBS variable.

Signed-off-by: Ye Xiang <yexiang@amazon.com>
diff --git a/m4/check_pkg_cuda.m4 b/m4/check_pkg_cuda.m4
@@ -20,9 +20,9 @@ AC_DEFUN([CHECK_PKG_CUDA], [
     [AS_HELP_STRING([--enable-cudart-dynamic],
                     [link cudart dynamically (default=link statically)])],,
     [enable_cudart_dynamic=no])
-  enable_cudart_dynamic=`echo $enable_cudart_dynamic`
   case $enable_cudart_dynamic in
-    yes | no) ;; # only acceptable options.
+    yes) enable_cudart_dynamic_define=1 ;;
+    no)  enable_cudart_dynamic_define=0 ;;
     *) AC_MSG_ERROR([unknown option '$enable_cudart_dynamic' for --enable-cudart-dynamic]) ;;
   esac
   AC_MSG_RESULT([${enable_cudart_dynamic}])
@@ -39,7 +39,11 @@ AC_DEFUN([CHECK_PKG_CUDA], [
          cuda_ldpath="${cuda_realpath}/lib64"
          CUDA_LDFLAGS="-L${cuda_ldpath}"
          CUDA_CPPFLAGS="-isystem ${cuda_realpath}/include"
-         CUDA_LIBS="-l${cudart_lib} -lrt -ldl"
+         AS_IF([test "x${enable_cudart_dynamic}" = "xyes"],
+               [CUDA_LIBS="-lrt -ldl"
+                CUDA_RUNTIME_LIBS="-l${cudart_lib}"],
+               [CUDA_LIBS="-l${cudart_lib} -lrt -ldl"
+                CUDA_RUNTIME_LIBS=""])
          LDFLAGS="${CUDA_LDFLAGS} ${LDFLAGS}"
          LIBS="${CUDA_LIBS} ${LIBS}"
          CPPFLAGS="${CUDA_CPPFLAGS} ${CPPFLAGS}"
@@ -48,7 +52,7 @@ AC_DEFUN([CHECK_PKG_CUDA], [
   AS_IF([test "${check_pkg_found}" = "yes"],
         [AC_SEARCH_LIBS(
          [cudaGetDriverEntryPoint],
-         [${cudartlib}],
+         [${cudart_lib}],
          [],
          [check_pkg_found=no],
          [-ldl -lrt])])
@@ -101,11 +105,13 @@ AC_DEFUN([CHECK_PKG_CUDA], [
   AC_DEFINE_UNQUOTED([HAVE_CUDA_DMABUF_SUPPORT], [${check_cuda_dmabuf_define}], [Defined to 1 if CUDA DMA-BUF support is available])
   AC_DEFINE_UNQUOTED([HAVE_CUDA_DMABUF_MAPPING_TYPE_PCIE], [${check_cuda_dmabuf_mapping_type_pcie}], [Defined to 1 if CUDA DMA mapping type PCIE support is available])
   AC_DEFINE_UNQUOTED([HAVE_CUDA_GDRFLUSH_SUPPORT], [${check_cuda_gdr_flush_define}], [Defined to 1 if CUDA cuFlushGPUDirectRDMAWrites support is available])
+  AC_DEFINE_UNQUOTED([ENABLE_CUDART_DYNAMIC], [${enable_cudart_dynamic_define}], [Defined to 1 if CUDA dynamic runtime linking is enabled])
   AM_CONDITIONAL([HAVE_CUDA], [test "${check_pkg_found}" = "yes"])
 
   AC_SUBST([CUDA_LDFLAGS])
   AC_SUBST([CUDA_CPPFLAGS])
   AC_SUBST([CUDA_LIBS])
+  AC_SUBST([CUDA_RUNTIME_LIBS])
 
   CPPFLAGS="${check_pkg_CPPFLAGS_save}"
   LDFLAGS="${check_pkg_LDFLAGS_save}"
diff --git a/src/nccl_ofi_cuda.cpp b/src/nccl_ofi_cuda.cpp
@@ -6,6 +6,8 @@
 #include "config.h"
 
 #include <errno.h>
+#include <dlfcn.h>
+#include <memory>
 #include <cudaTypedefs.h>
 #include <cuda_runtime_api.h>
 
@@ -14,50 +16,62 @@
 #include "nccl_ofi_log.h"
 #include "nccl_ofi_param.h"
 
+/* CUDA Runtime function pointers - only for functions without driver equivalents */
+static cudaError_t (*pfn_cudaRuntimeGetVersion)(int *runtimeVersion) = NULL;
+
+/* Both entry point functions for cross-version compatibility */
+static cudaError_t (*pfn_cudaGetDriverEntryPointByVersion)(const char *symbol, void **funcPtr, unsigned int cudaVersion, unsigned long long flags, enum cudaDriverEntryPointQueryResult *driverStatus) = NULL;
+static cudaError_t (*pfn_cudaGetDriverEntryPoint)(const char *symbol, void **funcPtr, unsigned long long flags, enum cudaDriverEntryPointQueryResult *driverStatus) = NULL;
+
+#if ENABLE_CUDART_DYNAMIC
+
+/* Deleter for std::unique_ptr<void, ...>: calls dlclose() on any non-null library handle. */
+struct DlcloseDeleter {
+	void operator()(void *lib_handle) const {
+		if (lib_handle == nullptr) { return; }
+		dlclose(lib_handle);
+	}
+};
+
+/* Global unique_ptr to automatically call dlclose when plugin is unloaded */
+static std::unique_ptr<void, DlcloseDeleter> cudaruntime_lib;
+#endif
+
 #define DECLARE_CUDA_FUNCTION(function, version) static PFN_##function##_v##version pfn_##function = NULL
 
-#if CUDART_VERSION >= 13000
+/* Simple function resolution with fallback for cross-version compatibility */
 #define RESOLVE_CUDA_FUNCTION(function, version) do {                                                                  \
-		enum cudaDriverEntryPointQueryResult result;                                                            \
-		cudaError_t err =                                                                                       \
-			cudaGetDriverEntryPointByVersion(#function, (void **)&pfn_##function, version, cudaEnableDefault, &result); \
-		if (err != cudaSuccess) {                                                                               \
-			switch (result) {                                                                               \
-			case cudaDriverEntryPointSymbolNotFound:                                                        \
-				NCCL_OFI_WARN("Failed to resolve CUDA function %s", #function);                         \
-				break;                                                                                  \
-			case cudaDriverEntryPointVersionNotSufficent:                                                   \
-				NCCL_OFI_WARN("Insufficient driver to use CUDA function %s", #function);                \
-				break;                                                                                  \
-			case cudaDriverEntryPointSuccess:                                                               \
-			default:                                                                                        \
-				NCCL_OFI_WARN("Unexpected cudaDriverEntryPointQueryResutlt value %d", (int)result);     \
-				break;                                                                                  \
+		enum cudaDriverEntryPointQueryResult result = cudaDriverEntryPointSymbolNotFound;                   \
+		cudaError_t err = cudaErrorUnknown;                                                                     \
+		bool resolved = false;                                                                                  \
+		/* Try versioned entry point first (CUDA 13+ preferred) */                                             \
+		if (pfn_cudaGetDriverEntryPointByVersion != NULL) {                                                    \
+			err = pfn_cudaGetDriverEntryPointByVersion(#function, (void **)&pfn_##function, version, cudaEnableDefault, &result); \
+			if (err == cudaSuccess && pfn_##function != NULL) {                                             \
+				resolved = true;                                                                         \
 			}                                                                                               \
 		}                                                                                                       \
-	} while (0);
-#else
-#define RESOLVE_CUDA_FUNCTION(function, version) do {                                                                   \
-		enum cudaDriverEntryPointQueryResult result;                                                            \
-		cudaError_t err =                                                                                       \
-			cudaGetDriverEntryPoint(#function, (void **)&pfn_##function, cudaEnableDefault, &result);       \
-		if (err != cudaSuccess) {                                                                               \
-			switch (result) {                                                                               \
-			case cudaDriverEntryPointSymbolNotFound:                                                        \
-				NCCL_OFI_WARN("Failed to resolve CUDA function %s", #function);             	        \
-				break;                                                                                  \
-			case cudaDriverEntryPointVersionNotSufficent:                                                   \
-				NCCL_OFI_WARN("Insufficient driver to use CUDA function %s", #function);                \
-				break;                                                                                  \
-			case cudaDriverEntryPointSuccess:                                                               \
-			default:                                                                                        \
-				NCCL_OFI_WARN("Unexpected cudaDriverEntryPointQueryResutlt value %d", (int)result);     \
-				break;                                                                                  \
+		/* Fallback to legacy entry point for CUDA 12 compatibility */                                         \
+		if (!resolved && pfn_cudaGetDriverEntryPoint != NULL) {                                                \
+			err = pfn_cudaGetDriverEntryPoint(#function, (void **)&pfn_##function, cudaEnableDefault, &result); \
+			if (err == cudaSuccess && pfn_##function != NULL) {                                             \
+				resolved = true;                                                                         \
 			}                                                                                               \
 		}                                                                                                       \
+		if (!resolved) {                                                                                        \
+			NCCL_OFI_WARN("Failed to resolve CUDA function %s (last error: %d, result: %d)", #function, err, result);                             \
+			return -ENOTSUP;                                                                                \
+		}                                                                                                       \
 	} while (0);
-#endif
 
+#define LOAD_CUDA_RUNTIME_SYM(handle, sym) do { /* bind pfn_<sym> via dlsym(); a missing symbol makes the caller return -ENOTSUP */ \
+		pfn_##sym = (decltype(pfn_##sym))dlsym((handle), #sym);           \
+		if (pfn_##sym == NULL) {                                          \
+			NCCL_OFI_WARN("Failed to load CUDA runtime symbol %s", #sym); \
+			return -ENOTSUP;                                          \
+		}                                                                 \
+	} while (0)
+/* Use driver APIs wherever possible - they are version-stable */
 DECLARE_CUDA_FUNCTION(cuDriverGetVersion, 2020);
 DECLARE_CUDA_FUNCTION(cuCtxGetDevice, 2000);
 DECLARE_CUDA_FUNCTION(cuDeviceGetAttribute, 2000);
@@ -77,13 +91,58 @@ int nccl_net_ofi_cuda_init(void)
 {
 	int driverVersion = -1;
 	int runtimeVersion = -1;
+	cudaError_t res;
+	CUresult cu_ret;
+
+#if ENABLE_CUDART_DYNAMIC
+	/* cudart is not linked in when --enable-cudart-dynamic is set, so load it with dlopen() here. */
+	/* Load the library only once; the handle is held by cudaruntime_lib until that object is destroyed. */
+	if (cudaruntime_lib == nullptr) {
+		(void) dlerror(); /* Clear any previous errors */
+		cudaruntime_lib = std::unique_ptr<void, DlcloseDeleter>(dlopen("libcudart.so", RTLD_NOW));
+		if (!cudaruntime_lib) {
+			NCCL_OFI_WARN("Failed to find CUDA Runtime library: %s", dlerror());
+			return -ENOTSUP;
+		}
+	}
 
-	cudaError_t res = cudaRuntimeGetVersion(&runtimeVersion);
+	LOAD_CUDA_RUNTIME_SYM(cudaruntime_lib.get(), cudaRuntimeGetVersion);
+
+	/* Get runtime version first to determine which entry point functions to load */
+	res = pfn_cudaRuntimeGetVersion(&runtimeVersion);
 	if (res != cudaSuccess) {
 		NCCL_OFI_WARN("Failed to query CUDA runtime version.");
 		return -EINVAL;
 	}
 
+	if (runtimeVersion >= 13000) {
+		LOAD_CUDA_RUNTIME_SYM(cudaruntime_lib.get(), cudaGetDriverEntryPointByVersion);
+	} else {
+		LOAD_CUDA_RUNTIME_SYM(cudaruntime_lib.get(), cudaGetDriverEntryPoint);
+	}
+
+	if (pfn_cudaGetDriverEntryPointByVersion == NULL && pfn_cudaGetDriverEntryPoint == NULL) {
+		NCCL_OFI_WARN("No CUDA driver entry point functions available in runtime");
+		return -ENOTSUP;
+	}
+#else
+	/* Static CUDA runtime - use direct function calls */
+	pfn_cudaRuntimeGetVersion = cudaRuntimeGetVersion;
+
+	/* Get runtime version first to determine which entry point functions to use */
+	res = cudaRuntimeGetVersion(&runtimeVersion);
+	if (res != cudaSuccess) {
+		NCCL_OFI_WARN("Failed to query CUDA runtime version.");
+		return -EINVAL;
+	}
+
+#if CUDART_VERSION >= 13000
+	pfn_cudaGetDriverEntryPointByVersion = cudaGetDriverEntryPointByVersion;
+#else
+	pfn_cudaGetDriverEntryPoint = cudaGetDriverEntryPoint;
+#endif
+#endif
+
 	RESOLVE_CUDA_FUNCTION(cuDriverGetVersion, 2020);
 	RESOLVE_CUDA_FUNCTION(cuCtxGetDevice, 2000);
 	RESOLVE_CUDA_FUNCTION(cuDeviceGetAttribute, 2000);
@@ -99,16 +158,16 @@ int nccl_net_ofi_cuda_init(void)
 	RESOLVE_CUDA_FUNCTION(cuMemFree, 3020);
 	RESOLVE_CUDA_FUNCTION(cuMemcpy, 4000);
 
-	CUresult cu_ret = pfn_cuDriverGetVersion(&driverVersion);
+	cu_ret = pfn_cuDriverGetVersion(&driverVersion);
 	if (cu_ret != CUDA_SUCCESS) {
 		NCCL_OFI_WARN("Failed to query CUDA driver version.");
 		return -EINVAL;
 	}
 
 	NCCL_OFI_INFO(NCCL_INIT | NCCL_NET,
-		      "Using CUDA driver version %d with runtime %d",
-		      driverVersion,
-		      runtimeVersion);
+	              "Using CUDA driver version %d with runtime %d",
+	              driverVersion,
+	              runtimeVersion);
 
 	if (HAVE_CUDA_GDRFLUSH_SUPPORT && nccl_net_ofi_cuda_have_gdr_support_attr() && ofi_nccl_cuda_flush_enable()) {
 		NCCL_OFI_WARN("CUDA flush enabled");
@@ -137,7 +196,6 @@ int nccl_net_ofi_cuda_flush_gpudirect_rdma_writes(void)
 #endif
 }
 
-
 int nccl_net_ofi_cuda_mem_alloc(void **ptr, size_t size)
 {
 	CUdeviceptr d_ptr;
diff --git a/tests/functional/Makefile.am b/tests/functional/Makefile.am
@@ -11,7 +11,7 @@ AM_CPPFLAGS += -isystem $(abs_top_srcdir)/3rd-party
 AM_CPPFLAGS += -isystem $(abs_top_srcdir)/3rd-party/nccl/$(DEVICE_INTERFACE)/include
 AM_CPPFLAGS += $(MPI_CPPFLAGS) $(CUDA_CPPFLAGS)
 AM_LDFLAGS = $(MPI_LDFLAGS) $(CUDA_LDFLAGS)
-LDADD = $(top_builddir)/src/libinternal_plugin.la $(MPI_LIBS) $(CUDA_LIBS)
+LDADD = $(top_builddir)/src/libinternal_plugin.la $(MPI_LIBS) $(CUDA_LIBS) $(CUDA_RUNTIME_LIBS)
 
 # this is a little jenky, but we've always assumed we had wrapper compilers
 # available for MPI.  We don't want to just override CXX to get mpicxx used,