Commit f4db472

[Target] Support CUDA device function calls
This commit adds support for CUDA device function calls by:

1. Modifying the calling convention handling in CUDA codegen to support both device kernel launches and device function calls
2. Updating the function signature printing to emit the appropriate CUDA attribute (`__global__` vs `__device__`) based on the calling convention
3. Adding a test case demonstrating device function calls
4. Fixing target handling in `split_host_device_mods` to properly handle device function dictionaries
5. Adding a safety check for global symbol extraction

The changes enable proper compilation and execution of CUDA device functions that can be called from CUDA kernels. Example:

```python
@I.ir_module
class Module:
    @T.prim_func(private=True)
    def add(a: T.float32, b: T.float32) -> T.float32:
        return a + b

    @T.prim_func
    def main(
        A: T.Buffer((1024, 1024), "float32"),
        B: T.Buffer((1024, 1024), "float32"),
        C: T.Buffer((1024, 1024), "float32"),
    ):
        for bx in T.thread_binding(1024, "blockIdx.x"):
            for tx in T.thread_binding(1024, "threadIdx.x"):
                C[bx, tx] = Module.add(A[bx, tx], B[bx, tx])
```
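The effect on the generated code can be checked by compiling the example and inspecting the emitted CUDA source, which is essentially the check the new test performs. A minimal sketch, assuming the `Module` above is in scope:

```python
import tvm

# Sketch: compile the example Module for CUDA and look at the emitted source.
lib = tvm.compile(Module, target="cuda")
cuda_code = lib.mod.imported_modules[0].get_source()

# The private helper is emitted as a callable __device__ function ...
assert 'extern "C" __device__ float add(float a, float b)' in cuda_code
# ... while the thread-bound entry point is still emitted as a __global__ kernel.
assert 'extern "C" __global__' in cuda_code
```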
1 parent 9cb6705 commit f4db472

File tree: 6 files changed, +60 −13 lines changed

python/tvm/tir/build.py

Lines changed: 12 additions & 7 deletions

```diff
@@ -54,18 +54,23 @@ class CallConv(enum.IntEnum):
         kDeviceKernelLaunch = 2
 
     host_mod = tvm.tir.transform.Filter(
-        lambda f: int(f.attrs.get("calling_conv", CallConv.kDefault))
-        != int(CallConv.kDeviceKernelLaunch)
+        lambda f: "cpu" in str(f.attrs.get("target", "cpu"))
     )(mod)
     device_mod = tvm.tir.transform.Filter(
-        lambda f: int(f.attrs.get("calling_conv", CallConv.kDefault))
-        == int(CallConv.kDeviceKernelLaunch)
+        lambda f: "cpu" not in str(f.attrs.get("target", "cpu"))
     )(mod)
+    # TODO(syfeng): Here we use str as key since target hash is not correct
+    target_str2target = {}
+    device_func_dict = {}
     device_mod_dict = {}
     for gv, func in device_mod.functions.items():
-        device_mod_dict.setdefault(func.attrs.get("target", None), dict()).update({gv: func})
-    for target, funcs in device_mod_dict.items():
-        device_mod_dict[target] = tvm.IRModule(funcs, attrs=device_mod.attrs)
+        target = func.attrs.get("target", None)
+        target_str = str(target) if target is not None else ""
+        target_str2target[target_str] = target  # This might be overridden by the last one
+        device_func_dict.setdefault(target_str, dict()).update({gv: func})
+    for target_str in target_str2target.keys():
+        target = target_str2target[target_str]
+        device_mod_dict[target] = tvm.IRModule(device_func_dict[target_str], attrs=device_mod.attrs)
     return host_mod, device_mod_dict
 
 
```
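The split now keys on the `target` attribute rather than the calling convention because a `__device__` helper such as `add` keeps the default calling convention yet must still land in the device module. A minimal sketch of the resulting grouping, assuming every PrimFunc already carries a `target` attribute and calling the helper from this file (`split_host_device_mods`) directly, purely for illustration:

```python
# Illustrative sketch of the new grouping, not the exact TVM build pipeline.
host_mod, device_mod_dict = split_host_device_mods(mod)

# host_mod holds functions whose target string contains "cpu" (host-side launchers);
# device_mod_dict maps each distinct device Target to one IRModule. For the example
# module, both the kernel generated for `main` and the callable `add` end up under
# the single CUDA target entry.
for target, device_mod in device_mod_dict.items():
    print(target, [gv.name_hint for gv in device_mod.functions])
```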

src/target/build_common.h

Lines changed: 3 additions & 1 deletion

```diff
@@ -56,7 +56,9 @@ inline std::unordered_map<std::string, runtime::FunctionInfo> ExtractFuncInfo(co
       }
     }
     auto global_symbol = f->GetAttr<String>(tvm::attr::kGlobalSymbol);
-    fmap[static_cast<std::string>(global_symbol.value())] = info;
+    if (global_symbol) {
+      fmap[static_cast<std::string>(global_symbol.value())] = info;
+    }
   }
   return fmap;
 }
```
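The guard matters because a private PrimFunc, such as the `add` helper declared with `@T.prim_func(private=True)` in the example, carries no `global_symbol` attribute. A small sketch of that invariant, assuming the example `Module`:

```python
# Sketch: private device helpers have no "global_symbol", so ExtractFuncInfo must
# skip them rather than dereference an empty Optional.
add_func = Module["add"]
assert add_func.attrs is None or add_func.attrs.get("global_symbol") is None

# The public entry point keeps its symbol and is still recorded in the FunctionInfo map.
assert Module["main"].attrs.get("global_symbol") == "main"
```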

src/target/opt/build_cuda_on.cc

Lines changed: 6 additions & 3 deletions

```diff
@@ -134,9 +134,12 @@ runtime::Module BuildCUDA(IRModule mod, Target target) {
   for (auto [gvar, base_func] : mod->functions) {
     ICHECK(base_func->IsInstance<PrimFuncNode>()) << "CodeGenCUDA: Can only take PrimFunc";
     auto prim_func = Downcast<PrimFunc>(base_func);
-    auto calling_conv = prim_func->GetAttr<Integer>(tvm::attr::kCallingConv);
-    ICHECK(calling_conv == CallingConv::kDeviceKernelLaunch)
-        << "CodeGenCUDA: expect calling_conv equals CallingConv::kDeviceKernelLaunch";
+    auto calling_conv =
+        prim_func->GetAttr<Integer>(tvm::attr::kCallingConv, Integer(tvm::CallingConv::kDefault));
+    ICHECK(calling_conv == CallingConv::kDeviceKernelLaunch ||
+           calling_conv == CallingConv::kDefault)
+        << "CodeGenCUDA: expect calling_conv equals CallingConv::kDeviceKernelLaunch or "
+           "CallingConv::kDefault";
     functions.Set(gvar, prim_func);
   }
 
```

src/target/source/codegen_cuda.cc

Lines changed: 13 additions & 1 deletion

```diff
@@ -140,7 +140,19 @@ void CodeGenCUDA::Init(bool output_ssa) {
   ICHECK_EQ(vid_global_barrier_state_, runtime::symbol::tvm_global_barrier_state);
 }
 
-void CodeGenCUDA::PrintFuncPrefix(std::ostream& os) { os << "extern \"C\" __global__ "; }
+void CodeGenCUDA::PrintFunctionSignature(const String& function_name, const PrimFunc& func,
+                                         std::ostream& os) {
+  auto calling_conv =
+      func->GetAttr<Integer>(tvm::attr::kCallingConv, Integer(tvm::CallingConv::kDefault));
+  if (calling_conv == CallingConv::kDeviceKernelLaunch) {
+    os << "extern \"C\" __global__ ";
+  } else if (calling_conv == CallingConv::kDefault) {
+    os << "extern \"C\" __device__ ";
+  } else {
+    LOG(FATAL) << "Unsupported calling convention for cuda codegen: " << calling_conv;
+  }
+  CodeGenC::PrintFunctionSignature(function_name, func, os);
+}
 
 class ThreadIdxExtractor : public tir::StmtVisitor {
  private:
```
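In short, the new `PrintFunctionSignature` picks the CUDA qualifier from the calling convention. A sketch of that mapping on the Python side, reusing the `device_mod_dict` from the earlier sketch (CallConv values as defined in `python/tvm/tir/build.py`):

```python
# Sketch: which qualifier each device-side PrimFunc receives from the CUDA codegen.
K_DEFAULT, K_DEVICE_KERNEL_LAUNCH = 0, 2  # CallConv.kDefault / CallConv.kDeviceKernelLaunch

for target, device_mod in device_mod_dict.items():
    for gvar, func in device_mod.functions.items():
        conv = int(func.attrs.get("calling_conv", K_DEFAULT)) if func.attrs else K_DEFAULT
        # kDeviceKernelLaunch -> __global__, kDefault -> __device__; anything else is a codegen error.
        qualifier = "__global__" if conv == K_DEVICE_KERNEL_LAUNCH else "__device__"
        print(f"{gvar.name_hint}: {qualifier}")
```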

src/target/source/codegen_cuda.h

Lines changed: 2 additions & 1 deletion

```diff
@@ -46,7 +46,8 @@ class CodeGenCUDA final : public CodeGenC {
            enable_fp4_ || need_math_constants_h_ || need_mma_h_);
   }
   // override behavior
-  void PrintFuncPrefix(std::ostream& os) final;
+  void PrintFunctionSignature(const String& function_name, const PrimFunc& func,
+                              std::ostream& os) final;
   void PrintExtraAttrs(const PrimFunc& f, std::ostream& os) final;  // NOLINT(*)
   void VisitStmt_(const ForNode* op) final;
   void PrintStorageSync(const CallNode* op) final;
```

tests/python/codegen/test_target_codegen_cuda.py

Lines changed: 24 additions & 0 deletions

```diff
@@ -23,6 +23,7 @@
 import tvm.testing
 from tvm import te, topi
 from tvm.contrib.nvcc import have_bf16, have_fp16, have_int8
+from tvm.script import ir as I
 from tvm.script import tir as T
 
 
@@ -746,5 +747,28 @@ def func(A: T.Buffer((4,), "uint32"), B: T.Buffer((4,), "uint8")) -> None:
     tvm.compile(func, target="cuda")
 
 
+@tvm.testing.requires_cuda
+def test_cuda_device_func_call():
+    @I.ir_module
+    class Module:
+        @T.prim_func(private=True)
+        def add(a: T.float32, b: T.float32) -> T.float32:
+            return a + b
+
+        @T.prim_func
+        def main(
+            A: T.Buffer((1024, 1024), "float32"),
+            B: T.Buffer((1024, 1024), "float32"),
+            C: T.Buffer((1024, 1024), "float32"),
+        ):
+            for bx in T.thread_binding(1024, "blockIdx.x"):
+                for tx in T.thread_binding(1024, "threadIdx.x"):
+                    C[bx, tx] = Module.add(A[bx, tx], B[bx, tx])
+
+    lib = tvm.compile(Module, target="cuda")
+    cuda_code = lib.mod.imported_modules[0].get_source()
+    assert 'extern "C" __device__ float add(float a, float b) {\n return (a + b);\n}' in cuda_code
+
+
 if __name__ == "__main__":
     tvm.testing.main()
```
