Skip to content

[NVPTX] Add intrinsics for the bmsk instruction #139299

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 9, 2025

Conversation

AlexMaclean
Copy link
Member

No description provided.

@llvmbot
Copy link
Member

llvmbot commented May 9, 2025

@llvm/pr-subscribers-backend-nvptx

Author: Alex MacLean (AlexMaclean)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/139299.diff

4 Files Affected:

  • (modified) llvm/docs/NVPTXUsage.rst (+26)
  • (modified) llvm/include/llvm/IR/IntrinsicsNVVM.td (+10)
  • (modified) llvm/lib/Target/NVPTX/NVPTXIntrinsics.td (+12)
  • (added) llvm/test/CodeGen/NVPTX/bmsk.ll (+77)
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index b6222300e4d4a..51bbfd0a5c88d 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -598,6 +598,32 @@ operand %b clamped to the range [0, 32]. The N lowest bits are then
 zero-extended the case of the '``zext``' variants, or sign-extended the case of
 the '``sext``' variants. If N is 0, the result is 0.
 
+'``llvm.nvvm.bmsk.{wrap,clamp}``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+    declare i32 @llvm.nvvm.bmsk.wrap(i32 %a, i32 %b)
+    declare i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 %b)
+
+Overview:
+"""""""""
+
+The '``llvm.nvvm.bmsk.{wrap,clamp}``' family of intrinsics creates a bit mask
+given a starting bit position and a bit width.
+
+Semantics:
+""""""""""
+
+The '``llvm.nvvm.bmsk.{wrap,clamp}``' family of intrinsics returns a value with
+all bits set to 0 except for %b bits starting at bit position %a. For the
+'``wrap``' variants, the values of %a and %b modulo 32 are used. For the
+'``clamp``' variants, the values of %a and %b are clamped to the range [0, 32],
+which in practice is equivalent to using them as is.
+
 TMA family of Intrinsics
 ------------------------
 
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 2851206f2e84a..640fdf3f86326 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1367,6 +1367,16 @@ let TargetPrefix = "nvvm" in {
           [llvm_i32_ty, llvm_i32_ty],
           [IntrNoMem, IntrSpeculatable]>;
 
+
+//
+// BMSK - bit mask
+//
+  foreach mode = ["wrap", "clamp"] in
+    def int_nvvm_bmsk_ # mode :
+      DefaultAttrsIntrinsic<[llvm_i32_ty],
+        [llvm_i32_ty, llvm_i32_ty],
+        [IntrNoMem, IntrSpeculatable]>;
+
 //
 // Convert
 //
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 7b139d7b79e7d..c339817a2d214 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1693,6 +1693,18 @@ foreach sign = ["s", "u"] in {
   }
 }
 
+//
+// BMSK
+//
+
+foreach mode = ["wrap", "clamp"] in {
+  defvar intrin = !cast<Intrinsic>("int_nvvm_bmsk_" # mode);
+  defm BMSK_ # mode
+    : I3Inst<"bmsk." # mode # ".b32",
+             intrin, I32RT, commutative = false,
+             requires = [hasSM<70>, hasPTX<76>]>;
+}
+
 //
 // Convert
 //
diff --git a/llvm/test/CodeGen/NVPTX/bmsk.ll b/llvm/test/CodeGen/NVPTX/bmsk.ll
new file mode 100644
index 0000000000000..ead4a42bc6c81
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/bmsk.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -o - < %s -mcpu=sm_70 -mattr=+ptx76 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mcpu=sm_70 -mattr=+ptx76 | %ptxas-verify -arch=sm_70 %}
+
+target triple = "nvptx64-unknown-cuda"
+
+define i32 @bmsk_wrap(i32 %a, i32 %b) {
+; CHECK-LABEL: bmsk_wrap(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bmsk_wrap_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [bmsk_wrap_param_1];
+; CHECK-NEXT:    bmsk.wrap.b32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.wrap(i32 %a, i32 %b)
+  ret i32 %c
+}
+
+define i32 @bmsk_clamp(i32 %a, i32 %b) {
+; CHECK-LABEL: bmsk_clamp(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bmsk_clamp_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [bmsk_clamp_param_1];
+; CHECK-NEXT:    bmsk.clamp.b32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 %b)
+  ret i32 %c
+}
+
+define i32 @bmsk_wrap_ii() {
+; CHECK-LABEL: bmsk_wrap_ii(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b32 %r1, 5;
+; CHECK-NEXT:    bmsk.wrap.b32 %r2, %r1, 6;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.wrap(i32 5, i32 6)
+  ret i32 %c
+}
+
+define i32 @bmsk_clamp_ir(i32 %a) {
+; CHECK-LABEL: bmsk_clamp_ir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bmsk_clamp_ir_param_0];
+; CHECK-NEXT:    bmsk.clamp.b32 %r2, %r1, 7;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 7)
+  ret i32 %c
+}
+
+define i32 @bmsk_wrap_ri(i32 %a) {
+; CHECK-LABEL: bmsk_wrap_ri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bmsk_wrap_ri_param_0];
+; CHECK-NEXT:    bmsk.wrap.b32 %r2, 5, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.wrap(i32 5, i32 %a)
+  ret i32 %c
+}

@llvmbot
Copy link
Member

llvmbot commented May 9, 2025

@llvm/pr-subscribers-llvm-ir

Author: Alex MacLean (AlexMaclean)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/139299.diff

4 Files Affected:

  • (modified) llvm/docs/NVPTXUsage.rst (+26)
  • (modified) llvm/include/llvm/IR/IntrinsicsNVVM.td (+10)
  • (modified) llvm/lib/Target/NVPTX/NVPTXIntrinsics.td (+12)
  • (added) llvm/test/CodeGen/NVPTX/bmsk.ll (+77)
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index b6222300e4d4a..51bbfd0a5c88d 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -598,6 +598,32 @@ operand %b clamped to the range [0, 32]. The N lowest bits are then
 zero-extended the case of the '``zext``' variants, or sign-extended the case of
 the '``sext``' variants. If N is 0, the result is 0.
 
+'``llvm.nvvm.bmsk.{wrap,clamp}``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+    declare i32 @llvm.nvvm.bmsk.wrap(i32 %a, i32 %b)
+    declare i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 %b)
+
+Overview:
+"""""""""
+
+The '``llvm.nvvm.bmsk.{wrap,clamp}``' family of intrinsics creates a bit mask
+given a starting bit position and a bit width.
+
+Semantics:
+""""""""""
+
+The '``llvm.nvvm.bmsk.{wrap,clamp}``' family of intrinsics returns a value with
+all bits set to 0 except for %b bits starting at bit position %a. For the
+'``wrap``' variants, the values of %a and %b modulo 32 are used. For the
+'``clamp``' variants, the values of %a and %b are clamped to the range [0, 32],
+which in practice is equivalent to using them as is.
+
 TMA family of Intrinsics
 ------------------------
 
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 2851206f2e84a..640fdf3f86326 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1367,6 +1367,16 @@ let TargetPrefix = "nvvm" in {
           [llvm_i32_ty, llvm_i32_ty],
           [IntrNoMem, IntrSpeculatable]>;
 
+
+//
+// BMSK - bit mask
+//
+  foreach mode = ["wrap", "clamp"] in
+    def int_nvvm_bmsk_ # mode :
+      DefaultAttrsIntrinsic<[llvm_i32_ty],
+        [llvm_i32_ty, llvm_i32_ty],
+        [IntrNoMem, IntrSpeculatable]>;
+
 //
 // Convert
 //
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 7b139d7b79e7d..c339817a2d214 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1693,6 +1693,18 @@ foreach sign = ["s", "u"] in {
   }
 }
 
+//
+// BMSK
+//
+
+foreach mode = ["wrap", "clamp"] in {
+  defvar intrin = !cast<Intrinsic>("int_nvvm_bmsk_" # mode);
+  defm BMSK_ # mode
+    : I3Inst<"bmsk." # mode # ".b32",
+             intrin, I32RT, commutative = false,
+             requires = [hasSM<70>, hasPTX<76>]>;
+}
+
 //
 // Convert
 //
diff --git a/llvm/test/CodeGen/NVPTX/bmsk.ll b/llvm/test/CodeGen/NVPTX/bmsk.ll
new file mode 100644
index 0000000000000..ead4a42bc6c81
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/bmsk.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -o - < %s -mcpu=sm_70 -mattr=+ptx76 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mcpu=sm_70 -mattr=+ptx76 | %ptxas-verify -arch=sm_70 %}
+
+target triple = "nvptx64-unknown-cuda"
+
+define i32 @bmsk_wrap(i32 %a, i32 %b) {
+; CHECK-LABEL: bmsk_wrap(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bmsk_wrap_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [bmsk_wrap_param_1];
+; CHECK-NEXT:    bmsk.wrap.b32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.wrap(i32 %a, i32 %b)
+  ret i32 %c
+}
+
+define i32 @bmsk_clamp(i32 %a, i32 %b) {
+; CHECK-LABEL: bmsk_clamp(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bmsk_clamp_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [bmsk_clamp_param_1];
+; CHECK-NEXT:    bmsk.clamp.b32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 %b)
+  ret i32 %c
+}
+
+define i32 @bmsk_wrap_ii() {
+; CHECK-LABEL: bmsk_wrap_ii(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    mov.b32 %r1, 5;
+; CHECK-NEXT:    bmsk.wrap.b32 %r2, %r1, 6;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.wrap(i32 5, i32 6)
+  ret i32 %c
+}
+
+define i32 @bmsk_clamp_ir(i32 %a) {
+; CHECK-LABEL: bmsk_clamp_ir(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bmsk_clamp_ir_param_0];
+; CHECK-NEXT:    bmsk.clamp.b32 %r2, %r1, 7;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 7)
+  ret i32 %c
+}
+
+define i32 @bmsk_wrap_ri(i32 %a) {
+; CHECK-LABEL: bmsk_wrap_ri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bmsk_wrap_ri_param_0];
+; CHECK-NEXT:    bmsk.wrap.b32 %r2, 5, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
+  %c = call i32 @llvm.nvvm.bmsk.wrap(i32 5, i32 %a)
+  ret i32 %c
+}

Copy link
Member

@Artem-B Artem-B left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM.

@AlexMaclean AlexMaclean merged commit 4f70917 into llvm:main May 9, 2025
13 of 15 checks passed
all bits set to 0 except for %b bits starting at bit position %a. For the
'``wrap``' variants, the values of %a and %b modulo 32 are used. For the
'``clamp``' variants, the values of %a and %b are clamped to the range [0, 32],
which in practice is equivalent to using them as is.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM,

We could add a link to the corresponding page in the PTX ISA.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants