-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[NVPTX] Add intrinsics for the bmsk instruction #139299
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-nvptx Author: Alex MacLean (AlexMaclean) ChangesFull diff: https://github.com/llvm/llvm-project/pull/139299.diff 4 Files Affected:
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index b6222300e4d4a..51bbfd0a5c88d 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -598,6 +598,32 @@ operand %b clamped to the range [0, 32]. The N lowest bits are then
zero-extended the case of the '``zext``' variants, or sign-extended the case of
the '``sext``' variants. If N is 0, the result is 0.
+'``llvm.nvvm.bmsk.{wrap,clamp}``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare i32 @llvm.nvvm.bmsk.wrap(i32 %a, i32 %b)
+ declare i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 %b)
+
+Overview:
+"""""""""
+
+The '``llvm.nvvm.bmsk.{wrap,clamp}``' family of intrinsics creates a bit mask
+given a starting bit position and a bit width.
+
+Semantics:
+""""""""""
+
+The '``llvm.nvvm.bmsk.{wrap,clamp}``' family of intrinsics returns a value with
+all bits set to 0 except for %b bits starting at bit position %a. For the
+'``wrap``' variants, the values of %a and %b modulo 32 are used. For the
+'``clamp``' variants, the values of %a and %b are clamped to the range [0, 32],
+which in practice is equivalent to using them as is.
+
TMA family of Intrinsics
------------------------
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 2851206f2e84a..640fdf3f86326 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1367,6 +1367,16 @@ let TargetPrefix = "nvvm" in {
[llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]>;
+
+//
+// BMSK - bit mask
+//
+ foreach mode = ["wrap", "clamp"] in
+ def int_nvvm_bmsk_ # mode :
+ DefaultAttrsIntrinsic<[llvm_i32_ty],
+ [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable]>;
+
//
// Convert
//
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 7b139d7b79e7d..c339817a2d214 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1693,6 +1693,18 @@ foreach sign = ["s", "u"] in {
}
}
+//
+// BMSK
+//
+
+foreach mode = ["wrap", "clamp"] in {
+ defvar intrin = !cast<Intrinsic>("int_nvvm_bmsk_" # mode);
+ defm BMSK_ # mode
+ : I3Inst<"bmsk." # mode # ".b32",
+ intrin, I32RT, commutative = false,
+ requires = [hasSM<70>, hasPTX<76>]>;
+}
+
//
// Convert
//
diff --git a/llvm/test/CodeGen/NVPTX/bmsk.ll b/llvm/test/CodeGen/NVPTX/bmsk.ll
new file mode 100644
index 0000000000000..ead4a42bc6c81
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/bmsk.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -o - < %s -mcpu=sm_70 -mattr=+ptx76 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mcpu=sm_70 -mattr=+ptx76 | %ptxas-verify -arch=sm_70 %}
+
+target triple = "nvptx64-unknown-cuda"
+
+define i32 @bmsk_wrap(i32 %a, i32 %b) {
+; CHECK-LABEL: bmsk_wrap(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [bmsk_wrap_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [bmsk_wrap_param_1];
+; CHECK-NEXT: bmsk.wrap.b32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %c = call i32 @llvm.nvvm.bmsk.wrap(i32 %a, i32 %b)
+ ret i32 %c
+}
+
+define i32 @bmsk_clamp(i32 %a, i32 %b) {
+; CHECK-LABEL: bmsk_clamp(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [bmsk_clamp_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [bmsk_clamp_param_1];
+; CHECK-NEXT: bmsk.clamp.b32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %c = call i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 %b)
+ ret i32 %c
+}
+
+define i32 @bmsk_wrap_ii() {
+; CHECK-LABEL: bmsk_wrap_ii(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.b32 %r1, 5;
+; CHECK-NEXT: bmsk.wrap.b32 %r2, %r1, 6;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %c = call i32 @llvm.nvvm.bmsk.wrap(i32 5, i32 6)
+ ret i32 %c
+}
+
+define i32 @bmsk_clamp_ir(i32 %a) {
+; CHECK-LABEL: bmsk_clamp_ir(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [bmsk_clamp_ir_param_0];
+; CHECK-NEXT: bmsk.clamp.b32 %r2, %r1, 7;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %c = call i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 7)
+ ret i32 %c
+}
+
+define i32 @bmsk_wrap_ri(i32 %a) {
+; CHECK-LABEL: bmsk_wrap_ri(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [bmsk_wrap_ri_param_0];
+; CHECK-NEXT: bmsk.wrap.b32 %r2, 5, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %c = call i32 @llvm.nvvm.bmsk.wrap(i32 5, i32 %a)
+ ret i32 %c
+}
|
@llvm/pr-subscribers-llvm-ir Author: Alex MacLean (AlexMaclean) ChangesFull diff: https://github.com/llvm/llvm-project/pull/139299.diff 4 Files Affected:
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index b6222300e4d4a..51bbfd0a5c88d 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -598,6 +598,32 @@ operand %b clamped to the range [0, 32]. The N lowest bits are then
zero-extended the case of the '``zext``' variants, or sign-extended the case of
the '``sext``' variants. If N is 0, the result is 0.
+'``llvm.nvvm.bmsk.{wrap,clamp}``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare i32 @llvm.nvvm.bmsk.wrap(i32 %a, i32 %b)
+ declare i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 %b)
+
+Overview:
+"""""""""
+
+The '``llvm.nvvm.bmsk.{wrap,clamp}``' family of intrinsics creates a bit mask
+given a starting bit position and a bit width.
+
+Semantics:
+""""""""""
+
+The '``llvm.nvvm.bmsk.{wrap,clamp}``' family of intrinsics returns a value with
+all bits set to 0 except for %b bits starting at bit position %a. For the
+'``wrap``' variants, the values of %a and %b modulo 32 are used. For the
+'``clamp``' variants, the values of %a and %b are clamped to the range [0, 32],
+which in practice is equivalent to using them as is.
+
TMA family of Intrinsics
------------------------
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 2851206f2e84a..640fdf3f86326 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1367,6 +1367,16 @@ let TargetPrefix = "nvvm" in {
[llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]>;
+
+//
+// BMSK - bit mask
+//
+ foreach mode = ["wrap", "clamp"] in
+ def int_nvvm_bmsk_ # mode :
+ DefaultAttrsIntrinsic<[llvm_i32_ty],
+ [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable]>;
+
//
// Convert
//
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 7b139d7b79e7d..c339817a2d214 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1693,6 +1693,18 @@ foreach sign = ["s", "u"] in {
}
}
+//
+// BMSK
+//
+
+foreach mode = ["wrap", "clamp"] in {
+ defvar intrin = !cast<Intrinsic>("int_nvvm_bmsk_" # mode);
+ defm BMSK_ # mode
+ : I3Inst<"bmsk." # mode # ".b32",
+ intrin, I32RT, commutative = false,
+ requires = [hasSM<70>, hasPTX<76>]>;
+}
+
//
// Convert
//
diff --git a/llvm/test/CodeGen/NVPTX/bmsk.ll b/llvm/test/CodeGen/NVPTX/bmsk.ll
new file mode 100644
index 0000000000000..ead4a42bc6c81
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/bmsk.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -o - < %s -mcpu=sm_70 -mattr=+ptx76 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mcpu=sm_70 -mattr=+ptx76 | %ptxas-verify -arch=sm_70 %}
+
+target triple = "nvptx64-unknown-cuda"
+
+define i32 @bmsk_wrap(i32 %a, i32 %b) {
+; CHECK-LABEL: bmsk_wrap(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [bmsk_wrap_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [bmsk_wrap_param_1];
+; CHECK-NEXT: bmsk.wrap.b32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %c = call i32 @llvm.nvvm.bmsk.wrap(i32 %a, i32 %b)
+ ret i32 %c
+}
+
+define i32 @bmsk_clamp(i32 %a, i32 %b) {
+; CHECK-LABEL: bmsk_clamp(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [bmsk_clamp_param_0];
+; CHECK-NEXT: ld.param.u32 %r2, [bmsk_clamp_param_1];
+; CHECK-NEXT: bmsk.clamp.b32 %r3, %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ret;
+ %c = call i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 %b)
+ ret i32 %c
+}
+
+define i32 @bmsk_wrap_ii() {
+; CHECK-LABEL: bmsk_wrap_ii(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: mov.b32 %r1, 5;
+; CHECK-NEXT: bmsk.wrap.b32 %r2, %r1, 6;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %c = call i32 @llvm.nvvm.bmsk.wrap(i32 5, i32 6)
+ ret i32 %c
+}
+
+define i32 @bmsk_clamp_ir(i32 %a) {
+; CHECK-LABEL: bmsk_clamp_ir(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [bmsk_clamp_ir_param_0];
+; CHECK-NEXT: bmsk.clamp.b32 %r2, %r1, 7;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %c = call i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 7)
+ ret i32 %c
+}
+
+define i32 @bmsk_wrap_ri(i32 %a) {
+; CHECK-LABEL: bmsk_wrap_ri(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [bmsk_wrap_ri_param_0];
+; CHECK-NEXT: bmsk.wrap.b32 %r2, 5, %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %c = call i32 @llvm.nvvm.bmsk.wrap(i32 5, i32 %a)
+ ret i32 %c
+}
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
all bits set to 0 except for %b bits starting at bit position %a. For the | ||
'``wrap``' variants, the values of %a and %b modulo 32 are used. For the | ||
'``clamp``' variants, the values of %a and %b are clamped to the range [0, 32], | ||
which in practice is equivalent to using them as is. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM,
We could add a link to the corresponding page in the PTX ISA.
No description provided.