[PowerPC] vector shift word/double by element size - 1 use all ones #139794
base: main
Conversation
@llvm/pr-subscribers-backend-powerpc
Author: None (RolandF77)
Changes
Vector shift word or double by element size - 1 requires a shift amount vector of 31 or 63, which is too big for a splat immediate and needs a multi-instruction sequence. However, the PPC instructions only use the low 5 or 6 bits of each shift-amount vector element, so an all-ones mask, which we can splat immediate, works.
Full diff: https://github.com/llvm/llvm-project/pull/139794.diff
6 Files Affected:
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 59bfec30dc211..257d393d8946a 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -18456,36 +18456,78 @@ static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N,
return SDValue();
}
-SDValue PPCTargetLowering::combineVectorSHL(SDNode *N,
- DAGCombinerInfo &DCI) const {
+SDValue PPCTargetLowering::combineVectorShift(SDNode *N,
+ DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
assert(VT.isVector() && "Vector type expected.");
- SDValue N1 = N->getOperand(1);
- if (!Subtarget.hasP8Altivec() || N1.getOpcode() != ISD::BUILD_VECTOR ||
- !isOperationLegal(ISD::ADD, VT))
+ unsigned Opc = N->getOpcode();
+ assert((Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) &&
+ "Unexpected opcode.");
+
+ if (!isOperationLegal(N->getOpcode(), VT))
return SDValue();
- // For 64-bit there is no splat immediate so we want to catch shift by 1 here
- // before the BUILD_VECTOR is replaced by a load.
EVT EltTy = VT.getScalarType();
- if (EltTy != MVT::i64)
+ unsigned EltBits = EltTy.getSizeInBits();
+ if (EltTy != MVT::i64 && EltTy != MVT::i32)
return SDValue();
- BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(N1);
- APInt APSplatBits, APSplatUndef;
- unsigned SplatBitSize;
- bool HasAnyUndefs;
- bool BVNIsConstantSplat =
- BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
- HasAnyUndefs, 0, !Subtarget.isLittleEndian());
- if (!BVNIsConstantSplat || SplatBitSize != EltTy.getSizeInBits())
+ SDValue N1 = N->getOperand(1);
+ uint64_t SplatBits = 0;
+ bool AddSplatCase = false;
+ if (N1.getOpcode() == PPCISD::VADD_SPLAT &&
+ N1.getConstantOperandVal(1) == VT.getVectorNumElements()) {
+ AddSplatCase = true;
+ SplatBits = N1.getConstantOperandVal(0);
+ }
+
+ if (!AddSplatCase) {
+ if (N1.getOpcode() != ISD::BUILD_VECTOR)
+ return SDValue();
+
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ APInt APSplatBits, APSplatUndef;
+ BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(N1);
+ bool BVNIsConstantSplat =
+ BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
+ HasAnyUndefs, 0, !Subtarget.isLittleEndian());
+ if (!BVNIsConstantSplat || SplatBitSize != EltBits)
+ return SDValue();
+ SplatBits = APSplatBits.getZExtValue();
+ }
+
+ SDLoc DL(N);
+ SDValue N0 = N->getOperand(0);
+ // We can't splat immediate 31 or 63 to shift by element size - 1 for vector
+ // word and vector double shifts, but we can splat immediate all ones and
+ // do the same thing (using PPC shifts).
+ if (SplatBits == (EltBits - 1)) {
+ unsigned NewOpc;
+ switch (Opc) {
+ case ISD::SHL:
+ NewOpc = PPCISD::SHL;
+ break;
+ case ISD::SRL:
+ NewOpc = PPCISD::SRL;
+ break;
+ case ISD::SRA:
+ NewOpc = PPCISD::SRA;
+ break;
+ }
+ SDValue SplatOnes = getCanonicalConstSplat(255, 1, VT, DCI.DAG, DL);
+ return DCI.DAG.getNode(NewOpc, DL, VT, N0, SplatOnes);
+ }
+
+ if (Opc != ISD::SHL || !isOperationLegal(ISD::ADD, VT))
return SDValue();
- uint64_t SplatBits = APSplatBits.getZExtValue();
- if (SplatBits != 1)
+
+ // For 64-bit there is no splat immediate so we want to catch shift by 1 here
+ // before the BUILD_VECTOR is replaced by a load.
+ if (EltTy != MVT::i64 || SplatBits != 1)
return SDValue();
- SDValue N0 = N->getOperand(0);
return DCI.DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
}
@@ -18494,7 +18536,7 @@ SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const {
return Value;
if (N->getValueType(0).isVector())
- return combineVectorSHL(N, DCI);
+ return combineVectorShift(N, DCI);
SDValue N0 = N->getOperand(0);
ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
@@ -18526,6 +18568,9 @@ SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const {
if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
return Value;
+ if (N->getValueType(0).isVector())
+ return combineVectorShift(N, DCI);
+
return SDValue();
}
@@ -18533,6 +18578,9 @@ SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const {
if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
return Value;
+ if (N->getValueType(0).isVector())
+ return combineVectorShift(N, DCI);
+
return SDValue();
}
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index e7e7c21b50395..2c55b5427297a 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1441,7 +1441,7 @@ namespace llvm {
SDValue combineStoreFPToInt(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSHL(SDNode *N, DAGCombinerInfo &DCI) const;
- SDValue combineVectorSHL(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineVectorShift(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSRA(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineMUL(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/test/CodeGen/PowerPC/mul-const-vector.ll b/llvm/test/CodeGen/PowerPC/mul-const-vector.ll
index e2ddef8b49758..e3d231adf734f 100644
--- a/llvm/test/CodeGen/PowerPC/mul-const-vector.ll
+++ b/llvm/test/CodeGen/PowerPC/mul-const-vector.ll
@@ -252,23 +252,19 @@ define <4 x i32> @test7_v4i32(<4 x i32> %a) {
ret <4 x i32> %tmp.1
}
; CHECK-LABEL: test7_v4i32:
-; CHECK-DAG: vspltisw v[[REG2:[0-9]+]], -16
-; CHECK-DAG: vspltisw v[[REG3:[0-9]+]], 15
-; CHECK-NEXT: vsubuwm v[[REG4:[0-9]+]], v[[REG3]], v[[REG2]]
+; CHECK: xxleqv v[[REG1:[0-9]+]], v[[REG2:[0-9]+]], v[[REG2]]
; CHECK-NOT: vmul
-; CHECK-NEXT: vslw v[[REG5:[0-9]+]], v2, v[[REG4]]
+; CHECK-NEXT: vslw v[[REG3:[0-9]+]], v2, v[[REG1]]
define <4 x i32> @test8_v4i32(<4 x i32> %a) {
%tmp.1 = mul nsw <4 x i32> %a, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647> ; <<4 x i32>> [#uses=1]
ret <4 x i32> %tmp.1
}
; CHECK-LABEL: test8_v4i32:
-; CHECK-DAG: vspltisw v[[REG2:[0-9]+]], -16
-; CHECK-DAG: vspltisw v[[REG3:[0-9]+]], 15
-; CHECK-NEXT: vsubuwm v[[REG4:[0-9]+]], v[[REG3]], v[[REG2]]
+; CHECK: xxleqv v[[REG1:[0-9]+]], v[[REG2:[0-9]+]], v[[REG2]]
; CHECK-NOT: vmul
-; CHECK-NEXT: vslw v[[REG5:[0-9]+]], v2, v[[REG4]]
-; CHECK-NEXT: vsubuwm v[[REG6:[0-9]+]], v[[REG5]], v2
+; CHECK-NEXT: vslw v[[REG3:[0-9]+]], v2, v[[REG1]]
+; CHECK-NEXT: vsubuwm v[[REG4:[0-9]+]], v[[REG3]], v2
define <2 x i64> @test1_v2i64(<2 x i64> %a) {
%tmp.1 = mul nsw <2 x i64> %a, <i64 16, i64 16> ; <<2 x i64>> [#uses=1]
@@ -356,8 +352,7 @@ define <2 x i64> @test7_v2i64(<2 x i64> %a) {
}
; CHECK-LABEL: test7_v2i64:
-; CHECK-P8: lxvd2x v[[REG1:[0-9]+]], 0, r{{[0-9]+}}
-; CHECK-P9: lxv v[[REG2:[0-9]+]], 0(r{{[0-9]+}})
+; CHECK: xxleqv v[[REG2:[0-9]+]], v[[REG1:[0-9]+]], v[[REG1]]
; CHECK-NOT: vmul
; CHECK-NEXT: vsld v[[REG4:[0-9]+]], v2, v[[REG2]]
@@ -367,8 +362,7 @@ define <2 x i64> @test8_v2i64(<2 x i64> %a) {
}
; CHECK-LABEL: test8_v2i64:
-; CHECK-P8: lxvd2x v[[REG1:[0-9]+]], 0, r{{[0-9]+}}
-; CHECK-P9: lxv v[[REG2:[0-9]+]], 0(r{{[0-9]+}})
+; CHECK: xxleqv v[[REG2:[0-9]+]], v[[REG1:[0-9]+]], v[[REG1]]
; CHECK-NOT: vmul
; CHECK-NEXT: vsld v[[REG3:[0-9]+]], v2, v[[REG2]]
; CHECK-NEXT: vsubudm v{{[0-9]+}}, v[[REG3]], v2
diff --git a/llvm/test/CodeGen/PowerPC/pr47891.ll b/llvm/test/CodeGen/PowerPC/pr47891.ll
index 46ff074fae647..6438302d574e6 100644
--- a/llvm/test/CodeGen/PowerPC/pr47891.ll
+++ b/llvm/test/CodeGen/PowerPC/pr47891.ll
@@ -7,13 +7,11 @@
define dso_local void @poly2_lshift1(ptr nocapture %p) local_unnamed_addr #0 {
; CHECK-LABEL: poly2_lshift1:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addis r6, r2, .LCPI0_0@toc@ha
+; CHECK-NEXT: ld r6, 0(r3)
; CHECK-NEXT: li r4, 72
; CHECK-NEXT: ld r5, 64(r3)
-; CHECK-NEXT: addi r6, r6, .LCPI0_0@toc@l
+; CHECK-NEXT: xxleqv v4, v4, v4
; CHECK-NEXT: lxvd2x vs0, r3, r4
-; CHECK-NEXT: lxvd2x v4, 0, r6
-; CHECK-NEXT: ld r6, 0(r3)
; CHECK-NEXT: sldi r7, r6, 1
; CHECK-NEXT: rotldi r6, r6, 1
; CHECK-NEXT: std r7, 0(r3)
@@ -35,11 +33,11 @@ define dso_local void @poly2_lshift1(ptr nocapture %p) local_unnamed_addr #0 {
; CHECK-NEXT: std r7, 32(r3)
; CHECK-NEXT: ld r7, 40(r3)
; CHECK-NEXT: rldimi r6, r7, 1, 0
-; CHECK-NEXT: xxswapd v2, vs0
-; CHECK-NEXT: mtfprd f0, r5
; CHECK-NEXT: rotldi r7, r7, 1
; CHECK-NEXT: std r6, 40(r3)
; CHECK-NEXT: ld r6, 48(r3)
+; CHECK-NEXT: xxswapd v2, vs0
+; CHECK-NEXT: mtfprd f0, r5
; CHECK-NEXT: rldimi r7, r6, 1, 0
; CHECK-NEXT: rotldi r6, r6, 1
; CHECK-NEXT: std r7, 48(r3)
diff --git a/llvm/test/CodeGen/PowerPC/signbit-shift.ll b/llvm/test/CodeGen/PowerPC/signbit-shift.ll
index e8cedd47d812d..f8838b50816e1 100644
--- a/llvm/test/CodeGen/PowerPC/signbit-shift.ll
+++ b/llvm/test/CodeGen/PowerPC/signbit-shift.ll
@@ -188,12 +188,10 @@ define i32 @add_lshr_not(i32 %x) {
define <4 x i32> @add_lshr_not_vec_splat(<4 x i32> %x) {
; CHECK-LABEL: add_lshr_not_vec_splat:
; CHECK: # %bb.0:
-; CHECK-NEXT: vspltisw 3, -16
-; CHECK-NEXT: vspltisw 4, 15
; CHECK-NEXT: addis 3, 2, .LCPI15_0@toc@ha
-; CHECK-NEXT: vsubuwm 3, 4, 3
-; CHECK-NEXT: addi 3, 3, .LCPI15_0@toc@l
+; CHECK-NEXT: xxleqv 35, 35, 35
; CHECK-NEXT: vsraw 2, 2, 3
+; CHECK-NEXT: addi 3, 3, .LCPI15_0@toc@l
; CHECK-NEXT: lxvd2x 35, 0, 3
; CHECK-NEXT: vadduwm 2, 2, 3
; CHECK-NEXT: blr
@@ -218,12 +216,10 @@ define i32 @sub_lshr_not(i32 %x) {
define <4 x i32> @sub_lshr_not_vec_splat(<4 x i32> %x) {
; CHECK-LABEL: sub_lshr_not_vec_splat:
; CHECK: # %bb.0:
-; CHECK-NEXT: vspltisw 3, -16
-; CHECK-NEXT: vspltisw 4, 15
; CHECK-NEXT: addis 3, 2, .LCPI17_0@toc@ha
-; CHECK-NEXT: vsubuwm 3, 4, 3
-; CHECK-NEXT: addi 3, 3, .LCPI17_0@toc@l
+; CHECK-NEXT: xxleqv 35, 35, 35
; CHECK-NEXT: vsrw 2, 2, 3
+; CHECK-NEXT: addi 3, 3, .LCPI17_0@toc@l
; CHECK-NEXT: lxvd2x 35, 0, 3
; CHECK-NEXT: vadduwm 2, 2, 3
; CHECK-NEXT: blr
@@ -247,9 +243,7 @@ define i32 @sub_lshr(i32 %x, i32 %y) {
define <4 x i32> @sub_lshr_vec(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: sub_lshr_vec:
; CHECK: # %bb.0:
-; CHECK-NEXT: vspltisw 4, -16
-; CHECK-NEXT: vspltisw 5, 15
-; CHECK-NEXT: vsubuwm 4, 5, 4
+; CHECK-NEXT: xxleqv 36, 36, 36
; CHECK-NEXT: vsraw 2, 2, 4
; CHECK-NEXT: vadduwm 2, 3, 2
; CHECK-NEXT: blr
@@ -272,12 +266,10 @@ define i32 @sub_const_op_lshr(i32 %x) {
define <4 x i32> @sub_const_op_lshr_vec(<4 x i32> %x) {
; CHECK-LABEL: sub_const_op_lshr_vec:
; CHECK: # %bb.0:
-; CHECK-NEXT: vspltisw 3, -16
-; CHECK-NEXT: vspltisw 4, 15
; CHECK-NEXT: addis 3, 2, .LCPI21_0@toc@ha
-; CHECK-NEXT: vsubuwm 3, 4, 3
-; CHECK-NEXT: addi 3, 3, .LCPI21_0@toc@l
+; CHECK-NEXT: xxleqv 35, 35, 35
; CHECK-NEXT: vsraw 2, 2, 3
+; CHECK-NEXT: addi 3, 3, .LCPI21_0@toc@l
; CHECK-NEXT: lxvd2x 35, 0, 3
; CHECK-NEXT: vadduwm 2, 2, 3
; CHECK-NEXT: blr
diff --git a/llvm/test/CodeGen/PowerPC/vselect-constants.ll b/llvm/test/CodeGen/PowerPC/vselect-constants.ll
index b72142943dd8b..e65d28188a88f 100644
--- a/llvm/test/CodeGen/PowerPC/vselect-constants.ll
+++ b/llvm/test/CodeGen/PowerPC/vselect-constants.ll
@@ -11,19 +11,17 @@ define <4 x i32> @sel_C1_or_C2_vec(<4 x i1> %cond) {
; CHECK-LABEL: sel_C1_or_C2_vec:
; CHECK: # %bb.0:
; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha
-; CHECK-NEXT: vspltisw 3, -16
-; CHECK-NEXT: vspltisw 4, 15
+; CHECK-NEXT: xxleqv 37, 37, 37
+; CHECK-NEXT: vslw 2, 2, 5
; CHECK-NEXT: addi 3, 3, .LCPI0_0@toc@l
-; CHECK-NEXT: vsubuwm 3, 4, 3
+; CHECK-NEXT: vsraw 2, 2, 5
; CHECK-NEXT: lxvd2x 0, 0, 3
; CHECK-NEXT: addis 3, 2, .LCPI0_1@toc@ha
-; CHECK-NEXT: vslw 2, 2, 3
; CHECK-NEXT: addi 3, 3, .LCPI0_1@toc@l
-; CHECK-NEXT: vsraw 2, 2, 3
-; CHECK-NEXT: xxswapd 37, 0
+; CHECK-NEXT: xxswapd 35, 0
; CHECK-NEXT: lxvd2x 0, 0, 3
-; CHECK-NEXT: xxswapd 32, 0
-; CHECK-NEXT: xxsel 34, 32, 37, 34
+; CHECK-NEXT: xxswapd 36, 0
+; CHECK-NEXT: xxsel 34, 36, 35, 34
; CHECK-NEXT: blr
%add = select <4 x i1> %cond, <4 x i32> <i32 3000, i32 1, i32 -1, i32 0>, <4 x i32> <i32 42, i32 0, i32 -2, i32 -1>
ret <4 x i32> %add
@@ -82,15 +80,13 @@ define <4 x i32> @sel_Cminus1_or_C_vec(<4 x i1> %cond) {
; CHECK-LABEL: sel_Cminus1_or_C_vec:
; CHECK: # %bb.0:
; CHECK-NEXT: addis 3, 2, .LCPI4_0@toc@ha
-; CHECK-NEXT: vspltisw 3, -16
-; CHECK-NEXT: vspltisw 4, 15
+; CHECK-NEXT: xxleqv 36, 36, 36
+; CHECK-NEXT: vslw 2, 2, 4
; CHECK-NEXT: addi 3, 3, .LCPI4_0@toc@l
-; CHECK-NEXT: vsubuwm 3, 4, 3
+; CHECK-NEXT: vsraw 2, 2, 4
; CHECK-NEXT: lxvd2x 0, 0, 3
-; CHECK-NEXT: vslw 2, 2, 3
-; CHECK-NEXT: vsraw 2, 2, 3
-; CHECK-NEXT: xxswapd 37, 0
-; CHECK-NEXT: vadduwm 2, 2, 5
+; CHECK-NEXT: xxswapd 35, 0
+; CHECK-NEXT: vadduwm 2, 2, 3
; CHECK-NEXT: blr
%add = select <4 x i1> %cond, <4 x i32> <i32 43, i32 1, i32 -1, i32 0>, <4 x i32> <i32 44, i32 2, i32 0, i32 1>
ret <4 x i32> %add
@@ -114,9 +110,7 @@ define <4 x i32> @cmp_sel_Cminus1_or_C_vec(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @sel_minus1_or_0_vec(<4 x i1> %cond) {
; CHECK-LABEL: sel_minus1_or_0_vec:
; CHECK: # %bb.0:
-; CHECK-NEXT: vspltisw 3, -16
-; CHECK-NEXT: vspltisw 4, 15
-; CHECK-NEXT: vsubuwm 3, 4, 3
+; CHECK-NEXT: xxleqv 35, 35, 35
; CHECK-NEXT: vslw 2, 2, 3
; CHECK-NEXT: vsraw 2, 2, 3
; CHECK-NEXT: blr
Vector shift word or double by element size - 1 requires a shift amount vector of 31 or 63, which is too big for a splat immediate and needs a multi-instruction sequence. However, the PPC instructions only use the low 5 or 6 bits of each shift-amount vector element, so an all-ones mask, which we can generate efficiently, works.
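For reference, a minimal standalone C++ sketch (not part of the patch; names are illustrative only) of the equivalence the combine relies on: since vslw and the related word/doubleword shifts read only the low 5 (or 6) bits of each shift-amount element, an all-ones element shifts by exactly as much as an element of 31 (or 63). The old lowering built 31 as 15 - (-16) with two vspltisw plus a vsubuwm; the new lowering splats all ones with a single xxleqv.

```cpp
// Sketch only: models one lane of vslw, which consumes just the low 5 bits
// of the shift-amount element, to show why all ones is interchangeable
// with 31 for 32-bit elements (and likewise all ones with 63 for 64-bit).
#include <cassert>
#include <cstdint>

static uint32_t vslw_lane(uint32_t Val, uint32_t Amt) {
  return Val << (Amt & 31u); // only the low 5 bits of Amt matter
}

int main() {
  const uint32_t OldAmt = static_cast<uint32_t>(15 - (-16)); // 31, two splats + subtract
  const uint32_t NewAmt = 0xFFFFFFFFu;                       // all ones, one xxleqv
  for (uint32_t V : {0u, 1u, 0x12345678u, 0xFFFFFFFFu})
    assert(vslw_lane(V, OldAmt) == vslw_lane(V, NewAmt));    // identical in every lane
  return 0;
}
```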