[SelectionDAG] Handle fneg/fabs/fcopysign in SimplifyDemandedBits #139239
base: main
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-aarch64

Author: Iris Shi (el-ev)

Changes

Closes #97427.

Patch is 27.01 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/139239.diff

9 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 564cc372f595c..706c30d08944d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18259,21 +18259,6 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
}
}
- // copysign(fabs(x), y) -> copysign(x, y)
- // copysign(fneg(x), y) -> copysign(x, y)
- // copysign(copysign(x,z), y) -> copysign(x, y)
- if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG ||
- N0.getOpcode() == ISD::FCOPYSIGN)
- return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N0.getOperand(0), N1);
-
- // copysign(x, abs(y)) -> abs(x)
- if (N1.getOpcode() == ISD::FABS)
- return DAG.getNode(ISD::FABS, DL, VT, N0);
-
- // copysign(x, copysign(y,z)) -> copysign(x, z)
- if (N1.getOpcode() == ISD::FCOPYSIGN)
- return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N0, N1.getOperand(1));
-
// copysign(x, fp_extend(y)) -> copysign(x, y)
// copysign(x, fp_round(y)) -> copysign(x, y)
if (CanCombineFCOPYSIGN_EXTEND_ROUND(N))
@@ -18814,6 +18799,9 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
N0.getOperand(0));
}
+ if (SimplifyDemandedBits(N0, APInt::getAllOnes(VT.getScalarSizeInBits())))
+ return SDValue(N, 0);
+
if (SDValue Cast = foldSignChangeInBitcast(N))
return Cast;
@@ -18887,14 +18875,9 @@ SDValue DAGCombiner::visitFABS(SDNode *N) {
if (SDValue C = DAG.FoldConstantArithmetic(ISD::FABS, DL, VT, {N0}))
return C;
- // fold (fabs (fabs x)) -> (fabs x)
- if (N0.getOpcode() == ISD::FABS)
- return N->getOperand(0);
-
- // fold (fabs (fneg x)) -> (fabs x)
- // fold (fabs (fcopysign x, y)) -> (fabs x)
- if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN)
- return DAG.getNode(ISD::FABS, DL, VT, N0.getOperand(0));
+ if (SimplifyDemandedBits(N0,
+ APInt::getSignedMaxValue(VT.getScalarSizeInBits())))
+ return SDValue(N, 0);
if (SDValue Cast = foldSignChangeInBitcast(N))
return Cast;
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index ba34c72156228..ffd74d09a11ed 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2958,6 +2958,77 @@ bool TargetLowering::SimplifyDemandedBits(
}
break;
}
+ case ISD::FABS: {
+ SDValue Op0 = Op.getOperand(0);
+ APInt SignMask = APInt::getSignMask(BitWidth);
+
+ if (!DemandedBits.intersects(SignMask))
+ return TLO.CombineTo(Op, Op0);
+
+ if (SimplifyDemandedBits(Op0, ~SignMask & DemandedBits, DemandedElts, Known,
+ TLO, Depth + 1))
+ return true;
+
+ if (Known.isNonNegative())
+ return TLO.CombineTo(Op, Op0);
+ if (Known.isNegative())
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::FNEG, dl, VT, Op0));
+
+ Known.Zero |= SignMask;
+ Known.One &= ~SignMask;
+
+ break;
+ }
+ case ISD::FCOPYSIGN: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ APInt SignMask = APInt::getSignMask(BitWidth);
+
+ if (!DemandedBits.intersects(SignMask))
+ return TLO.CombineTo(Op, Op0);
+
+ if (SimplifyDemandedBits(Op0, ~SignMask & DemandedBits, DemandedElts, Known,
+ TLO, Depth + 1))
+ return true;
+ if (SimplifyDemandedBits(Op1, SignMask, DemandedElts, Known2, TLO,
+ Depth + 1))
+ return true;
+
+ if (ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
+ return true;
+
+ if ((Known.isNonNegative() && Known2.isNonNegative()) ||
+ (Known.isNegative() && Known2.isNegative()))
+ return TLO.CombineTo(Op, Op0);
+
+ if (Known2.isNonNegative())
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::FABS, dl, VT, Op0));
+
+ if (Known2.isNegative()) {
+ Known.One |= SignMask;
+ Known.Zero &= ~SignMask;
+ }
+
+ break;
+ }
+ case ISD::FNEG: {
+ SDValue Op0 = Op.getOperand(0);
+ APInt SignMask = APInt::getSignMask(BitWidth);
+
+ if (!DemandedBits.intersects(SignMask))
+ return TLO.CombineTo(Op, Op0);
+
+ if (SimplifyDemandedBits(Op0, DemandedBits, DemandedElts, Known, TLO,
+ Depth + 1))
+ return true;
+
+ if (Known.isNonNegative() || Known.isNegative()) {
+ Known.Zero ^= SignMask;
+ Known.One ^= SignMask;
+ }
+
+ break;
+ }
default:
// We also ask the target about intrinsics (which could be specific to it).
if (Op.getOpcode() >= ISD::BUILTIN_OP_END ||
diff --git a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
index 5e5fdd6d31705..554b2e3444fe4 100644
--- a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
@@ -425,10 +425,7 @@ entry:
define float @extract_v4i32_copysign_build_vector_const(<4 x float> %a, <4 x float> %b, i32 %c) {
; CHECK-SD-LABEL: extract_v4i32_copysign_build_vector_const:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: adrp x8, .LCPI17_0
-; CHECK-SD-NEXT: mvni v1.4s, #128, lsl #24
-; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI17_0]
-; CHECK-SD-NEXT: bif v0.16b, v2.16b, v1.16b
+; CHECK-SD-NEXT: fabs v0.4s, v0.4s
; CHECK-SD-NEXT: mov s0, v0.s[2]
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
index a597faa028f22..f03958a967328 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -427,16 +427,18 @@ entry:
define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
; GFX-942-LABEL: fptrunc_f64_to_bf16_abs:
; GFX-942: ; %bb.0: ; %entry
-; GFX-942-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
-; GFX-942-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
-; GFX-942-NEXT: v_and_b32_e32 v7, 1, v6
-; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]|
-; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
-; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX-942-NEXT: v_cvt_f32_f64_e64 v8, |v[0:1]|
+; GFX-942-NEXT: v_and_b32_e32 v5, 0x7fffffff, v1
+; GFX-942-NEXT: v_mov_b32_e32 v4, v0
+; GFX-942-NEXT: v_cvt_f64_f32_e32 v[6:7], v8
+; GFX-942-NEXT: v_and_b32_e32 v9, 1, v8
+; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[4:5]|, |v[6:7]|
+; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[6:7]
+; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
; GFX-942-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3]
-; GFX-942-NEXT: v_add_u32_e32 v4, v6, v4
+; GFX-942-NEXT: v_add_u32_e32 v4, v8, v4
; GFX-942-NEXT: s_or_b64 vcc, s[0:1], vcc
-; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
; GFX-942-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX-942-NEXT: s_movk_i32 s0, 0x7fff
; GFX-942-NEXT: v_add3_u32 v5, v5, v4, s0
@@ -449,16 +451,18 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
;
; GFX-950-LABEL: fptrunc_f64_to_bf16_abs:
; GFX-950: ; %bb.0: ; %entry
-; GFX-950-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
-; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
-; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6
-; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]|
-; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
-; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
-; GFX-950-NEXT: v_add_u32_e32 v0, v6, v0
+; GFX-950-NEXT: v_cvt_f32_f64_e64 v8, |v[0:1]|
+; GFX-950-NEXT: v_and_b32_e32 v5, 0x7fffffff, v1
+; GFX-950-NEXT: v_mov_b32_e32 v4, v0
+; GFX-950-NEXT: v_cvt_f64_f32_e32 v[6:7], v8
+; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[6:7]
+; GFX-950-NEXT: v_and_b32_e32 v0, 1, v8
+; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[4:5]|, |v[6:7]|
+; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc
-; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
+; GFX-950-NEXT: v_add_u32_e32 v0, v8, v0
+; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX-950-NEXT: flat_store_short v[2:3], v0
; GFX-950-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index c4957fd44e2be..0007cfb667b7d 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -18639,8 +18639,8 @@ define bfloat @v_fabs_bf16(bfloat %a) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
@@ -18648,8 +18648,8 @@ define bfloat @v_fabs_bf16(bfloat %a) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -18832,8 +18832,8 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -18843,8 +18843,8 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -18889,23 +18889,23 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
; GCN-LABEL: s_fneg_fabs_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GCN-NEXT: s_bitset0_b32 s0, 31
-; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GCN-NEXT: s_xor_b32 s0, s0, 0x80000000
-; GCN-NEXT: s_lshr_b32 s0, s0, 16
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_fneg_fabs_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
-; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX7-NEXT: s_bitset0_b32 s0, 31
-; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX7-NEXT: s_xor_b32 s0, s0, 0x80000000
-; GFX7-NEXT: s_lshr_b32 s0, s0, 16
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fneg_fabs_bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
index e9fd6119d0c36..9f9b14d1c87a0 100644
--- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -211,22 +211,22 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) {
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_brev_b32 s8, -2
-; SI-NEXT: v_mov_b32_e32 v1, 0x43300000
-; SI-NEXT: v_mov_b32_e32 v0, 0
-; SI-NEXT: v_mov_b32_e32 v2, -1
-; SI-NEXT: v_mov_b32_e32 v3, 0x432fffff
+; SI-NEXT: v_mov_b32_e32 v0, -1
+; SI-NEXT: v_mov_b32_e32 v1, 0x432fffff
+; SI-NEXT: v_mov_b32_e32 v2, 0
+; SI-NEXT: v_mov_b32_e32 v3, 0x43300000
+; SI-NEXT: s_mov_b32 s8, 0
+; SI-NEXT: s_mov_b32 s9, 0xc3300000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_mov_b32_e32 v6, s3
-; SI-NEXT: v_bfi_b32 v1, s8, v1, v6
-; SI-NEXT: v_mov_b32_e32 v7, s2
-; SI-NEXT: v_add_f64 v[4:5], s[2:3], v[0:1]
-; SI-NEXT: v_add_f64 v[0:1], v[4:5], -v[0:1]
-; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; SI-NEXT: v_add_f64 v[2:3], s[2:3], v[2:3]
+; SI-NEXT: v_mov_b32_e32 v4, s3
+; SI-NEXT: v_mov_b32_e32 v5, s2
+; SI-NEXT: v_add_f64 v[2:3], v[2:3], s[8:9]
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[0:1]
+; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -270,30 +270,31 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_brev_b32 s10, -2
-; SI-NEXT: v_mov_b32_e32 v6, 0x43300000
; SI-NEXT: s_mov_b32 s9, 0x432fffff
-; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_mov_b32 s11, 0x43300000
+; SI-NEXT: s_mov_b32 s12, 0
+; SI-NEXT: s_mov_b32 s13, 0xc3300000
; SI-NEXT: s_mov_b32 s8, s2
-; SI-NEXT: v_mov_b32_e32 v4, s8
-; SI-NEXT: v_mov_b32_e32 v5, s9
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: v_mov_b32_e32 v1, s9
+; SI-NEXT: v_mov_b32_e32 v2, s10
+; SI-NEXT: v_mov_b32_e32 v3, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, s7
-; SI-NEXT: v_bfi_b32 v1, s10, v6, v7
-; SI-NEXT: v_mov_b32_e32 v8, s6
-; SI-NEXT: v_mov_b32_e32 v9, s5
-; SI-NEXT: v_mov_b32_e32 v10, s4
-; SI-NEXT: v_add_f64 v[2:3], s[6:7], v[0:1]
-; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[0:1]
-; SI-NEXT: v_bfi_b32 v1, s10, v6, v9
-; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[4:5]
-; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
-; SI-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
-; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[0:1]
-; SI-NEXT: v_add_f64 v[0:1], v[6:7], -v[0:1]
-; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[4:5]
-; SI-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
+; SI-NEXT: v_add_f64 v[4:5], s[6:7], v[2:3]
+; SI-NEXT: v_mov_b32_e32 v8, s7
+; SI-NEXT: v_mov_b32_e32 v9, s6
+; SI-NEXT: v_add_f64 v[2:3], s[4:5], v[2:3]
+; SI-NEXT: v_mov_b32_e32 v10, s5
+; SI-NEXT: v_mov_b32_e32 v11, s4
+; SI-NEXT: v_add_f64 v[4:5], v[4:5], s[12:13]
+; SI-NEXT: v_add_f64 v[6:7], v[2:3], s[12:13]
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[0:1]
+; SI-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc
+; SI-NEXT: v_cndmask_b32_e32 v2, v4, v9, vcc
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[0:1]
+; SI-NEXT: v_cndmask_b32_e32 v1, v7, v10, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v6, v11, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -347,46 +348,45 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x11
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
-; SI-NEXT: s_brev_b32 s14, -2
-; SI-NEXT: v_mov_b32_e32 v10, 0x43300000
; SI-NEXT: s_mov_b32 s13, 0x432fffff
-; SI-NEXT: v_mov_b32_e32 v4, 0
+; SI-NEXT: s_mov_b32 s14, 0
+; SI-NEXT: s_mov_b32 s15, 0x43300000
+; SI-NEXT: s_mov_b32 s16, 0
+; SI-NEXT: s_mov_b32 s17, 0xc3300000
; SI-NEXT: s_mov_b32 s12, s10
-; SI-NEXT: v_mov_b32_e32 v8, s12
-; SI-NEXT: v_mov_b32_e32 v9, s13
+; SI-NEXT: v_mov_b32_e32 v4, s12
+; SI-NEXT: v_mov_b32_e32 v5, s13
+; SI-NEXT: v_mov_b32_e32 v0, s14
+; SI-NEXT: v_mov_b32_e32 v1, s15
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s3
-; SI-NEXT: v_bfi_b32 v5, s14, v10, v2
-; SI-NEXT: v_mov_b32_e32 v6, s2
-; SI-NEXT: v_mov_b32_e32 v7, s1
-; SI-NEXT: v_mov_b32_e32 v11, s0
-; SI-NEXT: v_mov_b32_e32 v12, s7
-; SI-NEXT: v_mov_b32_e32 v13, s6
-; SI-NEXT: v_mov_b32_e32 v14, s5
-; SI-NEXT: v_mov_b32_e32 v15, s4
-; SI-NEXT: v_add_f64 v[0:1], s[2:3], v[4:5]
-; SI-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
-; SI-NEXT: v_bfi_b32 v5, s14, v10, v7
-; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[8:9]
-; SI-NEXT: v_cndmask_b32_e32 v3, v1, v2, vcc
-; SI-NEXT: v_cndmask_b32_e32 v2, v0, v6, vcc
-; SI-NEXT: v_add_f64 v[0:1], s[0:1], v[4:5]
-; SI-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
-; SI-NEXT: v_bfi_b32 v5, s14, v10, v12
-; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[0:1]|, v[8:9]
-; SI-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc
-; SI-NEXT: v_add_f64 v[6:7], s[6:7], v[4:5]
-; SI-NEXT: v_add_f64 v[6:7], v[6:7], -v[4:5]
-; SI-NEXT: v_bfi_b32 v5, s14, v10, v14
-; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[8:9]
-; SI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc
-; SI-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc
-; SI-NEXT: v_add_f64 v[10:11], s[4:5], v[4:5]
-; SI-NEXT: v_add_f64 v[4:5], v[10:11], -v[4:5]
-; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[8:9]
-; SI-NEXT: v_cndmask_b32_e32 v5, v5, v14, vcc
-; SI-NEXT: v_cndmask_b32_e32 v4, v4, v15, vcc
+; SI-NEXT: v_add_f64 v[2:3], s[2:3], v[0:1]
+; SI-NEXT: v_mov_b32_e32 v10, s3
+; SI-NEXT: v_mov_b32_e32 v11, s2
+; SI-NEXT: v_add_f64 v[6:7], s[0:1], v[0:1]
+; SI-NEXT: v_mov_b32_e32 v12, s1
+; SI-NEXT: v_mov_b32_e32 v13, s0
+; SI-NEXT: v_add_f64 v[8:9], s[6:7], v[0:1]
+; SI-NEXT: v_mov_b32_e32 v14, s7
+; SI-NEXT: v_mov_b32_e32 v15, s6
+; SI-NEXT: v_add_f64 v[0:1], s[4:5], v[0:1]
+; SI-NEXT: v_mov_b32_e32 v16, s5
+; SI-NEXT: v_add_f64 v[2:3], v[2:3], s[16:17]
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc
+; SI-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc
+; SI-NEXT: v_mov_b32_e32 v17, s4
+; SI-NEXT: v_add_f64 v[6:7], v[6:7], s[16:17]
+; SI-NEXT: v_add_f64 v[8:9], v[8:9], s[16:17]
+; SI-NEXT: v_add_f64 v[10:11], v[0:1], s[16:17]
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[0:1]|, v[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v1, v7, v12, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v6, v13, vcc
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v7, v9, v14, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v8, v15, vcc
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v5, v11, v16, vcc
+; SI-NEXT: v_cndmask_b32_e32 v4, v10, v17, vcc
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
index 228420ef0acb0..0b6baf4b5f504 100644
--- a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
+++ b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
@@ -559,10 +559,11 @@ define float @v_rcp_fabs_f32_ieee_ulp25(float %x) #3 {
; SI-LABEL: v_rcp_fabs_f32_ieee_ulp25:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
; SI-NEXT: s_mov_b32 s4, 0x7f800000
-; SI-NEXT: v_frexp_mant_f32_e64 v1, |v0|
-; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
-; SI-NEXT: v_cndmask_b32_e64 v1, |v0|, v1, s[4:5]
+; SI-NEXT: v_frexp_mant_f32_e64 v2, |v0|
+; SI-NEX...
[truncated]
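In essence, the patch moves these sign-bit folds into SimplifyDemandedBits: fneg only toggles the sign bit, fabs only clears it, and fcopysign only replaces it, so whenever a user does not demand the sign bit the node can collapse to its first operand. Below is a minimal standalone C++ sketch of that reasoning over raw binary32 bit patterns; it is an illustration only, not the LLVM code, and the helper names are invented here.

```cpp
#include <bit>
#include <cstdint>
#include <cstdio>

constexpr uint32_t SIGN_MASK = 0x80000000u; // sign bit of an IEEE-754 binary32

// Bit-level views of the three nodes the patch teaches SimplifyDemandedBits about.
uint32_t fneg_bits(uint32_t x) { return x ^ SIGN_MASK; }   // flip the sign bit
uint32_t fabs_bits(uint32_t x) { return x & ~SIGN_MASK; }  // clear the sign bit
uint32_t fcopysign_bits(uint32_t x, uint32_t y) {          // take the sign bit from y
  return (x & ~SIGN_MASK) | (y & SIGN_MASK);
}

int main() {
  float f = -2.5f;
  uint32_t x = std::bit_cast<uint32_t>(f);

  // If a user only demands the non-sign bits (e.g. the result is itself wrapped
  // in fabs), then fneg/fabs/fcopysign leave every demanded bit unchanged and
  // can be replaced by their first operand:
  uint32_t demanded = ~SIGN_MASK;
  std::printf("%d\n", (fneg_bits(x) & demanded) == (x & demanded));          // 1
  std::printf("%d\n", (fabs_bits(x) & demanded) == (x & demanded));          // 1
  std::printf("%d\n", (fcopysign_bits(x, 0u) & demanded) == (x & demanded)); // 1
}
```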
@llvm/pr-subscribers-backend-amdgpu

Author: Iris Shi (el-ev)

Changes

Closes #97427.
; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
Kind of a regression? It should be neutral in code size and cycles, but probably should prefer a bit-op to an FP op (e.g. this avoids the mode dependency)
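As a plain-C++ illustration of the two forms being compared (an assumption-laden sketch to show the distinction, not the actual AMDGPU lowering): the bit-op fabs is a pure integer AND on the bit pattern and cannot depend on the FP mode, while the multiply form goes through an FP operation.

```cpp
#include <bit>
#include <cmath>
#include <cstdint>

// Bit-op form: integer AND on the bit pattern, no FP-mode dependency.
float fabs_bitop(float x) {
  return std::bit_cast<float>(std::bit_cast<uint32_t>(x) & 0x7fffffffu);
}

// FP-op form, roughly what "v_mul_f32 v0, 1.0, |v0|" computes:
// an FP multiply, so it participates in the FP environment.
float fabs_fpop(float x) {
  return 1.0f * std::fabs(x);
}
```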
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc
; CI-NEXT: v_sub_f32_e32 v1, v3, v1
; CI-NEXT: v_sub_f32_e32 v0, v2, v0
By going from FADD to FSUB we're losing the ability to commute the operands which can affect regalloc / memory folding etc.
For AMDGPU we can commute fsub by switching to add and using a source modifier at the cost of encoding size (although I don't think we implement that particular fold). We also don't have any memory folding
Sorry, I didn't mean just AMDGPU, I was thinking more generically.
I have no clue why this and the `and` -> `fmul` above happen.
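For readers following this exchange, a hedged sketch (plain C++, not target code) of the commutation point: an fadd can use either operand order, while an fsub fixes the order unless the target can fold the negation into a source modifier, as described above.

```cpp
// fadd commutes freely: a + b == b + a, so either operand may be the one
// rematerialized, folded from memory, or kept in a constrained register.
float add_either_order(float a, float b) { return b + a; }

// fsub does not commute: a - b != b - a in general.
// But a - b == a + (-b), so a target with free negate source modifiers
// (as noted above for AMDGPU) can still swap the operands:
float sub_commuted(float a, float b) { return (-b) + a; }
```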
@@ -18259,21 +18259,6 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
  }
}
- // copysign(fabs(x), y) -> copysign(x, y)
I think it may help if we can observe test changes with the SimplifyDemandedBits changes alone, without dropping the existing combines
I've tested locally, nothing changes when adding the removed combines back. But regressions happen when the `fold (fabs (fabs x)) -> (fabs x)` at L18878-L18880 is removed.
Closes #97427.