[LSR] Account for hardware loop instructions #147958
base: main
Conversation
A hardware loop instruction combines a subtract, a compare with zero, and a branch. We currently account for the compare and branch being combined into one instruction in Cost::RateFormula, as part of more general handling for compare-branch-zero, but we don't account for the subtract, leading to suboptimal decisions in some cases. Fix this in Cost::RateRegister by noticing when we have such a subtract and discounting the AddRecCost in that case.
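For illustration, with a v8.1-M low-overhead loop the counting-down inner loop in the tests below lowers to a dls/le pair, where the le instruction performs the decrement, the compare with zero, and the branch in a single instruction (a sketch based on the CHECK lines below; labels and register assignments are illustrative):

  dls  lr, r1            @ sketch: set up the hardware loop with the trip count
.loop:
  strh r2, [r0], #2      @ loop body
  le   lr, .loop         @ decrement lr, compare with zero, and branch back

Without the hardware loop, the same exit test needs separate instructions, and the subtract is one of them:

.loop:
  strh r2, [r0], #2      @ loop body
  subs r4, #1            @ decrement the trip count
  bne  .loop             @ compare with zero and branch back

It is this subtract that the cost model previously failed to discount when a hardware loop is expected.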
@llvm/pr-subscribers-debuginfo @llvm/pr-subscribers-llvm-transforms

Author: John Brawn (john-brawn-arm)

Changes

A hardware loop instruction combines a subtract, a compare with zero, and a branch. We currently account for the compare and branch being combined into one instruction in Cost::RateFormula, as part of more general handling for compare-branch-zero, but we don't account for the subtract, leading to suboptimal decisions in some cases. Fix this in Cost::RateRegister by noticing when we have such a subtract and discounting the AddRecCost in that case.

Patch is 22.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/147958.diff

3 Files Affected:
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 845afa6d4228b..c9a3e477ad86c 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -521,6 +521,8 @@ struct Formula {
bool hasZeroEnd() const;
+ bool countsDownToZero() const;
+
size_t getNumRegs() const;
Type *getType() const;
@@ -705,6 +707,16 @@ bool Formula::hasZeroEnd() const {
return true;
}
+bool Formula::countsDownToZero() const {
+ if (!hasZeroEnd())
+ return false;
+ assert(BaseRegs.size() == 1 && "hasZeroEnd should mean one BaseReg");
+ const APInt *StepInt;
+ if (!match(BaseRegs[0], m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt))))
+ return false;
+ return StepInt->isNegative();
+}
+
/// Return the total number of register operands used by this formula. This does
/// not include register uses implied by non-constant addrec strides.
size_t Formula::getNumRegs() const {
@@ -1227,10 +1239,9 @@ class Cost {
return C.NumRegs == ~0u;
}
- void RateFormula(const Formula &F,
- SmallPtrSetImpl<const SCEV *> &Regs,
- const DenseSet<const SCEV *> &VisitedRegs,
- const LSRUse &LU,
+ void RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
+ const DenseSet<const SCEV *> &VisitedRegs, const LSRUse &LU,
+ bool HardwareLoopProfitable,
SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
void print(raw_ostream &OS) const;
@@ -1238,9 +1249,11 @@ class Cost {
private:
void RateRegister(const Formula &F, const SCEV *Reg,
- SmallPtrSetImpl<const SCEV *> &Regs);
+ SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
+ bool HardwareLoopProfitable);
void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
+ const LSRUse &LU, bool HardwareLoopProfitable,
SmallPtrSetImpl<const SCEV *> *LoserRegs);
};
@@ -1383,7 +1396,8 @@ static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
/// Tally up interesting quantities from the given register.
void Cost::RateRegister(const Formula &F, const SCEV *Reg,
- SmallPtrSetImpl<const SCEV *> &Regs) {
+ SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
+ bool HardwareLoopProfitable) {
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
// If this is an addrec for another loop, it should be an invariant
// with respect to L since L is the innermost loop (at least
@@ -1419,13 +1433,18 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
SE->isLoopInvariant(Start, L)))
LoopCost = 0;
}
+ // If the loop counts down to zero and we'll be using a hardware loop then
+ // the addrec will be combined into the hardware loop instruction.
+ if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() &&
+ HardwareLoopProfitable)
+ LoopCost = 0;
C.AddRecCost += LoopCost;
// Add the step value register, if it needs one.
// TODO: The non-affine case isn't precisely modeled here.
if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
if (!Regs.count(AR->getOperand(1))) {
- RateRegister(F, AR->getOperand(1), Regs);
+ RateRegister(F, AR->getOperand(1), Regs, LU, HardwareLoopProfitable);
if (isLoser())
return;
}
@@ -1448,22 +1467,22 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
/// one of those regs an instant loser.
void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
SmallPtrSetImpl<const SCEV *> &Regs,
+ const LSRUse &LU, bool HardwareLoopProfitable,
SmallPtrSetImpl<const SCEV *> *LoserRegs) {
if (LoserRegs && LoserRegs->count(Reg)) {
Lose();
return;
}
if (Regs.insert(Reg).second) {
- RateRegister(F, Reg, Regs);
+ RateRegister(F, Reg, Regs, LU, HardwareLoopProfitable);
if (LoserRegs && isLoser())
LoserRegs->insert(Reg);
}
}
-void Cost::RateFormula(const Formula &F,
- SmallPtrSetImpl<const SCEV *> &Regs,
+void Cost::RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
const DenseSet<const SCEV *> &VisitedRegs,
- const LSRUse &LU,
+ const LSRUse &LU, bool HardwareLoopProfitable,
SmallPtrSetImpl<const SCEV *> *LoserRegs) {
if (isLoser())
return;
@@ -1477,7 +1496,8 @@ void Cost::RateFormula(const Formula &F,
Lose();
return;
}
- RatePrimaryRegister(F, ScaledReg, Regs, LoserRegs);
+ RatePrimaryRegister(F, ScaledReg, Regs, LU, HardwareLoopProfitable,
+ LoserRegs);
if (isLoser())
return;
}
@@ -1486,7 +1506,8 @@ void Cost::RateFormula(const Formula &F,
Lose();
return;
}
- RatePrimaryRegister(F, BaseReg, Regs, LoserRegs);
+ RatePrimaryRegister(F, BaseReg, Regs, LU, HardwareLoopProfitable,
+ LoserRegs);
if (isLoser())
return;
}
@@ -2112,6 +2133,7 @@ class LSRInstance {
TTI::AddressingModeKind AMK;
mutable SCEVExpander Rewriter;
bool Changed = false;
+ bool HardwareLoopProfitable = false;
/// This is the insert position that the current loop's induction variable
/// increment should be placed. In simple loops, this is the latch block's
@@ -3590,7 +3612,8 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
Formula F;
F.initialMatch(S, L, SE);
- BaselineCost.RateFormula(F, Regs, VisitedRegs, LU);
+ BaselineCost.RateFormula(F, Regs, VisitedRegs, LU,
+ HardwareLoopProfitable);
VisitedLSRUse.insert(LUIdx);
}
@@ -4728,7 +4751,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
// the corresponding bad register from the Regs set.
Cost CostF(L, SE, TTI, AMK);
Regs.clear();
- CostF.RateFormula(F, Regs, VisitedRegs, LU, &LoserRegs);
+ CostF.RateFormula(F, Regs, VisitedRegs, LU, HardwareLoopProfitable,
+ &LoserRegs);
if (CostF.isLoser()) {
// During initial formula generation, undesirable formulae are generated
// by uses within other loops that have some non-trivial address mode or
@@ -4761,7 +4785,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
Cost CostBest(L, SE, TTI, AMK);
Regs.clear();
- CostBest.RateFormula(Best, Regs, VisitedRegs, LU);
+ CostBest.RateFormula(Best, Regs, VisitedRegs, LU,
+ HardwareLoopProfitable);
if (CostF.isLess(CostBest))
std::swap(F, Best);
LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
@@ -5019,9 +5044,9 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
Cost CostFA(L, SE, TTI, AMK);
Cost CostFB(L, SE, TTI, AMK);
Regs.clear();
- CostFA.RateFormula(FA, Regs, VisitedRegs, LU);
+ CostFA.RateFormula(FA, Regs, VisitedRegs, LU, HardwareLoopProfitable);
Regs.clear();
- CostFB.RateFormula(FB, Regs, VisitedRegs, LU);
+ CostFB.RateFormula(FB, Regs, VisitedRegs, LU, HardwareLoopProfitable);
return CostFA.isLess(CostFB);
};
@@ -5426,7 +5451,7 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
// the current best, prune the search at that point.
NewCost = CurCost;
NewRegs = CurRegs;
- NewCost.RateFormula(F, NewRegs, VisitedRegs, LU);
+ NewCost.RateFormula(F, NewRegs, VisitedRegs, LU, HardwareLoopProfitable);
if (NewCost.isLess(SolutionCost)) {
Workspace.push_back(&F);
if (Workspace.size() != Uses.size()) {
@@ -6131,6 +6156,12 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
dbgs() << ":\n");
+ // Check if we expect this loop to use a hardware loop instruction, which will
+ // be used when calculating the costs of formulas.
+ HardwareLoopInfo HWLoopInfo(L);
+ HardwareLoopProfitable =
+ TTI.isHardwareLoopProfitable(L, SE, AC, &TLI, HWLoopInfo);
+
// Configure SCEVExpander already now, so the correct mode is used for
// isSafeToExpand() checks.
#if LLVM_ENABLE_ABI_BREAKING_CHECKS
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-le-cost.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-le-cost.ll
new file mode 100644
index 0000000000000..037b272f60ec7
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-le-cost.ll
@@ -0,0 +1,300 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+lob --verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NOMVE
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+lob,+mve --verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-MVE
+
+; Check that loop strength reduction understands that it can fold a sub into an
+; le instruction and reduces the cost appropriately, causing it to do this no
+; matter the preferred addressing mode.
+
+define void @test(ptr %dst, i32 %n) {
+; CHECK-NOMVE-LABEL: test:
+; CHECK-NOMVE: @ %bb.0: @ %entry
+; CHECK-NOMVE-NEXT: push {r7, lr}
+; CHECK-NOMVE-NEXT: add.w r0, r0, r1, lsl #1
+; CHECK-NOMVE-NEXT: movs r2, #0
+; CHECK-NOMVE-NEXT: sub.w r12, r0, #2
+; CHECK-NOMVE-NEXT: movs r3, #0
+; CHECK-NOMVE-NEXT: .LBB0_1: @ %outer_loop
+; CHECK-NOMVE-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-NOMVE-NEXT: @ Child Loop BB0_2 Depth 2
+; CHECK-NOMVE-NEXT: dls lr, r1
+; CHECK-NOMVE-NEXT: mov r0, r12
+; CHECK-NOMVE-NEXT: .LBB0_2: @ %inner_loop
+; CHECK-NOMVE-NEXT: @ Parent Loop BB0_1 Depth=1
+; CHECK-NOMVE-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-NOMVE-NEXT: strh r2, [r0, #2]!
+; CHECK-NOMVE-NEXT: le lr, .LBB0_2
+; CHECK-NOMVE-NEXT: @ %bb.3: @ %outer_loop_end
+; CHECK-NOMVE-NEXT: @ in Loop: Header=BB0_1 Depth=1
+; CHECK-NOMVE-NEXT: adds r3, #1
+; CHECK-NOMVE-NEXT: cmp r3, r1
+; CHECK-NOMVE-NEXT: it eq
+; CHECK-NOMVE-NEXT: popeq {r7, pc}
+; CHECK-NOMVE-NEXT: b .LBB0_1
+;
+; CHECK-MVE-LABEL: test:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: push {r7, lr}
+; CHECK-MVE-NEXT: add.w r12, r0, r1, lsl #1
+; CHECK-MVE-NEXT: movs r2, #0
+; CHECK-MVE-NEXT: movs r3, #0
+; CHECK-MVE-NEXT: .LBB0_1: @ %outer_loop
+; CHECK-MVE-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-MVE-NEXT: @ Child Loop BB0_2 Depth 2
+; CHECK-MVE-NEXT: dls lr, r1
+; CHECK-MVE-NEXT: mov r0, r12
+; CHECK-MVE-NEXT: .LBB0_2: @ %inner_loop
+; CHECK-MVE-NEXT: @ Parent Loop BB0_1 Depth=1
+; CHECK-MVE-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-MVE-NEXT: strh r2, [r0], #2
+; CHECK-MVE-NEXT: le lr, .LBB0_2
+; CHECK-MVE-NEXT: @ %bb.3: @ %outer_loop_end
+; CHECK-MVE-NEXT: @ in Loop: Header=BB0_1 Depth=1
+; CHECK-MVE-NEXT: adds r3, #1
+; CHECK-MVE-NEXT: cmp r3, r1
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: popeq {r7, pc}
+; CHECK-MVE-NEXT: b .LBB0_1
+entry:
+ br label %outer_loop
+
+outer_loop:
+ %idx_outer = phi i32 [ %idx_outer.inc, %outer_loop_end ], [ 0, %entry ]
+ br label %inner_loop
+
+inner_loop:
+ %idx_inner = phi i32 [ 0, %outer_loop ], [ %idx_inner.inc, %inner_loop ]
+ %add = add i32 %idx_inner, %n
+ %gep = getelementptr inbounds i16, ptr %dst, i32 %add
+ store i16 0, ptr %gep, align 2
+ %idx_inner.inc = add nuw nsw i32 %idx_inner, 1
+ %cond_inner = icmp eq i32 %idx_inner.inc, %n
+ br i1 %cond_inner, label %outer_loop_end, label %inner_loop
+
+outer_loop_end:
+ %idx_outer.inc = add nuw i32 %idx_outer, 1
+ %cond_outer = icmp eq i32 %idx_outer.inc, %n
+ br i1 %cond_outer, label %exit, label %outer_loop
+
+exit:
+ ret void
+}
+
+define void @test_optsize(ptr %dst, i32 %n) optsize {
+; CHECK-LABEL: test_optsize:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: add.w r12, r0, r1, lsl #1
+; CHECK-NEXT: movs r2, #0
+; CHECK-NEXT: movs r3, #0
+; CHECK-NEXT: .LBB1_1: @ %outer_loop
+; CHECK-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-NEXT: @ Child Loop BB1_2 Depth 2
+; CHECK-NEXT: dls lr, r1
+; CHECK-NEXT: mov r0, r12
+; CHECK-NEXT: .LBB1_2: @ %inner_loop
+; CHECK-NEXT: @ Parent Loop BB1_1 Depth=1
+; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT: strh r2, [r0], #2
+; CHECK-NEXT: le lr, .LBB1_2
+; CHECK-NEXT: @ %bb.3: @ %outer_loop_end
+; CHECK-NEXT: @ in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT: adds r3, #1
+; CHECK-NEXT: cmp r3, r1
+; CHECK-NEXT: it eq
+; CHECK-NEXT: popeq {r7, pc}
+; CHECK-NEXT: b .LBB1_1
+entry:
+ br label %outer_loop
+
+outer_loop:
+ %idx_outer = phi i32 [ %idx_outer.inc, %outer_loop_end ], [ 0, %entry ]
+ br label %inner_loop
+
+inner_loop:
+ %idx_inner = phi i32 [ 0, %outer_loop ], [ %idx_inner.inc, %inner_loop ]
+ %add = add i32 %idx_inner, %n
+ %gep = getelementptr inbounds i16, ptr %dst, i32 %add
+ store i16 0, ptr %gep, align 2
+ %idx_inner.inc = add nuw nsw i32 %idx_inner, 1
+ %cond_inner = icmp eq i32 %idx_inner.inc, %n
+ br i1 %cond_inner, label %outer_loop_end, label %inner_loop
+
+outer_loop_end:
+ %idx_outer.inc = add nuw i32 %idx_outer, 1
+ %cond_outer = icmp eq i32 %idx_outer.inc, %n
+ br i1 %cond_outer, label %exit, label %outer_loop
+
+exit:
+ ret void
+}
+
+; Check that when we can't use LE we don't discount the cost of a sub
+; instruction, so we only get it when postincrement is the preferred addressing
+; mode (i.e. when we have mve).
+
+declare void @otherfn()
+
+define void @test_no_le(ptr %dst, i32 %n) {
+; CHECK-NOMVE-LABEL: test_no_le:
+; CHECK-NOMVE: @ %bb.0: @ %entry
+; CHECK-NOMVE-NEXT: push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NOMVE-NEXT: add.w r5, r0, r1, lsl #1
+; CHECK-NOMVE-NEXT: mov r4, r1
+; CHECK-NOMVE-NEXT: movs r6, #0
+; CHECK-NOMVE-NEXT: mov.w r8, #0
+; CHECK-NOMVE-NEXT: .LBB2_1: @ %outer_loop
+; CHECK-NOMVE-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-NOMVE-NEXT: @ Child Loop BB2_2 Depth 2
+; CHECK-NOMVE-NEXT: movs r7, #0
+; CHECK-NOMVE-NEXT: .LBB2_2: @ %inner_loop
+; CHECK-NOMVE-NEXT: @ Parent Loop BB2_1 Depth=1
+; CHECK-NOMVE-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-NOMVE-NEXT: bl otherfn
+; CHECK-NOMVE-NEXT: strh.w r6, [r5, r7, lsl #1]
+; CHECK-NOMVE-NEXT: adds r7, #1
+; CHECK-NOMVE-NEXT: cmp r4, r7
+; CHECK-NOMVE-NEXT: bne .LBB2_2
+; CHECK-NOMVE-NEXT: @ %bb.3: @ %outer_loop_end
+; CHECK-NOMVE-NEXT: @ in Loop: Header=BB2_1 Depth=1
+; CHECK-NOMVE-NEXT: add.w r8, r8, #1
+; CHECK-NOMVE-NEXT: cmp r8, r4
+; CHECK-NOMVE-NEXT: bne .LBB2_1
+; CHECK-NOMVE-NEXT: @ %bb.4: @ %exit
+; CHECK-NOMVE-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
+;
+; CHECK-MVE-LABEL: test_no_le:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-MVE-NEXT: sub sp, #4
+; CHECK-MVE-NEXT: add.w r8, r0, r1, lsl #1
+; CHECK-MVE-NEXT: mov r9, r1
+; CHECK-MVE-NEXT: movs r6, #0
+; CHECK-MVE-NEXT: movs r7, #0
+; CHECK-MVE-NEXT: .LBB2_1: @ %outer_loop
+; CHECK-MVE-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-MVE-NEXT: @ Child Loop BB2_2 Depth 2
+; CHECK-MVE-NEXT: mov r5, r8
+; CHECK-MVE-NEXT: mov r4, r9
+; CHECK-MVE-NEXT: .LBB2_2: @ %inner_loop
+; CHECK-MVE-NEXT: @ Parent Loop BB2_1 Depth=1
+; CHECK-MVE-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-MVE-NEXT: bl otherfn
+; CHECK-MVE-NEXT: strh r6, [r5], #2
+; CHECK-MVE-NEXT: subs r4, #1
+; CHECK-MVE-NEXT: bne .LBB2_2
+; CHECK-MVE-NEXT: @ %bb.3: @ %outer_loop_end
+; CHECK-MVE-NEXT: @ in Loop: Header=BB2_1 Depth=1
+; CHECK-MVE-NEXT: adds r7, #1
+; CHECK-MVE-NEXT: cmp r7, r9
+; CHECK-MVE-NEXT: bne .LBB2_1
+; CHECK-MVE-NEXT: @ %bb.4: @ %exit
+; CHECK-MVE-NEXT: add sp, #4
+; CHECK-MVE-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
+entry:
+ br label %outer_loop
+
+outer_loop:
+ %idx_outer = phi i32 [ %idx_outer.inc, %outer_loop_end ], [ 0, %entry ]
+ br label %inner_loop
+
+inner_loop:
+ %idx_inner = phi i32 [ 0, %outer_loop ], [ %idx_inner.inc, %inner_loop ]
+ call void @otherfn()
+ %add = add i32 %idx_inner, %n
+ %gep = getelementptr inbounds i16, ptr %dst, i32 %add
+ store i16 0, ptr %gep, align 2
+ %idx_inner.inc = add nuw nsw i32 %idx_inner, 1
+ %cond_inner = icmp eq i32 %idx_inner.inc, %n
+ br i1 %cond_inner, label %outer_loop_end, label %inner_loop
+
+outer_loop_end:
+ %idx_outer.inc = add nuw i32 %idx_outer, 1
+ %cond_outer = icmp eq i32 %idx_outer.inc, %n
+ br i1 %cond_outer, label %exit, label %outer_loop
+
+exit:
+ ret void
+}
+
+define void @test_no_le_optsize(ptr %dst, i32 %n) optsize {
+; CHECK-NOMVE-LABEL: test_no_le_optsize:
+; CHECK-NOMVE: @ %bb.0: @ %entry
+; CHECK-NOMVE-NEXT: push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NOMVE-NEXT: add.w r5, r0, r1, lsl #1
+; CHECK-NOMVE-NEXT: mov r4, r1
+; CHECK-NOMVE-NEXT: movs r6, #0
+; CHECK-NOMVE-NEXT: mov.w r8, #0
+; CHECK-NOMVE-NEXT: .LBB3_1: @ %outer_loop
+; CHECK-NOMVE-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-NOMVE-NEXT: @ Child Loop BB3_2 Depth 2
+; CHECK-NOMVE-NEXT: movs r7, #0
+; CHECK-NOMVE-NEXT: .LBB3_2: @ %inner_loop
+; CHECK-NOMVE-NEXT: @ Parent Loop BB3_1 Depth=1
+; CHECK-NOMVE-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-NOMVE-NEXT: bl otherfn
+; CHECK-NOMVE-NEXT: strh.w r6, [r5, r7, lsl #1]
+; CHECK-NOMVE-NEXT: adds r7, #1
+; CHECK-NOMVE-NEXT: cmp r4, r7
+; CHECK-NOMVE-NEXT: bne .LBB3_2
+; CHECK-NOMVE-NEXT: @ %bb.3: @ %outer_loop_end
+; CHECK-NOMVE-NEXT: @ in Loop: Header=BB3_1 Depth=1
+; CHECK-NOMVE-NEXT: add.w r8, r8, #1
+; CHECK-NOMVE-NEXT: cmp r8, r4
+; CHECK-NOMVE-NEXT: bne .LBB3_1
+; CHECK-NOMVE-NEXT: @ %bb.4: @ %exit
+; CHECK-NOMVE-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
+;
+; CHECK-MVE-LABEL: test_no_le_optsize:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-MVE-NEXT: sub sp, #4
+; CHECK-MVE-NEXT: add.w r8, r0, r1, lsl #1
+; CHECK-MVE-NEXT: mov r9, r1
+; CHECK-MVE-NEXT: movs r6, #0
+; CHECK-MVE-NEXT: movs r7, #0
+; CHECK-MVE-NEXT: .LBB3_1: @ %outer_loop
+; CHECK-MVE-NEXT: @ =>This Loop Header: Depth=1
+; CHECK-MVE-NEXT: @ Child Loop BB3_2 Depth 2
+; CHECK-MVE-NEXT: mov r5, r8
+; CHECK-MVE-NEXT: mov r4, r9
+; CHECK-MVE-NEXT: .LBB3_2: @ %inner_loop
+; CHECK-MVE-NEXT: @ Parent Loop BB3_1 Depth=1
+; CHECK-MVE-NEXT: @ => This Inner Loop Header: Depth=2
+; CHECK-MVE-NEXT: bl otherfn
+; CHECK-MVE-NEXT: strh r6, [r5], #2
+; CHECK-MVE-NEXT: subs r4, #1
+; CHECK-MVE-NEXT: bne .LBB3_2
+; CHECK-MVE-NEXT: @ %bb.3: @ %outer_loop_end
+; CHECK-MVE-NEXT: @ in Loop: Header=BB3_1 Depth=1
+; CHECK-MVE-NEXT: adds r7, #1
+; CHECK-MVE-NEXT: cmp r7, r9
+; CHECK-MVE-NEXT: bne .LBB3_1
+; CHECK-MVE-NEXT: @ %bb.4: @ %exit
+; CHECK-MVE-NEXT: add sp, #4
+; CHECK-MVE-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
+entry:
+ br label %outer_loop
+
+outer_loop:
+ %idx_outer = phi i32 [ %idx_outer.inc, %outer_loop_end ], [ 0, %entry ]
+ br label %inner_loop
+
+inner_loop:
+ %idx_inner = phi i32 [ 0, %outer_loop ], [ %idx_inner.inc, %inner_loop ]
+ call void @otherfn()
+ %add = add i32 %idx_inner, %n
+ %gep = getelementptr inbounds i16, ptr %dst, i32 %add
+ store i16 0, ptr %gep, align 2
+ %idx_inner.inc = add nuw nsw i32 %idx_inner, 1
+ %cond_inner = icmp eq i32 %idx_inner.inc, %n
+ br i1 %cond_inner, label %outer_loop_end, label %inner_loop
+
+outer_loop_end:
+ %idx_outer.inc = add nuw i32 %idx_outer, 1
+ %cond_outer = icmp eq i32 %idx_outer.inc, %n
+ br i1 %cond_outer, label %exit, label %outer_loop
+
+exit:
+ ret void
+}
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/sibling-loops.ll
index 1f3a43923db61..c6158cb611a70 100644
--- a/llv...
[truncated]