Skip to content

Commit a5455e3

Browse files
committed
[AMDGPUUnifyDivergentExitNodes] Add NewPM support
Meanwhile, use UniformityAnalysis instead of LegacyDivergenceAnalysis to collect divergence info.

Reviewed By: arsenm, sameerds

Differential Revision: https://reviews.llvm.org/D141355
1 parent 4bbee03 commit a5455e3

File tree

4 files changed

+128
-51
lines changed

4 files changed

+128
-51
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include "AMDGPURegBankSelect.h"
2323
#include "AMDGPUTargetObjectFile.h"
2424
#include "AMDGPUTargetTransformInfo.h"
25+
#include "AMDGPUUnifyDivergentExitNodes.h"
2526
#include "GCNIterativeScheduler.h"
2627
#include "GCNSchedStrategy.h"
2728
#include "GCNVOPDUtils.h"
@@ -655,6 +656,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
655656
PM.addPass(AMDGPUPromoteKernelArgumentsPass());
656657
return true;
657658
}
659+
if (PassName == "amdgpu-unify-divergent-exit-nodes") {
660+
PM.addPass(AMDGPUUnifyDivergentExitNodesPass());
661+
return true;
662+
}
658663
return false;
659664
});
660665

llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp

Lines changed: 50 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
//
2020
//===----------------------------------------------------------------------===//
2121

22+
#include "AMDGPUUnifyDivergentExitNodes.h"
2223
#include "AMDGPU.h"
2324
#include "SIDefines.h"
2425
#include "llvm/ADT/ArrayRef.h"
@@ -53,40 +54,48 @@ using namespace llvm;
5354

5455
namespace {
5556

56-
class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
57+
class AMDGPUUnifyDivergentExitNodesImpl {
5758
private:
5859
const TargetTransformInfo *TTI = nullptr;
5960

6061
public:
61-
static char ID; // Pass identification, replacement for typeid
62-
63-
AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
64-
initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry());
65-
}
62+
AMDGPUUnifyDivergentExitNodesImpl() = delete;
63+
AMDGPUUnifyDivergentExitNodesImpl(const TargetTransformInfo *TTI)
64+
: TTI(TTI) {}
6665

6766
// We can preserve non-critical-edgeness when we unify function exit nodes
68-
void getAnalysisUsage(AnalysisUsage &AU) const override;
6967
BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
7068
ArrayRef<BasicBlock *> ReturningBlocks,
7169
StringRef Name);
72-
bool runOnFunction(Function &F) override;
70+
bool run(Function &F, DominatorTree &DT, const PostDominatorTree &PDT,
71+
const UniformityInfo &UA);
7372
};
7473

74+
class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
75+
public:
76+
static char ID;
77+
AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
78+
initializeAMDGPUUnifyDivergentExitNodesPass(
79+
*PassRegistry::getPassRegistry());
80+
}
81+
void getAnalysisUsage(AnalysisUsage &AU) const override;
82+
bool runOnFunction(Function &F) override;
83+
};
7584
} // end anonymous namespace
7685

7786
char AMDGPUUnifyDivergentExitNodes::ID = 0;
7887

7988
char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;
8089

8190
INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
82-
"Unify divergent function exit nodes", false, false)
91+
"Unify divergent function exit nodes", false, false)
8392
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
8493
INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
8594
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
8695
INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
8796
"Unify divergent function exit nodes", false, false)
8897

89-
void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
98+
void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const {
9099
if (RequireAndPreserveDomTree)
91100
AU.addRequired<DominatorTreeWrapperPass>();
92101

@@ -132,7 +141,7 @@ static bool isUniformlyReached(const UniformityInfo &UA, BasicBlock &BB) {
132141
return true;
133142
}
134143

135-
BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
144+
BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet(
136145
Function &F, DomTreeUpdater &DTU, ArrayRef<BasicBlock *> ReturningBlocks,
137146
StringRef Name) {
138147
// Otherwise, we need to insert a new basic block into the function, add a PHI
@@ -180,21 +189,14 @@ BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
180189
return NewRetBlock;
181190
}
182191

183-
bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
184-
DominatorTree *DT = nullptr;
185-
if (RequireAndPreserveDomTree)
186-
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
187-
188-
auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
192+
bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree &DT,
193+
const PostDominatorTree &PDT,
194+
const UniformityInfo &UA) {
189195
if (PDT.root_size() == 0 ||
190196
(PDT.root_size() == 1 &&
191197
!isa<BranchInst>(PDT.getRoot()->getTerminator())))
192198
return false;
193199

194-
UniformityInfo &UA =
195-
getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
196-
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
197-
198200
// Loop over all of the blocks in a function, tracking all of the blocks that
199201
// return.
200202
SmallVector<BasicBlock *, 4> ReturningBlocks;
@@ -327,3 +329,30 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
327329
unifyReturnBlockSet(F, DTU, ReturningBlocks, "UnifiedReturnBlock");
328330
return true;
329331
}
332+
333+
bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
334+
DominatorTree *DT = nullptr;
335+
if (RequireAndPreserveDomTree)
336+
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
337+
const auto &PDT =
338+
getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
339+
const auto &UA = getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
340+
const auto *TranformInfo =
341+
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
342+
return AMDGPUUnifyDivergentExitNodesImpl(TranformInfo).run(F, *DT, PDT, UA);
343+
}
344+
345+
PreservedAnalyses
346+
AMDGPUUnifyDivergentExitNodesPass::run(Function &F,
347+
FunctionAnalysisManager &AM) {
348+
DominatorTree *DT = nullptr;
349+
if (RequireAndPreserveDomTree)
350+
DT = &AM.getResult<DominatorTreeAnalysis>(F);
351+
352+
const auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
353+
const auto &UA = AM.getResult<UniformityInfoAnalysis>(F);
354+
const auto *TransformInfo = &AM.getResult<TargetIRAnalysis>(F);
355+
return AMDGPUUnifyDivergentExitNodesImpl(TransformInfo).run(F, *DT, PDT, UA)
356+
? PreservedAnalyses::none()
357+
: PreservedAnalyses::all();
358+
}
llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
//===- AMDGPUUnifyDivergentExitNodes.h ------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This is a variant of the UnifyFunctionExitNodes pass. Rather than ensuring
10+
// there is at most one ret and one unreachable instruction, it ensures there is
11+
// at most one divergent exiting block.
12+
//
13+
// StructurizeCFG can't deal with multi-exit regions formed by branches to
14+
// multiple return nodes. It is not desirable to structurize regions with
15+
// uniform branches, so unifying those to the same return block as divergent
16+
// branches inhibits use of scalar branching. It still can't deal with the case
17+
// where one branch goes to return, and one unreachable. Replace unreachable in
18+
// this case with a return.
19+
//
20+
//===----------------------------------------------------------------------===//
21+
22+
#include "AMDGPU.h"
23+
24+
namespace llvm {
25+
class AMDGPUUnifyDivergentExitNodesPass
26+
: public PassInfoMixin<AMDGPUUnifyDivergentExitNodesPass> {
27+
public:
28+
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
29+
};
30+
31+
} // end namespace llvm

llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll

Lines changed: 42 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,48 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2-
; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s
2+
; RUN: opt -mtriple=amdgcn-amd-amdhsa -p simplifycfg,amdgpu-unify-divergent-exit-nodes %s -S -o - | FileCheck %s --check-prefix=OPT
3+
; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA
34

45
define void @nested_inf_loop(i1 %0, i1 %1) {
5-
; CHECK-LABEL: nested_inf_loop:
6-
; CHECK-NEXT: %bb.0: ; %BB
7-
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8-
; CHECK-NEXT: v_and_b32_e32 v1, 1, v1
9-
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
10-
; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
11-
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
12-
; CHECK-NEXT: s_xor_b64 s[6:7], vcc, -1
13-
; CHECK-NEXT: s_mov_b64 s[8:9], 0
14-
; CHECK-NEXT: .LBB0_1: ; %BB1
15-
; CHECK: s_and_b64 s[10:11], exec, s[6:7]
16-
; CHECK-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
17-
; CHECK-NEXT: s_andn2_b64 exec, exec, s[8:9]
18-
; CHECK-NEXT: s_cbranch_execnz .LBB0_1
19-
; CHECK-NEXT: %bb.2: ; %BB2
20-
; CHECK: s_or_b64 exec, exec, s[8:9]
21-
; CHECK-NEXT: s_mov_b64 s[8:9], 0
22-
; CHECK-NEXT: .LBB0_3: ; %BB4
23-
; CHECK: s_and_b64 s[10:11], exec, s[4:5]
24-
; CHECK-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
25-
; CHECK-NEXT: s_andn2_b64 exec, exec, s[8:9]
26-
; CHECK-NEXT: s_cbranch_execnz .LBB0_3
27-
; CHECK-NEXT: %bb.4: ; %loop.exit.guard
28-
; CHECK: s_or_b64 exec, exec, s[8:9]
29-
; CHECK-NEXT: s_mov_b64 vcc, 0
30-
; CHECK-NEXT: s_mov_b64 s[8:9], 0
31-
; CHECK-NEXT: s_branch .LBB0_1
32-
; CHECK-NEXT: %bb.5: ; %DummyReturnBlock
33-
; CHECK-NEXT: s_setpc_b64 s[30:31]
6+
; OPT-LABEL: @nested_inf_loop(
7+
; OPT-NEXT: BB:
8+
; OPT-NEXT: br label [[BB1:%.*]]
9+
; OPT: BB1:
10+
; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0:%.*]], i1 true, i1 [[TMP1:%.*]]
11+
; OPT-NEXT: br i1 [[BRMERGE]], label [[BB1]], label [[INFLOOP:%.*]]
12+
; OPT: infloop:
13+
; OPT-NEXT: br i1 true, label [[INFLOOP]], label [[DUMMYRETURNBLOCK:%.*]]
14+
; OPT: DummyReturnBlock:
15+
; OPT-NEXT: ret void
16+
;
17+
; ISA-LABEL: nested_inf_loop:
18+
; ISA-NEXT: %bb.0: ; %BB
19+
; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20+
; ISA-NEXT: v_and_b32_e32 v1, 1, v1
21+
; ISA-NEXT: v_and_b32_e32 v0, 1, v0
22+
; ISA-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
23+
; ISA-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
24+
; ISA-NEXT: s_xor_b64 s[6:7], vcc, -1
25+
; ISA-NEXT: s_mov_b64 s[8:9], 0
26+
; ISA-NEXT: .LBB0_1: ; %BB1
27+
; ISA: s_and_b64 s[10:11], exec, s[6:7]
28+
; ISA-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
29+
; ISA-NEXT: s_andn2_b64 exec, exec, s[8:9]
30+
; ISA-NEXT: s_cbranch_execnz .LBB0_1
31+
; ISA-NEXT: %bb.2: ; %BB2
32+
; ISA: s_or_b64 exec, exec, s[8:9]
33+
; ISA-NEXT: s_mov_b64 s[8:9], 0
34+
; ISA-NEXT: .LBB0_3: ; %BB4
35+
; ISA: s_and_b64 s[10:11], exec, s[4:5]
36+
; ISA-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
37+
; ISA-NEXT: s_andn2_b64 exec, exec, s[8:9]
38+
; ISA-NEXT: s_cbranch_execnz .LBB0_3
39+
; ISA-NEXT: %bb.4: ; %loop.exit.guard
40+
; ISA: s_or_b64 exec, exec, s[8:9]
41+
; ISA-NEXT: s_mov_b64 vcc, 0
42+
; ISA-NEXT: s_mov_b64 s[8:9], 0
43+
; ISA-NEXT: s_branch .LBB0_1
44+
; ISA-NEXT: %bb.5: ; %DummyReturnBlock
45+
; ISA-NEXT: s_setpc_b64 s[30:31]
3446
BB:
3547
br label %BB1
3648

0 commit comments

Comments (0)