[AMDGPU] Move kernarg preload logic to separate pass #130434

Merged: 3 commits merged on May 12, 2025

14 changes: 14 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -65,6 +65,7 @@ ModulePass *createAMDGPULowerBufferFatPointersPass();
FunctionPass *createSIModeRegisterPass();
FunctionPass *createGCNPreRAOptimizationsLegacyPass();
FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass();
ModulePass *createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *);

struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
AMDGPUSimplifyLibCallsPass() {}
@@ -233,6 +234,9 @@ extern char &GCNRegPressurePrinterID;
void initializeAMDGPUPreloadKernArgPrologLegacyPass(PassRegistry &);
extern char &AMDGPUPreloadKernArgPrologLegacyID;

void initializeAMDGPUPreloadKernelArgumentsLegacyPass(PassRegistry &);
extern char &AMDGPUPreloadKernelArgumentsLegacyID;

// Passes common to R600 and SI
FunctionPass *createAMDGPUPromoteAlloca();
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
@@ -347,6 +351,16 @@ class AMDGPUAttributorPass : public PassInfoMixin<AMDGPUAttributorPass> {
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};

class AMDGPUPreloadKernelArgumentsPass
: public PassInfoMixin<AMDGPUPreloadKernelArgumentsPass> {
const TargetMachine &TM;

public:
explicit AMDGPUPreloadKernelArgumentsPass(const TargetMachine &TM) : TM(TM) {}

PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};

class AMDGPUAnnotateUniformValuesPass
: public PassInfoMixin<AMDGPUAnnotateUniformValuesPass> {
public:
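Not part of the diff, but as a hedged illustration of how the declarations added above might be driven from C++ with the new pass manager (assuming an already constructed AMDGPU TargetMachine named TM and a Module named M; AMDGPU.h is a target-internal header):

// Illustrative sketch only -- not code from this PR.
#include "AMDGPU.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

static void runPreloadKernelArguments(Module &M, TargetMachine &TM) {
  // Standard new-PM analysis manager setup.
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  PassBuilder PB(&TM);
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  // Schedule only the relocated preload pass.
  ModulePassManager MPM;
  MPM.addPass(AMDGPUPreloadKernelArgumentsPass(TM));
  MPM.run(M, MAM);
}

For the legacy pass manager, the factory declared above, createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *), follows the same pattern as the neighboring create* functions.
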
21 changes: 0 additions & 21 deletions llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -25,10 +25,6 @@

using namespace llvm;

static cl::opt<unsigned> KernargPreloadCount(
"amdgpu-kernarg-preload-count",
cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));

static cl::opt<unsigned> IndirectCallSpecializationThreshold(
"amdgpu-indirect-call-specialization-threshold",
cl::desc(
@@ -1327,21 +1323,6 @@ struct AAAMDGPUNoAGPR

const char AAAMDGPUNoAGPR::ID = 0;

static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
for (unsigned I = 0;
I < F.arg_size() &&
I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs());
++I) {
Argument &Arg = *F.getArg(I);
// Check for incompatible attributes.
if (Arg.hasByRefAttr() || Arg.hasNestAttr())
break;

Arg.addAttr(Attribute::InReg);
}
}

static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
AMDGPUAttributorOptions Options,
ThinOrFullLTOPhase LTOPhase) {
@@ -1396,8 +1377,6 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
if (!AMDGPU::isEntryFunctionCC(CC)) {
A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F));
A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(*F));
} else if (CC == CallingConv::AMDGPU_KERNEL) {
addPreloadKernArgHint(*F, TM);
}

for (auto &I : instructions(F)) {
256 changes: 2 additions & 254 deletions llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -27,231 +27,6 @@ using namespace llvm;

namespace {

class PreloadKernelArgInfo {
private:
Function &F;
const GCNSubtarget &ST;
unsigned NumFreeUserSGPRs;

enum HiddenArg : unsigned {
HIDDEN_BLOCK_COUNT_X,
HIDDEN_BLOCK_COUNT_Y,
HIDDEN_BLOCK_COUNT_Z,
HIDDEN_GROUP_SIZE_X,
HIDDEN_GROUP_SIZE_Y,
HIDDEN_GROUP_SIZE_Z,
HIDDEN_REMAINDER_X,
HIDDEN_REMAINDER_Y,
HIDDEN_REMAINDER_Z,
END_HIDDEN_ARGS
};

// Stores information about a specific hidden argument.
struct HiddenArgInfo {
// Offset in bytes from the location in the kernarg segment pointed to by
// the implicitarg pointer.
uint8_t Offset;
// The size of the hidden argument in bytes.
uint8_t Size;
// The name of the hidden argument in the kernel signature.
const char *Name;
};

static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
{0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
{8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
{14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
{18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"},
{22, 2, "_hidden_remainder_z"}};

static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
if (HiddenArgs[I].Offset == Offset)
return static_cast<HiddenArg>(I);

return END_HIDDEN_ARGS;
}

static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
if (HA < END_HIDDEN_ARGS)
return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);

llvm_unreachable("Unexpected hidden argument.");
}

static const char *getHiddenArgName(HiddenArg HA) {
if (HA < END_HIDDEN_ARGS) {
return HiddenArgs[HA].Name;
}
llvm_unreachable("Unexpected hidden argument.");
}

// Clones the function after adding implicit arguments to the argument list
// and returns the new updated function. Preloaded implicit arguments are
// added up to and including the last one that will be preloaded, indicated by
// LastPreloadIndex. Currently preloading is only performed on the totality of
// sequential data from the kernarg segment including implicit (hidden)
// arguments. This means that all arguments up to the last preloaded argument
// will also be preloaded even if that data is unused.
Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
FunctionType *FT = F.getFunctionType();
LLVMContext &Ctx = F.getParent()->getContext();
SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
for (unsigned I = 0; I <= LastPreloadIndex; ++I)
FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));

FunctionType *NFT =
FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
Function *NF =
Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());

NF->copyAttributesFrom(&F);
NF->copyMetadata(&F, 0);
NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);

F.getParent()->getFunctionList().insert(F.getIterator(), NF);
NF->takeName(&F);
NF->splice(NF->begin(), &F);

Function::arg_iterator NFArg = NF->arg_begin();
for (Argument &Arg : F.args()) {
Arg.replaceAllUsesWith(&*NFArg);
NFArg->takeName(&Arg);
++NFArg;
}

AttrBuilder AB(Ctx);
AB.addAttribute(Attribute::InReg);
AB.addAttribute("amdgpu-hidden-argument");
AttributeList AL = NF->getAttributes();
for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
NFArg++->setName(getHiddenArgName(HiddenArg(I)));
}

NF->setAttributes(AL);
F.replaceAllUsesWith(NF);
F.setCallingConv(CallingConv::C);
F.clearMetadata();

return NF;
}

public:
PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
setInitialFreeUserSGPRsCount();
}

// Sets the initial number of user SGPRs that are available to preload
// arguments.
void setInitialFreeUserSGPRsCount() {
GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
}

bool tryAllocPreloadSGPRs(unsigned AllocSize, uint64_t ArgOffset,
uint64_t LastExplicitArgOffset) {
// Check if this argument may be loaded into the same register as the
// previous argument.
if (ArgOffset - LastExplicitArgOffset < 4 &&
!isAligned(Align(4), ArgOffset))
return true;

// Pad SGPRs for kernarg alignment.
ArgOffset = alignDown(ArgOffset, 4);
unsigned Padding = ArgOffset - LastExplicitArgOffset;
unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
unsigned NumPreloadSGPRs = alignTo(AllocSize, 4) / 4;
if (NumPreloadSGPRs + PaddingSGPRs > NumFreeUserSGPRs)
return false;

NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
return true;
}

// Try to allocate SGPRs to preload implicit kernel arguments.
void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
uint64_t LastExplicitArgOffset,
IRBuilder<> &Builder) {
Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
if (!ImplicitArgPtr)
return;

const DataLayout &DL = F.getParent()->getDataLayout();
// Pair is the load and the load offset.
SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
for (auto *U : ImplicitArgPtr->users()) {
Instruction *CI = dyn_cast<Instruction>(U);
if (!CI || CI->getParent()->getParent() != &F)
continue;

for (auto *U : CI->users()) {
int64_t Offset = 0;
auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
if (!Load) {
if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
continue;

Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
}

if (!Load || !Load->isSimple())
continue;

// FIXME: Expand to handle 64-bit implicit args and large merged loads.
LLVMContext &Ctx = F.getParent()->getContext();
Type *LoadTy = Load->getType();
HiddenArg HA = getHiddenArgFromOffset(Offset);
if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
continue;

ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
}
}

if (ImplicitArgLoads.empty())
return;

// Allocate loads in order of offset. We need to be sure that the implicit
// argument can actually be preloaded.
std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());

// If we fail to preload any implicit argument we know we don't have SGPRs
// to preload any subsequent ones with larger offsets. Find the first
// argument that we cannot preload.
auto *PreloadEnd = std::find_if(
ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
[&](const std::pair<LoadInst *, unsigned> &Load) {
unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
unsigned LoadOffset = Load.second;
if (!tryAllocPreloadSGPRs(LoadSize,
LoadOffset + ImplicitArgsBaseOffset,
LastExplicitArgOffset))
return true;

LastExplicitArgOffset =
ImplicitArgsBaseOffset + LoadOffset + LoadSize;
return false;
});

if (PreloadEnd == ImplicitArgLoads.begin())
return;

unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
assert(NF);
for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
LoadInst *LoadInst = I->first;
unsigned LoadOffset = I->second;
unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
Argument *Arg = NF->getArg(Index);
LoadInst->replaceAllUsesWith(Arg);
}
}
};
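As a side note for readers following the user-SGPR bookkeeping being moved by this PR, the arithmetic in tryAllocPreloadSGPRs above can be replayed in isolation. The following standalone sketch uses assumed example values and is not part of the diff; it only mirrors the padding/preload SGPR computation (each user SGPR covers 4 bytes of kernarg data, and any gap after the previous argument still consumes whole padding SGPRs):

// Illustrative sketch only -- assumed values, not code from this PR.
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint64_t alignDown4(uint64_t X) { return X & ~uint64_t(3); }
static uint64_t alignUp4(uint64_t X) { return (X + 3) & ~uint64_t(3); }

int main() {
  unsigned NumFreeUserSGPRs = 14;     // assumed free user SGPRs
  uint64_t LastExplicitArgOffset = 8; // previous argument ended at byte 8
  uint64_t ArgOffset = 16;            // this argument starts at byte 16
  unsigned AllocSize = 8;             // e.g. an 8-byte (i64) argument

  uint64_t Start = alignDown4(ArgOffset);                                        // 16
  unsigned PaddingSGPRs = unsigned(alignUp4(Start - LastExplicitArgOffset) / 4); // 2
  unsigned NumPreloadSGPRs = unsigned(alignUp4(AllocSize) / 4);                  // 2
  assert(PaddingSGPRs + NumPreloadSGPRs <= NumFreeUserSGPRs);
  std::printf("padding SGPRs: %u, preload SGPRs: %u, free SGPRs left: %u\n",
              PaddingSGPRs, NumPreloadSGPRs,
              NumFreeUserSGPRs - PaddingSGPRs - NumPreloadSGPRs);
  return 0;
}

With these numbers the request costs two padding SGPRs plus two preload SGPRs, leaving ten of the assumed fourteen free user SGPRs.
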

class AMDGPULowerKernelArguments : public FunctionPass {
public:
static char ID;
@@ -311,10 +86,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));

uint64_t ExplicitArgOffset = 0;
// Preloaded kernel arguments must be sequential.
bool InPreloadSequence = true;
PreloadKernelArgInfo PreloadInfo(F, ST);

for (Argument &Arg : F.args()) {
const bool IsByRef = Arg.hasByRefAttr();
Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
@@ -325,25 +96,10 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);

uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
uint64_t LastExplicitArgOffset = ExplicitArgOffset;
ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;

// Guard against the situation where hidden arguments have already been
// lowered and added to the kernel function signature, i.e. in a situation
// where this pass has run twice.
if (Arg.hasAttribute("amdgpu-hidden-argument"))
break;

// Try to preload this argument into user SGPRs.
if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() &&
!Arg.getType()->isAggregateType())
if (PreloadInfo.tryAllocPreloadSGPRs(AllocSize, EltOffset,
LastExplicitArgOffset))
continue;

InPreloadSequence = false;

if (Arg.use_empty())
// Skip inreg arguments which should be preloaded.
if (Arg.use_empty() || Arg.hasInRegAttr())
continue;

// If this is byval, the loads are already explicit in the function. We just
@@ -483,14 +239,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
KernArgSegment->addRetAttr(
Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));

if (InPreloadSequence) {
uint64_t ImplicitArgsBaseOffset =
alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
BaseOffset;
PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
ExplicitArgOffset, Builder);
}

return true;
}

1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -25,6 +25,7 @@ MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
MODULE_PASS("amdgpu-perf-hint",
AMDGPUPerfHintAnalysisPass(
*static_cast<const GCNTargetMachine *>(this)))
MODULE_PASS("amdgpu-preload-kernel-arguments", AMDGPUPreloadKernelArgumentsPass(*this))
MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
MODULE_PASS("amdgpu-remove-incompatible-functions", AMDGPURemoveIncompatibleFunctionsPass(*this))
MODULE_PASS("amdgpu-sw-lower-lds", AMDGPUSwLowerLDSPass(*this))