[NFC][AMDGPU] Refactor allocatePreloadKernArgSGPRs #139753
base: main
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Tim Gymnich (tgymnich)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/139753.diff

3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 8b93ed342c64a..c7d74b0ffb4b1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1209,8 +1209,6 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
uint64_t ExplicitArgOffset = 0;
const DataLayout &DL = Fn.getDataLayout();
- unsigned InIndex = 0;
-
for (const Argument &Arg : Fn.args()) {
const bool IsByRef = Arg.hasByRefAttr();
Type *BaseArgTy = Arg.getType();
@@ -1299,10 +1297,9 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
unsigned PartOffset = 0;
for (unsigned i = 0; i != NumRegs; ++i) {
- State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
- BasePartOffset + PartOffset,
- MemVT.getSimpleVT(),
- CCValAssign::Full));
+ State.addLoc(CCValAssign::getCustomMem(
+ Arg.getArgNo(), RegisterVT, BasePartOffset + PartOffset,
+ MemVT.getSimpleVT(), CCValAssign::Full));
PartOffset += MemVT.getStoreSize();
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5cd6561914364..f1881f395b2c6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2538,84 +2538,115 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
// these from the dispatch pointer.
}
+static bool allocPreloadKernArg(uint64_t &LastExplicitArgOffset,
+ uint64_t ArgOffset, unsigned ArgSize,
+ unsigned Idx, MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info, CCState &CCInfo) {
+ GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
+ const Align KernelArgBaseAlign = Align(16);
+ Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
+ constexpr const unsigned SGPRSize = 4;
+ unsigned NumAllocSGPRs = alignTo(ArgSize, SGPRSize) / SGPRSize;
+
+ // Arg is preloaded into the previous SGPR.
+ if (ArgSize < SGPRSize && Alignment < SGPRSize) {
+ assert(Idx >= 1 && "No previous SGPR");
+ AMDGPUFunctionArgInfo &ArgInfo = Info.getArgInfo();
+ auto &ArgDesc = ArgInfo.PreloadKernArgs[Idx];
+ auto &PrevArgDesc = ArgInfo.PreloadKernArgs[Idx - 1];
+ ArgDesc.Regs.push_back(PrevArgDesc.Regs[0]);
+ return true;
+ }
+
+ unsigned Padding = ArgOffset - LastExplicitArgOffset;
+ unsigned PaddingSGPRs = alignTo(Padding, SGPRSize) / SGPRSize;
+ // Check for free user SGPRs for preloading.
+ if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs())
+ return false;
+
+ // Preload this argument.
+ const TargetRegisterClass *RC =
+ TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
+ SmallVectorImpl<MCRegister> *PreloadRegs =
+ Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, Idx, PaddingSGPRs);
+
+ if (PreloadRegs->size() > 1)
+ RC = &AMDGPU::SGPR_32RegClass;
+
+ for (MCRegister Reg : *PreloadRegs) {
+ assert(Reg);
+ MF.addLiveIn(Reg, RC);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ LastExplicitArgOffset = NumAllocSGPRs * SGPRSize + ArgOffset;
+ return true;
+}
+
// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
// sequential starting from the first argument.
void SITargetLowering::allocatePreloadKernArgSGPRs(
- CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
- const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
+ CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs, MachineFunction &MF,
const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
Function &F = MF.getFunction();
- unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
- GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
- bool InPreloadSequence = true;
- unsigned InIdx = 0;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset();
+ uint64_t ExplicitArgOffset = BaseOffset;
+ uint64_t LastExplicitArgOffset = ExplicitArgOffset;
+ unsigned LocIdx = 0;
bool AlignedForImplictArgs = false;
unsigned ImplicitArgOffset = 0;
+
for (auto &Arg : F.args()) {
- if (!InPreloadSequence || !Arg.hasInRegAttr())
+ if (!Arg.hasInRegAttr())
break;
- unsigned ArgIdx = Arg.getArgNo();
- // Don't preload non-original args or parts not in the current preload
- // sequence.
- if (InIdx < Ins.size() &&
- (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
+ const bool IsByRef = Arg.hasByRefAttr();
+ Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
+ unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
+
+ if (AllocSize == 0)
break;
- for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
- Ins[InIdx].getOrigArgIndex() == ArgIdx;
- InIdx++) {
- assert(ArgLocs[ArgIdx].isMemLoc());
- auto &ArgLoc = ArgLocs[InIdx];
- const Align KernelArgBaseAlign = Align(16);
- unsigned ArgOffset = ArgLoc.getLocMemOffset();
- Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
- unsigned NumAllocSGPRs =
- alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
-
- // Fix alignment for hidden arguments.
- if (Arg.hasAttribute("amdgpu-hidden-argument")) {
- if (!AlignedForImplictArgs) {
- ImplicitArgOffset =
- alignTo(LastExplicitArgOffset,
- Subtarget->getAlignmentForImplicitArgPtr()) -
- LastExplicitArgOffset;
- AlignedForImplictArgs = true;
- }
- ArgOffset += ImplicitArgOffset;
- }
+ MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
+ Align ABIAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);
- // Arg is preloaded into the previous SGPR.
- if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
- assert(InIdx >= 1 && "No previous SGPR");
- Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
- Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
- continue;
- }
+ // Fix alignment for hidden arguments.
+ if (Arg.hasAttribute("amdgpu-hidden-argument") && !AlignedForImplictArgs) {
+ ImplicitArgOffset = alignTo(LastExplicitArgOffset,
+ Subtarget->getAlignmentForImplicitArgPtr()) -
+ LastExplicitArgOffset;
+ AlignedForImplictArgs = true;
+ }
- unsigned Padding = ArgOffset - LastExplicitArgOffset;
- unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
- // Check for free user SGPRs for preloading.
- if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
- InPreloadSequence = false;
- break;
- }
+ uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
+ ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
- // Preload this argument.
- const TargetRegisterClass *RC =
- TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
- SmallVectorImpl<MCRegister> *PreloadRegs =
- Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
-
- if (PreloadRegs->size() > 1)
- RC = &AMDGPU::SGPR_32RegClass;
- for (auto &Reg : *PreloadRegs) {
- assert(Reg);
- MF.addLiveIn(Reg, RC);
- CCInfo.AllocateReg(Reg);
- }
+ if (ArgLocs.empty()) {
+ // global isel
+ if (Arg.hasAttribute("amdgpu-hidden-argument"))
+ ArgOffset += ImplicitArgOffset;
- LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
+ if (!allocPreloadKernArg(LastExplicitArgOffset, ArgOffset, AllocSize,
+ Arg.getArgNo(), MF, TRI, Info, CCInfo))
+ return; // No more available SGPRs
+ } else {
+ // DAG isel
+ for (; LocIdx < ArgLocs.size() &&
+ ArgLocs[LocIdx].getValNo() == Arg.getArgNo();
+ LocIdx++) {
+ CCValAssign &ArgLoc = ArgLocs[LocIdx];
+ assert(ArgLoc.isMemLoc());
+ uint64_t LocOffset = ArgLoc.getLocMemOffset();
+ unsigned LocSize = ArgLoc.getLocVT().getStoreSize();
+ if (Arg.hasAttribute("amdgpu-hidden-argument"))
+ LocOffset += ImplicitArgOffset;
+
+ if (!allocPreloadKernArg(LastExplicitArgOffset, LocOffset, LocSize,
+ LocIdx, MF, TRI, Info, CCInfo))
+ return; // No more available SGPRs
+ }
}
}
}
@@ -2937,7 +2968,7 @@ SDValue SITargetLowering::LowerFormalArguments(
allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
if (IsKernel && Subtarget->hasKernargPreload())
- allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
+ allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, MF, *TRI, *Info);
allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
} else if (!IsGraphics) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index c42366a1c04c8..36b5d1c922293 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -575,7 +575,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
void allocatePreloadKernArgSGPRs(CCState &CCInfo,
SmallVectorImpl<CCValAssign> &ArgLocs,
- const SmallVectorImpl<ISD::InputArg> &Ins,
MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const;
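A worked illustration of the packing rule in the new `allocPreloadKernArg` helper may help. The sketch below is a standalone model, not LLVM code: `commonAlignment` is folded into a lowest-set-bit trick, and the SGPR size and kernarg base alignment follow the constants in the diff.

```cpp
#include <cassert>
#include <cstdint>

// Mirrors the size/alignment math in allocPreloadKernArg above
// (SGPRSize = 4, kernarg base alignment 16). Illustrative only.
struct PreloadPlan {
  bool SharesPrevSGPR; // packed into the previous argument's SGPR
  unsigned NumSGPRs;   // SGPRs needed otherwise
};

PreloadPlan planPreload(uint64_t ArgOffset, unsigned ArgSize) {
  constexpr unsigned SGPRSize = 4;
  // commonAlignment(Align(16), ArgOffset): the largest power of two
  // dividing both 16 and the offset (offset 0 keeps the base alignment).
  uint64_t Alignment = ArgOffset == 0 ? 16 : (ArgOffset & (~ArgOffset + 1));
  if (Alignment > 16)
    Alignment = 16;
  // Sub-SGPR-sized, sub-SGPR-aligned args share the previous SGPR.
  if (ArgSize < SGPRSize && Alignment < SGPRSize)
    return {true, 0};
  // Otherwise round the size up to whole SGPRs.
  unsigned NumSGPRs = (ArgSize + SGPRSize - 1) / SGPRSize;
  return {false, NumSGPRs};
}

int main() {
  assert(planPreload(2, 2).SharesPrevSGPR); // i16 at offset 2 packs in
  assert(planPreload(4, 2).NumSGPRs == 1);  // i16 at offset 4 gets its own
  assert(planPreload(8, 8).NumSGPRs == 2);  // i64 at offset 8 needs two
  return 0;
}
```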
if (ArgLocs.empty()) {
  // global isel
  if (Arg.hasAttribute("amdgpu-hidden-argument"))
These shouldn't be structurally different this way; I would expect ArgLocs to be identically populated.
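To make the objection concrete, here is a minimal stand-in model (hypothetical `PartLoc` type, not LLVM's `CCValAssign`) of the two shapes the refactored function now has to handle:

```cpp
#include <cstdint>
#include <vector>

// Hypothetical stand-in for CCValAssign with just the fields used here.
struct PartLoc {
  unsigned ValNo;     // IR argument index this split part belongs to
  uint64_t MemOffset; // byte offset in the kernarg segment
  unsigned Size;      // store size of the part in bytes
};

// The refactored function walks per IR argument when ArgLocs is empty
// (GlobalISel) but per split part when it is populated (SelectionDAG):
// the structural difference the comment above objects to.
void walkPreloadArgs(unsigned NumIRArgs, const std::vector<PartLoc> &ArgLocs) {
  if (ArgLocs.empty()) {
    for (unsigned ArgNo = 0; ArgNo != NumIRArgs; ++ArgNo) {
      // recompute offset/size from the IR signature via the DataLayout
    }
  } else {
    for (const PartLoc &Loc : ArgLocs) {
      // use Loc.MemOffset / Loc.Size as recorded by
      // analyzeFormalArgumentsCompute
      (void)Loc;
    }
  }
}
```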
So we should probably also run AMDGPUTargetLowering::analyzeFormalArgumentsCompute for GISel to populate ArgLocs?
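For concreteness, a rough sketch of that suggestion, assuming the usual CCState setup on the GISel side; the InputArg construction is elided, and the exact call site and member visibility are assumptions, not something this PR does:

```cpp
// Hypothetical GISel-side setup, mirroring what the DAG path does in
// LowerFormalArguments before preloading.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

SmallVector<ISD::InputArg, 16> Ins;
// ... build Ins from the IR signature, as the DAG path does ...

// Populate ArgLocs the same way the DAG path does, then preload.
TLI.analyzeFormalArgumentsCompute(CCInfo, Ins);
TLI.allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, MF, *TRI, *Info);
```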
IIRC analyzeFormalArgumentsCompute is for kernels, and is the hacky interpretation of the IR signature as in-memory values. GlobalISel bypasses that, since this is mostly trying to reverse engineer what the original IR type was and getting it to match.
For arguments passed in registers, the situation should be simpler. Ideally the preloaded arguments would behave like normal arguments passed in registers for non-kernels
Let's just treat this PR as a minor refactor and work on the calling convention in a follow-up.
Follow-up to enable kernel argument preloading in GlobalISel: #134655