From 1baa89691a472afe572beabc4df59bc7dfc43adc Mon Sep 17 00:00:00 2001 From: Austin Kerbow Date: Tue, 6 May 2025 22:07:30 -0700 Subject: [PATCH] [AMDGPU] Optimize LDS DMA soft waitcnt This patch adds support for optimizing `S_WAITCNT_VMCNT_LDS_DMA_soft` pseudo instructions by analyzing whether they can be removed based on the absence of LDS DMA operations. These optimizations are a precursor to a dependent patch where these waitcnt pseudos will actually be emitted by the memory legalizer. Adding the waitcnt to the memory model first, without any optimization, would incur too large a performance penalty. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 13 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.h | 1 + llvm/lib/Target/AMDGPU/SOPInstructions.td | 1 + llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir | 170 +++++++++++++++++++ 4 files changed, 185 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 6f5083acd738d..7852c650a01d6 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1278,6 +1278,19 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( if (Opcode == AMDGPU::S_WAITCNT) { unsigned IEnc = II.getOperand(0).getImm(); AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc); + + // These pseudo waitcnt instructions are only needed to synchronize DS + // operations with direct LDS loads that use vmcnt. We can safely relax + // them when no outstanding direct LDS loads exist, even if other vmcnt + // events are pending. 
+ if (II.getOpcode() == AMDGPU::S_WAITCNT_VMCNT_LDS_DMA_soft) { + unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; + AMDGPU::Waitcnt LDSDMAWait; + ScoreBrackets.determineWait(LOAD_CNT, RegNo, LDSDMAWait); + if (LDSDMAWait.LoadCnt == ~0u) + OldWait.LoadCnt = ~0u; + } + if (TrySimplify) ScoreBrackets.simplifyWaitcnt(OldWait); Wait = Wait.combined(OldWait); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 4b97f58ce92b9..3980cc9c92fd2 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1010,6 +1010,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { static unsigned getNonSoftWaitcntOpcode(unsigned Opcode) { switch (Opcode) { case AMDGPU::S_WAITCNT_soft: + case AMDGPU::S_WAITCNT_VMCNT_LDS_DMA_soft: return AMDGPU::S_WAITCNT; case AMDGPU::S_WAITCNT_VSCNT_soft: return AMDGPU::S_WAITCNT_VSCNT; diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 3d3f1ba3f5170..a6a080a3574db 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1608,6 +1608,7 @@ let OtherPredicates = [HasImageInsts] in { def S_WAIT_DSCNT_soft : SOPP_Pseudo <"s_soft_wait_dscnt", (ins s16imm:$simm16), "$simm16">; def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">; } +def S_WAITCNT_VMCNT_LDS_DMA_soft : SOPP_Pseudo <"s_soft_waitcnt" , (ins SWaitCnt:$simm16), "$simm16">; def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_sethalt timm:$simm16)]>; diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir b/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir index 21372c06d3223..71e7a9f29689d 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir +++ b/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir @@ -117,3 +117,173 @@ body: | S_ENDPGM 0 ... + +# Soft waitcnt should be honored here. 
+# GCN-LABEL: name: buffer_load_dword_lds_ds_read_soft_wait +# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: S_WAITCNT 3952 +# vmcnt(0) +# GCN-NEXT: S_BARRIER +--- +name: buffer_load_dword_lds_ds_read_soft_wait +body: | + bb.0: + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + S_WAITCNT_VMCNT_LDS_DMA_soft 3952 + S_BARRIER + $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`) + S_ENDPGM 0 + +... + +# No need for waitcnt. +# GCN-LABEL: name: buffer_store_lds_dword_ds_read_soft_wait +# GCN: BUFFER_STORE_LDS_DWORD +# GCN-NEXT: S_BARRIER +--- +name: buffer_store_lds_dword_ds_read_soft_wait +body: | + bb.0: + $m0 = S_MOV_B32 0 + BUFFER_STORE_LDS_DWORD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(3) poison` + 4), (store (s32) into `ptr addrspace(1) poison` + 4) + S_WAITCNT_VMCNT_LDS_DMA_soft 3952 + S_BARRIER + $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`) + S_ENDPGM 0 + +... + +# Soft waitcnt should mean vmcnt(1) before the barrier and vmcnt(0) after. 
+# GCN-LABEL: name: series_of_buffer_load_dword_lds_ds_read_soft_wait +# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: S_WAITCNT 3953 +# vmcnt(1) +# GCN-NEXT: S_BARRIER +# GCN-NEXT: S_WAITCNT 3952 +# vmcnt(0) +# GCN-NEXT: DS_READ_B32_gfx9 +--- +name: series_of_buffer_load_dword_lds_ds_read_soft_wait +body: | + bb.0: + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`) + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8) + S_WAITCNT_VMCNT_LDS_DMA_soft 3953 + S_BARRIER + $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`) + S_ENDPGM 0 + +... + +# No waitcnt before the barrier because counter is too high +# GCN-LABEL: name: buffer_load_dword_lds_ds_read_soft_wait_redundant +# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: S_BARRIER +# GCN-NEXT: S_WAITCNT 3952 +# vmcnt(0) +# GCN-NEXT: DS_READ_B32_gfx9 +--- +name: buffer_load_dword_lds_ds_read_soft_wait_redundant +body: | + bb.0: + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + S_WAITCNT_VMCNT_LDS_DMA_soft 3953 + S_BARRIER + $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`) + S_ENDPGM 0 + +... 
+ +# Combine waitcnt. +# GCN-LABEL: name: series_of_buffer_load_dword_lds_ds_read_soft_wait_repeat +# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: S_WAITCNT 3953 +# vmcnt(1) +# GCN-NEXT: S_BARRIER +# GCN-NEXT: S_WAITCNT 3952 +# vmcnt(0) +# GCN-NEXT: DS_READ_B32_gfx9 +--- +name: series_of_buffer_load_dword_lds_ds_read_soft_wait_repeat +body: | + bb.0: + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`) + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8) + S_WAITCNT_VMCNT_LDS_DMA_soft 3953 + S_WAITCNT_VMCNT_LDS_DMA_soft 3953 + S_BARRIER + S_WAITCNT_VMCNT_LDS_DMA_soft 3953 + S_WAITCNT_VMCNT_LDS_DMA_soft 3953 + $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`) + S_ENDPGM 0 + +... + +# Merge waitcnt. 
+# GCN-LABEL: name: series_of_buffer_load_dword_lds_ds_read_soft_wait_merge +# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: S_WAITCNT 3953 +# vmcnt(1) +# GCN-NEXT: S_BARRIER +# GCN-NEXT: S_WAITCNT 3952 +# vmcnt(0) +# GCN-NEXT: DS_READ_B32_gfx9 +--- +name: series_of_buffer_load_dword_lds_ds_read_soft_wait_merge +body: | + bb.0: + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`) + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8) + S_WAITCNT_VMCNT_LDS_DMA_soft 3954 + S_WAITCNT_VMCNT_LDS_DMA_soft 3953 + S_BARRIER + S_WAITCNT_VMCNT_LDS_DMA_soft 3952 + S_WAITCNT_VMCNT_LDS_DMA_soft 3952 + $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`) + S_ENDPGM 0 + +... + + +# Handle the preexisting waitcnt. 
+# GCN-LABEL: name: series_of_buffer_load_dword_lds_ds_read_soft_wait_preexisting +# GCN: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: S_WAITCNT 0 +# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN +# GCN-NEXT: S_BARRIER +# GCN-NEXT: S_WAITCNT 3952 +# vmcnt(0) +# GCN-NEXT: DS_READ_B32_gfx9 +--- +name: series_of_buffer_load_dword_lds_ds_read_soft_wait_preexisting +body: | + bb.0: + $m0 = S_MOV_B32 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`) + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4) + S_WAITCNT 0 + BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 8), (store (s32) into `ptr addrspace(3) poison` + 8) + S_WAITCNT_VMCNT_LDS_DMA_soft 3953 + S_WAITCNT_VMCNT_LDS_DMA_soft 3953 + S_BARRIER + S_WAITCNT_VMCNT_LDS_DMA_soft 3953 + S_WAITCNT_VMCNT_LDS_DMA_soft 3953 + $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`) + S_ENDPGM 0 + +...