-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[AMDGPU][GlobalISel] Allow forming s16 U/SBFX pre-regbankselect #131309
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel Author: Pierre van Houtryve (Pierre-vh) ChangesMake s16 G_U/SBFX legal and widen them in RegBankSelect. Patch is 95.48 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131309.diff 7 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 6e611ebb4b625..23dd20b51e8e7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2068,10 +2068,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.minScalar(0, S32)
.lower();
+ // Only {S32, S32} or {S32, S64} should ever reach codegen.
+ // We allow S/UBFX for S16 so the combiner can form them before
+ // RegBankSelect, and RegBankSelect will then legalize them correctly.
getActionDefinitionsBuilder({G_SBFX, G_UBFX})
- .legalFor({{S32, S32}, {S64, S32}})
- .clampScalar(1, S32, S32)
- .clampScalar(0, S32, S64)
+ .legalFor({{S16, S16}, {S32, S32}, {S64, S32}})
+ .clampScalar(1, S16, S32)
+ .clampScalar(0, S16, S64)
.widenScalarToNextPow2(0)
.scalarize(0);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 27b86723ce474..ed0d52f6b2441 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1485,7 +1485,9 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
Register DstReg = MI.getOperand(0).getReg();
LLT Ty = MRI.getType(DstReg);
+ const LLT S64 = LLT::scalar(64);
const LLT S32 = LLT::scalar(32);
+ const LLT S16 = LLT::scalar(16);
unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
Register SrcReg = MI.getOperand(FirstOpnd).getReg();
@@ -1495,6 +1497,18 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
const RegisterBank *DstBank =
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
if (DstBank == &AMDGPU::VGPRRegBank) {
+ if (Ty == S16) {
+ ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
+ B.setInsertPt(B.getMBB(), MI);
+ LegalizerHelper Helper(B.getMF(), ApplyBank, B);
+
+ Helper.widenScalarDst(MI, S32);
+ Helper.widenScalarSrc(MI, S32, 1, AMDGPU::G_ANYEXT);
+ Helper.widenScalarSrc(MI, S32, 2, AMDGPU::G_ZEXT);
+ Helper.widenScalarSrc(MI, S32, 3, AMDGPU::G_ZEXT);
+ return true;
+ }
+
if (Ty == S32)
return true;
@@ -1554,6 +1568,11 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
+ if (Ty == S16) {
+ OffsetReg = B.buildAnyExtOrTrunc(S32, OffsetReg).getReg(0);
+ WidthReg = B.buildAnyExtOrTrunc(S32, WidthReg).getReg(0);
+ }
+
// Ensure the high bits are clear to insert the offset.
auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
@@ -1568,13 +1587,21 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
// TODO: It might be worth using a pseudo here to avoid scc clobber and
// register class constraints.
- unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
- (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
+ unsigned Opc = (Ty != S64) ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32)
+ : (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
- auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
+ Register BFEDst = DstReg;
+ if (Ty == S16) {
+ BFEDst = MRI.createGenericVirtualRegister(S32);
+ MRI.setRegBank(BFEDst, AMDGPU::SGPRRegBank);
+ }
+ auto MIB = B.buildInstr(Opc, {BFEDst}, {SrcReg, MergedInputs});
if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
llvm_unreachable("failed to constrain BFE");
+ if (BFEDst != DstReg)
+ B.buildZExtOrTrunc(DstReg, BFEDst);
+
MI.eraseFromParent();
return true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 07fcb02d98649..d2b600b04f9fc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -40,8 +40,7 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 7
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: s_and_b32 s2, s2, 0x7f
-; GFX8-NEXT: s_and_b32 s1, s1, 0x7f
-; GFX8-NEXT: s_lshr_b32 s1, s1, 1
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x60001
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT: v_mul_lo_u32 v1, v0, -7
@@ -70,8 +69,7 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 7
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: s_and_b32 s2, s2, 0x7f
-; GFX9-NEXT: s_and_b32 s1, s1, 0x7f
-; GFX9-NEXT: s_lshr_b32 s1, s1, 1
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x60001
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_mul_lo_u32 v1, v0, -7
@@ -99,8 +97,7 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
; GFX10: ; %bb.0:
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 7
; GFX10-NEXT: s_and_b32 s2, s2, 0x7f
-; GFX10-NEXT: s_and_b32 s1, s1, 0x7f
-; GFX10-NEXT: s_lshr_b32 s1, s1, 1
+; GFX10-NEXT: s_bfe_u32 s1, s1, 0x60001
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
@@ -129,40 +126,38 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
; GFX11: ; %bb.0:
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 7
; GFX11-NEXT: s_and_b32 s2, s2, 0x7f
-; GFX11-NEXT: s_and_b32 s1, s1, 0x7f
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: s_lshr_b32 s1, s1, 1
+; GFX11-NEXT: s_bfe_u32 s1, s1, 0x60001
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: v_mul_lo_u32 v1, v0, -7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_lo_u32 v1, v0, -7
; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX11-NEXT: v_mul_hi_u32 v0, s2, v0
-; GFX11-NEXT: v_mul_lo_u32 v0, v0, 7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_lo_u32 v0, v0, 7
; GFX11-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_nc_u32_e32 v1, -7, v0
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_nc_u32_e32 v1, -7, v0
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_sub_nc_u16 v1, 6, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0x7f, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v1, 0x7f, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshlrev_b16 v0, v0, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b16 v1, v1, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%result = call i7 @llvm.fshl.i7(i7 %lhs, i7 %rhs, i7 %amt)
@@ -205,8 +200,7 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v3, 7
; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GFX8-NEXT: v_and_b32_e32 v2, 0x7f, v2
-; GFX8-NEXT: v_and_b32_e32 v1, 0x7f, v1
-; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1
+; GFX8-NEXT: v_bfe_u32 v1, v1, 1, 6
; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX8-NEXT: v_mul_lo_u32 v4, v3, -7
@@ -235,8 +229,7 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, 7
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0x7f, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0x7f, v1
-; GFX9-NEXT: v_lshrrev_b16_e32 v1, 1, v1
+; GFX9-NEXT: v_bfe_u32 v1, v1, 1, 6
; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX9-NEXT: v_mul_lo_u32 v4, v3, -7
@@ -264,9 +257,8 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, 7
; GFX10-NEXT: v_and_b32_e32 v2, 0x7f, v2
-; GFX10-NEXT: v_and_b32_e32 v1, 0x7f, v1
+; GFX10-NEXT: v_bfe_u32 v1, v1, 1, 6
; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1
; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX10-NEXT: v_mul_lo_u32 v4, v3, -7
@@ -294,38 +286,37 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, 7
; GFX11-NEXT: v_and_b32_e32 v2, 0x7f, v2
-; GFX11-NEXT: v_and_b32_e32 v1, 0x7f, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_bfe_u32 v1, v1, 1, 6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GFX11-NEXT: v_mul_lo_u32 v4, v3, -7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_lo_u32 v4, v3, -7
; GFX11-NEXT: v_mul_hi_u32 v4, v3, v4
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v4
; GFX11-NEXT: v_mul_hi_u32 v3, v2, v3
-; GFX11-NEXT: v_mul_lo_u32 v3, v3, 7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_lo_u32 v3, v3, 7
; GFX11-NEXT: v_sub_nc_u32_e32 v2, v2, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_nc_u32_e32 v3, -7, v2
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_nc_u32_e32 v3, -7, v2
; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_sub_nc_u16 v3, 6, v2
; GFX11-NEXT: v_and_b32_e32 v2, 0x7f, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v3, 0x7f, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b16 v1, v3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call i7 @llvm.fshl.i7(i7 %lhs, i7 %rhs, i7 %amt)
@@ -345,10 +336,10 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX8-LABEL: s_fshl_i8:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b32 s1, s1, 0xff
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x70001
; GFX8-NEXT: s_and_b32 s3, s2, 7
-; GFX8-NEXT: s_lshr_b32 s1, s1, 1
; GFX8-NEXT: s_andn2_b32 s2, 7, s2
+; GFX8-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NEXT: s_lshl_b32 s0, s0, s3
; GFX8-NEXT: s_lshr_b32 s1, s1, s2
; GFX8-NEXT: s_or_b32 s0, s0, s1
@@ -356,10 +347,10 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX9-LABEL: s_fshl_i8:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s1, s1, 0xff
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x70001
; GFX9-NEXT: s_and_b32 s3, s2, 7
-; GFX9-NEXT: s_lshr_b32 s1, s1, 1
; GFX9-NEXT: s_andn2_b32 s2, 7, s2
+; GFX9-NEXT: s_and_b32 s1, 0xffff, s1
; GFX9-NEXT: s_lshl_b32 s0, s0, s3
; GFX9-NEXT: s_lshr_b32 s1, s1, s2
; GFX9-NEXT: s_or_b32 s0, s0, s1
@@ -367,10 +358,10 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX10-LABEL: s_fshl_i8:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b32 s1, s1, 0xff
+; GFX10-NEXT: s_bfe_u32 s1, s1, 0x70001
; GFX10-NEXT: s_and_b32 s3, s2, 7
-; GFX10-NEXT: s_lshr_b32 s1, s1, 1
; GFX10-NEXT: s_andn2_b32 s2, 7, s2
+; GFX10-NEXT: s_and_b32 s1, 0xffff, s1
; GFX10-NEXT: s_lshl_b32 s0, s0, s3
; GFX10-NEXT: s_lshr_b32 s1, s1, s2
; GFX10-NEXT: s_or_b32 s0, s0, s1
@@ -378,10 +369,10 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) {
;
; GFX11-LABEL: s_fshl_i8:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
+; GFX11-NEXT: s_bfe_u32 s1, s1, 0x70001
; GFX11-NEXT: s_and_b32 s3, s2, 7
-; GFX11-NEXT: s_lshr_b32 s1, s1, 1
; GFX11-NEXT: s_and_not1_b32 s2, 7, s2
+; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
; GFX11-NEXT: s_lshl_b32 s0, s0, s3
; GFX11-NEXT: s_lshr_b32 s1, s1, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -408,11 +399,10 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v3, 7, v2
-; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, 1
; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2
-; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_bfe_u32 v1, v1, 1, 7
; GFX8-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -421,11 +411,10 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v3, 7, v2
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
-; GFX9-NEXT: v_mov_b32_e32 v3, 1
; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2
-; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_bfe_u32 v1, v1, 1, 7
; GFX9-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -433,10 +422,9 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) {
; GFX10-LABEL: v_fshl_i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
-; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1
+; GFX10-NEXT: v_bfe_u32 v1, v1, 1, 7
; GFX10-NEXT: v_and_b32_e32 v3, 7, v3
; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0
; GFX10-NEXT: v_lshrrev_b16 v1, v3, v1
@@ -446,16 +434,14 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) {
; GFX11-LABEL: v_fshl_i8:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX11-NEXT: v_and_b32_e32 v2, 7, v2
+; GFX11-NEXT: v_bfe_u32 v1, v1, 1, 7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1
; GFX11-NEXT: v_and_b32_e32 v3, 7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b16 v1, v3, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 %amt)
@@ -463,42 +449,17 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) {
}
define amdgpu_ps i8 @s_fshl_i8_4(i8 inreg %lhs, i8 inreg %rhs) {
-; GFX6-LABEL: s_fshl_i8_4:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_lshl_b32 s0, s0, 4
-; GFX6-NEXT: s_bfe_u32 s1, s1, 0x40004
-; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: ; return to shader part epilog
-;
-; GFX8-LABEL: s_fshl_i8_4:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_and_b32 s1, s1, 0xff
-; GFX8-NEXT: s_lshl_b32 s0, s0, 4
-; GFX8-NEXT: s_lshr_b32 s1, s1, 4
-; GFX8-NEXT: s_or_b32 s0, s0, s1
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-LABEL: s_fshl_i8_4:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_and_b32 s1, s1, 0xff
-; GFX9-NEXT: s_lshl_b32 s0, s0, 4
-; GFX9-NEXT: s_lshr_b32 s1, s1, 4
-; GFX9-NEXT: s_or_b32 s0, s0, s1
-; GFX9-NEXT: ; return to shader part epilog
-;
-; GFX10-LABEL: s_fshl_i8_4:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_and_b32 s1, s1, 0xff
-; GFX10-NEXT: s_lshl_b32 s0, s0, 4
-; GFX10-NEXT: s_lshr_b32 s1, s1, 4
-; GFX10-NEXT: s_or_b32 s0, s0, s1
-; GFX10-NEXT: ; return to shader part epilog
+; GCN-LABEL: s_fshl_i8_4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_lshl_b32 s0, s0, 4
+; GCN-NEXT: s_bfe_u32 s1, s1, 0x40004
+; GCN-NEXT: s_or_b32 s0, s0, s1
+; GCN-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fshl_i8_4:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s1, s1, 0xff
; GFX11-NEXT: s_lshl_b32 s0, s0, 4
-; GFX11-NEXT: s_lshr_b32 s1, s1, 4
+; GFX11-NEXT: s_bfe_u32 s1, s1, 0x40004
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, s1
; GFX11-NEXT: ; return to shader part epilog
@@ -518,37 +479,33 @@ define i8 @v_fshl_i8_4(i8 %lhs, i8 %rhs) {
; GFX8-LABEL: v_fshl_i8_4:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, 4
; GFX8-NEXT: v_lshlrev_b16_e32 v0, 4, v0
-; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_bfe_u32 v1, v1, 4, 4
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshl_i8_4:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v2, 4
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 4, v0
-; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_bfe_u32 v1, v1, 4, 4
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fs...
[truncated]
|
ee917df
to
c30cc50
Compare
9b16990
to
c4187f7
Compare
c30cc50
to
090fa3e
Compare
c4187f7
to
10436c8
Compare
090fa3e
to
2dc7126
Compare
10436c8
to
1971322
Compare
2dc7126
to
d65db02
Compare
1971322
to
dd6d017
Compare
8aa7f8b
to
16cbcc2
Compare
16cbcc2
to
ed5f76f
Compare
33c1a83
to
53bc645
Compare
Make s16 G_U/SBFX legal and widen them in RegBankSelect. This allows the set of BFX formation combines to work on s16 types.
ed5f76f
to
597912a
Compare
Ping |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't have 16-bit BFE instructions, so I don't understand why you would do this
The BFX formation combines check legality, but if U/SBFX with s16 is illegal it won't do it I feel like having BFX work on many types is useful, even if codegen ends up generating it with ext/truncs. Matching all the possible shift/and combinations for bitfield extracts can get repetitive |
I don't think this is a good motivation. We shouldn't be artificially making operations legal. It's hiding the true cost of the operation. We should pick a preferred form in terms of legal operations |
Make s16 G_U/SBFX legal and widen them in RegBankSelect.
This allows the set of BFX formation combines to work on s16 types.