[AMDGPU] Extend SRA i64 simplification for shift amts in range [33:62] #138913
Conversation
Signed-off-by: John Lu <[email protected]>
@llvm/pr-subscribers-backend-amdgpu

Author: None (LU-JOHN)

Changes

Extend sra i64 simplification to shift constants in range [33:62]. Shift amounts 32 and 63 were already handled.

Patch is 287.10 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/138913.diff

15 Files Affected:
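As a quick illustration of the rewritten rule before the patch itself, the following is a host-side sketch (not part of the patch) checking that, for every shift amount C in [32:63], the pair (sra hi_32(x), C - 32) for the low word and (sra hi_32(x), 31) for the high word reproduces a plain 64-bit arithmetic shift. It assumes `>>` on signed integers is an arithmetic shift, which C++20 guarantees and common targets provide.

// Sketch only, not part of the patch. Verifies the identity the combine relies on:
//   (sra i64:x, C) == build_pair (sra hi_32(x), C - 32), (sra hi_32(x), 31)  for 32 <= C <= 63
#include <cassert>
#include <cstdint>

int main() {
  const int64_t Vals[] = {0, -1, 42, INT64_MIN, INT64_MAX,
                          static_cast<int64_t>(0xFFFFFFF123456789ULL)};
  for (int64_t X : Vals) {
    const int32_t Hi = static_cast<int32_t>(X >> 32);               // hi_32(x)
    for (int C = 32; C <= 63; ++C) {
      const uint32_t Lo32 = static_cast<uint32_t>(Hi >> (C - 32));  // low word of the result
      const uint32_t Hi32 = static_cast<uint32_t>(Hi >> 31);        // high word: all sign bits
      const int64_t Rebuilt =
          static_cast<int64_t>((static_cast<uint64_t>(Hi32) << 32) | Lo32);
      assert(Rebuilt == (X >> C));  // matches the plain 64-bit arithmetic shift
    }
  }
  return 0;
}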
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index b6023b4f3fbcf..2238d7dffb742 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4153,22 +4153,23 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
SDLoc SL(N);
unsigned RHSVal = RHS->getZExtValue();
- // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
- if (RHSVal == 32) {
+ // For C >= 32
+ // (sra i64:x, C) -> build_pair (sra hi_32(x), C - 32), (sra hi_32(x), 31)
+ if (32 <= RHSVal) {
SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
- SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
- DAG.getConstant(31, SL, MVT::i32));
-
- SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
- return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
- }
+ SDValue HiShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
+ DAG.getConstant(31, SL, MVT::i32));
+ SDValue LoShift;
+
+ if (RHSVal == 63)
+ LoShift = HiShift;
+ else if (RHSVal == 32)
+ LoShift = Hi;
+ else
+ LoShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
+ DAG.getConstant(RHSVal - 32, SL, MVT::i32));
- // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
- if (RHSVal == 63) {
- SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
- SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
- DAG.getConstant(31, SL, MVT::i32));
- SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
+ SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {LoShift, HiShift});
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
}
diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll
index 933c6506d0270..613fdf388c0f1 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcomb-mullohi.ll
@@ -150,9 +150,9 @@ define i32 @mul_one_bit_hi_hi_u32_lshr_ashr(i32 %arg, i32 %arg1, ptr %arg2) {
; CHECK-LABEL: mul_one_bit_hi_hi_u32_lshr_ashr:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_mul_hi_u32 v4, v1, v0
-; CHECK-NEXT: v_ashrrev_i64 v[0:1], 33, v[3:4]
-; CHECK-NEXT: flat_store_dword v[2:3], v4
+; CHECK-NEXT: v_mul_hi_u32 v0, v1, v0
+; CHECK-NEXT: flat_store_dword v[2:3], v0
+; CHECK-NEXT: v_ashrrev_i32_e32 v0, 1, v0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 06c0417211809..51398a45055eb 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -4398,9 +4398,10 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) {
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[2:3]
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v4
-; GFX9-NEXT: v_ashrrev_i64 v[2:3], 33, v[2:3]
-; GFX9-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v4
+; GFX9-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX9-NEXT: v_ashrrev_i32_e32 v2, 1, v3
+; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-O0-LABEL: v_sdiv_i128_v_pow2k:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 817c5def5614f..f795152dbe66e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -1643,13 +1643,13 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT: s_ashr_i32 s8, s4, 16
-; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[6:7], s[4:5], 48
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s6, s5, 16
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s4, 16
; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s5, s5
; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s6
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
@@ -1666,13 +1666,13 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-HSA-NEXT: s_ashr_i64 s[0:1], s[2:3], 48
-; GCN-HSA-NEXT: s_ashr_i32 s4, s2, 16
-; GCN-HSA-NEXT: s_sext_i32_i16 s1, s3
+; GCN-HSA-NEXT: s_ashr_i32 s0, s3, 16
+; GCN-HSA-NEXT: s_ashr_i32 s1, s2, 16
+; GCN-HSA-NEXT: s_sext_i32_i16 s3, s3
; GCN-HSA-NEXT: s_sext_i32_i16 s2, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT: v_mov_b32_e32 v1, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
@@ -6213,19 +6213,20 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, s5
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s4, 16
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[4:5], 0x100000
-; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 48
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s9, s5, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s12, s5, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[6:7], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x100000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s7
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
@@ -6240,24 +6241,25 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_mov_b32 s4, s3
; GCN-HSA-NEXT: s_lshr_b32 s6, s2, 16
+; GCN-HSA-NEXT: s_ashr_i32 s7, s3, 31
; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[2:3], 0x100000
-; GCN-HSA-NEXT: s_ashr_i64 s[2:3], s[2:3], 48
-; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
+; GCN-HSA-NEXT: s_ashr_i32 s10, s3, 16
+; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[6:7], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
-; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
-; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT: s_add_u32 s4, s0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
+; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
@@ -6579,33 +6581,35 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 16
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[4:5], 0x100000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[6:7], 0x100000
-; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 48
-; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[6:7], s[6:7], 48
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s7
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s13, s5, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s5, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[10:11], 0x100000
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s7, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s7, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[14:15], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[12:13], 0x100000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s16
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s17
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s14
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s15
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s8
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s9
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
@@ -6613,8 +6617,8 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
-; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -6622,13 +6626,15 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_mov_b32 s8, s5
; GCN-HSA-NEXT: s_lshr_b32 s10, s6, 16
; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 16
+; GCN-HSA-NEXT: s_ashr_i32 s13, s5, 16
; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x100000
-; GCN-HSA-NEXT: s_ashr_i64 s[4:5], s[4:5], 48
-; GCN-HSA-NEXT: s_ashr_i64 s[6:7], s[6:7], 48
-; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
+; GCN-HSA-NEXT: s_ashr_i32 s6, s5, 31
+; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[12:13], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000
; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
+; GCN-HSA-NEXT: s_ashr_i32 s12, s7, 31
+; GCN-HSA-NEXT: s_ashr_i32 s7, s7, 16
; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
@@ -6637,8 +6643,8 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s12
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
@@ -6646,8 +6652,8 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s6
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
@@ -6660,8 +6666,8 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15
-; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13
+; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
@@ -7185,59 +7191,63 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
; GCN-NOHSA-SI-LABEL: constant_sextload_v16i16_to_v16i64:
; GCN-NOHSA-SI: ; %bb.0:
-; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, 0xf000
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s11
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, s9
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s16, s7
-; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s5
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s10, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s8, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s6, 16
-; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s4, 16
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[4:5], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[6:7], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[8:9], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[10:11], 0x100000
-; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 48
-; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[6:7], s[6:7], 48
-; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[8:9], s[8:9], 48
-; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[10:11], s[10:11], 48
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000
-; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s13
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s7
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, s5
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s16, s3
+; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s1
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s6, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s4, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s2, 16
+; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s0, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[0:1], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x100000
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s1, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s1, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[0:1], s[18:19], 0x100000
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s3, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s3, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[16:17], 0x100000
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s5, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x100000
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s7, 31
+; GCN-NOHSA-SI-NEXT: s_ashr_i32 s40, s7, 16
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[12:13], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[24:25], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[22:23], 0x100000
+; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[20:21], 0x100000
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s40
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s15
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:80
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s7
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5
-; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23
+; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21
+; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0...
[truncated]
Extend sra i64 simplification to shift constants in range [33:62]. Shift amounts 32 and 63 were already handled.
New tests for shift amounts 33 and 62 were added in sra.ll. The changes to the other test files adapt existing expected output to this extension.
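For reference, here is a tiny standalone model of how the updated combine chooses the low-word shift for each amount, covering the boundary amounts 32 and 63 that were already handled and the newly covered 33 and 62. This is illustrative only; loWordOfAshr64 is a hypothetical helper, not code from the patch, and it assumes arithmetic `>>` on signed integers.

// Illustrative model of the LoShift selection in the updated combine; not patch code.
#include <cassert>
#include <cstdint>

// Low 32 bits of ashr(i64 x, C), computed only from Hi = hi_32(x), for 32 <= C <= 63.
static int32_t loWordOfAshr64(int32_t Hi, unsigned C) {
  assert(C >= 32 && C <= 63);
  if (C == 63)
    return Hi >> 31;      // reuse the all-sign word (previously handled case)
  if (C == 32)
    return Hi;            // low word is hi_32(x) unchanged (previously handled case)
  return Hi >> (C - 32);  // new case for 33..62: one extra 32-bit sra
}

int main() {
  const int64_t X = static_cast<int64_t>(0x8000000012345678ULL);  // negative test value
  const int32_t Hi = static_cast<int32_t>(X >> 32);
  for (unsigned C : {32u, 33u, 62u, 63u})
    assert(loWordOfAshr64(Hi, C) == static_cast<int32_t>(X >> C));
  return 0;
}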