[Codegen] Remove redundant instruction using machinelateCleanup #139716

rohitaggarwal007 · 2025-05-13T12:03:42Z

Remove redundant instructions after PostRegalloc.

llvmbot · 2025-05-13T12:04:13Z

@llvm/pr-subscribers-backend-x86

@llvm/pr-subscribers-backend-aarch64

Author: Rohit Aggarwal (rohitaggarwal007)

Changes

Remove redundant pseudo mov instructions after PostRegalloc.

Patch is 52.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/139716.diff

33 Files Affected:

(modified) llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp (+3-1)
(modified) llvm/test/CodeGen/AArch64/implicit-def-subreg-to-reg-regression.ll (-1)
(modified) llvm/test/CodeGen/AMDGPU/call-waitcnt.ll (-1)
(modified) llvm/test/CodeGen/AMDGPU/captured-frame-index.ll (+1-2)
(modified) llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll (-1)
(modified) llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll (-1)
(modified) llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll (+1-2)
(modified) llvm/test/CodeGen/AMDGPU/div_v2i128.ll (-12)
(modified) llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll (-2)
(modified) llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll (+14-14)
(modified) llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll (-2)
(modified) llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll (-2)
(modified) llvm/test/CodeGen/AMDGPU/required-export-priority.ll (-1)
(modified) llvm/test/CodeGen/AMDGPU/sibling-call.ll (+6-9)
(modified) llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll (-8)
(modified) llvm/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll (-1)
(modified) llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll (-2)
(modified) llvm/test/CodeGen/X86/avx-load-store.ll (-1)
(modified) llvm/test/CodeGen/X86/avx512-i1test.ll (+7-7)
(modified) llvm/test/CodeGen/X86/isel-brcond-fcmp.ll (+94-12)
(modified) llvm/test/CodeGen/X86/isel-brcond-icmp.ll (+56-176)
(modified) llvm/test/CodeGen/X86/pr36602.ll (-1)
(modified) llvm/test/CodeGen/X86/pr38795.ll (-1)
(modified) llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll (-1)
(modified) llvm/test/CodeGen/X86/scheduler-backtracking.ll (-5)
(modified) llvm/test/CodeGen/X86/tail-opts.ll (-5)
(modified) llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll (+1-5)
(modified) llvm/test/CodeGen/X86/x86-cmov-converter.ll (-2)
(modified) llvm/test/CodeGen/X86/zext-extract_subreg.ll (-3)
(modified) llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected (-1)
(modified) llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected (-1)
(modified) llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_generated_funcs.ll.generated.expected (-1)
(modified) llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_generated_funcs.ll.nogenerated.expected (-1)

diff --git a/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp b/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp
index c8c8ed99d93ea..c3b6115338f05 100644
--- a/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp
+++ b/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp
@@ -186,11 +186,13 @@ static bool isCandidate(const MachineInstr *MI, Register &DefedReg,
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
     if (MO.isReg()) {
-      if (MO.isDef()) {
+      if (MO.isDef() && DefedReg == MCRegister::NoRegister) {
         if (i == 0 && !MO.isImplicit() && !MO.isDead())
           DefedReg = MO.getReg();
         else
           return false;
+      } else if (MI->isPseudo() && MI->isMoveImmediate()) {
+        return DefedReg.isValid();
       } else if (MO.getReg() && MO.getReg() != FrameReg)
         return false;
     } else if (!(MO.isImm() || MO.isCImm() || MO.isFPImm() || MO.isCPI() ||
diff --git a/llvm/test/CodeGen/AArch64/implicit-def-subreg-to-reg-regression.ll b/llvm/test/CodeGen/AArch64/implicit-def-subreg-to-reg-regression.ll
index 0f208f8ed9052..08346f1a857eb 100644
--- a/llvm/test/CodeGen/AArch64/implicit-def-subreg-to-reg-regression.ll
+++ b/llvm/test/CodeGen/AArch64/implicit-def-subreg-to-reg-regression.ll
@@ -80,7 +80,6 @@ define void @widget(i32 %arg, i32 %arg1, ptr %arg2, ptr %arg3, ptr %arg4, i32 %a
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    mov x0, xzr
 ; CHECK-NEXT:    mov x1, xzr
-; CHECK-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-NEXT:    stp xzr, xzr, [sp]
 ; CHECK-NEXT:    stp x8, xzr, [sp, #16]
 ; CHECK-NEXT:    bl _fprintf
diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
index 9abb50651146a..6889110752ae0 100644
--- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
@@ -41,7 +41,6 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 {
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    global_store_dword v0, v0, s[6:7]
 ; GCN-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GCN-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
index 963b3a55259fa..852ffd5fa7183 100644
--- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -113,8 +113,7 @@ define amdgpu_kernel void @stored_fi_to_fi() #0 {
 
 ; GCN-LABEL: {{^}}stored_fi_to_global:
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
-; GCN: buffer_store_dword [[FI]]
+; GCN: buffer_store_dword v{{[0-9]+}}
 define amdgpu_kernel void @stored_fi_to_global(ptr addrspace(1) %ptr) #0 {
   %tmp = alloca float, addrspace(5)
   store float 0.0, ptr  addrspace(5) %tmp
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
index 81f768f303ca1..98771dcb441cc 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
@@ -31,7 +31,6 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(ptr add
 ; GCN-NEXT:    v_cmpx_ne_u32_e32 0, v1
 ; GCN-NEXT:    s_cbranch_execz .LBB0_2
 ; GCN-NEXT:  ; %bb.1: ; %if
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 2
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    global_atomic_csub v0, v0, v1, s[2:3] offset:28 glc
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
index 49370e2fbf1b6..d212d7d52d841 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
@@ -33,7 +33,6 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(ptr add
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_cbranch_execz .LBB0_2
 ; GCN-NEXT:  ; %bb.1: ; %if
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    global_atomic_add_f32 v0, v1, s[2:3] offset:28
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
index 0f9407b77aa83..8f92ee42c066a 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -72,8 +72,7 @@ done:
 ; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset:
 ; GCN: s_and_saveexec_b64
 ; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
-; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GFX9: global_load_sbyte {{v[0-9]+}}, [[ZERO]], {{s\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
+; GFX9: global_load_sbyte {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
 ; GCN: {{^}}.LBB2_2:
 ; GCN: s_or_b64 exec
 define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(ptr addrspace(1) %out, ptr addrspace(1) %in) {
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 77b78f1f8a333..a4ae669617263 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -323,8 +323,6 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_subrev_i32_e32 v36, vcc, 64, v30
 ; SDAG-NEXT:    v_lshr_b64 v[37:38], v[6:7], v30
 ; SDAG-NEXT:    v_add_i32_e32 v34, vcc, -1, v29
-; SDAG-NEXT:    v_mov_b32_e32 v12, 0
-; SDAG-NEXT:    v_mov_b32_e32 v13, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v14, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v15, 0
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0
@@ -1107,8 +1105,6 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_subrev_i32_e32 v28, vcc, 64, v22
 ; SDAG-NEXT:    v_lshr_b64 v[29:30], v[6:7], v22
 ; SDAG-NEXT:    v_add_i32_e32 v26, vcc, -1, v12
-; SDAG-NEXT:    v_mov_b32_e32 v20, 0
-; SDAG-NEXT:    v_mov_b32_e32 v21, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v10, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v11, 0
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0
@@ -1679,8 +1675,6 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_subrev_i32_e32 v37, vcc, 64, v32
 ; SDAG-NEXT:    v_lshr_b64 v[24:25], v[0:1], v32
 ; SDAG-NEXT:    v_add_i32_e32 v36, vcc, -1, v31
-; SDAG-NEXT:    v_mov_b32_e32 v18, 0
-; SDAG-NEXT:    v_mov_b32_e32 v19, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v22, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v23, 0
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0
@@ -1874,8 +1868,6 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_subrev_i32_e32 v51, vcc, 64, v38
 ; SDAG-NEXT:    v_lshr_b64 v[22:23], v[4:5], v38
 ; SDAG-NEXT:    v_add_i32_e32 v50, vcc, -1, v37
-; SDAG-NEXT:    v_mov_b32_e32 v18, 0
-; SDAG-NEXT:    v_mov_b32_e32 v19, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v20, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v21, 0
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0
@@ -2562,8 +2554,6 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_subrev_i32_e32 v35, vcc, 64, v30
 ; SDAG-NEXT:    v_lshr_b64 v[26:27], v[2:3], v30
 ; SDAG-NEXT:    v_add_i32_e32 v34, vcc, -1, v8
-; SDAG-NEXT:    v_mov_b32_e32 v20, 0
-; SDAG-NEXT:    v_mov_b32_e32 v21, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v24, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v25, 0
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0
@@ -2737,8 +2727,6 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_subrev_i32_e32 v39, vcc, 64, v34
 ; SDAG-NEXT:    v_lshr_b64 v[26:27], v[6:7], v34
 ; SDAG-NEXT:    v_add_i32_e32 v38, vcc, -1, v12
-; SDAG-NEXT:    v_mov_b32_e32 v22, 0
-; SDAG-NEXT:    v_mov_b32_e32 v23, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v24, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v25, 0
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0
diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
index 13884eb788d8a..a3c7eb8c56fb0 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
@@ -53,7 +53,6 @@ define void @callee_with_stack_and_call() #0 {
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[4:5]
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[4:5], exec
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    v_writelane_b32 v0, s31, 0
@@ -77,7 +76,6 @@ define void @callee_with_stack_and_call() #0 {
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[4:5]
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[4:5], exec
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index 6384fdba7a45a..e8aa64f0b8f15 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -3234,20 +3234,20 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
-; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 0
-; GFX11-NEXT:    v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, 0
-; GFX11-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v8, 0
-; GFX11-NEXT:    v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v10, 0
-; GFX11-NEXT:    v_dual_mov_b32 v13, 0 :: v_dual_mov_b32 v12, 0
-; GFX11-NEXT:    v_dual_mov_b32 v15, 0 :: v_dual_mov_b32 v14, 0
-; GFX11-NEXT:    v_dual_mov_b32 v17, 0 :: v_dual_mov_b32 v16, 0
-; GFX11-NEXT:    v_dual_mov_b32 v19, 0 :: v_dual_mov_b32 v18, 0
-; GFX11-NEXT:    v_dual_mov_b32 v21, 0 :: v_dual_mov_b32 v20, 0
-; GFX11-NEXT:    v_dual_mov_b32 v23, 0 :: v_dual_mov_b32 v22, 0
-; GFX11-NEXT:    v_dual_mov_b32 v25, 0 :: v_dual_mov_b32 v24, 0
-; GFX11-NEXT:    v_dual_mov_b32 v27, 0 :: v_dual_mov_b32 v26, 0
-; GFX11-NEXT:    v_dual_mov_b32 v29, 0 :: v_dual_mov_b32 v28, 0
-; GFX11-NEXT:    v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0
+; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v6, 0
+; GFX11-NEXT:    v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v8, 0
+; GFX11-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 0
+; GFX11-NEXT:    v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v12, 0
+; GFX11-NEXT:    v_dual_mov_b32 v13, 0 :: v_dual_mov_b32 v14, 0
+; GFX11-NEXT:    v_dual_mov_b32 v15, 0 :: v_dual_mov_b32 v16, 0
+; GFX11-NEXT:    v_dual_mov_b32 v17, 0 :: v_dual_mov_b32 v18, 0
+; GFX11-NEXT:    v_dual_mov_b32 v19, 0 :: v_dual_mov_b32 v20, 0
+; GFX11-NEXT:    v_dual_mov_b32 v21, 0 :: v_dual_mov_b32 v22, 0
+; GFX11-NEXT:    v_dual_mov_b32 v23, 0 :: v_dual_mov_b32 v24, 0
+; GFX11-NEXT:    v_dual_mov_b32 v25, 0 :: v_dual_mov_b32 v26, 0
+; GFX11-NEXT:    v_dual_mov_b32 v27, 0 :: v_dual_mov_b32 v28, 0
+; GFX11-NEXT:    v_dual_mov_b32 v29, 0 :: v_dual_mov_b32 v30, 0
+; GFX11-NEXT:    v_mov_b32_e32 v31, 0
 ; GFX11-NEXT:    s_mov_b32 s1, return_72xi32@abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, return_72xi32@abs32@lo
 ; GFX11-NEXT:    v_writelane_b32 v60, s31, 1
diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
index a3ebaec4811a9..97b7e26d1230b 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
@@ -225,12 +225,10 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out
 ; MUBUF-NEXT:  ; %bb.2: ; %split
 ; MUBUF-NEXT:    v_mov_b32_e32 v1, 0x4000
 ; MUBUF-NEXT:    v_or_b32_e32 v0, 0x12d4, v1
-; MUBUF-NEXT:    v_mov_b32_e32 v1, 0x4000
 ; MUBUF-NEXT:    s_movk_i32 s4, 0x4000
 ; MUBUF-NEXT:    buffer_load_dword v5, v0, s[0:3], 0 offen glc
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    v_or_b32_e32 v0, 0x12d0, v1
-; MUBUF-NEXT:    v_mov_b32_e32 v1, 0x4000
 ; MUBUF-NEXT:    s_or_b32 s4, s4, 0x12c0
 ; MUBUF-NEXT:    buffer_load_dword v4, v0, s[0:3], 0 offen glc
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
index 2d95ec6f699dc..3d6e7c532348f 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
@@ -395,7 +395,6 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
 ; GFX908-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:168
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    s_mov_b64 exec, s[16:17]
-; GFX908-NEXT:    s_mov_b64 s[16:17], exec
 ; GFX908-NEXT:    s_mov_b64 exec, 1
 ; GFX908-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:168
 ; GFX908-NEXT:    v_writelane_b32 v2, s31, 0
@@ -743,7 +742,6 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
 ; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:168
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX908-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX908-NEXT:    s_mov_b64 exec, 1
 ; GFX908-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:168
 ; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/required-export-priority.ll b/llvm/test/CodeGen/AMDGPU/required-export-priority.ll
index 4aa1ddee2efe3..e145380dca59e 100644
--- a/llvm/test/CodeGen/AMDGPU/required-export-priority.ll
+++ b/llvm/test/CodeGen/AMDGPU/required-export-priority.ll
@@ -267,7 +267,6 @@ define amdgpu_ps void @test_export_across_store_load(i32 %idx, float %v) #0 {
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 16, v2, vcc_lo
-; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    scratch_store_b32 v0, v1, off
 ; GCN-NEXT:    scratch_load_b32 v0, off, off
 ; GCN-NEXT:    v_mov_b32_e32 v1, 1.0
diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
index 35234236b848f..ddb7d6b9c3936 100644
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -388,11 +388,7 @@ define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
 ; GCN-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32
-; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4
-; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
@@ -423,6 +419,9 @@ define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
 ; GCN-NEXT:    v_mov_b32_e32 v29, 0
 ; GCN-NEXT:    v_mov_b32_e32 v30, 0
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GCN-NEXT:    v_readlane_b32 s31, v40, 1
@@ -528,10 +527,6 @@ define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg
 ; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:48
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32
-; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4
-; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
-; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    v_mov_b32_e32 v5, 0
@@ -560,6 +555,9 @@ define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg
 ; GCN-NEXT:    v_mov_b32_e32 v28, 0
 ; GCN-NEXT:    v_mov_b32_e32 v29, 0
 ; GCN-NEXT:    v_mov_b32_e32 v30, 0
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
 entry:
@@ -928,7 +926,6 @@ define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64
 ; GCN-NEXT:    s_add_u32 s16, s16, void_fastcc_byval_and_stack_passed@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, void_fastcc_byval_and_stack_passed@rel32@hi+12
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index 50056b62b3397..e9aebeef6ea6d 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -9971,7 +9971,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_mov_b32 s34, 0x80c00
@@ -9989,7 +9988,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10007,7 +10005,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_mov_b32 s34, 0x81400
@@ -10025,7 +10022,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10043,7 +10039,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_mov_b32 s34, 0x81c00
@@ -10061,7 +10056,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10105,7 +10099,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr ...
[truncated]

llvmbot · 2025-05-13T12:04:14Z

@llvm/pr-subscribers-backend-amdgpu

Author: Rohit Aggarwal (rohitaggarwal007)

Changes

Remove redundant pseudo mov instructions after PostRegalloc.

Patch is 52.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/139716.diff

33 Files Affected:

(modified) llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp (+3-1)
(modified) llvm/test/CodeGen/AArch64/implicit-def-subreg-to-reg-regression.ll (-1)
(modified) llvm/test/CodeGen/AMDGPU/call-waitcnt.ll (-1)
(modified) llvm/test/CodeGen/AMDGPU/captured-frame-index.ll (+1-2)
(modified) llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll (-1)
(modified) llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll (-1)
(modified) llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll (+1-2)
(modified) llvm/test/CodeGen/AMDGPU/div_v2i128.ll (-12)
(modified) llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll (-2)
(modified) llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll (+14-14)
(modified) llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll (-2)
(modified) llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll (-2)
(modified) llvm/test/CodeGen/AMDGPU/required-export-priority.ll (-1)
(modified) llvm/test/CodeGen/AMDGPU/sibling-call.ll (+6-9)
(modified) llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll (-8)
(modified) llvm/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll (-1)
(modified) llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll (-2)
(modified) llvm/test/CodeGen/X86/avx-load-store.ll (-1)
(modified) llvm/test/CodeGen/X86/avx512-i1test.ll (+7-7)
(modified) llvm/test/CodeGen/X86/isel-brcond-fcmp.ll (+94-12)
(modified) llvm/test/CodeGen/X86/isel-brcond-icmp.ll (+56-176)
(modified) llvm/test/CodeGen/X86/pr36602.ll (-1)
(modified) llvm/test/CodeGen/X86/pr38795.ll (-1)
(modified) llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll (-1)
(modified) llvm/test/CodeGen/X86/scheduler-backtracking.ll (-5)
(modified) llvm/test/CodeGen/X86/tail-opts.ll (-5)
(modified) llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll (+1-5)
(modified) llvm/test/CodeGen/X86/x86-cmov-converter.ll (-2)
(modified) llvm/test/CodeGen/X86/zext-extract_subreg.ll (-3)
(modified) llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected (-1)
(modified) llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected (-1)
(modified) llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_generated_funcs.ll.generated.expected (-1)
(modified) llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_generated_funcs.ll.nogenerated.expected (-1)

diff --git a/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp b/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp
index c8c8ed99d93ea..c3b6115338f05 100644
--- a/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp
+++ b/llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp
@@ -186,11 +186,13 @@ static bool isCandidate(const MachineInstr *MI, Register &DefedReg,
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
     if (MO.isReg()) {
-      if (MO.isDef()) {
+      if (MO.isDef() && DefedReg == MCRegister::NoRegister) {
         if (i == 0 && !MO.isImplicit() && !MO.isDead())
           DefedReg = MO.getReg();
         else
           return false;
+      } else if (MI->isPseudo() && MI->isMoveImmediate()) {
+        return DefedReg.isValid();
       } else if (MO.getReg() && MO.getReg() != FrameReg)
         return false;
     } else if (!(MO.isImm() || MO.isCImm() || MO.isFPImm() || MO.isCPI() ||
diff --git a/llvm/test/CodeGen/AArch64/implicit-def-subreg-to-reg-regression.ll b/llvm/test/CodeGen/AArch64/implicit-def-subreg-to-reg-regression.ll
index 0f208f8ed9052..08346f1a857eb 100644
--- a/llvm/test/CodeGen/AArch64/implicit-def-subreg-to-reg-regression.ll
+++ b/llvm/test/CodeGen/AArch64/implicit-def-subreg-to-reg-regression.ll
@@ -80,7 +80,6 @@ define void @widget(i32 %arg, i32 %arg1, ptr %arg2, ptr %arg3, ptr %arg4, i32 %a
 ; CHECK-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    mov x0, xzr
 ; CHECK-NEXT:    mov x1, xzr
-; CHECK-NEXT:    mov w8, #1 ; =0x1
 ; CHECK-NEXT:    stp xzr, xzr, [sp]
 ; CHECK-NEXT:    stp x8, xzr, [sp, #16]
 ; CHECK-NEXT:    bl _fprintf
diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
index 9abb50651146a..6889110752ae0 100644
--- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
@@ -41,7 +41,6 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 {
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    global_store_dword v0, v0, s[6:7]
 ; GCN-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GCN-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
index 963b3a55259fa..852ffd5fa7183 100644
--- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -113,8 +113,7 @@ define amdgpu_kernel void @stored_fi_to_fi() #0 {
 
 ; GCN-LABEL: {{^}}stored_fi_to_global:
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
-; GCN: buffer_store_dword [[FI]]
+; GCN: buffer_store_dword v{{[0-9]+}}
 define amdgpu_kernel void @stored_fi_to_global(ptr addrspace(1) %ptr) #0 {
   %tmp = alloca float, addrspace(5)
   store float 0.0, ptr  addrspace(5) %tmp
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
index 81f768f303ca1..98771dcb441cc 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
@@ -31,7 +31,6 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(ptr add
 ; GCN-NEXT:    v_cmpx_ne_u32_e32 0, v1
 ; GCN-NEXT:    s_cbranch_execz .LBB0_2
 ; GCN-NEXT:  ; %bb.1: ; %if
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 2
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    global_atomic_csub v0, v0, v1, s[2:3] offset:28 glc
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
index 49370e2fbf1b6..d212d7d52d841 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
@@ -33,7 +33,6 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_fadd_f32(ptr add
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_cbranch_execz .LBB0_2
 ; GCN-NEXT:  ; %bb.1: ; %if
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    global_atomic_add_f32 v0, v1, s[2:3] offset:28
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
index 0f9407b77aa83..8f92ee42c066a 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -72,8 +72,7 @@ done:
 ; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset:
 ; GCN: s_and_saveexec_b64
 ; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
-; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GFX9: global_load_sbyte {{v[0-9]+}}, [[ZERO]], {{s\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
+; GFX9: global_load_sbyte {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
 ; GCN: {{^}}.LBB2_2:
 ; GCN: s_or_b64 exec
 define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(ptr addrspace(1) %out, ptr addrspace(1) %in) {
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 77b78f1f8a333..a4ae669617263 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -323,8 +323,6 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_subrev_i32_e32 v36, vcc, 64, v30
 ; SDAG-NEXT:    v_lshr_b64 v[37:38], v[6:7], v30
 ; SDAG-NEXT:    v_add_i32_e32 v34, vcc, -1, v29
-; SDAG-NEXT:    v_mov_b32_e32 v12, 0
-; SDAG-NEXT:    v_mov_b32_e32 v13, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v14, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v15, 0
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0
@@ -1107,8 +1105,6 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_subrev_i32_e32 v28, vcc, 64, v22
 ; SDAG-NEXT:    v_lshr_b64 v[29:30], v[6:7], v22
 ; SDAG-NEXT:    v_add_i32_e32 v26, vcc, -1, v12
-; SDAG-NEXT:    v_mov_b32_e32 v20, 0
-; SDAG-NEXT:    v_mov_b32_e32 v21, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v10, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v11, 0
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0
@@ -1679,8 +1675,6 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_subrev_i32_e32 v37, vcc, 64, v32
 ; SDAG-NEXT:    v_lshr_b64 v[24:25], v[0:1], v32
 ; SDAG-NEXT:    v_add_i32_e32 v36, vcc, -1, v31
-; SDAG-NEXT:    v_mov_b32_e32 v18, 0
-; SDAG-NEXT:    v_mov_b32_e32 v19, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v22, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v23, 0
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0
@@ -1874,8 +1868,6 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_subrev_i32_e32 v51, vcc, 64, v38
 ; SDAG-NEXT:    v_lshr_b64 v[22:23], v[4:5], v38
 ; SDAG-NEXT:    v_add_i32_e32 v50, vcc, -1, v37
-; SDAG-NEXT:    v_mov_b32_e32 v18, 0
-; SDAG-NEXT:    v_mov_b32_e32 v19, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v20, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v21, 0
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0
@@ -2562,8 +2554,6 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_subrev_i32_e32 v35, vcc, 64, v30
 ; SDAG-NEXT:    v_lshr_b64 v[26:27], v[2:3], v30
 ; SDAG-NEXT:    v_add_i32_e32 v34, vcc, -1, v8
-; SDAG-NEXT:    v_mov_b32_e32 v20, 0
-; SDAG-NEXT:    v_mov_b32_e32 v21, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v24, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v25, 0
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0
@@ -2737,8 +2727,6 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_subrev_i32_e32 v39, vcc, 64, v34
 ; SDAG-NEXT:    v_lshr_b64 v[26:27], v[6:7], v34
 ; SDAG-NEXT:    v_add_i32_e32 v38, vcc, -1, v12
-; SDAG-NEXT:    v_mov_b32_e32 v22, 0
-; SDAG-NEXT:    v_mov_b32_e32 v23, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v24, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v25, 0
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0
diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
index 13884eb788d8a..a3c7eb8c56fb0 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll
@@ -53,7 +53,6 @@ define void @callee_with_stack_and_call() #0 {
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[4:5]
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[4:5], exec
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    v_writelane_b32 v0, s31, 0
@@ -77,7 +76,6 @@ define void @callee_with_stack_and_call() #0 {
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    s_waitcnt vmcnt(0)
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, s[4:5]
-; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 s[4:5], exec
 ; NO-SPILL-TO-VGPR-NEXT:    s_mov_b64 exec, 1
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:16
 ; NO-SPILL-TO-VGPR-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index 6384fdba7a45a..e8aa64f0b8f15 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -3234,20 +3234,20 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
-; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 0
-; GFX11-NEXT:    v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, 0
-; GFX11-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v8, 0
-; GFX11-NEXT:    v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v10, 0
-; GFX11-NEXT:    v_dual_mov_b32 v13, 0 :: v_dual_mov_b32 v12, 0
-; GFX11-NEXT:    v_dual_mov_b32 v15, 0 :: v_dual_mov_b32 v14, 0
-; GFX11-NEXT:    v_dual_mov_b32 v17, 0 :: v_dual_mov_b32 v16, 0
-; GFX11-NEXT:    v_dual_mov_b32 v19, 0 :: v_dual_mov_b32 v18, 0
-; GFX11-NEXT:    v_dual_mov_b32 v21, 0 :: v_dual_mov_b32 v20, 0
-; GFX11-NEXT:    v_dual_mov_b32 v23, 0 :: v_dual_mov_b32 v22, 0
-; GFX11-NEXT:    v_dual_mov_b32 v25, 0 :: v_dual_mov_b32 v24, 0
-; GFX11-NEXT:    v_dual_mov_b32 v27, 0 :: v_dual_mov_b32 v26, 0
-; GFX11-NEXT:    v_dual_mov_b32 v29, 0 :: v_dual_mov_b32 v28, 0
-; GFX11-NEXT:    v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0
+; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v6, 0
+; GFX11-NEXT:    v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v8, 0
+; GFX11-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 0
+; GFX11-NEXT:    v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v12, 0
+; GFX11-NEXT:    v_dual_mov_b32 v13, 0 :: v_dual_mov_b32 v14, 0
+; GFX11-NEXT:    v_dual_mov_b32 v15, 0 :: v_dual_mov_b32 v16, 0
+; GFX11-NEXT:    v_dual_mov_b32 v17, 0 :: v_dual_mov_b32 v18, 0
+; GFX11-NEXT:    v_dual_mov_b32 v19, 0 :: v_dual_mov_b32 v20, 0
+; GFX11-NEXT:    v_dual_mov_b32 v21, 0 :: v_dual_mov_b32 v22, 0
+; GFX11-NEXT:    v_dual_mov_b32 v23, 0 :: v_dual_mov_b32 v24, 0
+; GFX11-NEXT:    v_dual_mov_b32 v25, 0 :: v_dual_mov_b32 v26, 0
+; GFX11-NEXT:    v_dual_mov_b32 v27, 0 :: v_dual_mov_b32 v28, 0
+; GFX11-NEXT:    v_dual_mov_b32 v29, 0 :: v_dual_mov_b32 v30, 0
+; GFX11-NEXT:    v_mov_b32_e32 v31, 0
 ; GFX11-NEXT:    s_mov_b32 s1, return_72xi32@abs32@hi
 ; GFX11-NEXT:    s_mov_b32 s0, return_72xi32@abs32@lo
 ; GFX11-NEXT:    v_writelane_b32 v60, s31, 1
diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
index a3ebaec4811a9..97b7e26d1230b 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
@@ -225,12 +225,10 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out
 ; MUBUF-NEXT:  ; %bb.2: ; %split
 ; MUBUF-NEXT:    v_mov_b32_e32 v1, 0x4000
 ; MUBUF-NEXT:    v_or_b32_e32 v0, 0x12d4, v1
-; MUBUF-NEXT:    v_mov_b32_e32 v1, 0x4000
 ; MUBUF-NEXT:    s_movk_i32 s4, 0x4000
 ; MUBUF-NEXT:    buffer_load_dword v5, v0, s[0:3], 0 offen glc
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    v_or_b32_e32 v0, 0x12d0, v1
-; MUBUF-NEXT:    v_mov_b32_e32 v1, 0x4000
 ; MUBUF-NEXT:    s_or_b32 s4, s4, 0x12c0
 ; MUBUF-NEXT:    buffer_load_dword v4, v0, s[0:3], 0 offen glc
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
index 2d95ec6f699dc..3d6e7c532348f 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
@@ -395,7 +395,6 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
 ; GFX908-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:168
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    s_mov_b64 exec, s[16:17]
-; GFX908-NEXT:    s_mov_b64 s[16:17], exec
 ; GFX908-NEXT:    s_mov_b64 exec, 1
 ; GFX908-NEXT:    buffer_store_dword v2, off, s[0:3], s33 offset:168
 ; GFX908-NEXT:    v_writelane_b32 v2, s31, 0
@@ -743,7 +742,6 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
 ; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:168
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX908-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX908-NEXT:    s_mov_b64 exec, 1
 ; GFX908-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:168
 ; GFX908-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/required-export-priority.ll b/llvm/test/CodeGen/AMDGPU/required-export-priority.ll
index 4aa1ddee2efe3..e145380dca59e 100644
--- a/llvm/test/CodeGen/AMDGPU/required-export-priority.ll
+++ b/llvm/test/CodeGen/AMDGPU/required-export-priority.ll
@@ -267,7 +267,6 @@ define amdgpu_ps void @test_export_across_store_load(i32 %idx, float %v) #0 {
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 16, v2, vcc_lo
-; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    scratch_store_b32 v0, v1, off
 ; GCN-NEXT:    scratch_load_b32 v0, off, off
 ; GCN-NEXT:    v_mov_b32_e32 v1, 1.0
diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
index 35234236b848f..ddb7d6b9c3936 100644
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -388,11 +388,7 @@ define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
 ; GCN-NEXT:    s_add_u32 s4, s4, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s5, s5, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    v_writelane_b32 v40, s30, 0
-; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32
-; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4
-; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
@@ -423,6 +419,9 @@ define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
 ; GCN-NEXT:    v_mov_b32_e32 v29, 0
 ; GCN-NEXT:    v_mov_b32_e32 v30, 0
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 1
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GCN-NEXT:    v_readlane_b32 s31, v40, 1
@@ -528,10 +527,6 @@ define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg
 ; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:48
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32
-; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4
-; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
-; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    v_mov_b32_e32 v5, 0
@@ -560,6 +555,9 @@ define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg
 ; GCN-NEXT:    v_mov_b32_e32 v28, 0
 ; GCN-NEXT:    v_mov_b32_e32 v29, 0
 ; GCN-NEXT:    v_mov_b32_e32 v30, 0
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4
+; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
 entry:
@@ -928,7 +926,6 @@ define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64
 ; GCN-NEXT:    s_add_u32 s16, s16, void_fastcc_byval_and_stack_passed@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, void_fastcc_byval_and_stack_passed@rel32@hi+12
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index 50056b62b3397..e9aebeef6ea6d 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -9971,7 +9971,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_mov_b32 s34, 0x80c00
@@ -9989,7 +9988,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10007,7 +10005,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_mov_b32 s34, 0x81400
@@ -10025,7 +10022,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10043,7 +10039,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_mov_b32 s34, 0x81c00
@@ -10061,7 +10056,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10105,7 +10099,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr ...
[truncated]

arsenm · 2025-05-13T12:50:44Z

llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp

        if (i == 0 && !MO.isImplicit() && !MO.isDead())
          DefedReg = MO.getReg();
        else
          return false;
+      } else if (MI->isPseudo() && MI->isMoveImmediate()) {


isPseudo does not give you any semantic information, you cannot use it here

Sure, will remove it

arsenm · 2025-05-13T12:50:55Z

llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp

@@ -186,11 +186,13 @@ static bool isCandidate(const MachineInstr *MI, Register &DefedReg,
  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
    const MachineOperand &MO = MI->getOperand(i);
    if (MO.isReg()) {
-      if (MO.isDef()) {
+      if (MO.isDef() && DefedReg == MCRegister::NoRegister) {


Don't understand why this is necessary, in general this would be a malformed instruction

In some case instruction's MachineOperands MO.isDef() is getting true for more than one operand. Instruction having more than one define. So it just add a check that if the DefedReg is not assigned then try to find the value of register and assign it to DefedReg. Once it is assigned, do not reassign it.

As per my understanding, in any instruction's MOs there is only def and all other are use().

Please correct me if my understanding is incorrect.

Instructions can have multiple defs, but this code specifically only wants to handle instructions with a single def. This should probably early exit if it encounters a second def

I have updated the patch. Please check

arsenm · 2025-05-13T13:52:10Z

llvm/test/CodeGen/AMDGPU/captured-frame-index.ll

-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
-; GCN: buffer_store_dword [[FI]]


This lost the point of the test?

I will check

arsenm · 2025-05-13T13:53:09Z

llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll

@@ -72,8 +72,7 @@ done:
 ; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset:
 ; GCN: s_and_saveexec_b64
 ; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
-; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GFX9: global_load_sbyte {{v[0-9]+}}, [[ZERO]], {{s\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
+; GFX9: global_load_sbyte {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:4095{{$}}


This still needs the zero check, the mov is just above here now?

RKSimon

@rohitaggarwal007 please can merge against trunk - I've tried to clean up some tests to reduce the superfluous x86 diffs

…oject into RemovePseudoMov

jayfoad · 2025-05-15T13:09:09Z

llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp

+        else if (i != 0 && DefedReg != MCRegister::NoRegister) {
+          if (MO.isDead() && MO.isImplicit()) {
+            continue;
+          } else if (MO.isImplicit() &&


No else after continue

jayfoad · 2025-05-15T13:09:18Z

llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp

+                         .getRegisterInfo()
+                         ->isSubRegister(MO.getReg(), DefedReg)) {
+            continue;
+          } else {


No else after continue

arsenm · 2025-05-16T13:36:49Z

llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp

+                     MI->getParent()
+                         ->getParent()
+                         ->getSubtarget()
+                         .getRegisterInfo()
+                         ->isSubRegister(MO.getReg(), DefedReg)) {


This shouldn't have to do so much to find TRI, just pass it into isCandidate as an argument.

Also probably should use regsOverlap instead of isSubRegister?

arsenm · 2025-05-16T13:37:19Z

llvm/test/CodeGen/AArch64/vector-lrint.ll

+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-i32-GI: {{.*}}


Spurious test change?

Cleaned up the test case

jayfoad · 2025-05-19T09:54:10Z

llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp

@@ -189,7 +189,13 @@ static bool isCandidate(const MachineInstr *MI, Register &DefedReg,
      if (MO.isDef()) {
        if (i == 0 && !MO.isImplicit() && !MO.isDead())
          DefedReg = MO.getReg();


I'd appreciate some comments to explain what's going on, for the existing cases and the new cases. Example:
// If the first def is explicit and not dead, remember it.
I'm not sure why explicit matters here.

jayfoad · 2025-05-19T09:58:26Z

llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp

@@ -189,7 +189,13 @@ static bool isCandidate(const MachineInstr *MI, Register &DefedReg,
      if (MO.isDef()) {
        if (i == 0 && !MO.isImplicit() && !MO.isDead())
          DefedReg = MO.getReg();
-        else
+        else if (i != 0 && DefedReg != MCRegister::NoRegister) {
+          if (MO.isDead() && MO.isImplicit())


Example:
// If a later def is dead and implicit, ignore it.
Again I'm not sure why implicit matters.

Machine operand has this attribute for which I am trying do the enable the optimization.

Implicit doesn't have any semantic meaning though; in general a generic pass shouldn't need to be aware of whether the operand is implicit or not. If it's implicit it's there for liveness tracking and you can't ignore it

jayfoad · 2025-05-19T10:02:21Z

llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp

+        else if (i != 0 && DefedReg != MCRegister::NoRegister) {
+          if (MO.isDead() && MO.isImplicit())
+            continue;
+          if (MO.isImplicit() && TRI->regsOverlap(MO.getReg(), DefedReg))


Example:
// If a later implicit def overlaps with the first def, ignore it.
"Overlaps" seem like the wrong condition here. Don't you need to check that its a subset of the first def?

What I'd really like to see is an explanation of exactly what kind of instruction we're looking for, e.g. "must have at least one explicit def and any other def must either be dead or a subset of the first def". And then make sure that the code matches the comment.

The instructions which are of my interest are:
renamable $r9d = MOV32r0 implicit-def dead $eflags, implicit-def $r9
renamable $eax = MOV32r0 implicit-def dead $eflags, implicit-def $rax

I am following these instruction pattern to find and remove them.
There is review comment suggestion to use regOverlaps instead of isSubRegister. As per my usecase, it should be isSubRegister i.e. $eax is the subregister of $rax. It will limit the scope of the match. I am not aware any case in which 3rd operand is not super register of 1st register.

@arsenm Please add your view on this.

In the general case I don't think it's safe to remove an instruction like this, because that it removing a def of the upper bits of $rax, which could be used by some other instruction:

renamable $eax = MOV32r0 implicit-def dead $eflags, implicit-def $rax

The implicit operands could be anything, the extra operand could be an arbitrary def of an unrelated register.

I'm assuming this pattern appeared as a result of coalescing SUBREG_TO_REG. We should really try to remove it and require emitting instructions that explicitly define the used register parts. We have to go out our way to maintain the liveness it introduced in the producer instructions, and it introduces a lot of complexity in downstream optimizations (like this case). But that will be a big project, particularly for x86

In the general case I don't think it's safe to remove an instruction like this, because that it removing a def of the upper bits of $rax, which could be used by some other instruction:

renamable $eax = MOV32r0 implicit-def dead $eflags, implicit-def $rax

As rax is super register of eax so on changing the value of eax implicitly there will be change in value of rax. If there is use of eax/rax in some instruction, then existing logic should bail out.

Also, I don't understand how implicit-def for $rax is useful in this case?

The implicit operands could be anything, the extra operand could be an arbitrary def of an unrelated register.

Ok

I'm assuming this pattern appeared as a result of coalescing SUBREG_TO_REG. We should really try to remove it and require emitting instructions that explicitly define the used register parts. We have to go out our way to maintain the liveness it introduced in the producer instructions, and it introduces a lot of complexity in downstream optimizations (like this case). But that will be a big project, particularly for x86

I have observe this pattern after Register Allocation. The live intervals got splitted and a lot copy instructions were generated. Later copy instructions converted to mov32r0 instruction.

rohitaggarwal007 · 2025-05-21T11:01:42Z

Should i add some extra check on usage of the higher bit of the implicit register used?

arsenm · 2025-05-22T19:36:00Z

llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp

@@ -187,9 +187,26 @@ static bool isCandidate(const MachineInstr *MI, Register &DefedReg,
    const MachineOperand &MO = MI->getOperand(i);
    if (MO.isReg()) {
      if (MO.isDef()) {
+        // To get the \DefedReg value, we need to check that 1st MachineOperand


This isnt' a /// comment, so the doxygen \ syntax won't do anything; plus I don't think it does anything for comments in the body of a function, only on the declaration

arsenm · 2025-05-22T19:37:12Z

llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp

@@ -189,7 +189,13 @@ static bool isCandidate(const MachineInstr *MI, Register &DefedReg,
      if (MO.isDef()) {
        if (i == 0 && !MO.isImplicit() && !MO.isDead())
          DefedReg = MO.getReg();
-        else
+        else if (i != 0 && DefedReg != MCRegister::NoRegister) {
+          if (MO.isDead() && MO.isImplicit())


Implicit doesn't have any semantic meaning though; in general a generic pass shouldn't need to be aware of whether the operand is implicit or not. If it's implicit it's there for liveness tracking and you can't ignore it

…oject into RemovePseudoMov

github-actions · 2025-05-28T10:55:53Z

✅ With the latest revision this PR passed the C/C++ code formatter.

rohitaggarwal007 · 2025-05-28T14:32:18Z

Before Register allocator these are the instructions:

undef %9510.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags
MOV64mr %stack.48, 1, $noreg, 0, $noreg, %9510:gr64_with_sub_8bit :: (store (s64) into %stack.48)

After register allocation rewriter:

renamable $eax = MOV32r0 implicit-def dead $eflags, implicit-def $rax
MOV64mr %stack.48, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.48)

Register allocation is allowing implicit def of Super register(64 bit) as MOV32r0 support 32 bit explicitly.

Then it mean that internally MOV32r0 clears the top 32-bits remaining bits of $rax also.

Is my understanding correct?

jayfoad · 2025-05-29T07:59:02Z

See:

llvm-project/llvm/include/llvm/CodeGen/MachineOperand.h

Line 117 in 1064768

/// operands. On a sub-register def operand, it refers to the part of the

undef %9510.sub_32bit:gr64_with_sub_8bit = is a def of the low 32 bits of %9510, and acts like an undef read of the high 32 bits. I.e. before this instruction %9510 does not need a definition and after this instruction the high 32 bits are undef.

MOV32r0 does not clear the high 32 bits of $rax. It only modifies $eax aka the low 32 bits of $rax.

rohitaggarwal007 · 2025-06-03T09:29:30Z

See:

llvm-project/llvm/include/llvm/CodeGen/MachineOperand.h

Line 117 in 1064768

/// operands. On a sub-register def operand, it refers to the part of the

undef %9510.sub_32bit:gr64_with_sub_8bit = is a def of the low 32 bits of %9510, and acts like an undef read of the high 32 bits. I.e. before this instruction %9510 does not need a definition and after this instruction the high 32 bits are undef.

MOV32r0 does not clear the high 32 bits of $rax. It only modifies $eax aka the low 32 bits of $rax.

I went through some online material and found that 32 bit instruction are zext or signed extended to 64 bit instruction by the hardware and then it is executed.
I should automatically clear upper 32 bit of the $rax.

rohitaggarwal007 · 2025-06-03T09:39:25Z

The value of upper bit of $rax should not matter.

If the there is no redefinition of eax or rax between the candidate duplicate instruction and source instruction, there it is safe to remove the duplicate instruction. The value for upper bits of $rax will remain the same.

We have to just provide that there is not definition of rax or eax between the source and candidate instruction.

As per my understanding, It should be already be implemented in the pass but I am not able to find the logic for the same.
There is a use of MBBDefs.hasIdentical(DefedReg, &MI) api but not sure that it check for redefinition between them.

rohitaggarwal007 · 2025-06-17T07:45:21Z

@jayfoad @arsenm, any comment for my above two comments?

Thanks

jayfoad · 2025-06-17T08:27:09Z

See:

llvm-project/llvm/include/llvm/CodeGen/MachineOperand.h

Line 117 in 1064768

/// operands. On a sub-register def operand, it refers to the part of the

undef %9510.sub_32bit:gr64_with_sub_8bit = is a def of the low 32 bits of %9510, and acts like an undef read of the high 32 bits. I.e. before this instruction %9510 does not need a definition and after this instruction the high 32 bits are undef.
MOV32r0 does not clear the high 32 bits of $rax. It only modifies $eax aka the low 32 bits of $rax.

I went through some online material and found that 32 bit instruction are zext or signed extended to 64 bit instruction by the hardware and then it is executed. I should automatically clear upper 32 bit of the $rax.

You're right. I was wrong. This instruction does zero the upper 32 bits of $rax, which would explain why there is an implicit-def.

renamable $eax = MOV32r0 implicit-def dead $eflags, implicit-def $rax

jayfoad · 2025-06-17T08:34:13Z

llvm/lib/CodeGen/MachineLateInstrsCleanup.cpp

+          // If the machineOperand is Dead and Implicit then continue
+          // to next operand.


I know I asked for comments, but I wanted something that explains why the code does what it does. In this case, why do we ignore dead implicit operands, specifically?

Please remove comments like this that just state what the code does. They are not helpful in any way.

Remove redundant pseudo mov instruction

214caf4

llvmbot added backend:AArch64 backend:AMDGPU backend:X86 labels May 13, 2025

arsenm reviewed May 13, 2025

View reviewed changes

arsenm changed the title ~~[Codegen][Backend] Remove redundant pseudo mov instruction~~ [Codegen] Remove redundant pseudo mov instruction May 13, 2025

arsenm added the llvm:codegen label May 13, 2025

Remove the isPseudo check

9a1de88

arsenm reviewed May 13, 2025

View reviewed changes

RKSimon requested changes May 13, 2025

View reviewed changes

rohitaggarwal007 closed this May 13, 2025

rohitaggarwal007 force-pushed the RemovePseudoMov branch from 9a1de88 to 7038d50 Compare May 13, 2025 17:53

Rohit Aggarwal added 2 commits May 13, 2025 23:31

Merge branch 'RemovePseudoMov' of github.com:rohitaggarwal007/llvm-pr…

71abb2b

…oject into RemovePseudoMov

Fix the test cases

2ab66a2

rohitaggarwal007 reopened this May 13, 2025

Update the logic for defs

9ca3190

rohitaggarwal007 changed the title ~~[Codegen] Remove redundant pseudo mov instruction~~ [Codegen] Remove redundant instruction using machinelateCleanup May 15, 2025

jayfoad reviewed May 15, 2025

View reviewed changes

Rohit Aggarwal and others added 2 commits May 15, 2025 19:10

Restructure the if else ladder

33373fa

Merge branch 'llvm:main' into RemovePseudoMov

b358511

arsenm reviewed May 16, 2025

View reviewed changes

Cleanup test case and pass TRI as argument in isCandidate function

a709a64

jayfoad reviewed May 19, 2025

View reviewed changes

Rohit Aggarwal and others added 3 commits May 19, 2025 17:40

Updating the comments and isSubRegister call

f7792d2

Merge branch 'llvm:main' into RemovePseudoMov

3e1c49d

Merge branch 'llvm:main' into RemovePseudoMov

fe8860a

arsenm reviewed May 22, 2025

View reviewed changes

rohitaggarwal007 closed this May 26, 2025

rohitaggarwal007 force-pushed the RemovePseudoMov branch from fe8860a to 926c201 Compare May 26, 2025 08:37

Merge branch 'RemovePseudoMov' of github.com:rohitaggarwal007/llvm-pr…

51ebfc5

…oject into RemovePseudoMov

rohitaggarwal007 reopened this May 26, 2025

Rohit Aggarwal added 2 commits May 26, 2025 15:20

Update the comment

2d09638

Add logic to check the uses of the implicit-def operand

bee2fb4

Format the code

89b97de

jayfoad reviewed Jun 17, 2025

View reviewed changes

Add readRegs to the check

9957dcb

		; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
		; GCN: buffer_store_dword [[FI]]

		;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
		; CHECK-i32-GI: {{.*}}

		// If the machineOperand is Dead and Implicit then continue
		// to next operand.

[Codegen] Remove redundant instruction using machinelateCleanup #139716

Are you sure you want to change the base?

[Codegen] Remove redundant instruction using machinelateCleanup #139716

Uh oh!

Conversation

rohitaggarwal007 commented May 13, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented May 13, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented May 13, 2025

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

rohitaggarwal007 May 20, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

rohitaggarwal007 commented May 21, 2025

Uh oh!

Choose a reason for hiding this comment

Uh oh!

rohitaggarwal007 commented May 13, 2025 •

edited

Loading

llvmbot commented May 13, 2025 •

edited

Loading

rohitaggarwal007 May 20, 2025 •

edited

Loading

github-actions bot commented May 28, 2025 •

edited

Loading