Skip to content

Commit e540fc1

Browse files
committed
[Cute] Fix missing tmem_store fence
1 parent 723c36b commit e540fc1

File tree

1 file changed

+1
-0
lines changed

1 file changed

+1
-0
lines changed

flash_attn/cute/flash_fwd_sm100.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1340,6 +1340,7 @@ def softmax_step(
1340 1340
cute.arch.mbarrier_arrive(mbar_ptr + self.mbar_P_full_O_rescaled_offset + stage)
1341 1341
for i in cutlass.range_constexpr(cute.size(tStP_r2t.shape[2]) // 4 * 3, cute.size(tStP_r2t.shape[2])):
1342 1342
cute.copy(thr_tmem_store, tSrP_r2t_f32[None, None, i], tStP_r2t[None, None, i])
1343 +
cute.arch.fence_view_async_tmem_store()
1343 1344
# Notify mma warp that the 2nd half of P is ready
1344 1345
cute.arch.mbarrier_arrive(mbar_ptr + self.mbar_P_full_2_offset + stage)
1345 1346
cute.arch.mbarrier_wait(mbar_ptr + self.mbar_softmax_corr_empty_offset + stage, si_corr_producer_phase)

0 commit comments

Comments
 (0)