
Commit 10e8c39

[Cute] Do manual f32->f16x2 conversion for fwd_sm90
1 parent 312bb9b commit 10e8c39

4 files changed: +73 -19 lines

flash_attn/cute/blackwell_helpers.py

Lines changed: 5 additions & 10 deletions
@@ -308,15 +308,10 @@ def gemm_ptx_partial(
     smem_desc_base_b_lo = cutlass.const_expr(smem_desc_base_b_lo)
     smem_desc_b_hi = cutlass.const_expr(smem_desc_b_hi)
 
-    if cutlass.const_expr(not is_ts):
-        offset_a = [(cute.crd2idx((0, 0, k), sA_layout) * op.a_dtype.width // 8) >> 4
-                    for k in range(cute.size(tCrA.shape[2]))]
-    else:
-        offset_a = [cute.crd2idx((0, 0, k), sA_layout) * op.a_dtype.width // 32
-                    for k in range(cute.size(tCrA.shape[2]))]
+    tCrA_layout = tCrA.layout if cutlass.const_expr(not is_ts) else cute.recast_layout(32, tCrA.element_type.width, tCrA.layout)
+    offset_a = [cute.crd2idx((0, 0, k), tCrA_layout) for k in range(cute.size(tCrA.shape[2]))]
     offset_a_diff = [offset_a[k] - offset_a[k - 1] for k in range(1, cute.size(tCrA.shape[2]))]
-    offset_b = [(cute.crd2idx((0, 0, k), sB_layout) * sB.element_type.width // 8) >> 4
-                for k in range(cute.size(tCrB.shape[2]))]
+    offset_b = [cute.crd2idx((0, 0, k), tCrB.layout) for k in range(cute.size(tCrB.shape[2]))]
     offset_b_diff = [offset_b[k] - offset_b[k - 1] for k in range(1, cute.size(tCrB.shape[2]))]
 
     if cutlass.const_expr(not is_ts):
@@ -330,8 +325,8 @@ def gemm_ptx_partial(
         None,
         [
             # acc.iterator.toint().ir_value(),
-            cutlass.Int32(cute.arch.make_warp_uniform(smem_desc_start_a_lo)).ir_value(),
-            cutlass.Int32(cute.arch.make_warp_uniform(smem_desc_start_b_lo)).ir_value(),
+            cutlass.Int32(smem_desc_start_a_lo).ir_value(),
+            cutlass.Int32(smem_desc_start_b_lo).ir_value(),
             cutlass.Int32(not zero_init).ir_value(),
         ],
         "{\n\t"

flash_attn/cute/flash_fwd.py

Lines changed: 9 additions & 3 deletions
@@ -1637,7 +1637,11 @@ def scoremod_premask_fn(acc_S):
         softmax.online_softmax(acc_S, is_first=True)
         tOrP_acc = cute.make_tensor(acc_S.iterator, utils.convert_layout_acc_frgA(acc_S.layout))
         tOrP = mma_params.tOrP if const_expr(self.mma_pv_is_rs) else cute.make_fragment_like(tOrP_acc, self.dtype)
-        tOrP.store(tOrP_acc.load().to(self.dtype))
+        # tOrP.store(tOrP_acc.load().to(self.dtype))
+        # the "to(self.dtype)" conversion fails to vectorize for block sizes other
+        # than 128 x 128, i.e. it calls convert on 1 fp32 element at a time instead of
+        # 2 elements. So we just call ptx directly.
+        utils.cvt_f16(tOrP_acc, tOrP)
         if const_expr(not self.mma_pv_is_rs):
             tPrP = smem_thr_copy_P.retile(tOrP)
             cute.copy(smem_thr_copy_P, tPrP, tPsP)
@@ -1749,7 +1753,8 @@ def mma_one_n_block(
         # if cute.arch.thread_idx()[0] == 0: cute.print_tensor(utils.make_acc_tensor_mn_view(acc_S))
         tOrP_acc = cute.make_tensor(acc_S.iterator, utils.convert_layout_acc_frgA(acc_S.layout))
         tOrP = mma_params.tOrP if const_expr(self.mma_pv_is_rs) else cute.make_fragment_like(tOrP_acc, self.dtype)
-        tOrP.store(tOrP_acc.load().to(self.dtype))
+        # tOrP.store(tOrP_acc.load().to(self.dtype))
+        utils.cvt_f16(tOrP_acc, tOrP)
         if const_expr(not self.mma_pv_is_rs):
             tPrP = smem_copy_params.smem_thr_copy_P.retile(mma_params.tOrP)
             cute.copy(smem_copy_params.smem_thr_copy_P, tPrP, smem_copy_params.tPsP)
@@ -1817,7 +1822,8 @@ def mma_one_n_block_intrawg_overlap(
         pipeline_v.consumer_release(smem_pipe_read_v)
         tOrP_acc = cute.make_tensor(acc_S.iterator, utils.convert_layout_acc_frgA(acc_S.layout))
         tOrP = mma_params.tOrP if const_expr(self.mma_pv_is_rs) else cute.make_fragment_like(tOrP_acc, self.dtype)
-        tOrP.store(tOrP_acc.load().to(self.dtype))
+        # tOrP.store(tOrP_acc.load().to(self.dtype))
+        utils.cvt_f16(tOrP_acc, tOrP)
         if const_expr(not self.mma_pv_is_rs):
             tPrP = smem_copy_params.smem_thr_copy_P.retile(tOrP)
             cute.copy(smem_copy_params.smem_thr_copy_P, tPrP, smem_copy_params.tPsP)

flash_attn/cute/interface.py

Lines changed: 4 additions & 3 deletions
@@ -133,9 +133,9 @@ def _flash_attn_fwd(
     assert compute_capability in [9, 10], "Unsupported compute capability. Supported: 9.x, 10.x"
     current_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
 
-    # if compute_capability == 9:  # TODO: tune block size according to hdim
-    #     if not causal and not local:
-    #         n_block_size = 176
+    if compute_capability == 9:  # TODO: tune block size according to hdim
+        if not causal and not local:
+            n_block_size = 192
 
     compile_key = (
         dtype, head_dim, head_dim_v, qhead_per_kvhead, causal, softcap is not None,
@@ -154,6 +154,7 @@ def _flash_attn_fwd(
         qhead_per_kvhead,
         is_causal=causal,
         is_local=local,
+        pack_gqa=False,
         m_block_size=m_block_size,
         n_block_size=n_block_size,
         # num_stages=1,

flash_attn/cute/utils.py

Lines changed: 55 additions & 3 deletions
@@ -257,9 +257,21 @@ def fmax_reduce(
     x: cute.TensorSSA, init_val: float | Float32 | None = None, arch: cutlass.Constexpr[int] = 80
 ) -> Float32:
     if cutlass.const_expr(arch < 100 or cute.size(x.shape) % 8 != 0):
-        if cutlass.const_expr(init_val is None):
-            init_val = -cutlass.Float32.inf
-        return x.reduce(cute.ReductionOp.MAX, init_val, 0)
+        # if cutlass.const_expr(init_val is None):
+        #     init_val = -cutlass.Float32.inf
+        # return x.reduce(cute.ReductionOp.MAX, init_val, 0)
+        res = cute.make_fragment(x.shape, Float32)
+        res.store(x)
+        local_max = [res[0], res[1], res[2], res[3]]
+        for i in cutlass.range_constexpr(4, cute.size(x.shape), 4):
+            local_max[0] = fmax(local_max[0], res[i + 0])
+            local_max[1] = fmax(local_max[1], res[i + 1])
+            local_max[2] = fmax(local_max[2], res[i + 2])
+            local_max[3] = fmax(local_max[3], res[i + 3])
+        local_max[0] = fmax(local_max[0], local_max[1])
+        local_max[2] = fmax(local_max[2], local_max[3])
+        local_max[0] = fmax(local_max[0], local_max[2])
+        return local_max[0] if cutlass.const_expr(init_val is None) else fmax(local_max[0], init_val)
     else:
         # [2025-06-15] x.reduce only seems to use 50% 3-input max and 50% 2-input max
         # We instead force the 3-input max.
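The new fmax_reduce branch keeps four independent running maxima so consecutive max instructions have no dependency on each other, and only folds them together at the end. A host-side reference of the same reduction shape in plain Python (the helper name and the example values below are illustrative, not part of the kernel):

def fmax_reduce_ref(xs, init_val=None):
    # Mirrors the 4-accumulator tree above: four independent running maxima,
    # folded pairwise at the end. The kernel path implicitly assumes a length
    # that is a multiple of 4, so the reference asserts the same.
    assert len(xs) >= 4 and len(xs) % 4 == 0
    local_max = [xs[0], xs[1], xs[2], xs[3]]
    for i in range(4, len(xs), 4):
        for j in range(4):
            local_max[j] = max(local_max[j], xs[i + j])
    result = max(max(local_max[0], local_max[1]), max(local_max[2], local_max[3]))
    return result if init_val is None else max(result, init_val)

assert fmax_reduce_ref([3.0, -1.0, 7.5, 2.0, 0.5, 9.0, -4.0, 1.0]) == 9.0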
@@ -290,6 +302,18 @@ def fadd_reduce(
         if cutlass.const_expr(init_val is None):
             init_val = Float32.zero
         return x.reduce(cute.ReductionOp.ADD, init_val, 0)
+        # res = cute.make_fragment(x.shape, Float32)
+        # res.store(x)
+        # local_sum = [res[0], res[1], res[2], res[3]]
+        # for i in cutlass.range_constexpr(4, cute.size(x.shape), 4):
+        #     local_sum[0] += res[i + 0]
+        #     local_sum[1] += res[i + 1]
+        #     local_sum[2] += res[i + 2]
+        #     local_sum[3] += res[i + 3]
+        # local_sum[0] += local_sum[1]
+        # local_sum[2] += local_sum[3]
+        # local_sum[0] += local_sum[2]
+        # return local_sum[0] if cutlass.const_expr(init_val is None) else local_sum[0] + init_val
     else:
         res = cute.make_fragment(x.shape, Float32)
         res.store(x)
@@ -440,3 +464,31 @@ def warp_prefix_sum(val: cutlass.Int32, lane: Optional[cutlass.Int32] = None) ->
     val += partial_sum
     # if cute.arch.thread_idx()[0] >= 128 and cute.arch.thread_idx()[0] < 128 + 32 and cute.arch.block_idx()[0] == 0: cute.printf("tidx = %d, partial_sum = %d, val = %d", cute.arch.thread_idx()[0] % 32, partial_sum, val)
     return val
+
+
+@dsl_user_op
+def cvt_f16x2_f32(a: float | Float32, b: float | Float32, to_dtype: Type, *, loc=None, ip=None) -> cutlass.Int32:
+    assert to_dtype in [cutlass.BFloat16, cutlass.Float16], "to_dtype must be BFloat16 or Float16"
+    return cutlass.Int32(
+        llvm.inline_asm(
+            T.i32(),
+            [Float32(a).ir_value(loc=loc, ip=ip), Float32(b).ir_value(loc=loc, ip=ip)],
+            f"cvt.rn.{'bf16x2' if to_dtype is cutlass.BFloat16 else 'f16x2'}.f32 $0, $2, $1;",
+            "=r,f,f",
+            has_side_effects=False,
+            is_align_stack=False,
+            asm_dialect=llvm.AsmDialect.AD_ATT,
+        )
+    )
+
+
+@cute.jit
+def cvt_f16(src: cute.Tensor, dst: cute.Tensor):
+    assert cute.size(dst.shape) == cute.size(src.shape), "dst and src must have the same size"
+    assert cute.size(src.shape) % 2 == 0, "src must have an even number of elements"
+    assert dst.element_type in [cutlass.BFloat16, cutlass.Float16], "dst must be BFloat16 or Float16"
+    assert src.element_type is Float32, "src must be Float32"
+    dst_i32 = cute.recast_tensor(dst, cutlass.Int32)
+    assert cute.size(dst_i32.shape) * 2 == cute.size(src.shape)
+    for i in cutlass.range_constexpr(cute.size(dst_i32)):
+        dst_i32[i] = cvt_f16x2_f32(src[2 * i], src[2 * i + 1], dst.element_type)
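For reference, this is what the packed word produced by cvt_f16x2_f32 (and stored by cvt_f16 through the Int32 recast) contains. The sketch below is host-side NumPy, not kernel code: the helper name cvt_f16x2_f32_ref and the sample values are illustrative, a little-endian host is assumed, and only the Float16 path is modeled (BFloat16 is analogous). Because the asm string uses the "$0, $2, $1" operand order, the first argument is converted into the low 16 bits of the result and the second into the high 16 bits.

import numpy as np

def cvt_f16x2_f32_ref(a: float, b: float) -> int:
    # Packed result as an unsigned 32-bit pattern: `a` in the low 16 bits,
    # `b` in the high 16 bits, each rounded to nearest-even like cvt.rn.
    lo = int(np.float16(a).view(np.uint16))
    hi = int(np.float16(b).view(np.uint16))
    return lo | (hi << 16)

# The packed word is exactly what a consecutive f16 pair looks like when the
# destination is recast to Int32 on a little-endian target, which is why cvt_f16
# can write one fully converted pair per loop iteration.
pair = np.array([1.5, -2.25], dtype=np.float16)
assert cvt_f16x2_f32_ref(1.5, -2.25) == int(pair.view(np.uint32)[0])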
