Handle two inputs (rowwise and colwise float8 tensors) in torchtitan

pytorch · danielvegamyhre · Apr 29, 2025 · Apr 30, 2025 · May 1, 2025 · Apr 30, 2025
commit 8bac877fc9dccdd5298b2ea582f8d1d17cbf1043
@@ -233,9 +233,13 @@ def forward(
             torch.Tensor: Output tensor after attention.
 
         """
-
-        bs, seqlen, _ = x.shape
-        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
+        if isinstance(x, tuple):
+            x_fp8_rowwise, x_fp8_colwise = x
+            bs, seqlen, _ = x_fp8_rowwise.shape 
+            xq, xk, xv = self.wq(x_fp8_rowwise, x_fp8_colwise), self.wk(x_fp8_rowwise, x_fp8_colwise), self.wv(x_fp8_rowwise, x_fp8_colwise)
+        else:
+            bs, seqlen, _ = x.shape
+            xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
 
         # Use -1 instead of `n_heads` (or `n_kv_heads`) to infer the actual
         # local heads from sizes of xq, xk, and xv as TP may have sharded them

@@ -171,11 +171,12 @@ def apply_tp(
         from torchao.float8.float8_tensor_parallel_rowwise_scales import (
             Float8ColwiseParallel,
             Float8RowwiseParallel,
+            PrepareFloat8ModuleInput,
         )
         rowwise_parallel, colwise_parallel, prepare_module_input = (
             Float8RowwiseParallel,
             Float8ColwiseParallel,
-            PrepareModuleInput,
+            PrepareFloat8ModuleInput,
         )
         logger.info("Using float8 rowwise all-gather")
     else: