
Commit 23e8fa5

Add the option for the macro and note (Dao-AILab#893)
1 parent 3e9414f commit 23e8fa5

1 file changed: 8 additions, 1 deletion


csrc/flash_attn/src/softmax.h

@@ -78,7 +78,14 @@ __forceinline__ __device__ void scale_apply_exp2(Tensor<Engine0, Layout0> &tensor
             // Instead of computing exp(x - max), we compute exp2(x * log_2(e) -
             // max * log_2(e)) This allows the compiler to use the ffma
             // instruction instead of fadd and fmul separately.
-            tensor(mi, ni) = exp2f(tensor(mi, ni) * scale - max_scaled);
+            // The following macro will disable the use of fma.
+            // See: https://github.com/pytorch/pytorch/issues/121558 for more details
+            // This macro is set in PyTorch and not FlashAttention
+            #ifdef UNFUSE_FMA
+                tensor(mi, ni) = exp2f(__fmul_rn(tensor(mi, ni), scale) - max_scaled);
+            #else
+                tensor(mi, ni) = exp2f(tensor(mi, ni) * scale - max_scaled);
+            #endif
         }
     }
 }
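
Note on the change: rewriting exp(x - max) as exp2(x * log2(e) - max * log2(e)) lets the compiler contract the multiply and subtract into a single ffma instruction, which rounds once instead of twice. Because the fused and unfused results can differ in the last bit, PyTorch's build can define UNFUSE_FMA (see the linked issue) to switch to __fmul_rn, an explicitly rounded multiply that the compiler is not allowed to contract. Below is a minimal, self-contained CUDA sketch of the same pattern; the kernel scale_apply_exp2_sketch and its host driver are illustrative only, not part of FlashAttention, while __fmul_rn and exp2f are standard CUDA device functions.

// Sketch of the fused vs. unfused exp2 path (hypothetical example, not
// FlashAttention code). nvcc contracts a*b - c into ffma by default
// (--fmad=true); build with -DUNFUSE_FMA to take the unfused branch.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void scale_apply_exp2_sketch(float *x, int n, float scale, float max_scaled) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
#ifdef UNFUSE_FMA
    // __fmul_rn is an explicitly rounded multiply that the compiler may not
    // contract into an FMA, so the multiply and subtract round separately.
    x[i] = exp2f(__fmul_rn(x[i], scale) - max_scaled);
#else
    // The multiply-subtract may be contracted into one ffma instruction,
    // which rounds once and can differ from the two-step result in the last bit.
    x[i] = exp2f(x[i] * scale - max_scaled);
#endif
}

int main() {
    const int n = 4;
    float h[n] = {0.f, 1.f, 2.f, 3.f};
    float *d;
    cudaMalloc(&d, n * sizeof(float));
    cudaMemcpy(d, h, n * sizeof(float), cudaMemcpyHostToDevice);
    // scale = log2(e): exp(x - max) becomes exp2(x * log2(e) - max * log2(e)).
    const float kLog2e = 1.4426950408889634f;
    scale_apply_exp2_sketch<<<1, 32>>>(d, n, kLog2e, 3.f * kLog2e);  // max = 3
    cudaMemcpy(h, d, n * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; ++i) printf("exp(%d - 3) = %f\n", i, h[i]);
    cudaFree(d);
    return 0;
}

Compiling the sketch plain (nvcc sketch.cu) allows the fused ffma path; nvcc -DUNFUSE_FMA sketch.cu forces the separately rounded one, matching the macro's intent in the diff above.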
