
Commit ebfee60

eellison authored and pytorchmergebot committed
[WIP] more aggressive persistent reduction (pytorch#161055)
Gives an 18% speedup on rms norm (2048, 32768), and we have seen other instances where Inductor is not aggressive enough about codegening persistent reductions, e.g. 39% on [this kernel from torch ao](pytorch#159769 (comment)). Codegen-ing persistent reductions can be risky if you run out of registers. Here, I'm effectively making the persistent reduction an option of the looped reduction by setting RBLOCK == rnumel, so that we can still fall back to a looped reduction as needed.

As criteria:

- there needs to be significant memory savings from doing a persistent reduction (by keeping values in registers and avoiding another iteration over the input)
- we should not be coalescing on the x dimension, otherwise a large rblock will inhibit coalescing
- we should not be especially register or arithmetic intensive (this last part uses mem_ops_per_thread, but could be improved)

Still need to do a dashboard run, although I'm not sure we get a lot of large-rblock kernels in our benchmarks.

Pull Request resolved: pytorch#161055
Approved by: https://github.com/jansel
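A minimal sketch of the gating criteria above (illustrative function and argument names, not the actual Inductor code; the real check lives in codegen_kernel in torch/_inductor/codegen/triton.py, shown in the diff below):

# Simplified sketch of the persistent-rblock gate; thresholds mirror the diff below.
def should_add_persistent_rblock(
    saved_bytes_ratio: float,   # bytes moved by the looped variant / bytes moved by the persistent variant
    r_coalesce_ratio: float,    # tiling score of r0_ relative to x (how much the rblock coalesces reads)
    reduction_numel: int,       # statically known reduction size (rnumel)
    mem_ops_per_thread: int,    # per-thread memory ops of the persistent variant
) -> bool:
    return (
        saved_bytes_ratio >= 1.3            # significant memory-bandwidth savings
        and r_coalesce_ratio >= 8.0         # reads mostly coalesced by the rblock, not the xblock
        and 2048 < reduction_numel <= 32768 # <= 2048 already gets a persistent config; above 32768 registers become a risk
        and mem_ops_per_thread <= 10        # not especially register/arithmetic intensive
    )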
1 parent 6db872f · commit ebfee60

3 files changed: +101 −13 lines changed


torch/_inductor/codegen/simd.py

Lines changed: 1 addition & 0 deletions
@@ -408,6 +408,7 @@ def __init__(
             else self.should_use_cooperative_reduction()
         )
         self.tiling_scores: Optional[dict[str, sympy.Expr]] = tiling_scores
+        self.tiling: dict[str, sympy.Expr] = tiling
         self.persistent_reduction: bool = (
             override_persistent_reduction
             if override_persistent_reduction is not None

torch/_inductor/codegen/triton.py

Lines changed: 53 additions & 1 deletion
@@ -4022,7 +4022,12 @@ def inductor_meta_common():
         )
         return inductor_meta

-    def codegen_kernel(self, name=None):
+    def codegen_kernel(self, name=None) -> str:
+        """
+        Convert the TritonKernel from Inductor SIMD IR to triton code, including inductor triton heuristics, imports,
+        metadata, and benchmarking infra.
+        """
+
         code = IndentedBuffer()

         size_hints = {}
@@ -4163,6 +4168,53 @@ def add_constexpr_arg(arg_name):
             "num_reduction": self.num_reduction,
             **self.inductor_meta_common(),
         }
+
+        # Bail on 3d tiling, which has more complicated coalesce patterns
+        looped_red = V.kernel.features.is_reduction() and not self.persistent_reduction
+        tiling_scores = self.tiling_scores
+        two_d_red = (
+            len(self.tiling) == 2 and tiling_scores is not None and "x" in tiling_scores
+        )
+        if looped_red and two_d_red:
+            assert tiling_scores is not None
+            memory_stats = self.features.memory_stats(self.tiling)
+            dim_stats = memory_stats.persistent.memory.dim[0]
+            mem_ops_per_thread = dim_stats.count_per_thread
+
+            # check if majority of reads are coalesced by the rblock
+            r_coalesce_ratio = tiling_scores["r0_"] / max(tiling_scores["x"], 1)
+
+            looped_mem = memory_stats.looped.memory.bytes
+            persistent_mem = memory_stats.persistent.memory.bytes
+            # check that we save significant memory by doing persistent
+            saved_bytes_ratio = V.graph.sizevars.size_hint(
+                looped_mem, fallback=config.unbacked_symint_fallback
+            ) / max(
+                V.graph.sizevars.size_hint(
+                    persistent_mem, fallback=config.unbacked_symint_fallback
+                ),
+                1,
+            )
+
+            # TODO - rnumel should be reasonably close to power of 2
+            if (
+                # significant memory bandwidth savings
+                saved_bytes_ratio >= 1.3
+                # large rblock inhibits xblock size, dont attempt if there is a decent amount of
+                # reads coalesced by xblock
+                and r_coalesce_ratio >= 8.0
+                # TODO - need more detailed register analysis
+                and V.graph.sizevars.statically_known_leq(
+                    self.features.reduction_numel, 32768
+                )
+                # We will already generate a persistent config in this case
+                and V.graph.sizevars.statically_known_gt(
+                    self.features.reduction_numel, 2048
+                )
+                and mem_ops_per_thread <= 10
+            ):
+                inductor_meta["add_persistent_rblock"] = True
+
         if self.tiling_scores:
             inductor_meta["tiling_scores"] = self.tiling_scores

torch/_inductor/runtime/triton_heuristics.py

Lines changed: 47 additions & 12 deletions
@@ -82,6 +82,14 @@
 )


+class InductorConfig(Config):
+    """Inductor-specific Triton config with additional control flags"""
+
+    def __init__(self, *args, dynamic_scale_rblock=True, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.dynamic_scale_rblock = dynamic_scale_rblock
+
+
 class NoTritonConfigsError(RuntimeError):
     pass

@@ -2249,6 +2257,7 @@ def triton_config_reduction(
     num_stages=1,
     num_warps=None,
     register_intensive=False,
+    dynamic_scale_rblock=True,
 ) -> Config:
     """
     Construct a reduction triton config with some adjustment heuristics
@@ -2292,7 +2301,12 @@ def total_numel() -> int:
     cfg = _get_config({"x": x, **rnumels})
     check_max_block(cfg)
     check_config(cfg, xnumel=size_hints["x"])
-    return Config(cfg, num_warps=num_warps, num_stages=num_stages)
+    return InductorConfig(
+        cfg,
+        num_warps=num_warps,
+        num_stages=num_stages,
+        dynamic_scale_rblock=dynamic_scale_rblock,
+    )


 def _get_config(numels: dict[str, int]) -> dict[str, int]:
@@ -2490,11 +2504,10 @@ def _reduction_configs(

     register_intensive = False
     MAX_R0_BLOCK = 2048
-    if (
-        size_hints["x"] >= 1024
-        and inductor_meta.get("num_load", 0) + inductor_meta.get("num_reduction", 0)
-        >= 10
-    ):
+    loads_and_red = inductor_meta.get("num_load", 0) + inductor_meta.get(
+        "num_reduction", 0
+    )
+    if size_hints["x"] >= 1024 and loads_and_red >= 10:
         # A heuristics to reduce R0_BLOCK if a kernel potentially need many registers.
         # Consider load and reduction since load need move data into registers and
         # reduction needs an accumulator.
@@ -2510,7 +2523,14 @@ def make_config(x, r, num_warps=None, num_stages=1, register_intensive=False):
         MAX_R0_BLOCK = 1024
         register_intensive = True

-    def make_config(x, r, num_warps=None, num_stages=1, register_intensive=False):
+    def make_config(
+        x,
+        r,
+        num_warps=None,
+        num_stages=1,
+        register_intensive=False,
+        dynamic_scale_rblock=True,
+    ):
         # For 3D case with tiling scores, create an adapted version
         if "y" in size_hints:
             assert "tiling_scores" in inductor_meta
@@ -2532,6 +2552,7 @@ def make_config(x, r, num_warps=None, num_stages=1, register_intensive=False):
             num_warps=num_warps,
             num_stages=num_stages,
             register_intensive=register_intensive,
+            dynamic_scale_rblock=dynamic_scale_rblock,
         )

     def outer_config_opt():
@@ -2598,6 +2619,19 @@ def outer_config_opt():
     # for correctness
     if not torch.version.hip and not is_fbcode():
         outer_config = outer_config_opt()
+
+    configs = []
+
+    if inductor_meta.get("add_persistent_rblock") and loads_and_red <= 8:
+        xnumel = max(4096 // rnumel, 1)
+        c = make_config(
+            xnumel,
+            rnumel,
+            register_intensive=register_intensive,
+            dynamic_scale_rblock=False,
+        )
+        configs.append(c)
+
     # For 3d tiling, default to more autotuning initially
     if "y" in size_hints:
         pass
@@ -2606,14 +2640,15 @@
     ):
         pass  # skip all these cases
     elif reduction_hint == ReductionHint.INNER:
-        return [contiguous_config]
+        return configs + [contiguous_config]
     elif reduction_hint == ReductionHint.OUTER:
-        return [outer_config]
+        return configs + [outer_config]
    elif reduction_hint == ReductionHint.OUTER_TINY:
-        return [tiny_config]
+        return configs + [tiny_config]
     if disable_pointwise_autotuning(inductor_meta):
-        return [make_config(32, 128)]
-    return [
+        return configs + [make_config(32, 128)]
+
+    return configs + [
         contiguous_config,
         outer_config,
         tiny_config,
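A worked example of the extra candidate this produces (illustrative values only, mirroring the arithmetic in _reduction_configs above; the rms-norm shape is taken from the PR description):

# Illustrative only, not actual tuner output: extra candidate for rnumel = 32768.
rnumel = 32768
xnumel = max(4096 // rnumel, 1)                            # -> 1
extra_candidate = {"XBLOCK": xnumel, "R0_BLOCK": rnumel}   # RBLOCK spans the whole reduction
# This candidate is returned ahead of the usual looped-reduction configs and is built with
# dynamic_scale_rblock=False, presumably so later heuristics do not shrink its R0_BLOCK.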
