
Commit d1ada1d

Add option for selective op AC to filter mm shapes based on fqn
1 parent 01f4e50 commit d1ada1d

File tree

2 files changed: +28 -0 lines changed


torchtitan/config_manager.py

Lines changed: 12 additions & 0 deletions
@@ -487,6 +487,18 @@ class ActivationCheckpoint:
     'int' (e.g., 2) for every nth layer, or 'op' for op level ac.
     """

+    selective_op_ac_force_recompute_mm_shapes_by_fqns: list[str] = field(default_factory=lambda: [])
+    """
+    When per-op selective ac is used, this list of fully qualified names (relative
+    to the module at which AC is applied) is used to determine which mm shapes to
+    force recompute, rather than being considered by the rest of the sac policy,
+    e.g. save every other mm. Only nn.Linear modules are supported today.
+
+    Note: this config affects mms beyond those coming from the specified fqns,
+    e.g. if "moe.router.gate", corresponding to Linear(in, out), is specified,
+    ANY mm with shape matching (*, in) x (in, out) will be force recomputed.
+    """
+

 @dataclass
 class Float8:
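
The docstring above can be made concrete with a small sketch (illustrative only, not part of the commit; the module names here are hypothetical): an fqn such as "moe.router.gate" is resolved relative to the module that AC wraps, and only an nn.Linear match contributes its (in_features, out_features) shape to the force-recompute set.

import torch.nn as nn

# Hypothetical stand-in for a transformer block that AC would wrap.
block = nn.Module()
block.moe = nn.Module()
block.moe.router = nn.Module()
block.moe.router.gate = nn.Linear(256, 8, bias=False)

fqn = "moe.router.gate"  # relative to `block`, as described in the docstring
submod = dict(block.named_modules()).get(fqn, None)
assert isinstance(submod, nn.Linear)

# nn.Linear stores its weight as (out_features, in_features); the shape the
# SAC policy will compare against is the transposed (in_features, out_features).
out_f, in_f = submod.weight.shape
print((in_f, out_f))  # (256, 8): any mm of shape (*, 256) x (256, 8) is recomputed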

torchtitan/models/llama3/infra/parallelize.py

Lines changed: 16 additions & 0 deletions
@@ -261,11 +261,27 @@ def _apply_ac_to_transformer_block(module: nn.Module, ac_config):
            create_selective_checkpoint_contexts,
        )

+        mm_recompute_shapes = set()
+        for fqn in ac_config.selective_op_ac_force_recompute_mm_shapes_by_fqns:
+            submod = dict(module.named_modules()).get(fqn, None)
+            if submod is None:
+                continue
+            if not isinstance(submod, nn.Linear):
+                raise ValueError(
+                    "selective_op_ac_force_recompute_mm_shapes_by_fqns expected to match "
+                    f"a nn.Linear, but got: {submod}"
+                )
+            out_f, in_f = submod.weight.shape
+            mm_recompute_shapes.add((in_f, out_f))
+        logger.debug(f"Selective op AC force recomputing mms with rhs shapes {mm_recompute_shapes}")
+
        def _get_custom_policy(meta):
            def _custom_policy(ctx, func, *args, **kwargs):
                mode = "recompute" if ctx.is_recompute else "forward"
                mm_count_key = f"{mode}_mm_count"
                if func == torch.ops.aten.mm.default:
+                    if args[1].shape in mm_recompute_shapes:
+                        return CheckpointPolicy.PREFER_RECOMPUTE
                    meta[mm_count_key] += 1
                    # Saves output of all compute ops, except every second mm
                    to_save = func in _save_list and not (
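
As a side note (a sketch under assumptions, not code from the commit): the reason the policy can key on args[1].shape is that a 2-D nn.Linear forward without bias lowers to aten.mm(input, weight.t()), so the right-hand operand has shape (in_features, out_features), exactly the tuple stored in mm_recompute_shapes above.

import torch
import torch.nn as nn

lin = nn.Linear(16, 32, bias=False)  # weight.shape == (32, 16)
x = torch.randn(4, 16)

out_f, in_f = lin.weight.shape
rhs_shape = (in_f, out_f)  # (16, 32), the key that would be added to mm_recompute_shapes

# The same mm the dispatcher sees during lin(x): the rhs is the transposed weight.
rhs = lin.weight.t()
assert rhs.shape == rhs_shape
y = torch.ops.aten.mm.default(x, rhs)
assert torch.allclose(y, lin(x))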
