Commit aefe15a

Always ignore freqs_cis (#1338)
We should always ignore freqs_cis and the other parameters listed in excluded_parameters_for_model_only, to avoid confusion. TODO: will this break PP with a seed checkpoint?
1 parent 71b07ad commit aefe15a
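In effect, the exclusion of freqs_cis moves from ad-hoc pops at each call site into ModelWrapper itself, so every model state dict the checkpointer produces (cached, saved, or exported) is already filtered. A minimal standalone sketch of that pattern — ToyModel is hypothetical; only the set name mirrors torchtitan/components/checkpoint.py:

```python
# Minimal sketch of the centralized exclusion pattern in this commit.
# ToyModel is a made-up stand-in, not torchtitan's real model.
import torch
import torch.nn as nn

excluded_parameters_for_model_only = {"freqs_cis"}


class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(2, 2))
        # A persistent buffer shows up in state_dict() by default.
        self.register_buffer("freqs_cis", torch.randn(4, 2))


def filtered_state_dict(model: nn.Module) -> dict[str, torch.Tensor]:
    # Build the state dict once, then drop excluded keys, so every consumer
    # (cache, save, export) sees the same filtered view.
    sd = dict(model.state_dict())
    for k in excluded_parameters_for_model_only:
        sd.pop(k, None)
    return sd


sd = filtered_state_dict(ToyModel())
assert "freqs_cis" not in sd and "weight" in sd
```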

File tree

6 files changed (+64, -38 lines)

scripts/convert_llama_to_dcp.py

Lines changed: 0 additions & 8 deletions
```diff
@@ -10,7 +10,6 @@
 
 import torch
 import torch.distributed.checkpoint as DCP
-from torchtitan.models.llama.model import precompute_freqs_cis
 from torchtitan.tools.logging import init_logger, logger
 
 
@@ -123,13 +122,6 @@ def convert_llama_weights(input_dir, output_dir, max_seq_len: int):
     for i in range(len(shards)):
         del shards[i]["output.weight"]
 
-    # NOTE: precompute freqs_cis because must be persisted by default in torchtitan
-    state_dict["freqs_cis"] = precompute_freqs_cis(
-        dims_per_head,
-        max_seq_len,
-        params.get("rope_theta", 500000),
-    )
-
     logger.info(f"Writing to DCP at '{output_dir}'")
     output_dir.mkdir(parents=True, exist_ok=True)
     storage_writer = DCP.filesystem.FileSystemWriter(output_dir, thread_count=8)
```
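With the exclusion now enforced centrally, the conversion script no longer needs to bake freqs_cis into the DCP output: the buffer is fully determined by model hyperparameters and is rebuilt at model initialization. For orientation, a Llama-style precompute_freqs_cis looks roughly like this — a paraphrase for illustration, not torchtitan's exact code:

```python
import torch


def precompute_freqs_cis(dim: int, end: int, theta: float = 500000.0) -> torch.Tensor:
    # Rotary-embedding angles as complex exponentials. Everything here is a
    # pure function of the config, so there is nothing checkpoint-worthy.
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
    t = torch.arange(end)
    angles = torch.outer(t, freqs)
    return torch.polar(torch.ones_like(angles), angles)


freqs_cis = precompute_freqs_cis(dim=128, end=2048)
assert freqs_cis.shape == (2048, 64)
```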

scripts/generate/test_generate.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -24,8 +24,8 @@
     parallelize_module,
     RowwiseParallel,
 )
+from torchtitan.components.checkpoint import excluded_parameters_for_model_only
 from torchtitan.components.metrics import build_device_memory_monitor
-
 from torchtitan.config_manager import ConfigManager
 from torchtitan.distributed import ParallelDims, utils as dist_utils
 from torchtitan.protocols.train_spec import get_train_spec
@@ -142,6 +142,8 @@ def test_generate(
     model.eval()
 
     state_dict = {"model": model.state_dict()}
+    for k in excluded_parameters_for_model_only:
+        state_dict["model"].pop(k, None)
 
     # Checkpoint Loading
     begin = time.monotonic()
```
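The pop before loading matters because DCP resolves a load against the keys of the state dict you pass in; checkpoints written after this change contain no freqs_cis entry, so requesting one would fail. A self-contained round trip illustrating this, assuming a recent PyTorch where dcp.save/dcp.load work single-process without an initialized process group:

```python
import tempfile

import torch
import torch.nn as nn
import torch.distributed.checkpoint as dcp

excluded_parameters_for_model_only = {"freqs_cis"}  # mirrors the import above


class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(2, 2))
        self.register_buffer("freqs_cis", torch.randn(4, 2))


def filtered(model: nn.Module) -> dict[str, torch.Tensor]:
    sd = dict(model.state_dict())
    for k in excluded_parameters_for_model_only:
        sd.pop(k, None)
    return sd


with tempfile.TemporaryDirectory() as ckpt_dir:
    # Save without freqs_cis, matching what CheckpointManager now writes.
    dcp.save({"model": filtered(Toy())}, checkpoint_id=ckpt_dir)
    # On load, request only the keys the checkpoint actually contains.
    target = {"model": filtered(Toy())}
    dcp.load(target, checkpoint_id=ckpt_dir)
```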

tests/unit_tests/test_checkpoint.py

Lines changed: 46 additions & 0 deletions
```diff
@@ -534,6 +534,52 @@ def test_enable_first_step_checkpoint(self, mock_save, mock_rank):
 
         manager2.close()
 
+    @mock.patch("torch.distributed.get_rank", return_value=0)
+    @mock.patch("torchtitan.components.checkpoint.dcp.save")
+    def test_excluded_parameters_not_saved(self, mock_save, mock_rank):
+        """Test that freqs_cis is not saved"""
+
+        # Create a fake model with freqs_cis and other parameters
+        class FakeModelWithFreqsCis(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.weight = nn.Parameter(torch.randn(2, 2))
+                self.bias = nn.Parameter(torch.randn(2))
+                # Register freqs_cis as a buffer (common pattern in transformer models)
+                self.register_buffer("freqs_cis", torch.randn(10, 5))
+                self.other_param = nn.Parameter(torch.randn(3, 3))
+
+        fake_model = FakeModelWithFreqsCis()
+        mock_save.side_effect = self.fake_save
+
+        cfg = self.job_config.checkpoint
+        cfg.keep_latest_k = 0  # Disable purging
+
+        manager = CheckpointManager(
+            dataloader=self.data_loader,
+            model_parts=[fake_model],
+            optimizers=self.optimizers,
+            lr_schedulers=self.lr_schedulers,
+            states=self.states,
+            job_config=self.job_config,
+            ft_manager=self.ft_manager,
+        )
+
+        manager.save(curr_step=1)
+        self.assertEqual(mock_save.call_count, 1)
+        checkpoint_path = os.path.join(self.test_folder, "step-1", "state_dict.pt")
+        saved_data = torch.load(checkpoint_path, weights_only=False)
+        model_state_dict = saved_data[MODEL]
+
+        # Verify that freqs_cis is NOT in the saved state dict
+        self.assertNotIn("freqs_cis", model_state_dict)
+        # Verify that other parameters ARE in the saved state dict
+        self.assertIn("weight", model_state_dict)
+        self.assertIn("bias", model_state_dict)
+        self.assertIn("other_param", model_state_dict)
+
+        manager.close()
+
 
 if __name__ == "__main__":
     unittest.main()
```
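The new test should run in isolation with the usual unittest filter, e.g. `python -m unittest tests.unit_tests.test_checkpoint -k test_excluded_parameters_not_saved` (assuming the repository root is on PYTHONPATH).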

torchtitan/components/checkpoint.py

Lines changed: 15 additions & 15 deletions
```diff
@@ -49,12 +49,25 @@ class AsyncMode(str, enum.Enum):
     ASYNC_WITH_PINNED_MEM = "async_with_pinned_mem"
 
 
+# For now, we will manually pop the freqs_cis buffer, as we made this permanent
+# temporarily and we don't want to include it in the exported state_dict.
+# Context: https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/llama3/model.py#L404
+excluded_parameters_for_model_only = {"freqs_cis"}
+
+
 class ModelWrapper(Stateful):
     def __init__(self, model: nn.Module | list[nn.Module]) -> None:
         self.model = [model] if isinstance(model, nn.Module) else model
-        self.cache_state_dict = {
+        self.cache_state_dict = self._get_state_dict()
+
+    def _get_state_dict(self) -> dict[str, Any]:
+        state_dict = {
             k: v for sd in map(get_model_state_dict, self.model) for k, v in sd.items()
         }
+        # Exclude parameters that should not be saved
+        for excluded_key in excluded_parameters_for_model_only:
+            state_dict.pop(excluded_key, None)
+        return state_dict
 
     def state_dict(self) -> dict[str, Any]:
         return self.cache_state_dict
@@ -68,9 +81,7 @@ def load_state_dict(self, state_dict: dict[str, Any]) -> None:
         list(map(func, self.model))
         # `set_model_state_dict()` does change the keys of the input state_dict,
         # we will need to reinitialize the cache_state_dict.
-        self.cache_state_dict = {
-            k: v for sd in map(get_model_state_dict, self.model) for k, v in sd.items()
-        }
+        self.cache_state_dict = self._get_state_dict()
 
 
 class Terminate:
@@ -81,12 +92,6 @@ class SaveDone:
     pass
 
 
-# For now, we will manually pop the freqs_cis buffer, as we made this permanent
-# temporarily and we don't want to include it in the exported state_dict.
-# Context: https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/llama3/model.py#L404
-excluded_parameters_for_model_only = {"freqs_cis"}
-
-
 @torch.no_grad()
 def save_with_gc(state, checkpoint_id):
     dcp.save(state, checkpoint_id=checkpoint_id)
@@ -569,8 +574,6 @@ def _states_to_load(self, model_only: bool) -> dict[str, Any]:
         # For the first step, we will only load the model weights.
         if model_only:
             sd = self.states[MODEL].state_dict()
-            for k in excluded_parameters_for_model_only:
-                sd.pop(k, None)
             return sd
 
         for exclude_key in self.exclude_from_loading:
@@ -600,9 +603,6 @@ def _save_last_step(self, curr_step: int) -> None:
         # }.
         self.states = self.states[MODEL].state_dict()
 
-        for k in excluded_parameters_for_model_only:
-            self.states.pop(k, None)
-
         if self.export_dtype != torch.float32:
             self.states = {
                 k: v.to(self.export_dtype) for k, v in self.states.items()
```
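The relocated comment still describes the persistent freqs_cis buffer as a temporary state of affairs. The eventual alternative it hints at (an assumption on my part, not part of this commit) would be to register the buffer as non-persistent, letting PyTorch keep it out of state_dict() with no manual exclusion set at all:

```python
import torch
import torch.nn as nn


class RoPEBlock(nn.Module):  # hypothetical module, for illustration only
    def __init__(self):
        super().__init__()
        self.wq = nn.Linear(8, 8, bias=False)
        # persistent=False keeps the buffer out of state_dict() entirely.
        self.register_buffer("freqs_cis", torch.randn(16, 4), persistent=False)


assert "freqs_cis" not in RoPEBlock().state_dict()
assert "wq.weight" in RoPEBlock().state_dict()
```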

torchtitan/experiments/llama4/scripts/convert_hf_to_dcp_with_gpus.py

Lines changed: 0 additions & 7 deletions
```diff
@@ -497,11 +497,6 @@ class MyJobConfig:
         size += v.numel() * v.element_size()
     logger.info(f"Total size of the model: {size / 1e9:.2f} GB")
 
-    # Do not support PP yet, we will need to iterate over the PP dimension and
-    # extract the corresponding state_dict and device_mesh.
-    if "freqs_cis" in state_dict:
-        state_dict.pop("freqs_cis")
-
     # Our tokenizer is not up-to-date yet.
     tok_embeddings_weight = state_dict.pop("tok_embeddings.weight")
     output_weight = state_dict.pop("output.weight")
@@ -531,8 +526,6 @@ def state_dict(self) -> dict[str, torch.Tensor]:
         dist.barrier()
         logger.info(f"Verifies state_dict {time.time() - begin}.")
     else:
-        # oh, this is pretty bad, when can we get rid of the freqs_cis issue?
-        state_dict["freqs_cis"] = None
         trainer.checkpointer.states[MODEL] = DummyModel(state_dict)
         trainer.checkpointer.last_save_model_weights_only = True
         trainer.checkpointer.export_dtype = next(iter(state_dict.values())).dtype
```

torchtitan/experiments/llama4/scripts/convert_meta_to_dcp_with_gpus.py

Lines changed: 0 additions & 7 deletions
```diff
@@ -498,11 +498,6 @@ class MyJobConfig:
         size += v.numel() * v.element_size()
     logger.info(f"Total size of the model: {size / 1e9:.2f} GB")
 
-    # Do not support PP yet, we will need to iterate over the PP dimension and
-    # extract the corresponding state_dict and device_mesh.
-    if "freq_cis" in state_dict:
-        state_dict.pop("freqs_cis")
-
     state_dict = CheckpointConverter(
         process_group=trainer.world_mesh.get_group(),
         path=config.checkpoint.convert_path,
@@ -526,8 +521,6 @@ def state_dict(self) -> dict[str, torch.Tensor]:
         dist.barrier()
         logger.info(f"Verifies state_dict {time.time() - begin}.")
     else:
-        # oh, this is pretty bad, when can we get rid of the freqs_cis issue?
-        state_dict["freqs_cis"] = None
         trainer.checkpointer.states[MODEL] = DummyModel(state_dict)
         trainer.checkpointer.last_save_model_weights_only = True
         trainer.checkpointer.export_dtype = next(iter(state_dict.values())).dtype
```
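Both conversion scripts drop their manual freqs_cis handling because the checkpointer now filters it. A quick sanity check that a converted checkpoint really carries no such entry is to read the DCP metadata without loading any tensor data (the path below is a placeholder):

```python
import torch.distributed.checkpoint as dcp

# FileSystemReader.read_metadata() returns the checkpoint's key metadata only.
reader = dcp.FileSystemReader("/path/to/dcp/checkpoint")
metadata = reader.read_metadata()
offending = [k for k in metadata.state_dict_metadata if "freqs_cis" in k]
assert not offending, f"freqs_cis leaked into the checkpoint: {offending}"
```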
