@@ -4,6 +4,7 @@
 from contextlib import contextmanager
 from functools import partial
 
+import megatron.core
 import torch
 from megatron.core import mpu
 from megatron.core.enums import ModelType
@@ -12,6 +13,7 @@
 from megatron.core.rerun_state_machine import RerunMode, get_rerun_state_machine
 from megatron.core.utils import StragglerDetector
 from megatron.training import ft_integration, get_args, get_timers, is_last_rank, pretrain, print_rank_0, training
+from packaging import version
 from torch.distributed.nn import all_reduce
 
 from swift.utils import get_logger
@@ -129,7 +131,7 @@ def evaluate(self,
         # make validation batch size independent from training batch size
         eval_batch_size = args.global_batch_size
         eval_num_microbatches = eval_batch_size // (args.micro_batch_size * args.data_parallel_size)
-
+        megatron_core_013 = version.parse(megatron.core.__version__) >= version.parse('0.13.0rc0')
         with torch.no_grad():
             iteration = 0
             if verbose:
@@ -161,19 +163,35 @@ def evaluate(self,
                     torch.cuda.empty_cache()
 
                 if mpu.is_pipeline_last_stage(ignore_virtual=True):
-                    # Reduce across processes.
-                    for loss_dict in loss_dicts:
-                        for key in loss_dict:
+                    if megatron_core_013:
+                        for key in loss_dicts[0].keys():
                             if key not in total_loss_dict:
                                 total_loss_dict[key] = torch.tensor([0.0, 0.0], dtype=torch.float).cuda()
-                            val = loss_dict[key]
-                            if isinstance(val, tuple) or isinstance(val, list):
-                                total_loss_dict[key][0] += val[0]
-                                total_loss_dict[key][1] += val[1]
-                            else:
+                            val = [x[key].view(-1) for x in loss_dicts]
+                            if val[0].numel() == 2:
+                                val = torch.vstack(val).sum(dim=0)
+                                torch.distributed.all_reduce(
+                                    val, group=mpu.get_data_parallel_group(with_context_parallel=True))
+                                total_loss_dict[key] += val
+                            elif val[0].numel() == 1:
+                                val = torch.cat(val).sum()
                                 total_loss_dict[key][0] += val
-                                total_loss_dict[key][1] += 1
-
+                                total_loss_dict[key][1] += len(loss_dicts)
+                            else:
+                                raise ValueError(f'Invalid value shape: {val[0].shape} for key {key}')
+                    else:
+                        # Reduce across processes.
+                        for loss_dict in loss_dicts:
+                            for key in loss_dict:
+                                if key not in total_loss_dict:
+                                    total_loss_dict[key] = torch.tensor([0.0, 0.0], dtype=torch.float).cuda()
+                                val = loss_dict[key]
+                                if isinstance(val, tuple) or isinstance(val, list):
+                                    total_loss_dict[key][0] += val[0]
+                                    total_loss_dict[key][1] += val[1]
+                                else:
+                                    total_loss_dict[key][0] += val
+                                    total_loss_dict[key][1] += 1
                 args.consumed_valid_samples += eval_batch_size
 
                 if args.exit_duration_in_mins:
@@ -250,7 +268,8 @@ def loss_func(self, output_tensor: torch.Tensor, *, loss_mask: torch.Tensor):
         total_tokens = loss_mask.sum()
         loss = torch.cat([torch.sum(losses.view(-1) * loss_mask).view(1), total_tokens.view(1)])
 
-        if args.context_parallel_size > 1:
+        megatron_core_013 = version.parse(megatron.core.__version__) >= version.parse('0.13.0rc0')
+        if args.context_parallel_size > 1 and not megatron_core_013:
             loss = all_reduce(loss, group=mpu.get_context_parallel_group())
 
         # Check individual rank losses are not NaN prior to DP all-reduce.
@@ -287,19 +306,21 @@ def loss_func(self, output_tensor: torch.Tensor, *, loss_mask: torch.Tensor):
         )
         # Reduce loss for logging.
         reporting_loss = loss.clone().detach()
-        torch.distributed.all_reduce(reporting_loss, group=mpu.get_data_parallel_group())
-
-        # loss[0] is a view of loss, so it has ._base not None, which triggers assert error
-        # in core/pipeline_parallel/schedule.py::deallocate_output_tensor, calling .clone()
-        # on loss[0] fixes this
-        local_num_tokens = loss[1].clone().detach().to(torch.int)
-        return (
+        lm_loss = loss[0]
+        if not megatron_core_013:
             # fix megatron-lm bug
             # https://github.com/NVIDIA/Megatron-LM/blob/core_r0.12.0/megatron/core/pipeline_parallel/schedules.py#L291
-            loss[0] / mpu.get_context_parallel_world_size(),
+            torch.distributed.all_reduce(reporting_loss, group=mpu.get_data_parallel_group())
+            lm_loss = lm_loss / mpu.get_context_parallel_world_size()
+            reporting_loss = (reporting_loss[0], reporting_loss[1])
+        else:
+            lm_loss = lm_loss.clone()
+        local_num_tokens = loss[1].clone().detach().to(torch.int)
+        return (
+            lm_loss,
             local_num_tokens,
             {
-                'lm loss': (reporting_loss[0], reporting_loss[1])
+                'lm loss': reporting_loss
             },
         )
 
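
For reference, the new megatron-core >= 0.13 branch of `evaluate` treats each metric either as a (loss_sum, num_tokens) pair that is summed across microbatches and then all-reduced over the data-parallel (with context-parallel) group, or as a scalar that is accumulated together with a microbatch count. Below is a minimal CPU-only sketch of that bookkeeping; the `aggregate_eval_losses` helper and the sample values are invented for illustration, and the all-reduce and `.cuda()` placement are omitted so it runs standalone:

    import torch

    def aggregate_eval_losses(loss_dicts, total_loss_dict):
        """Accumulate per-microbatch loss dicts into running [sum, count] pairs."""
        for key in loss_dicts[0].keys():
            if key not in total_loss_dict:
                total_loss_dict[key] = torch.tensor([0.0, 0.0], dtype=torch.float)
            val = [x[key].view(-1) for x in loss_dicts]
            if val[0].numel() == 2:
                # Each microbatch already reports (loss_sum, num_tokens): sum element-wise.
                # The real code then all-reduces this pair over the data-parallel
                # (with context-parallel) group before accumulating.
                total_loss_dict[key] += torch.vstack(val).sum(dim=0)
            elif val[0].numel() == 1:
                # Scalar metric: accumulate the sum and count the microbatches.
                total_loss_dict[key][0] += torch.cat(val).sum()
                total_loss_dict[key][1] += len(loss_dicts)
            else:
                raise ValueError(f'Invalid value shape: {val[0].shape} for key {key}')

    # Hypothetical outputs of two microbatches.
    loss_dicts = [
        {'lm loss': torch.tensor([12.3, 512.0])},   # (loss_sum, num_tokens)
        {'lm loss': torch.tensor([11.8, 508.0])},
    ]
    totals = {}
    aggregate_eval_losses(loss_dicts, totals)
    print(totals['lm loss'])  # -> loss sum 24.1 over 1020 tokens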
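
On the `loss_func` side, the loss tensor packs the masked loss sum and the valid-token count into a single 2-element vector, so one reduction carries both values. The CPU-only sketch below (token losses, mask, and the shard split are invented) shows why summing the per-rank vectors over the context-parallel group, as the pre-0.13 branch still does explicitly via `all_reduce`, reproduces the statistics of the full sequence:

    import torch

    def local_loss_vector(losses, loss_mask):
        # Same bookkeeping as loss_func: pack the masked loss sum and the
        # valid-token count into one 2-element vector.
        total_tokens = loss_mask.sum()
        return torch.cat([torch.sum(losses.view(-1) * loss_mask).view(1), total_tokens.view(1)])

    # Hypothetical per-token losses and mask for one 8-token sequence.
    losses = torch.tensor([2.0, 1.5, 3.0, 0.5, 1.0, 2.5, 0.0, 4.0])
    loss_mask = torch.tensor([1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0])

    # Statistics computed on the whole sequence by a single rank.
    full = local_loss_vector(losses, loss_mask)

    # With context parallelism the sequence is sharded across ranks; summing the
    # per-shard vectors (what the all_reduce over the context-parallel group does)
    # is equivalent to computing the vector on the unsharded sequence.
    shard_a = local_loss_vector(losses[:4], loss_mask[:4])
    shard_b = local_loss_vector(losses[4:], loss_mask[4:])
    assert torch.allclose(full, shard_a + shard_b)
    print(full)  # -> loss sum 11.5 over 6 tokens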