
fix ulysses dpo #4149


Merged
merged 2 commits on May 9, 2025
fix ulysses dpo
tastelikefeet committed May 9, 2025
commit 79d55325fc1dfcc5a8c7d284ffde637c41473b1d
2 changes: 1 addition & 1 deletion docs/source/Instruction/命令行参数.md
@@ -74,7 +74,7 @@
 - Note: If you train a deepseek-r1/qwq model on a dataset that does not contain `<think>...</think>`, additionally pass `--response_prefix ''` when running inference on the trained model.
 - padding_side: The padding side when training with `batch_size>=2`. Options are 'left' and 'right'; the default is 'right'. (For inference with batch_size>=2, only left padding is applied.)
 - loss_scale: The loss-weight setting for training tokens. The default is `'default'`, meaning all responses (including history) are weighted 1 in the cross-entropy loss. Options are 'default', 'last_round', 'all', plus the agent-specific loss scales 'react', 'agentflan', 'alpha_umi', and 'qwen'. 'last_round' computes the loss only on the last round's response; 'all' computes the loss on all tokens. For the agent options, see [Pluginization](../Customization/插件化.md) and [Agent support](./Agent支持.md).
-- sequence_parallel_size: Sequence parallelism size; the default is 1. Currently supported for pt/sft. For a reference training script, see [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/long_text/sequence_parallel.sh).
+- sequence_parallel_size: Sequence parallelism size; the default is 1. Currently supported for pt/sft/dpo. For a reference training script, see [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/long_text/sequence_parallel.sh).
 - use_chat_template: Use the chat template or the generation template; the default is `True`. `swift pt` automatically switches to the generation template.
 - template_backend: The template backend to use. Options are 'swift' and 'jinja'; the default is 'swift'. If jinja is used, transformers' `apply_chat_template` is applied.
 - Note: The jinja template backend supports inference only, not training.
2 changes: 1 addition & 1 deletion docs/source_en/Instruction/Command-line-parameters.md
@@ -77,7 +77,7 @@ Hints:
 - Note: If you are training the deepseek-r1/qwq model with a dataset that does not include `<think>...</think>`, please pass `--response_prefix ''` additionally when running inference after training.
 - padding_side: Padding side when `batch_size>=2` during training. Options are 'left' and 'right', with 'right' as the default. (For inference with batch_size>=2, only left padding is applied.)
 - loss_scale: Setting for the loss weight of training tokens. Default is `'default'`, meaning all responses (including history) are weighted 1 in the cross-entropy loss. Options are 'default', 'last_round', 'all', and the agent-specific loss scales 'react', 'agentflan', 'alpha_umi', and 'qwen'. 'last_round' computes the loss only on the last round's response; 'all' computes the loss on all tokens. For the agent options, see [Pluginization](../Customization/Pluginization.md) and [Agent Training](./Agent-support.md).
-- sequence_parallel_size: Sequence parallelism size, default is 1. Currently supported in pt/sft. For a reference training script, see [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/long_text/sequence_parallel.sh).
+- sequence_parallel_size: Sequence parallelism size, default is 1. Currently supported in pt/sft/dpo. For a reference training script, see [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/long_text/sequence_parallel.sh).
 - use_chat_template: Use the chat template or the generation template, default is `True`. `swift pt` automatically switches to the generation template.
 - template_backend: Selection of the template backend. Options are 'swift' and 'jinja', with 'swift' as the default. If jinja is used, transformers' `apply_chat_template` is applied.
 - Note: The jinja template backend supports only inference, not training.
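The dpo addition above is the substance of this PR: sequence parallelism previously covered only pre-training and SFT. For intuition, the sketch below shows what a sequence-parallel size means for the data each rank sees: the token dimension is sliced so every rank processes only its contiguous chunk. This is an illustration under assumed shapes, not ms-swift's implementation; `split_sequence` is a made-up helper.

```python
# Minimal sketch of Ulysses-style sequence splitting: each of the
# sp_world_size ranks keeps one contiguous slice of the token dimension.
# Illustrative only; not ms-swift's actual code.
import torch

def split_sequence(input_ids: torch.Tensor, sp_world_size: int, sp_rank: int) -> torch.Tensor:
    seq_len = input_ids.shape[1]
    assert seq_len % sp_world_size == 0, 'pad so the length divides evenly'
    chunk = seq_len // sp_world_size
    return input_ids[:, sp_rank * chunk:(sp_rank + 1) * chunk]

# One sequence of 8 tokens split across 2 sequence-parallel ranks:
ids = torch.arange(8).unsqueeze(0)
print(split_sequence(ids, sp_world_size=2, sp_rank=0))  # tensor([[0, 1, 2, 3]])
print(split_sequence(ids, sp_world_size=2, sp_rank=1))  # tensor([[4, 5, 6, 7]])
```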
12 changes: 12 additions & 0 deletions swift/llm/train/rlhf.py
@@ -15,6 +15,11 @@ class SwiftRLHF(SwiftSft):
     args: args_class

     def _prepare_model_tokenizer(self):
+        if self.args.sequence_parallel_size > 1:
+            # Duplicate calls are allowed; this ensures the function runs
+            # before the model is initialized.
+            from swift.trainers.sequence_parallel import sequence_parallel
+            sequence_parallel.init_sequence_parallel(self.args.sequence_parallel_size)
         from swift.llm.infer.utils import prepare_adapter
         args = self.args
         for key in ['ref', 'reward', 'value']:
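Placing the call at the top of `_prepare_model_tokenizer` guarantees the sequence-parallel groups exist before any model is constructed. The sketch below, with a made-up `sp_groups` helper, shows the partitioning this implies: the world splits into data-parallel replicas of `sp_size` sequence-parallel ranks each.

```python
# Illustrative only: how world_size ranks could be grouped once
# init_sequence_parallel(sp_size) has run. Not ms-swift's grouping code.
def sp_groups(world_size, sp_size):
    assert world_size % sp_size == 0, 'sp_size must divide world_size'
    return [list(range(start, start + sp_size))
            for start in range(0, world_size, sp_size)]

# 8 GPUs with sequence_parallel_size=2 leave 8 // 2 = 4 data-parallel groups:
print(sp_groups(world_size=8, sp_size=2))  # [[0, 1], [2, 3], [4, 5], [6, 7]]
```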
@@ -47,6 +52,13 @@ def _prepare_model_tokenizer(self):

             model = prepare_adapter(args, model, adapters)
             if origin_key in {'ref', 'reward'}:
+                if self.args.sequence_parallel_size > 1:
+                    from swift.trainers.sequence_parallel import sequence_parallel
+                    if hasattr(model, 'model_meta'):
+                        is_multimodal = model.model_meta.is_multimodal
+                    else:
+                        is_multimodal = model.model.model_meta.is_multimodal
+                    sequence_parallel.prepare_model(model, processor, split_in_forward=is_multimodal)
                 model.requires_grad_(False).eval()
             else:
                 model = self.prepare_model(args, model, task_type=task_type)
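Two details are worth noting in this hunk: the ref/reward models are frozen with `requires_grad_(False).eval()` rather than routed through `prepare_model`, and the multimodal flag is read through a possible wrapper layer, since after `prepare_adapter` the metadata may sit on the inner `model.model`. The toy sketch below reproduces that `hasattr` fallback; the classes are stand-ins, not ms-swift types.

```python
# Stand-in types demonstrating the model_meta lookup used above.
class Meta:
    def __init__(self, is_multimodal):
        self.is_multimodal = is_multimodal

class BareModel:
    def __init__(self):
        self.model_meta = Meta(is_multimodal=False)

class WrappedModel:  # e.g. an adapter wrapper around the base model
    def __init__(self):
        self.model = BareModel()

def resolve_is_multimodal(model):
    # Prefer the attribute on the outer object; fall back one level deeper.
    if hasattr(model, 'model_meta'):
        return model.model_meta.is_multimodal
    return model.model.model_meta.is_multimodal

assert resolve_is_multimodal(BareModel()) is False
assert resolve_is_multimodal(WrappedModel()) is False
```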
4 changes: 4 additions & 0 deletions swift/trainers/sequence_parallel/ulysses.py
@@ -273,8 +273,12 @@ def __init__(self):
         self.model_dtype = None
         self.causal_mask_func = None
         self.device_mesh = None
+        self._inited = False

     def init_sequence_parallel(self, size):
+        if self._inited:
+            return
+        self._inited = True
         self.sp_world_size = size
         rank, local_rank, world_size, local_world_size = get_dist_setting()
         self.dp_world_size = world_size // size
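The `_inited` flag makes `init_sequence_parallel` idempotent, which is exactly what lets `rlhf.py` above call it eagerly without checking whether another code path already did. A toy reproduction of the guard (a stub, not the real `SequenceParallel` class):

```python
# First call wins; later calls are no-ops, so call order across entry points
# no longer matters. Stub class for illustration only.
class SequenceParallelStub:
    def __init__(self):
        self._inited = False
        self.sp_world_size = None

    def init_sequence_parallel(self, size):
        if self._inited:
            return
        self._inited = True
        self.sp_world_size = size

sp = SequenceParallelStub()
sp.init_sequence_parallel(4)
sp.init_sequence_parallel(8)  # duplicate call is silently ignored
assert sp.sp_world_size == 4
```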