Fix fp16 bf16 #3909

Merged
merged 6 commits on Apr 17, 2025
4 changes: 2 additions & 2 deletions docs/source/Instruction/Megatron-SWIFT训练.md
@@ -207,8 +207,8 @@ I am a language model developed by swift, you can call me swift-robot. How can I
 - 🔥eval_interval: Evaluation interval (steps). Defaults to None, i.e., it is set to save_interval.

 **Mixed Precision Parameters**
-- fp16: fp16 mode. Defaults to False. It is set according to the model's torch_dtype. Please use `--torch_dtype` to set it; config.json is read by default.
-- bf16: bf16 mode. Defaults to False. It is set according to the model's torch_dtype.
+- fp16: fp16 mode. Defaults to None and is set according to the model's torch_dtype. torch_dtype is read from config.json by default.
+- bf16: bf16 mode. Defaults to None and is set according to the model's torch_dtype.
 - apply_query_key_layer_scaling: Scales `Q * K^T` by `1 / layer_number` (e.g., divides by layer_num for the layer_num-th layer). This is very helpful for fp16 training. Defaults to None, i.e., it is set to True when `--fp16` is used.
 - attention_softmax_in_fp32: Uses fp32 for the attention_mask and softmax computations. Defaults to True.

4 changes: 2 additions & 2 deletions docs/source_en/Instruction/Megatron-SWIFT-Training.md
@@ -216,8 +216,8 @@ seq_length: Defaults to None, meaning it is set to `max_length`. To restrict the

 **Mixed Precision Parameters**

-- fp16: FP16 mode. Default is False. Set according to the model's torch_dtype. Please use `--torch_dtype` to set it. By default, it reads from config.json.
-- bf16: BF16 mode. Default is False. Set according to the model's torch_dtype.
+- fp16: FP16 mode. The default is None, and it will be set according to the model's torch_dtype. The torch_dtype is read from the config.json by default.
+- bf16: BF16 mode. The default is None, and it will be set according to the model's torch_dtype.
 - apply_query_key_layer_scaling: Scales `Q * K^T` by `1 / layer number` (e.g., divide by layer_num for layer_num-th layer). This is helpful for FP16 training. Default is None, meaning that if `--fp16` is used, it will be set to True.
 - attention_softmax_in_fp32: Uses FP32 for computations in attention_mask and softmax. Default is True.

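To make the documented resolution rule concrete, here is a minimal standalone sketch; the helper `resolve_mixed_precision` is invented for illustration and is not part of the ms-swift API. When neither flag is passed, fp16/bf16 follow the checkpoint's torch_dtype; an explicit flag is respected.

```python
from typing import Optional, Tuple

import torch


def resolve_mixed_precision(torch_dtype: torch.dtype,
                            fp16: Optional[bool] = None,
                            bf16: Optional[bool] = None) -> Tuple[bool, bool]:
    """Derive (fp16, bf16) from torch_dtype, keeping any flag the user set explicitly."""
    if torch_dtype in (torch.float16, torch.float32):
        derived = (True, False)   # fp16 for float16/float32 checkpoints
    elif torch_dtype == torch.bfloat16:
        derived = (False, True)   # bf16 for bfloat16 checkpoints
    else:
        raise ValueError(f'unsupported torch_dtype: {torch_dtype}')
    return (derived[0] if fp16 is None else fp16,
            derived[1] if bf16 is None else bf16)


print(resolve_mixed_precision(torch.bfloat16))                          # (False, True)
print(resolve_mixed_precision(torch.bfloat16, fp16=True, bf16=False))   # (True, False)
```

Under the old behaviour, where fp16/bf16 defaulted to False, the second call's explicit values would have been recomputed from torch_dtype instead of being kept.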
29 changes: 16 additions & 13 deletions swift/llm/argument/base_args/model_args.py
@@ -98,23 +98,26 @@ def _init_max_memory(self):
     def _init_torch_dtype(self) -> None:
         """"If torch_dtype is None, find a proper dtype by the train_type/GPU"""
         from swift.llm import TrainArguments
-        if self.torch_dtype is None and isinstance(self, TrainArguments):
-            # Compatible with --fp16/--bf16
-            for key in ['fp16', 'bf16']:
-                value = getattr(self, key)
-                if value:
-                    self.torch_dtype = {'fp16': 'float16', 'bf16': 'bfloat16'}[key]

         self.torch_dtype: Optional[torch.dtype] = HfConfigFactory.to_torch_dtype(self.torch_dtype)
         self.torch_dtype: torch.dtype = self._init_model_info()
         # Mixed Precision Training
-        if isinstance(self, TrainArguments) and not is_torch_mps_available():
-            if self.torch_dtype in {torch.float16, torch.float32}:
-                self.fp16, self.bf16 = True, False
-            elif self.torch_dtype == torch.bfloat16:
-                self.fp16, self.bf16 = False, True
-            else:
-                raise ValueError(f'args.torch_dtype: {self.torch_dtype}')
+        if isinstance(self, TrainArguments):
+            self._init_mixed_precision()
+
+    def _init_mixed_precision(self):
+        if is_torch_mps_available():
+            fp16, bf16 = False, False
+        elif self.torch_dtype in {torch.float16, torch.float32}:
+            fp16, bf16 = True, False
+        elif self.torch_dtype == torch.bfloat16:
+            fp16, bf16 = False, True
+        else:
+            raise ValueError(f'args.torch_dtype: {self.torch_dtype}')
+        if self.fp16 is None:
+            self.fp16 = fp16
+        if self.bf16 is None:
+            self.bf16 = bf16

     def _init_rope_scaling(self):
         assert self.max_length is not None, 'Use max_model_len together with rope_scaling'
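
The key change above is that fp16/bf16 now treat None as "unset" instead of defaulting to False, so an explicit user choice is never clobbered by the dtype-derived default. A minimal standalone sketch of that pattern (`PrecisionFlags` is a made-up stand-in, not a class from swift):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class PrecisionFlags:
    # Tri-state flags: None means "not set by the user"; True/False is an explicit choice.
    # With plain `bool = False` defaults there is no way to tell an explicit
    # `--bf16 false` apart from "nothing passed".
    fp16: Optional[bool] = None
    bf16: Optional[bool] = None

    def fill_defaults(self, derived_fp16: bool, derived_bf16: bool) -> None:
        # Only fill values the user left unset, mirroring _init_mixed_precision above.
        if self.fp16 is None:
            self.fp16 = derived_fp16
        if self.bf16 is None:
            self.bf16 = derived_bf16


flags = PrecisionFlags(bf16=False)                           # user explicitly disabled bf16
flags.fill_defaults(derived_fp16=False, derived_bf16=True)   # bfloat16 checkpoint
print(flags)  # PrecisionFlags(fp16=False, bf16=False) -> the explicit False survives
```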
3 changes: 2 additions & 1 deletion swift/llm/argument/infer_args.py
@@ -152,7 +152,8 @@ def _init_stream(self):
     def _init_ddp(self):
         if not is_dist():
             return
-        assert not self.eval_human and not self.stream
+        assert not self.eval_human and not self.stream, (
+            f'args.eval_human: {self.eval_human}, args.stream: {self.stream}')
         self._init_device()
         init_process_group(self.ddp_backend)

2 changes: 2 additions & 0 deletions swift/llm/argument/train_args.py
@@ -23,6 +23,8 @@ class Seq2SeqTrainingOverrideArguments(TrainArgumentsMixin, Seq2SeqTrainingArgum
     output_dir: Optional[str] = None
     learning_rate: Optional[float] = None
     eval_strategy: Optional[str] = None  # steps, epoch
+    fp16: Optional[bool] = None
+    bf16: Optional[bool] = None

     def _init_output_dir(self):
         if self.output_dir is None:
2 changes: 1 addition & 1 deletion swift/llm/model/register.py
@@ -72,7 +72,7 @@ class ModelMeta:
     task_type: Optional[str] = None

     # File patterns to ignore when downloading the model.
-    ignore_patterns: List[str] = field(default_factory=list)
+    ignore_patterns: Optional[List[str]] = None
     # Usually specifies the version limits of transformers.
     requires: List[str] = field(default_factory=list)
     tags: List[str] = field(default_factory=list)
10 changes: 4 additions & 6 deletions swift/megatron/argument/megatron_args.py
@@ -105,8 +105,8 @@ class MegatronArguments(ExtraMegatronArguments):
     transformer_impl: Literal['local', 'transformer_engine'] = 'transformer_engine'

     # mixed precision
-    fp16: bool = False
-    bf16: bool = False
+    fp16: Optional[bool] = None
+    bf16: Optional[bool] = None
     apply_query_key_layer_scaling: Optional[bool] = None
     attention_softmax_in_fp32: bool = True

@@ -132,10 +132,8 @@ class MegatronArguments(ExtraMegatronArguments):
     no_create_attention_mask_in_dataloader: bool = True

     def _init_mixed_precision(self):
-        if self.torch_dtype == torch.bfloat16:
-            self.bf16 = True
-        elif self.torch_dtype == torch.float16:
-            self.fp16 = True
+        from swift.llm.argument.base_args.model_args import ModelArguments
+        ModelArguments._init_mixed_precision(self)
         if self.apply_query_key_layer_scaling is None:
             self.apply_query_key_layer_scaling = self.fp16

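Since apply_query_key_layer_scaling now follows the resolved fp16 flag, here is a toy sketch of what the option does; this is illustrative only and is not Megatron's fused kernel. The `Q @ K^T` logits get an extra `1 / layer_number` factor so that deep layers stay within the fp16 range, and the real kernels compensate for that factor inside an fp32 softmax.

```python
import torch


def scaled_attention_scores(q: torch.Tensor, k: torch.Tensor, layer_number: int,
                            apply_query_key_layer_scaling: bool = True) -> torch.Tensor:
    # Standard scaled dot-product attention uses 1/sqrt(head_dim); the option adds
    # an extra 1/layer_number factor to keep the raw logits small in deep networks.
    scale = 1.0 / (q.shape[-1] ** 0.5)
    if apply_query_key_layer_scaling:
        scale /= layer_number
    return (q @ k.transpose(-2, -1)) * scale


q = torch.randn(2, 8, 64)  # (batch, seq, head_dim); float32 here just for the demo
k = torch.randn(2, 8, 64)
print(scaled_attention_scores(q, k, layer_number=24).abs().max())
```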