Fix fp16 bf16 #3909

Merged
merged 6 commits on Apr 17, 2025
4 changes: 2 additions & 2 deletions docs/source/Instruction/Megatron-SWIFT训练.md
@@ -207,8 +207,8 @@ I am a language model developed by swift, you can call me swift-robot. How can I
 - 🔥eval_interval: Evaluation interval (steps). Defaults to None, i.e., it is set to save_interval.

 **Mixed Precision Parameters**
-- fp16: fp16 mode. Defaults to False. It is set according to the model's torch_dtype. Please use `--torch_dtype` to set it; config.json is read by default.
-- bf16: bf16 mode. Defaults to False. It is set according to the model's torch_dtype.
+- fp16: fp16 mode. Defaults to None and is set according to the model's torch_dtype. torch_dtype is read from config.json by default.
+- bf16: bf16 mode. Defaults to None and is set according to the model's torch_dtype.
 - apply_query_key_layer_scaling: Scales `Q * K^T` by `1 / layer_number` (e.g., divides by layer_num for the layer_num-th layer). This is very helpful for fp16 training. Defaults to None, i.e., it is set to True when `--fp16` is used.
 - attention_softmax_in_fp32: Uses fp32 for the attention_mask and softmax computations. Defaults to True.

4 changes: 2 additions & 2 deletions docs/source_en/Instruction/Megatron-SWIFT-Training.md
@@ -216,8 +216,8 @@ seq_length: Defaults to None, meaning it is set to `max_length`. To restrict the

 **Mixed Precision Parameters**

-- fp16: FP16 mode. Default is False. Set according to the model's torch_dtype. Please use `--torch_dtype` to set it. By default, it reads from config.json.
-- bf16: BF16 mode. Default is False. Set according to the model's torch_dtype.
+- fp16: FP16 mode. The default is None, and it will be set according to the model's torch_dtype. The torch_dtype is read from the config.json by default.
+- bf16: BF16 mode. The default is None, and it will be set according to the model's torch_dtype.
 - apply_query_key_layer_scaling: Scales `Q * K^T` by `1 / layer number` (e.g., divide by layer_num for layer_num-th layer). This is helpful for FP16 training. Default is None, meaning that if `--fp16` is used, it will be set to True.
 - attention_softmax_in_fp32: Uses FP32 for computations in attention_mask and softmax. Default is True.

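To make the documented resolution rule concrete, here is a minimal standalone sketch; the helper `resolve_mixed_precision` is invented for illustration and is not part of the ms-swift API. When neither flag is passed, fp16/bf16 follow the checkpoint's torch_dtype; an explicit flag is respected.

```python
from typing import Optional, Tuple

import torch


def resolve_mixed_precision(torch_dtype: torch.dtype,
                            fp16: Optional[bool] = None,
                            bf16: Optional[bool] = None) -> Tuple[bool, bool]:
    """Derive (fp16, bf16) from torch_dtype, keeping any flag the user set explicitly."""
    if torch_dtype in (torch.float16, torch.float32):
        derived = (True, False)   # fp16 for float16/float32 checkpoints
    elif torch_dtype == torch.bfloat16:
        derived = (False, True)   # bf16 for bfloat16 checkpoints
    else:
        raise ValueError(f'unsupported torch_dtype: {torch_dtype}')
    return (derived[0] if fp16 is None else fp16,
            derived[1] if bf16 is None else bf16)


print(resolve_mixed_precision(torch.bfloat16))                          # (False, True)
print(resolve_mixed_precision(torch.bfloat16, fp16=True, bf16=False))   # (True, False)
```

Under the old behaviour, where fp16/bf16 defaulted to False, the second call's explicit values would have been recomputed from torch_dtype instead of being kept.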
29 changes: 16 additions & 13 deletions swift/llm/argument/base_args/model_args.py
@@ -98,23 +98,26 @@ def _init_max_memory(self):
     def _init_torch_dtype(self) -> None:
         """"If torch_dtype is None, find a proper dtype by the train_type/GPU"""
         from swift.llm import TrainArguments
-        if self.torch_dtype is None and isinstance(self, TrainArguments):
-            # Compatible with --fp16/--bf16
-            for key in ['fp16', 'bf16']:
-                value = getattr(self, key)
-                if value:
-                    self.torch_dtype = {'fp16': 'float16', 'bf16': 'bfloat16'}[key]

         self.torch_dtype: Optional[torch.dtype] = HfConfigFactory.to_torch_dtype(self.torch_dtype)
         self.torch_dtype: torch.dtype = self._init_model_info()
         # Mixed Precision Training
-        if isinstance(self, TrainArguments) and not is_torch_mps_available():
-            if self.torch_dtype in {torch.float16, torch.float32}:
-                self.fp16, self.bf16 = True, False
-            elif self.torch_dtype == torch.bfloat16:
-                self.fp16, self.bf16 = False, True
-            else:
-                raise ValueError(f'args.torch_dtype: {self.torch_dtype}')
+        if isinstance(self, TrainArguments):
+            self._init_mixed_precision()
+
+    def _init_mixed_precision(self):
+        if is_torch_mps_available():
+            fp16, bf16 = False, False
+        elif self.torch_dtype in {torch.float16, torch.float32}:
+            fp16, bf16 = True, False
+        elif self.torch_dtype == torch.bfloat16:
+            fp16, bf16 = False, True
+        else:
+            raise ValueError(f'args.torch_dtype: {self.torch_dtype}')
+        if self.fp16 is None:
+            self.fp16 = fp16
+        if self.bf16 is None:
+            self.bf16 = bf16

     def _init_rope_scaling(self):
         assert self.max_length is not None, 'Use max_model_len together with rope_scaling'
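
The key change above is that fp16/bf16 now treat None as "unset" instead of defaulting to False, so an explicit user choice is never clobbered by the dtype-derived default. A minimal standalone sketch of that pattern (`PrecisionFlags` is a made-up stand-in, not a class from swift):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class PrecisionFlags:
    # Tri-state flags: None means "not set by the user"; True/False is an explicit choice.
    # With plain `bool = False` defaults there is no way to tell an explicit
    # `--bf16 false` apart from "nothing passed".
    fp16: Optional[bool] = None
    bf16: Optional[bool] = None

    def fill_defaults(self, derived_fp16: bool, derived_bf16: bool) -> None:
        # Only fill values the user left unset, mirroring _init_mixed_precision above.
        if self.fp16 is None:
            self.fp16 = derived_fp16
        if self.bf16 is None:
            self.bf16 = derived_bf16


flags = PrecisionFlags(bf16=False)                           # user explicitly disabled bf16
flags.fill_defaults(derived_fp16=False, derived_bf16=True)   # bfloat16 checkpoint
print(flags)  # PrecisionFlags(fp16=False, bf16=False) -> the explicit False survives
```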
3 changes: 2 additions & 1 deletion swift/llm/argument/infer_args.py
@@ -152,7 +152,8 @@ def _init_stream(self):
     def _init_ddp(self):
         if not is_dist():
             return
-        assert not self.eval_human and not self.stream
+        assert not self.eval_human and not self.stream, (
+            f'args.eval_human: {self.eval_human}, args.stream: {self.stream}')
         self._init_device()
         init_process_group(self.ddp_backend)

2 changes: 2 additions & 0 deletions swift/llm/argument/train_args.py
@@ -23,6 +23,8 @@ class Seq2SeqTrainingOverrideArguments(TrainArgumentsMixin, Seq2SeqTrainingArgum
     output_dir: Optional[str] = None
     learning_rate: Optional[float] = None
     eval_strategy: Optional[str] = None  # steps, epoch
+    fp16: Optional[bool] = None
+    bf16: Optional[bool] = None

     def _init_output_dir(self):
         if self.output_dir is None:
2 changes: 1 addition & 1 deletion swift/llm/model/register.py
@@ -72,7 +72,7 @@ class ModelMeta:
     task_type: Optional[str] = None

     # File patterns to ignore when downloading the model.
-    ignore_patterns: List[str] = field(default_factory=list)
+    ignore_patterns: Optional[List[str]] = None
     # Usually specifies the version limits of transformers.
     requires: List[str] = field(default_factory=list)
     tags: List[str] = field(default_factory=list)
10 changes: 4 additions & 6 deletions swift/megatron/argument/megatron_args.py
@@ -105,8 +105,8 @@ class MegatronArguments(ExtraMegatronArguments):
     transformer_impl: Literal['local', 'transformer_engine'] = 'transformer_engine'

     # mixed precision
-    fp16: bool = False
-    bf16: bool = False
+    fp16: Optional[bool] = None
+    bf16: Optional[bool] = None
     apply_query_key_layer_scaling: Optional[bool] = None
     attention_softmax_in_fp32: bool = True

@@ -132,10 +132,8 @@ class MegatronArguments(ExtraMegatronArguments):
     no_create_attention_mask_in_dataloader: bool = True

     def _init_mixed_precision(self):
-        if self.torch_dtype == torch.bfloat16:
-            self.bf16 = True
-        elif self.torch_dtype == torch.float16:
-            self.fp16 = True
+        from swift.llm.argument.base_args.model_args import ModelArguments
+        ModelArguments._init_mixed_precision(self)
         if self.apply_query_key_layer_scaling is None:
             self.apply_query_key_layer_scaling = self.fp16

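Since apply_query_key_layer_scaling now follows the resolved fp16 flag, here is a toy sketch of what the option does; this is illustrative only and is not Megatron's fused kernel. The `Q @ K^T` logits get an extra `1 / layer_number` factor so that deep layers stay within the fp16 range, and the real kernels compensate for that factor inside an fp32 softmax.

```python
import torch


def scaled_attention_scores(q: torch.Tensor, k: torch.Tensor, layer_number: int,
                            apply_query_key_layer_scaling: bool = True) -> torch.Tensor:
    # Standard scaled dot-product attention uses 1/sqrt(head_dim); the option adds
    # an extra 1/layer_number factor to keep the raw logits small in deep networks.
    scale = 1.0 / (q.shape[-1] ** 0.5)
    if apply_query_key_layer_scaling:
        scale /= layer_number
    return (q @ k.transpose(-2, -1)) * scale


q = torch.randn(2, 8, 64)  # (batch, seq, head_dim); float32 here just for the demo
k = torch.randn(2, 8, 64)
print(scaled_attention_scores(q, k, layer_number=24).abs().max())
```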