Description
您好,我使用 swift 3.0.3 训练 Qwen2-VL 72B(多模态任务),担心 length 太长想用序列并行,但是报错,我的设置如下:
MODEL_P="72B"
TOKENIZERS_PARALLELISM=false NPROC_PER_NODE=${ARNOLD_WORKER_GPU} CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 MASTER_PORT=$METIS_WORKER_0_PORT MAX_PIXELS=602112 \
swift sft \
--model /opt/tiger/soweval/Chen_Li/creator_model/Qwen2VL/Qwen2-VL-$MODEL_P-Instruct \
--torch_dtype bfloat16 \
--num_train_epochs 10 \
--save_strategy steps \
--save_steps 5 \
--train_type full \
--logging_steps 1 \
--max_length 4000 \
--output_dir xxx/Qwen2VL/swift_train/output_$MODEL_P \
--dataset xxx/2025-01-22/init.jsonl \
--val_dataset xxx/2025-01-22/init.jsonl \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 64 \
--learning_rate 1e-4 \
--deepspeed zero3 \
--save_total_limit 3 \
--eval_steps 10 \
--freeze_vit true \
--warmup_ratio 0.05 \
--dataloader_num_workers 4 \
--sequence_parallel_size 2 \
--attn_impl flash_attn
报错如下:
Traceback (most recent call last):
File "/home/tiger/.local/lib/python3.9/site-packages/swift/cli/sft.py", line 5, in
sft_main()
File "/home/tiger/.local/lib/python3.9/site-packages/swift/llm/train/sft.py", line 249, in sft_main
sft_main()
File "/home/tiger/.local/lib/python3.9/site-packages/swift/llm/train/sft.py", line 249, in sft_main
return SwiftSft(args).main()
File "/home/tiger/.local/lib/python3.9/site-packages/swift/llm/base.py", line 46, in main
return SwiftSft(args).main()
File "/home/tiger/.local/lib/python3.9/site-packages/swift/llm/base.py", line 46, in main
result = self.run()
File "/home/tiger/.local/lib/python3.9/site-packages/swift/llm/train/sft.py", line 137, in run
result = self.run()
File "/home/tiger/.local/lib/python3.9/site-packages/swift/llm/train/sft.py", line 137, in run
return self.train(trainer)
File "/home/tiger/.local/lib/python3.9/site-packages/swift/llm/train/sft.py", line 189, in train
return self.train(trainer)
File "/home/tiger/.local/lib/python3.9/site-packages/swift/llm/train/sft.py", line 189, in train
trainer.train(trainer.args.resume_from_checkpoint)
File "/home/tiger/.local/lib/python3.9/site-packages/swift/trainers/mixin.py", line 261, in train
trainer.train(trainer.args.resume_from_checkpoint)
File "/home/tiger/.local/lib/python3.9/site-packages/swift/trainers/mixin.py", line 261, in train
res = super().train(*args, **kwargs)
File "/home/tiger/.local/lib/python3.9/site-packages/transformers/trainer.py", line 2164, in train
res = super().train(*args, **kwargs)
File "/home/tiger/.local/lib/python3.9/site-packages/transformers/trainer.py", line 2164, in train
return inner_training_loop(
File "/home/tiger/.local/lib/python3.9/site-packages/transformers/trainer.py", line 2524, in _inner_training_loop
return inner_training_loop(
File "/home/tiger/.local/lib/python3.9/site-packages/transformers/trainer.py", line 2524, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
File "/home/tiger/.local/lib/python3.9/site-packages/transformers/trainer.py", line 3654, in training_step
tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
File "/home/tiger/.local/lib/python3.9/site-packages/transformers/trainer.py", line 3654, in training_step
loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
File "/home/tiger/.local/lib/python3.9/site-packages/swift/trainers/trainers.py", line 144, in compute_loss
loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
File "/home/tiger/.local/lib/python3.9/site-packages/swift/trainers/trainers.py", line 144, in compute_loss
outputs = model(**inputs)
File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
outputs = model(**inputs)
File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/tiger/.local/lib/python3.9/site-packages/deepspeed/utils/nvtx.py", line 18, in wrapped_fn
return forward_call(*args, **kwargs)
File "/home/tiger/.local/lib/python3.9/site-packages/deepspeed/utils/nvtx.py", line 18, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/home/tiger/.local/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 1914, in forward
ret_val = func(*args, **kwargs)
File "/home/tiger/.local/lib/python3.9/site-packages/deepspeed/runtime/engine.py", line 1914, in forward
loss = self.module(*inputs, **kwargs)
File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
loss = self.module(*inputs, **kwargs)
File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py", line 1547, in _call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py", line 1547, in _call_impl
args_kwargs_result = hook(self, args, kwargs) # type: ignore[misc]
File "/home/tiger/.local/lib/python3.9/site-packages/swift/llm/template/base.py", line 779, in pre_forward_hook
args_kwargs_result = hook(self, args, kwargs) # type: ignore[misc]
File "/home/tiger/.local/lib/python3.9/site-packages/swift/llm/template/base.py", line 779, in pre_forward_hook
kwargs = to_device(self._post_encode(model, old_kwargs), model.device)
File "/home/tiger/.local/lib/python3.9/site-packages/swift/llm/template/template/qwen.py", line 310, in _post_encode
kwargs = to_device(self._post_encode(model, old_kwargs), model.device)
File "/home/tiger/.local/lib/python3.9/site-packages/swift/llm/template/template/qwen.py", line 310, in _post_encode
position_ids, _ = model.get_rope_index(input_ids, image_grid_thw, video_grid_thw, inputs['attention_mask'])
File "/home/tiger/.local/lib/python3.9/site-packages/transformers/models/qwen2_vl/modeling_qwen2_vl.py", line 1570, in get_rope_index
position_ids, _ = model.get_rope_index(input_ids, image_grid_thw, video_grid_thw, inputs['attention_mask'])
File "/home/tiger/.local/lib/python3.9/site-packages/transformers/models/qwen2_vl/modeling_qwen2_vl.py", line 1570, in get_rope_index
position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
RuntimeError: shape mismatch: value tensor of shape [3, 2515] cannot be broadcast to indexing result of shape [3, 2034]
position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
RuntimeError: shape mismatch: value tensor of shape [3, 2527] cannot be broadcast to indexing result of shape [3, 1974]