fix bugs #3893

Merged · 7 commits · Apr 16, 2025

2 changes: 1 addition & 1 deletion docs/source/Instruction/命令行参数.md
@@ -558,7 +558,7 @@ App arguments inherit from [deployment arguments](#部署参数), [Web-UI arguments](#Web-UI参数)
Model-specific arguments can be set via `--model_kwargs` or environment variables, for example: `--model_kwargs '{"fps_max_frames": 12}'` or `FPS_MAX_FRAMES=12`

### qwen2_vl, qvq, qwen2_5_vl
- The parameters have the same meaning as in the `qwen_vl_utils` or `qwen_omni_utils` library; see [here](https://github.com/QwenLM/Qwen2-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py#L24)
+ The parameters have the same meaning as in the `qwen_vl_utils` or `qwen_omni_utils` library; see [here](https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py#L24)

- IMAGE_FACTOR: defaults to 28
- MIN_PIXELS: defaults to `4 * 28 * 28`
2 changes: 1 addition & 1 deletion docs/source_en/Instruction/Command-line-parameters.md
@@ -570,7 +570,7 @@ Export Arguments include the [basic arguments](#base-arguments) and [merge argum
Specific model arguments can be set using `--model_kwargs` or environment variables, for example: `--model_kwargs '{"fps_max_frames": 12}'` or `FPS_MAX_FRAMES=12`.

### qwen2_vl, qvq, qwen2_5_vl
- The parameter meanings are the same as in the `qwen_vl_utils` or `qwen_omni_utils` library. You can refer to [here](https://github.com/QwenLM/Qwen2-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py#L24)
+ The parameter meanings are the same as in the `qwen_vl_utils` or `qwen_omni_utils` library. You can refer to [here](https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py#L24)

- IMAGE_FACTOR: Default is 28
- MIN_PIXELS: Default is `4 * 28 * 28`
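As a quick illustration of how these pixel-budget variables interact, here is a rough sketch; the real resizing logic lives in `qwen_vl_utils` (`vision_process.smart_resize`), so treat the helper below as an assumption-laden simplification rather than that implementation:

```python
# Illustrative sketch only: how MIN_PIXELS / MAX_PIXELS / IMAGE_FACTOR constrain an
# image. The constants mirror the documented defaults and the MAX_PIXELS=1003520
# value exported in the example scripts (1280 * 28 * 28); the resize helper is a
# simplified assumption, not the actual qwen_vl_utils code.
import math

IMAGE_FACTOR = 28            # image sides are kept at multiples of 28
MIN_PIXELS = 4 * 28 * 28     # 3,136
MAX_PIXELS = 1280 * 28 * 28  # 1,003,520


def fit_to_budget(height: int, width: int) -> tuple:
    h = max(IMAGE_FACTOR, round(height / IMAGE_FACTOR) * IMAGE_FACTOR)
    w = max(IMAGE_FACTOR, round(width / IMAGE_FACTOR) * IMAGE_FACTOR)
    if h * w > MAX_PIXELS:
        scale = math.sqrt(MAX_PIXELS / (h * w))
        h = math.floor(h * scale / IMAGE_FACTOR) * IMAGE_FACTOR
        w = math.floor(w * scale / IMAGE_FACTOR) * IMAGE_FACTOR
    elif h * w < MIN_PIXELS:
        scale = math.sqrt(MIN_PIXELS / (h * w))
        h = math.ceil(h * scale / IMAGE_FACTOR) * IMAGE_FACTOR
        w = math.ceil(w * scale / IMAGE_FACTOR) * IMAGE_FACTOR
    return h, w


print(fit_to_budget(2160, 3840))  # a 4K frame shrinks to (728, 1316) under this sketch
```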
2 changes: 1 addition & 1 deletion examples/train/grpo/multi_gpu_mp_colocate.sh
@@ -2,7 +2,7 @@ MAX_PIXELS=1003520 \
NPROC_PER_NODE=8 \
swift rlhf \
--rlhf_type grpo \
- --model Qwen/Qwen2-VL-7B-Instruct \
+ --model Qwen/Qwen2.5-VL-7B-Instruct \
--train_type lora \
--dataset AI-ModelScope/chartqa_digit_r1v_format \
--torch_dtype bfloat16 \
2 changes: 1 addition & 1 deletion examples/train/moe/llama4.sh
@@ -13,7 +13,7 @@ swift sft \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
- --target_regex '^(language_model)\..*\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$' \
+ --target_regex '^(language_model).*\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$' \
--freeze_vit true \
--gradient_accumulation_steps 4 \
--gradient_checkpointing true \
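The only change to `--target_regex` above is dropping the `\.` right after the `language_model` group. A small sketch of the behavioral difference, using hypothetical module names rather than the actual Llama4 module tree:

```python
# Sketch: the relaxed pattern also matches projections sitting directly under the
# prefix (a single dot between "language_model" and the projection name), which the
# stricter pattern rejected. Module names below are hypothetical examples.
import re

OLD = r'^(language_model)\..*\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$'
NEW = r'^(language_model).*\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$'

for name in [
        'language_model.model.layers.0.self_attn.q_proj',  # nested deeply: both match
        'language_model.q_proj',                            # one level deep: only NEW matches
        'vision_model.encoder.layers.0.q_proj',             # outside the prefix: neither matches
]:
    print(f'{name}: old={bool(re.fullmatch(OLD, name))} new={bool(re.fullmatch(NEW, name))}')
```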
2 changes: 1 addition & 1 deletion examples/train/multi-gpu/device_map/train.sh
@@ -2,7 +2,7 @@
CUDA_VISIBLE_DEVICES=0,1 \
MAX_PIXELS=1003520 \
swift sft \
- --model Qwen/Qwen2-VL-72B-Instruct \
+ --model Qwen/Qwen2.5-VL-72B-Instruct \
--dataset 'modelscope/coco_2014_caption:validation#20000' \
--train_type lora \
--torch_dtype bfloat16 \
4 changes: 2 additions & 2 deletions examples/train/multimodal/caption.sh
@@ -1,10 +1,10 @@
# 22GiB
- # You can refer to `https://github.com/QwenLM/Qwen2-VL` for the meaning of the `MAX_PIXELS` parameter.
+ # You can refer to `https://github.com/QwenLM/Qwen2.5-VL` for the meaning of the `MAX_PIXELS` parameter.
# 1003520 = 1280 * 28 * 28
CUDA_VISIBLE_DEVICES=0 \
MAX_PIXELS=1003520 \
swift sft \
- --model Qwen/Qwen2-VL-7B-Instruct \
+ --model Qwen/Qwen2.5-VL-7B-Instruct \
--dataset 'modelscope/coco_2014_caption:validation#20000' \
--train_type lora \
--torch_dtype bfloat16 \
2 changes: 1 addition & 1 deletion examples/train/multimodal/grounding.sh
@@ -1,5 +1,5 @@
# 20GiB
- # You can refer to `https://github.com/QwenLM/Qwen2-VL` for the meaning of the `MAX_PIXELS` parameter.
+ # You can refer to `https://github.com/QwenLM/Qwen2.5-VL` for the meaning of the `MAX_PIXELS` parameter.
CUDA_VISIBLE_DEVICES=0 \
MAX_PIXELS=1003520 \
swift sft \
@@ -40,7 +40,7 @@ def save_pretrained(

    @staticmethod
    def prepare_model(args: 'TrainArguments', model: torch.nn.Module) -> torch.nn.Module:
-         target_regex = r'^model.layers.*'
+         target_regex = r'^model.*\.(o_proj|down_proj|gate_proj|v_proj|q_proj|k_proj|up_proj)$'
        lora_config = LoraConfig(
            task_type='CAUSAL_LM', r=args.lora_rank, lora_alpha=args.lora_alpha, target_modules=target_regex)
        model = Swift.prepare_model(model, lora_config)
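For this custom plugin, the rewritten `target_regex` narrows LoRA injection from everything under `model.layers` to the named linear projections only. A quick sketch with hypothetical module names (string `target_modules` patterns are matched with `re.fullmatch`):

```python
# Sketch: the old regex selects every submodule under model.layers (including norm
# layers), while the new one selects only the listed projections. Names below are
# hypothetical examples, not read from a real checkpoint.
import re

OLD = r'^model.layers.*'   # note: no trailing anchor, and the dots match any character
NEW = r'^model.*\.(o_proj|down_proj|gate_proj|v_proj|q_proj|k_proj|up_proj)$'

for name in [
        'model.layers.0.self_attn.q_proj',  # both match
        'model.layers.0.input_layernorm',    # only OLD matches
        'model.embed_tokens',                # neither matches
]:
    print(f'{name}: old={bool(re.fullmatch(OLD, name))} new={bool(re.fullmatch(NEW, name))}')
```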
2 changes: 1 addition & 1 deletion examples/train/multimodal/lora_llm_full_vit/sft.sh
@@ -8,7 +8,7 @@ swift sft \
--dataset 'AI-ModelScope/coco#20000' \
--train_type custom \
--optimizer custom \
- --external_plugins 'examples/train/multimodal/custom_tuner/custom_plugin.py' \
+ --external_plugins 'examples/train/multimodal/lora_llm_full_vit/custom_plugin.py' \
--torch_dtype bfloat16 \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
2 changes: 1 addition & 1 deletion examples/train/multimodal/ocr.sh
@@ -2,7 +2,7 @@
CUDA_VISIBLE_DEVICES=0 \
MAX_PIXELS=1003520 \
swift sft \
- --model Qwen/Qwen2-VL-7B-Instruct \
+ --model Qwen/Qwen2.5-VL-7B-Instruct \
--dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#20000' \
--train_type lora \
--torch_dtype bfloat16 \
4 changes: 2 additions & 2 deletions examples/train/multimodal/rlhf/dpo.sh
@@ -1,5 +1,5 @@
# 4*50GiB
- # You can refer to `https://github.com/QwenLM/Qwen2-VL` for the meaning of the `MAX_PIXELS` parameter.
+ # You can refer to `https://github.com/QwenLM/Qwen2.5-VL` for the meaning of the `MAX_PIXELS` parameter.
# --rlhf_type cpo/orpo/simpo/rm are also supported
nproc_per_node=2

@@ -8,7 +8,7 @@ NPROC_PER_NODE=$nproc_per_node \
MAX_PIXELS=1003520 \
swift rlhf \
--rlhf_type dpo \
- --model Qwen/Qwen2-VL-7B-Instruct \
+ --model Qwen/Qwen2.5-VL-7B-Instruct \
--dataset 'swift/RLAIF-V-Dataset#20000' \
--train_type lora \
--torch_dtype bfloat16 \
2 changes: 1 addition & 1 deletion examples/train/multimodal/rlhf/kto.sh
@@ -7,7 +7,7 @@ NPROC_PER_NODE=$nproc_per_node \
MAX_PIXELS=1003520 \
swift rlhf \
--rlhf_type kto \
- --model Qwen/Qwen2-VL-7B-Instruct \
+ --model Qwen/Qwen2.5-VL-7B-Instruct \
--dataset 'AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto#10000' \
--train_type lora \
--torch_dtype bfloat16 \
2 changes: 1 addition & 1 deletion examples/train/multimodal/video.sh
@@ -1,5 +1,5 @@
# 4*80GB
- # You can refer to `https://github.com/QwenLM/Qwen2-VL` for the meaning of the `VIDEO_MAX_PIXELS` parameter.
+ # You can refer to `https://github.com/QwenLM/Qwen2.5-VL` for the meaning of the `VIDEO_MAX_PIXELS` parameter.
nproc_per_node=4

CUDA_VISIBLE_DEVICES=0,1,2,3 \
2 changes: 1 addition & 1 deletion examples/train/qlora/gptq.sh
@@ -2,7 +2,7 @@
CUDA_VISIBLE_DEVICES=0,1 \
MAX_PIXELS=1003520 \
swift sft \
- --model Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4 \
+ --model Qwen/Qwen2.5-VL-72B-Instruct-GPTQ-Int4 \
--dataset 'modelscope/coco_2014_caption:validation#20000' \
--train_type lora \
--torch_dtype bfloat16 \
2 changes: 1 addition & 1 deletion examples/train/seq_cls/qwen2_vl/sft.sh
@@ -1,5 +1,5 @@
# If `num_labels` is provided, it will be considered a classification task.
- # You can also specify `--model Qwen/Qwen2-VL-2B-Instruct --use_chat_template true`.
+ # You can also specify `--model Qwen/Qwen2.5-VL-2B-Instruct --use_chat_template true`.
CUDA_VISIBLE_DEVICES=0 \
MAX_PIXELS=1003520 \
swift sft \
11 changes: 10 additions & 1 deletion swift/llm/model/model/moonshot.py
@@ -2,6 +2,7 @@
from swift.llm import TemplateType
from ..constant import LLMModelType, MLLMModelType
from ..model_arch import ModelArch
+ from ..patcher import patch_output_clone
from ..register import (Model, ModelGroup, ModelMeta, get_model_tokenizer_multimodal,
get_model_tokenizer_with_flash_attn, register_model)

@@ -21,6 +22,14 @@
requires=['transformers<4.49'],
))


+ def get_model_tokenizer_kimi_vl(*args, **kwargs):
+     model, processor = get_model_tokenizer_multimodal(*args, **kwargs)
+     if model is not None:
+         patch_output_clone(model.language_model.model.embed_tokens)
+     return model, processor

register_model(
ModelMeta(
MLLMModelType.kimi_vl,
@@ -31,7 +40,7 @@
])
],
TemplateType.kimi_vl,
- get_model_tokenizer_multimodal,
+ get_model_tokenizer_kimi_vl,
architectures=['KimiVLForConditionalGeneration'],
model_arch=ModelArch.llava_hf,
requires=['transformers<4.49'],
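`patch_output_clone` is imported from swift's `patcher` module and applied to the language model's `embed_tokens`. As a rough illustration of the general idea only (an assumption about what "clone the output" means, not the actual swift implementation), the effect can be approximated with a forward hook:

```python
# Minimal sketch (assumption): force a module to return a clone of its output so that
# later in-place operations cannot modify the tensor it produced. This is not the
# swift implementation, just an illustration of the pattern.
import torch
from torch import nn


def patch_output_clone_sketch(module: nn.Module) -> None:
    def _clone_output(_module, _inputs, output):
        # Returning a value from a forward hook replaces the module's output.
        return output.clone() if isinstance(output, torch.Tensor) else output

    module.register_forward_hook(_clone_output)


# Hypothetical usage mirroring the diff above:
# patch_output_clone_sketch(model.language_model.model.embed_tokens)
```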
2 changes: 1 addition & 1 deletion swift/llm/train/tuner.py
@@ -75,7 +75,7 @@ def get_multimodal_target_regex(
    target_modules = []
    for module in modules:
        target_modules += find_all_linears(model, model_arch, extra_layers, sub_module=module)
-     target_regex = rf'^({prefix_pattern})\..*\.({"|".join(target_modules)})$'
+     target_regex = rf'^({prefix_pattern}).*\.({"|".join(target_modules)})$'
    if rejected_pattern:
        target_regex = rf'(?!^({rejected_pattern}))' + target_regex
    return target_regex
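This is the same relaxation as in the Llama4 script above, now in the helper that builds the regex. A short sketch of how the constructed pattern changes; `prefix_pattern` and the module list are made-up placeholders, not values swift computes:

```python
# Sketch: rebuild both regex variants from placeholder inputs and test them against a
# module that sits directly under the prefix. Inputs are illustrative, not swift's.
import re

prefix_pattern = r'model\.layers\.\d+\.self_attn'
target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj']

old_regex = rf'^({prefix_pattern})\..*\.({"|".join(target_modules)})$'
new_regex = rf'^({prefix_pattern}).*\.({"|".join(target_modules)})$'

name = 'model.layers.0.self_attn.q_proj'
print(bool(re.fullmatch(old_regex, name)))  # False: the old form requires two dots after the prefix
print(bool(re.fullmatch(new_regex, name)))  # True: one dot before the projection name is enough
```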
3 changes: 2 additions & 1 deletion swift/tuners/lora_layers.py
@@ -21,7 +21,8 @@
from peft.utils import _get_submodules, get_quantization_config
from transformers import Conv1D

- from swift import LoraConfig, get_logger
+ from swift.utils import get_logger
+ from .peft import LoraConfig
from .utils import ActivationMixin, ModulesToSaveWrapper, SwiftAdapter

logger = get_logger()
2 changes: 1 addition & 1 deletion swift/tuners/part.py
@@ -8,7 +8,7 @@
import torch
from torch import nn

- from swift import get_logger
+ from swift.utils import get_logger
from .utils import ActivationMixin, SwiftAdapter, SwiftConfig, SwiftOutput

logger = get_logger()
2 changes: 1 addition & 1 deletion swift/tuners/peft.py
@@ -23,7 +23,7 @@
from peft.tuners.lora import Embedding
from transformers import Trainer

- from swift import get_logger
+ from swift.utils import get_logger

try:
    from peft import FourierFTModel
2 changes: 1 addition & 1 deletion swift/tuners/prompt.py
@@ -8,7 +8,7 @@
import torch
from torch import nn

- from swift import get_logger
+ from swift.utils import get_logger
from swift.utils.torch_utils import find_sub_module
from .utils import ActivationMixin, SwiftAdapter, SwiftConfig, SwiftOutput

2 changes: 1 addition & 1 deletion swift/tuners/restuning.py
@@ -8,7 +8,7 @@
import torch
import torch.nn as nn

- from swift import get_logger
+ from swift.utils import get_logger
from swift.utils.torch_utils import find_sub_module
from .restuning_components import ResTuner, detach_tensors, probe_input_pre_hook, probe_output_hook
from .utils import ActivationMixin, SwiftAdapter, SwiftConfig, SwiftOutput
2 changes: 1 addition & 1 deletion swift/tuners/scetuning/scetuning.py
@@ -7,8 +7,8 @@
import torch
from torch import nn

- from swift import get_logger
from swift.tuners.utils import ActivationMixin, SwiftAdapter, SwiftConfig, SwiftOutput
+ from swift.utils import get_logger
from swift.utils.torch_utils import find_sub_module
from .scetuning_components import probe_output_hook
