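Fix Megatron MoE weight conversion and update supported-model tables

- hf2mcore: load shared-expert weights from `hf_mlp.shared_expert` instead of copying `mg_mlp.shared_experts` onto itself.
- config: set `moe_router_pre_softmax` to the negation of the corresponding HF config value, like `untie_embeddings_and_output_weights` and `disable_bias_linear`.
- docs (zh/en): flip the flag column from ✘ to ✔ for Qwen1.5-MoE-A2.7B(-Chat) and Qwen2-57B-A14B(-Instruct); the GPTQ-Int4 variants stay ✘.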

modelscope · Jintao-Huang · Apr 28, 2025 · Apr 27, 2025 · Apr 27, 2025 · Apr 27, 2025
commit b8bb97b891b253694e59d03577e2d93beaeb60a2
diff --git a/docs/source/Instruction/支持的模型和数据集.md b/docs/source/Instruction/支持的模型和数据集.md
@@ -173,11 +173,11 @@
 |[Qwen/Qwen2.5-Math-1.5B](https://modelscope.cn/models/Qwen/Qwen2.5-Math-1.5B)|qwen2_5_math|qwen2_5_math|transformers>=4.37|&#x2714;|math|[Qwen/Qwen2.5-Math-1.5B](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B)|
 |[Qwen/Qwen2.5-Math-7B](https://modelscope.cn/models/Qwen/Qwen2.5-Math-7B)|qwen2_5_math|qwen2_5_math|transformers>=4.37|&#x2714;|math|[Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B)|
 |[Qwen/Qwen2.5-Math-72B](https://modelscope.cn/models/Qwen/Qwen2.5-Math-72B)|qwen2_5_math|qwen2_5_math|transformers>=4.37|&#x2714;|math|[Qwen/Qwen2.5-Math-72B](https://huggingface.co/Qwen/Qwen2.5-Math-72B)|
-|[Qwen/Qwen1.5-MoE-A2.7B-Chat](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B-Chat)|qwen2_moe|qwen|transformers>=4.40|&#x2718;|-|[Qwen/Qwen1.5-MoE-A2.7B-Chat](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat)|
-|[Qwen/Qwen1.5-MoE-A2.7B](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B)|qwen2_moe|qwen|transformers>=4.40|&#x2718;|-|[Qwen/Qwen1.5-MoE-A2.7B](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B)|
+|[Qwen/Qwen1.5-MoE-A2.7B-Chat](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B-Chat)|qwen2_moe|qwen|transformers>=4.40|&#x2714;|-|[Qwen/Qwen1.5-MoE-A2.7B-Chat](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat)|
+|[Qwen/Qwen1.5-MoE-A2.7B](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B)|qwen2_moe|qwen|transformers>=4.40|&#x2714;|-|[Qwen/Qwen1.5-MoE-A2.7B](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B)|
 |[Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4)|qwen2_moe|qwen|transformers>=4.40|&#x2718;|-|[Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4)|
-|[Qwen/Qwen2-57B-A14B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-57B-A14B-Instruct)|qwen2_moe|qwen|transformers>=4.40|&#x2718;|-|[Qwen/Qwen2-57B-A14B-Instruct](https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct)|
-|[Qwen/Qwen2-57B-A14B](https://modelscope.cn/models/Qwen/Qwen2-57B-A14B)|qwen2_moe|qwen|transformers>=4.40|&#x2718;|-|[Qwen/Qwen2-57B-A14B](https://huggingface.co/Qwen/Qwen2-57B-A14B)|
+|[Qwen/Qwen2-57B-A14B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-57B-A14B-Instruct)|qwen2_moe|qwen|transformers>=4.40|&#x2714;|-|[Qwen/Qwen2-57B-A14B-Instruct](https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct)|
+|[Qwen/Qwen2-57B-A14B](https://modelscope.cn/models/Qwen/Qwen2-57B-A14B)|qwen2_moe|qwen|transformers>=4.40|&#x2714;|-|[Qwen/Qwen2-57B-A14B](https://huggingface.co/Qwen/Qwen2-57B-A14B)|
 |[Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4)|qwen2_moe|qwen|transformers>=4.40|&#x2718;|-|[Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4)|
 |[Qwen/QwQ-32B-Preview](https://modelscope.cn/models/Qwen/QwQ-32B-Preview)|qwq_preview|qwq_preview|transformers>=4.37|&#x2714;|-|[Qwen/QwQ-32B-Preview](https://huggingface.co/Qwen/QwQ-32B-Preview)|
 |[Qwen/QwQ-32B](https://modelscope.cn/models/Qwen/QwQ-32B)|qwq|qwq|transformers>=4.37|&#x2714;|-|[Qwen/QwQ-32B](https://huggingface.co/Qwen/QwQ-32B)|

diff --git a/docs/source_en/Instruction/Supported-models-and-datasets.md b/docs/source_en/Instruction/Supported-models-and-datasets.md
@@ -173,11 +173,11 @@ The table below introduces the models integrated with ms-swift:
 |[Qwen/Qwen2.5-Math-1.5B](https://modelscope.cn/models/Qwen/Qwen2.5-Math-1.5B)|qwen2_5_math|qwen2_5_math|transformers>=4.37|&#x2714;|math|[Qwen/Qwen2.5-Math-1.5B](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B)|
 |[Qwen/Qwen2.5-Math-7B](https://modelscope.cn/models/Qwen/Qwen2.5-Math-7B)|qwen2_5_math|qwen2_5_math|transformers>=4.37|&#x2714;|math|[Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B)|
 |[Qwen/Qwen2.5-Math-72B](https://modelscope.cn/models/Qwen/Qwen2.5-Math-72B)|qwen2_5_math|qwen2_5_math|transformers>=4.37|&#x2714;|math|[Qwen/Qwen2.5-Math-72B](https://huggingface.co/Qwen/Qwen2.5-Math-72B)|
-|[Qwen/Qwen1.5-MoE-A2.7B-Chat](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B-Chat)|qwen2_moe|qwen|transformers>=4.40|&#x2718;|-|[Qwen/Qwen1.5-MoE-A2.7B-Chat](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat)|
-|[Qwen/Qwen1.5-MoE-A2.7B](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B)|qwen2_moe|qwen|transformers>=4.40|&#x2718;|-|[Qwen/Qwen1.5-MoE-A2.7B](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B)|
+|[Qwen/Qwen1.5-MoE-A2.7B-Chat](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B-Chat)|qwen2_moe|qwen|transformers>=4.40|&#x2714;|-|[Qwen/Qwen1.5-MoE-A2.7B-Chat](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat)|
+|[Qwen/Qwen1.5-MoE-A2.7B](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B)|qwen2_moe|qwen|transformers>=4.40|&#x2714;|-|[Qwen/Qwen1.5-MoE-A2.7B](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B)|
 |[Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4)|qwen2_moe|qwen|transformers>=4.40|&#x2718;|-|[Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4)|
-|[Qwen/Qwen2-57B-A14B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-57B-A14B-Instruct)|qwen2_moe|qwen|transformers>=4.40|&#x2718;|-|[Qwen/Qwen2-57B-A14B-Instruct](https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct)|
-|[Qwen/Qwen2-57B-A14B](https://modelscope.cn/models/Qwen/Qwen2-57B-A14B)|qwen2_moe|qwen|transformers>=4.40|&#x2718;|-|[Qwen/Qwen2-57B-A14B](https://huggingface.co/Qwen/Qwen2-57B-A14B)|
+|[Qwen/Qwen2-57B-A14B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-57B-A14B-Instruct)|qwen2_moe|qwen|transformers>=4.40|&#x2714;|-|[Qwen/Qwen2-57B-A14B-Instruct](https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct)|
+|[Qwen/Qwen2-57B-A14B](https://modelscope.cn/models/Qwen/Qwen2-57B-A14B)|qwen2_moe|qwen|transformers>=4.40|&#x2714;|-|[Qwen/Qwen2-57B-A14B](https://huggingface.co/Qwen/Qwen2-57B-A14B)|
 |[Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4](https://modelscope.cn/models/Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4)|qwen2_moe|qwen|transformers>=4.40|&#x2718;|-|[Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4)|
 |[Qwen/QwQ-32B-Preview](https://modelscope.cn/models/Qwen/QwQ-32B-Preview)|qwq_preview|qwq_preview|transformers>=4.37|&#x2714;|-|[Qwen/QwQ-32B-Preview](https://huggingface.co/Qwen/QwQ-32B-Preview)|
 |[Qwen/QwQ-32B](https://modelscope.cn/models/Qwen/QwQ-32B)|qwq|qwq|transformers>=4.37|&#x2714;|-|[Qwen/QwQ-32B](https://huggingface.co/Qwen/QwQ-32B)|

diff --git a/swift/megatron/model/config.py b/swift/megatron/model/config.py
@@ -39,7 +39,7 @@ def convert_hf_config(config) -> Dict[str, Any]:
                 hf_v = getattr(config, hf_k)
                 if k == 'rotary_base':
                     megatron_config[k] = int(hf_v)
-                elif k in {'untie_embeddings_and_output_weights', 'disable_bias_linear'}:
+                elif k in {'untie_embeddings_and_output_weights', 'disable_bias_linear', 'moe_router_pre_softmax'}:
                     megatron_config[k] = not hf_v
                 elif k == 'swiglu':
                     if hf_v == 'silu':

diff --git a/swift/megatron/model/gpt/hf2mcore.py b/swift/megatron/model/gpt/hf2mcore.py
@@ -44,7 +44,7 @@ def set_mlp_state(args, mg_mlp, hf_mlp):
             _set_mlp_state(mg_mlp.experts.local_experts[expert_idx], hf_mlp.experts[expert_idx])
 
         if mg_mlp.shared_experts is not None:
-            _set_mlp_state(mg_mlp.shared_experts, mg_mlp.shared_experts)
+            _set_mlp_state(mg_mlp.shared_experts, hf_mlp.shared_expert)
     else:
         _set_mlp_state(mg_mlp, hf_mlp)