PaddlePaddle · phlrain · Jul 4, 2025 · Jul 4, 2025
diff --git a/paddlenlp/transformers/deepseek_v2/configuration.py b/paddlenlp/transformers/deepseek_v2/configuration.py
@@ -182,7 +182,6 @@ def __init__(
         use_dualpipev=False,
         send_mtp_embed=False,
         recompute_fwd_gate_up=False,
-        dequant_input=False,
         is_split_group_gemm=False,
         **kwargs,
     ):
@@ -235,10 +234,8 @@ def __init__(
         self.use_dualpipev = use_dualpipev
         self.send_mtp_embed = send_mtp_embed
         self.recompute_fwd_gate_up = recompute_fwd_gate_up
-        self.dequant_input = dequant_input
         self.is_split_group_gemm = is_split_group_gemm
 
-
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,

diff --git a/paddlenlp/transformers/deepseek_v2/modeling_pp.py b/paddlenlp/transformers/deepseek_v2/modeling_pp.py
@@ -65,6 +65,7 @@
 
 
 DSV3_USE_FP8_GEMM = os.getenv("DSV3_USE_FP8_GEMM", "False").lower() == "true"
+DSV3_USE_FP8_DISPATCH = os.getenv("DSV3_USE_FP8_DISPATCH", "False").lower() == "true"
 
 
 def parse_args(args):
@@ -510,7 +511,7 @@ def dispatch_forward(self, inputs, previous_event=None, async_finish=False, allo
                 token_probs,
             ) = inputs
 
-        (hs_fp16_dispatched, dispatched_indices, dispatched_probs,) = self.fp8_fusion_moe_node.dispatch_node.forward(
+        (hs_dispatched, dispatched_indices, dispatched_probs,) = self.fp8_fusion_moe_node.dispatch_node.forward(
             hs_2d,
             token_indices,
             token_probs,
@@ -524,7 +525,7 @@ def dispatch_forward(self, inputs, previous_event=None, async_finish=False, allo
                 hidden_states,
                 residual,
                 l_aux,
-                hs_fp16_dispatched,
+                hs_dispatched,
                 dispatched_indices,
                 dispatched_probs,
             )
@@ -533,7 +534,7 @@ def dispatch_forward(self, inputs, previous_event=None, async_finish=False, allo
                 hidden_states,
                 residual,
                 l_aux,
-                hs_fp16_dispatched,
+                hs_dispatched,
                 dispatched_indices,
                 dispatched_probs,
             )
@@ -545,7 +546,7 @@ def mlp_forward(self, inputs):
                 hidden_states,
                 residual,
                 l_aux,
-                hs_fp16_dispatched,
+                hs_dispatched,
                 dispatched_indices,
                 dispatched_probs,
             ) = inputs
@@ -554,12 +555,12 @@ def mlp_forward(self, inputs):
                 hidden_states,
                 residual,
                 l_aux,
-                hs_fp16_dispatched,
+                hs_dispatched,
                 dispatched_indices,
                 dispatched_probs,
             ) = inputs
         hidden_states_out = self.fp8_fusion_moe_node.mlp_node.forward(
-            hs_fp16_dispatched, dispatched_indices, dispatched_probs
+            hs_dispatched, dispatched_indices, dispatched_probs
         )
 
         if self.send_mtp_embed:
@@ -573,18 +574,18 @@ def combine_forward(self, inputs, async_finish=False):
         else:
             (hidden_states, residual, l_aux, hidden_states_out) = inputs
 
-        output_combie = self.fp8_fusion_moe_node.combine_node.forward(hidden_states_out, async_finish=async_finish)
+        output_combine = self.fp8_fusion_moe_node.combine_node.forward(hidden_states_out, async_finish=async_finish)
         if self.send_mtp_embed:
-            return (inputs_embeds_mtp, hidden_states, residual, l_aux, output_combie)
+            return (inputs_embeds_mtp, hidden_states, residual, l_aux, output_combine)
         else:
-            return (hidden_states, residual, l_aux, output_combie)
+            return (hidden_states, residual, l_aux, output_combine)
 
     def post_process_forward(self, inputs):
         if self.send_mtp_embed:
-            (inputs_embeds_mtp, hidden_states, residual, l_aux, output_combie) = inputs
+            (inputs_embeds_mtp, hidden_states, residual, l_aux, output_combine) = inputs
         else:
-            (hidden_states, residual, l_aux, output_combie) = inputs
-        final_hidden_states = self.fp8_fusion_moe_node.combine_quant_node.forward(output_combie)
+            (hidden_states, residual, l_aux, output_combine) = inputs
+        final_hidden_states = self.fp8_fusion_moe_node.combine_quant_node.forward(output_combine)
         if self.send_mtp_embed:
             inputs = (inputs_embeds_mtp, hidden_states, residual, l_aux, final_hidden_states)
         else:
@@ -609,21 +610,21 @@ def post_process_backward(self, output_grad):
                 l_aux_grad,
                 final_hidden_states_grad,
             ) = self.post_process_node.backward(output_grad)
-        output_combie_grad = self.fp8_fusion_moe_node.combine_quant_node.backward(final_hidden_states_grad)
+        output_combine_grad = self.fp8_fusion_moe_node.combine_quant_node.backward(final_hidden_states_grad)
         if self.send_mtp_embed:
             return (
                 inputs_embeds_mtp_grad,
                 hidden_states_grad,
                 residual_grad,
                 l_aux_grad,
-                output_combie_grad,
+                output_combine_grad,
             )
         else:
             return (
                 hidden_states_grad,
                 residual_grad,
                 l_aux_grad,
-                output_combie_grad,
+                output_combine_grad,
             )
 
     def combine_backward(self, output_grad, async_finish=False):
@@ -633,18 +634,18 @@ def combine_backward(self, output_grad, async_finish=False):
                 hidden_states_grad,
                 residual_grad,
                 l_aux_grad,
-                output_combie_grad_bf16,
+                output_combine_grad,
             ) = output_grad
         else:
             (
                 hidden_states_grad,
                 residual_grad,
                 l_aux_grad,
-                output_combie_grad_bf16,
+                output_combine_grad,
             ) = output_grad
 
-        hidden_states_out_grad_bf16 = self.fp8_fusion_moe_node.combine_node.backward(
-            output_combie_grad_bf16,
+        hidden_states_out_grad = self.fp8_fusion_moe_node.combine_node.backward(
+            output_combine_grad,
             async_finish=async_finish,
         )
 
@@ -654,14 +655,14 @@ def combine_backward(self, output_grad, async_finish=False):
                 hidden_states_grad,
                 residual_grad,
                 l_aux_grad,
-                hidden_states_out_grad_bf16,
+                hidden_states_out_grad,
             )
         else:
             return (
                 hidden_states_grad,
                 residual_grad,
                 l_aux_grad,
-                hidden_states_out_grad_bf16,
+                hidden_states_out_grad,
             )
 
     def mlp_backward(self, output_grad):
@@ -680,25 +681,23 @@ def mlp_backward(self, output_grad):
                 l_aux_grad,
                 hidden_states_out_grad,
             ) = output_grad
-        hs_fp16_dispatched_grad, dispatched_probs_grad = self.fp8_fusion_moe_node.mlp_node.backward(
-            hidden_states_out_grad
-        )
+        hs_dispatched_grad, dispatched_probs_grad = self.fp8_fusion_moe_node.mlp_node.backward(hidden_states_out_grad)
 
         if self.send_mtp_embed:
             return (
                 inputs_embeds_mtp_grad,
                 hidden_states_grad,
                 residual_grad,
                 l_aux_grad,
-                hs_fp16_dispatched_grad,
+                hs_dispatched_grad,
                 dispatched_probs_grad,
             )
         else:
             return (
                 hidden_states_grad,
                 residual_grad,
                 l_aux_grad,
-                hs_fp16_dispatched_grad,
+                hs_dispatched_grad,
                 dispatched_probs_grad,
             )
 
@@ -709,19 +708,19 @@ def dispatch_backward(self, output_grad, async_finish=False):
                 hidden_states_grad,
                 residual_grad,
                 l_aux_grad,
-                hs_bf16_dispatched_grad,
+                hs_dispatched_grad,
                 dispatched_probs_grad,
             ) = output_grad
         else:
             (
                 hidden_states_grad,
                 residual_grad,
                 l_aux_grad,
-                hs_bf16_dispatched_grad,
+                hs_dispatched_grad,
                 dispatched_probs_grad,
             ) = output_grad
-        hs_bf16_grad, token_probs_grad = self.fp8_fusion_moe_node.dispatch_node.backward(
-            hs_bf16_dispatched_grad, dispatched_probs_grad, async_finish=async_finish
+        hs_grad, token_probs_grad = self.fp8_fusion_moe_node.dispatch_node.backward(
+            hs_dispatched_grad, dispatched_probs_grad, async_finish=async_finish
         )
 
         if self.send_mtp_embed:
@@ -730,11 +729,11 @@ def dispatch_backward(self, output_grad, async_finish=False):
                 hidden_states_grad,
                 residual_grad,
                 l_aux_grad,
-                hs_bf16_grad,
+                hs_grad,
                 token_probs_grad,
             )
         else:
-            return (hidden_states_grad, residual_grad, l_aux_grad, hs_bf16_grad, token_probs_grad)
+            return (hidden_states_grad, residual_grad, l_aux_grad, hs_grad, token_probs_grad)
 
     def attn_backward(self, output_grad):
         if self.send_mtp_embed:
@@ -743,19 +742,19 @@ def attn_backward(self, output_grad):
                 hidden_states_grad,
                 residual_grad,
                 l_aux_grad,
-                hs_bf16_grad,
+                hs_grad,
                 token_probs_grad,
             ) = output_grad
         else:
             (
                 hidden_states_grad,
                 residual_grad,
                 l_aux_grad,
-                hs_bf16_grad,
+                hs_grad,
                 token_probs_grad,
             ) = output_grad
         hidden_states_grad_, probs_grad, routing_map_grad = self.fp8_fusion_moe_node.dispatch_quant_node.backward(
-            hs_bf16_grad, token_probs_grad
+            hs_grad, token_probs_grad
         )
 
         if self.send_mtp_embed:
@@ -1234,7 +1233,6 @@ def build_schedule_node(self):
                     fp8_fusion_moe_node = FusionMoeNode(
                         self.mlp,
                         recompute_fwd_gate_up=self.config.recompute_fwd_gate_up,
-                        dequant_input=self.config.dequant_input,
                         is_split_group_gemm=self.config.is_split_group_gemm,
                         name="fp8_fusion_moe_node",
                     )