Skip to content

Commit f12b7a7

Browse files
authored
support_lastnorm_gather_split_r2.4 (#5925)
* support_lastnorm_gather_split_r2.4 * support_lastnorm_gather_split_r2.4v1 * support_lastnorm_gather_split_r2.4v2
1 parent 741a015 commit f12b7a7

9 files changed

Lines changed: 31 additions & 8 deletions (across all 9 files)

File tree

fastdeploy/model_executor/layers/normalization.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,14 +105,14 @@ def __init__(
105105
self.tp_rank = self.fd_config.parallel_config.tensor_parallel_rank
106106
self.tp_group = self.fd_config.parallel_config.tp_group
107107
is_input_norm = prefix.endswith(".input_layernorm")
108-
is_last_norm = prefix.endswith(".norm")
108+
self.is_last_norm = prefix.endswith(".norm")
109109
self.split_x = (
110110
self.fd_config.parallel_config.use_sequence_parallel_moe
111111
and self.layer_id == self.fd_config.model_config.moe_layer_start_index
112112
and is_input_norm
113113
)
114114
self.allgather_out = self.fd_config.parallel_config.use_sequence_parallel_moe and (
115-
(self.layer_id > self.fd_config.model_config.moe_layer_start_index and is_input_norm) or is_last_norm
115+
(self.layer_id > self.fd_config.model_config.moe_layer_start_index and is_input_norm)
116116
)
117117

118118
self.init_weight()

fastdeploy/model_executor/models/deepseek_v3.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -594,6 +594,9 @@ def forward(
594594
)
595595
out = self.norm(hidden_states, residual, forward_meta=forward_meta)[0]
596596

597+
if self.norm.is_last_norm and self.norm.fd_config.parallel_config.use_sequence_parallel_moe:
598+
out = self.norm.allgather(out, forward_meta.ids_remove_padding.shape[0])
599+
597600
return out
598601

599602

fastdeploy/model_executor/models/ernie4_5_moe.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -459,6 +459,9 @@ def forward(
459459

460460
out = self.norm(hidden_states, residual, forward_meta=forward_meta)[0]
461461

462+
if self.norm.is_last_norm and self.norm.fd_config.parallel_config.use_sequence_parallel_moe:
463+
out = self.norm.allgather(out, forward_meta.ids_remove_padding.shape[0])
464+
462465
if current_platform.is_iluvatar() and forward_meta.attn_backend.mixed:
463466
out = forward_meta.attn_backend.reverse_transpose(out)
464467

fastdeploy/model_executor/models/ernie4_5_mtp.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -325,7 +325,10 @@ def forward(
325325
for i in range(self.num_layers):
326326
hidden_states, residual = self.mtp_block[i](forward_meta, hidden_states, residual)
327327

328-
hidden_states = self.norm(hidden_states, residual)[0]
328+
hidden_states = self.norm(hidden_states, residual, forward_meta=forward_meta)[0]
329+
330+
if self.norm.is_last_norm and self.norm.fd_config.parallel_config.use_sequence_parallel_moe:
331+
hidden_states = self.norm.allgather(hidden_states, forward_meta.ids_remove_padding.shape[0])
329332

330333
return hidden_states
331334

@@ -396,7 +399,7 @@ def load_weights(self, weights_iterator) -> None:
396399
),
397400
)
398401

399-
def compute_logits(self, hidden_states: paddle.Tensor):
402+
def compute_logits(self, hidden_states: paddle.Tensor, forward_meta: ForwardMeta):
400403
"""
401404
compute logits
402405
"""

fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -548,6 +548,10 @@ def forward(
548548
)
549549

550550
out = self.norm(hidden_states, residual, forward_meta=forward_meta)[0]
551+
552+
if self.norm.is_last_norm and self.norm.fd_config.parallel_config.use_sequence_parallel_moe:
553+
out = self.norm.allgather(out, forward_meta.ids_remove_padding.shape[0])
554+
551555
return out
552556

553557

fastdeploy/model_executor/models/glm4_moe.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,9 @@ def forward(
370370

371371
out = self.norm(hidden_states, residual, forward_meta=forward_meta)[0]
372372

373+
if self.norm.is_last_norm and self.norm.fd_config.parallel_config.use_sequence_parallel_moe:
374+
out = self.norm.allgather(out, forward_meta.ids_remove_padding.shape[0])
375+
373376
return out
374377

375378

fastdeploy/model_executor/models/gpt_oss.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -214,8 +214,12 @@ def forward(self, ids_remove_padding: paddle.Tensor, forward_meta: ForwardMeta):
214214
for i in range(self.num_layers):
215215
hidden_states, residual = self.layers[i](forward_meta, hidden_states, residual)
216216

217-
hidden_states = self.norm(hidden_states, residual)[0]
218-
return hidden_states
217+
out = self.norm(hidden_states, residual, forward_meta=forward_meta)[0]
218+
219+
if self.norm.is_last_norm and self.norm.fd_config.parallel_config.use_sequence_parallel_moe:
220+
out = self.norm.allgather(out, forward_meta.ids_remove_padding.shape[0])
221+
222+
return out
219223

220224

221225
@ModelRegistry.register_model_class(

fastdeploy/model_executor/models/qwen3moe.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,9 @@ def forward(
282282

283283
out = self.norm(hidden_states, residual, forward_meta=forward_meta)[0]
284284

285+
if self.norm.is_last_norm and self.norm.fd_config.parallel_config.use_sequence_parallel_moe:
286+
out = self.norm.allgather(out, forward_meta.ids_remove_padding.shape[0])
287+
285288
return out
286289

287290

fastdeploy/spec_decode/mtp.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1012,7 +1012,7 @@ def _propose_cuda(self, step_use_cudagraph: bool = False, is_dummy_run: bool = F
10121012
)
10131013

10141014
# 4. Compute logits, Sample
1015-
logits = self.model.compute_logits(hidden_states)
1015+
logits = self.model.compute_logits(hidden_states, forward_meta=self.forward_meta)
10161016
if self.enable_logprob and self.enable_draft_logprob and substep == 0:
10171017
first_token_logits = self.model.compute_logits(self.model_inputs["first_token_hidden_states"])
10181018

@@ -1125,7 +1125,7 @@ def _propose_xpu(self, step_use_cudagraph: bool = False, is_dummy_run: bool = Fa
11251125
model_output, self.model_inputs["cum_offsets"], self.forward_meta, self.model_inputs
11261126
)
11271127
# 4. Compute logits, Sample
1128-
logits = self.model.compute_logits(hidden_states)
1128+
logits = self.model.compute_logits(hidden_states, forward_meta=self.forward_meta)
11291129
sampled_token_ids, sampler_output = self.sampler(
11301130
logits,
11311131
self.sampling_metadata,

0 commit comments

Comments (0)