[Cherry-Pick][Bug Fix]fix the bug for real size 0 in cudagraph (#3888)

zeroRains · Jiang-Jia-Jun · web-flow · commit d43549953c4a · 2025-09-08T14:06:10.000+08:00
* fix the bug for real size 0 in cudagraph

* fix cache_messager

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
diff --git a/fastdeploy/cache_manager/cache_messager.py b/fastdeploy/cache_manager/cache_messager.py
@@ -163,7 +163,7 @@ def _prefill_layerwise_send_cache_thread(self):
         try:
             prefilled_step_idx_data = np.zeros(shape=[1], dtype=np.int32)
             prefilled_layer_idx_data = np.zeros(shape=[1], dtype=np.int32)
-            prefilled_layer_name = f"splitwise_complete_prefilled_step_{self.dp_rank_id}.{self.gpu_id}"
+            prefilled_layer_name = f"splitwise_complete_prefilled_layer_{self.dp_rank_id}.{self.gpu_id}"
             prefilled_step_name = f"splitwise_complete_prefilled_step_{self.dp_rank_id}.{self.gpu_id}"
             step_shm_value = IPCSignal(
                 name=f"splitwise_complete_prefilled_step_{self.dp_rank_id}",
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
@@ -42,6 +42,7 @@
 from fastdeploy.model_executor.layers.sample.sampler import Sampler, SpeculativeSampler
 from fastdeploy.model_executor.model_loader import get_model_loader
 from fastdeploy.platforms import current_platform
+from fastdeploy.utils import ceil_div
 
 if current_platform.is_iluvatar():
     from fastdeploy.model_executor.ops.iluvatar import set_value_by_flags_and_idx
@@ -588,17 +589,16 @@ def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int, expected_decod
         """Set dummy prefill inputs to share_inputs"""
         # NOTE(gongshaotian): The maximum decoding length is equal to the expected decoded tokens plus the eos token
         max_dec_len = expected_decode_len + 1
-        full_length = min(
-            num_tokens // batch_size,
+        input_length = min(
+            ceil_div(num_tokens, batch_size),
             self.parallel_config.max_model_len - max_dec_len,
         )
 
         # NOTE(wanglongzhi): When the full length is too large, DeepEP's buffer size will not be enough to cause the result to appear nan.
         # TODO(wanglongzhi): Figure out the accurate buffer size of DeepEP.
         if self.fd_config.parallel_config.enable_expert_parallel:
-            full_length = min(full_length, 32)
+            input_length = min(input_length, 32)
 
-        input_length = int(full_length * self.cache_config.kv_cache_ratio)
         block_num = (
             input_length + self.cache_config.block_size - 1
         ) // self.cache_config.block_size + self.cache_config.enc_dec_block_num