|
42 | 42 | from fastdeploy.model_executor.layers.sample.sampler import Sampler, SpeculativeSampler |
43 | 43 | from fastdeploy.model_executor.model_loader import get_model_loader |
44 | 44 | from fastdeploy.platforms import current_platform |
| 45 | +from fastdeploy.utils import ceil_div |
45 | 46 |
|
46 | 47 | if current_platform.is_iluvatar(): |
47 | 48 | from fastdeploy.model_executor.ops.iluvatar import set_value_by_flags_and_idx |
@@ -588,17 +589,16 @@ def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int, expected_decod |
588 | 589 | """Set dummy prefill inputs to share_inputs""" |
589 | 590 | # NOTE(gongshaotian): The maximum decoding length is equal to the expected decoded tokens plus the eos token |
590 | 591 | max_dec_len = expected_decode_len + 1 |
591 | | - full_length = min( |
592 | | - num_tokens // batch_size, |
| 592 | + input_length = min( |
| 593 | + ceil_div(num_tokens, batch_size), |
593 | 594 | self.parallel_config.max_model_len - max_dec_len, |
594 | 595 | ) |
595 | 596 |
|
596 | 597 | # NOTE(wanglongzhi): When the input length is too large, DeepEP's buffer is not large enough, which causes NaN values in the result. |
597 | 598 | # TODO(wanglongzhi): Figure out the accurate buffer size of DeepEP. |
598 | 599 | if self.fd_config.parallel_config.enable_expert_parallel: |
599 | | - full_length = min(full_length, 32) |
| 600 | + input_length = min(input_length, 32) |
600 | 601 |
|
601 | | - input_length = int(full_length * self.cache_config.kv_cache_ratio) |
602 | 602 | block_num = ( |
603 | 603 | input_length + self.cache_config.block_size - 1 |
604 | 604 | ) // self.cache_config.block_size + self.cache_config.enc_dec_block_num |
|
0 commit comments