diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index a437f4ecd..79b9fbc91 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2877,7 +2877,7 @@ minimaxm3-fp4-mi355x-atom-disagg: # checkpoint lacks calibrated ROCm FP8 attention scales. Use the TP8-only H100 # search space: TP8 for latency and TP8+EP8 (TEP) at high concurrency. minimaxm3-fp8-mi300x-vllm: - image: vllm/vllm-openai-rocm:minimax-m3 + image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: mi300x @@ -2889,8 +2889,8 @@ minimaxm3-fp8-mi300x-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 1, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 256, conc-end: 256 } + # TP8/EP1 full conc range; the prior EP8 conc256 row regressed throughput. + - { tp: 8, conc-start: 1, conc-end: 256 } - isl: 8192 osl: 1024 search-space: diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh index f2cdaf284..4a6ce42e7 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi300x.sh @@ -5,6 +5,9 @@ # is mandatory for MSA sparse attention. Keep the default BF16 KV cache on # gfx942: the checkpoint has no calibrated q/prob scales for ROCm FP8 # attention, and vLLM's fallback scale of 1.0 corrupts model accuracy. +# --async-scheduling and --max-num-batched-tokens 16384 are accuracy-safe +# scheduling levers: the larger prefill batch amortizes the per-step +# BF16-emulated MoE weight read, lifting high-concurrency throughput. 
source "$(dirname "$0")/../../benchmark_lib.sh" @@ -34,6 +37,13 @@ SERVER_LOG=/workspace/server.log export VLLM_ENGINE_READY_TIMEOUT_S=3600 export VLLM_USE_BREAKABLE_CUDAGRAPH=0 +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_USE_AITER_MHA=0 + +export TORCH_BLAS_PREFER_HIPBLASLT=1 +export NCCL_MIN_NCHANNELS="${NCCL_MIN_NCHANNELS:-112}" +export GPU_MAX_HW_QUEUES="${GPU_MAX_HW_QUEUES:-2}" + if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context fi @@ -59,6 +69,8 @@ vllm serve "$MODEL" --port "$PORT" \ --language-model-only \ --max-model-len "$MAX_MODEL_LEN" \ --attention-backend TRITON_ATTN \ + --async-scheduling \ + --max-num-batched-tokens 16384 \ --tool-call-parser minimax_m3 \ --reasoning-parser minimax_m3 \ --enable-auto-tool-choice > "$SERVER_LOG" 2>&1 & diff --git a/perf-changelog.yaml b/perf-changelog.yaml index b776a5d1d..e297e5518 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4324,6 +4324,17 @@ - "Pin vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 (from nightly-3f5a1e1733200760169ff31ebe60a271072b199e) for AITER MoE and shared-expert fusion support (vllm-project/vllm#46419, vllm-project/vllm#46545)." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1954 +- config-keys: + - minimaxm3-fp8-mi300x-vllm + description: + - "Update the MiniMax-M3 MXFP8 MI300X vLLM image from vllm/vllm-openai-rocm:minimax-m3 to vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1." + - "Enable AITER kernels for MiniMax-M3 MXFP8 on MI300X/gfx942 via the single master toggle VLLM_ROCM_USE_AITER=1: the stock image left it unset, so the hot decode GEMMs and fused MoE ran on the generic kernels. The per-component AITER flags (MoE, linear, RMSNorm, FP8 batched-GEMM) default to True and are gated behind the master flag, so they are left at their defaults. Keep attention on TRITON_ATTN (VLLM_ROCM_USE_AITER_MHA=0, which defaults to True) because the MXFP8 checkpoint lacks calibrated q/prob scales for ROCm FP8 attention." 
+ - "Add AMD-recommended, numerically-inert MI300X runtime knobs: TORCH_BLAS_PREFER_HIPBLASLT=1, NCCL_MIN_NCHANNELS=112 (raises RCCL channels above the ~32-64 default for TP8), GPU_MAX_HW_QUEUES=2 (caps HIP streams below the default of 4)." + - "Stack two accuracy-safe scheduling levers: --async-scheduling (overlaps CPU input-prep with GPU decode) and --max-num-batched-tokens 16384 (amortizes the per-step BF16-emulated MoE weight read of ~95 GB/rank over more prompt tokens, halving prefill weight-reads vs the 8192 default). Both levers change scheduling only, so output is token-for-token identical; GSM8K exact-match holds at 0.959." + - "Switch the 1k1k conc256 search-space row from TP8/EP8 to TP8/EP1: the EP8 topology regressed high-concurrency throughput (434 vs 905 tok/s/gpu @ conc256, EP8 vs EP1) and EP1 matches the topology the prior AITER uplift was measured against." + - "Measured uplift on 8xMI300X, 1k1k random sweep (total tok/s/gpu): conc256 782.7->905 (+15.6%; EP8->EP1 + scheduling levers), conc128 598.9->628 (+4.9%), conc64 365.1->429 (+17.5%), conc32 295.6->327.4 (+10.8%), conc16 203.1->216.5 (+6.6%), conc8 127.6->136.6 (+7.1%), conc4 80.1->84.6 (+5.6%); conc1-2 unchanged (latency-bound). GSM8K exact-match holds at 0.959." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1951 + - config-keys: - minimaxm3-fp8-mi355x-vllm - minimaxm3-fp8-mi355x-vllm-mtp