From 3389f418d950fea6ff051e55daf1a32a08296d94 Mon Sep 17 00:00:00 2001 From: Fangzhou Ai <31551580+Fangzhou-Ai@users.noreply.github.com> Date: Thu, 2 Jul 2026 07:37:45 -0500 Subject: [PATCH 1/2] [AMD] MiniMax-M3 FP4 MI355X vLLM MTP: close gap vs ATOM (INT4 all-reduce + index-sharing) Mirror the STP recipe from #1969 on the EAGLE3 spec-decoding (MTP) variant: INT4 quantized all-reduce, fp8 KV cache, and cross-layer indexer top-k sharing (index_topk_freq=4). Bump the image to a nightly containing vllm-project/vllm#47269. --- .github/configs/amd-master.yaml | 2 +- .../minimaxm3_fp4_mi355x_vllm_mtp.sh | 19 +++++++++++++++++++ perf-changelog.yaml | 8 ++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 9386d9c27..531719eda 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2666,7 +2666,7 @@ minimaxm3-fp4-mi355x-vllm: # tokens. Search space mirrors the MI355X MXFP8 MTP entry, trimming the base # FP4 sweep at extreme concurrency where speculative decoding loses value. minimaxm3-fp4-mi355x-vllm-mtp: - image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 + image: vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa model: amd/MiniMax-M3-MXFP4 model-prefix: minimaxm3 runner: mi355x diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh index 8a15b8c89..f890f0fc0 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh @@ -5,6 +5,16 @@ # minimaxm3_fp4_mi355x_vllm.sh and uses three speculative tokens from # Inferact/MiniMax-M3-EAGLE3. The pinned nightly includes upstream AMD # MiniMax-M3 SupportsEagle3 support, so no runtime model patch is needed. 
+# +# Mirrors the three high-concurrency levers from minimaxm3_fp4_mi355x_vllm.sh: +# * INT4 quantized all-reduce (env knobs below) -- reduces the all-reduce +# cost (the biggest decode kernel); measured ~12% to 17% lower TPOT at conc +# 64/128/256. Works on any nightly. +# * fp8 KV cache (--kv-cache-dtype fp8). +# * cross-layer indexer top-k sharing (--hf-overrides index_topk_freq=4) -- +# requires vllm-project/vllm#47269 (merged). +# Pin the image (.github/configs/amd-master.yaml) to a nightly containing +# #47269 before sweeping for the full curve. source "$(dirname "$0")/../../benchmark_lib.sh" @@ -48,6 +58,13 @@ export VLLM_USE_BREAKABLE_CUDAGRAPH=0 export VLLM_ROCM_USE_AITER=1 export VLLM_ROCM_USE_AITER_MOE=1 export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1 +# INT4 quantized all-reduce for the (~1.5 MB) decode all-reduces, which are the +# single biggest decode kernel at high concurrency. The MIN_SIZE_KB override is +# required: vLLM's default INT4 quick-reduce size gate for (bf16, TP4) is 16 MB, +# so it never fires for decode-sized tensors without it. 
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +export VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16=0 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION_MIN_SIZE_KB=256 if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context @@ -78,6 +95,8 @@ vllm serve "$MODEL" --port "$PORT" \ --max-model-len "$MAX_MODEL_LEN" \ --attention-backend TRITON_ATTN \ --moe-backend aiter \ + --kv-cache-dtype fp8 \ + --hf-overrides '{"text_config": {"use_index_cache": true, "index_topk_freq": 4}}' \ --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \ --tool-call-parser minimax_m3 \ --enable-auto-tool-choice \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 450226250..c72f19552 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4395,6 +4395,14 @@ - "Sweeps tp 4/8 with and without EP at 1k1k and 8k1k, conc 1-256" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1933 +- config-keys: + - minimaxm3-fp4-mi355x-vllm-mtp + description: + - "Close the high-concurrency gap vs the ATOM recipe on MiniMax-M3 MXFP4 MI355X single-node vLLM MTP (EAGLE3 spec decoding). Mirror the STP recipe (#1969): add INT4 quantized all-reduce (VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4, VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16=0, VLLM_ROCM_QUICK_REDUCE_QUANTIZATION_MIN_SIZE_KB=256), fp8 KV cache (--kv-cache-dtype fp8), and cross-layer indexer top-k sharing (--hf-overrides index_topk_freq=4)." + - "INT4 quick-reduce reduces the all-reduce cost (the biggest decode kernel); the MIN_SIZE_KB override lowers the quantization codec threshold." + - "index_topk_freq needs vllm-project/vllm#47269 (merged); bump the image to vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa (contains it)." 
+ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/PLACEHOLDER + - config-keys: - dsv4-fp4-b200-sglang description: From a65c6b7bff7db131a1ca4d9a4e100cd4fb86379b Mon Sep 17 00:00:00 2001 From: Fangzhou Ai <31551580+Fangzhou-Ai@users.noreply.github.com> Date: Thu, 2 Jul 2026 07:38:49 -0500 Subject: [PATCH 2/2] chore(changelog): set pr-link for MiniMax-M3 FP4 MI355X vLLM MTP recipe (#1979) --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c72f19552..a37a9b498 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4401,7 +4401,7 @@ - "Close the high-concurrency gap vs the ATOM recipe on MiniMax-M3 MXFP4 MI355X single-node vLLM MTP (EAGLE3 spec decoding). Mirror the STP recipe (#1969): add INT4 quantized all-reduce (VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4, VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16=0, VLLM_ROCM_QUICK_REDUCE_QUANTIZATION_MIN_SIZE_KB=256), fp8 KV cache (--kv-cache-dtype fp8), and cross-layer indexer top-k sharing (--hf-overrides index_topk_freq=4)." - "INT4 quick-reduce reduces the all-reduce cost (the biggest decode kernel); the MIN_SIZE_KB override lowers the quantization codec threshold." - "index_topk_freq needs vllm-project/vllm#47269 (merged); bump the image to vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa (contains it)." - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/PLACEHOLDER + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1979 - config-keys: - dsv4-fp4-b200-sglang