From 3389f418d950fea6ff051e55daf1a32a08296d94 Mon Sep 17 00:00:00 2001
From: Fangzhou Ai <31551580+Fangzhou-Ai@users.noreply.github.com>
Date: Thu, 2 Jul 2026 07:37:45 -0500
Subject: [PATCH 1/2] [AMD] MiniMax-M3 FP4 MI355X vLLM MTP: close gap vs ATOM
 (INT4 all-reduce + index-sharing)

Mirror the STP recipe from #1969 on the EAGLE3 spec-decoding (MTP) variant:
INT4 quantized all-reduce, fp8 KV cache, and cross-layer indexer top-k sharing
(index_topk_freq=4). Bump the image to a nightly containing
vllm-project/vllm#47269.
---
 .github/configs/amd-master.yaml               |  2 +-
 .../minimaxm3_fp4_mi355x_vllm_mtp.sh          | 19 +++++++++++++++++++
 perf-changelog.yaml                           |  8 ++++++++
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 9386d9c27..531719eda 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2666,7 +2666,7 @@ minimaxm3-fp4-mi355x-vllm:
 # tokens. Search space mirrors the MI355X MXFP8 MTP entry, trimming the base
 # FP4 sweep at extreme concurrency where speculative decoding loses value.
 minimaxm3-fp4-mi355x-vllm-mtp:
-  image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1
+  image: vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa
   model: amd/MiniMax-M3-MXFP4
   model-prefix: minimaxm3
   runner: mi355x
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh
index 8a15b8c89..f890f0fc0 100755
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh
@@ -5,6 +5,16 @@
 # minimaxm3_fp4_mi355x_vllm.sh and uses three speculative tokens from
 # Inferact/MiniMax-M3-EAGLE3. The pinned nightly includes upstream AMD
 # MiniMax-M3 SupportsEagle3 support, so no runtime model patch is needed.
+#
+# Mirrors the three high-concurrency levers from minimaxm3_fp4_mi355x_vllm.sh:
+#   * INT4 quantized all-reduce (env knobs below) -- reduces the all-reduce
+#     cost (the biggest decode kernel); measured ~12-17% lower TPOT at
+#     concurrency 64/128/256. Works on any nightly.
+#   * fp8 KV cache (--kv-cache-dtype fp8).
+#   * cross-layer indexer top-k sharing (--hf-overrides use_index_cache=true,
+#     index_topk_freq=4) -- requires vllm-project/vllm#47269 (merged).
+# Keep the image (.github/configs/amd-master.yaml) pinned to a nightly that
+# contains #47269 when sweeping the full curve.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
@@ -48,6 +58,13 @@ export VLLM_USE_BREAKABLE_CUDAGRAPH=0
 export VLLM_ROCM_USE_AITER=1
 export VLLM_ROCM_USE_AITER_MOE=1
 export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1
+# INT4 quantized all-reduce for the (~1.5 MB) decode all-reduces, which are the
+# single biggest decode kernel at high concurrency. The MIN_SIZE_KB override is
+# required: vLLM's default INT4 quick-reduce size gate for (bf16, TP4) is 16 MB,
+# so it never fires for decode-sized tensors without it.
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+export VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16=0
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION_MIN_SIZE_KB=256
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
@@ -78,6 +95,8 @@ vllm serve "$MODEL" --port "$PORT" \
     --max-model-len "$MAX_MODEL_LEN" \
     --attention-backend TRITON_ATTN \
     --moe-backend aiter \
+    --kv-cache-dtype fp8 \
+    --hf-overrides '{"text_config": {"use_index_cache": true, "index_topk_freq": 4}}' \
     --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \
     --tool-call-parser minimax_m3 \
     --enable-auto-tool-choice \
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 450226250..c72f19552 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4395,6 +4395,14 @@
     - "Sweeps tp 4/8 with and without EP at 1k1k and 8k1k, conc 1-256"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1933
 
+- config-keys:
+    - minimaxm3-fp4-mi355x-vllm-mtp
+  description:
+    - "Close the high-concurrency gap vs the ATOM recipe on MiniMax-M3 MXFP4 MI355X single-node vLLM MTP (EAGLE3 spec decoding). Mirror the STP recipe (#1969): add INT4 quantized all-reduce (VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4, VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16=0, VLLM_ROCM_QUICK_REDUCE_QUANTIZATION_MIN_SIZE_KB=256), fp8 KV cache (--kv-cache-dtype fp8), and cross-layer indexer top-k sharing (--hf-overrides index_topk_freq=4)."
+    - "INT4 quick-reduce reduces the all-reduce cost (the biggest decode kernel); the MIN_SIZE_KB override lowers the quantization codec threshold."
+    - "index_topk_freq needs vllm-project/vllm#47269 (merged); bump the image to vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa (contains it)."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/PLACEHOLDER
+
 - config-keys:
     - dsv4-fp4-b200-sglang
   description:

From a65c6b7bff7db131a1ca4d9a4e100cd4fb86379b Mon Sep 17 00:00:00 2001
From: Fangzhou Ai <31551580+Fangzhou-Ai@users.noreply.github.com>
Date: Thu, 2 Jul 2026 07:38:49 -0500
Subject: [PATCH 2/2] chore(changelog): set pr-link for MiniMax-M3 FP4 MI355X
 vLLM MTP recipe (#1979)

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index c72f19552..a37a9b498 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4401,7 +4401,7 @@
     - "Close the high-concurrency gap vs the ATOM recipe on MiniMax-M3 MXFP4 MI355X single-node vLLM MTP (EAGLE3 spec decoding). Mirror the STP recipe (#1969): add INT4 quantized all-reduce (VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4, VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16=0, VLLM_ROCM_QUICK_REDUCE_QUANTIZATION_MIN_SIZE_KB=256), fp8 KV cache (--kv-cache-dtype fp8), and cross-layer indexer top-k sharing (--hf-overrides index_topk_freq=4)."
     - "INT4 quick-reduce reduces the all-reduce cost (the biggest decode kernel); the MIN_SIZE_KB override lowers the quantization codec threshold."
     - "index_topk_freq needs vllm-project/vllm#47269 (merged); bump the image to vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa (contains it)."
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/PLACEHOLDER
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1979
 
 - config-keys:
     - dsv4-fp4-b200-sglang