SemiAnalysisAI · Fangzhou-Ai · Jul 1, 2026 · Jul 1, 2026 · Jul 1, 2026
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm.sh
@@ -4,6 +4,16 @@
 # https://huggingface.co/amd/MiniMax-M3-MXFP4#reproduction
 # Block size 128 is mandatory for MSA. This fixed-sequence benchmark uses the
 # text-only language-model path with AITER MoE (vllm-project/vllm#46419).
+#
+# High-concurrency parity with the ATOM recipe comes from three levers:
+#   * INT4 quantized all-reduce (env knobs below) -- cuts the cost of the
+#     all-reduce (the biggest decode kernel); measured ~12-17% lower TPOT at
+#     concurrency 64/128/256. Works on any nightly.
+#   * fp8 KV cache (--kv-cache-dtype fp8).
+#   * cross-layer indexer top-k sharing (--hf-overrides text_config:
+#     use_index_cache=true, index_topk_freq=4) -- requires
+#     vllm-project/vllm#47269 (merged).
+# Pin the image (.github/configs/amd-master.yaml) to a nightly containing
+# #47269 before sweeping for the full curve.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
@@ -35,6 +45,13 @@ export VLLM_USE_BREAKABLE_CUDAGRAPH=0
 export VLLM_ROCM_USE_AITER=1
 export VLLM_ROCM_USE_AITER_MOE=1
 export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1
+# INT4 quantized all-reduce for the decode all-reduces (~1.5 MB), which are
+# the single biggest decode kernel at high concurrency. The MIN_SIZE_KB
+# override is required: vLLM's default INT4 quick-reduce size gate for
+# (bf16, TP4) is 16 MB, so without it INT4 never fires for decode-sized tensors.
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+export VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16=0
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION_MIN_SIZE_KB=256
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
@@ -63,6 +80,8 @@ vllm serve "$MODEL" --port "$PORT" \
     --max-model-len "$MAX_MODEL_LEN" \
     --attention-backend TRITON_ATTN \
     --moe-backend aiter \
+    --kv-cache-dtype fp8 \
+    --hf-overrides '{"text_config": {"use_index_cache": true, "index_topk_freq": 4}}' \
     --tool-call-parser minimax_m3 \
     --enable-auto-tool-choice \
     --reasoning-parser minimax_m3 > "$SERVER_LOG" 2>&1 &

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -4351,3 +4351,11 @@
     - "Use nvidia/MiniMax-M3-NVFP4 from /scratch/models/MiniMax-M3-NVFP4 with vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41, which includes vllm-project/vllm PR #46380; no runtime patch needed"
     - "Reuse the existing MXFP8 B300 topology and concurrency matrix across 15 srt-slurm recipes, while dropping the FP8-only Marlin override from TP4 decode"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1966
+
+- config-keys:
+    - minimaxm3-fp4-mi355x-vllm
+  description:
+    - "Close the high-concurrency gap vs the ATOM recipe on MiniMax-M3 MXFP4 MI355X single-node vLLM STP. Add INT4 quantized all-reduce (VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4, VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16=0, VLLM_ROCM_QUICK_REDUCE_QUANTIZATION_MIN_SIZE_KB=256), fp8 KV cache (--kv-cache-dtype fp8), and cross-layer indexer top-k sharing (--hf-overrides setting text_config.use_index_cache=true and text_config.index_topk_freq=4)."
+    - "INT4 quick-reduce reduces the all-reduce cost (the biggest decode kernel); the MIN_SIZE_KB override lowers the INT4 quick-reduce size gate (default 16 MB for bf16, TP4) so that decode-sized all-reduces are actually quantized. Local (ATOM's benchmark): conc32 17.21ms / conc64 25.13ms vs ATOM ref 16.74 / 25.00 (matched); GSM8K limit100=0.95."
+    - "index_topk_freq needs vllm-project/vllm#47269 (merged) in the served image; pin the image to a nightly containing it before sweeping."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1969