diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 9386d9c27..ea57b7e06 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2635,7 +2635,7 @@ minimaxm3-fp4-mi355x-vllm-disagg:
 # language-model path and mirror the MXFP8 MI355X search space for a direct
 # precision comparison.
 minimaxm3-fp4-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1
+  image: vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa
   model: amd/MiniMax-M3-MXFP4
   model-prefix: minimaxm3
   runner: mi355x
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm.sh
index 4be977a80..f0f0ea5f5 100755
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm.sh
@@ -4,6 +4,16 @@
 # https://huggingface.co/amd/MiniMax-M3-MXFP4#reproduction
 # Block size 128 is mandatory for MSA. This fixed-sequence benchmark uses the
 # text-only language-model path with AITER MoE (vllm-project/vllm#46419).
+#
+# High-concurrency parity with the ATOM recipe comes from three levers:
+#   * INT4 quantized all-reduce (env knobs below) -- reduces the all-reduce
+#     cost (the biggest decode kernel); measured ~12-17% lower TPOT at
+#     concurrency 64/128/256. Works on any nightly.
+#   * fp8 KV cache (--kv-cache-dtype fp8).
+#   * cross-layer indexer top-k sharing (--hf-overrides: use_index_cache=true,
+#     index_topk_freq=4) -- requires vllm-project/vllm#47269 (merged).
+# Pin the image (.github/configs/amd-master.yaml) to a nightly containing
+# #47269 before sweeping for the full curve.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
@@ -35,6 +45,13 @@ export VLLM_USE_BREAKABLE_CUDAGRAPH=0
 export VLLM_ROCM_USE_AITER=1
 export VLLM_ROCM_USE_AITER_MOE=1
 export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1
+# INT4 quantized all-reduce for the (~1.5 MB) decode all-reduces, which are the
+# single biggest decode kernel at high concurrency. The MIN_SIZE_KB override
+# (256 KB) is required: vLLM's default INT4 quick-reduce size gate for
+# (bf16, TP4) is 16 MB, so INT4 never fires for decode-sized tensors otherwise.
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+export VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16=0
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION_MIN_SIZE_KB=256
 
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
@@ -63,6 +80,8 @@ vllm serve "$MODEL" --port "$PORT" \
     --max-model-len "$MAX_MODEL_LEN" \
     --attention-backend TRITON_ATTN \
     --moe-backend aiter \
+    --kv-cache-dtype fp8 \
+    --hf-overrides '{"text_config": {"use_index_cache": true, "index_topk_freq": 4}}' \
     --tool-call-parser minimax_m3 \
     --enable-auto-tool-choice \
     --reasoning-parser minimax_m3 > "$SERVER_LOG" 2>&1 &
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 450226250..53137295f 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4400,3 +4400,11 @@
   description:
     - "Bump SGLang image from lmsysorg/sglang:deepseek-v4-blackwell (digest sha256:df18bfc4...) to mainline nightly lmsysorg/sglang:nightly-dev-cu13-20260628-da802ddc."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1923
+
+- config-keys:
+    - minimaxm3-fp4-mi355x-vllm
+  description:
+    - "Close the high-concurrency gap vs the ATOM recipe on MiniMax-M3 MXFP4 MI355X single-node vLLM STP. Add INT4 quantized all-reduce (VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4, VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16=0, VLLM_ROCM_QUICK_REDUCE_QUANTIZATION_MIN_SIZE_KB=256), fp8 KV cache (--kv-cache-dtype fp8), and cross-layer indexer top-k sharing (--hf-overrides text_config: use_index_cache=true, index_topk_freq=4)."
+    - "INT4 quick-reduce reduces the all-reduce cost (the biggest decode kernel); the MIN_SIZE_KB override lowers the minimum tensor size for INT4 quantization from the 16 MB default (bf16, TP4) to 256 KB so that decode-sized all-reduces qualify. Local (ATOM's benchmark): conc32 17.21ms / conc64 25.13ms vs ATOM ref 16.74ms / 25.00ms (matched); GSM8K limit100=0.95."
+    - "index_topk_freq needs vllm-project/vllm#47269 (merged) in the served image; pin the image to a nightly containing it before sweeping."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1969