diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 9386d9c27..ea57b7e06 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2635,7 +2635,7 @@ minimaxm3-fp4-mi355x-vllm-disagg: # language-model path and mirror the MXFP8 MI355X search space for a direct # precision comparison. minimaxm3-fp4-mi355x-vllm: - image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 + image: vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa model: amd/MiniMax-M3-MXFP4 model-prefix: minimaxm3 runner: mi355x diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm.sh index 4be977a80..f0f0ea5f5 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm.sh @@ -4,6 +4,16 @@ # https://huggingface.co/amd/MiniMax-M3-MXFP4#reproduction # Block size 128 is mandatory for MSA. This fixed-sequence benchmark uses the # text-only language-model path with AITER MoE (vllm-project/vllm#46419). +# +# High-concurrency parity with the ATOM recipe comes from three levers: +# * INT4 quantized all-reduce (env knobs below) -- reduces the all-reduce +# cost (the biggest decode kernel); measured ~-12% to -17% TPOT at conc +# 64/128/256. Works on any nightly. +# * fp8 KV cache (--kv-cache-dtype fp8). +# * cross-layer indexer top-k sharing (--hf-overrides index_topk_freq=4) -- +# requires vllm-project/vllm#47269 (merged). +# Pin the image (.github/configs/amd-master.yaml) to a nightly containing +# #47269 before sweeping for the full curve. 
source "$(dirname "$0")/../../benchmark_lib.sh" @@ -35,6 +45,13 @@ export VLLM_USE_BREAKABLE_CUDAGRAPH=0 export VLLM_ROCM_USE_AITER=1 export VLLM_ROCM_USE_AITER_MOE=1 export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1 +# INT4 quantized all-reduce for the (~1.5 MB) decode all-reduces, which are the +# single biggest decode kernel at high concurrency. The MIN_SIZE_KB override is +# required: vLLM's default INT4 quick-reduce size gate for (bf16, TP4) is 16 MB, +# so it never fires for decode-sized tensors without it. +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 +export VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16=0 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION_MIN_SIZE_KB=256 if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context @@ -63,6 +80,8 @@ vllm serve "$MODEL" --port "$PORT" \ --max-model-len "$MAX_MODEL_LEN" \ --attention-backend TRITON_ATTN \ --moe-backend aiter \ + --kv-cache-dtype fp8 \ + --hf-overrides '{"text_config": {"use_index_cache": true, "index_topk_freq": 4}}' \ --tool-call-parser minimax_m3 \ --enable-auto-tool-choice \ --reasoning-parser minimax_m3 > "$SERVER_LOG" 2>&1 & diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 450226250..53137295f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4400,3 +4400,11 @@ description: - "Bump SGLang image from lmsysorg/sglang:deepseek-v4-blackwell (digest sha256:df18bfc4...) to mainline nightly lmsysorg/sglang:nightly-dev-cu13-20260628-da802ddc." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1923 + +- config-keys: + - minimaxm3-fp4-mi355x-vllm + description: + - "Close the high-concurrency gap vs the ATOM recipe on MiniMax-M3 MXFP4 MI355X single-node vLLM STP. Add INT4 quantized all-reduce (VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4, VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16=0, VLLM_ROCM_QUICK_REDUCE_QUANTIZATION_MIN_SIZE_KB=256), fp8 KV cache (--kv-cache-dtype fp8), and cross-layer indexer top-k sharing (--hf-overrides index_topk_freq=4)." 
+ - "INT4 quick-reduce reduces the all-reduce cost (the biggest decode kernel); the MIN_SIZE_KB override (256 KB) lowers the minimum tensor size at which INT4 quantization is applied, since the default gate for (bf16, TP4) is 16 MB and never fires for the ~1.5 MB decode all-reduces. Local (ATOM's benchmark): conc32 17.21ms / conc64 25.13ms vs ATOM ref 16.74 / 25.00 (matched); GSM8K limit100=0.95." + - "index_topk_freq needs vllm-project/vllm#47269 (merged) in the served image; pin the image to a nightly containing it before sweeping." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1969