From cdb22b27bb613f31de8ded2e3000afea3baa0171 Mon Sep 17 00:00:00 2001 From: Jason Li Date: Thu, 2 Jul 2026 07:40:22 -0700 Subject: [PATCH 1/3] feat: add MiniMax-M3 FP4 B200 disagg config --- .github/configs/nvidia-master.yaml | 28 ++++++ .../b200-fp4/8k1k/2p1d-dep2-dep8-8k1k.yaml | 85 +++++++++++++++++++ runners/launch_b200-dgxc.sh | 6 ++ 3 files changed, 119 insertions(+) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b200-fp4/8k1k/2p1d-dep2-dep8-8k1k.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 5a0e6a636..972f28b39 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11848,6 +11848,34 @@ qwen3.5-fp8-h100-sglang-agentic: - { tp: 8, ep: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 14, 16] } - { tp: 8, ep: 8, offloading: hicache, conc-list: [12, 14, 16, 20, 24, 28, 32, 42] } +minimaxm3-fp4-b200-dynamo-vllm: + image: vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41 + model: nvidia/MiniMax-M3-NVFP4 + model-prefix: minimaxm3 + runner: b200-multinode + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [4096] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b200-fp4/8k1k/2p1d-dep2-dep8-8k1k.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + minimaxm3-fp8-b300-dynamo-vllm: image: vllm/vllm-openai:minimax-m3-0618-x86_64-cu130 model: MiniMaxAI/MiniMax-M3-MXFP8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b200-fp4/8k1k/2p1d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b200-fp4/8k1k/2p1d-dep2-dep8-8k1k.yaml new file mode 100644 index 000000000..3798c8486 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b200-fp4/8k1k/2p1d-dep2-dep8-8k1k.yaml @@ -0,0 +1,85 @@ +name: "minimax-m3-vllm-disagg-b200-2p1d-fp4-dep2-dep8-8k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" + precision: "fp4" + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + + decode: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 # One DP8 decode worker provides 8 DP ranks. + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index f10e0f4ea..8e915061e 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -113,6 +113,12 @@ if [[ "$IS_MULTINODE" == "true" ]]; then git checkout aflowers/vllm-gb200-v0.20.0 mkdir -p recipes/vllm/deepseek-v4 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 + elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp4" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" || exit 1 + git checkout main + mkdir -p recipes/vllm/minimax-m3/b200-fp4 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b200-fp4" recipes/vllm/minimax-m3/b200-fp4 elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp4" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 From d6a1d6aed03624d7f373847649a8d4017a7fcaa5 Mon Sep 17 00:00:00 2001 From: Jason Li Date: Thu, 2 Jul 2026 07:41:12 -0700 Subject: [PATCH 2/3] chore: add MiniMax-M3 B200 benchmark trigger --- perf-changelog.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 3d89fa688..05124e38b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4407,3 +4407,11 @@ description: - "Bump SGLang image from lmsysorg/sglang:deepseek-v4-blackwell (digest sha256:df18bfc4...) to mainline nightly lmsysorg/sglang:nightly-dev-cu13-20260628-da802ddc." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1923 + +- config-keys: + - minimaxm3-fp4-b200-dynamo-vllm + description: + - "Add MiniMax-M3 NVFP4 B200 Dynamo-vLLM disaggregated 8k1k configuration at concurrency 4096." + - "Port the B300 4P2D DEP2/DEP8 recipe to a B200 2P1D topology using one prefill node and one decode node." + - "Use the b200-multinode runner and vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41; omit max-cudagraph-capture-size and max-num-batched-tokens from prefill." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1982 From 3d84ee1fd0b066b28aea72ca918917413d894812 Mon Sep 17 00:00:00 2001 From: Jason Li Date: Thu, 2 Jul 2026 07:49:16 -0700 Subject: [PATCH 3/3] fix: scale B200 disagg concurrency --- .github/configs/nvidia-master.yaml | 2 +- .../vllm/minimax-m3/b200-fp4/8k1k/2p1d-dep2-dep8-8k1k.yaml | 2 +- perf-changelog.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 972f28b39..30349ab8d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11862,7 +11862,7 @@ minimaxm3-fp4-b200-dynamo-vllm: - isl: 8192 osl: 1024 search-space: - - conc-list: [4096] + - conc-list: [2048] prefill: num-worker: 2 tp: 2 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b200-fp4/8k1k/2p1d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b200-fp4/8k1k/2p1d-dep2-dep8-8k1k.yaml index 3798c8486..f89a8ab9d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b200-fp4/8k1k/2p1d-dep2-dep8-8k1k.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b200-fp4/8k1k/2p1d-dep2-dep8-8k1k.yaml @@ -81,5 +81,5 @@ benchmark: type: "sa-bench" isl: 8192 osl: 1024 - concurrencies: "4096" + concurrencies: "2048" req_rate: "inf" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 05124e38b..52801fdee 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4411,7 +4411,7 @@ - config-keys: - minimaxm3-fp4-b200-dynamo-vllm description: - - "Add MiniMax-M3 NVFP4 B200 Dynamo-vLLM disaggregated 8k1k configuration at concurrency 4096." + - "Add MiniMax-M3 NVFP4 B200 Dynamo-vLLM disaggregated 8k1k configuration at concurrency 2048." - "Port the B300 4P2D DEP2/DEP8 recipe to a B200 2P1D topology using one prefill node and one decode node." - "Use the b200-multinode runner and vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41; omit max-cudagraph-capture-size and max-num-batched-tokens from prefill." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1982