Add the minimaxm3-fp4-b200-dynamo-vllm benchmark configuration.

This patch touches four files:
  * .github/configs/nvidia-master.yaml: new config key (8k/1k, concurrency 2048,
    2 prefill workers at tp=2/ep=2, 1 decode worker at tp=8/ep=8).
  * benchmarks/multi_node/srt-slurm-recipes/.../2p1d-dep2-dep8-8k1k.yaml: the
    recipe file named by that config's CONFIG_FILE setting.
  * perf-changelog.yaml: changelog entry for the new config key.
  * runners/launch_b200-dgxc.sh: new elif branch that clones srt-slurm, checks
    out main, and copies the b200-fp4 recipes into the checkout.

NOTE(review): line structure and leading indentation were reconstructed from a
whitespace-collapsed copy of this patch. Hunk line counts match the @@ headers,
but confirm context-line indentation against the target tree before applying.

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 5a0e6a636..30349ab8d 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -11848,6 +11848,34 @@ qwen3.5-fp8-h100-sglang-agentic:
       - { tp: 8, ep: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 14, 16] }
       - { tp: 8, ep: 8, offloading: hicache, conc-list: [12, 14, 16, 20, 24, 28, 32, 42] }
 
+minimaxm3-fp4-b200-dynamo-vllm:
+  image: vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41
+  model: nvidia/MiniMax-M3-NVFP4
+  model-prefix: minimaxm3
+  runner: b200-multinode
+  precision: fp4
+  framework: dynamo-vllm
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - conc-list: [2048]
+        prefill:
+          num-worker: 2
+          tp: 2
+          ep: 2
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/vllm/minimax-m3/b200-fp4/8k1k/2p1d-dep2-dep8-8k1k.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+
 minimaxm3-fp8-b300-dynamo-vllm:
   image: vllm/vllm-openai:minimax-m3-0618-x86_64-cu130
   model: MiniMaxAI/MiniMax-M3-MXFP8
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b200-fp4/8k1k/2p1d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b200-fp4/8k1k/2p1d-dep2-dep8-8k1k.yaml
new file mode 100644
index 000000000..f89a8ab9d
--- /dev/null
+++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b200-fp4/8k1k/2p1d-dep2-dep8-8k1k.yaml
@@ -0,0 +1,85 @@
+name: "minimax-m3-vllm-disagg-b200-2p1d-fp4-dep2-dep8-8k1k"
+
+model:
+  path: "nvidia/MiniMax-M3-NVFP4"
+  container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41"
+  precision: "fp4"
+
+resources:
+  gpu_type: "b200"
+  gpus_per_node: 8
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 2
+  decode_workers: 1
+  gpus_per_prefill: 2
+  gpus_per_decode: 8
+
+dynamo:
+  install: true
+  version: 1.3.0.dev20260614
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+
+  prefill_environment:
+    VLLM_FLOAT32_MATMUL_PRECISION: high
+    UCX_TLS: "cuda_copy,rc"
+
+  decode_environment:
+    VLLM_FLOAT32_MATMUL_PRECISION: high
+    UCX_TLS: "cuda_copy,rc"
+
+  vllm_config:
+    prefill:
+      no-enable-flashinfer-autotune: true
+      tensor-parallel-size: 1
+      data-parallel-size: 2
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}'
+      kv-cache-dtype: fp8
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      stream-interval: 32
+
+    decode:
+      no-enable-flashinfer-autotune: true
+      tensor-parallel-size: 1
+      data-parallel-size: 8
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}'
+      kv-cache-dtype: fp8
+      block-size: 128
+      gpu-memory-utilization: 0.90
+      max-model-len: 9472
+      language-model-only: true
+      stream-interval: 32
+      max-num-seqs: 1024  # One DP8 decode worker provides 8 DP ranks.
+      max-num-batched-tokens: 16384
+      max-cudagraph-capture-size: 4096
+
+health_check:
+  max_attempts: 360
+  interval_seconds: 10
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "2048"
+  req_rate: "inf"
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 3d89fa688..52801fdee 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4407,3 +4407,11 @@
   description:
     - "Bump SGLang image from lmsysorg/sglang:deepseek-v4-blackwell (digest sha256:df18bfc4...) to mainline nightly lmsysorg/sglang:nightly-dev-cu13-20260628-da802ddc."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1923
+
+- config-keys:
+    - minimaxm3-fp4-b200-dynamo-vllm
+  description:
+    - "Add MiniMax-M3 NVFP4 B200 Dynamo-vLLM disaggregated 8k1k configuration at concurrency 2048."
+    - "Port the B300 4P2D DEP2/DEP8 recipe to a B200 2P1D topology using one prefill node and one decode node."
+    - "Use the b200-multinode runner and vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41; omit max-cudagraph-capture-size and max-num-batched-tokens from prefill."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1982
diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh
index f10e0f4ea..8e915061e 100644
--- a/runners/launch_b200-dgxc.sh
+++ b/runners/launch_b200-dgxc.sh
@@ -113,6 +113,12 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
         git checkout aflowers/vllm-gb200-v0.20.0
         mkdir -p recipes/vllm/deepseek-v4
         cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4
+    elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp4" ]]; then
+        git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
+        cd "$SRT_REPO_DIR" || exit 1
+        git checkout main
+        mkdir -p recipes/vllm/minimax-m3/b200-fp4
+        cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b200-fp4" recipes/vllm/minimax-m3/b200-fp4
     elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp4" ]]; then
         git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
         cd "$SRT_REPO_DIR" || exit 1