From 92041529ca8b40c78ab693df2a3587b1af29558a Mon Sep 17 00:00:00 2001
From: Fangzhou Ai <31551580+Fangzhou-Ai@users.noreply.github.com>
Date: Thu, 2 Jul 2026 07:53:30 -0500
Subject: [PATCH 1/4] [AMD] DeepSeek-V4 FP4 MI355X vLLM MTP: bump image to
 latest nightly

Bump the dsv4-fp4-mi355x-vllm-mtp image from vllm/vllm-openai-rocm:v0.22.0 to
the nightly build nightly-09663abde0f50944a8d5ea30120666024b503faa (the latest
at the time of this change). Add a perf-changelog.yaml entry recording the
two-stage attention kernels and the AITER MLA backend.
---
 .github/configs/amd-master.yaml | 2 +-
 perf-changelog.yaml             | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 9386d9c27..dabdb6a6f 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1979,7 +1979,7 @@ dsv4-fp4-mi355x-vllm:
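-# above ~conc32 (-37% @ conc32). Image reuses the base entry's v0.22.0 ROCm
-# build, which already contains the MTP commit.
+# above ~conc32 (-37% @ conc32). Image is pinned to a vLLM ROCm nightly
+# build (bumped from the v0.22.0 release).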
 dsv4-fp4-mi355x-vllm-mtp:
-  image: vllm/vllm-openai-rocm:v0.22.0
+  image: vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: mi355x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 450226250..34126a606 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4400,3 +4400,11 @@
   description:
     - "Bump SGLang image from lmsysorg/sglang:deepseek-v4-blackwell (digest sha256:df18bfc4...) to mainline nightly lmsysorg/sglang:nightly-dev-cu13-20260628-da802ddc."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1923
+
+- config-keys:
+    - dsv4-fp4-mi355x-vllm-mtp
+  description:
+    - "Bump DeepSeek-V4-Pro FP4 MI355X single-node vLLM MTP image from vllm/vllm-openai-rocm:v0.22.0 to the latest nightly vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa."
+    - "The nightly enables two-stage attention kernels (split-KV decode), reducing decode attention latency at high concurrency."
+    - "Employ the AITER MLA attention backend for the DeepSeek-V4 MLA path."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/PLACEHOLDER

From f293588ac2ddd4fb3fda760fffc8f95ab42aef49 Mon Sep 17 00:00:00 2001
From: Fangzhou Ai <31551580+Fangzhou-Ai@users.noreply.github.com>
Date: Thu, 2 Jul 2026 07:54:16 -0500
Subject: [PATCH 2/4] chore(changelog): set pr-link for
 dsv4-fp4-mi355x-vllm-mtp image bump (#1981)

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 34126a606..6b423a7ee 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4407,4 +4407,4 @@
     - "Bump DeepSeek-V4-Pro FP4 MI355X single-node vLLM MTP image from vllm/vllm-openai-rocm:v0.22.0 to the latest nightly vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa."
     - "The nightly enables two-stage attention kernels (split-KV decode), reducing decode attention latency at high concurrency."
     - "Employ the AITER MLA attention backend for the DeepSeek-V4 MLA path."
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/PLACEHOLDER
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1981

From f5ae0ad15c9ddbda0a822f3c60759df708baebd8 Mon Sep 17 00:00:00 2001
From: Fangzhou Ai <31551580+Fangzhou-Ai@users.noreply.github.com>
Date: Thu, 2 Jul 2026 07:55:15 -0500
Subject: [PATCH 3/4] docs(changelog): two-stage attention improves across all
 concurrency levels (mtp)

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 6b423a7ee..e4c96a179 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4405,6 +4405,6 @@
     - dsv4-fp4-mi355x-vllm-mtp
   description:
     - "Bump DeepSeek-V4-Pro FP4 MI355X single-node vLLM MTP image from vllm/vllm-openai-rocm:v0.22.0 to the latest nightly vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa."
-    - "The nightly enables two-stage attention kernels (split-KV decode), reducing decode attention latency at high concurrency."
+    - "The nightly enables two-stage attention kernels (split-KV decode), which reduce decode attention latency across all concurrency levels."
     - "Employ the AITER MLA attention backend for the DeepSeek-V4 MLA path."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1981

From a6ff5d8d2d4730dc02d858c94cc634aba09078a7 Mon Sep 17 00:00:00 2001
From: Fangzhou Ai <31551580+Fangzhou-Ai@users.noreply.github.com>
Date: Thu, 2 Jul 2026 08:13:29 -0500
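Subject: [PATCH 4/4] [AMD] dsv4 fp4 mi355x vllm MTP: use AITER MoE backend

Switch the MTP recipe's MoE backend from triton_unfused to AITER: export
VLLM_ROCM_USE_AITER_MOE=1 and pass --moe-backend aiter to vllm serve. Update
the script's header comment and the perf-changelog.yaml entry to match.
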
---
 .../single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm_mtp.sh    | 5 +++--
 perf-changelog.yaml                                          | 1 +
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm_mtp.sh
index b90d82de9..81373bdc0 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm_mtp.sh
@@ -12,7 +12,7 @@ set -eo pipefail
 # prompts silently regresses the acceptance rate.
 #
 # All other serving flags mirror the non-MTP MI355X recipe (TP=8,
-# VLLM_ROCM_USE_AITER=1, triton_unfused MoE, FP8 KV cache, mp executor, async
+# VLLM_ROCM_USE_AITER=1, AITER MoE, FP8 KV cache, mp executor, async
 # scheduling, mode=3 FULL_AND_PIECEWISE compilation). See
 # dsv4_fp4_mi355x_vllm.sh for per-flag rationale.
 
@@ -40,6 +40,7 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
 fi
 
 export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_USE_AITER_MOE=1
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
@@ -74,7 +75,7 @@ vllm serve $MODEL --port $PORT \
     --gpu-memory-utilization 0.8 \
     --kv-cache-dtype fp8 \
     --trust-remote-code \
-    --moe-backend triton_unfused \
+    --moe-backend aiter \
     --tokenizer-mode deepseek_v4 \
     --reasoning-parser deepseek_v4 \
     --speculative-config "{\"method\": \"mtp\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index e4c96a179..be05a13e1 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4407,4 +4407,5 @@
     - "Bump DeepSeek-V4-Pro FP4 MI355X single-node vLLM MTP image from vllm/vllm-openai-rocm:v0.22.0 to the latest nightly vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa."
     - "The nightly enables two-stage attention kernels (split-KV decode), which reduce decode attention latency across all concurrency levels."
     - "Employ the AITER MLA attention backend for the DeepSeek-V4 MLA path."
+    - "Switch the MoE backend from triton_unfused to AITER MoE (VLLM_ROCM_USE_AITER_MOE=1 + --moe-backend aiter) for the FP4 experts."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1981