From e3732e726708fc65e9f868b88e8d642e45277b27 Mon Sep 17 00:00:00 2001
From: Hongxia Yang <hongxia.yang@amd.com>
Date: Fri, 3 Jul 2026 14:38:43 +0000
Subject: [PATCH 1/4] [AMD] MiniMax-M3 MXFP8 MI355X vLLM: nightly + AITER-on
 TP4 + emulation linear

Bump minimaxm3-fp8-mi355x-vllm to nightly-09663abde..., enable AITER for
TP-only (vllm#47158 fix) via --moe-backend aiter, use --linear-backend
emulation (beats stock native MXFP8 linear), --max-num-batched-tokens
32768, and a single TP4 conc 1-512 sweep for 1k1k and 8k1k.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../fixed_seq_len/minimaxm3_fp8_mi355x.sh     | 23 +++++++++----------
 configs/amd-master.yaml                       |  9 +++-----
 perf-changelog.yaml                           | 10 ++++++++
 3 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh
index 89c136c27..da7d4a511 100755
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh
@@ -32,11 +32,7 @@ SERVER_LOG=/workspace/server.log
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
 export VLLM_USE_BREAKABLE_CUDAGRAPH=0
 # MI355X mxfp8 recipe (vllm-project/recipes#581): INT6 quick all-reduce plus
-# the router-append shared-experts MoE fusion (vllm-project/vllm#46545). The
-# fusion checks this env directly and runs on both the aiter and native MXFP8
-# MoE paths (it is independent of the AITER master switch, and self-disables
-# under expert parallelism inside the model), so enable it unconditionally.
-# (The AITER master switch itself is set below, gated on expert parallelism.)
+# the router-append shared-experts MoE fusion (vllm-project/vllm#46545).
 export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1
 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6
 
@@ -55,17 +51,17 @@ elif [ "$EP_SIZE" -gt 1 ]; then
     PARALLEL_ARGS+=(--enable-expert-parallel)
 fi
 
-# Gate the AITER master switch on expert parallelism. With EP, the aiter fused
-# MoE path is the auto-selected backend (no --moe-backend override). With EP
-# disabled (TP-only) the AITER master switch produces degenerate MiniMax-M3
-# output, so leave it off and fall back to the native MXFP8 path (the
-# shared-experts fusion set above still applies — it is master-independent).
+# https://github.com/vllm-project/vllm/pull/47158 fix this
 if printf '%s\n' "${PARALLEL_ARGS[@]}" | grep -qxF -- '--enable-expert-parallel'; then
-    export VLLM_ROCM_USE_AITER=1
-else
     export VLLM_ROCM_USE_AITER=0
+else
+    export VLLM_ROCM_USE_AITER=1
 fi
 
+# Larger per-step prefill token budget to improve TP4 throughput at high
+# concurrency. Overridable via env.
+MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-32768}"
+
 start_gpu_monitor
 
 set -x
@@ -74,9 +70,12 @@ vllm serve "$MODEL" --port "$PORT" \
     --block-size 128 \
     --no-enable-prefix-caching \
     --language-model-only \
+    --moe-backend aiter \
     --max-model-len "$MAX_MODEL_LEN" \
+    --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
     --kv-cache-dtype fp8 \
     --attention-backend TRITON_ATTN \
+    --linear-backend emulation \
     --tool-call-parser minimax_m3 \
     --reasoning-parser minimax_m3 \
     --enable-auto-tool-choice > "$SERVER_LOG" 2>&1 &
diff --git a/configs/amd-master.yaml b/configs/amd-master.yaml
index 41f60afda..20fddade6 100644
--- a/configs/amd-master.yaml
+++ b/configs/amd-master.yaml
@@ -2475,7 +2475,7 @@ dsv4-fp4-mi355x-atom-disagg:
 # https://github.com/vllm-project/recipes/commit/2a3728ed9892debfd767a72a58ebc90b33f186e5
 # MXFP8 runs from TP=4 on gfx950; block size 128 is mandatory for MSA.
 minimaxm3-fp8-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1
+  image: vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: mi355x
@@ -2487,14 +2487,11 @@ minimaxm3-fp8-mi355x-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 1, conc-end: 32 }
-      - { tp: 4, conc-start: 4, conc-end: 64 }
-      - { tp: 4, ep: 4, conc-start: 64, conc-end: 512 }
+      - { tp: 4, conc-start: 1, conc-end: 512 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 1, conc-end: 2 }
-      - { tp: 4, conc-start: 2, conc-end: 128 }
+      - { tp: 4, conc-start: 1, conc-end: 512 }
 
 # EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
 # minimaxm3-fp8-mi355x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 3944e67c5..d73d06bbc 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4433,3 +4433,13 @@
     - "Add --online_quant_config with ptpc_fp8 and MoE layer exclusions (*block_sparse_moe) to all scripts."
     - "Replace deprecated AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0 and ATOM_M3_SPARSE_USE_ASM_PA=1 with ATOM_FORCE_ATTN_TRITON=1."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2001
+
+- config-keys:
+    - minimaxm3-fp8-mi355x-vllm
+  description:
+    - "Bump the MiniMax-M3 MXFP8 MI355X vLLM image to nightly-09663abde0f50944a8d5ea30120666024b503faa"
+    - "Use --linear-backend emulation for the MXFP8 dense-linear path (beats the stock nightly native MXFP8 linear: ~+26% tput / -21% TPOT at 8k1k conc1, ~+2-3% at high concurrency)"
+    - "Add --max-num-batched-tokens 32768 (env MAX_NUM_BATCHED_TOKENS) to enlarge the per-step prefill budget and improve TP4 throughput at high concurrency"
+    - "Enable the AITER master switch for TP-only (no-EP) runs via --moe-backend aiter: the earlier degenerate-output issue that forced it off for TP-only is fixed by vllm-project/vllm#47158, so TP4 uses the AITER_MXFP8 MoE path (verified GSM8K 0.9613 flex / 0.9621 strict on this nightly)"
+    - "Simplify both search spaces to a single TP4 conc 1-512 sweep for 1k1k and 8k1k (drop TP8 and TP4/EP4: TP8 has poor throughput/GPU and plain TP4 matches or beats TP4/EP4 at high concurrency)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX

From 8699b80bed42569fcec274a383bad51dc67cf2bb Mon Sep 17 00:00:00 2001
From: Hongxia Yang <hongxia.yang@amd.com>
Date: Fri, 3 Jul 2026 21:16:09 +0000
Subject: [PATCH 2/4] address feedback

Signed-off-by: Hongxia Yang <hongxia.yang@amd.com>
---
 .../single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh | 11 +++++------
 perf-changelog.yaml                                   |  2 +-
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh
index da7d4a511..665f3f944 100755
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh
@@ -51,12 +51,11 @@ elif [ "$EP_SIZE" -gt 1 ]; then
     PARALLEL_ARGS+=(--enable-expert-parallel)
 fi
 
-# https://github.com/vllm-project/vllm/pull/47158 fix this
-if printf '%s\n' "${PARALLEL_ARGS[@]}" | grep -qxF -- '--enable-expert-parallel'; then
-    export VLLM_ROCM_USE_AITER=0
-else
-    export VLLM_ROCM_USE_AITER=1
-fi
+# Previously, VLLM_ROCM_USE_AITER had to be off when EP was enabled.
+# Since https://github.com/vllm-project/vllm/pull/47158 the switch can
+# simply be set to 1. The configs are TP-only, so the conditional check
+# on expert parallelism has been removed.
+export VLLM_ROCM_USE_AITER=1
 
 # Larger per-step prefill token budget to improve TP4 throughput at high
 # concurrency. Overridable via env.
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index d73d06bbc..b47c384f5 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4442,4 +4442,4 @@
     - "Add --max-num-batched-tokens 32768 (env MAX_NUM_BATCHED_TOKENS) to enlarge the per-step prefill budget and improve TP4 throughput at high concurrency"
     - "Enable the AITER master switch for TP-only (no-EP) runs via --moe-backend aiter: the earlier degenerate-output issue that forced it off for TP-only is fixed by vllm-project/vllm#47158, so TP4 uses the AITER_MXFP8 MoE path (verified GSM8K 0.9613 flex / 0.9621 strict on this nightly)"
     - "Simplify both search spaces to a single TP4 conc 1-512 sweep for 1k1k and 8k1k (drop TP8 and TP4/EP4: TP8 has poor throughput/GPU and plain TP4 matches or beats TP4/EP4 at high concurrency)"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2003

From 1a5a9ad08919f4510103b723e1aade24e806e29b Mon Sep 17 00:00:00 2001
From: Hongxia Yang <hongxia.yang@amd.com>
Date: Fri, 3 Jul 2026 22:26:14 +0000
Subject: [PATCH 3/4] [AMD] Port serve-command tuning to MiniMax-M3 MXFP8
 MI355X vLLM MTP recipe

Match minimaxm3_fp8_mi355x.sh: add --moe-backend aiter, --linear-backend
emulation, and --max-num-batched-tokens 32768 to the EAGLE3 MTP recipe
(keeping --speculative-config and the EAGLE3 in-place patch). Make
DRAFT_MODEL env-overridable for local testing. Verified locally on
nightly-09663abde (TP4 conc512, local eagle3 draft): 5120/5120 completed,
eagle3 patch a no-op (nightly natively supports SupportsEagle3).

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh | 27 ++++++++++---------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh
index 50a7d6d9f..9f0c8e83f 100644
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh
@@ -38,7 +38,7 @@ check_env_vars \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME
 
-DRAFT_MODEL="Inferact/MiniMax-M3-EAGLE3"
+DRAFT_MODEL="${DRAFT_MODEL:-Inferact/MiniMax-M3-EAGLE3}"
 
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
@@ -62,11 +62,7 @@ export VLLM_ENGINE_READY_TIMEOUT_S=3600
 # avoids the M3-decode breakable-cudagraph path that previously forced eager.
 export VLLM_USE_BREAKABLE_CUDAGRAPH=0
 # MI355X mxfp8 recipe (vllm-project/recipes#581): INT6 quick all-reduce plus
-# the router-append shared-experts MoE fusion (vllm-project/vllm#46545). The
-# fusion checks this env directly and runs on both the aiter and native MXFP8
-# MoE paths (it is independent of the AITER master switch, and self-disables
-# under expert parallelism inside the model), so enable it unconditionally.
-# (The AITER master switch itself is set below, gated on expert parallelism.)
+# the router-append shared-experts MoE fusion (vllm-project/vllm#46545).
 export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1
 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6
 
@@ -85,20 +81,22 @@ elif [ "$EP_SIZE" -gt 1 ]; then
     PARALLEL_ARGS+=(--enable-expert-parallel)
 fi
 
-# Gate the AITER master switch on expert parallelism. With EP, the aiter fused
-# MoE path is the auto-selected backend (no --moe-backend override). With EP
-# disabled (TP-only) the AITER master switch produces degenerate MiniMax-M3
-# output, so leave it off and fall back to the native MXFP8 path (the
-# shared-experts fusion set above still applies — it is master-independent).
+# Gate the AITER master switch on expert parallelism. With EP, the AITER
+# master switch produces degenerate MiniMax-M3 output, so leave it off;
+# without EP (TP-only) it is enabled.
 if printf '%s\n' "${PARALLEL_ARGS[@]}" | grep -qxF -- '--enable-expert-parallel'; then
-    export VLLM_ROCM_USE_AITER=1
-else
     export VLLM_ROCM_USE_AITER=0
+else
+    export VLLM_ROCM_USE_AITER=1
 fi
 
 # use 3 speculative tokens for all configs for now
 NUM_SPEC_TOKENS=3
 
+# Larger per-step prefill token budget to improve TP4 throughput at high
+# concurrency. Overridable via env.
+MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-32768}"
+
 # [AI generated draft test] Patch the installed AMD MiniMax-M3 model to add the
 # SupportsEagle3 interface (functionstackx/vllm#1). Mirrors nvidia/model.py:
 # adds EagleModelMixin to the inner model + aux-hidden-state emission, and
@@ -193,9 +191,12 @@ vllm serve "$MODEL" --port "$PORT" \
     --block-size 128 \
     --no-enable-prefix-caching \
     --language-model-only \
+    --moe-backend aiter \
     --max-model-len "$MAX_MODEL_LEN" \
+    --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
     --kv-cache-dtype fp8 \
     --attention-backend TRITON_ATTN \
+    --linear-backend emulation \
     --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \
     --tool-call-parser minimax_m3 \
     --reasoning-parser minimax_m3 \

From c69668564c753e04d4a7bc3c5bc41f11c7dd169c Mon Sep 17 00:00:00 2001
From: Hongxia Yang <hongxia.yang@amd.com>
Date: Sat, 4 Jul 2026 01:24:54 +0000
Subject: [PATCH 4/4] [AMD] MiniMax-M3 MXFP8 MI355X vLLM MTP: nightly bump +
 TP4 configs

Make the ported MTP recipe changes take effect: bump the
minimaxm3-fp8-mi355x-vllm-mtp image to nightly-09663abde... and simplify
its search space to a single TP4 conc 1-512 sweep (drop TP8/EP layouts,
matching the non-MTP config). Add the changelog entry.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 configs/amd-master.yaml | 12 +++---------
 perf-changelog.yaml     |  8 ++++++++
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/configs/amd-master.yaml b/configs/amd-master.yaml
index 20fddade6..ad8a2ba2a 100644
--- a/configs/amd-master.yaml
+++ b/configs/amd-master.yaml
@@ -2504,7 +2504,7 @@ minimaxm3-fp8-mi355x-vllm:
 # acceptance dilutes in big batches, and the draft weights + draft KV shave
 # headroom — tp2-ep2 is dropped since its KV headroom was already thin.
 minimaxm3-fp8-mi355x-vllm-mtp:
-  image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1
+  image: vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa
   model: MiniMaxAI/MiniMax-M3-MXFP8
   model-prefix: minimaxm3
   runner: mi355x
@@ -2516,17 +2516,11 @@ minimaxm3-fp8-mi355x-vllm-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
-      - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
-      - { tp: 4, conc-start: 1, conc-end: 2, spec-decoding: mtp }
-      - { tp: 4, conc-start: 32, conc-end: 64, spec-decoding: mtp }
-      - { tp: 4, ep: 4, conc-start: 128, conc-end: 256, spec-decoding: mtp }
+      - { tp: 4, conc-start: 1, conc-end: 512, spec-decoding: mtp }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 4, conc-end: 16, spec-decoding: mtp }
-      - { tp: 4, conc-start: 2, conc-end: 128, spec-decoding: mtp }
-      - { tp: 8, conc-start: 1, conc-end: 1, spec-decoding: mtp }
+      - { tp: 4, conc-start: 1, conc-end: 512, spec-decoding: mtp }
 
 # MiniMax-M3 MXFP4 MI355X vLLM disaggregated (prefill/decode) config.
 minimaxm3-fp4-mi355x-vllm-disagg:
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index b47c384f5..a2acea5d6 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4443,3 +4443,11 @@
     - "Enable the AITER master switch for TP-only (no-EP) runs via --moe-backend aiter: the earlier degenerate-output issue that forced it off for TP-only is fixed by vllm-project/vllm#47158, so TP4 uses the AITER_MXFP8 MoE path (verified GSM8K 0.9613 flex / 0.9621 strict on this nightly)"
     - "Simplify both search spaces to a single TP4 conc 1-512 sweep for 1k1k and 8k1k (drop TP8 and TP4/EP4: TP8 has poor throughput/GPU and plain TP4 matches or beats TP4/EP4 at high concurrency)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2003
+
+- config-keys:
+    - minimaxm3-fp8-mi355x-vllm-mtp
+  description:
+    - "Bump the MiniMax-M3 MXFP8 MI355X vLLM MTP (EAGLE3) image to nightly-09663abde0f50944a8d5ea30120666024b503faa, which natively supports SupportsEagle3 (the in-place EAGLE3 patch is now a no-op) and carries vllm-project/vllm#47158"
+    - "Port the non-MTP serve-command tuning to the MTP recipe: --moe-backend aiter, --linear-backend emulation, --max-num-batched-tokens 32768, and the AITER master switch on for TP-only runs (kept --speculative-config eagle3 with 3 draft tokens)"
+    - "Simplify both search spaces to a single TP4 conc 1-512 sweep for 1k1k and 8k1k (drop TP8 and TP4/EP4, matching the non-MTP entry; verified locally on this nightly at TP4 conc512, 5120/5120 completed)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2003