From e3732e726708fc65e9f868b88e8d642e45277b27 Mon Sep 17 00:00:00 2001 From: Hongxia Yang Date: Fri, 3 Jul 2026 14:38:43 +0000 Subject: [PATCH 1/4] [AMD] MiniMax-M3 MXFP8 MI355X vLLM: nightly + AITER-on TP4 + emulation linear Bump minimaxm3-fp8-mi355x-vllm to nightly-09663abde..., enable AITER for TP-only (vllm#47158 fix) via --moe-backend aiter, use --linear-backend emulation (beats stock native MXFP8 linear), --max-num-batched-tokens 32768, and a single TP4 conc 1-512 sweep for 1k1k and 8k1k. Co-authored-by: Cursor --- .../fixed_seq_len/minimaxm3_fp8_mi355x.sh | 23 +++++++++---------- configs/amd-master.yaml | 9 +++----- perf-changelog.yaml | 10 ++++++++ 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh index 89c136c27..da7d4a511 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh @@ -32,11 +32,7 @@ SERVER_LOG=/workspace/server.log export VLLM_ENGINE_READY_TIMEOUT_S=3600 export VLLM_USE_BREAKABLE_CUDAGRAPH=0 # MI355X mxfp8 recipe (vllm-project/recipes#581): INT6 quick all-reduce plus -# the router-append shared-experts MoE fusion (vllm-project/vllm#46545). The -# fusion checks this env directly and runs on both the aiter and native MXFP8 -# MoE paths (it is independent of the AITER master switch, and self-disables -# under expert parallelism inside the model), so enable it unconditionally. -# (The AITER master switch itself is set below, gated on expert parallelism.) +# the router-append shared-experts MoE fusion (vllm-project/vllm#46545). export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 @@ -55,17 +51,17 @@ elif [ "$EP_SIZE" -gt 1 ]; then PARALLEL_ARGS+=(--enable-expert-parallel) fi -# Gate the AITER master switch on expert parallelism. 
With EP, the aiter fused -# MoE path is the auto-selected backend (no --moe-backend override). With EP -# disabled (TP-only) the AITER master switch produces degenerate MiniMax-M3 -# output, so leave it off and fall back to the native MXFP8 path (the -# shared-experts fusion set above still applies — it is master-independent). +# https://github.com/vllm-project/vllm/pull/47158 fix this if printf '%s\n' "${PARALLEL_ARGS[@]}" | grep -qxF -- '--enable-expert-parallel'; then - export VLLM_ROCM_USE_AITER=1 -else export VLLM_ROCM_USE_AITER=0 +else + export VLLM_ROCM_USE_AITER=1 fi +# Larger per-step prefill token budget to improve TP4 throughput at high +# concurrency. Overridable via env. +MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-32768}" + start_gpu_monitor set -x @@ -74,9 +70,12 @@ vllm serve "$MODEL" --port "$PORT" \ --block-size 128 \ --no-enable-prefix-caching \ --language-model-only \ + --moe-backend aiter \ --max-model-len "$MAX_MODEL_LEN" \ + --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \ --kv-cache-dtype fp8 \ --attention-backend TRITON_ATTN \ + --linear-backend emulation \ --tool-call-parser minimax_m3 \ --reasoning-parser minimax_m3 \ --enable-auto-tool-choice > "$SERVER_LOG" 2>&1 & diff --git a/configs/amd-master.yaml b/configs/amd-master.yaml index 41f60afda..20fddade6 100644 --- a/configs/amd-master.yaml +++ b/configs/amd-master.yaml @@ -2475,7 +2475,7 @@ dsv4-fp4-mi355x-atom-disagg: # https://github.com/vllm-project/recipes/commit/2a3728ed9892debfd767a72a58ebc90b33f186e5 # MXFP8 runs from TP=4 on gfx950; block size 128 is mandatory for MSA. 
minimaxm3-fp8-mi355x-vllm: - image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 + image: vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: mi355x @@ -2487,14 +2487,11 @@ minimaxm3-fp8-mi355x-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 1, conc-end: 32 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 64, conc-end: 512 } + - { tp: 4, conc-start: 1, conc-end: 512 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 1, conc-end: 2 } - - { tp: 4, conc-start: 2, conc-end: 128 } + - { tp: 4, conc-start: 1, conc-end: 512 } # EAGLE3 speculative-decoding (spec-decoding: mtp) variant of # minimaxm3-fp8-mi355x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 3944e67c5..d73d06bbc 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4433,3 +4433,13 @@ - "Add --online_quant_config with ptpc_fp8 and MoE layer exclusions (*block_sparse_moe) to all scripts." - "Replace deprecated AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0 and ATOM_M3_SPARSE_USE_ASM_PA=1 with ATOM_FORCE_ATTN_TRITON=1." 
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2001 + +- config-keys: + - minimaxm3-fp8-mi355x-vllm + description: + - "Bump the MiniMax-M3 MXFP8 MI355X vLLM image to nightly-09663abde0f50944a8d5ea30120666024b503faa" + - "Use --linear-backend emulation for the MXFP8 dense-linear path (beats the stock nightly native MXFP8 linear: ~+26% tput / -21% TPOT at 8k1k conc1, ~+2-3% at high concurrency)" + - "Add --max-num-batched-tokens 32768 (env MAX_NUM_BATCHED_TOKENS) to enlarge the per-step prefill budget and improve TP4 throughput at high concurrency" + - "Enable the AITER master switch for TP-only (no-EP) runs via --moe-backend aiter: the earlier degenerate-output issue that forced it off for TP-only is fixed by vllm-project/vllm#47158, so TP4 uses the AITER_MXFP8 MoE path (verified GSM8K 0.9613 flex / 0.9621 strict on this nightly)" + - "Simplify both search spaces to a single TP4 conc 1-512 sweep for 1k1k and 8k1k (drop TP8 and TP4/EP4: TP8 has poor throughput/GPU and plain TP4 matches or beats TP4/EP4 at high concurrency)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX From 8699b80bed42569fcec274a383bad51dc67cf2bb Mon Sep 17 00:00:00 2001 From: Hongxia Yang Date: Fri, 3 Jul 2026 21:16:09 +0000 Subject: [PATCH 2/4] address feedback Signed-off-by: Hongxia Yang --- .../single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh | 11 +++++------ perf-changelog.yaml | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh index da7d4a511..665f3f944 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh @@ -51,12 +51,11 @@ elif [ "$EP_SIZE" -gt 1 ]; then PARALLEL_ARGS+=(--enable-expert-parallel) fi -# https://github.com/vllm-project/vllm/pull/47158 fix this -if printf '%s\n' "${PARALLEL_ARGS[@]}" | grep -qxF -- 
'--enable-expert-parallel'; then - export VLLM_ROCM_USE_AITER=0 -else - export VLLM_ROCM_USE_AITER=1 -fi +# Previously, when EP was on, VLLM_ROCM_USE_AITER had to be off. +# After https://github.com/vllm-project/vllm/pull/47158, +# it can simply be set to VLLM_ROCM_USE_AITER=1. +# As the configs are TP-only, the conditional check is removed. +export VLLM_ROCM_USE_AITER=1 # Larger per-step prefill token budget to improve TP4 throughput at high # concurrency. Overridable via env. diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d73d06bbc..b47c384f5 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4442,4 +4442,4 @@ - "Add --max-num-batched-tokens 32768 (env MAX_NUM_BATCHED_TOKENS) to enlarge the per-step prefill budget and improve TP4 throughput at high concurrency" - "Enable the AITER master switch for TP-only (no-EP) runs via --moe-backend aiter: the earlier degenerate-output issue that forced it off for TP-only is fixed by vllm-project/vllm#47158, so TP4 uses the AITER_MXFP8 MoE path (verified GSM8K 0.9613 flex / 0.9621 strict on this nightly)" - "Simplify both search spaces to a single TP4 conc 1-512 sweep for 1k1k and 8k1k (drop TP8 and TP4/EP4: TP8 has poor throughput/GPU and plain TP4 matches or beats TP4/EP4 at high concurrency)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2003 From 1a5a9ad08919f4510103b723e1aade24e806e29b Mon Sep 17 00:00:00 2001 From: Hongxia Yang Date: Fri, 3 Jul 2026 22:26:14 +0000 Subject: [PATCH 3/4] [AMD] Port serve-command tuning to MiniMax-M3 MXFP8 MI355X vLLM MTP recipe Match minimaxm3_fp8_mi355x.sh: add --moe-backend aiter, --linear-backend emulation, and --max-num-batched-tokens 32768 to the EAGLE3 MTP recipe (keeping --speculative-config and the EAGLE3 in-place patch). Make DRAFT_MODEL env-overridable for local testing. 
Verified locally on nightly-09663abde (TP4 conc512, local eagle3 draft): 5120/5120 completed, eagle3 patch a no-op (nightly natively supports SupportsEagle3). Co-authored-by: Cursor --- .../fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh index 50a7d6d9f..9f0c8e83f 100644 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh @@ -38,7 +38,7 @@ check_env_vars \ RANDOM_RANGE_RATIO \ RESULT_FILENAME -DRAFT_MODEL="Inferact/MiniMax-M3-EAGLE3" +DRAFT_MODEL="${DRAFT_MODEL:-Inferact/MiniMax-M3-EAGLE3}" if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -62,11 +62,7 @@ export VLLM_ENGINE_READY_TIMEOUT_S=3600 # avoids the M3-decode breakable-cudagraph path that previously forced eager. export VLLM_USE_BREAKABLE_CUDAGRAPH=0 # MI355X mxfp8 recipe (vllm-project/recipes#581): INT6 quick all-reduce plus -# the router-append shared-experts MoE fusion (vllm-project/vllm#46545). The -# fusion checks this env directly and runs on both the aiter and native MXFP8 -# MoE paths (it is independent of the AITER master switch, and self-disables -# under expert parallelism inside the model), so enable it unconditionally. -# (The AITER master switch itself is set below, gated on expert parallelism.) +# the router-append shared-experts MoE fusion (vllm-project/vllm#46545). export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 @@ -85,20 +81,22 @@ elif [ "$EP_SIZE" -gt 1 ]; then PARALLEL_ARGS+=(--enable-expert-parallel) fi -# Gate the AITER master switch on expert parallelism. With EP, the aiter fused -# MoE path is the auto-selected backend (no --moe-backend override). 
With EP -# disabled (TP-only) the AITER master switch produces degenerate MiniMax-M3 -# output, so leave it off and fall back to the native MXFP8 path (the -# shared-experts fusion set above still applies — it is master-independent). +# Gate the AITER master switch on expert parallelism. With EP, +# the AITER master switch produces degenerate MiniMax-M3 +# output, so leave it off. if printf '%s\n' "${PARALLEL_ARGS[@]}" | grep -qxF -- '--enable-expert-parallel'; then - export VLLM_ROCM_USE_AITER=1 -else export VLLM_ROCM_USE_AITER=0 +else + export VLLM_ROCM_USE_AITER=1 fi # use 3 speculative tokens for all configs for now NUM_SPEC_TOKENS=3 +# Larger per-step prefill token budget to improve TP4 throughput at high +# concurrency. Overridable via env. +MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-32768}" + # [AI generated draft test] Patch the installed AMD MiniMax-M3 model to add the # SupportsEagle3 interface (functionstackx/vllm#1). Mirrors nvidia/model.py: # adds EagleModelMixin to the inner model + aux-hidden-state emission, and @@ -193,9 +191,12 @@ vllm serve "$MODEL" --port "$PORT" \ --block-size 128 \ --no-enable-prefix-caching \ --language-model-only \ + --moe-backend aiter \ --max-model-len "$MAX_MODEL_LEN" \ + --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \ --kv-cache-dtype fp8 \ --attention-backend TRITON_ATTN \ + --linear-backend emulation \ --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \ --tool-call-parser minimax_m3 \ --reasoning-parser minimax_m3 \ From c69668564c753e04d4a7bc3c5bc41f11c7dd169c Mon Sep 17 00:00:00 2001 From: Hongxia Yang Date: Sat, 4 Jul 2026 01:24:54 +0000 Subject: [PATCH 4/4] [AMD] MiniMax-M3 MXFP8 MI355X vLLM MTP: nightly bump + TP4 configs Make the ported MTP recipe changes take effect: bump the minimaxm3-fp8-mi355x-vllm-mtp image to nightly-09663abde... 
and simplify its search space to a single TP4 conc 1-512 sweep (drop TP8/EP layouts, matching the non-MTP config). Add the changelog entry. Co-authored-by: Cursor --- configs/amd-master.yaml | 12 +++--------- perf-changelog.yaml | 8 ++++++++ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/configs/amd-master.yaml b/configs/amd-master.yaml index 20fddade6..ad8a2ba2a 100644 --- a/configs/amd-master.yaml +++ b/configs/amd-master.yaml @@ -2504,7 +2504,7 @@ minimaxm3-fp8-mi355x-vllm: # acceptance dilutes in big batches, and the draft weights + draft KV shave # headroom — tp2-ep2 is dropped since its KV headroom was already thin. minimaxm3-fp8-mi355x-vllm-mtp: - image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 + image: vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa model: MiniMaxAI/MiniMax-M3-MXFP8 model-prefix: minimaxm3 runner: mi355x @@ -2516,17 +2516,11 @@ minimaxm3-fp8-mi355x-vllm-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp } - - { tp: 4, conc-start: 1, conc-end: 2, spec-decoding: mtp } - - { tp: 4, conc-start: 32, conc-end: 64, spec-decoding: mtp } - - { tp: 4, ep: 4, conc-start: 128, conc-end: 256, spec-decoding: mtp } + - { tp: 4, conc-start: 1, conc-end: 512, spec-decoding: mtp } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 16, spec-decoding: mtp } - - { tp: 4, conc-start: 2, conc-end: 128, spec-decoding: mtp } - - { tp: 8, conc-start: 1, conc-end: 1, spec-decoding: mtp } + - { tp: 4, conc-start: 1, conc-end: 512, spec-decoding: mtp } # MiniMax-M3 MXFP4 MI355X vLLM disaggregated (prefill/decode) config. 
minimaxm3-fp4-mi355x-vllm-disagg: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index b47c384f5..a2acea5d6 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4443,3 +4443,11 @@ - "Enable the AITER master switch for TP-only (no-EP) runs via --moe-backend aiter: the earlier degenerate-output issue that forced it off for TP-only is fixed by vllm-project/vllm#47158, so TP4 uses the AITER_MXFP8 MoE path (verified GSM8K 0.9613 flex / 0.9621 strict on this nightly)" - "Simplify both search spaces to a single TP4 conc 1-512 sweep for 1k1k and 8k1k (drop TP8 and TP4/EP4: TP8 has poor throughput/GPU and plain TP4 matches or beats TP4/EP4 at high concurrency)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2003 + +- config-keys: + - minimaxm3-fp8-mi355x-vllm-mtp + description: + - "Bump the MiniMax-M3 MXFP8 MI355X vLLM MTP (EAGLE3) image to nightly-09663abde0f50944a8d5ea30120666024b503faa, which natively supports SupportsEagle3 (the in-place EAGLE3 patch is now a no-op) and carries vllm-project/vllm#47158" + - "Port the non-MTP serve-command tuning to the MTP recipe: --moe-backend aiter, --linear-backend emulation, --max-num-batched-tokens 32768, and the AITER master switch on for TP-only runs (kept --speculative-config eagle3 with 3 draft tokens)" + - "Simplify both search spaces to a single TP4 conc 1-512 sweep for 1k1k and 8k1k (drop TP8 and TP4/EP4, matching the non-MTP entry; verified locally on this nightly at TP4 conc512, 5120/5120 completed)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2003