diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm_mtp.sh index b90d82de9..ce51f8c5d 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm_mtp.sh @@ -12,11 +12,11 @@ set -eo pipefail # prompts silently regresses the acceptance rate. # # All other serving flags mirror the non-MTP MI355X recipe (TP=8, -# VLLM_ROCM_USE_AITER=1, triton_unfused MoE, FP8 KV cache, mp executor, async +# VLLM_ROCM_USE_AITER=1, AITER MoE, FP8 KV cache, mp executor, async # scheduling, mode=3 FULL_AND_PIECEWISE compilation). See # dsv4_fp4_mi355x_vllm.sh for per-flag rationale. -source "$(dirname "$0")/../benchmark_lib.sh" +source "$(dirname "$0")/../../benchmark_lib.sh" check_env_vars \ MODEL \ @@ -40,6 +40,7 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then fi export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_USE_AITER_MOE=1 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} @@ -74,7 +75,7 @@ vllm serve $MODEL --port $PORT \ --gpu-memory-utilization 0.8 \ --kv-cache-dtype fp8 \ --trust-remote-code \ - --moe-backend triton_unfused \ + --moe-backend aiter \ --tokenizer-mode deepseek_v4 \ --reasoning-parser deepseek_v4 \ --speculative-config "{\"method\": \"mtp\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \ diff --git a/configs/amd-master.yaml b/configs/amd-master.yaml index d3f1c70e6..67190d652 100644 --- a/configs/amd-master.yaml +++ b/configs/amd-master.yaml @@ -1939,7 +1939,7 @@ dsv4-fp4-mi355x-vllm: # above ~conc32 (-37% @ conc32). Image reuses the base entry's v0.22.0 ROCm # build, which already contains the MTP commit. dsv4-fp4-mi355x-vllm-mtp: - image: vllm/vllm-openai-rocm:v0.22.0 + image: vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: mi355x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index f13d283d6..af445d635 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4448,3 +4448,12 @@ - "Employ the AITER MLA attention backend for the DeepSeek-V4 MLA path." - "Switch the MoE backend from triton_unfused to AITER MoE (VLLM_ROCM_USE_AITER_MOE=1 + --moe-backend aiter) for the FP4 experts." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1980 + +- config-keys: + - dsv4-fp4-mi355x-vllm-mtp + description: + - "Bump DeepSeek-V4-Pro FP4 MI355X single-node vLLM MTP image from vllm/vllm-openai-rocm:v0.22.0 to the latest nightly vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa." + - "The nightly enables two-stage attention kernels (split-KV decode), which reduce decode attention latency across all concurrency levels." + - "Employ the AITER MLA attention backend for the DeepSeek-V4 MLA path." + - "Switch the MoE backend from triton_unfused to AITER MoE (VLLM_ROCM_USE_AITER_MOE=1 + --moe-backend aiter) for the FP4 experts." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1981