From b2a5cd17734f81959f9e9e3bf4c4c2d21ea80043 Mon Sep 17 00:00:00 2001
From: Oseltamivir <58582368+Oseltamivir@users.noreply.github.com>
Date: Sat, 4 Jul 2026 06:59:16 +0800
Subject: [PATCH] feat(collectivex): gate and publish v1 artifacts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Require an explicit V1 release tag and the locked full-matrix digest before a sweep can become publication input. Promote exactly three successful tagged runs from one source SHA in disposable runner storage, then retain only the verified sanitized NDJSON artifact for JIT delivery.
中文:为 CollectiveX V1 增加显式发布门禁与产物发布流程。只有显式选择 V1 release tag 且匹配固定完整矩阵摘要的扫描运行才能作为发布输入。发布工作流只接收来自同一 source SHA 的三个成功运行,在 runner 临时存储中完成 promotion,并仅保留通过验证和清理的 NDJSON 产物供前端即时读取。
---
.github/workflows/collectivex-publish.yml | 197 +
.github/workflows/collectivex-sweep.yml | 464 ++-
experimental/CollectiveX/.gitignore | 15 +
experimental/CollectiveX/README.md | 160 +
experimental/CollectiveX/README_zh.md | 154 +
experimental/CollectiveX/artifact_safety.py | 212 ++
experimental/CollectiveX/capability.py | 232 ++
experimental/CollectiveX/configs/suites.yaml | 36 +
.../CollectiveX/configs/workloads.yaml | 9 +
experimental/CollectiveX/contracts.py | 2823 ++++++++++++++
experimental/CollectiveX/docs/methodology.md | 305 ++
.../CollectiveX/docs/methodology_zh.md | 297 ++
experimental/CollectiveX/identity.py | 201 +
.../CollectiveX/launchers/launch_gb-nv.sh | 110 +
.../CollectiveX/launchers/launch_mi-amds.sh | 160 +
.../launchers/launch_single-slurm.sh | 154 +
experimental/CollectiveX/publisher.py | 3284 +++++++++++++++++
experimental/CollectiveX/requirements.txt | 8 +
experimental/CollectiveX/runtime/common.sh | 2339 ++++++++++++
.../CollectiveX/runtime/run_in_container.sh | 1087 ++++++
.../schemas/channel-v1.schema.json | 23 +
.../schemas/private-bundle-v1.schema.json | 162 +
.../schemas/public-dataset-v1.schema.json | 606 +++
.../schemas/raw-case-v1.schema.json | 1199 ++++++
.../schemas/samples-v1.schema.json | 80 +
.../schemas/terminal-outcome-v1.schema.json | 289 ++
experimental/CollectiveX/source_archive.py | 349 ++
experimental/CollectiveX/summarize.py | 105 +
experimental/CollectiveX/sweep_matrix.py | 1031 ++++++
experimental/CollectiveX/tests/ep_deepep.py | 294 ++
.../CollectiveX/tests/ep_deepep_hybrid.py | 457 +++
.../CollectiveX/tests/ep_deepep_v2.py | 528 +++
experimental/CollectiveX/tests/ep_harness.py | 1780 +++++++++
experimental/CollectiveX/tests/ep_mori.py | 359 ++
experimental/CollectiveX/tests/ep_nccl.py | 186 +
experimental/CollectiveX/tests/ep_uccl.py | 405 ++
experimental/CollectiveX/tests/eplb.py | 199 +
.../CollectiveX/tests/make_workloads.py | 119 +
experimental/CollectiveX/tests/routing.py | 191 +
experimental/CollectiveX/tests/run_ep.py | 427 +++
.../tests/test_deepep_v2_contract.py | 2151 +++++++++++
.../CollectiveX/tests/test_publisher.py | 2418 ++++++++++++
.../tests/test_sampling_contract.py | 3213 ++++++++++++++++
experimental/CollectiveX/tests/workload.py | 358 ++
44 files changed, 29085 insertions(+), 91 deletions(-)
create mode 100644 .github/workflows/collectivex-publish.yml
create mode 100644 experimental/CollectiveX/.gitignore
create mode 100644 experimental/CollectiveX/README.md
create mode 100644 experimental/CollectiveX/README_zh.md
create mode 100644 experimental/CollectiveX/artifact_safety.py
create mode 100644 experimental/CollectiveX/capability.py
create mode 100644 experimental/CollectiveX/configs/suites.yaml
create mode 100644 experimental/CollectiveX/configs/workloads.yaml
create mode 100644 experimental/CollectiveX/contracts.py
create mode 100644 experimental/CollectiveX/docs/methodology.md
create mode 100644 experimental/CollectiveX/docs/methodology_zh.md
create mode 100644 experimental/CollectiveX/identity.py
create mode 100644 experimental/CollectiveX/launchers/launch_gb-nv.sh
create mode 100644 experimental/CollectiveX/launchers/launch_mi-amds.sh
create mode 100644 experimental/CollectiveX/launchers/launch_single-slurm.sh
create mode 100644 experimental/CollectiveX/publisher.py
create mode 100644 experimental/CollectiveX/requirements.txt
create mode 100644 experimental/CollectiveX/runtime/common.sh
create mode 100644 experimental/CollectiveX/runtime/run_in_container.sh
create mode 100644 experimental/CollectiveX/schemas/channel-v1.schema.json
create mode 100644 experimental/CollectiveX/schemas/private-bundle-v1.schema.json
create mode 100644 experimental/CollectiveX/schemas/public-dataset-v1.schema.json
create mode 100644 experimental/CollectiveX/schemas/raw-case-v1.schema.json
create mode 100644 experimental/CollectiveX/schemas/samples-v1.schema.json
create mode 100644 experimental/CollectiveX/schemas/terminal-outcome-v1.schema.json
create mode 100644 experimental/CollectiveX/source_archive.py
create mode 100644 experimental/CollectiveX/summarize.py
create mode 100644 experimental/CollectiveX/sweep_matrix.py
create mode 100644 experimental/CollectiveX/tests/ep_deepep.py
create mode 100644 experimental/CollectiveX/tests/ep_deepep_hybrid.py
create mode 100644 experimental/CollectiveX/tests/ep_deepep_v2.py
create mode 100644 experimental/CollectiveX/tests/ep_harness.py
create mode 100644 experimental/CollectiveX/tests/ep_mori.py
create mode 100644 experimental/CollectiveX/tests/ep_nccl.py
create mode 100644 experimental/CollectiveX/tests/ep_uccl.py
create mode 100644 experimental/CollectiveX/tests/eplb.py
create mode 100644 experimental/CollectiveX/tests/make_workloads.py
create mode 100644 experimental/CollectiveX/tests/routing.py
create mode 100644 experimental/CollectiveX/tests/run_ep.py
create mode 100644 experimental/CollectiveX/tests/test_deepep_v2_contract.py
create mode 100644 experimental/CollectiveX/tests/test_publisher.py
create mode 100644 experimental/CollectiveX/tests/test_sampling_contract.py
create mode 100644 experimental/CollectiveX/tests/workload.py
diff --git a/.github/workflows/collectivex-publish.yml b/.github/workflows/collectivex-publish.yml
new file mode 100644
index 0000000000..38143123b9
--- /dev/null
+++ b/.github/workflows/collectivex-publish.yml
@@ -0,0 +1,197 @@
+name: CollectiveX Publish V1 # Manually dispatched workflow that turns three sweep runs into one publication artifact.
+
+on:
+ workflow_dispatch: # Only trigger declared here; the workflow never starts on push or schedule.
+ inputs:
+ run_ids:
+ description: Three successful CollectiveX Sweep run IDs, comma-separated
+ required: true
+ type: string
+
+permissions: # Read-only token scopes.
+ actions: read # Used by the gh API and artifact download calls in the steps below.
+ contents: read
+
+concurrency:
+ group: collectivex-publish-v1 # Fixed group name, so all dispatches of this workflow share one slot.
+ cancel-in-progress: false # A later dispatch waits instead of cancelling a running publish.
+
+jobs:
+ publish:
+ runs-on: ubuntu-latest
+ timeout-minutes: 30
+ env:
+ GH_TOKEN: ${{ github.token }} # Token picked up by the gh CLI invocations in the steps.
+ RUN_IDS: ${{ inputs.run_ids }} # Handed to the scripts through the environment rather than interpolated into shell text.
+ steps:
+ - name: Verify source runs # Requires exactly three unique decimal run IDs, each a completed successful 'CollectiveX Sweep' run on branch 'collectivex', all sharing one head SHA.
+ id: runs # Exposes outputs.source_sha for the checkout step.
+ env:
+ REPOSITORY: ${{ github.repository }}
+ run: | # Appends one run_id<TAB>attempt<TAB>sha row per run to $RUNNER_TEMP/collectivex-runs.tsv for the build step.
+ set -euo pipefail
+ IFS=',' read -r -a run_ids <<< "$RUN_IDS"
+ [ "${#run_ids[@]}" -eq 3 ] || {
+ echo 'run_ids must contain exactly three IDs' >&2
+ exit 1
+ }
+ [ "$(printf '%s\n' "${run_ids[@]}" | sort -u | wc -l)" -eq 3 ] || {
+ echo 'run_ids must be unique' >&2
+ exit 1
+ }
+
+ : > "$RUNNER_TEMP/collectivex-runs.tsv"
+ source_sha=''
+ for run_id in "${run_ids[@]}"; do
+ [[ "$run_id" =~ ^[1-9][0-9]*$ ]] || {
+ echo 'run_ids contains a non-decimal ID' >&2
+ exit 1
+ }
+ metadata=$(gh api "repos/$REPOSITORY/actions/runs/$run_id")
+ name=$(jq -r '.name' <<< "$metadata")
+ path=$(jq -r '.path' <<< "$metadata")
+ branch=$(jq -r '.head_branch' <<< "$metadata")
+ status=$(jq -r '.status' <<< "$metadata")
+ conclusion=$(jq -r '.conclusion' <<< "$metadata")
+ sha=$(jq -r '.head_sha' <<< "$metadata")
+ attempt=$(jq -r '.run_attempt' <<< "$metadata")
+ [ "$name" = 'CollectiveX Sweep' ] \
+ && [ "$path" = '.github/workflows/collectivex-sweep.yml' ] \
+ && [ "$branch" = 'collectivex' ] \
+ && [ "$status" = 'completed' ] \
+ && [ "$conclusion" = 'success' ] \
+ && [[ "$sha" =~ ^[0-9a-f]{40}$ ]] \
+ && [[ "$attempt" =~ ^[1-9][0-9]*$ ]] || {
+ echo "run $run_id is not an eligible CollectiveX V1 sweep" >&2
+ exit 1
+ }
+ if [ -z "$source_sha" ]; then
+ source_sha="$sha"
+ else
+ [ "$sha" = "$source_sha" ] || {
+ echo 'source runs do not share one source SHA' >&2
+ exit 1
+ }
+ fi
+ printf '%s\t%s\t%s\n' "$run_id" "$attempt" "$sha" \
+ >> "$RUNNER_TEMP/collectivex-runs.tsv"
+ done
+ echo "source_sha=$source_sha" >> "$GITHUB_OUTPUT"
+
+ - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v5.0.0
+ with:
+ ref: ${{ steps.runs.outputs.source_sha }} # Publisher code is taken from the SHA the verified runs share, not from the dispatching ref.
+ clean: true
+ persist-credentials: false # Do not leave the token in the checkout's git config.
+
+ - name: Install publisher dependencies # Installs from the requirements file of the checked-out source SHA.
+ run: python3 -m pip install --quiet -r experimental/CollectiveX/requirements.txt
+
+ - name: Build promoted publication # Ingests every verified run into a throwaway store, promotes the bundles, then stages and checks the NDJSON file.
+ env:
+ REPOSITORY: ${{ github.repository }}
+ run: |
+ set -euo pipefail
+ store="$RUNNER_TEMP/collectivex-publisher"
+ downloads="$RUNNER_TEMP/collectivex-downloads"
+ output="$RUNNER_TEMP/collectivex-publication"
+ umask 027
+ mkdir -m 750 "$store" # no -p: fail if the directory already exists
+ mkdir -m 750 "$downloads" "$output"
+ : > "$RUNNER_TEMP/collectivex-bundles.txt"
+
+ while IFS=$'\t' read -r -u 3 run_id attempt source_sha; do # rows come in on fd 3 so gh/python/jq below cannot consume them through stdin
+ run_dir="$downloads/$run_id"
+ mkdir -m 750 "$run_dir"
+ gh run download "$run_id" --repo "$REPOSITORY" --dir "$run_dir"
+ matrix="$run_dir/cxsweep-matrix-$run_id-$attempt/matrix_full.json"
+ marker="$run_dir/cxrelease-v1-$run_id-$attempt/release.json"
+ [ -f "$matrix" ] || {
+ echo "run $run_id is missing its exact matrix artifact" >&2
+ exit 1
+ }
+ [ -f "$marker" ] || {
+ echo "run $run_id is not tagged for V1 publication" >&2
+ exit 1
+ }
+ matrix_sha=$(sha256sum "$matrix" | cut -d' ' -f1) # digest of the matrix actually downloaded; the marker must agree with it
+ jq -e \
+ --arg run_id "$run_id" \
+ --arg attempt "$attempt" \
+ --arg source_sha "$source_sha" \
+ --arg matrix_sha "$matrix_sha" \
+ 'keys == ["format","matrix_sha256","release_tag","run_attempt","run_id","source_sha"]
+ and .format == "collectivex.release-tag.v1"
+ and .release_tag == "v1"
+ and .run_id == $run_id
+ and .run_attempt == $attempt
+ and .source_sha == $source_sha
+ and .matrix_sha256 == $matrix_sha' \
+ "$marker" >/dev/null || {
+ echo "run $run_id has an invalid V1 release marker" >&2
+ exit 1
+ }
+
+ mapfile -t artifacts < <(
+ find "$run_dir" -mindepth 1 -maxdepth 1 -type d \
+ \( -name "cxshard-*-$run_id-$attempt" \
+ -o -name "cxunsupported-$run_id-$attempt" \) -print | sort
+ )
+ [ "${#artifacts[@]}" -gt 0 ] || {
+ echo "run $run_id has no result artifacts" >&2
+ exit 1
+ }
+ artifact_args=()
+ for artifact in "${artifacts[@]}"; do
+ artifact_args+=(--artifact "$artifact")
+ done
+ result=$(
+ python3 experimental/CollectiveX/publisher.py --store-root "$store" ingest \
+ --matrix "$matrix" \
+ "${artifact_args[@]}" \
+ --repository "$REPOSITORY" \
+ --run-id "$run_id" \
+ --run-attempt "$attempt" \
+ --source-sha "$source_sha"
+ )
+ bundle_id=$(jq -er '.bundle_id' <<< "$result")
+ printf '%s\n' "$bundle_id" >> "$RUNNER_TEMP/collectivex-bundles.txt"
+ done 3< "$RUNNER_TEMP/collectivex-runs.tsv"
+
+ mapfile -t bundle_ids < "$RUNNER_TEMP/collectivex-bundles.txt"
+ promote_args=()
+ for bundle_id in "${bundle_ids[@]}"; do
+ promote_args+=(--bundle "$bundle_id")
+ done
+ result=$(
+ python3 experimental/CollectiveX/publisher.py --store-root "$store" promote \
+ "${promote_args[@]}"
+ )
+ dataset_id=$(jq -er '.dataset_sha256' <<< "$result")
+ dataset="$store/public/datasets/$dataset_id/dataset.json"
+ [ -f "$dataset" ] || {
+ echo 'publisher did not install the promoted dataset' >&2
+ exit 1
+ }
+ publication="$output/collectivex_public_v1_$dataset_id.ndjson"
+ cp -- "$dataset" "$publication"
+ python3 experimental/CollectiveX/artifact_safety.py "$publication" # safety check runs on the exact file that gets uploaded
+ python3 experimental/CollectiveX/publisher.py --store-root "$store" verify \
+ --channel dev-latest "${promote_args[@]}"
+ sha256sum "$publication"
+ {
+ echo '## CollectiveX V1 publication'
+ echo
+ echo "Dataset: \`$dataset_id\`"
+ echo
+ echo 'Source runs:'
+ sed 's/^/- `/' "$RUNNER_TEMP/collectivex-runs.tsv" | sed 's/$/`/'
+ } >> "$GITHUB_STEP_SUMMARY"
+
+ - name: Upload JIT publication artifact # Only the staged .ndjson file is uploaded; the store and downloads stay in runner temp.
+ uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+ with:
+ name: cxpublication-v1-${{ github.run_id }}-${{ github.run_attempt }}
+ path: ${{ runner.temp }}/collectivex-publication/*.ndjson
+ if-no-files-found: error # A missing publication file fails the job instead of uploading nothing.
+ retention-days: 90
diff --git a/.github/workflows/collectivex-sweep.yml b/.github/workflows/collectivex-sweep.yml
index b56bc285aa..2837a6da10 100644
--- a/.github/workflows/collectivex-sweep.yml
+++ b/.github/workflows/collectivex-sweep.yml
@@ -1,40 +1,46 @@
# CollectiveX Sweep — one structured run instead of thousands of dispatches.
#
-# Shape (mirrors the InferenceX CI tracker): setup -> sweep (a MATRIX job = "a job with other jobs
-# in it") -> aggregate (the collector "at the end"). The matrix unit is a SHARD = one allocation that
-# sweeps many cases sharing (sku, backend, mode, resource) — generate_matrix's own grouping, chunked
-# so no cell exceeds the job budget. Each cell emits a handful of per-case JSONs; the aggregate job
-# collects every shard into ONE line-delimited file (results/aggregate/*.ndjson) so there aren't
-# thousands of individual result files. Run once per backend (deepep / uccl / flashinfer /
-# deepep-hybrid / nccl-ep, + deepep_v2) for full parity.
+# Shape: setup -> sweep. The matrix unit is a shard: one allocation that sweeps
+# cases sharing (sku, backend, nodes). Each cell uploads its privacy-checked raw
+# result JSONs. The isolated v1 publisher consumes downloaded shards separately.
name: CollectiveX Sweep
+permissions:
+ contents: read
on:
workflow_dispatch:
inputs:
backend:
- description: EP library to sweep (deepep matrix is remapped onto the others, capability-filtered)
+ description: "EP library to sweep — 'all' runs every EP backend in one matrix"
type: choice
- default: deepep
- options: [deepep, uccl, flashinfer, deepep-hybrid, nccl-ep]
- deepep_v2:
- description: DeepEP V2 from-source kernels (kernel_gen=v2; deepep backend only)
- type: boolean
- default: false
+ default: all
+ options: [all, deepep, deepep-v2, uccl, deepep-hybrid, mori, nccl-ep]
suites:
description: "'all' or comma-list of suite names"
type: string
default: all
only_sku:
- description: Restrict to one SKU (h100-dgxc|h200|b300|b200-dgxc|gb200|gb300|mi355x); blank = all
+ description: Restrict to one GHA runner pool (h100-dgxc|h200-dgxc|b300|b200-dgxc|gb200|gb300|mi325x|mi355x); blank = all
+ type: string
+ default: ''
+ min_nodes:
+ description: Keep only shards with at least this node/tray count (2 keeps every EP16 and GB EP8; blank = all)
+ type: string
+ default: ''
+ max_nodes:
+ description: Keep only shards with at most this node/tray count (1 keeps non-GB EP8; blank = all)
type: string
default: ''
max_cases:
- description: Max cases per shard cell (chunk larger shards)
+ description: Max cases per shard cell before chunking into another GHA job (128 = no chunking for current suites)
type: string
- default: '14'
-
+ default: '128'
+ release_tag:
+ description: Publication gate; unversioned runs are diagnostic and cannot be published
+ type: choice
+ default: unversioned
+ options: [unversioned, v1]
concurrency:
- group: cx-sweep-${{ github.ref }}-${{ inputs.backend }}-${{ inputs.deepep_v2 }}-${{ inputs.only_sku }}
+ group: cx-sweep-${{ github.ref }}-${{ inputs.release_tag }}-${{ inputs.backend }}-${{ inputs.only_sku }}
cancel-in-progress: false
jobs:
@@ -46,26 +52,150 @@ jobs:
n: ${{ steps.gen.outputs.n }}
steps:
- uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v5.0.0
- with: { clean: true }
- - run: pip install --quiet pyyaml
+ with: { clean: true, persist-credentials: false }
+ - name: Install matrix dependencies
+ run: python3 -m pip install --quiet PyYAML==6.0.2
 - id: gen
 working-directory: experimental/CollectiveX
+ env: # Dispatch inputs reach the script as environment variables, not as text spliced into the shell source.
+ INPUT_BACKEND: ${{ inputs.backend }}
+ INPUT_SUITES: ${{ inputs.suites }}
+ INPUT_ONLY_SKU: ${{ inputs.only_sku }}
+ INPUT_MIN_NODES: ${{ inputs.min_nodes }}
+ INPUT_MAX_NODES: ${{ inputs.max_nodes }}
+ INPUT_MAX_CASES: ${{ inputs.max_cases }}
+ COLLECTIVEX_SOURCE_SHA: ${{ github.sha }}
+ COLLECTIVEX_ARTIFACT_NAME: cxunsupported-${{ github.run_id }}-${{ github.run_attempt }}
+ COLLECTIVEX_EXECUTION_ID: ${{ github.run_id }}_${{ github.run_attempt }}_unsupported
+ run: | # Writes matrix_full.json, then sets outputs matrix (case_ids stripped), n, source_backends and unsupported_n; emits unsupported outcome files when unsupported_n > 0.
+ set -euo pipefail
+ args=(--suites "$INPUT_SUITES" --max-cases "$INPUT_MAX_CASES")
+ case "$INPUT_BACKEND" in
+ all) args+=(--backends all) ;;
+ *) args+=(--backend "$INPUT_BACKEND") ;;
+ esac
+ [ -n "$INPUT_ONLY_SKU" ] && args+=(--only-sku "$INPUT_ONLY_SKU")
+ [ -n "$INPUT_MIN_NODES" ] && args+=(--min-nodes "$INPUT_MIN_NODES")
+ [ -n "$INPUT_MAX_NODES" ] && args+=(--max-nodes "$INPUT_MAX_NODES")
+ python3 sweep_matrix.py "${args[@]}" --out matrix_full.json >/dev/null
+ python3 artifact_safety.py matrix_full.json
+ SLIM=$(python3 -c "import json;m=json.load(open('matrix_full.json'));print(json.dumps({'include':[{k:v for k,v in x.items() if k!='case_ids'} for x in m['include']]}))")
+ {
+ echo "matrix=$SLIM"
+ echo "n=$(python3 -c "import json;print(len(json.load(open('matrix_full.json'))['include']))")"
+ echo "source_backends=$(python3 -c "import json;m=json.load(open('matrix_full.json'));print(' '.join(sorted({x['backend'] for x in m['include']} & {'deepep-v2','deepep-hybrid'})))")"
+ } >> "$GITHUB_OUTPUT"
+ unsupported_n=$(python3 -c "import json;m=json.load(open('matrix_full.json'));print(sum(x['disposition']=='unsupported' for x in m['requested_cases']))")
+ echo "unsupported_n=$unsupported_n" >> "$GITHUB_OUTPUT"
+ if [ "$unsupported_n" -gt 0 ]; then
+ python3 sweep_matrix.py --emit-unsupported-from matrix_full.json \
+ --out-dir unsupported
+ fi
+ python3 -c "import json;m=json.load(open('matrix_full.json'));r=m['requested_cases'];print('shard-cells:',len(m['include']),'runnable:',sum(x['disposition']=='runnable' for x in r),'unsupported:',sum(x['disposition']=='unsupported' for x in r))"
+ - name: Prepare pinned backend source archive
+ if: ${{ steps.gen.outputs.source_backends != '' }}
+ working-directory: experimental/CollectiveX
+ env:
+ SOURCE_BACKENDS: ${{ steps.gen.outputs.source_backends }}
+ COLLECTIVEX_EXECUTION_ID: ${{ github.run_id }}_${{ github.run_attempt }}_sources
run: |
set -euo pipefail
- ov=""; [ "${{ inputs.backend }}" != "deepep" ] && ov="--backend ${{ inputs.backend }}"
- v2=""; [ "${{ inputs.deepep_v2 }}" = "true" ] && v2="--deepep-v2"
- os=""; [ -n "${{ inputs.only_sku }}" ] && os="--only-sku ${{ inputs.only_sku }}"
- # full matrix (with cases) -> artifact for the cells; slim (no cases) -> the strategy output.
- python3 sweep_matrix.py --suites "${{ inputs.suites }}" --max-cases "${{ inputs.max_cases }}" $ov $v2 $os --out matrix_full.json >/dev/null
- SLIM=$(python3 -c "import json;m=json.load(open('matrix_full.json'));print(json.dumps({'include':[{k:v for k,v in x.items() if k!='cases'} for x in m['include']]}))")
- echo "matrix=$SLIM" >> "$GITHUB_OUTPUT"
- echo "n=$(python3 -c "import json;print(len(json.load(open('matrix_full.json'))['include']))")" >> "$GITHUB_OUTPUT"
- python3 -c "import json;m=json.load(open('matrix_full.json'));print('shard-cells:',len(m['include']),'cases:',sum(x['n'] for x in m['include']))"
+ source runtime/common.sh
+ work="$RUNNER_TEMP/collectivex-backend-sources"
+ archive="$RUNNER_TEMP/collectivex-backend-sources.tar"
+ rm -rf -- "$work" "$archive"
+ umask 077
+ mkdir -m 700 "$work"
+ mkdir -p "$work/experimental/CollectiveX"
+ read -r -a backends <<< "$SOURCE_BACKENDS"
+ [ "${#backends[@]}" -gt 0 ]
+ for backend in "${backends[@]}"; do
+ cx_prepare_backend_source "$work" "$backend"
+ done
+ cx_cleanup_private_logs 0
+ tar --sort=name --mtime='@1' --owner=0 --group=0 --numeric-owner \
+ -C "$work/experimental/CollectiveX" -cf "$archive" .cx_sources
+ sha256sum "$archive"
+ rm -rf -- "$work"
+ - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+ if: ${{ steps.gen.outputs.source_backends != '' }} # Same guard as the archive-preparation step: skipped when no deepep-v2/deepep-hybrid shard exists.
+ with:
+ name: cxbackend-sources-${{ github.run_id }}-${{ github.run_attempt }} # Name the sweep job downloads for deepep-v2/deepep-hybrid shards.
+ path: ${{ runner.temp }}/collectivex-backend-sources.tar
+ if-no-files-found: error
+ retention-days: 3
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
- name: cxsweep-matrix-${{ github.run_id }}
+ name: cxsweep-matrix-${{ github.run_id }}-${{ github.run_attempt }}
path: experimental/CollectiveX/matrix_full.json
if-no-files-found: error
+ retention-days: 3
+ - name: Create V1 release marker # Writes release.json only when the generated matrix hashes to the locked digest; otherwise the step fails.
+ if: ${{ inputs.release_tag == 'v1' }} # Runs with the default 'unversioned' tag never get a marker.
+ env:
+ EXPECTED_MATRIX_SHA256: f1ca85f9689922b90edd5767b9ff2a902f6b896f32f68b2ca086dde3fd2157d0 # Digest matrix_full.json must match; update it whenever the locked matrix changes.
+ RUN_ID: ${{ github.run_id }}
+ RUN_ATTEMPT: ${{ github.run_attempt }}
+ SOURCE_SHA: ${{ github.sha }}
+ run: | # Marker keys mirror what the publish workflow's jq check expects: format, matrix_sha256, release_tag, run_attempt, run_id, source_sha.
+ set -euo pipefail
+ destination="$RUNNER_TEMP/collectivex-release"
+ install -d -m 700 "$destination"
+ python3 - "$destination/release.json" <<'PY'
+ import hashlib
+ import json
+ import os
+ import pathlib
+ import sys
+
+ matrix = pathlib.Path("experimental/CollectiveX/matrix_full.json").read_bytes()
+ matrix_sha256 = hashlib.sha256(matrix).hexdigest()
+ if matrix_sha256 != os.environ["EXPECTED_MATRIX_SHA256"]:
+ raise SystemExit("V1 release tag requires the locked full matrix")
+ marker = {
+ "format": "collectivex.release-tag.v1",
+ "matrix_sha256": matrix_sha256,
+ "release_tag": "v1",
+ "run_attempt": os.environ["RUN_ATTEMPT"],
+ "run_id": os.environ["RUN_ID"],
+ "source_sha": os.environ["SOURCE_SHA"],
+ }
+ pathlib.Path(sys.argv[1]).write_text(
+ json.dumps(marker, sort_keys=True, separators=(",", ":")) + "\n"
+ )
+ PY
+ python3 experimental/CollectiveX/artifact_safety.py "$destination/release.json"
+ - name: Upload V1 release marker # Publishes the marker under the cxrelease-v1-<run_id>-<attempt> name the publish workflow looks up.
+ if: ${{ inputs.release_tag == 'v1' }}
+ uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+ with:
+ name: cxrelease-v1-${{ github.run_id }}-${{ github.run_attempt }}
+ path: ${{ runner.temp }}/collectivex-release/release.json
+ if-no-files-found: error
+ retention-days: 3
+ - name: Validate unsupported artifact safety # First gate: safety scan of the emitted unsupported outcome files.
+ id: unsupported_safety
+ if: ${{ always() && fromJSON(steps.gen.outputs.unsupported_n || '0') > 0 }} # '0' default: fromJSON('') is an expression error when gen failed before writing the output.
+ run: |
+ python3 experimental/CollectiveX/artifact_safety.py experimental/CollectiveX/unsupported/*.json
+ - name: Validate unsupported outcomes # Second gate: contract validation against the generated matrix; runs only after the safety scan succeeded.
+ id: unsupported_contracts
+ if: ${{ always() && fromJSON(steps.gen.outputs.unsupported_n || '0') > 0 && steps.unsupported_safety.outcome == 'success' }}
+ env:
+ COLLECTIVEX_ARTIFACT_NAME: cxunsupported-${{ github.run_id }}-${{ github.run_attempt }}
+ COLLECTIVEX_EXECUTION_ID: ${{ github.run_id }}_${{ github.run_attempt }}_unsupported
+ run: |
+ python3 experimental/CollectiveX/contracts.py validate-delivery \
+ --source experimental/CollectiveX/matrix_full.json \
+ --disposition unsupported \
+ experimental/CollectiveX/unsupported/*.json
+ - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+ if: ${{ always() && fromJSON(steps.gen.outputs.unsupported_n || '0') > 0 && steps.unsupported_contracts.outcome == 'success' && steps.unsupported_safety.outcome == 'success' }}
+ with:
+ name: cxunsupported-${{ github.run_id }}-${{ github.run_attempt }} # Uploaded only when both gates above succeeded.
+ path: experimental/CollectiveX/unsupported/*.json
+ if-no-files-found: error
+ retention-days: 3
# ---- sweep: ONE matrix cell per shard (the parent job with child jobs) ----
sweep:
@@ -73,82 +203,234 @@ jobs:
if: ${{ fromJSON(needs.setup.outputs.n) > 0 }}
strategy:
fail-fast: false
- max-parallel: 10 # don't saturate the ~20-runner fleet; cells queue as slots free
+ max-parallel: 10
matrix: ${{ fromJSON(needs.setup.outputs.matrix) }}
- # h200 label spans two clusters; pin to the validated dgxc pool (mirrors collectivex-experimental).
- runs-on: ${{ matrix.sku == 'h200' && 'h200-dgxc' || matrix.sku }}
+ runs-on: ${{ matrix.sku }}
timeout-minutes: 350
env:
CX_BENCH: ${{ matrix.backend }}
- CX_DEEPEP_V2: ${{ matrix.deepep_v2 && '1' || '' }}
CX_NODES: ${{ matrix.nodes }}
- CX_SHARD_FILE: results/.shard_${{ matrix.id }}.json
+ CX_GPUS_PER_NODE: ${{ matrix.gpus_per_node }}
+ CX_SCALE_UP_DOMAIN: ${{ matrix.scale_up_domain }}
+ CX_SHARD_FILE: .shards/${{ matrix.id }}.json
+ CX_SHARD_SKU: ${{ matrix.sku }}
+ COLLECTIVEX_CANONICAL_GHA: '1'
COLLECTIVEX_SOURCE_SHA: ${{ github.sha }}
- CX_NODELIST: ${{ matrix.sku == 'mi355x' && 'mia1-p01-g10,mia1-p01-g15' || '' }}
- CX_STAGE_DIR: ${{ matrix.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }}
+ COLLECTIVEX_ARTIFACT_NAME: cxshard-${{ matrix.id }}-${{ github.run_id }}-${{ github.run_attempt }}
+ # Consolidated shards run one bounded build-group in one Slurm allocation, so
+ # the launcher's default 45-min --time is too short. 300 min covers a cold
+ # compute-node image import plus the shard. The allocation releases early
+ # when the shard finishes, so short shards don't waste it.
+ CX_TIME: '300'
+ COLLECTIVEX_EXECUTION_ID: ${{ github.run_id }}_${{ github.run_attempt }}_${{ matrix.id }}
+ CX_JOB_ROOT: /tmp/inferencex-collectivex-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.id }}
+ CX_SOURCE_ROOT: /tmp/inferencex-collectivex-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.id }}/source
+ HOME: /tmp/inferencex-collectivex-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.id }}/home
steps:
- - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v5.0.0
- with: { clean: true }
+ - name: Prepare isolated source # Sweeps stale per-job roots in /tmp, creates this job's private root, and fetches the exact source SHA into it.
+ id: source # Sets outputs.prepared=true once the root and checkout are in place.
+ env:
+ COLLECTIVEX_REPOSITORY: ${{ github.repository }}
+ run: |
+ set -euo pipefail
+ python3 - <<'PY'
+ import os
+ import re
+ import shutil
+ import stat
+ import time
+
+ pattern = re.compile(r"inferencex-collectivex-[0-9]+-[0-9]+-[A-Za-z0-9._-]+")
+ cutoff = time.time() - 86400 # only roots untouched for at least 24 hours are candidates
+ for entry in os.scandir("/tmp"):
+ if not pattern.fullmatch(entry.name):
+ continue
+ try:
+ metadata = entry.stat(follow_symlinks=False)
+ except FileNotFoundError:
+ continue
+ if (
+ not stat.S_ISDIR(metadata.st_mode)
+ or metadata.st_uid != os.getuid()
+ or stat.S_IMODE(metadata.st_mode) != 0o700
+ or metadata.st_mtime >= cutoff
+ ):
+ continue
+ marked = False
+ for marker_name in ("cleanup-safe", "cleanup-unsafe"):
+ try:
+ marker = os.stat(
+ os.path.join(entry.path, marker_name), follow_symlinks=False
+ )
+ except FileNotFoundError:
+ continue
+ marked = (
+ stat.S_ISREG(marker.st_mode)
+ and marker.st_uid == os.getuid()
+ and stat.S_IMODE(marker.st_mode) == 0o600
+ )
+ if marked:
+ break
+ if marked: # delete only roots carrying a marker file this user created with mode 0600
+ shutil.rmtree(entry.path)
+ PY
+ [[ "$CX_JOB_ROOT" =~ ^/tmp/inferencex-collectivex-[0-9]+-[0-9]+-[A-Za-z0-9._-]+$ ]] \
+ || { echo "CollectiveX isolated root is invalid" >&2; exit 1; }
+ [ "$CX_SOURCE_ROOT" = "$CX_JOB_ROOT/source" ] \
+ || { echo "CollectiveX source root is invalid" >&2; exit 1; }
+ if [ -e "$CX_JOB_ROOT" ] || [ -L "$CX_JOB_ROOT" ]; then
+ echo "CollectiveX isolated root already exists" >&2
+ exit 1
+ fi
+ umask 077
+ mkdir -m 700 -- "$CX_JOB_ROOT"
+ trap 'rc=$?; [ "$rc" = 0 ] || rm -rf -- "$CX_JOB_ROOT"; exit "$rc"' EXIT
+ mkdir -m 700 -- "$HOME" "$CX_JOB_ROOT/control" "$CX_JOB_ROOT/artifact" "$CX_SOURCE_ROOT"
+ : > "$CX_JOB_ROOT/cleanup-safe"
+ if ! {
+ GIT_CONFIG_NOSYSTEM=1 GIT_CONFIG_GLOBAL=/dev/null git init -q "$CX_SOURCE_ROOT"
+ GIT_CONFIG_NOSYSTEM=1 GIT_CONFIG_GLOBAL=/dev/null \
+ git -C "$CX_SOURCE_ROOT" remote add origin \
+ "https://github.com/${COLLECTIVEX_REPOSITORY}.git"
+ GIT_CONFIG_NOSYSTEM=1 GIT_CONFIG_GLOBAL=/dev/null \
+ git -C "$CX_SOURCE_ROOT" -c credential.helper= -c protocol.version=2 \
+ fetch -q --no-tags --depth=1 origin "$COLLECTIVEX_SOURCE_SHA"
+ GIT_CONFIG_NOSYSTEM=1 GIT_CONFIG_GLOBAL=/dev/null \
+ git -C "$CX_SOURCE_ROOT" -c advice.detachedHead=false \
+ checkout -q --detach FETCH_HEAD
+ [ "$(git -C "$CX_SOURCE_ROOT" rev-parse HEAD)" = "$COLLECTIVEX_SOURCE_SHA" ] # errexit is off inside an if-condition, so this final test decides the group's status
+ } </dev/null >/dev/null 2>&1; then
+ echo "CollectiveX source preparation failed" >&2
+ exit 1
+ fi
+ [ "$(stat -c '%a' "$CX_JOB_ROOT")" = 700 ] \
+ || { echo "CollectiveX isolated root has unsafe permissions" >&2; exit 1; }
+ echo 'prepared=true' >> "$GITHUB_OUTPUT"
+ trap - EXIT
- uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
with:
- name: cxsweep-matrix-${{ github.run_id }}
- path: experimental/CollectiveX
- - name: Extract this shard's cases (stdlib only — no runner deps)
- working-directory: experimental/CollectiveX
+ name: cxsweep-matrix-${{ github.run_id }}-${{ github.run_attempt }}
+ path: ${{ env.CX_JOB_ROOT }}/control
+ - name: Download pinned backend source archive # Fetches the tarball the setup job uploaded for source-built backends.
+ if: ${{ matrix.backend == 'deepep-v2' || matrix.backend == 'deepep-hybrid' }} # Same two backends the setup job collects into source_backends.
+ uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+ with:
+ name: cxbackend-sources-${{ github.run_id }}-${{ github.run_attempt }}
+ path: ${{ env.CX_JOB_ROOT }}/control # Lands next to the downloaded matrix, outside the source checkout.
+ - name: Install pinned backend source seed
+ if: ${{ matrix.backend == 'deepep-v2' || matrix.backend == 'deepep-hybrid' }}
+ env:
+ EXPECTED_BACKEND: ${{ matrix.backend }}
run: |
set -euo pipefail
- python3 -c "
- import json
- m=json.load(open('matrix_full.json'))
- s=[x for x in m['include'] if x['id']=='${{ matrix.id }}']
- assert s, 'shard ${{ matrix.id }} not in matrix'
- s=s[0]
- json.dump({'id':s['id'],'sku':s['sku'],'backend':s['backend'],'nodes':s['nodes'],'deepep_v2':s['deepep_v2'],'cases':s['cases']}, open('results/.shard_${{ matrix.id }}.json','w'))
- print('shard ${{ matrix.id }}:', len(s['cases']), 'cases')
- "
+ archive="$CX_JOB_ROOT/control/collectivex-backend-sources.tar" # tarball placed in control/ by the download step
+ destination="$CX_SOURCE_ROOT/experimental/CollectiveX"
+ seed_root="$destination/.cx_sources"
+ [ -f "$archive" ] && [ ! -e "$seed_root" ] && [ ! -L "$seed_root" ] || { echo "CollectiveX backend source archive or seed root is invalid" >&2; exit 1; } # explicit exit: set -e ignores a failing non-final test in an && list
+ source "$destination/runtime/common.sh"
+ source_path="$(cx_backend_source_path "$seed_root" "$EXPECTED_BACKEND")"
+ source_basename="${source_path#"$seed_root/"}"
+ [ -n "$source_basename" ] \
+ && [ "$source_path" = "$seed_root/$source_basename" ] \
+ && [[ "$source_basename" != */* ]] || { echo "CollectiveX backend source path is invalid" >&2; exit 1; } # the path must be a direct child of the seed root
+ python3 "$destination/source_archive.py" \
+ "$archive" "$destination" "$source_basename"
+ cx_backend_source_is_valid "$EXPECTED_BACKEND" "$source_path"
+ printf 'CX_BACKEND_SOURCE_SEED_ROOT=%s\n' "$seed_root" >> "$GITHUB_ENV" # exported to the later steps of this job
+ - name: Extract and validate this shard's cases # Writes this shard's case file from the downloaded matrix, checking sku/backend/nodes against the matrix cell.
+ run: | # Matrix values come from the job env (CX_SHARD_SKU, CX_BENCH, CX_NODES, CX_SHARD_FILE); matrix.id is already restricted to [A-Za-z0-9._-] by the CX_JOB_ROOT check in the first step.
+ set -euo pipefail
+ cd "$CX_SOURCE_ROOT/experimental/CollectiveX" 2>/dev/null \
+ || { echo "CollectiveX source is unavailable" >&2; exit 1; }
+ python3 sweep_matrix.py \
+ --extract-from "$CX_JOB_ROOT/control/matrix_full.json" \
+ --shard-id '${{ matrix.id }}' \
+ --expect-sku "$CX_SHARD_SKU" \
+ --expect-backend "$CX_BENCH" \
+ --expect-nodes "$CX_NODES" \
+ --out "$CX_SHARD_FILE" >/dev/null
- name: Sweep shard ${{ matrix.id }} (${{ matrix.n }} cases, one allocation)
+ id: sweep_shard
env:
- RUNNER_NAME: ${{ runner.name }}
- run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"
+ COLLECTIVEX_OPERATOR_CONFIG_CONTENT: ${{ secrets.COLLECTIVEX_OPERATOR_CONFIG_V1 }}
+ COLLECTIVEX_OPERATOR_CONFIG_REQUIRED: '1'
+ run: |
+ set -euo pipefail
+ umask 077
+ : > "$CX_JOB_ROOT/cleanup-unsafe"
+ rm -f -- "$CX_JOB_ROOT/cleanup-safe"
+ cd "$CX_SOURCE_ROOT" 2>/dev/null \
+ || { echo "CollectiveX source is unavailable" >&2; exit 1; }
+ bash "experimental/CollectiveX/launchers/launch_${{ matrix.launcher }}.sh"
+ - name: Confirm allocation cleanup # Passes only if cleanup-safe exists and cleanup-unsafe is gone; the sweep step swaps safe for unsafe before launching.
+ id: allocation_cleanup
+ if: ${{ always() && steps.source.outputs.prepared == 'true' }} # Runs even after a failed sweep, provided the isolated root was prepared.
+ run: |
+ set -euo pipefail
+ [ -f "$CX_JOB_ROOT/cleanup-safe" ] && [ ! -e "$CX_JOB_ROOT/cleanup-unsafe" ] \
+ || { echo "CollectiveX allocation cleanup was not confirmed" >&2; exit 1; }
+ - name: Validate shard artifact safety # Runs artifact_safety.py over every result JSON of this shard.
+ id: artifact_safety
+ if: ${{ always() && steps.allocation_cleanup.outcome == 'success' }} # Gated on confirmed cleanup rather than on the sweep step's own result.
+ run: |
+ cd "$CX_SOURCE_ROOT" 2>/dev/null \
+ || { echo "CollectiveX source is unavailable" >&2; exit 1; }
+ python3 experimental/CollectiveX/artifact_safety.py experimental/CollectiveX/results/*.json
+ - name: Validate shard delivery completeness # Checks the result JSONs against the shard file extracted earlier in this job.
+ id: delivery_contracts
+ if: ${{ always() && steps.artifact_safety.outcome == 'success' }} # Only results that passed the safety scan are contract-checked.
+ run: |
+ cd "$CX_SOURCE_ROOT" 2>/dev/null \
+ || { echo "CollectiveX source is unavailable" >&2; exit 1; }
+ python3 experimental/CollectiveX/contracts.py validate-delivery \
+ --source "experimental/CollectiveX/${CX_SHARD_FILE}" \
+ experimental/CollectiveX/results/*.json
- name: Shard summary
- if: always()
- run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY" || true
+ if: ${{ always() && steps.artifact_safety.outcome == 'success' && steps.delivery_contracts.outcome == 'success' }}
+ run: |
+ cd "$CX_SOURCE_ROOT" 2>/dev/null \
+ || { echo "CollectiveX source is unavailable" >&2; exit 1; }
+ python3 experimental/CollectiveX/summarize.py \
+ --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY" || true
+ - name: Stage shard artifact
+ id: stage_artifact
+ if: ${{ always() && steps.delivery_contracts.outcome == 'success' && steps.artifact_safety.outcome == 'success' }}
+ run: |
+ set -euo pipefail
+ cd "$CX_SOURCE_ROOT" 2>/dev/null \
+ || { echo "CollectiveX source is unavailable" >&2; exit 1; }
+ cp -- experimental/CollectiveX/results/*.json "$CX_JOB_ROOT/artifact/"
- name: Upload shard results
- if: always()
+ id: upload_artifact
+ if: always() && steps.stage_artifact.outcome == 'success' && steps.delivery_contracts.outcome == 'success' && steps.artifact_safety.outcome == 'success'
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
- name: cxshard-${{ matrix.id }}-${{ github.run_id }}
- path: experimental/CollectiveX/results/*.json # glob skips the hidden .shard_*.json
- if-no-files-found: warn
-
- # ---- aggregate: collect every shard into ONE ndjson (the "result aggregator at the end") ----
- aggregate:
- needs: sweep
- if: always()
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v5.0.0
- with: { clean: true }
- - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
- with:
- pattern: cxshard-*-${{ github.run_id }}
- path: _shards
- merge-multiple: true
- - name: Aggregate shards -> one ndjson
- working-directory: experimental/CollectiveX
+ name: cxshard-${{ matrix.id }}-${{ github.run_id }}-${{ github.run_attempt }}
+ path: |
+ ${{ env.CX_JOB_ROOT }}/artifact/*.json
+ if-no-files-found: error
+ retention-days: 3
+ - name: Cleanup isolated workspace
+ if: ${{ always() && steps.source.outputs.prepared == 'true' }}
run: |
set -euo pipefail
- tag="${{ inputs.backend }}${{ inputs.deepep_v2 && '-v2' || '' }}"
- python3 aggregate_results.py --in-dir ../../_shards --out "results/aggregate/collectivex_${tag}_${{ github.run_id }}.ndjson"
- {
- echo "## CollectiveX sweep aggregate (${tag})"
- echo '```'
- wc -l results/aggregate/*.ndjson 2>/dev/null || echo "no ndjson"
- echo '```'
- } >> "$GITHUB_STEP_SUMMARY"
- - name: Upload aggregate
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
- with:
- name: cxsweep-aggregate-${{ inputs.backend }}${{ inputs.deepep_v2 && '-v2' || '' }}-${{ github.run_id }}
- path: experimental/CollectiveX/results/aggregate/*.ndjson
- if-no-files-found: warn
+ [[ "$CX_JOB_ROOT" =~ ^/tmp/inferencex-collectivex-[0-9]+-[0-9]+-[A-Za-z0-9._-]+$ ]] \
+ || { echo "CollectiveX cleanup root is invalid" >&2; exit 1; }
+ [ "$CX_SOURCE_ROOT" = "$CX_JOB_ROOT/source" ] \
+ || { echo "CollectiveX cleanup source is invalid" >&2; exit 1; }
+ [ -f "$CX_JOB_ROOT/cleanup-safe" ] && [ ! -e "$CX_JOB_ROOT/cleanup-unsafe" ] \
+ || { echo "CollectiveX allocation cleanup was not confirmed; retaining isolated files" >&2; exit 1; }
+ if [ '${{ steps.sweep_shard.outcome }}' = success ] \
+ && [ '${{ steps.allocation_cleanup.outcome }}' = success ] \
+ && [ '${{ steps.artifact_safety.outcome }}' = success ] \
+ && [ '${{ steps.delivery_contracts.outcome }}' = success ] \
+ && [ '${{ steps.stage_artifact.outcome }}' = success ] \
+ && [ '${{ steps.upload_artifact.outcome }}' = success ] \
+ && [ -f "$CX_SOURCE_ROOT/experimental/CollectiveX/runtime/common.sh" ]; then
+ # shellcheck source=/dev/null
+ if source "$CX_SOURCE_ROOT/experimental/CollectiveX/runtime/common.sh" \
+ >/dev/null 2>&1; then
+ cx_cleanup_private_logs 0
+ fi
+ fi
+ rm -rf -- "$CX_JOB_ROOT"
diff --git a/experimental/CollectiveX/.gitignore b/experimental/CollectiveX/.gitignore
new file mode 100644
index 0000000000..56b307215b
--- /dev/null
+++ b/experimental/CollectiveX/.gitignore
@@ -0,0 +1,15 @@
+__pycache__/
+*.pyc
+results/
+unsupported/
+.shards/
+.cx_workloads/
+.cx_backend/
+/matrix_full.json
+gpucore.*
+
+# Local plans and infrastructure inventory.
+goal.md
+notes.md
+configs/platforms.yaml
+private-infra.md
diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md
new file mode 100644
index 0000000000..91469935a0
--- /dev/null
+++ b/experimental/CollectiveX/README.md
@@ -0,0 +1,160 @@
+# CollectiveX
+
+
+
+**English** | [中文](./README_zh.md)
+
+
+
+CollectiveX is an experimental MoE expert-parallel communication benchmark. It measures dispatch,
+combine, and paired roundtrip latency across EP libraries and accelerator systems.
+
+> Publication hold: historical schema 3-5 data is diagnostic only. No current dataset is approved
+> for rankings, recommendations, or regression baselines.
+
+## v1 Execution Profile
+
+Every scheduled case is BF16 with backend-tuned resources and packed placement. The explicit mode
+selects one of two contracts:
+
+- Normal mode uses `layout-and-dispatch-v1`, rank-deduplicated token payloads, and activation-only
+ combine. Core coverage uses uniform routing and keeps one Zipf sensitivity scenario; EPLB is
+ measured only as the Zipf remedy.
+- Low-latency mode uses `expert-packed-weighted-combine-v1`, token-expert payloads, and gate-weighted
+ combine through genuine DeepEP V1 or UCCL low-latency APIs. It is decode-only and never shares a
+ ranking cohort with normal mode. Other backends are explicitly unsupported for this suite.
+
+Both modes use `fixed-512-v1`: 64 trials x 8 timed iterations with 32 synchronized full roundtrip
+warmups before each measured component at every trial/point. Roundtrip is measured first; each
+iteration takes the cross-rank maximum before nearest-rank p50/p90/p95/p99, and roundtrip p99 is the
+headline latency. A stdlib integer counter produces byte-identical routing and gate weights.
+
+The canonical matrix covers H100, H200, B200, B300, GB200, GB300, MI325X, and MI355X. It requests
+608 cases / 1,600 token points: 364 runnable cases / 940 points, emitted as 58 executable workflow
+shards/allocation cells, plus 244 explicit unsupported cases / 660 points. `sweep_matrix.py`
+materializes every token ladder and rejects missing, stale, malformed, or altered shard controls.
+Shards are emitted round-robin by SKU so the bounded GHA matrix uses every runner pool early.
+
+| Systems | EP8 | EP16 |
+|---|---|---|
+| H100/H200/B200/B300 | 1x8 NVLink, scale-up | 2x8 NVLink + RDMA, scale-out |
+| MI325X/MI355X | 1x8 XGMI, scale-up | 2x8 XGMI + RDMA, scale-out |
+| GB200/GB300 | 2x4 MNNVL, scale-up | 4x4 MNNVL, scale-up |
+
+Physical host count does not determine scope: both GB topologies stay inside one 72-GPU MNNVL
+scale-up domain.
+
+| Backend | Current scope |
+|---|---|
+| DeepEP V1 | Image-pinned `deep_ep.Buffer`: normal and native low-latency APIs; upstream v1.2.1 on x86 and the image's GB fork on arm64 |
+| DeepEP V2 | PR #605 `ElasticBuffer` plus #630: LSA for scale-up and GIN for x86 EP16 scale-out; source/SASS-bound reproducible JIT |
+| DeepEP Hybrid | Pinned `HybridEPBuffer`: x86 EP16 multi-domain RDMA/DOCA; GB EP8/EP16 in one MNNVL communication domain |
+| UCCL | Pinned 0.1.1 wheel and wrapper with normal and native low-latency APIs on Hopper; Blackwell is explicitly unsupported |
+| NCCL/RCCL A2A | Portable rank-deduplicated payload plus expert/routing-metadata reference |
+| MoRI | EP8 uses MI325X AsyncLL or MI355X IntraNode; EP16 pins InterNodeV1 over 2x8 XGMI + RDMA |
+
+FlashInfer is outside v1 because its exercised EP path failed intermittently at runtime. It is not
+misreported as a platform capability limitation and can return after a stable pinned path is proven.
+
+DeepEP V2 means the `ElasticBuffer` implementation introduced by
+[DeepEP PR #605](https://github.com/deepseek-ai/DeepEP/pull/605), not a newer legacy `Buffer` build.
+The pinned source is the minimal upstream [PR #630](https://github.com/deepseek-ai/DeepEP/pull/630)
+follow-up: its parent is the #605 merge tree and its only source change fixes pure scale-up
+initialization when GIN is unavailable. Scale-up cases request NCCL Device API LSA and fail closed
+unless the realized LSA team covers the full EP world. x86 EP16 scale-out cases instead require the
+hybrid path with GIN, two logical scale-out domains represented by two physical RDMA ranks, and eight
+scale-up ranks per domain; GB EP16 remains MNNVL scale-up and therefore uses LSA. The isolated build
+records the API, source, loaded libraries, generated JIT source, executable SASS, and raw CUBIN
+diagnostics. The current H100 runner pool is explicitly unsupported for V2 because NCCL 2.30.4
+reports that its EP8 communicator lacks Device API symmetric-memory support; re-enabling that pool
+requires an all-rank CUDA P2P/LSA-capable runtime. Other NVIDIA SKUs remain unvalidated until their
+GPU outcomes pass the native correctness and publication gates.
+
+Removed v1 axes include cached-layout `[cl]`, runtime-visible `[rv]`, FP8, quantized combine,
+extra routing distributions, activation profiles, uneven allocation, placement permutations, model
+envelopes, and scaling studies.
+
+## Workflow And Artifacts
+
+`.github/workflows/collectivex-sweep.yml` generates a public-SKU matrix, extracts a strict ignored
+`.shards/<shard-id>.json` control, executes one allocation per shard, privacy-checks result JSON, and uploads
+raw GitHub artifacts. Runs default to `release_tag=unversioned` and are diagnostic-only. A V1 run
+must explicitly select `release_tag=v1`; setup then requires the locked full-matrix digest and emits
+a run/attempt/source-bound `cxrelease-v1-*` marker. Partial and filtered runs cannot receive it.
+
+`.github/workflows/collectivex-publish.yml` is an explicit V1 gate. It accepts exactly three
+successful tagged sweep run IDs from one source SHA, revalidates their GitHub metadata and release
+markers, and runs `publisher.py` in a disposable runner-local workspace. Only a fully promoted,
+privacy-checked, content-addressed dataset is uploaded as `cxpublication-v1-*`; raw artifacts and
+the private publisher workspace are never exposed to the frontend.
+
+There is no results server, attached store, Vercel storage, GCP, Neon, managed database, or managed
+object store. With the existing server-side `GITHUB_TOKEN`, the frontend discovers the latest
+successful version-scoped publication workflow, downloads its NDJSON artifact just in time, verifies
+the ZIP layout, UTF-8/NDJSON shape, schema, promotion state, and SHA-256, then serves versioned channel
+and immutable dataset URLs. The UI keeps an explicit benchmark-version selector; V2 and later
+versions must use separate release tags and publication identities. The full validation contract is
+in [docs/methodology.md](docs/methodology.md).
+
+## Runner Configuration
+
+Runner-local Slurm and storage values use a strict per-SKU JSON document at
+`$XDG_CONFIG_HOME/inferencex/collectivex.json` or `COLLECTIVEX_OPERATOR_CONFIG`. The mode-0600,
+same-owner, non-symlink file is outside the checkout and never uploaded. Unknown runners, fields,
+duplicate keys, endpoint literals, unsafe paths, and non-JSON input fail closed; configuration is
+never evaluated as shell. GHA passes encrypted `COLLECTIVEX_OPERATOR_CONFIG_V1` content only to the
+launcher, which validates it, exports the selected SKU's allowlisted values, and deletes the
+temporary copy before allocation. Required JSON fields are:
+
+| SKU | Variables |
+|---|---|
+| `h100-dgxc`, `b200-dgxc` | `partition`, `account`, `squash_dir`, `stage_dir` |
+| `h200-dgxc` | `partition`, `squash_dir`, `stage_dir` |
+| `b300` | `partition`, `account`, `squash_dir`, `stage_dir` |
+| `gb200` | `partition`, `account`, ordered `storage_roots` |
+| `gb300` | `partition`, `account`, `squash_dir`, `stage_dir`, `enroot_cache_path` |
+| `mi325x`, `mi355x` | `partition`, `squash_dir`, `stage_dir` |
+
+Every selected non-MNNVL EP16 placement additionally requires `socket_ifname` and `rdma_devices`
+for its operator-approved fabric; optional
+`ib_gid_index` and `rdma_service_level` are also allowlisted. CollectiveX does not heuristically
+select a management route or HCA. After allocation, every non-MNNVL scale-out node must prove that
+all configured interfaces and active HCA ports exist before backend setup. Scale-up and MNNVL jobs
+clear these overrides. Scale-out NCCL/RCCL is pinned to `IB` with exact-match HCA selectors so a
+socket fallback fails instead of being mislabeled as RDMA.
+
+`stage_dir` is a pre-existing, runner-owned, non-symlinked base outside the checkout and workflow
+workspace. It is not group- or world-writable and is visible at the same path on the runner and every
+allocated node. Jobs create only a marked mode-0700 execution child, prove cross-node read/write
+visibility, and remove that exact child after allocation teardown; they never mount the runner
+checkout or create a stage beneath image storage on AMD.
+
+Before import, each Docker Hub tag is resolved with bounded registry requests and must match its
+pinned digest; digest-qualified overrides are rejected. Enroot imports use a fixed filesystem epoch
+and a versioned, registry-digest-bound cache key. Every mounted squash is freshly hashed. The
+verified registry digest and local squash hash are both recorded. Image-provided DeepEP is checked
+against exact wheel and installed-file fingerprints; source-built backends use pinned commits and
+runtime-verified GPU targets. DeepEP V2's mode-0700 cluster-local build cache is keyed by a versioned
+build recipe, verified image, architecture, upstream trees, and dependency pins; only its fixed
+`/cx-cache` mount reaches the container, and it never enters result artifacts.
+Pinned V2 and Hybrid sources are fetched once per workflow. Each job validates the complete archive,
+extracts only its exact backend root, permits only contained relative leaf symlinks to archived
+regular files, and revalidates the Git tree and submodule pins before staging.
+Compute containers receive an explicit environment allowlist. Private host, address, device, NIC,
+credential, workspace, and path data stays in encrypted config, ignored operator notes, or bounded
+mode-0600 runner logs; it is never uploaded.
+
+## Local Checks
+
+```bash
+uv run --with-requirements experimental/CollectiveX/requirements.txt \
+ python -m unittest discover experimental/CollectiveX/tests -p 'test_*.py'
+uv run --with-requirements experimental/CollectiveX/requirements.txt \
+ python experimental/CollectiveX/sweep_matrix.py --backends all --out /tmp/cx-matrix.json >/dev/null
+uv run --with-requirements experimental/CollectiveX/requirements.txt \
+ python experimental/CollectiveX/publisher.py --store-root "$COLLECTIVEX_STORE_ROOT" verify
+bash -n experimental/CollectiveX/runtime/*.sh experimental/CollectiveX/launchers/*.sh
+```
+
+Core paths are `capability.py`, `configs/`, `contracts.py`, `schemas/`, `sweep_matrix.py`,
+`publisher.py`, `runtime/`, `launchers/`, and `tests/`.
diff --git a/experimental/CollectiveX/README_zh.md b/experimental/CollectiveX/README_zh.md
new file mode 100644
index 0000000000..fb32369b40
--- /dev/null
+++ b/experimental/CollectiveX/README_zh.md
@@ -0,0 +1,154 @@
+# CollectiveX
+
+
+
+[English](./README.md) | **中文**
+
+
+
+CollectiveX 是实验性的 MoE 专家并行通信基准,用于测量不同 EP 库和加速器系统的
+dispatch、combine 及配对 roundtrip 延迟。
+
+> 发布暂停:历史 schema 3-5 数据仅供诊断。目前没有数据集获准用于排名、推荐或回归基线。
+
+## v1 执行配置
+
+每个调度用例均采用 BF16、后端调优资源和 packed placement。显式指定的 mode 选择以下两个
+契约之一:
+
+- Normal mode 使用 `layout-and-dispatch-v1`、按 rank 去重的 token payload 和 activation-only
+ combine。核心覆盖使用 uniform routing,并保留一个 Zipf 敏感性场景;EPLB 只作为 Zipf
+ 的修正方案测量。
+- Low-latency mode 使用 `expert-packed-weighted-combine-v1`、token-expert payload 和
+ gate-weighted combine,并且只调用真正的 DeepEP V1 或 UCCL low-latency API。该模式仅覆盖
+ 解码,绝不与 normal mode 共用排名 cohort。其他后端在此 suite 中均显式标为 unsupported。
+
+两种模式统一使用 `fixed-512-v1`:64 trials x 8 timed iterations;每个 trial/point 的每个被测
+组件前执行 32 次同步完整 roundtrip warmup。先测 roundtrip;每次 iteration 先取跨 rank 最大值,
+再按 nearest-rank 计算 p50/p90/p95/p99,主要延迟指标为 roundtrip p99。stdlib 整数计数器
+生成逐字节一致的 routing 和 gate weights。
+
+规范矩阵覆盖 H100、H200、B200、B300、GB200、GB300、MI325X 和 MI355X。矩阵请求
+608 个 cases / 1,600 个 token points:364 个可运行 cases / 940 个 points,并形成 58 个可执行
+workflow shards/allocation cells;另有 244 个显式 unsupported cases / 660 个 points。
+`sweep_matrix.py` 物化每个 token ladder,并拒绝缺失、过期、格式错误或被修改的 shard controls。
+分片按 SKU round-robin 发出,使受限的 GHA matrix 尽早使用所有 runner pools。
+
+| 系统 | EP8 | EP16 |
+|---|---|---|
+| H100/H200/B200/B300 | 1x8 NVLink,scale-up | 2x8 NVLink + RDMA,scale-out |
+| MI325X/MI355X | 1x8 XGMI,scale-up | 2x8 XGMI + RDMA,scale-out |
+| GB200/GB300 | 2x4 MNNVL,scale-up | 4x4 MNNVL,scale-up |
+
+物理主机数量不能决定通信范围:两种 GB 拓扑都位于同一个 72-GPU MNNVL scale-up domain 内。
+
+| 后端 | 当前范围 |
+|---|---|
+| DeepEP V1 | 镜像固定的 `deep_ep.Buffer`:提供 normal 和原生 low-latency API;x86 使用 upstream v1.2.1,arm64 使用镜像内 GB fork |
+| DeepEP V2 | PR #605 `ElasticBuffer` 加 #630:scale-up 使用 LSA,x86 EP16 scale-out 使用 GIN;JIT 可复现并绑定 source/SASS |
+| DeepEP Hybrid | 固定的 `HybridEPBuffer`:x86 EP16 使用 multi-domain RDMA/DOCA;GB EP8/EP16 位于同一个 MNNVL communication domain |
+| UCCL | Hopper 上固定的 0.1.1 wheel 和 wrapper,提供 normal 和原生 low-latency API;Blackwell 显式标为 unsupported |
+| NCCL/RCCL A2A | 可移植的 rank-deduplicated payload 加 expert/routing-metadata reference |
+| MoRI | EP8 使用 MI325X AsyncLL 或 MI355X IntraNode;EP16 固定使用 2x8 XGMI + RDMA 上的 InterNodeV1 |
+
+FlashInfer 不在 v1 范围内,因为已测试的 EP path 在运行时存在间歇性失败。该问题不会被误报为
+平台能力限制;在证明有稳定的固定实现后可重新加入。
+
+DeepEP V2 指 [DeepEP PR #605](https://github.com/deepseek-ai/DeepEP/pull/605) 引入的
+`ElasticBuffer` 实现,而不是更新的 legacy `Buffer` build。固定 source 使用最小化的 upstream
+[PR #630](https://github.com/deepseek-ai/DeepEP/pull/630) 后续修复:其 parent 是 #605 merge
+tree,唯一 source 变更是修复 GIN 不可用时的纯 scale-up 初始化。Scale-up cases 请求 NCCL
+Device API LSA;若实际建立的 LSA team 未覆盖整个 EP world,则直接失败。x86 EP16 scale-out
+cases 必须使用启用 GIN 的 hybrid path,其精确拓扑为两个逻辑 scale-out domains(由两个物理
+RDMA ranks 表示)、每个 domain 八个 scale-up ranks;GB EP16 仍是 MNNVL scale-up,因此继续
+使用 LSA。隔离构建会记录 API、source、loaded libraries、generated JIT source、executable
+SASS 与 raw CUBIN diagnostics。当前 H100 runner pool 被明确标记为 V2 unsupported,因为 NCCL
+2.30.4 报告其 EP8 communicator 不具备 Device API symmetric-memory 支持;只有该 pool 的
+runtime 支持全 rank CUDA P2P/LSA 后才能重新启用。其他 NVIDIA SKU 在 GPU outcome 通过 native
+correctness 和 publication gates 前仍为 unvalidated。
+
+v1 已移除的轴包括 cached-layout `[cl]`、runtime-visible `[rv]`、FP8、quantized combine、
+额外 routing distributions、activation profiles、uneven allocation、placement permutations、
+model envelopes 和 scaling studies。
+
+## Workflow 与产物
+
+`.github/workflows/collectivex-sweep.yml` 生成 public-SKU matrix,提取严格且被忽略的
+`.shards/<shard-id>.json` control,每个 shard 执行一次 allocation,对结果 JSON 做隐私检查并上传
+raw GitHub artifacts。运行默认使用 `release_tag=unversioned`,仅供诊断。V1 运行必须显式选择
+`release_tag=v1`;setup 随后要求固定的完整 matrix digest,并生成绑定 run、attempt 与 source 的
+`cxrelease-v1-*` marker。Partial 或 filtered 运行无法获得该 marker。
+
+`.github/workflows/collectivex-publish.yml` 是显式的 V1 gate。它只接受三个来自同一 source SHA、
+成功且带 V1 tag 的 sweep run IDs,重新校验 GitHub metadata 与 release markers,并在 runner 本地
+可丢弃工作区中执行 `publisher.py`。只有完整通过 promotion、隐私检查和内容寻址的数据集才会以
+`cxpublication-v1-*` 上传;raw artifacts 与 publisher private workspace 永不暴露给前端。
+
+系统不需要 results server、attached store、Vercel storage、GCP、Neon、managed database 或
+managed object store。前端使用已有的 server-side `GITHUB_TOKEN`,即时发现最新成功且按版本隔离
+的 publication workflow,下载其 NDJSON artifact,校验 ZIP layout、UTF-8/NDJSON 结构、schema、
+promotion 状态与 SHA-256,随后提供带版本的 channel URL 和 immutable dataset URL。UI 保留显式
+benchmark-version selector;V2 及后续版本必须使用独立的 release tag 与 publication identity。
+完整 validation contract 见 [docs/methodology_zh.md](docs/methodology_zh.md)。
+
+## Runner 配置
+
+Runner 本地 Slurm 和 storage 值使用严格的 per-SKU JSON 文档,路径为
+`$XDG_CONFIG_HOME/inferencex/collectivex.json` 或 `COLLECTIVEX_OPERATOR_CONFIG`。该 mode-0600、
+同 owner、非 symlink 文件位于 checkout 外且永不上传。未知 runners、fields、duplicate keys、
+endpoint literals、unsafe paths 和非 JSON 输入均 fail closed;配置绝不作为 shell 执行。GHA
+仅将加密的 `COLLECTIVEX_OPERATOR_CONFIG_V1` 内容传给 launcher;launcher 验证后只导出所选
+SKU 的 allowlisted values,并在 allocation 前删除临时副本。必需 JSON fields 如下:
+
+| SKU | 变量 |
+|---|---|
+| `h100-dgxc`, `b200-dgxc` | `partition`, `account`, `squash_dir`, `stage_dir` |
+| `h200-dgxc` | `partition`, `squash_dir`, `stage_dir` |
+| `b300` | `partition`, `account`, `squash_dir`, `stage_dir` |
+| `gb200` | `partition`, `account`, 有序 `storage_roots` |
+| `gb300` | `partition`, `account`, `squash_dir`, `stage_dir`, `enroot_cache_path` |
+| `mi325x`, `mi355x` | `partition`, `squash_dir`, `stage_dir` |
+
+每个已选中的非 MNNVL EP16 placement 还必须提供 `socket_ifname` 和 `rdma_devices`,用来指定
+operator 审核过的 fabric;还可配置 allowlisted
+`ib_gid_index` 与 `rdma_service_level`。CollectiveX 不会通过启发式规则选择 management route 或
+HCA。Allocation 完成后,每个非 MNNVL scale-out 节点都必须证明所有已配置 interface 与 active
+HCA port 存在,之后才允许初始化 backend。Scale-up 和 MNNVL job 会清除这些 overrides。
+Scale-out NCCL/RCCL 固定使用 `IB` 与精确匹配的 HCA selectors;如果无法使用 RDMA,job 会失败,
+而不会回退到 socket 后仍被错误标记为 RDMA。
+
+`stage_dir` 必须是 checkout 与 workflow workspace 外预创建且由 runner owner 持有的 base,
+不能经过 symlink,group 和 world 都不能写入,并且 runner 与所有 allocation 节点必须以相同路径
+访问。Job 只创建带 marker 的 mode-0700 execution child,验证跨节点读写可见性,并在
+allocation teardown 后只删除该 child;不会挂载 runner checkout,也不会在 AMD image storage
+下创建 stage。
+
+导入前,每个 Docker Hub tag 都通过有界 registry requests 解析,并且必须匹配固定 digest;拒绝
+digest-qualified overrides。Enroot imports 使用固定 filesystem epoch 和带版本、绑定 registry
+digest 的 cache key。每个已挂载 squash 都重新计算 hash,同时记录 verified registry digest 和
+local squash hash。镜像提供的 DeepEP 会按精确 wheel 和 installed-file fingerprints 检查;
+source-built backends 使用固定 commits 和 runtime-verified GPU targets。DeepEP V2 的 mode-0700
+cluster-local build cache 由版本化 build recipe、verified image、architecture、upstream
+trees 和 dependency pins 共同寻址;container 只看到固定的 `/cx-cache` mount,且该 cache 永不
+进入 result artifacts。
+固定的 V2 与 Hybrid source 在每个 workflow 中只获取一次。每个 job 都会验证完整 archive,仅
+提取自身精确 backend root,只允许指向 archive 内 regular file 的受限相对 leaf symlink,并在
+staging 前重新核对 Git tree 与 submodule pins。
+Compute containers 仅接收显式 environment allowlist。Private host、address、device、NIC、
+credential、workspace 和 path 数据只保留在加密配置、忽略的 operator notes 或有界 mode-0600
+runner logs 中,永不上传。
+
+## 本地检查
+
+```bash
+uv run --with-requirements experimental/CollectiveX/requirements.txt \
+ python -m unittest discover experimental/CollectiveX/tests -p 'test_*.py'
+uv run --with-requirements experimental/CollectiveX/requirements.txt \
+ python experimental/CollectiveX/sweep_matrix.py --backends all --out /tmp/cx-matrix.json >/dev/null
+uv run --with-requirements experimental/CollectiveX/requirements.txt \
+ python experimental/CollectiveX/publisher.py --store-root "$COLLECTIVEX_STORE_ROOT" verify
+bash -n experimental/CollectiveX/runtime/*.sh experimental/CollectiveX/launchers/*.sh
+```
+
+核心路径为 `capability.py`、`configs/`、`contracts.py`、`schemas/`、`sweep_matrix.py`、
+`publisher.py`、`runtime/`、`launchers/` 和 `tests/`。
diff --git a/experimental/CollectiveX/artifact_safety.py b/experimental/CollectiveX/artifact_safety.py
new file mode 100644
index 0000000000..83d522fba8
--- /dev/null
+++ b/experimental/CollectiveX/artifact_safety.py
@@ -0,0 +1,212 @@
+#!/usr/bin/env python3
+"""Fail-closed privacy check for CollectiveX public result documents."""
+from __future__ import annotations
+
+import argparse
+import ipaddress
+import json
+import os
+import re
+import stat
+
+
+SENSITIVE_FIELDS = frozenset({
+ "environment", "env", "host", "hostname", "uuid", "gpu_uuid", "device_uuid",
+ "pci_bus_id", "ip_address", "ip_addresses", "master_addr", "ssh", "ssh_target",
+ "nodelist", "node_list", "nic_guid", "ib_guid", "topology_matrix", "rdma_devices",
+ "user", "username", "password", "passwd", "secret", "token", "access_token",
+ "api_token", "auth_token", "api_key", "private_key", "credential", "credentials",
+ "address", "addresses", "ip", "ips",
+})
+SENSITIVE_FIELDS_COMPACT = frozenset(item.replace("_", "") for item in SENSITIVE_FIELDS)
+SENSITIVE_FIELD_SUFFIXES = (
+ "_host", "_hostname", "_address", "_addresses", "_path", "_paths", "_ip", "_ips",
+ "_password", "_passwd", "_secret", "_token", "_credential", "_credentials",
+ "_uuid", "_guid", "_bus_id",
+)
+SENSITIVE_VALUE_PATTERNS = (
+ ("private-path", re.compile(
+ r"(? str:
+ normalized = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1_\2", str(value).strip())
+ normalized = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", normalized)
+ return normalized.lower().replace("-", "_")
+
+
+def _sensitive_value_rule(value: str, *, contextual: bool = True) -> str | None:
+    """Return the name of the first sensitive-value rule that ``value`` trips.
+
+    The named patterns in ``SENSITIVE_VALUE_PATTERNS`` are tried in order. When
+    ``contextual`` is false, rules listed in ``CONTEXTUAL_VALUE_RULES`` are
+    skipped. If no named pattern yields a rule, every ``IPV6_CANDIDATE`` match
+    is parsed with ``ipaddress`` and ``"ipv6-address"`` is returned for the
+    first one that is a valid IPv6 address. Returns ``None`` when nothing
+    matches.
+    """
+    for rule_name, rule_pattern in SENSITIVE_VALUE_PATTERNS:
+        if not contextual and rule_name in CONTEXTUAL_VALUE_RULES:
+            continue
+        if rule_pattern.search(value):
+            # Only the first matching pattern is ever considered.
+            if rule_name:
+                return rule_name
+            break
+    for candidate in IPV6_CANDIDATE.findall(value):
+        # Parse only the text before any "%" suffix.
+        literal, _, _ = candidate.partition("%")
+        try:
+            parsed = ipaddress.ip_address(literal)
+        except ValueError:
+            continue
+        if parsed.version == 6:
+            return "ipv6-address"
+    return None
+
+
+def assert_publication_safe(docs: list[dict]) -> None:
+    """Raise ``ArtifactSafetyError`` if any document exposes private data.
+
+    Every document must be a JSON object. Dictionary keys are normalized and
+    rejected when they name a sensitive field (exact, underscore-free, or
+    suffix match) or when the raw key itself trips a sensitive-value rule.
+    String values are checked against the same value rules; strings whose
+    nearest enclosing key normalizes to ``ref`` skip the contextual rules.
+    Error messages carry only the document index and, where relevant, the
+    rule name.
+    """
+    def forbidden_field(field: str) -> bool:
+        if field in SENSITIVE_FIELDS:
+            return True
+        if field.replace("_", "") in SENSITIVE_FIELDS_COMPACT:
+            return True
+        return field.endswith(SENSITIVE_FIELD_SUFFIXES)
+
+    def walk(value, doc_index: int, parent_field: str | None = None) -> None:
+        if isinstance(value, dict):
+            for key, child in value.items():
+                field = _normalized_field(key)
+                if forbidden_field(field):
+                    raise ArtifactSafetyError(
+                        f"artifact safety: doc[{doc_index}] contains forbidden private field"
+                    )
+                key_rule = _sensitive_value_rule(str(key))
+                if key_rule:
+                    raise ArtifactSafetyError(
+                        f"artifact safety: doc[{doc_index}] contains forbidden {key_rule} key"
+                    )
+                walk(child, doc_index, field)
+            return
+        if isinstance(value, list):
+            for item in value:
+                # List items inherit the field name of the enclosing key.
+                walk(item, doc_index, parent_field)
+            return
+        if not isinstance(value, str):
+            return
+        rule = _sensitive_value_rule(value, contextual=parent_field != "ref")
+        if rule:
+            raise ArtifactSafetyError(
+                f"artifact safety: doc[{doc_index}] contains forbidden {rule} value"
+            )
+
+    for position, document in enumerate(docs):
+        if not isinstance(document, dict):
+            raise ArtifactSafetyError(f"artifact safety: doc[{position}] is not a JSON object")
+        walk(document, position)
+
+
+def _unique_json_object(pairs: list[tuple[str, object]]) -> dict:
+    """Build a JSON object from ``pairs``, rejecting repeated keys.
+
+    ``json`` otherwise keeps only the last value for a repeated key, so an
+    earlier value would remain in the file on disk without ever being
+    inspected by the safety walk.
+    """
+    merged: dict = {}
+    for key, value in pairs:
+        if key in merged:
+            # The key itself is deliberately not echoed into the message.
+            raise ArtifactSafetyError("artifact safety: result file repeats a JSON object key")
+        merged[key] = value
+    return merged
+
+
+def load_documents(paths: list[str]) -> list[dict]:
+    """Read every result file in ``paths`` and return the parsed documents.
+
+    Each path must be a non-empty regular file owned by the current user and
+    no larger than ``MAX_INPUT_BYTES``. The file is opened with ``O_NOFOLLOW``
+    where the platform provides it, and the opened descriptor must match the
+    device, inode, and size seen before opening. Files ending in ``.ndjson``
+    contribute one document per non-blank line; any other file contributes a
+    single document. JSON objects that repeat a key are rejected.
+
+    Raises:
+        ArtifactSafetyError: if a file is unavailable, changed while being
+            opened, unreadable, malformed, repeats an object key, or if no
+            documents were found at all.
+    """
+    docs: list[dict] = []
+    for path in paths:
+        try:
+            metadata = os.lstat(path)
+        except OSError as exc:
+            raise ArtifactSafetyError("artifact safety: result file is unavailable") from exc
+        if (
+            not stat.S_ISREG(metadata.st_mode)
+            or metadata.st_uid != os.getuid()
+            or metadata.st_size <= 0
+            or metadata.st_size > MAX_INPUT_BYTES
+        ):
+            raise ArtifactSafetyError("artifact safety: result file is unavailable")
+        descriptor = -1
+        try:
+            descriptor = os.open(path, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0))
+            opened = os.fstat(descriptor)
+            # Compare against the lstat() result so a file swapped in between
+            # the two calls is refused.
+            if (
+                not stat.S_ISREG(opened.st_mode)
+                or (opened.st_dev, opened.st_ino, opened.st_size)
+                != (metadata.st_dev, metadata.st_ino, metadata.st_size)
+            ):
+                raise ArtifactSafetyError("artifact safety: result file changed during open")
+            with os.fdopen(descriptor, encoding="utf-8") as fh:
+                # The file object now owns the descriptor; skip the manual close.
+                descriptor = -1
+                if path.endswith(".ndjson"):
+                    for line_number, line in enumerate(fh, 1):
+                        if not line.strip():
+                            continue
+                        try:
+                            docs.append(
+                                json.loads(line, object_pairs_hook=_unique_json_object)
+                            )
+                        except json.JSONDecodeError as exc:
+                            raise ArtifactSafetyError(
+                                f"artifact safety: malformed NDJSON at input line {line_number}"
+                            ) from exc
+                else:
+                    docs.append(json.load(fh, object_pairs_hook=_unique_json_object))
+        except json.JSONDecodeError as exc:
+            raise ArtifactSafetyError("artifact safety: malformed JSON input") from exc
+        except (OSError, UnicodeError) as exc:
+            raise ArtifactSafetyError("artifact safety: result file is unreadable") from exc
+        finally:
+            if descriptor >= 0:
+                os.close(descriptor)
+    if not docs:
+        raise ArtifactSafetyError("artifact safety: no public result documents found")
+    return docs
+
+
+def main() -> int:
+    """Command-line entry point: load the given result files and check them.
+
+    Any ``ArtifactSafetyError`` is reported through ``ArgumentParser.error``,
+    which prints the message and exits. On success a one-line summary is
+    printed and 0 is returned.
+    """
+    cli = argparse.ArgumentParser(
+        description="Check CollectiveX result artifacts for private data"
+    )
+    cli.add_argument("paths", nargs="+")
+    options = cli.parse_args()
+    try:
+        checked = load_documents(options.paths)
+        assert_publication_safe(checked)
+    except ArtifactSafetyError as failure:
+        cli.error(str(failure))
+    summary = f"artifact safety: {len(checked)} public document(s) passed"
+    print(summary)
+    return 0
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/experimental/CollectiveX/capability.py b/experimental/CollectiveX/capability.py
new file mode 100644
index 0000000000..a9431a86b4
--- /dev/null
+++ b/experimental/CollectiveX/capability.py
@@ -0,0 +1,232 @@
+#!/usr/bin/env python3
+"""Public runner and backend capability registry for CollectiveX v1."""
+
+from __future__ import annotations
+
+import re
+from typing import Any
+
+
+# Pinned DeepEP v2 commit; also published as the "commit" of the "deepep-v2" entry in BACKENDS.
+DEEPEP_V2_COMMIT = "fa8a9b16898204afd347c663b89e65ef87dc6ce6"
+# Per-SKU deepep-v2 schedulability.  resolve() rejects a SKU whose "schedulable" is False and
+# echoes the "basis" tag in its failure reason.
+DEEPEP_V2_SKU_CAPABILITIES = {
+ "h100-dgxc": {
+ "schedulable": False,
+ "basis": "current-runner-nccl-device-api-symmetric-memory-unavailable",
+ },
+ "h200-dgxc": {"schedulable": True, "basis": "upstream-sm90-requirement"},
+ "b200-dgxc": {"schedulable": True, "basis": "upstream-sm100-result"},
+ "gb200": {"schedulable": True, "basis": "upstream-sm100-result"},
+ "b300": {"schedulable": True, "basis": "pinned-pr605-pr630-sm103-maps-sm100f"},
+ "gb300": {"schedulable": True, "basis": "pinned-pr605-pr630-sm103-maps-sm100f"},
+ "mi325x": {"schedulable": False, "basis": "nvidia-only"},
+ "mi355x": {"schedulable": False, "basis": "nvidia-only"},
+}
+
+
+def _topologies(
+    product: str, *, gpus_per_node: int, scale_up_domain: int, scale_up_transport: str
+) -> dict[int, dict[str, Any]]:
+    """Build the EP8 and EP16 topology records for one product.
+
+    EP8 is always recorded as scale-up.  EP16 is scale-up only when the
+    scale-up domain holds at least 16 GPUs; otherwise it is recorded as
+    scale-out with an ``rdma`` scale-out transport.
+    """
+    if scale_up_transport == "mnnvl":
+        island_class = f"{product}-nvl72-mnnvl"
+    elif scale_up_transport == "xgmi":
+        island_class = f"{product}-xgmi"
+    else:
+        island_class = f"{product}-{scale_up_transport}-island"
+
+    def record(degree: int, in_domain: bool) -> dict[str, Any]:
+        # in_domain: the whole EP group is served by the scale-up fabric alone.
+        return {
+            "nodes": degree // gpus_per_node,
+            "gpus_per_node": gpus_per_node,
+            "scale_up_domain": scale_up_domain,
+            "scope": "scale-up" if in_domain else "scale-out",
+            "scale_up_transport": scale_up_transport,
+            "scale_out_transport": None if in_domain else "rdma",
+            "transport": (
+                scale_up_transport if in_domain else f"{scale_up_transport}-rdma"
+            ),
+            "topology_class": (
+                island_class if in_domain else f"{product}-{scale_up_transport}-rdma"
+            ),
+        }
+
+    # The EP8 record never consults scale_up_domain; only EP16 depends on it.
+    return {8: record(8, True), 16: record(16, scale_up_domain >= 16)}
+
+
+def _platform(
+    *, vendor: str, arch: str, machine: str, product: str, gpus_per_node: int,
+    scale_up_domain: int, scale_up_transport: str, launcher: str,
+) -> dict[str, Any]:
+    """Assemble one platform registry record, including its per-EP topologies.
+
+    The top-level ``transport`` and ``topology_class`` mirror the EP8 topology.
+    """
+    by_degree = _topologies(
+        product,
+        gpus_per_node=gpus_per_node,
+        scale_up_domain=scale_up_domain,
+        scale_up_transport=scale_up_transport,
+    )
+    default_topology = by_degree[8]
+    record: dict[str, Any] = {
+        "vendor": vendor,
+        "arch": arch,
+        "machine": machine,
+        "product": product,
+    }
+    # EP8 defaults remain while downstream readers migrate to per-EP records.
+    record["transport"] = default_topology["transport"]
+    record["topology_class"] = default_topology["topology_class"]
+    record["gpus_per_node"] = gpus_per_node
+    record["scale_up_domain"] = scale_up_domain
+    record["ep_degrees"] = tuple(by_degree)
+    record["topologies"] = by_degree
+    record["launcher"] = launcher
+    return record
+
+
+# Runner-label registry.  Every entry is built by _platform(), so it carries the EP8 and EP16
+# topology records produced by _topologies() alongside the identity fields given here.
+PLATFORMS = {
+ "h100-dgxc": _platform(
+ vendor="nvidia", arch="sm90", machine="amd64", product="h100",
+ gpus_per_node=8, scale_up_domain=8, scale_up_transport="nvlink",
+ launcher="single-slurm",
+ ),
+ "h200-dgxc": _platform(
+ vendor="nvidia", arch="sm90", machine="amd64", product="h200",
+ gpus_per_node=8, scale_up_domain=8, scale_up_transport="nvlink",
+ launcher="single-slurm",
+ ),
+ "b200-dgxc": _platform(
+ vendor="nvidia", arch="sm100", machine="amd64", product="b200",
+ gpus_per_node=8, scale_up_domain=8, scale_up_transport="nvlink",
+ launcher="single-slurm",
+ ),
+ "b300": _platform(
+ vendor="nvidia", arch="sm103", machine="amd64", product="b300",
+ gpus_per_node=8, scale_up_domain=8, scale_up_transport="nvlink",
+ launcher="single-slurm",
+ ),
+ "gb200": _platform(
+ vendor="nvidia", arch="sm100", machine="arm64", product="gb200",
+ gpus_per_node=4, scale_up_domain=72, scale_up_transport="mnnvl",
+ launcher="gb-nv",
+ ),
+ "gb300": _platform(
+ vendor="nvidia", arch="sm103", machine="arm64", product="gb300",
+ gpus_per_node=4, scale_up_domain=72, scale_up_transport="mnnvl",
+ launcher="gb-nv",
+ ),
+ "mi325x": _platform(
+ vendor="amd", arch="gfx942", machine="amd64", product="mi325x",
+ gpus_per_node=8, scale_up_domain=8, scale_up_transport="xgmi",
+ launcher="mi-amds",
+ ),
+ "mi355x": _platform(
+ vendor="amd", arch="gfx950", machine="amd64", product="mi355x",
+ gpus_per_node=8, scale_up_domain=8, scale_up_transport="xgmi",
+ launcher="mi-amds",
+ ),
+}
+
+# Backend registry read by resolve().  "vendors" is looked up unconditionally; "machines",
+# "excluded_skus" and "sku_capabilities" are optional restrictions read with .get().
+BACKENDS = {
+ "deepep": {"vendors": {"nvidia"}},
+ "deepep-v2": {
+ "vendors": {"nvidia"},
+ "implementation": "deep_ep.ElasticBuffer",
+ "source": "deepseek-ai/DeepEP#605+#630",
+ "commit": DEEPEP_V2_COMMIT,
+ "communication_backend": "nccl-device-lsa",
+ "torch": "2.10.0+cu130",
+ "nccl": "2.30.4",
+ "sku_capabilities": DEEPEP_V2_SKU_CAPABILITIES,
+ },
+ "uccl": {
+ "vendors": {"nvidia"},
+ "machines": {"amd64"},
+ "excluded_skus": {"b200-dgxc", "b300"},
+ },
+ "deepep-hybrid": {"vendors": {"nvidia"}},
+ "mori": {"vendors": {"amd"}},
+ "nccl-ep": {"vendors": {"nvidia", "amd"}},
+}
+# Backend names in BACKENDS declaration order.
+SWEEP_BACKENDS = tuple(BACKENDS)
+
+
+def runtime_identity_issues(
+    sku: str, *, vendor: str, arch: str, machine: str, device_name: str,
+    device_count: int, world_size: int,
+) -> list[str]:
+    """Compare the observed runtime identity with the registered platform for ``sku``.
+
+    Returns human-readable mismatch descriptions; the list is empty when
+    everything matches.  An unregistered ``sku`` yields a single entry.
+    """
+    registered = PLATFORMS.get(sku)
+    if registered is None:
+        return [f"unknown runner identity {sku!r}"]
+
+    observed_identity = {"vendor": vendor, "arch": arch, "machine": machine}
+    problems = [
+        f"{field}={observed!r}, expected {registered[field]!r}"
+        for field, observed in observed_identity.items()
+        if observed != registered[field]
+    ]
+
+    # Product tokens look like letters+digits(+letters), e.g. "h100" or "mi325x".
+    product_tokens = set(re.findall(r"[a-z]+\d+[a-z]*", device_name.lower()))
+    if registered["product"] not in product_tokens:
+        problems.append(
+            f"device product {device_name!r} does not identify {registered['product']}"
+        )
+
+    expected_gpus = registered["gpus_per_node"]
+    if device_count != expected_gpus:
+        problems.append(
+            f"visible GPUs={device_count}, expected {expected_gpus} per node"
+        )
+
+    if world_size not in registered["ep_degrees"]:
+        problems.append(f"EP{world_size} is not registered for {sku}")
+    return problems
+
+
+def topology_for(sku: str, ep: int) -> dict[str, Any] | None:
+    """Look up the registered topology record for ``sku`` at EP degree ``ep``.
+
+    Returns ``None`` when either the SKU or the EP degree is not registered.
+    """
+    registered = PLATFORMS.get(sku)
+    return None if registered is None else registered["topologies"].get(ep)
+
+
+def resolve(sku: str, backend: str, *, ep: int | None = None, nodes: int | None = None,
+            routing: str = "uniform", eplb: bool = False,
+            mode: str = "normal") -> tuple[bool, str]:
+    """Decide whether one SKU/backend case is runnable.
+
+    Returns ``(True, "ok")`` or ``(False, reason)``.  When ``ep`` is omitted it
+    is taken from ``nodes`` (which must then match exactly one registered EP
+    degree) or, failing that, from the platform's first registered EP degree.
+    """
+    platform = PLATFORMS.get(sku)
+    implementation = BACKENDS.get(backend)
+    if platform is None:
+        return False, f"unknown GHA runner label {sku!r}"
+    if implementation is None:
+        return False, f"unknown backend {backend!r}"
+    if mode not in {"normal", "low-latency"}:
+        return False, f"unknown benchmark mode {mode!r}"
+    if mode == "low-latency" and backend not in {"deepep", "uccl"}:
+        return False, f"{backend} has no distinct low-latency API"
+
+    if ep is None and nodes is None:
+        ep = platform["ep_degrees"][0]
+    elif ep is None:
+        candidates = [
+            degree for degree, candidate in platform["topologies"].items()
+            if candidate["nodes"] == nodes
+        ]
+        if len(candidates) != 1:
+            return False, f"{sku} does not register a unique {nodes}-node EP degree"
+        (ep,) = candidates
+
+    topology = topology_for(sku, ep)
+    if topology is None or (nodes is not None and nodes != topology["nodes"]):
+        return False, f"{sku} does not register EP{ep} on {nodes} nodes"
+
+    routing_allowed = routing in {"uniform", "zipf"} and not (eplb and routing != "zipf")
+    if not routing_allowed:
+        return False, "v1 routing is uniform or zipf, with EPLB only on zipf"
+
+    vendor, machine = platform["vendor"], platform["machine"]
+    if vendor not in implementation["vendors"]:
+        return False, f"{backend} does not support {vendor}"
+    sku_capability = implementation.get("sku_capabilities", {}).get(sku)
+    if sku_capability is not None and not sku_capability["schedulable"]:
+        return False, f"{backend} is unsupported on {sku}: {sku_capability['basis']}"
+    # A backend without a "machines" restriction accepts the platform's own machine.
+    if machine not in implementation.get("machines", {machine}):
+        return False, f"{backend} does not support {machine}"
+    if sku in implementation.get("excluded_skus", set()):
+        return False, f"{backend} is unavailable on {sku}"
+    return True, "ok"
diff --git a/experimental/CollectiveX/configs/suites.yaml b/experimental/CollectiveX/configs/suites.yaml
new file mode 100644
index 0000000000..1508695637
--- /dev/null
+++ b/experimental/CollectiveX/configs/suites.yaml
@@ -0,0 +1,36 @@
+# CollectiveX v1 comparison suites.
+schema_version: 1
+
+suites:
+ ep-core-v1:
+ mode: normal
+ workloads: [deepseek-v3-v1]
+ platforms: [h100-dgxc, h200-dgxc, b300, b200-dgxc, gb300, gb200, mi355x, mi325x]
+ ep_degrees: [8, 16]
+ routings: [uniform]
+ phases: [decode, prefill]
+ token_points_prefill: [256, 512]
+ required_publication: official
+
+ ep-routing-v1:
+ mode: normal
+ workloads: [deepseek-v3-v1]
+ platforms: [h100-dgxc, h200-dgxc, b300, b200-dgxc, gb300, gb200, mi355x, mi325x]
+ ep_degrees: [8, 16]
+ routings: [zipf]
+ eplb: [false, true]
+ phases: [decode, prefill]
+ token_points_decode: [128]
+ token_points_prefill: [512]
+ required_publication: comparable-experimental
+
+ ep-low-latency-v1:
+ mode: low-latency
+ backends: [deepep, uccl]
+ workloads: [deepseek-v3-v1]
+ platforms: [h100-dgxc, h200-dgxc, b300, b200-dgxc, gb300, gb200, mi355x, mi325x]
+ ep_degrees: [8, 16]
+ routings: [uniform]
+ phases: [decode]
+ token_points_decode: [1, 2, 4, 8, 16, 32, 64, 128]
+ required_publication: official
diff --git a/experimental/CollectiveX/configs/workloads.yaml b/experimental/CollectiveX/configs/workloads.yaml
new file mode 100644
index 0000000000..b5b68334c4
--- /dev/null
+++ b/experimental/CollectiveX/configs/workloads.yaml
@@ -0,0 +1,9 @@
+# CollectiveX v1 canonical workload and phase metadata.
+schema_version: 1
+
+model_derived:
+ deepseek-v3-v1:
+ hidden: 7168
+ topk: 8
+ routed_experts: 256
+ verified_against: "deepseek-ai/DeepSeek-V3@e815299b0bcbac849fa540c768ef21845365c9eb/config.json"
diff --git a/experimental/CollectiveX/contracts.py b/experimental/CollectiveX/contracts.py
new file mode 100644
index 0000000000..fbebe070c8
--- /dev/null
+++ b/experimental/CollectiveX/contracts.py
@@ -0,0 +1,2823 @@
+#!/usr/bin/env python3
+"""Strict native attempt contracts and metric validation for CollectiveX v1."""
+from __future__ import annotations
+
+import argparse
+import datetime as dt
+from functools import lru_cache
+import hashlib
+import json
+import math
+import os
+from pathlib import Path, PurePosixPath
+import re
+import sys
+from typing import Any, Iterable
+
+import artifact_safety
+import capability
+import identity
+
+TESTS = Path(__file__).resolve().parent / "tests"
+sys.path.insert(0, str(TESTS))
+import eplb as eplb_contract # noqa: E402
+import workload as workload_contract # noqa: E402
+
+# Format identifiers for the three v1 document kinds.
+RAW_FORMAT = "collectivex.ep.v1"
+SAMPLES_FORMAT = "collectivex.samples.v1"
+TERMINAL_FORMAT = "collectivex.terminal.v1"
+# Field-name sets; presumably the exact keys of the corresponding records — their consumers
+# are outside this chunk, TODO confirm.
+TERMINAL_CASE_FIELDS = {
+ "backend", "canonical", "eplb", "ep", "experts", "gpus_per_node", "hidden",
+ "ladder", "mode", "nodes", "phase", "required_publication", "routing",
+ "samples_per_point", "scale_out_transport", "scale_up_domain", "scale_up_transport",
+ "scope", "suite", "timing", "topk", "topology_class", "transport",
+ "warmup_semantics", "workload",
+}
+ALLOCATION_FACTOR_FIELDS = {
+ "artifact", "execution_id", "job", "repo", "run_attempt", "run_id", "runner",
+ "source_sha",
+}
+GIT_RUN_FIELDS = {"artifact", "job", "ref", "repo", "run_attempt", "run_id", "source_sha"}
+# Failure-mode key -> public failure reason.
+PRE_EXECUTION_FAILURE_REASONS = {
+ "setup": "launcher-setup-failed",
+ "repository-stage": "repository-staging-failed",
+ "registry-verification": "container-registry-verification-failed",
+ "scheduler-allocation": "scheduler-allocation-failed",
+ "container-import": "container-image-preparation-failed",
+ "container-hash": "container-image-identity-failed",
+ "container-launch": "container-runtime-launch-failed",
+ "backend-setup": "backend-setup-failed",
+ "artifact-collection": "artifact-collection-failed",
+}
+# The pre-execution table extended with four runtime failure modes.
+RUNTIME_FAILURE_REASONS = {
+ **PRE_EXECUTION_FAILURE_REASONS,
+ "runtime-identity": "runtime-identity-mismatch",
+ "timeout": "execution-timeout",
+ "deadlock": "execution-deadlock",
+ "execution": "distributed-command-failed",
+}
+# The same four runtime modes, all mapped to one shared post-emit reason.
+POST_EMIT_FAILURE_REASONS = {
+ mode: "post-emit-distributed-command-failed"
+ for mode in ("runtime-identity", "timeout", "deadlock", "execution")
+}
+CAPABILITY_FAILURE_REASONS = frozenset({
+ "backend-platform-unsupported",
+ "backend-token-capacity",
+})
+# Integer code -> failure-mode key; both 124 and 137 map to "timeout".
+RETURN_CODE_FAILURE_MODES = {
+ 5: "runtime-identity",
+ 124: "timeout",
+ 137: "timeout",
+}
+PERCENTILES = ("p50", "p90", "p95", "p99")
+# Per-phase ladders; the prefill ladder is the decode ladder extended with 256 and 512.
+V1_CONDITIONING_LADDERS = {
+ "decode": (1, 2, 4, 8, 16, 32, 64, 128),
+ "prefill": (1, 2, 4, 8, 16, 32, 64, 128, 256, 512),
+}
+V1_CONDITIONING_ROUNDS_PER_SHAPE = 8
+# Kernel names that _deepep_v2_jit_cubins_are_valid() requires, one cubin record per name.
+DEEPEP_V2_JIT_KERNELS = frozenset({
+ "barrier", "combine", "combine_reduce_epilogue", "dispatch",
+ "dispatch_copy_epilogue",
+})
+# Exact values backend_provenance_issues() requires of deepep-v2 provenance.  The keys are also
+# unpacked into REQUIRED_BACKEND_PROVENANCE["deepep-v2"].
+DEEPEP_V2_V1_PROVENANCE = {
+ "deepep_version": "2.0.0",
+ "deepep_distribution_version": "2.0.0+fa8a9b1",
+ "deepep_commit": "fa8a9b16898204afd347c663b89e65ef87dc6ce6",
+ "deepep_tree": "29809e75c5874e6609dac4804e7b651d5226959f",
+ "deepep_pr": 605,
+ "deepep_fix_pr": 630,
+ "fmt_commit": "a4c7e17133ee9cb6a2f45545f6e974dd3c393efa",
+ "torch_version": "2.10.0+cu130",
+ "nccl_package_version": "2.30.4",
+ "nccl_version": "2.30.4",
+ "nvshmem_package_version": "3.3.9",
+}
+# backend_provenance_issues() requires uccl provenance["uccl_dependency_versions"] to equal this.
+UCCL_DEPENDENCY_VERSIONS = {
+ "intervaltree": "3.1.0",
+ "nvidia-cuda-runtime-cu12": "12.9.79",
+ "sortedcontainers": "2.4.0",
+}
+# Directory named "schemas" next to this module.
+SCHEMA_DIR = Path(__file__).resolve().parent / "schemas"
+# NOTE(review): presumably a cache of loaded schema documents; its reader is outside this chunk.
+_SCHEMA_CACHE: dict[str, dict[str, Any]] = {}
+# Per backend, the provenance fields that must pass _resolved_provenance_value() in
+# backend_provenance_issues().
+REQUIRED_BACKEND_PROVENANCE = {
+ "deepep": (
+ "deepep_version", "deepep_commit", "backend_lineage", "allow_mnnvl",
+ "mnnvl_comm", "mode", "num_nvl_bytes", "num_rdma_bytes",
+ ),
+ "deepep-v2": (
+ *DEEPEP_V2_V1_PROVENANCE, "api_signature_sha256", "loaded_libraries",
+ "jit_cubins", "jit_random_seed", "deterministic", "num_experts",
+ "tuning_num_experts", "allow_hybrid_mode", "gin_enabled",
+ "communication_backend",
+ ),
+ "deepep-hybrid": (
+ "deepep_commit", "deepep_tree", "branch", "backend_lineage",
+ "loaded_libraries", "realized_config", "jit_kernel_keys", "jit_shared_objects",
+ ),
+ "uccl": (
+ "uccl_version", "uccl_commit", "uccl_wrapper_commit", "backend_lineage",
+ "loaded_libraries", "uccl_dependency_versions", "mode", "num_nvl_bytes",
+ "num_rdma_bytes",
+ ),
+ "mori": ("mori_commit",),
+ "nccl-ep": ("nccl_version", "collective_library", "backend_lineage"),
+}
+# NOTE(review): presumably the allow-list of provenance field names; the code that enforces it
+# is outside this chunk — confirm before relying on it.
+PROVENANCE_KEYS = {
+ "allocated_qps", "allow_hybrid_mode", "allow_mnnvl", "allow_multiple_reduction",
+ "api", "api_signature_sha256", "backend", "backend_lineage", "block_num",
+ "block_num_floored", "block_num_target", "branch", "collective_library",
+ "combine_dtype", "combine_warps", "communication_backend", "cuda_version",
+ "deepep_commit", "deepep_distribution_version", "deepep_fix_pr", "deepep_pr", "deepep_tree",
+ "deepep_version", "deterministic", "device_cus",
+ "device_sms", "dispatch_dtype", "dispatch_warps", "enable_sdma", "fmt_commit",
+ "gin_enabled",
+ "gpus_per_node", "heap_size",
+ "impl", "jit_cache_key", "jit_cubins", "jit_kernel_keys", "jit_random_seed",
+ "jit_shared_objects", "kernel_type",
+ "loaded_libraries", "local_experts",
+ "logical_scaleout_ranks",
+ "logical_scaleup_ranks", "mapping_variant", "max_num_inp_token_per_rank",
+ "max_num_tokens", "max_total_recv_tokens", "mnnvl_comm", "mode", "mori_commit",
+ "nccl_communicator", "nccl_package_version", "nccl_version", "num_experts",
+ "nvshmem_package_version",
+ "num_max_tokens_per_rank", "num_nvl_bytes", "num_qps", "num_qps_per_rank",
+ "num_rdma_bytes", "num_sms", "path",
+ "physical_nvlink_ranks", "physical_rdma_ranks", "prefer_overlap_with_compute",
+ "rdma_block_num",
+ "realized_config", "reference_semantics", "requested_num_sms", "resource_mode", "routing_factor",
+ "routing_metadata", "sm_fraction", "top_k",
+ "torch_git_version", "torch_version", "transport", "trtllm", "tuned_source",
+ "tuning_num_experts",
+ "uccl_commit", "uccl_dependency_versions", "uccl_version", "uccl_wrapper_commit",
+ "use_external_inp_buf",
+ "workspace",
+}
+
+
+# Raised by the helpers in this module when an input violates the v1 contract.  It subclasses
+# ValueError, so callers that catch ValueError also catch it.
+class ContractError(ValueError):
+ """A document differs from the native v1 contract."""
+
+
+def scheduled_case_profile(case: dict[str, Any], path: str = "case") -> dict[str, Any]:
+    """Return ``identity.profile_for_case(case)``.
+
+    An ``identity.IdentityError`` is re-raised as ``ContractError`` with
+    ``path`` prefixed to the message and the original error chained.
+    """
+    try:
+        profile = identity.profile_for_case(case)
+    except identity.IdentityError as failure:
+        raise ContractError(f"{path}: {failure}") from failure
+    return profile
+
+
+def resolve_deepep_mnnvl(
+    *, requested: bool, signature_parameters: Iterable[str], deepep_commit: str | None
+) -> tuple[dict[str, bool], str]:
+    """Map an MNNVL request to constructor keyword arguments and a mode label.
+
+    Returns ``({}, "not-requested")`` when MNNVL was not requested, and
+    ``({"allow_mnnvl": True}, "explicit-allow-mnnvl")`` when it was requested
+    and ``allow_mnnvl`` is among ``signature_parameters``.
+
+    Raises:
+        ContractError: MNNVL was requested but ``allow_mnnvl`` is absent.
+    """
+    if requested:
+        if "allow_mnnvl" not in set(signature_parameters):
+            raise ContractError(
+                f"requested DeepEP MNNVL is unsupported by commit {deepep_commit or 'unknown'}"
+            )
+        return {"allow_mnnvl": True}, "explicit-allow-mnnvl"
+    return {}, "not-requested"
+
+
+def collective_kernel_generation(collective_library: Any) -> str:
+    """Return the public NCCL/RCCL implementation lineage.
+
+    Args:
+        collective_library: Value to validate; accepted only when it is the
+            string ``"nccl"`` or ``"rccl"``.
+
+    Returns:
+        ``collective_library`` unchanged.
+
+    Raises:
+        ContractError: For any other value, including non-string values.
+    """
+    # Check the type before the membership test: set membership hashes its operand, so an
+    # unhashable value (list, dict) would otherwise escape as TypeError, not ContractError.
+    if not isinstance(collective_library, str) or collective_library not in {"nccl", "rccl"}:
+        raise ContractError("reference collective library must be nccl or rccl")
+    return collective_library
+
+
+def project_resource_profile(provenance: dict[str, Any]) -> dict[str, Any]:
+    """Derive the cross-backend resource profile record from backend provenance.
+
+    Communication units come from ``num_sms`` ("sm") or, failing that, from
+    ``block_num`` ("cu_block") unless ``kernel_type`` is ``"AsyncLL"``.
+    Persistent bytes are the NVL plus RDMA buffer sizes when either is present,
+    otherwise ``heap_size``.
+    """
+    read = provenance.get
+    device_units = read("device_sms") or read("device_cus")
+
+    kind, configured = None, None
+    if read("num_sms") is not None:
+        kind, configured = "sm", provenance["num_sms"]
+    elif read("block_num") is not None and read("kernel_type") != "AsyncLL":
+        kind, configured = "cu_block", provenance["block_num"]
+
+    if configured and device_units:
+        achieved = configured / device_units
+    else:
+        achieved = None
+
+    tuned_source = str(read("tuned_source", ""))
+    is_fixed = "fixed-kernel" in tuned_source
+    if is_fixed:
+        conformance = "not-applicable"
+    elif "default" in tuned_source:
+        conformance = "backend-default"
+    else:
+        conformance = "best-known"
+
+    nvl_bytes = read("num_nvl_bytes")
+    rdma_bytes = read("num_rdma_bytes")
+    if nvl_bytes is None and rdma_bytes is None:
+        persistent = read("heap_size")
+    else:
+        # A missing (or zero) side contributes nothing to the sum.
+        persistent = (nvl_bytes or 0) + (rdma_bytes or 0)
+
+    return {
+        "achieved_fraction": round(achieved, 4) if achieved else None,
+        "comm_units_kind": kind,
+        "configured_units": configured,
+        "conformance_class": conformance,
+        "device_units": device_units,
+        "fixed_kernel": is_fixed,
+        "nonconforming": False,
+        "pareto_eligible": False,
+        "persistent_bytes": persistent,
+        "qps_per_rank": read("num_qps_per_rank"),
+        "requested_fraction": None,
+        "resource_class": "fixed-kernel" if is_fixed else "backend-tuned",
+        "target_achieved_within_tol": None,
+        "tolerance": 0.10,
+        "tuned_source": read("tuned_source"),
+        "warps_combine": read("combine_warps"),
+        "warps_dispatch": read("dispatch_warps"),
+    }
+
+
+def backend_version(provenance: dict[str, Any]) -> str | None:
+    """Pick the public backend version string from provenance.
+
+    The candidate fields are tried in a fixed priority order; the first one
+    that is not ``None`` and not blank once stringified wins, truncated to 160
+    characters.  Returns ``None`` when no candidate qualifies.
+    """
+    priority = (
+        "deepep_version", "uccl_version", "nccl_version",
+        "mori_commit", "deepep_commit",
+    )
+    candidates = (provenance.get(field) for field in priority)
+    return next(
+        (
+            str(candidate)[:160]
+            for candidate in candidates
+            if candidate is not None and str(candidate).strip()
+        ),
+        None,
+    )
+
+
+def public_series_config(
+    *, kernel_generation: Any, provenance: dict[str, Any],
+    resource_profile: dict[str, Any], resource_mode: Any, device_product: Any,
+) -> dict[str, Any]:
+    """Build the public ``backend`` / ``resource`` / ``system`` configuration record.
+
+    A ``kernel_generation`` of ``"n-a"`` is published as ``None``.  The resource
+    profile is identified by the first 16 characters of its JSON digest, and the
+    system label is the stringified device product truncated to 160 characters.
+    """
+    backend_section = {
+        "generation": None if kernel_generation == "n-a" else kernel_generation,
+        "version": backend_version(provenance),
+    }
+    profile_id = "profile-" + _sha256_json(resource_profile)[:16]
+    resource_section = {
+        "mode": resource_mode,
+        "profile": profile_id,
+        "comm_units_kind": resource_profile.get("comm_units_kind"),
+        "configured_units": resource_profile.get("configured_units"),
+    }
+    system_section = {"label": str(device_product)[:160]}
+    return {
+        "backend": backend_section,
+        "resource": resource_section,
+        "system": system_section,
+    }
+
+
+def public_series_config_sha256(config: dict[str, Any]) -> str:
+    """Return the ``_sha256_json`` digest of a public configuration record."""
+    digest = _sha256_json(config)
+    return digest
+
+
+# Library roles whose records are rewritten by series_provenance() to carry the source tree
+# instead of their other fields.
+SOURCE_BUILT_LIBRARY_ROLES = frozenset(
+    ("deepep-extension", "deepep-hybrid-extension")
+)
+
+
+def series_provenance(provenance: dict[str, Any]) -> dict[str, Any]:
+    """Return a copy of ``provenance`` reduced to its series-level identity.
+
+    Four per-run keys are dropped.  Source-built library records keep only
+    their name and role plus the ``deepep_tree`` they were built from, and JIT
+    cubin records keep only their cache key, SASS digest and source digest.
+    Entries of other shapes pass through unchanged; the input is not mutated.
+    """
+    dropped = {"jit_cache_key", "jit_shared_objects", "path", "sm_fraction"}
+    projected: dict[str, Any] = {}
+    for key, value in provenance.items():
+        if key not in dropped:
+            projected[key] = value
+
+    def project_library(item: Any) -> Any:
+        if isinstance(item, dict) and item.get("role") in SOURCE_BUILT_LIBRARY_ROLES:
+            return {
+                "name": item.get("name"),
+                "role": item.get("role"),
+                "source_tree": provenance.get("deepep_tree"),
+            }
+        return item
+
+    def project_cubin(item: Any) -> Any:
+        if not isinstance(item, dict):
+            return item
+        return {
+            "cache_key": item.get("cache_key"),
+            "sass_sha256": item.get("sass_sha256"),
+            "source_sha256": item.get("source_sha256"),
+        }
+
+    libraries = provenance.get("loaded_libraries")
+    if isinstance(libraries, list):
+        projected["loaded_libraries"] = list(map(project_library, libraries))
+    cubins = provenance.get("jit_cubins")
+    if isinstance(cubins, list):
+        projected["jit_cubins"] = list(map(project_cubin, cubins))
+    return projected
+
+
+def routing_implementation_control_sha256(implementation: dict[str, Any]) -> str:
+    """Digest the parts of an implementation record that must match across routing cohorts.
+
+    The digest covers the kernel generation, the name, the resource profile and
+    the series-level provenance with the treatment-dependent fields removed.
+
+    Raises:
+        ContractError: ``implementation["provenance"]`` is not a dict.
+    """
+    provenance = implementation.get("provenance")
+    if not isinstance(provenance, dict):
+        raise ContractError("implementation provenance is unavailable")
+
+    treatment_fields = {
+        "jit_cache_key", "jit_cubins", "jit_kernel_keys", "jit_shared_objects",
+        "local_experts", "num_experts", "path", "realized_config", "sm_fraction",
+    }
+    controlled: dict[str, Any] = {}
+    for key, value in series_provenance(provenance).items():
+        if key not in treatment_fields:
+            controlled[key] = value
+
+    payload = {
+        "kernel_generation": implementation.get("kernel_generation"),
+        "name": implementation.get("name"),
+        "provenance": controlled,
+        "resource_profile": implementation.get("resource_profile"),
+    }
+    return _sha256_json(payload)
+
+
+def _resolved_provenance_value(field: str, value: Any) -> bool:
+    """Tell whether a provenance value counts as resolved.
+
+    Unresolved values are: ``None``; an empty dict, list, tuple or set; text
+    that is blank or a known placeholder; any text containing
+    ``capture-failed``; and, for ``*_commit`` fields, branch-like placeholders.
+    Text comparisons are made on the stripped, lower-cased ``str(value)``.
+    """
+    if value is None:
+        return False
+    if isinstance(value, (dict, list, tuple, set)) and not value:
+        return False
+
+    text = str(value).strip().lower()
+    placeholders = {"unknown", "none", "null", "n/a", "?", "capture-failed"}
+    if not text or text in placeholders or "capture-failed" in text:
+        return False
+
+    if field.endswith("_commit"):
+        branch_names = {"main", "hybrid-ep", "uccl", "pkg-uccl"}
+        branch_suffixes = ("-unknown", "-none", "-main", "-hybrid-ep")
+        if text in branch_names or text.endswith(branch_suffixes):
+            return False
+    return True
+
+
+def _content_evidence_is_valid(value: Any, required_roles: set[str]) -> bool:
+    """Validate a list of ``{"name", "role", "sha256"}`` content-evidence records.
+
+    The list must be non-empty; every record must have exactly those three
+    keys with well-formed string values, no (role, name) pair may repeat, and
+    every role in ``required_roles`` must occur at least once.
+    """
+    if not isinstance(value, list) or not value:
+        return False
+
+    name_pattern = r"[A-Za-z0-9][A-Za-z0-9_.+-]{0,159}"
+    role_pattern = r"[A-Za-z0-9][A-Za-z0-9_.+-]{0,127}"
+    digest_pattern = r"[0-9a-f]{64}"
+
+    seen_pairs: set[tuple[str, str]] = set()
+    seen_roles: set[str] = set()
+    for entry in value:
+        if not isinstance(entry, dict) or set(entry) != {"name", "role", "sha256"}:
+            return False
+        checks = (
+            (entry["name"], name_pattern),
+            (entry["role"], role_pattern),
+            (entry["sha256"], digest_pattern),
+        )
+        for text, pattern in checks:
+            if not isinstance(text, str) or not re.fullmatch(pattern, text):
+                return False
+        pair = (entry["role"], entry["name"])
+        if pair in seen_pairs:
+            return False
+        seen_pairs.add(pair)
+        seen_roles.add(entry["role"])
+    return required_roles <= seen_roles
+
+
+def _deepep_v2_jit_cubins_are_valid(value: Any) -> bool:
+    """Validate the DeepEP v2 JIT cubin evidence list.
+
+    There must be one record per name in ``DEEPEP_V2_JIT_KERNELS``.  Each record
+    has exactly a ``kernel.<name>.<32 hex>`` cache key and three 64-hex digests.
+    The cache keys must be unique and sorted, and the kernel names taken from
+    them must equal ``DEEPEP_V2_JIT_KERNELS``.
+    """
+    if not isinstance(value, list) or len(value) != len(DEEPEP_V2_JIT_KERNELS):
+        return False
+
+    expected_fields = {"cache_key", "cubin_sha256", "sass_sha256", "source_sha256"}
+    digest_fields = ("cubin_sha256", "sass_sha256", "source_sha256")
+    seen_keys: list[str] = []
+    seen_kernels: set[str] = set()
+    for entry in value:
+        if not isinstance(entry, dict) or set(entry) != expected_fields:
+            return False
+        key = entry["cache_key"]
+        if not isinstance(key, str):
+            return False
+        parsed = re.fullmatch(r"kernel\.([A-Za-z0-9_+-]+)\.[0-9a-f]{32}", key)
+        if parsed is None:
+            return False
+        for field in digest_fields:
+            digest = entry[field]
+            if not isinstance(digest, str) or not re.fullmatch(r"[0-9a-f]{64}", digest):
+                return False
+        seen_keys.append(key)
+        seen_kernels.add(parsed.group(1))
+
+    # Comparing against the sorted de-duplicated list enforces order and uniqueness at once.
+    return (
+        seen_keys == sorted(set(seen_keys))
+        and seen_kernels == DEEPEP_V2_JIT_KERNELS
+    )
+
+
+# Exact key set of a hybrid "realized_config" record.
+HYBRID_REALIZED_CONFIG_FIELDS = {
+    "hidden_dim", "max_num_of_tokens_per_rank", "num_of_experts_per_rank",
+    "num_of_ranks_per_node", "num_of_nodes", "pad_multiple",
+    "num_of_tokens_per_chunk_preprocessing_api",
+    "num_of_threads_per_block_preprocessing_api", "num_of_blocks_preprocessing_api",
+    "num_of_blocks_permute", "num_of_blocks_unpermute", "token_data_type",
+    "num_of_stages_dispatch_api", "num_of_stages_permute_block_dispatch_api",
+    "num_of_in_flight_s2g_dispatch_api",
+    "num_of_in_flight_s2g_permute_block_dispatch_api",
+    "num_of_additional_in_flight_s2g_dispatch_api",
+    "num_of_tokens_per_chunk_dispatch_api", "num_of_blocks_dispatch_api",
+    "forward_dispatch_api", "device_side_sync_dispatch_api",
+    "num_of_stages_g2s_combine_api", "num_of_stages_s2g_combine_api",
+    "num_of_tokens_per_chunk_combine_api", "num_of_tokens_per_group_combine_api",
+    "num_of_blocks_combine_api", "num_of_additional_in_flight_s2g_combine_api",
+    "backward_combine_api", "device_side_sync_combine_api",
+}
+# The subset of those keys whose values must be real booleans.
+HYBRID_REALIZED_BOOL_FIELDS = {
+    "forward_dispatch_api", "device_side_sync_dispatch_api", "backward_combine_api",
+    "device_side_sync_combine_api",
+}
+
+
+def _hybrid_realized_config_is_valid(value: Any) -> bool:
+    """Validate a hybrid realized-config record.
+
+    The key set must equal ``HYBRID_REALIZED_CONFIG_FIELDS``.  Boolean fields
+    must be ``bool``, ``token_data_type`` must be ``"UINT8"`` or ``"UINT16"``,
+    and every other field a non-negative ``int`` (``bool`` is not accepted as
+    an int).  Five sizing fields must additionally be strictly positive.
+    """
+    if not isinstance(value, dict) or set(value) != HYBRID_REALIZED_CONFIG_FIELDS:
+        return False
+
+    for field, field_value in value.items():
+        if field in HYBRID_REALIZED_BOOL_FIELDS:
+            acceptable = type(field_value) is bool
+        elif field == "token_data_type":
+            acceptable = field_value in {"UINT8", "UINT16"}
+        else:
+            acceptable = type(field_value) is int and field_value >= 0
+        if not acceptable:
+            return False
+
+    strictly_positive = (
+        "hidden_dim", "max_num_of_tokens_per_rank", "num_of_experts_per_rank",
+        "num_of_ranks_per_node", "num_of_nodes",
+    )
+    return all(value[field] > 0 for field in strictly_positive)
+
+
+def hybrid_communication_domains(ep_size: int, scale_up_domain: int) -> tuple[int, int]:
+    """Split an EP group into equally sized fabric domains.
+
+    Returns ``(ranks_per_domain, domain_count)``, where ``ranks_per_domain`` is
+    the smaller of the two arguments.
+
+    Raises:
+        ContractError: An argument is not exactly ``int``, is not positive, or
+            ``ep_size`` is not a multiple of ``ranks_per_domain``.
+    """
+    arguments = (ep_size, scale_up_domain)
+    if any(type(argument) is not int for argument in arguments):
+        raise ContractError("hybrid communication topology must be integral")
+    if min(arguments) <= 0:
+        raise ContractError("hybrid communication topology must be positive")
+    ranks_per_domain = min(arguments)
+    domain_count, leftover = divmod(ep_size, ranks_per_domain)
+    if leftover:
+        raise ContractError("hybrid EP size does not divide into communication domains")
+    return ranks_per_domain, domain_count
+
+
+def _hybrid_kernel_keys_are_valid(value: Any) -> bool:
+    """Validate the hybrid JIT kernel-key list.
+
+    Args:
+        value: Candidate taken from untrusted provenance; may be of any type.
+
+    Returns:
+        ``True`` only for a list of exactly three distinct, ascending strings
+        that each match the key pattern; ``False`` for anything else.  Malformed
+        input never raises.
+    """
+    if not isinstance(value, list) or len(value) != 3:
+        return False
+    # Validate element types before hashing or sorting: set() raises TypeError on unhashable
+    # elements and sorted() raises it on mixed types, which would turn a malformed document
+    # into a crash instead of a validation failure.
+    if not all(
+        isinstance(key, str)
+        and re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9_.+-]{0,511}", key)
+        for key in value
+    ):
+        return False
+    return len(set(value)) == 3 and value == sorted(value)
+
+
+def _hybrid_jit_evidence_is_valid(value: Any, kernel_keys: Any) -> bool:
+    """Validate per-kernel, per-rank JIT shared-object evidence.
+
+    ``value`` must hold one ``{"kernel_key", "rank_artifacts"}`` record per
+    entry of ``kernel_keys``, in the same order.  Each record lists
+    ``{"bytes", "rank", "sha256"}`` artifacts for ranks ``0..n-1`` in order, and
+    every kernel must cover the same ranks.
+    """
+    if not _hybrid_kernel_keys_are_valid(kernel_keys) or not isinstance(value, list):
+        return False
+    if len(value) != len(kernel_keys):
+        return False
+
+    def artifact_rank(artifact: Any) -> int | None:
+        # Return the artifact's rank, or None when the artifact is malformed.
+        if not isinstance(artifact, dict) or set(artifact) != {"bytes", "rank", "sha256"}:
+            return None
+        rank, digest, size = artifact["rank"], artifact["sha256"], artifact["bytes"]
+        if type(rank) is not int or rank < 0:
+            return None
+        if not isinstance(digest, str) or not re.fullmatch(r"[0-9a-f]{64}", digest):
+            return None
+        if type(size) is not int or size <= 0:
+            return None
+        return rank
+
+    reference_ranks: list[int] | None = None
+    for expected_key, entry in zip(kernel_keys, value):
+        if not isinstance(entry, dict) or set(entry) != {"kernel_key", "rank_artifacts"}:
+            return False
+        artifacts = entry["rank_artifacts"]
+        if entry["kernel_key"] != expected_key or not isinstance(artifacts, list):
+            return False
+        ranks = []
+        for artifact in artifacts:
+            rank = artifact_rank(artifact)
+            if rank is None:
+                return False
+            ranks.append(rank)
+        # Ranks must be exactly 0, 1, ..., n-1 in that order, with n >= 1.
+        if not ranks or ranks != list(range(len(ranks))):
+            return False
+        if reference_ranks is None:
+            reference_ranks = ranks
+        elif ranks != reference_ranks:
+            return False
+    return True
+
+
+# Return the sorted, de-duplicated names of provenance fields that are either the literal
+# string "unknown" or fail the requirements of `backend`.
+# NOTE(review): leading whitespace is collapsed in this patch text, so the nesting of the
+# checks below cannot be read from indentation — confirm against the repository copy.
+def backend_provenance_issues(backend: str, provenance: dict[str, Any]) -> list[str]:
+ # Any string field, required or not, whose stripped lower-cased value is "unknown".
+ unknown = [
+ field for field, value in provenance.items()
+ if isinstance(value, str) and value.strip().lower() == "unknown"
+ ]
+ # Required fields of this backend that _resolved_provenance_value() rejects; a backend
+ # without an entry has no required fields.
+ unresolved = [
+ field for field in REQUIRED_BACKEND_PROVENANCE.get(backend, ())
+ if not _resolved_provenance_value(field, provenance.get(field))
+ ]
+ # deepep: mnnvl_comm must be one of the two known labels and agree with the boolean
+ # allow_mnnvl; the lineage must be "deepep-v1".
+ if backend == "deepep":
+ mode = provenance.get("mnnvl_comm")
+ allow = provenance.get("allow_mnnvl")
+ valid_modes = {
+ "not-requested": False,
+ "explicit-allow-mnnvl": True,
+ }
+ if type(allow) is not bool or valid_modes.get(mode) is not allow:
+ unresolved.append("mnnvl_comm")
+ if provenance.get("backend_lineage") != "deepep-v1":
+ unresolved.append("backend_lineage")
+ # deepep and uccl: buffer sizes must be non-negative ints, and the mode decides which
+ # buffer has to be non-zero.
+ if backend in {"deepep", "uccl"}:
+ mode = provenance.get("mode")
+ num_nvl_bytes = provenance.get("num_nvl_bytes")
+ num_rdma_bytes = provenance.get("num_rdma_bytes")
+ if mode not in {"normal", "low-latency"}:
+ unresolved.append("mode")
+ if type(num_nvl_bytes) is not int or num_nvl_bytes < 0:
+ unresolved.append("num_nvl_bytes")
+ if type(num_rdma_bytes) is not int or num_rdma_bytes < 0:
+ unresolved.append("num_rdma_bytes")
+ if mode == "normal" and (type(num_nvl_bytes) is not int or num_nvl_bytes <= 0):
+ unresolved.append("num_nvl_bytes")
+ # NOTE(review): presumably the token-capacity and QP checks that follow are scoped to
+ # low-latency mode; the collapsed indentation does not show it — TODO confirm.
+ if mode == "low-latency":
+ if num_nvl_bytes != 0:
+ unresolved.append("num_nvl_bytes")
+ if type(num_rdma_bytes) is not int or num_rdma_bytes <= 0:
+ unresolved.append("num_rdma_bytes")
+ if (
+ type(provenance.get("num_max_tokens_per_rank")) is not int
+ or provenance["num_max_tokens_per_rank"] <= 0
+ ):
+ unresolved.append("num_max_tokens_per_rank")
+ if backend == "deepep" and (
+ type(provenance.get("num_qps_per_rank")) is not int
+ or provenance["num_qps_per_rank"] <= 0
+ ):
+ unresolved.append("num_qps_per_rank")
+ # deepep-v2: positive expert counts, valid JIT cubin evidence, the fixed JIT seed, and
+ # every pinned DEEPEP_V2_V1_PROVENANCE value.
+ if backend == "deepep-v2":
+ for field in ("num_experts", "tuning_num_experts"):
+ if type(provenance.get(field)) is not int or provenance[field] <= 0:
+ unresolved.append(field)
+ if not _deepep_v2_jit_cubins_are_valid(provenance.get("jit_cubins")):
+ unresolved.append("jit_cubins")
+ if provenance.get("jit_random_seed") != "collectivex-deepep-v2-fa8a9b1":
+ unresolved.append("jit_random_seed")
+ unresolved.extend(
+ field for field, expected in DEEPEP_V2_V1_PROVENANCE.items()
+ if provenance.get(field) != expected
+ )
+ # Only two (allow_hybrid_mode, gin_enabled, communication_backend) combinations are
+ # accepted; any other combination flags all three fields.
+ policy = (
+ provenance.get("allow_hybrid_mode"),
+ provenance.get("gin_enabled"),
+ provenance.get("communication_backend"),
+ )
+ if policy not in {
+ (False, False, "nccl-device-lsa"),
+ (True, True, "nccl-gin"),
+ }:
+ unresolved.extend(
+ ("allow_hybrid_mode", "gin_enabled", "communication_backend")
+ )
+ # Backends that must present loaded-library content evidence, with the roles each of
+ # them has to include; other backends skip this check.
+ content_roles = {
+ "deepep-v2": {"deepep-extension", "nccl", "nvshmem"},
+ "deepep-hybrid": {"deepep-extension", "deepep-hybrid-extension"},
+ "uccl": {
+ "uccl-distribution", "uccl-wrapper", "intervaltree-distribution",
+ "sortedcontainers-distribution", "cuda-runtime",
+ },
+ }.get(backend)
+ if content_roles is not None and not _content_evidence_is_valid(
+ provenance.get("loaded_libraries"), content_roles
+ ):
+ unresolved.append("loaded_libraries")
+ # deepep_tree must be exactly 40 lowercase hex characters.
+ if backend in {"deepep-v2", "deepep-hybrid"} and not re.fullmatch(
+ r"[0-9a-f]{40}", str(provenance.get("deepep_tree", ""))
+ ):
+ unresolved.append("deepep_tree")
+ if backend == "deepep-hybrid" and provenance.get("backend_lineage") != "deepep-hybrid":
+ unresolved.append("backend_lineage")
+ # deepep-hybrid: realized config, kernel keys and per-rank JIT evidence are each
+ # validated by their own helper.
+ if backend == "deepep-hybrid":
+ if not _hybrid_realized_config_is_valid(provenance.get("realized_config")):
+ unresolved.append("realized_config")
+ if not _hybrid_kernel_keys_are_valid(provenance.get("jit_kernel_keys")):
+ unresolved.append("jit_kernel_keys")
+ if not _hybrid_jit_evidence_is_valid(
+ provenance.get("jit_shared_objects"), provenance.get("jit_kernel_keys")
+ ):
+ unresolved.append("jit_shared_objects")
+ if backend == "uccl" and provenance.get("backend_lineage") != "uccl":
+ unresolved.append("backend_lineage")
+ # uccl: the dependency versions must equal the pinned mapping exactly.
+ if backend == "uccl" and provenance.get("uccl_dependency_versions") != (
+ UCCL_DEPENDENCY_VERSIONS
+ ):
+ unresolved.append("uccl_dependency_versions")
+ # nccl-ep: the lineage must equal the collective library, which must be nccl or rccl.
+ if backend == "nccl-ep":
+ collective = provenance.get("collective_library")
+ if collective not in {"nccl", "rccl"}:
+ unresolved.append("collective_library")
+ if provenance.get("backend_lineage") != collective:
+ unresolved.append("backend_lineage")
+ # mori with the InterNodeV1 kernel: these fields must hold exactly these values.
+ if backend == "mori" and provenance.get("kernel_type") == "InterNodeV1":
+ expected = {
+ "block_num": 96,
+ "rdma_block_num": 64,
+ "dispatch_warps": 8,
+ "combine_warps": 8,
+ "num_qps": 1,
+ "use_external_inp_buf": True,
+ "gpus_per_node": 8,
+ }
+ unresolved.extend(
+ field for field, value in expected.items()
+ if provenance.get(field) != value
+ )
+ # Type/range checks applied only when the field is present.
+ # NOTE(review): presumably these run for every backend, not only mori — TODO confirm.
+ for field, minimum in (
+ ("num_nvl_bytes", 0), ("num_rdma_bytes", 0),
+ ("num_qps_per_rank", 1),
+ ):
+ if field in provenance and (
+ type(provenance[field]) is not int or provenance[field] < minimum
+ ):
+ unresolved.append(field)
+ if "rdma_block_num" in provenance and (
+ type(provenance["rdma_block_num"]) is not int
+ or provenance["rdma_block_num"] < 0
+ ):
+ unresolved.append("rdma_block_num")
+ if "use_external_inp_buf" in provenance and type(
+ provenance["use_external_inp_buf"]
+ ) is not bool:
+ unresolved.append("use_external_inp_buf")
+ # A field flagged more than once is reported once.
+ return sorted(set(unknown + unresolved))
+
+
def provenance_complete(
    provenance: dict[str, Any], backend: str, git_run: dict[str, Any] | None,
    *, image_digest: Any, image_verified: Any, squash_sha256: Any,
) -> bool:
    """Report whether a case carries complete, well-formed provenance.

    The result is True only when every one of these holds:

    * ``backend_provenance_issues(backend, provenance)`` reports nothing;
    * ``image_verified`` is exactly ``True``;
    * ``image_digest`` has the form ``sha256:`` plus 64 lowercase hex digits;
    * ``squash_sha256`` is 64 lowercase hex digits;
    * ``git_run`` is a dict with a truthy value for every ``GIT_RUN_FIELDS`` key.
    """
    # Falsy inputs (None, "") collapse to "" so they simply fail the patterns.
    image_text = str(image_digest or "")
    squash_text = str(squash_sha256 or "")
    if backend_provenance_issues(backend, provenance):
        return False
    if image_verified is not True:
        return False
    if re.fullmatch(r"sha256:[0-9a-f]{64}", image_text) is None:
        return False
    if re.fullmatch(r"[0-9a-f]{64}", squash_text) is None:
        return False
    if not isinstance(git_run, dict):
        return False
    return all(git_run.get(field) for field in GIT_RUN_FIELDS)
+
+
+def strict_load(path: str | os.PathLike[str]) -> Any:
+ """Load JSON while rejecting duplicate keys and non-finite constants."""
+ def pairs(items):
+ result = {}
+ for key, value in items:
+ if key in result:
+ raise ContractError(f"duplicate JSON key {key!r}")
+ result[key] = value
+ return result
+
+ def constant(value):
+ raise ContractError(f"non-finite JSON number {value}")
+
+ try:
+ with open(path) as handle:
+ return json.load(handle, object_pairs_hook=pairs, parse_constant=constant)
+ except (OSError, json.JSONDecodeError) as exc:
+ raise ContractError(f"invalid JSON {path}: {exc}") from exc
+
+
def canonical_json_bytes(value: Any) -> bytes:
    """Serialize ``value`` to canonical UTF-8 JSON bytes.

    Canonical here means sorted keys, compact ``,``/``:`` separators and no
    ASCII escaping. Non-finite floats are rejected up front by
    ``_finite_tree``; anything ``json.dumps`` or the UTF-8 encoder refuses is
    reported as a ``ContractError``.
    """
    _finite_tree(value)
    try:
        text = json.dumps(
            value,
            separators=(",", ":"),
            sort_keys=True,
            ensure_ascii=False,
            allow_nan=False,
        )
        # Encoding stays inside the try: an encode failure is a ValueError too.
        return text.encode("utf-8")
    except (TypeError, ValueError) as exc:
        raise ContractError(f"value is not canonical JSON: {exc}") from exc
+
+
def content_manifest_evidence(
    *, role: str, name: str, files: Iterable[tuple[str, str | os.PathLike[str]]]
) -> dict[str, str]:
    """Digest a labelled set of files into one evidence record.

    Each ``(label, path)`` pair contributes its byte count and SHA-256; only
    the label, never the filesystem path, enters the manifest. The manifest is
    sorted by label, serialized canonically and hashed.

    Returns:
        ``{"name": name, "role": role, "sha256": <manifest digest>}``.

    Raises:
        ContractError: If ``role`` or ``name`` fails its pattern, a label is
            empty, absolute, contains ``..``, is repeated or has characters
            outside printable ASCII, a source is not a regular file, or no
            files were supplied.
    """
    if re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9_.+-]{0,127}", role) is None:
        raise ContractError("content evidence role is invalid")
    if re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9_.+-]{0,159}", name) is None:
        raise ContractError("content evidence name is invalid")
    entries: list[dict[str, Any]] = []
    seen_labels: set[str] = set()
    for label, source in files:
        logical = PurePosixPath(label)
        label_ok = (
            bool(label)
            and not logical.is_absolute()
            and ".." not in logical.parts
            and label not in seen_labels
            and all(0x20 <= ord(character) <= 0x7E for character in label)
        )
        if not label_ok:
            raise ContractError("content evidence label is invalid or duplicated")
        file_path = Path(source)
        if not file_path.is_file():
            raise ContractError("content evidence source is not a file")
        hasher = hashlib.sha256()
        total = 0
        with file_path.open("rb") as handle:
            # Stream in 1 MiB chunks so large files are never fully resident.
            while chunk := handle.read(1024 * 1024):
                hasher.update(chunk)
                total += len(chunk)
        seen_labels.add(label)
        entries.append({"bytes": total, "label": label, "sha256": hasher.hexdigest()})
    if not entries:
        raise ContractError("content evidence cannot be empty")
    # Sorting by label makes the digest independent of the input order.
    entries.sort(key=lambda item: item["label"])
    manifest_digest = hashlib.sha256(canonical_json_bytes(entries)).hexdigest()
    return {"name": name, "role": role, "sha256": manifest_digest}
+
+
def _obj(value: Any, path: str) -> dict[str, Any]:
    """Return ``value`` unchanged if it is a dict, else raise ``ContractError`` naming ``path``."""
    if isinstance(value, dict):
        return value
    raise ContractError(f"{path} must be an object")
+
+
def _keys(value: Any, expected: set[str], path: str) -> dict[str, Any]:
    """Require ``value`` to be an object whose key set equals ``expected``.

    Returns the object itself. On a mismatch the ``ContractError`` lists the
    missing and the unexpected field names in sorted order.
    """
    obj = _obj(value, path)
    present = set(obj)
    if present == expected:
        return obj
    missing = sorted(expected - present)
    extra = sorted(present - expected)
    raise ContractError(
        f"{path} fields differ: missing={missing}, "
        f"extra={extra}"
    )
+
+
+def _text(value: Any, path: str, *, nullable: bool = False) -> str | None:
+ if nullable and value is None:
+ return None
+ if not isinstance(value, str) or not value:
+ raise ContractError(f"{path} must be a non-empty string")
+ return value
+
+
def _integer(value: Any, path: str, *, minimum: int = 0) -> int:
    """Return ``value`` if it is exactly an ``int`` no smaller than ``minimum``.

    The exact-type test deliberately excludes ``bool`` (an ``int`` subclass)
    and floats; failures raise ``ContractError`` naming ``path``.
    """
    if type(value) is int and value >= minimum:
        return value
    raise ContractError(f"{path} must be an integer >= {minimum}")
+
+
def validate_conditioning_contract(value: Any, phase: str) -> dict[str, Any]:
    """Check a conditioning record against the fixed v1 schedule for ``phase``.

    The record must carry exactly ``contract``, ``ladder`` and
    ``roundtrips_per_shape``. The contract must equal the v1 case profile's
    conditioning contract, the ladder must be a list of ints equal to the
    ladder registered for ``phase``, and the round-trip count must equal
    ``V1_CONDITIONING_ROUNDS_PER_SHAPE``.

    Returns:
        The validated conditioning object.

    Raises:
        ContractError: If ``phase`` is unknown or any field differs.
    """
    if phase not in V1_CONDITIONING_LADDERS:
        raise ContractError("raw conditioning phase is invalid")
    conditioning = _keys(
        value, {"contract", "ladder", "roundtrips_per_shape"},
        "raw.measurement.conditioning",
    )
    ladder = conditioning["ladder"]
    schedule_ok = (
        conditioning["contract"] == identity.V1_CASE_PROFILE["conditioning_contract"]
        and type(ladder) is list
        and all(type(point) is int for point in ladder)
        and ladder == list(V1_CONDITIONING_LADDERS[phase])
    )
    if not schedule_ok:
        raise ContractError(f"raw {phase} conditioning contract differs")
    # Only reached once contract and ladder match; _integer raises its own
    # error when the count is not an int >= 1.
    rounds = _integer(
        conditioning["roundtrips_per_shape"],
        "raw.measurement.conditioning.roundtrips_per_shape",
        minimum=1,
    )
    if rounds != V1_CONDITIONING_ROUNDS_PER_SHAPE:
        raise ContractError(f"raw {phase} conditioning contract differs")
    return conditioning
+
+
+def _number(value: Any, path: str, *, minimum: float | None = None) -> float:
+ if isinstance(value, bool) or not isinstance(value, (int, float)) or not math.isfinite(value):
+ raise ContractError(f"{path} must be finite")
+ result = float(value)
+ if minimum is not None and result < minimum:
+ raise ContractError(f"{path} must be >= {minimum}")
+ return result
+
+
def _finite_tree(value: Any, path: str = "$") -> None:
    """Raise ``ContractError`` if any float inside ``value`` is NaN or infinite.

    Lists and dicts are walked recursively; ``path`` grows with ``[index]``
    and ``.key`` suffixes so the error points at the offending leaf. Other
    container types are not descended into.
    """
    if isinstance(value, float):
        if not math.isfinite(value):
            raise ContractError(f"{path} contains a non-finite number")
        return
    if isinstance(value, list):
        children = ((f"{path}[{index}]", item) for index, item in enumerate(value))
    elif isinstance(value, dict):
        children = ((f"{path}.{key}", item) for key, item in value.items())
    else:
        return
    for child_path, child in children:
        _finite_tree(child, child_path)
+
+
def _typed(value: Any, kind: str, path: str) -> str:
    """Return ``value`` if ``identity.is_typed_id`` accepts it as a ``kind`` ID, else raise ``ContractError``."""
    if identity.is_typed_id(value, kind):
        return value
    raise ContractError(f"{path} is not a {kind} ID")
+
+
def _sha256_json(value: Any) -> str:
    """Return the hex SHA-256 of ``value`` serialized as compact, key-sorted JSON.

    Serialization keeps non-ASCII text unescaped and refuses NaN/Infinity
    (``json.dumps`` raises ``ValueError`` for them).
    """
    text = json.dumps(
        value,
        separators=(",", ":"),
        sort_keys=True,
        ensure_ascii=False,
        allow_nan=False,
    )
    return hashlib.sha256(text.encode()).hexdigest()
+
+
@lru_cache(maxsize=None)
def _expected_eplb_plan(
    routing: str,
    topk: int,
    logical_experts: int,
    physical_experts: int,
    ep_size: int,
    seed: int,
    reference_tokens_per_rank: int,
) -> dict[str, Any]:
    """Rebuild the EPLB plan implied by the canonical reference routing.

    Canonical routing rows are generated for
    ``reference_tokens_per_rank * ep_size`` tokens, the number of selections
    per logical expert is counted, and the counts are handed to
    ``eplb_contract.build_plan``.

    Results are memoized per argument tuple, so the same object is returned
    to every caller with the same arguments.
    """
    total_tokens = reference_tokens_per_rank * ep_size
    indices, _ = workload_contract.canonical_routing_rows(
        total_tokens, logical_experts, topk, routing, seed,
    )
    load = [0] * logical_experts
    for expert in (expert for row in indices for expert in row):
        load[expert] += 1
    return eplb_contract.build_plan(load, physical_experts, ep_size)
+
+
@lru_cache(maxsize=None)
def _expected_canonical_trace(
    routing: str,
    hidden: int,
    topk: int,
    logical_experts: int,
    physical_experts: int,
    ep_size: int,
    tokens_per_rank: int,
    seed: int,
    eplb_enabled: bool,
    reference_tokens_per_rank: int,
) -> tuple[str, dict[str, str], str, list[list[int]], list[list[float]]]:
    """Recompute the canonical member and routing trace for one token point.

    Returns ``(member, checksums, routing_hash, indices, weights)``. When
    ``eplb_enabled`` is set, the expert indices are remapped through the
    expected EPLB plan before the routing hash is taken; ``member`` and
    ``checksums`` always come straight from ``canonical_member``.

    Results are memoized per argument tuple.
    """
    member, checksums, indices, weights = workload_contract.canonical_member(
        routing, hidden, topk, logical_experts, ep_size, tokens_per_rank, seed,
    )
    if eplb_enabled:
        # Positional call on purpose: lru_cache keys positional and keyword
        # invocations separately.
        indices = eplb_contract.remap_rows(
            indices,
            _expected_eplb_plan(
                routing, topk, logical_experts, physical_experts, ep_size, seed,
                reference_tokens_per_rank,
            ),
        )
    trace_hash = workload_contract.trace_checksums(indices, weights)["trace"]
    return member, checksums, trace_hash, indices, weights
+
+
def _coefficient_of_variation(values: list[int]) -> float:
    """Return the population standard deviation of ``values`` divided by their mean.

    A mean of zero yields ``0.0`` rather than a division error. An empty
    list is not handled and raises ``ZeroDivisionError``.
    """
    count = len(values)
    mean = sum(values) / count
    if mean == 0:
        return 0.0
    squared_deviation = sum((value - mean) ** 2 for value in values)
    return (squared_deviation / count) ** 0.5 / mean
+
+
def _expected_routing_summary(
    indices: list[list[int]],
    weights: list[list[float]],
    *,
    physical_experts: int,
    ep_size: int,
    tokens_per_rank: int,
    gpus_per_node: int,
    scale_up_domain: int,
) -> dict[str, Any]:
    """Recompute the routing and load statistics for one trace in pure Python.

    Row ``t`` of ``indices`` holds the expert indices chosen for token ``t``.
    The token's source rank is ``t // tokens_per_rank`` and an expert's owner
    rank is ``expert // (physical_experts // ep_size)``. A token counts one
    payload copy per distinct destination rank and one assignment per
    selected expert. ``weights`` is used only for the trace hash.
    """
    experts_per_rank = physical_experts // ep_size
    per_expert = [0] * physical_experts
    assignments = [0] * ep_size
    payloads = [0] * ep_size
    fanouts: list[int] = []
    local = same_node = same_domain = copies = 0
    for token, row in enumerate(indices):
        owners = [expert // experts_per_rank for expert in row]
        destinations = set(owners)
        source = token // tokens_per_rank
        fanouts.append(len(destinations))
        for expert, owner in zip(row, owners):
            per_expert[expert] += 1
            assignments[owner] += 1
        for destination in destinations:
            payloads[destination] += 1
            copies += 1
            if destination == source:
                local += 1
            if destination // gpus_per_node == source // gpus_per_node:
                same_node += 1
            if destination // scale_up_domain == source // scale_up_domain:
                same_domain += 1
    # Bucket k-1 counts tokens that fan out to exactly k ranks (1..ep_size).
    fanout_histogram = [0] * ep_size
    for fanout in fanouts:
        if 1 <= fanout <= ep_size:
            fanout_histogram[fanout - 1] += 1
    expert_mean = sum(per_expert) / len(per_expert)
    return {
        "empty_expert_count": per_expert.count(0),
        "empty_rank_count": payloads.count(0),
        "expert_assignment_rank_cv": _coefficient_of_variation(assignments),
        "expert_assignments_per_rank": assignments,
        "expert_load_cv": _coefficient_of_variation(per_expert),
        "expert_load_max": max(per_expert),
        "expert_load_mean": expert_mean,
        "expert_load_min": min(per_expert),
        "fanout_histogram": fanout_histogram,
        "fanout_max": max(fanouts),
        "fanout_mean": sum(fanouts) / len(fanouts),
        "fanout_min": min(fanouts),
        "hash": workload_contract.trace_checksums(indices, weights)["trace"],
        "hotspot_ratio": max(per_expert) / expert_mean if expert_mean else 0.0,
        "locality": {
            "placement": "packed",
            "local_rank_fraction": local / copies,
            "same_node_fraction": same_node / copies,
            "same_scaleup_domain_fraction": same_domain / copies,
            "cross_node_fraction": 1 - same_node / copies,
            "cross_domain_fraction": 1 - same_domain / copies,
            "gpus_per_node": gpus_per_node,
            "scale_up_domain": scale_up_domain,
            "copies": copies,
        },
        "payload_copies_per_rank": payloads,
        "payload_rank_cv": _coefficient_of_variation(payloads),
        "routed_copies": copies,
        # Every rank contributes exactly tokens_per_rank source tokens, so
        # these statistics are fixed by construction.
        "source_token_stats": {
            "min": tokens_per_rank,
            "mean": float(tokens_per_rank),
            "max": tokens_per_rank,
            "cv": 0.0,
            "empty_ranks": 0,
            "total": tokens_per_rank * ep_size,
            "ranks": ep_size,
        },
    }
+
+
def _expected_histogram(samples: list[float], bins: int = 40) -> dict[str, Any]:
    """Recompute the equal-width histogram summary for ``samples``.

    When all samples are equal the result keeps the unrounded ``min``/``max``
    and a single count holding every sample. Otherwise samples are spread
    over ``bins`` equal-width buckets (the maximum lands in the last bucket)
    and ``min``/``max`` are rounded to three decimals.
    """
    low = min(samples)
    high = max(samples)
    total = len(samples)
    if high <= low:
        return {"n": total, "min": low, "max": high, "bins": bins, "counts": [total]}
    counts = [0] * bins
    span = high - low
    last = bins - 1
    for sample in samples:
        slot = int((sample - low) / span * bins)
        counts[slot if slot < last else last] += 1
    return {
        "n": total,
        "min": round(low, 3),
        "max": round(high, 3),
        "bins": bins,
        "counts": counts,
    }
+
+
def _expected_anomalies(
    tokens: int, components: dict[str, Any]
) -> list[dict[str, Any]]:
    """Recompute the anomaly records for one token point.

    Two conditions are flagged, in this order:

    * ``roundtrip_gt_isolated_sum``: the round-trip p99 exceeds three times
      the isolated-sum p99 (only when isolated-sum percentiles exist);
    * ``roundtrip_lt_component_floor``: the round-trip p50 is below 95% of
      the larger of the dispatch and combine p50 values (only when both
      components have percentiles and that floor is non-zero).
    """
    dispatch, combine, roundtrip, isolated = (
        components[name]["percentiles_us"]
        for name in ("dispatch", "combine", "roundtrip", "isolated_sum")
    )
    found: list[dict[str, Any]] = []
    if isolated is not None:
        if roundtrip["p99"] > 3.0 * isolated["p99"]:
            found.append({
                "type": "roundtrip_gt_isolated_sum",
                "T": tokens,
                "roundtrip_p99": round(roundtrip["p99"], 2),
                "isolated_sum_p99": round(isolated["p99"], 2),
                "ratio": round(roundtrip["p99"] / isolated["p99"], 2),
                "threshold": 3.0,
            })
    floor = None
    if dispatch and combine:
        floor = max(dispatch["p50"], combine["p50"])
    if floor and roundtrip["p50"] < 0.95 * floor:
        found.append({
            "type": "roundtrip_lt_component_floor",
            "T": tokens,
            "roundtrip_p50": round(roundtrip["p50"], 2),
            "component_floor_p50": round(floor, 2),
        })
    return found
+
+
def _validate_canonical_workload(
    workload: dict[str, Any],
    scheduled_case: dict[str, Any],
    rows: list[dict[str, Any]],
    eplb: dict[str, Any],
) -> None:
    """Tie the reported workload to the traces recomputed from the scheduled case.

    Checks, in order: the EPLB mapping hash (when EPLB is enabled), each
    row's routing hash against its recomputed canonical trace, the member
    list and manifest checksums, and finally the composite workload ID.

    Raises:
        ContractError: On the first of those checks that fails.
    """
    profile = identity.V1_CASE_PROFILE
    if eplb["enabled"]:
        frozen_plan = _expected_eplb_plan(
            scheduled_case["routing"],
            scheduled_case["topk"],
            scheduled_case["experts"],
            eplb["num_physical_experts"],
            scheduled_case["ep"],
            profile["seed"],
            profile["eplb_reference_tokens_per_rank"],
        )
        if eplb["mapping_hash"] != eplb_contract.mapping_hash(frozen_plan):
            raise ContractError("raw EPLB mapping differs from the frozen canonical plan")

    checksums_by_member: dict[str, dict[str, str]] = {}
    for position, row in enumerate(rows):
        member, checksums, trace_hash, _, _ = _expected_canonical_trace(
            scheduled_case["routing"],
            scheduled_case["hidden"],
            scheduled_case["topk"],
            scheduled_case["experts"],
            eplb["num_physical_experts"],
            scheduled_case["ep"],
            row["tokens_per_rank"],
            profile["seed"],
            eplb["enabled"],
            profile["eplb_reference_tokens_per_rank"],
        )
        if row["routing"]["hash"] != trace_hash:
            raise ContractError(
                f"raw.measurement.rows[{position}].routing.hash differs from its canonical member"
            )
        checksums_by_member[member] = checksums

    # A shorter map than the row list means two rows produced the same member.
    if len(checksums_by_member) != len(rows):
        raise ContractError("raw canonical member set/checksums differ from scheduled rows")
    ordered_members = sorted(checksums_by_member)
    if workload["members"] != ordered_members:
        raise ContractError("raw canonical member set/checksums differ from scheduled rows")
    if workload["manifest_checksums"] != checksums_by_member:
        raise ContractError("raw canonical member set/checksums differ from scheduled rows")

    expected_workload_id = identity.workload_id({
        "members": [
            {"checksums": checksums_by_member[member], "workload_id": member}
            for member in ordered_members
        ]
    })
    if workload["workload_id"] != expected_workload_id:
        raise ContractError("raw composite workload identity differs from scheduled rows")
+
+
def _nearest_rank(samples: list[float], q: int) -> float:
    """Return the ``q``-th percentile of ``samples`` by the nearest-rank rule.

    The rank is ``ceil(q / 100 * n)`` (1-based), clamped to the valid index
    range of the sorted samples. An empty list raises ``IndexError``.
    """
    ordered = sorted(samples)
    last = len(ordered) - 1
    position = math.ceil(q / 100 * len(ordered)) - 1
    if position > last:
        position = last
    if position < 0:
        position = 0
    return ordered[position]
+
+
def _close(observed: Any, expected: float, path: str, tolerance: float = 1e-6) -> None:
    """Require ``observed`` to be a finite number within ``tolerance`` of ``expected``.

    ``tolerance`` is applied as both the relative and the absolute tolerance
    of ``math.isclose``. A mismatch raises ``ContractError`` naming ``path``.
    """
    value = _number(observed, path)
    if math.isclose(value, expected, rel_tol=tolerance, abs_tol=tolerance):
        return
    raise ContractError(f"{path}={value} differs from recomputed {expected}")
+
+
def _equivalent(
    observed: Any, expected: Any, path: str, *, tolerance: float = 1e-6
) -> None:
    """Require ``observed`` to match the recomputed ``expected`` subtree.

    Dicts must have identical key sets and lists identical lengths; both are
    compared element by element. Expected floats are compared with
    ``_close``; every other leaf must have the same exact type and value.
    """
    if isinstance(expected, dict):
        mapping = _keys(observed, set(expected), path)
        for key, child in expected.items():
            _equivalent(mapping[key], child, f"{path}.{key}", tolerance=tolerance)
    elif isinstance(expected, list):
        if not isinstance(observed, list) or len(observed) != len(expected):
            raise ContractError(f"{path} differs from recomputed evidence")
        for index, (seen, child) in enumerate(zip(observed, expected)):
            _equivalent(seen, child, f"{path}[{index}]", tolerance=tolerance)
    elif isinstance(expected, float):
        _close(observed, expected, path, tolerance)
    elif type(observed) is not type(expected) or observed != expected:
        raise ContractError(f"{path} differs from recomputed evidence")
+
+
def _schema_equal(left: Any, right: Any) -> bool:
    """Compare two JSON values, treating booleans as distinct from numbers.

    Python considers ``True == 1``; here a boolean only equals another
    boolean. Dicts and lists are compared recursively; any other pair falls
    back to ``==``.
    """
    if isinstance(left, bool) or isinstance(right, bool):
        return type(left) is type(right) and left == right
    if isinstance(left, dict) and isinstance(right, dict):
        if set(left) != set(right):
            return False
        return all(_schema_equal(item, right[key]) for key, item in left.items())
    if isinstance(left, list) and isinstance(right, list):
        if len(left) != len(right):
            return False
        return all(_schema_equal(a, b) for a, b in zip(left, right))
    return left == right
+
+
def _schema_ref(root: dict[str, Any], reference: str) -> dict[str, Any]:
    """Resolve a local ``#/...`` reference against ``root``.

    Each ``/``-separated token is unescaped (``~1`` to ``/``, then ``~0`` to
    ``~``) and looked up as a dict key; only dicts are traversed.

    Raises:
        ContractError: If the reference does not start with ``#/``, a token
            cannot be resolved, or the target is not an object.
    """
    if not reference.startswith("#/"):
        raise ContractError("native artifact schema contains a non-local reference")
    node: Any = root
    for token in reference[2:].split("/"):
        key = token.replace("~1", "/").replace("~0", "~")
        if isinstance(node, dict) and key in node:
            node = node[key]
            continue
        raise ContractError("native artifact schema contains a broken reference")
    if isinstance(node, dict):
        return node
    raise ContractError("native artifact schema reference is not an object")
+
+
def _schema_type_matches(value: Any, expected: str) -> bool:
    """Report whether ``value`` has the JSON Schema primitive type ``expected``.

    Supported type names are ``null``, ``boolean``, ``object``, ``array``,
    ``string``, ``number`` and ``integer``. Booleans never count as numbers.
    Any int is both a ``number`` and an ``integer``, whatever its magnitude.
    A float must be finite, and additionally integral to be an ``integer``.

    Raises:
        ContractError: If ``expected`` is not one of the supported names.
    """
    if expected == "null":
        return value is None
    if expected == "boolean":
        return type(value) is bool
    if expected == "object":
        return isinstance(value, dict)
    if expected == "array":
        return isinstance(value, list)
    if expected == "string":
        return isinstance(value, str)
    if expected in ("number", "integer"):
        if isinstance(value, bool):
            return False
        if isinstance(value, int):
            # Ints are always finite and integral. Deciding this without a
            # float conversion avoids OverflowError on very large values.
            return True
        if not isinstance(value, float) or not math.isfinite(value):
            return False
        return expected == "number" or value.is_integer()
    raise ContractError(f"native artifact schema uses unsupported type {expected!r}")
+
+
def _validate_schema_value(
    value: Any, schema: dict[str, Any], root: dict[str, Any], path: str
) -> None:
    """Validate ``value`` against the bounded JSON Schema subset used here.

    Supported keywords: ``$ref`` (local only), ``oneOf``, ``type``,
    ``const``, ``enum``, ``required``, ``properties``,
    ``additionalProperties``, ``propertyNames``, ``minItems``, ``maxItems``,
    ``uniqueItems``, ``items``, ``minLength``, ``maxLength``, ``pattern``,
    ``format: date-time``, ``minimum`` and ``maximum``. A schema containing
    ``$ref`` or ``oneOf`` is resolved through that keyword alone; its sibling
    keywords are not evaluated.

    Args:
        value: The JSON value under test.
        schema: The schema node that applies to ``value``.
        root: The whole schema document, used to resolve ``$ref``.
        path: Location of ``value``, used in error messages.

    Raises:
        ContractError: On the first violated constraint.
    """
    if "$ref" in schema:
        _validate_schema_value(value, _schema_ref(root, schema["$ref"]), root, path)
        return
    if "oneOf" in schema:
        matches = 0
        for candidate in schema["oneOf"]:
            try:
                _validate_schema_value(value, candidate, root, path)
            except ContractError:
                continue
            matches += 1
        if matches != 1:
            raise ContractError(f"{path} must match exactly one native schema alternative")
        return
    expected_type = schema.get("type")
    if expected_type is not None and not _schema_type_matches(value, expected_type):
        raise ContractError(f"{path} is not a schema {expected_type}")
    if "const" in schema and not _schema_equal(value, schema["const"]):
        raise ContractError(f"{path} differs from its schema constant")
    if "enum" in schema and not any(_schema_equal(value, item) for item in schema["enum"]):
        raise ContractError(f"{path} is outside its schema enum")

    if isinstance(value, dict):
        required = set(schema.get("required", ()))
        properties = schema.get("properties", {})
        missing = required - set(value)
        if missing:
            raise ContractError(f"{path} lacks schema fields {sorted(missing)}")
        additional = schema.get("additionalProperties", True)
        extra = set(value) - set(properties)
        if additional is False and extra:
            raise ContractError(f"{path} has extra schema fields {sorted(extra)}")
        for key, item in value.items():
            if key in properties:
                _validate_schema_value(item, properties[key], root, f"{path}.{key}")
            elif isinstance(additional, dict):
                # additionalProperties given as a schema applies to every
                # key that has no explicit entry in properties.
                _validate_schema_value(item, additional, root, f"{path}.{key}")
        property_names = schema.get("propertyNames")
        if property_names is not None:
            for key in value:
                # NOTE(review): the reported path ends in "." without the
                # key; confirm whether omitting the key is intentional.
                _validate_schema_value(key, property_names, root, f"{path}.")

    if isinstance(value, list):
        if len(value) < schema.get("minItems", 0):
            raise ContractError(f"{path} has too few schema items")
        maximum = schema.get("maxItems")
        if maximum is not None and len(value) > maximum:
            raise ContractError(f"{path} has too many schema items")
        if schema.get("uniqueItems") and any(
            _schema_equal(item, prior)
            for index, item in enumerate(value)
            for prior in value[:index]
        ):
            raise ContractError(f"{path} schema items are not unique")
        if "items" in schema:
            for index, item in enumerate(value):
                _validate_schema_value(item, schema["items"], root, f"{path}[{index}]")

    if isinstance(value, str):
        if len(value) < schema.get("minLength", 0):
            raise ContractError(f"{path} is shorter than its schema minimum")
        maximum = schema.get("maxLength")
        if maximum is not None and len(value) > maximum:
            raise ContractError(f"{path} is longer than its schema maximum")
        if "pattern" in schema and re.search(schema["pattern"], value) is None:
            raise ContractError(f"{path} does not match its schema pattern")
        if schema.get("format") == "date-time":
            try:
                # A "Z" suffix is rewritten to an explicit UTC offset before
                # the value is handed to fromisoformat.
                parsed = dt.datetime.fromisoformat(value.replace("Z", "+00:00"))
            except ValueError as exc:
                raise ContractError(f"{path} is not a schema date-time") from exc
            if parsed.tzinfo is None:
                raise ContractError(f"{path} schema date-time lacks a timezone")

    # Ints are always finite; only floats need the isfinite test. Calling
    # math.isfinite on a very large int would raise OverflowError, whereas
    # Python compares ints with the bounds exactly.
    is_number = not isinstance(value, bool) and (
        isinstance(value, int)
        or (isinstance(value, float) and math.isfinite(value))
    )
    if is_number:
        if "minimum" in schema and value < schema["minimum"]:
            raise ContractError(f"{path} is below its schema minimum")
        if "maximum" in schema and value > schema["maximum"]:
            raise ContractError(f"{path} is above its schema maximum")
+
+
def _validate_native_schema(name: str, value: Any) -> None:
    """Validate ``value`` against the schema file ``name`` under ``SCHEMA_DIR``.

    The schema is loaded with ``strict_load`` on first use and kept in
    ``_SCHEMA_CACHE``; a schema file whose top level is not an object is
    rejected and is not cached.
    """
    cached = _SCHEMA_CACHE.get(name)
    if cached is None:
        cached = strict_load(SCHEMA_DIR / name)
        if not isinstance(cached, dict):
            raise ContractError(f"native artifact schema {name} is not an object")
        _SCHEMA_CACHE[name] = cached
    _validate_schema_value(value, cached, cached, "$")
+
+
def validate_samples_document(document: Any) -> dict[str, Any]:
    """Validate a v1 samples document and return it.

    The document is checked against ``samples-v1.schema.json`` first, then
    against rules applied here: an exact top-level key set, the v1
    format/schema version, typed IDs, the fixed sampling contract
    (8 iterations per trial, 64 trials, the v1 rank reduction), and per-point
    component evidence including each point's ``sample_sha256``.

    Raises:
        ContractError: On the first rule that fails.
    """
    _validate_native_schema("samples-v1.schema.json", document)
    doc = _keys(
        document,
        {"allocation_id", "attempt_id", "case_id", "format", "points", "sampling",
         "schema_version", "series_id"},
        "samples",
    )
    if doc["format"] != SAMPLES_FORMAT or doc["schema_version"] != 1:
        raise ContractError("samples format/schema differs from v1")
    # Each identifier must be a typed ID of the matching kind.
    for field, kind in (
        ("allocation_id", "allocation"), ("attempt_id", "attempt"),
        ("case_id", "case"), ("series_id", "series"),
    ):
        _typed(doc[field], kind, f"samples.{field}")
    sampling = _keys(
        doc["sampling"], {"iterations_per_trial", "reduction", "trials"}, "samples.sampling"
    )
    if (
        _integer(sampling["iterations_per_trial"], "samples.sampling.iterations_per_trial", minimum=1) != 8
        or _integer(sampling["trials"], "samples.sampling.trials", minimum=1) != 64
        or sampling["reduction"] != identity.V1_CASE_PROFILE["rank_reduction"]
    ):
        raise ContractError("samples must use the fixed 8x64 cross-rank-max contract")
    points = doc["points"]
    if not isinstance(points, list) or not points:
        raise ContractError("samples.points must be non-empty")
    # tokens_per_rank values already seen; each point must use a distinct one.
    seen = set()
    for index, point_value in enumerate(points):
        path = f"samples.points[{index}]"
        point = _keys(
            point_value,
            {"components", "evidence_id", "point_id", "sample_sha256", "tokens_per_rank"},
            path,
        )
        tokens = _integer(point["tokens_per_rank"], f"{path}.tokens_per_rank", minimum=1)
        if tokens in seen:
            raise ContractError(f"duplicate sample token point {tokens}")
        seen.add(tokens)
        _typed(point["point_id"], "point", f"{path}.point_id")
        _typed(point["evidence_id"], "evidence", f"{path}.evidence_id")
        components = _keys(point["components"], {"combine", "dispatch", "roundtrip"}, f"{path}.components")
        for name, component_value in components.items():
            component = _keys(
                component_value, {"availability", "sample_count", "trials"},
                f"{path}.components.{name}",
            )
            availability = component["availability"]
            count = _integer(component["sample_count"], f"{path}.components.{name}.sample_count")
            trials = component["trials"]
            if availability == "unavailable":
                # An unavailable component carries no samples; roundtrip may
                # never be unavailable.
                if count != 0 or trials is not None or name == "roundtrip":
                    raise ContractError(f"{path}.components.{name} has invalid unavailability")
                continue
            if availability != "measured" or not isinstance(trials, list) or len(trials) != 64:
                raise ContractError(f"{path}.components.{name} must contain 64 measured trials")
            if any(not isinstance(trial, list) or len(trial) != 8 for trial in trials):
                raise ContractError(f"{path}.components.{name} trials must each contain 8 samples")
            # Building the flat list validates every sample as a finite,
            # non-negative number; _number raises on the first bad value.
            flattened = [
                _number(sample, f"{path}.components.{name}.trials", minimum=0.0)
                for trial in trials for sample in trial
            ]
            if count != 512 or len(flattened) != 512:
                raise ContractError(f"{path}.components.{name} must contain 512 samples")
        # The point's digest covers exactly its components and token count.
        sample_base = {"components": components, "tokens_per_rank": tokens}
        if point["sample_sha256"] != _sha256_json(sample_base):
            raise ContractError(f"{path}.sample_sha256 differs")
    return doc
+
+
def _validate_component(
    component_value: Any,
    sample_component: dict[str, Any] | None,
    path: str,
    *,
    derived: bool = False,
) -> None:
    """Validate one published timing component against its sample evidence.

    Three representations are accepted:

    * unavailable: the exact all-empty placeholder, which any supplied
      sample component must agree with;
    * derived (``derived=True``): availability ``derived``, origin
      ``derived-percentile-sum``, percentile keys present and a zero sample
      count;
    * measured: availability and origin ``measured``, a sample count equal
      to the number of samples, and p50/p90/p95/p99 equal (within ``_close``
      tolerance) to nearest-rank percentiles recomputed from the samples.

    Raises:
        ContractError: On the first check that fails.
    """
    component = _keys(
        component_value, {"availability", "origin", "percentiles_us", "sample_count"}, path
    )
    availability = component["availability"]
    if availability == "unavailable":
        placeholder = {
            "availability": "unavailable", "origin": None,
            "percentiles_us": None, "sample_count": 0,
        }
        if component != placeholder:
            raise ContractError(f"{path} has invalid unavailable representation")
        if sample_component and sample_component["availability"] != "unavailable":
            raise ContractError(f"{path} disagrees with samples")
        return

    if derived:
        wanted_availability, wanted_origin = "derived", "derived-percentile-sum"
    else:
        wanted_availability, wanted_origin = "measured", "measured"
    if availability != wanted_availability or component["origin"] != wanted_origin:
        raise ContractError(f"{path} has invalid availability/origin")
    percentiles = _keys(component["percentiles_us"], set(PERCENTILES), f"{path}.percentiles_us")

    if derived:
        # Derived values have no samples of their own to recompute from.
        if component["sample_count"] != 0:
            raise ContractError(f"{path}.sample_count must be zero for a derived value")
        return

    if sample_component is None or sample_component["availability"] != "measured":
        raise ContractError(f"{path} lacks measured sample evidence")
    flattened = [sample for trial in sample_component["trials"] for sample in trial]
    if component["sample_count"] != len(flattened):
        raise ContractError(f"{path}.sample_count differs from exact samples")
    for name, percentile in zip(PERCENTILES, (50, 90, 95, 99), strict=True):
        recomputed = _nearest_rank(flattened, percentile)
        _close(percentiles[name], recomputed, f"{path}.{name}")
+
+
def _validate_oracle(
    value: Any, path: str, profile: dict[str, Any] | None = None
) -> dict[str, Any]:
    """Validate a correctness-oracle record and return it.

    Args:
        value: The oracle object to validate.
        path: Location used in error messages.
        profile: Case profile to validate against; a falsy value selects
            ``identity.V1_NORMAL_CASE_PROFILE``.

    Returns:
        The validated oracle object.

    Raises:
        ContractError: If the key set, contract, check flags, tolerances,
            digests, error magnitudes, or the ``passed`` verdict are invalid
            or inconsistent with one another.
    """
    profile = profile or identity.V1_NORMAL_CASE_PROFILE
    oracle = _keys(
        value,
        {"atol", "checks", "combine_weight_semantics", "contract", "dispatch_sha256",
         "max_absolute_error", "max_elementwise_relative_error", "max_relative_error",
         "max_weight_error", "order_sha256", "ordering_contract", "passed", "receive_count",
         "rtol"},
        path,
    )
    if oracle["contract"] != profile["oracle_contract"]:
        raise ContractError(f"{path}.contract differs")
    checks = _keys(
        oracle["checks"],
        {"combine_values", "counts", "metadata", "multiplicity", "payload", "source_set",
         "weights"},
        f"{path}.checks",
    )
    # Exact bool type only: 0/1 or other truthy values are not accepted.
    if any(type(value) is not bool for value in checks.values()):
        raise ContractError(f"{path}.checks must be boolean")
    if type(oracle["passed"]) is not bool:
        raise ContractError(f"{path}.passed must be boolean")
    _integer(oracle["receive_count"], f"{path}.receive_count")
    _text(oracle["ordering_contract"], f"{path}.ordering_contract")
    # The expected weight semantics follow from the profile's combine semantics.
    expected_weight_semantics = (
        "gate-weighted-sum"
        if profile["combine_semantics"] == "gate-weighted"
        else "unweighted-rank-sum"
    )
    if oracle["combine_weight_semantics"] != expected_weight_semantics:
        raise ContractError(f"{path}.combine_weight_semantics differs from v1")
    # Tolerances are fixed: rtol must be 5e-2 and atol 2e-2.
    _close(oracle["rtol"], 5e-2, f"{path}.rtol")
    _close(oracle["atol"], 2e-2, f"{path}.atol")
    # Digests may be null; otherwise they must be 64 lowercase hex characters.
    for field in ("dispatch_sha256", "order_sha256"):
        digest = oracle[field]
        if digest is not None and (
            not isinstance(digest, str) or len(digest) != 64
            or any(character not in "0123456789abcdef" for character in digest)
        ):
            raise ContractError(f"{path}.{field} is not a SHA-256 digest")
    # Error magnitudes may be null; otherwise finite and non-negative.
    for field in (
        "max_absolute_error", "max_elementwise_relative_error", "max_relative_error",
        "max_weight_error",
    ):
        if oracle[field] is not None:
            _number(oracle[field], f"{path}.{field}", minimum=0.0)
    # The verdict is recomputed: every check must pass and the maximum
    # relative error must be present and strictly below 5e-2.
    expected_pass = (
        all(checks.values())
        and oracle["max_relative_error"] is not None
        and oracle["max_relative_error"] < 5e-2
    )
    if oracle["passed"] != expected_pass:
        raise ContractError(f"{path}.passed differs from its evidence")
    return oracle
+
+
+def validate_raw_document(document: Any, samples_document: Any) -> dict[str, Any]:
+ """Validate identities, exact samples, formulas, privacy, and the native raw shape."""
+ _validate_native_schema("raw-case-v1.schema.json", document)
+ doc = _keys(
+ document,
+ {"case", "format", "generated_at", "identity", "implementation", "measurement",
+ "outcome", "provenance", "record_type", "runtime_fingerprint", "sample_artifact",
+ "schema_version", "topology", "workload"},
+ "raw",
+ )
+ _finite_tree(doc)
+ if doc["format"] != RAW_FORMAT or doc["schema_version"] != 1 or doc["record_type"] != "case-attempt":
+ raise ContractError("raw format/schema/record type differs from v1")
+ _text(doc["generated_at"], "raw.generated_at")
+ identifiers = _keys(
+ doc["identity"],
+ {"allocation_factors", "allocation_id", "attempt_id", "attempt_ordinal", "case_factors",
+ "case_id", "series_factors", "series_id"},
+ "raw.identity",
+ )
+ for field, kind in (
+ ("allocation_id", "allocation"), ("attempt_id", "attempt"),
+ ("case_id", "case"), ("series_id", "series"),
+ ):
+ _typed(identifiers[field], kind, f"raw.identity.{field}")
+ ordinal = _integer(identifiers["attempt_ordinal"], "raw.identity.attempt_ordinal", minimum=1)
+ allocation_factors = _keys(
+ identifiers["allocation_factors"], ALLOCATION_FACTOR_FIELDS,
+ "raw.identity.allocation_factors",
+ )
+ case_factors = _keys(
+ identifiers["case_factors"], {"case", "profile", "sku"},
+ "raw.identity.case_factors",
+ )
+ scheduled_case = _keys(
+ case_factors["case"], TERMINAL_CASE_FIELDS, "raw.identity.case_factors.case"
+ )
+ profile = scheduled_case_profile(scheduled_case, "raw.identity.case_factors.case")
+ if case_factors["profile"] != profile:
+ raise ContractError("raw case profile differs from CollectiveX v1")
+ _text(case_factors["sku"], "raw.identity.case_factors.sku")
+ series_factors = _keys(
+ identifiers["series_factors"],
+ {"backend", "case_id", "image_digest", "implementation_contract_sha256",
+ "public_config_sha256", "routing_control_sha256",
+ "runtime_fingerprint_sha256", "source_sha", "squash_sha256", "workload_id"},
+ "raw.identity.series_factors",
+ )
+ if identity.allocation_id(identifiers["allocation_factors"]) != identifiers["allocation_id"]:
+ raise ContractError("allocation identity differs")
+ if identity.digest("case", identifiers["case_factors"]) != identifiers["case_id"]:
+ raise ContractError("case identity differs")
+ if identity.series_id(identifiers["series_factors"]) != identifiers["series_id"]:
+ raise ContractError("series identity differs")
+ if identity.attempt_id(
+ allocation=identifiers["allocation_id"], case=identifiers["case_id"], ordinal=ordinal
+ ) != identifiers["attempt_id"]:
+ raise ContractError("attempt identity differs")
+
+ samples = validate_samples_document(samples_document)
+ for field in ("allocation_id", "attempt_id", "case_id", "series_id"):
+ if samples[field] != identifiers[field]:
+ raise ContractError(f"samples.{field} differs from raw identity")
+ sample_by_token = {point["tokens_per_rank"]: point for point in samples["points"]}
+
+ case = _keys(
+ doc["case"],
+ {"attempt_ordinal", "backend", "eplb", "ep_size", "mode", "phase",
+ "required_publication", "resource_mode", "runner", "shape", "suite", "workload_name"},
+ "raw.case",
+ )
+ ep_size = _integer(case["ep_size"], "raw.case.ep_size", minimum=1)
+ if case["attempt_ordinal"] != ordinal:
+ raise ContractError("case attempt ordinal differs")
+ for field in ("backend", "mode", "phase", "required_publication", "resource_mode", "runner",
+ "suite", "workload_name"):
+ _text(case[field], f"raw.case.{field}")
+ shape = _keys(
+ case["shape"],
+ {"activation_profile", "dispatch_dtype", "eplb", "experts", "experts_per_rank",
+ "hidden", "kernel_gen", "num_logical_experts", "quant", "routing", "topk"},
+ "raw.case.shape",
+ )
+ hidden = _integer(shape["hidden"], "raw.case.shape.hidden", minimum=1)
+ topk = _integer(shape["topk"], "raw.case.shape.topk", minimum=1)
+ physical_experts = _integer(
+ shape["experts"], "raw.case.shape.experts", minimum=1
+ )
+ logical_experts = _integer(
+ shape["num_logical_experts"],
+ "raw.case.shape.num_logical_experts",
+ minimum=1,
+ )
+ experts_per_rank = _integer(
+ shape["experts_per_rank"], "raw.case.shape.experts_per_rank", minimum=1
+ )
+ quant = _keys(
+ shape["quant"],
+ {"combine_accum_dtype", "combine_input_dtype", "combine_output_dtype",
+ "combine_quant_mode", "scale_layout"},
+ "raw.case.shape.quant",
+ )
+ eplb = _keys(
+ case["eplb"],
+ {"enabled", "imbalance_after", "imbalance_before", "mapping_hash", "max_replicas",
+ "num_logical_experts", "num_physical_experts", "num_redundant", "planner",
+ "reference_tokens_per_rank", "replicated_experts"},
+ "raw.case.eplb",
+ )
+ if not isinstance(eplb["enabled"], bool):
+ raise ContractError("raw.case.eplb.enabled must be boolean")
+ expected_redundant = (
+ profile["eplb_redundant_experts"] if eplb["enabled"] else 0
+ )
+ expected_physical = eplb_contract.physical_count(
+ scheduled_case["experts"], expected_redundant, ep_size
+ )
+ if (
+ shape["eplb"] != eplb["enabled"]
+ or logical_experts != scheduled_case["experts"]
+ or physical_experts != expected_physical
+ or experts_per_rank * ep_size != physical_experts
+ or eplb["num_logical_experts"] != logical_experts
+ or eplb["num_physical_experts"] != physical_experts
+ or eplb["num_redundant"] != expected_redundant
+ ):
+ raise ContractError("raw EPLB/shape dimensions differ from the frozen profile")
+ if eplb["enabled"]:
+ expected_plan = _expected_eplb_plan(
+ scheduled_case["routing"],
+ topk,
+ logical_experts,
+ physical_experts,
+ ep_size,
+ profile["seed"],
+ profile["eplb_reference_tokens_per_rank"],
+ )
+ expected_eplb = {
+ "enabled": True,
+ "imbalance_after": expected_plan["imbalance_after"],
+ "imbalance_before": expected_plan["imbalance_before"],
+ "mapping_hash": eplb_contract.mapping_hash(expected_plan),
+ "max_replicas": expected_plan["max_replicas"],
+ "num_logical_experts": logical_experts,
+ "num_physical_experts": physical_experts,
+ "num_redundant": expected_redundant,
+ "planner": profile["eplb_planner"],
+ "reference_tokens_per_rank": profile[
+ "eplb_reference_tokens_per_rank"
+ ],
+ "replicated_experts": expected_plan["replicated_experts"],
+ }
+ else:
+ expected_eplb = {
+ "enabled": False,
+ "imbalance_after": None,
+ "imbalance_before": None,
+ "mapping_hash": None,
+ "max_replicas": None,
+ "num_logical_experts": logical_experts,
+ "num_physical_experts": physical_experts,
+ "num_redundant": 0,
+ "planner": None,
+ "reference_tokens_per_rank": None,
+ "replicated_experts": 0,
+ }
+ _equivalent(eplb, expected_eplb, "raw.case.eplb", tolerance=1e-9)
+ if case_factors["sku"] != case["runner"]:
+ raise ContractError("raw case runner differs from case identity")
+
+ workload = _keys(
+ doc["workload"],
+ {"activation_generator", "activation_identity", "activation_profile",
+ "cross_rank_consistent", "manifest_checksums", "members", "routing_generator", "source",
+ "trace_hashes", "trace_signature", "workload_id"},
+ "raw.workload",
+ )
+ if workload["source"] not in {"canonical-serialized", "seeded-runtime"}:
+ raise ContractError("raw workload source is invalid")
+ if workload["source"] == "canonical-serialized":
+ _typed(workload["workload_id"], "workload", "raw.workload.workload_id")
+ members = workload["members"]
+ checksums = workload["manifest_checksums"]
+ if (
+ not isinstance(members, list)
+ or not members
+ or members != sorted(set(members))
+ or not all(identity.is_typed_id(member, "workload") for member in members)
+ or not isinstance(checksums, dict)
+ or set(checksums) != set(members)
+ ):
+ raise ContractError("raw canonical workload members/checksums are invalid")
+ for member, values in checksums.items():
+ if (
+ not isinstance(values, dict)
+ or set(values) != {"topk_idx", "topk_weights", "trace"}
+ or any(not re.fullmatch(r"[0-9a-f]{64}", str(value)) for value in values.values())
+ ):
+ raise ContractError(f"raw canonical workload checksums differ for {member}")
+ expected_workload_id = identity.workload_id({
+ "members": [
+ {"checksums": checksums[member], "workload_id": member}
+ for member in members
+ ]
+ })
+ if workload["workload_id"] != expected_workload_id:
+ raise ContractError("raw composite workload identity differs from its members")
+ elif any(workload[field] is not None for field in ("members", "manifest_checksums", "workload_id")):
+ raise ContractError("raw seeded workload cannot claim serialized members")
+ if workload["cross_rank_consistent"] is not True:
+ raise ContractError("raw workload is not consistent across ranks")
+
+ measurement = _keys(
+ doc["measurement"],
+ {"component_order_contract", "conditioning", "contract", "rows",
+ "sampling", "source_allocation"},
+ "raw.measurement",
+ )
+ validate_conditioning_contract(measurement["conditioning"], case["phase"])
+ sampling = _keys(
+ measurement["sampling"],
+ {"contract", "iterations_per_trial", "percentile_method", "reduction",
+ "samples_per_component", "trials", "warmup_iterations", "warmup_semantics"},
+ "raw.measurement.sampling",
+ )
+ expected_sampling = {
+ "contract": profile["sampling_contract"], "iterations_per_trial": 8,
+ "percentile_method": profile["percentile_method"],
+ "reduction": profile["rank_reduction"],
+ "samples_per_component": 512, "trials": 64, "warmup_iterations": 32,
+ "warmup_semantics": "full-roundtrip-before-each-component-trial-point-v1",
+ }
+ if sampling != expected_sampling:
+ raise ContractError("raw sampling contract differs from fixed-512-v1")
+ if (
+ case["mode"] != profile["mode"]
+ or case["resource_mode"] != profile["resource_mode"]
+ or measurement["contract"] != profile["contract"]
+ or measurement["component_order_contract"] != profile["component_order_contract"]
+ or measurement["source_allocation"] != "even"
+ or shape["activation_profile"] != profile["activation_profile"]
+ or shape["dispatch_dtype"] != profile["dtype"]
+ or quant["combine_input_dtype"] != profile["combine_dtype"]
+ or quant["combine_output_dtype"] != profile["combine_dtype"]
+ or quant["combine_quant_mode"] != profile["combine_quant_mode"]
+ or quant["scale_layout"] is not None
+ or workload["activation_generator"] != profile["activation_generator"]
+ or workload["activation_profile"] != profile["activation_profile"]
+ or workload["routing_generator"] != profile["routing_generator"]
+ ):
+ raise ContractError("raw case differs from the frozen v1 profile")
+ expected_activation = hashlib.sha256(
+ (
+ f"counter|seed={profile['seed']}|hidden={hidden}|"
+ f"gen={profile['activation_generator']}"
+ ).encode()
+ ).hexdigest()
+ if workload["activation_identity"] != expected_activation:
+ raise ContractError("raw activation identity differs from the frozen seed/profile")
+ rows = measurement["rows"]
+ if not isinstance(rows, list) or not rows:
+ raise ContractError("raw.measurement.rows must be non-empty")
+ seen_points = set()
+ row_tokens = []
+ recomputed_anomalies = 0
+ for index, row_value in enumerate(rows):
+ path = f"raw.measurement.rows[{index}]"
+ row = _keys(
+ row_value,
+ {"anomalies", "components", "correctness", "evidence_id", "global_tokens",
+ "logical_bytes", "point_id", "receive", "routing",
+ "sample_histograms", "sample_sha256", "token_rate_at_latency_percentile",
+ "tokens_per_rank"},
+ path,
+ )
+ tokens = _integer(row["tokens_per_rank"], f"{path}.tokens_per_rank", minimum=1)
+ row_tokens.append(tokens)
+ if tokens in seen_points or tokens not in sample_by_token:
+ raise ContractError(f"{path} token point is duplicate or missing samples")
+ seen_points.add(tokens)
+ if row["global_tokens"] != tokens * ep_size:
+ raise ContractError(f"{path}.global_tokens formula differs")
+ sample_point = sample_by_token[tokens]
+ expected_point = identity.point_id(series=identifiers["series_id"], tokens_per_rank=tokens)
+ if row["point_id"] != expected_point or sample_point["point_id"] != expected_point:
+ raise ContractError(f"{path}.point_id differs")
+ expected_evidence = identity.evidence_id(
+ point=expected_point, allocation=identifiers["allocation_id"],
+ attempt=identifiers["attempt_id"], sample_sha256=sample_point["sample_sha256"],
+ )
+ if row["evidence_id"] != expected_evidence or sample_point["evidence_id"] != expected_evidence:
+ raise ContractError(f"{path}.evidence_id differs")
+ if row["sample_sha256"] != sample_point["sample_sha256"]:
+ raise ContractError(f"{path}.sample_sha256 differs")
+ components = _keys(
+ row["components"], {"combine", "dispatch", "isolated_sum", "roundtrip"},
+ f"{path}.components",
+ )
+ for name in ("combine", "dispatch", "roundtrip"):
+ _validate_component(
+ components[name], sample_point["components"][name], f"{path}.components.{name}"
+ )
+ _validate_component(
+ components["isolated_sum"], None, f"{path}.components.isolated_sum", derived=True
+ )
+ _, _, _, expected_indices, expected_weights = _expected_canonical_trace(
+ scheduled_case["routing"],
+ hidden,
+ topk,
+ logical_experts,
+ physical_experts,
+ ep_size,
+ tokens,
+ profile["seed"],
+ eplb["enabled"],
+ profile["eplb_reference_tokens_per_rank"],
+ )
+ expected_routing = _expected_routing_summary(
+ expected_indices,
+ expected_weights,
+ physical_experts=physical_experts,
+ ep_size=ep_size,
+ tokens_per_rank=tokens,
+ gpus_per_node=scheduled_case["gpus_per_node"],
+ scale_up_domain=scheduled_case["scale_up_domain"],
+ )
+ _equivalent(
+ row["routing"], expected_routing, f"{path}.routing", tolerance=1e-5
+ )
+ expected_payload_counts = (
+ expected_routing["expert_assignments_per_rank"]
+ if profile["payload_unit"] == "token-expert"
+ else expected_routing["payload_copies_per_rank"]
+ )
+ throughput = _keys(
+ row["token_rate_at_latency_percentile"], set(PERCENTILES),
+ f"{path}.token_rate_at_latency_percentile",
+ )
+ for percentile in PERCENTILES:
+ latency = components["roundtrip"]["percentiles_us"][percentile]
+ if latency <= 0:
+ raise ContractError(f"{path} roundtrip latency must be positive")
+ _close(
+ throughput[percentile], row["global_tokens"] / (latency * 1e-6),
+ f"{path}.token_rate_at_latency_percentile.{percentile}", 1e-9,
+ )
+ correctness = _keys(
+ row["correctness"],
+ {"contract", "max_relative_error", "passed", "rank_evidence", "scope"},
+ f"{path}.correctness",
+ )
+ if (
+ correctness["contract"] != profile["oracle_contract"]
+ or correctness["scope"] != profile["correctness_scope"]
+ or type(correctness["passed"]) is not bool
+ ):
+ raise ContractError(f"{path}.correctness contract differs")
+ _number(
+ correctness["max_relative_error"],
+ f"{path}.correctness.max_relative_error",
+ minimum=0.0,
+ )
+ rank_evidence = correctness["rank_evidence"]
+ if not isinstance(rank_evidence, list) or len(rank_evidence) != ep_size:
+ raise ContractError(f"{path}.correctness.rank_evidence must cover every rank")
+ ranks = set()
+ observed_max_error = 0.0
+ evidence_passed = True
+ for evidence_index, evidence_value in enumerate(rank_evidence):
+ evidence_path = f"{path}.correctness.rank_evidence[{evidence_index}]"
+ evidence = _keys(
+ evidence_value,
+ {"input_unchanged", "order_stable", "post_timing", "pre_timing", "rank"},
+ evidence_path,
+ )
+ evidence_rank = _integer(evidence["rank"], f"{evidence_path}.rank")
+ if evidence_rank >= ep_size:
+ raise ContractError(f"{evidence_path}.rank is outside the EP group")
+ ranks.add(evidence_rank)
+ if type(evidence["input_unchanged"]) is not bool or type(evidence["order_stable"]) is not bool:
+ raise ContractError(f"{evidence_path} stability fields must be boolean")
+ pre = _validate_oracle(
+ evidence["pre_timing"], f"{evidence_path}.pre_timing", profile
+ )
+ post = _validate_oracle(
+ evidence["post_timing"], f"{evidence_path}.post_timing", profile
+ )
+ if (
+ pre["receive_count"] != expected_payload_counts[evidence_rank]
+ or post["receive_count"] != expected_payload_counts[evidence_rank]
+ ):
+ raise ContractError(
+ f"{evidence_path}.receive_count differs from canonical routing"
+ )
+ expected_stability = all(
+ pre[field] == post[field]
+ for field in ("ordering_contract", "order_sha256", "dispatch_sha256")
+ )
+ if evidence["order_stable"] != expected_stability:
+ raise ContractError(f"{evidence_path}.order_stable differs from the evidence")
+ errors = [
+ oracle["max_relative_error"]
+ for oracle in (pre, post)
+ if oracle["max_relative_error"] is not None
+ ]
+ observed_max_error = max([observed_max_error, *errors])
+ evidence_passed = evidence_passed and all(
+ (evidence["input_unchanged"], evidence["order_stable"], pre["passed"], post["passed"])
+ )
+ if ranks != set(range(ep_size)) or correctness["passed"] != evidence_passed:
+ raise ContractError(f"{path}.correctness rank coverage or outcome differs")
+ _close(
+ correctness["max_relative_error"], observed_max_error,
+ f"{path}.correctness.max_relative_error",
+ )
+ if components["dispatch"]["availability"] == "measured":
+ for percentile in PERCENTILES:
+ expected = (
+ components["dispatch"]["percentiles_us"][percentile]
+ + components["combine"]["percentiles_us"][percentile]
+ )
+ _close(
+ components["isolated_sum"]["percentiles_us"][percentile], expected,
+ f"{path}.components.isolated_sum.{percentile}",
+ )
+ logical_copies = (
+ sum(expected_routing["expert_assignments_per_rank"])
+ if profile["payload_unit"] == "token-expert"
+ else expected_routing["routed_copies"]
+ )
+ expected_bytes = logical_copies * hidden * 2
+ expected_logical = {
+ "combine": expected_bytes,
+ "dispatch": expected_bytes,
+ "roundtrip": expected_bytes * 2,
+ }
+ _equivalent(row["logical_bytes"], expected_logical, f"{path}.logical_bytes")
+
+ max_receive = max(expected_payload_counts)
+ expected_receive = {
+ "max": max_receive,
+ "mean": sum(expected_payload_counts) / ep_size,
+ "min": min(expected_payload_counts),
+ "total": sum(expected_payload_counts),
+ }
+ _equivalent(row["receive"], expected_receive, f"{path}.receive")
+ expected_histograms = {
+ name: (
+ _expected_histogram([
+ sample
+ for trial in sample_point["components"][name]["trials"]
+ for sample in trial
+ ])
+ if sample_point["components"][name]["availability"] == "measured"
+ else None
+ )
+ for name in ("dispatch", "combine", "roundtrip")
+ }
+ _equivalent(
+ row["sample_histograms"], expected_histograms, f"{path}.sample_histograms"
+ )
+ expected_anomalies = _expected_anomalies(tokens, components)
+ _equivalent(row["anomalies"], expected_anomalies, f"{path}.anomalies")
+ recomputed_anomalies += len(expected_anomalies)
+ if seen_points != set(sample_by_token):
+ raise ContractError("raw rows and sample points differ")
+ if row_tokens != sorted(row_tokens):
+ raise ContractError("raw rows must follow the scheduled token ladder")
+ expected_trace_hashes = sorted(row["routing"]["hash"] for row in rows)
+ if workload["trace_hashes"] != expected_trace_hashes:
+ raise ContractError("raw workload trace hashes differ from measured rows")
+ expected_trace_signature = hashlib.sha256(
+ "|".join(expected_trace_hashes).encode()
+ ).hexdigest()
+ if workload["trace_signature"] != expected_trace_signature:
+ raise ContractError("raw workload trace signature differs from measured rows")
+
+ implementation = _keys(
+ doc["implementation"], {"kernel_generation", "name", "provenance", "resource_profile"},
+ "raw.implementation",
+ )
+ if (
+ implementation["name"] != case["backend"]
+ or implementation["kernel_generation"] != shape["kernel_gen"]
+ ):
+ raise ContractError("raw implementation identity differs from the case")
+ provenance_fields = _obj(implementation["provenance"], "raw.implementation.provenance")
+ unknown = set(provenance_fields) - PROVENANCE_KEYS
+ if unknown:
+ raise ContractError(f"raw implementation provenance has unknown fields {sorted(unknown)}")
+ if (
+ implementation["name"] == "deepep-v2"
+ and provenance_fields.get("deterministic") is not False
+ ):
+ raise ContractError("DeepEP V2 deterministic mode differs from the v1 kernel contract")
+ if implementation["name"] == "deepep-v2" and (
+ _integer(
+ provenance_fields.get("tuning_num_experts"),
+ "raw.implementation.provenance.tuning_num_experts",
+ minimum=1,
+ ) != logical_experts
+ or _integer(
+ provenance_fields.get("num_experts"),
+ "raw.implementation.provenance.num_experts",
+ minimum=1,
+ ) != physical_experts
+ ):
+ raise ContractError("DeepEP V2 expert-count provenance differs from the case")
+ if implementation["name"] == "deepep-hybrid":
+ realized_config = provenance_fields.get("realized_config")
+ jit_kernel_keys = provenance_fields.get("jit_kernel_keys")
+ jit_shared_objects = provenance_fields.get("jit_shared_objects")
+ domain_ranks, communication_domains = hybrid_communication_domains(
+ ep_size, scheduled_case["scale_up_domain"]
+ )
+ if (
+ not _hybrid_realized_config_is_valid(realized_config)
+ or not _hybrid_jit_evidence_is_valid(jit_shared_objects, jit_kernel_keys)
+ or realized_config["hidden_dim"] != shape["hidden"]
+ or realized_config["num_of_experts_per_rank"] * ep_size != physical_experts
+ or realized_config["num_of_ranks_per_node"] != domain_ranks
+ or realized_config["num_of_nodes"] != communication_domains
+ or realized_config["token_data_type"] != "UINT16"
+ or any(
+ len(artifact["rank_artifacts"]) != ep_size
+ for artifact in jit_shared_objects
+ )
+ ):
+ raise ContractError("DeepEP Hybrid realized config/JIT evidence differs from the case")
+ if implementation["name"] == "nccl-ep" and implementation["kernel_generation"] != (
+ collective_kernel_generation(provenance_fields.get("collective_library"))
+ ):
+ raise ContractError("NCCL/RCCL kernel generation differs from collective lineage")
+ resource_profile = _obj(
+ implementation["resource_profile"], "raw.implementation.resource_profile"
+ )
+ expected_resource_profile = project_resource_profile(provenance_fields)
+ if resource_profile != expected_resource_profile:
+ raise ContractError("raw resource profile differs from implementation provenance")
+ topology = _keys(
+ doc["topology"],
+ {"device_count", "device_product", "gpus_per_node", "nodes", "placement",
+ "realized_placement", "scale_out_transport", "scale_up_domain",
+ "scale_up_transport", "scope", "topology_class", "transport", "world_size"},
+ "raw.topology",
+ )
+ for field in ("device_count", "gpus_per_node", "nodes", "scale_up_domain", "world_size"):
+ _integer(topology[field], f"raw.topology.{field}", minimum=1)
+ for field in ("scale_up_transport", "scope", "topology_class", "transport"):
+ _text(topology[field], f"raw.topology.{field}")
+ if topology["scale_out_transport"] is not None:
+ _text(topology["scale_out_transport"], "raw.topology.scale_out_transport")
+ realized = _keys(
+ topology["realized_placement"],
+ {"gpus_per_node", "nodes", "ranks_per_node", "unique_local_ranks", "valid"},
+ "raw.topology.realized_placement",
+ )
+ if realized != {
+ "gpus_per_node": topology["gpus_per_node"],
+ "nodes": topology["nodes"],
+ "ranks_per_node": topology["gpus_per_node"],
+ "unique_local_ranks": True,
+ "valid": True,
+ }:
+ raise ContractError("raw realized placement differs from requested topology")
+ if (
+ topology["world_size"] != ep_size
+ or topology["nodes"] * topology["gpus_per_node"] != ep_size
+ or topology["device_count"] != topology["gpus_per_node"]
+ or topology["placement"] != profile["placement"]
+ or (
+ topology["scope"] == "scale-up"
+ and (
+ ep_size > topology["scale_up_domain"]
+ or topology["scale_out_transport"] is not None
+ )
+ )
+ or (
+ topology["scope"] == "scale-out"
+ and (
+ ep_size <= topology["scale_up_domain"]
+ or ep_size % topology["scale_up_domain"] != 0
+ or topology["scale_out_transport"] is None
+ )
+ )
+ or topology["scope"] not in {"scale-up", "scale-out"}
+ ):
+ raise ContractError("raw topology dimensions differ from the case")
+ if implementation["name"] == "deepep-v2":
+ scale_out = scheduled_case["scope"] == "scale-out"
+ expected_policy = (
+ (True, True, "nccl-gin")
+ if scale_out
+ else (False, False, "nccl-device-lsa")
+ )
+ if (
+ provenance_fields.get("allow_hybrid_mode"),
+ provenance_fields.get("gin_enabled"),
+ provenance_fields.get("communication_backend"),
+ ) != expected_policy:
+ raise ContractError("DeepEP V2 communication policy differs from the v1 contract")
+ lsa_topology = tuple(
+ _integer(
+ provenance_fields.get(field),
+ f"raw.implementation.provenance.{field}",
+ minimum=1,
+ )
+ for field in (
+ "physical_rdma_ranks", "physical_nvlink_ranks",
+ "logical_scaleout_ranks", "logical_scaleup_ranks",
+ )
+ )
+ domains = ep_size // scheduled_case["scale_up_domain"]
+ expected_v2_topology = (
+ (
+ domains,
+ scheduled_case["scale_up_domain"],
+ domains,
+ scheduled_case["scale_up_domain"],
+ )
+ if scale_out
+ else (1, ep_size, 1, ep_size)
+ )
+ if lsa_topology != expected_v2_topology:
+ raise ContractError("DeepEP V2 realized communication domains differ from topology")
+ runtime = _keys(
+ doc["runtime_fingerprint"],
+ {"accelerator_runtime", "collective_library", "device", "driver_version", "framework",
+ "machine", "python_version", "vendor"},
+ "raw.runtime_fingerprint",
+ )
+ for field in ("machine", "python_version", "vendor"):
+ _text(runtime[field], f"raw.runtime_fingerprint.{field}")
+ runtime_device = _keys(
+ runtime["device"], {"arch", "compute_units", "memory_bytes", "product", "warp_size"},
+ "raw.runtime_fingerprint.device",
+ )
+ if topology["device_product"] != runtime_device["product"]:
+ raise ContractError("raw topology and runtime device products differ")
+ platform = capability.PLATFORMS.get(case["runner"])
+ if platform is not None:
+ identity_issues = capability.runtime_identity_issues(
+ case["runner"], vendor=runtime["vendor"], arch=runtime_device["arch"],
+ machine=runtime["machine"], device_name=runtime_device["product"],
+ device_count=topology["device_count"], world_size=topology["world_size"],
+ )
+ registered_topology = capability.topology_for(case["runner"], ep_size)
+ if identity_issues or (
+ registered_topology is None
+ or topology["gpus_per_node"] != platform["gpus_per_node"]
+ or topology["scale_up_domain"] != platform["scale_up_domain"]
+ or any(
+ topology[field] != registered_topology[field]
+ for field in (
+ "nodes", "scope", "scale_up_transport", "scale_out_transport",
+ "topology_class", "transport",
+ )
+ )
+ ):
+ raise ContractError(
+ "raw runtime/topology differs from the scheduled SKU: "
+ + "; ".join(identity_issues)
+ )
+ raw_provenance = _keys(
+ doc["provenance"], {"command", "distributed_launcher", "git_run", "image", "redaction"},
+ "raw.provenance",
+ )
+ image = _keys(
+ raw_provenance["image"],
+ {"arch", "digest", "digest_verified", "reference", "squash_sha256"},
+ "raw.provenance.image",
+ )
+ if (
+ image["digest_verified"] is not True
+ or not isinstance(image["digest"], str)
+ or not re.fullmatch(r"sha256:[0-9a-f]{64}", image["digest"])
+ ):
+ raise ContractError("raw image digest was not registry-verified")
+ if raw_provenance["redaction"] != "sanitized-v1":
+ raise ContractError("raw provenance redaction contract differs")
+ git_run = raw_provenance["git_run"]
+ if git_run is not None:
+ git_run = _keys(git_run, GIT_RUN_FIELDS, "raw.provenance.git_run")
+ expected_provenance_complete = provenance_complete(
+ provenance_fields,
+ case["backend"],
+ git_run,
+ image_digest=image["digest"],
+ image_verified=image["digest_verified"],
+ squash_sha256=image["squash_sha256"],
+ )
+
+ actual_scheduled_case = {
+ "backend": case["backend"],
+ "canonical": workload["source"] == "canonical-serialized",
+ "eplb": eplb["enabled"],
+ "ep": ep_size,
+ "experts": shape["num_logical_experts"],
+ "gpus_per_node": topology["gpus_per_node"],
+ "hidden": hidden,
+ "ladder": " ".join(map(str, row_tokens)),
+ "mode": case["mode"],
+ "nodes": topology["nodes"],
+ "phase": case["phase"],
+ "required_publication": case["required_publication"],
+ "routing": shape["routing"],
+ "samples_per_point": sampling["samples_per_component"],
+ "scale_out_transport": topology["scale_out_transport"],
+ "scale_up_domain": topology["scale_up_domain"],
+ "scale_up_transport": topology["scale_up_transport"],
+ "scope": topology["scope"],
+ "suite": case["suite"],
+ "timing": (
+ f"{sampling['iterations_per_trial']}:{sampling['trials']}:"
+ f"{sampling['warmup_iterations']}"
+ ),
+ "topk": shape["topk"],
+ "topology_class": topology["topology_class"],
+ "transport": topology["transport"],
+ "warmup_semantics": sampling["warmup_semantics"],
+ "workload": case["workload_name"],
+ }
+ if scheduled_case != actual_scheduled_case:
+ mismatches = sorted(
+ field for field in scheduled_case
+ if scheduled_case[field] != actual_scheduled_case[field]
+ )
+ raise ContractError(f"raw data differs from scheduled case fields {mismatches}")
+
+ if workload["source"] == "canonical-serialized":
+ _validate_canonical_workload(workload, scheduled_case, rows, eplb)
+
+ expected_series = {
+ "backend": case["backend"],
+ "case_id": identifiers["case_id"],
+ "image_digest": image["digest"],
+ "implementation_contract_sha256": _sha256_json({
+ "kernel_generation": implementation["kernel_generation"],
+ "name": implementation["name"],
+ "provenance": series_provenance(provenance_fields),
+ "resource_profile": resource_profile,
+ }),
+ "public_config_sha256": public_series_config_sha256(public_series_config(
+ kernel_generation=implementation["kernel_generation"],
+ provenance=provenance_fields,
+ resource_profile=resource_profile,
+ resource_mode=case["resource_mode"],
+ device_product=topology["device_product"],
+ )),
+ "routing_control_sha256": routing_implementation_control_sha256(implementation),
+ "runtime_fingerprint_sha256": _sha256_json(runtime),
+ "source_sha": git_run["source_sha"] if git_run is not None else None,
+ "squash_sha256": image["squash_sha256"],
+ "workload_id": workload["workload_id"] or workload["trace_signature"],
+ }
+ if series_factors != expected_series:
+ raise ContractError("raw series factors differ from measured implementation/runtime")
+ expected_allocation = {
+ "artifact": git_run["artifact"] if git_run is not None else None,
+ "execution_id": allocation_factors["execution_id"],
+ "job": git_run["job"] if git_run is not None else None,
+ "repo": git_run["repo"] if git_run is not None else None,
+ "run_attempt": git_run["run_attempt"] if git_run is not None else None,
+ "run_id": git_run["run_id"] if git_run is not None else None,
+ "runner": case["runner"],
+ "source_sha": git_run["source_sha"] if git_run is not None else None,
+ }
+ if allocation_factors != expected_allocation:
+ raise ContractError("raw allocation factors differ from provenance")
+ artifact = _keys(doc["sample_artifact"], {"bytes", "format", "path", "sha256"}, "raw.sample_artifact")
+ if artifact["format"] != SAMPLES_FORMAT or Path(artifact["path"]).name != artifact["path"]:
+ raise ContractError("raw.sample_artifact format/path is invalid")
+ if not isinstance(artifact["sha256"], str) or len(artifact["sha256"]) != 64:
+ raise ContractError("raw.sample_artifact.sha256 is invalid")
+ _integer(artifact["bytes"], "raw.sample_artifact.bytes", minimum=1)
+ outcome = _keys(doc["outcome"], {"publication_status", "reasons", "status", "validity"}, "raw.outcome")
+ if outcome["status"] not in {"success", "invalid"} or outcome["publication_status"] not in {"diagnostic", "invalid"}:
+ raise ContractError("raw outcome status is invalid")
+ if not isinstance(outcome["reasons"], list) or not all(isinstance(x, str) for x in outcome["reasons"]):
+ raise ContractError("raw outcome reasons must be strings")
+ validity = _keys(
+ outcome["validity"],
+ {"anomaly_free", "execution_status", "measurement_conformance", "provenance_complete",
+ "resource_conformance", "sampling_conformance", "semantic_correctness",
+ "workload_identity", "workload_source"},
+ "raw.outcome.validity",
+ )
+ correctness_passed = all(row["correctness"]["passed"] for row in rows)
+ workload_consistent = workload["cross_rank_consistent"] is True
+ expected_status = "success" if correctness_passed and workload_consistent else "invalid"
+ expected_publication = "diagnostic" if expected_status == "success" else "invalid"
+ if (
+ outcome["status"] != expected_status
+ or outcome["publication_status"] != expected_publication
+ or bool(outcome["reasons"]) == (expected_status == "success")
+ or validity["execution_status"] != "complete"
+ or validity["semantic_correctness"] != ("pass" if correctness_passed else "fail")
+ or validity["workload_identity"] != (
+ "consistent-across-ranks" if workload_consistent else "inconsistent"
+ )
+ or validity["workload_source"] != workload["source"]
+ or validity["measurement_conformance"] != "conformant"
+ or validity["sampling_conformance"] != "conformant"
+ or validity["resource_conformance"] != resource_profile["conformance_class"]
+ or validity["anomaly_free"] != (recomputed_anomalies == 0)
+ or validity["provenance_complete"] is not expected_provenance_complete
+ ):
+ raise ContractError("raw outcome differs from its measurement evidence")
+ artifact_safety.assert_publication_safe([doc])
+ return doc
+
+
+ def make_terminal_document(
+     *,
+     allocation_factors: dict[str, Any],
+     attempt_ordinal: int,
+     case: dict[str, Any],
+     case_factors: dict[str, Any],
+     control_sha256: str | None,
+     failure_mode: str,
+     generated_at: str,
+     git_run: dict[str, Any] | None,
+     reason: str,
+     return_code: int,
+     source: str,
+     status: str,
+     expected_case_id: str | None = None,
+ ) -> dict[str, Any]:
+     """Assemble one terminal-outcome record and return it validated.
+
+     Case, allocation and attempt IDs are derived from the given factors through
+     the ``identity`` helpers. Raises ``ContractError`` when ``expected_case_id``
+     is provided and does not equal the derived case ID.
+     """
+     derived_case = identity.digest("case", case_factors)
+     if expected_case_id is not None and expected_case_id != derived_case:
+         raise ContractError(
+             "scheduled case ID differs from terminal factors: "
+             f"{expected_case_id} != {derived_case}"
+         )
+     derived_allocation = identity.allocation_id(allocation_factors)
+     derived_attempt = identity.attempt_id(
+         allocation=derived_allocation, case=derived_case, ordinal=attempt_ordinal
+     )
+     identity_section = {
+         "allocation_factors": allocation_factors,
+         "allocation_id": derived_allocation,
+         "attempt_id": derived_attempt,
+         "attempt_ordinal": attempt_ordinal,
+         "case_factors": case_factors,
+         "case_id": derived_case,
+     }
+     provenance_section = {
+         "git_run": git_run, "control_sha256": control_sha256,
+         "redaction": "sanitized-v1", "source": source,
+     }
+     outcome_section = {
+         "status": status, "failure_mode": failure_mode,
+         "reason": reason, "return_code": return_code,
+     }
+     return validate_terminal_document({
+         "format": TERMINAL_FORMAT, "schema_version": 1,
+         "record_type": "terminal-outcome", "generated_at": generated_at,
+         "identity": identity_section, "case": case,
+         "provenance": provenance_section, "outcome": outcome_section,
+     })
+
+
+def validate_terminal_document(document: Any) -> dict[str, Any]:
+    """Validate one terminal-outcome document and return it.
+
+    Checks run in a fixed order: native schema, top-level envelope, typed
+    identities, scheduled case/profile agreement, recomputed identity digests,
+    provenance, the registered source/outcome pairing, and the allocation
+    factors implied by provenance and source. Mismatches detected here raise
+    ``ContractError``; the publication-safety assertion runs last.
+    """
+    _validate_native_schema("terminal-outcome-v1.schema.json", document)
+    top_level_fields = {
+        "case", "format", "generated_at", "identity", "outcome", "provenance",
+        "record_type", "schema_version",
+    }
+    doc = _keys(document, top_level_fields, "terminal")
+    envelope = (doc["format"], doc["schema_version"], doc["record_type"])
+    if envelope != (TERMINAL_FORMAT, 1, "terminal-outcome"):
+        raise ContractError("terminal format/schema/record type differs from v1")
+
+    identity_fields = {
+        "allocation_factors", "allocation_id", "attempt_id", "attempt_ordinal",
+        "case_factors", "case_id",
+    }
+    ids = _keys(doc["identity"], identity_fields, "terminal.identity")
+    typed_fields = {"allocation_id": "allocation", "attempt_id": "attempt", "case_id": "case"}
+    for field, kind in typed_fields.items():
+        _typed(ids[field], kind, f"terminal.identity.{field}")
+    ordinal = _integer(ids["attempt_ordinal"], "terminal.identity.attempt_ordinal", minimum=1)
+
+    case = _keys(doc["case"], TERMINAL_CASE_FIELDS, "terminal.case")
+    factors = _keys(ids["case_factors"], {"case", "profile", "sku"}, "terminal.identity.case_factors")
+    case_matches = factors["case"] == case
+    if not case_matches or factors["profile"] != scheduled_case_profile(case, "terminal.case"):
+        raise ContractError("terminal case factors differ from the scheduled case/profile")
+    _text(factors["sku"], "terminal.identity.case_factors.sku")
+    allocation = _keys(
+        ids["allocation_factors"], ALLOCATION_FACTOR_FIELDS,
+        "terminal.identity.allocation_factors",
+    )
+
+    # Recompute every typed identity from its factors; the stored IDs must agree.
+    derived_case = identity.digest("case", factors)
+    derived_allocation = identity.allocation_id(allocation)
+    derived_attempt = identity.attempt_id(
+        allocation=derived_allocation, case=derived_case, ordinal=ordinal
+    )
+    stored_ids = (ids["case_id"], ids["allocation_id"], ids["attempt_id"])
+    if stored_ids != (derived_case, derived_allocation, derived_attempt):
+        raise ContractError("terminal typed identities do not match their factors")
+
+    provenance = _keys(
+        doc["provenance"], {"git_run", "control_sha256", "redaction", "source"},
+        "terminal.provenance",
+    )
+    git_run = provenance["git_run"]
+    if git_run is not None:
+        git_run = _keys(git_run, GIT_RUN_FIELDS, "terminal.provenance.git_run")
+    control = provenance["control_sha256"]
+    if control is not None:
+        # A present control digest must be 64 lowercase hexadecimal characters.
+        well_formed = (
+            isinstance(control, str)
+            and len(control) == 64
+            and all(char in "0123456789abcdef" for char in control)
+        )
+        if not well_formed:
+            raise ContractError("terminal control_sha256 is invalid")
+    if provenance["redaction"] != "sanitized-v1":
+        raise ContractError("terminal redaction contract differs")
+    source = _text(provenance["source"], "terminal.provenance.source")
+
+    outcome = _keys(
+        doc["outcome"], {"failure_mode", "reason", "return_code", "status"}, "terminal.outcome"
+    )
+    if outcome["status"] not in {"failed", "invalid", "unsupported"}:
+        raise ContractError("terminal outcome status is invalid")
+    failure_mode = _text(outcome["failure_mode"], "terminal.outcome.failure_mode")
+    reason = _text(outcome["reason"], "terminal.outcome.reason")
+    _integer(outcome["return_code"], "terminal.outcome.return_code")
+
+    # Each registered source fixes the runner label and the admissible outcome.
+    emitter_reasons = {
+        "runtime-emitter": RUNTIME_FAILURE_REASONS,
+        "post-emit-command": POST_EMIT_FAILURE_REASONS,
+    }
+    if source in emitter_reasons:
+        expected_runner = factors["sku"]
+        registered_reason = emitter_reasons[source].get(failure_mode)
+        valid_outcome = outcome["status"] == "failed" and reason == registered_reason
+    elif source == "matrix-capability-resolver":
+        expected_runner = "capability-resolver"
+        valid_outcome = (
+            outcome["status"] == "unsupported"
+            and failure_mode == "capability"
+            and reason in CAPABILITY_FAILURE_REASONS
+        )
+    else:
+        raise ContractError("terminal provenance source is not registered")
+    if not valid_outcome:
+        raise ContractError("terminal source and outcome are not registered")
+
+    # Allocation factors mirror git_run (or None without one), plus execution ID and runner.
+    git_fields = ("artifact", "job", "repo", "run_attempt", "run_id", "source_sha")
+    implied_allocation = {
+        field: (git_run[field] if git_run is not None else None) for field in git_fields
+    }
+    implied_allocation["execution_id"] = allocation["execution_id"]
+    implied_allocation["runner"] = expected_runner
+    if allocation != implied_allocation:
+        raise ContractError("terminal allocation factors differ from provenance or source")
+    artifact_safety.assert_publication_safe([doc])
+    return doc
+
+
+def load_raw_attempt(path: str | os.PathLike[str]) -> dict[str, Any]:
+    """Load a raw attempt together with its sibling sample artifact.
+
+    The sample file named by ``sample_artifact.path`` is resolved next to
+    ``path``. Its byte length and SHA-256 must equal the recorded values, or
+    ``ContractError`` is raised. Both documents then go to
+    ``validate_raw_document``.
+    """
+    document = strict_load(path)
+    artifact = _obj(_obj(document, "raw").get("sample_artifact"), "raw.sample_artifact")
+    sample_path = Path(path).with_name(_text(artifact.get("path"), "raw.sample_artifact.path"))
+    payload = sample_path.read_bytes()
+    size_matches = len(payload) == artifact.get("bytes")
+    digest_matches = hashlib.sha256(payload).hexdigest() == artifact.get("sha256")
+    if not (size_matches and digest_matches):
+        raise ContractError("sample artifact bytes or digest differ")
+    return validate_raw_document(document, strict_load(sample_path))
+
+
+def load_attempt(path: str | os.PathLike[str]) -> dict[str, Any]:
+    """Load ``path`` and validate it as a raw or terminal attempt by its ``format``.
+
+    Raises ``ContractError`` when the document is not a mapping or names
+    neither native attempt format.
+    """
+    document = strict_load(path)
+    declared = document.get("format") if isinstance(document, dict) else None
+    if declared == RAW_FORMAT:
+        # Raw attempts are reloaded from disk so the sample artifact is checked too.
+        return load_raw_attempt(path)
+    if declared == TERMINAL_FORMAT:
+        return validate_terminal_document(document)
+    raise ContractError("unknown native attempt format")
+
+
+def quarantine_invalid_attempt(path: str | os.PathLike[str]) -> bool:
+    """Rename an invalid attempt, and its sample file, with a ``.quarantine`` suffix.
+
+    Returns ``False`` when ``path`` is not a regular file or the attempt
+    validates, and ``True`` after an invalid attempt has been renamed. The
+    sample named by ``sample_artifact.path`` is renamed only when that name is
+    a bare basename and the file exists next to the attempt.
+    """
+    destination = Path(path)
+    if not destination.is_file():
+        return False
+    try:
+        load_attempt(destination)
+    except (ContractError, identity.IdentityError, OSError, ValueError):
+        # The CLI treats identity errors as contract failures too, so they
+        # must also lead to quarantine rather than propagate.
+        pass
+    else:
+        return False
+    try:
+        document = json.loads(destination.read_bytes())
+    except (OSError, ValueError, RecursionError):
+        # ValueError covers both JSONDecodeError and the UnicodeDecodeError
+        # raised for non-UTF-8 bytes; an unreadable file is still quarantined.
+        document = {}
+    artifact = document.get("sample_artifact") if isinstance(document, dict) else None
+    sample_name = artifact.get("path") if isinstance(artifact, dict) else None
+    if isinstance(sample_name, str) and Path(sample_name).name == sample_name:
+        sample_path = destination.with_name(sample_name)
+        if sample_path.is_file():
+            os.replace(sample_path, sample_path.with_name(sample_path.name + ".quarantine"))
+    os.replace(destination, destination.with_name(destination.name + ".quarantine"))
+    return True
+
+
+def normalize_attempt(document: dict[str, Any]) -> dict[str, Any]:
+    """Project a validated attempt onto the fields the publisher consumes.
+
+    Raw attempts contribute their measurement rows, runtime fingerprint, and
+    series ID. Terminal attempts carry an empty point list and ``None`` for
+    the other two. Any other format raises ``ContractError``.
+    """
+    declared = document.get("format")
+    if declared != RAW_FORMAT and declared != TERMINAL_FORMAT:
+        raise ContractError("unknown attempt format")
+    is_raw = declared == RAW_FORMAT
+    ids = document["identity"]
+    return {
+        "allocation_id": ids["allocation_id"],
+        "attempt_id": ids["attempt_id"],
+        "case": document["case"],
+        "case_id": ids["case_id"],
+        "generated_at": document["generated_at"],
+        "outcome": document["outcome"],
+        "points": document["measurement"]["rows"] if is_raw else [],
+        "runtime_fingerprint": document["runtime_fingerprint"] if is_raw else None,
+        "series_id": ids["series_id"] if is_raw else None,
+    }
+
+
+def _env_integer(name: str, default: int) -> int:
+    """Return environment variable ``name`` as an int, or ``default``.
+
+    ``default`` is returned when the variable is unset or does not parse as an
+    integer.
+    """
+    text = os.environ.get(name)
+    if text is None:
+        return default
+    try:
+        return int(text)
+    except ValueError:
+        return default
+
+
+def _env_enabled(name: str) -> bool:
+    """Return True when ``name`` is set to 1, true, or yes (case-insensitive)."""
+    flag = os.environ.get(name, "")
+    return flag.lower() in {"1", "true", "yes"}
+
+
+def _terminal_case_from_environment(backend: str, phase: str) -> dict[str, Any]:
+    """Rebuild the scheduled case coordinates from exported ``CX_*`` variables.
+
+    Unset variables fall back to the defaults written here. Fields read with
+    ``or`` also treat an empty value as unset, while fields read with a
+    ``get`` default keep an empty string as-is.
+    """
+    env = os.environ
+    ep = _env_integer("CX_EP", _env_integer("CX_NGPUS", 1))
+    gpus_per_node = _env_integer("CX_GPUS_PER_NODE", ep)
+    if phase == "decode":
+        default_ladder = "1 2 4 8 16 32 64 128"
+    else:
+        default_ladder = "128 256 512 1024 2048 4096"
+    ladder = env.get("CX_TOKENS_LADDER", "") or default_ladder
+    iters = _env_integer("CX_ITERS", 8)
+    trials = _env_integer("CX_TRIALS", 64)
+    warmup = _env_integer("CX_WARMUP", 32)
+    case: dict[str, Any] = {
+        "suite": env.get("CX_SUITE") or "manual",
+        "workload": env.get("CX_WORKLOAD_NAME") or "manual",
+        "required_publication": env.get("CX_REQUIRED_PUBLICATION") or "diagnostic",
+        "backend": backend,
+        "mode": env.get("CX_MODE", "normal"),
+        "routing": env.get("CX_ROUTING", "uniform"),
+        "phase": phase,
+        "ep": ep,
+        "eplb": _env_enabled("CX_EPLB"),
+        "hidden": _env_integer("CX_HIDDEN", 7168),
+        "topk": _env_integer("CX_TOPK", 8),
+        "experts": _env_integer("CX_EXPERTS", 256),
+        "samples_per_point": _env_integer("CX_SAMPLES_PER_POINT", 512),
+        "warmup_semantics": env.get(
+            "CX_WARMUP_SEMANTICS",
+            "full-roundtrip-before-each-component-trial-point-v1",
+        ),
+        "ladder": ladder,
+        "timing": f"{iters}:{trials}:{warmup}",
+        "canonical": _env_enabled("CX_CANONICAL"),
+        "nodes": _env_integer("CX_NODES", _env_integer("SLURM_NNODES", 1)),
+        "gpus_per_node": gpus_per_node,
+        "scale_up_domain": _env_integer("CX_SCALE_UP_DOMAIN", gpus_per_node),
+        "scope": env.get("CX_SCOPE", "scale-up"),
+        "topology_class": env.get("CX_TOPO", "manual"),
+        "transport": env.get("CX_TRANSPORT", "unknown"),
+        "scale_up_transport": env.get("CX_SCALE_UP_TRANSPORT", "unknown"),
+        "scale_out_transport": env.get("CX_SCALE_OUT_TRANSPORT") or None,
+    }
+    return case
+
+
+def _git_run_from_environment() -> dict[str, Any] | None:
+    """Collect workflow-run provenance from the environment.
+
+    Each field takes the first non-empty variable among its candidates and is
+    ``None`` otherwise. The function returns ``None`` when every field is
+    ``None``.
+    """
+    candidates = {
+        "run_id": ("GITHUB_RUN_ID",),
+        "run_attempt": ("GITHUB_RUN_ATTEMPT",),
+        "ref": ("GITHUB_REF_NAME", "GITHUB_REF"),
+        "source_sha": ("COLLECTIVEX_SOURCE_SHA", "GITHUB_SHA"),
+        "repo": ("GITHUB_REPOSITORY",),
+        "job": ("GITHUB_JOB",),
+        "artifact": ("COLLECTIVEX_ARTIFACT_NAME",),
+    }
+    git_run: dict[str, Any] = {}
+    for field, names in candidates.items():
+        # Empty strings count as unset, so only truthy values are accepted.
+        found = [os.environ[name] for name in names if os.environ.get(name)]
+        git_run[field] = found[0] if found else None
+    if all(entry is None for entry in git_run.values()):
+        return None
+    return git_run
+
+
+def _allocation_factors_from_environment(
+    runner: str, git_run: dict[str, Any] | None
+) -> dict[str, Any]:
+    """Build allocation factors from the runner label, ``git_run``, and the environment.
+
+    Run-derived fields are copied from ``git_run`` and are ``None`` without
+    one. ``execution_id`` comes from ``COLLECTIVEX_EXECUTION_ID``, with an
+    empty value treated as unset.
+    """
+
+    def from_run(field: str) -> Any:
+        return None if git_run is None else git_run[field]
+
+    execution_id = os.environ.get("COLLECTIVEX_EXECUTION_ID") or None
+    return {
+        "artifact": from_run("artifact"),
+        "execution_id": execution_id,
+        "job": from_run("job"),
+        "repo": from_run("repo"),
+        "run_attempt": from_run("run_attempt"),
+        "run_id": from_run("run_id"),
+        "runner": runner,
+        "source_sha": from_run("source_sha"),
+    }
+
+
+def make_terminal_from_environment(
+    *, backend: str, phase: str, return_code: int, failure_mode: str | None = None
+) -> dict[str, Any]:
+    """Build a failed runtime-emitter terminal document from the environment.
+
+    The failure mode is ``failure_mode`` when given, otherwise the mode
+    registered for ``return_code`` (falling back to ``"execution"``). A mode
+    without a registered runtime reason raises ``ContractError``.
+    """
+    if failure_mode:
+        mode = failure_mode
+    else:
+        mode = RETURN_CODE_FAILURE_MODES.get(return_code, "execution")
+    reason = RUNTIME_FAILURE_REASONS.get(mode)
+    if reason is None:
+        raise ContractError("runtime failure mode is not registered")
+
+    runner = os.environ.get("CX_RUNNER", "")
+    case = _terminal_case_from_environment(backend, phase)
+    profile = scheduled_case_profile(case, "runtime case")
+    git_run = _git_run_from_environment()
+    timestamp = dt.datetime.now(dt.timezone.utc).isoformat()
+    return make_terminal_document(
+        allocation_factors=_allocation_factors_from_environment(runner, git_run),
+        attempt_ordinal=_env_integer("CX_ATTEMPT_ID", 1),
+        case=case,
+        case_factors={"case": case, "profile": profile, "sku": runner},
+        control_sha256=os.environ.get("COLLECTIVEX_CONTROL_SHA256") or None,
+        failure_mode=mode,
+        generated_at=timestamp,
+        git_run=git_run,
+        reason=reason,
+        return_code=return_code,
+        source="runtime-emitter",
+        status="failed",
+        expected_case_id=os.environ.get("CX_CASE_ID") or None,
+    )
+
+
+def _write_document(path: str | os.PathLike[str], document: dict[str, Any]) -> None:
+    """Write ``document`` as indented, key-sorted JSON to ``path``.
+
+    Parent directories are created as needed. The text is written to a
+    sibling ``<name>.tmp`` file first and then moved over ``path`` with
+    ``os.replace``.
+    """
+    target = Path(path)
+    target.parent.mkdir(parents=True, exist_ok=True)
+    staging = target.with_name(f"{target.name}.tmp")
+    serialized = json.dumps(document, indent=2, sort_keys=True)
+    staging.write_text(serialized + "\n")
+    os.replace(staging, target)
+
+
+def demote_raw_attempt(path: str | os.PathLike[str], return_code: int) -> dict[str, Any]:
+    """Replace a raw attempt with a post-emit-command terminal failure.
+
+    The terminal document reuses the raw attempt's identity factors and
+    ``git_run``. The sibling sample file is deleted when its recorded name is
+    a bare basename, and the terminal document is then written over ``path``.
+
+    Raises ``ContractError`` when the file is not a native raw attempt, its
+    identity lacks the terminal factors, or the failure mode derived from
+    ``return_code`` has no registered post-emit reason.
+    """
+    destination = Path(path)
+    raw = strict_load(destination)
+    if not isinstance(raw, dict) or raw.get("format") != RAW_FORMAT:
+        raise ContractError("only a native raw attempt can be demoted")
+    ids = _obj(raw.get("identity"), "raw.identity")
+    required = {
+        "allocation_factors", "allocation_id", "attempt_id", "attempt_ordinal",
+        "case_factors", "case_id",
+    }
+    if not required.issubset(ids):
+        raise ContractError("raw identity lacks terminal factors")
+    case_factors = ids["case_factors"]
+    if not isinstance(case_factors, dict) or "case" not in case_factors:
+        raise ContractError("raw identity lacks terminal factors")
+    mode = RETURN_CODE_FAILURE_MODES.get(return_code, "execution")
+    # Report an unregistered mode as a contract error, as the runtime emitter
+    # does, instead of a bare KeyError that the CLI does not catch.
+    reason = POST_EMIT_FAILURE_REASONS.get(mode)
+    if reason is None:
+        raise ContractError("post-emit failure mode is not registered")
+    git_run = _obj(raw.get("provenance"), "raw.provenance").get("git_run")
+    if git_run is not None:
+        git_run = _keys(git_run, GIT_RUN_FIELDS, "raw.provenance.git_run")
+    terminal = make_terminal_document(
+        allocation_factors=ids["allocation_factors"],
+        attempt_ordinal=ids["attempt_ordinal"],
+        case=case_factors["case"],
+        case_factors=case_factors,
+        control_sha256=os.environ.get("COLLECTIVEX_CONTROL_SHA256") or None,
+        failure_mode=mode,
+        generated_at=dt.datetime.now(dt.timezone.utc).isoformat(),
+        git_run=git_run,
+        reason=reason,
+        return_code=return_code,
+        source="post-emit-command",
+        status="failed",
+        expected_case_id=ids["case_id"],
+    )
+    # A malformed sample_artifact must not abort the demotion with AttributeError.
+    artifact = raw.get("sample_artifact")
+    sample_name = artifact.get("path") if isinstance(artifact, dict) else None
+    if isinstance(sample_name, str) and Path(sample_name).name == sample_name:
+        destination.with_name(sample_name).unlink(missing_ok=True)
+    _write_document(destination, terminal)
+    return terminal
+
+
+def validate_attempt_paths(paths: list[str]) -> int:
+    """Validate every artifact in ``paths`` and return the number of attempts.
+
+    Each path must hold a raw attempt, a terminal attempt, or a samples
+    document. The samples documents supplied must be exactly those referenced
+    by the raw attempts, and at least one attempt must be present. Violations
+    raise ``ContractError``.
+    """
+    if not paths or len(set(paths)) != len(paths):
+        raise ContractError("validate-many requires unique result paths")
+    supplied_samples: set[Path] = set()
+    referenced_samples: set[Path] = set()
+    attempt_count = 0
+    for entry in paths:
+        path = Path(entry).resolve()
+        document = strict_load(path)
+        declared = document.get("format") if isinstance(document, dict) else None
+        if declared == RAW_FORMAT:
+            validated = load_raw_attempt(path)
+            referenced_samples.add(path.with_name(validated["sample_artifact"]["path"]))
+            attempt_count += 1
+        elif declared == TERMINAL_FORMAT:
+            validate_terminal_document(document)
+            attempt_count += 1
+        elif declared == SAMPLES_FORMAT:
+            validate_samples_document(document)
+            supplied_samples.add(path)
+        else:
+            raise ContractError(f"unknown result artifact {path.name}")
+    if supplied_samples != referenced_samples:
+        raise ContractError("sample artifacts are missing, orphaned, or outside the validated set")
+    if attempt_count == 0:
+        raise ContractError("result set contains no native attempts")
+    return attempt_count
+
+
+def validate_delivery(
+    paths: list[str], source_path: str, *, disposition: str | None = None
+) -> int:
+    """Reconcile delivered attempts with their matrix or shard control document.
+
+    ``source_path`` is either a ``collectivex.matrix.v1`` document, in which
+    case ``disposition`` selects the requested cases to expect, or a shard
+    control with a ``cases`` list. Every expected case must be covered by
+    attempts whose case factors, allocation factors, and (for terminal
+    documents) control digest match, with contiguous attempt ordinals.
+
+    Returns the number of validated attempts. Raises ``ContractError`` on any
+    mismatch, including malformed case entries in the source document.
+    """
+    source_file = Path(source_path).resolve()
+    source = strict_load(source_file)
+    is_matrix = isinstance(source, dict) and source.get("format") == "collectivex.matrix.v1"
+    if is_matrix and disposition is None:
+        raise ContractError("matrix delivery validation requires a disposition")
+    try:
+        if is_matrix:
+            wrappers = [
+                item for item in source.get("requested_cases", [])
+                if isinstance(item, dict) and item.get("disposition") == disposition
+            ]
+            expected = {
+                item["case"]["case_id"]: (item["sku"], item["case"])
+                for item in wrappers
+            }
+            expected_count = len(wrappers)
+            require_one_allocation = disposition == "unsupported"
+        elif isinstance(source, dict) and isinstance(source.get("cases"), list):
+            expected = {
+                case["case_id"]: (source.get("sku"), case)
+                for case in source["cases"]
+            }
+            expected_count = len(source["cases"])
+            require_one_allocation = True
+        else:
+            raise ContractError("delivery source is not a matrix or shard control")
+    except (KeyError, TypeError) as exc:
+        # Malformed entries would otherwise escape the CLI's error handling.
+        raise ContractError("delivery source cases are malformed") from exc
+    if not expected or len(expected) != expected_count:
+        raise ContractError("delivery source has empty or duplicate case coverage")
+
+    validate_attempt_paths(paths)
+    attempts = []
+    for raw_path in paths:
+        document = strict_load(raw_path)
+        if isinstance(document, dict) and document.get("format") in {RAW_FORMAT, TERMINAL_FORMAT}:
+            attempts.append(load_attempt(raw_path))
+
+    # Loop-invariant workflow coordinates. Empty variables count as unset,
+    # matching the emitter, which records None for them.
+    expected_environment = {
+        "artifact": os.environ.get("COLLECTIVEX_ARTIFACT_NAME") or None,
+        "execution_id": os.environ.get("COLLECTIVEX_EXECUTION_ID") or None,
+        "job": os.environ.get("GITHUB_JOB") or None,
+        "repo": os.environ.get("GITHUB_REPOSITORY") or None,
+        "run_attempt": os.environ.get("GITHUB_RUN_ATTEMPT") or None,
+        "run_id": os.environ.get("GITHUB_RUN_ID") or None,
+        "source_sha": (
+            os.environ.get("COLLECTIVEX_SOURCE_SHA") or os.environ.get("GITHUB_SHA") or None
+        ),
+    }
+    by_case: dict[str, list[dict[str, Any]]] = {}
+    attempt_ids = set()
+    allocation_ids = set()
+    source_sha256 = hashlib.sha256(source_file.read_bytes()).hexdigest()
+    for document in attempts:
+        ids = document["identity"]
+        case_id = ids["case_id"]
+        if case_id not in expected or ids["attempt_id"] in attempt_ids:
+            raise ContractError("delivery contains an extra case or duplicate attempt")
+        attempt_ids.add(ids["attempt_id"])
+        allocation_ids.add(ids["allocation_id"])
+        sku, scheduled = expected[case_id]
+        scheduled_case = {key: value for key, value in scheduled.items() if key != "case_id"}
+        if ids["case_factors"] != {
+            "case": scheduled_case,
+            "profile": scheduled_case_profile(scheduled_case, "delivery case"),
+            "sku": sku,
+        }:
+            raise ContractError("delivery attempt differs from its scheduled case")
+        factors = ids["allocation_factors"]
+        # Capability-resolver terminals carry a fixed runner label, not the SKU.
+        expected_runner = (
+            "capability-resolver"
+            if document["format"] == TERMINAL_FORMAT
+            and document["provenance"]["source"] == "matrix-capability-resolver"
+            else sku
+        )
+        if any(
+            value is not None and factors[field] != value
+            for field, value in expected_environment.items()
+        ) or factors["runner"] != expected_runner:
+            raise ContractError("delivery allocation factors differ from the workflow")
+        if document["format"] == TERMINAL_FORMAT:
+            control = document["provenance"]["control_sha256"]
+            if control != source_sha256:
+                raise ContractError("terminal outcome does not reference its exact control document")
+        by_case.setdefault(case_id, []).append(document)
+    if set(by_case) != set(expected):
+        raise ContractError("delivery case coverage is incomplete")
+    for case_id, documents in by_case.items():
+        ordinals = sorted(document["identity"]["attempt_ordinal"] for document in documents)
+        if ordinals != list(range(1, len(ordinals) + 1)):
+            raise ContractError(f"delivery attempt ordinals are not contiguous for {case_id}")
+    if require_one_allocation and len(allocation_ids) != 1:
+        raise ContractError("one shard must use exactly one allocation identity")
+    return len(attempts)
+
+
+def main() -> int:
+    """Parse the command line, run one contract subcommand, and return an exit status.
+
+    Contract, identity, OS, and value errors are printed to stderr and
+    reported as exit status 1.
+    """
+    parser = argparse.ArgumentParser(description="CollectiveX native attempt contracts")
+    subparsers = parser.add_subparsers(dest="command", required=True)
+
+    probe = subparsers.add_parser("probe")
+    probe.add_argument("path")
+    probe.add_argument("--status", choices=("success", "invalid"))
+
+    emit = subparsers.add_parser("emit-terminal")
+    emit.add_argument("--out", required=True)
+    emit.add_argument("--backend", required=True)
+    emit.add_argument("--phase", required=True, choices=("decode", "prefill"))
+    emit.add_argument("--return-code", required=True, type=int)
+    emit.add_argument("--failure-mode")
+
+    demote = subparsers.add_parser("demote")
+    demote.add_argument("path")
+    demote.add_argument("--return-code", required=True, type=int)
+
+    validate_many = subparsers.add_parser("validate-many")
+    validate_many.add_argument("paths", nargs="+")
+
+    quarantine = subparsers.add_parser("quarantine-invalid")
+    quarantine.add_argument("path")
+
+    delivery = subparsers.add_parser("validate-delivery")
+    delivery.add_argument("--source", required=True)
+    delivery.add_argument("--disposition")
+    delivery.add_argument("paths", nargs="+")
+
+    args = parser.parse_args()
+    command = args.command
+    try:
+        if command == "probe":
+            document = load_attempt(args.path)
+            if args.status is None:
+                return 0
+            if document.get("format") != RAW_FORMAT:
+                return 1
+            outcome = document["outcome"]
+            validity = outcome.get("validity")
+            matches = (
+                isinstance(validity, dict)
+                and validity.get("execution_status") == "complete"
+                and outcome.get("status") == args.status
+            )
+            return 0 if matches else 1
+        elif command == "emit-terminal":
+            document = make_terminal_from_environment(
+                backend=args.backend,
+                phase=args.phase,
+                return_code=args.return_code,
+                failure_mode=args.failure_mode,
+            )
+            _write_document(args.out, document)
+            print(f"preserved terminal outcome ({document['outcome']['failure_mode']})")
+        elif command == "validate-many":
+            count = validate_attempt_paths(args.paths)
+            print(f"validated {count} native attempts")
+        elif command == "quarantine-invalid":
+            quarantine_invalid_attempt(args.path)
+        elif command == "validate-delivery":
+            count = validate_delivery(args.paths, args.source, disposition=args.disposition)
+            print(f"validated {count} delivery attempts")
+        else:
+            # The only remaining subcommand is "demote".
+            demote_raw_attempt(args.path, args.return_code)
+        return 0
+    except (ContractError, identity.IdentityError, OSError, ValueError) as exc:
+        print(f"terminal contract error: {exc}", file=sys.stderr)
+        return 1
+
+
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/experimental/CollectiveX/docs/methodology.md b/experimental/CollectiveX/docs/methodology.md
new file mode 100644
index 0000000000..606e7c8995
--- /dev/null
+++ b/experimental/CollectiveX/docs/methodology.md
@@ -0,0 +1,305 @@
+# CollectiveX EP v1 Contract
+
+
+
+**English** | [中文](./methodology_zh.md)
+
+
+
+This document defines new CollectiveX results. Historical run notes are evidence, not contract.
+
+## Product Boundary
+
+CollectiveX is a communication microbenchmark for:
+
+- comparing EP libraries on one chip/topology;
+- comparing EP latency and logical payload bandwidth across systems under the same workload; and
+- exposing unsupported, failed, invalid, and unstable evidence without contaminating decisions.
+
+It does not predict serving throughput without a separate correlation study.
+
+## Matrix
+
+The promoted workload is `deepseek-v3-v1`: hidden 7168, top-k 8, 256 routed experts, BF16 dispatch
+and combine, packed placement, and backend-tuned resources. Each case explicitly selects normal
+`layout-and-dispatch-v1` or low-latency `expert-packed-weighted-combine-v1` semantics.
+
+- `ep-core-v1`: uniform routing; decode T=1..128 powers of two; prefill T=256/512.
+- `ep-routing-v1`: Zipf with EPLB off/on; decode T=128; prefill T=512.
+- `ep-low-latency-v1`: DeepEP V1/UCCL native low-latency APIs; uniform decode T=1..128 powers of
+ two; the capability contract rejects every other backend instead of fabricating a low-latency path.
+- Canonical surface: 608 requested cases / 1,600 token points; 364 runnable cases / 940 points in
+ 58 executable workflow shards/allocation cells; 244 unsupported cases / 660 points.
+
+| Systems | EP8 | EP16 |
+|---|---|---|
+| H100/H200/B200/B300 | 1x8 NVLink, scale-up | 2x8 NVLink + RDMA, scale-out |
+| MI325X/MI355X | 1x8 XGMI, scale-up | 2x8 XGMI + RDMA, scale-out |
+| GB200/GB300 | 2x4 MNNVL, scale-up | 4x4 MNNVL, scale-up |
+
+Physical host count does not define scope. Both GB cells remain inside one 72-GPU MNNVL scale-up
+domain.
+
+Unsupported combinations are terminal outcomes, not silently skipped coverage. DeepEP V2 is the
+`ElasticBuffer` introduced by PR #605, pinned with upstream PR #630's minimal pure-scale-up fix.
+Scale-up cases request NCCL Device API LSA and fail closed unless the realized LSA team covers the
+full EP world. x86 EP16 scale-out uses the hybrid path with GIN and requires two logical scale-out
+domains represented by two physical RDMA ranks, with eight scale-up ranks per domain. GB EP16
+remains MNNVL scale-up and uses LSA. NVIDIA capabilities declared in source remain unvalidated until
+GPU outcomes pass the native oracle and publisher gates. H100 V2 on the current runner pool is a
+declared unsupported combination in v1 because NCCL 2.30.4 reports no Device API symmetric-memory
+support for its EP8 communicator; that pool can return only after all-rank CUDA P2P/LSA support is
+restored. Removed axes include `[cl]`, `[rv]`, quantization, alternate activation/routing profiles,
+uneven allocation, placement permutations, model envelopes, and scaling.
+FlashInfer is excluded from v1 after repeatable intermittent execution failures; those failures are
+not converted into planned-unsupported coverage.
+MoRI EP8 uses MI325X AsyncLL or MI355X IntraNode in normal mode. EP16 uses pinned InterNodeV1 over
+2x8 XGMI + RDMA with 96 blocks, 64 RDMA blocks, 8 warps, one QP per PE, and external input. MoRI's
+AsyncLL transport is not the genuine low-latency suite contract and is never labeled as such.
+
+## Workload Identity
+
+One canonical workload is generated over the global token batch and sliced by source rank. Expert
+indices and gate weights are serialized. Activations use a versioned integer counter formula whose
+BF16 values are exact across runtimes; its full identity is bound into the manifest. The manifest
+also binds shape/EP coordinates and oracle version. SHA-256 covers canonical bytes and parameters;
+library RNG regeneration is not proof of identity.
+
+Routing traffic distinguishes:
+
+- token-expert assignments, which determine expert compute load; and
+- rank-deduplicated token payload copies, which determine EP activation traffic.
+
+Adapters may not generate routing or reinterpret one quantity as the other.
+
+## Measurement
+
+Normal mode uses `layout-and-dispatch-v1`: dispatch timing includes layout plus communication, and
+combine returns activation payload through an unweighted rank-sum path. Low-latency mode uses
+`expert-packed-weighted-combine-v1`: native DeepEP V1/UCCL APIs dispatch token-expert assignments and
+perform gate-weighted combine. Expert-output staging is outside isolated combine timing and inside
+measured paired roundtrip. Each component declares availability, origin, start/end states, stage
+scope, and sample count. A paired-only API reports null isolated components. `isolated_sum` is
+derived and never used for throughput or recommendations. Mode is series identity, and normal and
+low-latency evidence cannot share a ranking cohort.
+
+Every measured component uses `fixed-512-v1`:
+
+- 64 trials x 8 timed iterations = 512 observations;
+- 32 synchronized full dispatch-stage-combine warmups before each available measured component at
+ every trial/point;
+- roundtrip first, then isolated dispatch and combine, with a fixed per-phase conditioning ladder; and
+- per-iteration maximum latency across ranks before nearest-rank p50/p90/p95/p99.
+
+Measured roundtrip p99 is the headline latency. Retries remain separate attempts; a later success
+does not erase earlier failures. Decode and prefill identify the serving regime represented by one
+MoE-layer collective; they do not change the timed primitive at an otherwise identical shape.
+
+The NCCL/RCCL reference is an end-to-end Python adapter, not a bare fabric primitive. Its dispatch
+boundary includes layout, count exchange, a device-to-host split synchronization, fresh receive
+allocation, and four payload/metadata all-to-all calls; activation-only combine adds one all-to-all plus
+scatter/reduction. Its p99 therefore measures the complete reference-adapter boundary and can be
+host/scheduler-sensitive. It is useful for portable system controls but must not be labeled fabric,
+link, bus, or single-collective latency.
+
+The versioned conditioning and EPLB planner contracts (reference trace, redundant count, and
+placement/remap version) are part of scheduled and evidence identity.
+
+Logical payload bandwidth is:
+
+`logical_payload_bytes / measured_latency_seconds`
+
+Normal-mode payload bytes use rank-deduplicated token-rank activations; low-latency bytes use
+token-expert assignments. Both add required scale bytes at the named boundary and exclude expert
+metadata, padding, and backend buffer capacity. Algorithm bandwidth, bus bandwidth, wire utilization,
+and physical-link utilization are not published without a defined primitive model or transport
+counters. Logical bandwidth must never be labeled physical bandwidth. Published payload and token
+rates are named `rate_at_latency_percentile`: bytes or tokens divided by the matching latency
+percentile. They are lower-tail service rates at p99 latency, not the 99th percentile of an inverted
+rate distribution.
+
+## Correctness
+
+An implementation-independent oracle uses an expert-specific deterministic transform so wrong
+expert routing cannot pass an identity roundtrip. For every rank and point it verifies:
+
+1. destination rank/expert, source token, multiplicity, gate weight, and receive counts;
+2. dispatched payload and metadata before timing;
+3. combined output before timing;
+4. unchanged semantic inputs through all timed samples; and
+5. dispatched payload/metadata and combined output again after timing.
+
+Normal-mode adapters use activation-only, unweighted rank-sum combine. The oracle builds each rank's
+gate-weighted expert aggregate before combine, independently derives `sum(gate * expert(token))`,
+and checks the dispatch metadata and transformed output. Low-latency adapters separately verify the
+expert-packed source/expert assignment, native gate weights, and gate-weighted combined output. Both
+contracts check every element with recorded `rtol=0.05` and `atol=0.02`. Any failed rank or point
+makes the case ineligible.
+Pre/post dispatch evidence is hashed in canonical source-token order. Native receive slots may be
+assigned nondeterministically, so physical receive order is not treated as a correctness property.
+
+## Native Result
+
+One raw case document uses `format: "collectivex.ep.v1"`, rejects unknown fields, and contains:
+
+- `case`: stable case ID, suite, required tier, and coordinate;
+- `workload`: canonical identity and logical MoE shape;
+- `measurement`: sampling, component states, timing, and byte accounting;
+- `implementation`: instantiated class/API, pinned source, loaded libraries, and resources;
+- `topology`: requested and realized SKU, devices, placement, scale-up domain, and transport;
+- `provenance`: source SHA, image/squash hashes, allocation, run, and attempt;
+- `rows`: point latency, byte accounting, token rate, correctness, load, fanout, and anomaly evidence; and
+- `outcome`: `success`, `failed`, `invalid`, `diagnostic`, or `unsupported`, with reasons.
+
+Raw result documents and exact samples pass through transient GitHub delivery artifacts before the
+publisher archives them in the private bundle; they never enter the public tree. Private environment
+details remain in local mode-0600 logs and ignored operator notes; they are never archived or
+published. Every expected case has one selected terminal outcome, and every attempt is retained.
+
+## Identity And Comparisons
+
+Canonical JSON produces three full SHA-256 IDs:
+
+- `series_id`: all locked factors except token coordinate and repeat allocation;
+- `point_id`: `series_id` plus token coordinate; and
+- `evidence_id`: `point_id` plus allocation/run/attempt/sample checksum.
+
+Locked factors include workload bytes, measurement and sampling contract, resources, realized
+topology, implementation/build, loaded libraries, image/squash, runtime, and source SHA.
+Deferred code generation is captured before measurement and recaptured afterward. DeepEP V2 uses a
+fixed NVCC random seed and binds final cache keys plus generated-source and executable-SASS hashes;
+raw CUBIN bytes remain private diagnostics. Hybrid binds its realized auto-tuned config and complete
+kernel-key set while retaining rank-local shared-object hashes as private diagnostics. Locally built
+extension hashes are diagnostic; their pinned source trees, build recipe, runtime, and dependencies
+remain series-bound.
+The series identity includes the case ID, which binds the complete scheduled token ladder and the
+frozen percentile, rank-reduction, conditioning, warmup, and correctness semantics.
+
+A controlled comparison declares one contrast:
+
+- `library`: backend implementation and its tuned resource profile may differ; the realized system,
+ workload, EP, resource policy, source, and measurement remain matched;
+- `chip`: a controlled platform contrast. The full realized system/topology and tuned resource
+ profile may differ while workload, EP, placement class, resource policy, backend lineage, source,
+ and measurement remain matched. It is not a silicon-only comparison;
+- `system`: all hardware/backend differences stay visible while workload, EP, and measurement match;
+- `routing`: routing distribution/EPLB differs while the static implementation build/generator,
+ system, model shape, resource profile, and measurement remain matched. Uniform and Zipf without
+ EPLB reuse the same generated implementation; EPLB's physical-expert/JIT configuration remains an
+ explicit treatment difference.
+
+Any undeclared mismatch rejects the overlay. Chip/system results describe measured systems, not
+silicon alone.
+
+## Evidence Policy
+
+Capability declarations say what may be attempted; artifacts determine evidence status. Promotion
+requires exact expected coverage with no missing, extra, duplicate, malformed, or heterogeneous
+case. Public coverage preserves each matrix disposition; promotion requires every runnable case to
+succeed and every planned-unsupported case to remain unsupported in every selected run. Only the
+pinned canonical full-v1 matrix, with a decision-grade library, chip, system, and routing cohort,
+may advance `dev-latest`; partial matrices remain diagnostic. The full-matrix digest intentionally
+pins the exact workflow shard grouping as well as the requested cases, so changing `--max-cases`
+or the SKU round-robin scheduling order produces diagnostic-only runs even when case coverage is
+unchanged. Superseded retries,
+planned-unsupported outcomes, and unstable comparison cohorts may render diagnostically but cannot
+rank or recommend; every successful required series in a promoted dataset remains decision-grade.
+Any failed, invalid, or diagnostic retry of a runnable case blocks promotion even if a later retry
+succeeds. Routing cohorts are comparable-experimental sensitivities and never produce configuration
+recommendations; official library/platform/system cohorts own actionable recommendations.
+
+A point becomes decision-grade only after three independent workflow runs and allocation IDs pass
+correctness, identity, provenance, tail gates, p50/p99 repeat-stability thresholds, and stable ordering. The
+publisher, not the frontend, computes eligibility, controlled cohorts, sensitivity pairs, and
+recommendations.
+
+## Execution Isolation
+
+Every non-MNNVL scale-out case uses operator-pinned socket and RDMA selectors. The launcher rejects
+missing or partial profiles, then probes every allocated node for the configured interface, active
+HCA port, and configured GID before backend initialization. It never substitutes a default route,
+inherited runner environment, or transport fallback. Scale-up and MNNVL cases clear the profile;
+scale-out NCCL/RCCL forces `NCCL_NET=IB` and exact HCA matching. Selector values remain in encrypted
+config and mode-0600 private logs.
+
+Repository staging uses a pre-existing, runner-owned, group/world non-writable shared base outside
+the checkout and workflow workspace. The parent process resolves the exact execution child before
+copying, claims it with a runner-owned marker, and verifies that all allocated nodes can read and
+write the same bytes. Cleanup waits for confirmed allocation teardown and removes only that child,
+including a safely identified partial claim. The same-run V2/Hybrid source archive is fully validated
+under fixed member and expanded-size bounds, and only the selected pinned root is extracted; a
+symlink is accepted only when it is a relative leaf pointing to a regular member inside the same
+backend root, followed by exact Git tree/submodule validation.
+
+## Artifact Validation And JIT Delivery
+
+There is no self-hosted service, Vercel storage, GCP, Neon, managed database, or managed object
+store. The publication workflow uses runner-local temporary storage only as a disposable validation
+and promotion workspace:
+
+```text
+$COLLECTIVEX_STORE_ROOT/
+ private/incoming/ # write-once downloaded GHA attempts
+ private/bundles/<bundle-id>/ # immutable source archives, native results/samples, matrix, checksums
+ private/quarantine/ # rejected attempts plus machine-readable reasons
+ public/datasets/<dataset-id>/ # immutable sanitized frontend datasets
+ public/channels/ # small atomic pointers: latest-attempt, dev-latest
+ locks/
+```
+
+Private and public trees use separate permissions. JSON manifests and checksums are authoritative;
+a rebuildable catalog is only an index. Raw sweep artifacts are transient publisher input; only the
+sanitized promoted NDJSON is retained as a frontend publication artifact.
+
+Container tags are checked against pinned registry digests. Enroot imports use a fixed
+`SOURCE_DATE_EPOCH` and versioned cache generation; every mounted squash is freshly hashed into
+series identity. Image-provided DeepEP is also checked against exact per-architecture wheel and
+installed-file fingerprints, so a stale cache cannot inherit the pinned source identity.
+Source-built DeepEP V2 uses a separate mode-0700 cluster-local cache mounted only as `/cx-cache`.
+Its content key binds a versioned build recipe, verified image digest, CPU/GPU architecture,
+upstream source trees, and pinned build dependencies. The cache is never an artifact or publisher
+input; per-execution source/results stages remain isolated and disposable, and marker plus runtime
+probes fail closed before reuse. The runner UID is inside the trusted cluster boundary: this cache
+guards against stale or accidental mutation, not hostile same-UID jobs. Only an unpublished partial
+build may be reset automatically; a published cache that fails integrity or runtime checks is left
+intact and rejected so a concurrent allocation cannot lose files it is using.
+
+Publication is fail-closed:
+
+1. acquire an exclusive filesystem lock and stage on the destination filesystem;
+2. archive source bytes before parsing;
+3. require the exact matrix-declared artifact set and reject every unconsumed archive member;
+4. validate strict schemas, privacy, checksums, identities, timing, and exact matrix outcomes;
+5. write checksums and `COMPLETE`, fsync, then atomically rename the private bundle;
+6. build and validate the sanitized content-addressed dataset, fsync, then atomically rename it;
+7. atomically replace `dev-latest.json` only when every promotion gate passes.
+
+Rejected attempts may update workspace `latest-attempt` but never `dev-latest`. The workspace is
+destroyed with the publication runner and is never attached to the frontend. No artifact is emitted
+unless all three selected bundles advance `dev-latest`.
+
+`publisher.py ingest` accepts the exact matrix plus one `--artifact` directory or ZIP per GitHub
+artifact. `promote` accepts explicit immutable bundle IDs. Default `verify` requires
+`latest-attempt`; it also verifies `dev-latest` when present, while an explicit
+`--channel dev-latest` requires it. The workflow copies only the verified sanitized dataset to a
+one-record `collectivex_public_v1_<sha256>.ndjson` artifact. Raw artifacts and private workspace
+content are never bundled into the application.
+
+Sweeps default to `release_tag=unversioned`. Selecting `v1` requires the locked full-matrix digest
+and emits a marker bound to the run ID, attempt, source SHA, and matrix SHA-256. The manual
+publication workflow accepts exactly three unique successful `CollectiveX Sweep` run IDs from one
+source SHA, revalidates their metadata and exact markers, downloads their immutable artifacts, and
+passes the same provenance assertions to `publisher.py ingest`. Partial, filtered, untagged,
+cross-source, failed, or expired inputs fail closed.
+
+Using a server-side GitHub read token, the frontend discovers the latest successful version-scoped
+publication run and downloads the publication artifact just in time. It requires exactly one root
+NDJSON entry, validates UTF-8, schema, promotion status, and filename/body SHA-256, then exposes a
+short-lived versioned channel pointer and immutable versioned dataset URL. The benchmark-version
+selector currently exposes V1; later versions require separate release and publication identities.
+The frontend never invents missing values, selects retries, or recomputes decision eligibility.
+
+## Legacy Data
+
+Numeric schemas 3-5 are outside the v1 publisher and frontend reader. They remain historical
+diagnostic evidence and cannot seed `dev-latest` or drive v1 decisions.
diff --git a/experimental/CollectiveX/docs/methodology_zh.md b/experimental/CollectiveX/docs/methodology_zh.md
new file mode 100644
index 0000000000..7f6dcb67af
--- /dev/null
+++ b/experimental/CollectiveX/docs/methodology_zh.md
@@ -0,0 +1,297 @@
+# CollectiveX EP v1 契约
+
+
+
+[English](./methodology.md) | **中文**
+
+
+
+本文档定义新的 CollectiveX 结果。历史运行笔记是 evidence,不是 contract。
+
+## 产品边界
+
+CollectiveX 是通信 microbenchmark,用于:
+
+- 在同一 chip/topology 上比较 EP libraries;
+- 在相同 workload 下比较不同系统的 EP latency 和 logical payload bandwidth;
+- 展示 unsupported、failed、invalid 和 unstable evidence,同时避免污染决策。
+
+若没有单独的 correlation study,它不能预测 serving throughput。
+
+## 矩阵
+
+提升后的 workload 为 `deepseek-v3-v1`:hidden 7168、top-k 8、256 routed experts、BF16
+dispatch 和 combine、packed placement,以及 backend-tuned resources。每个 case 都显式选择
+normal `layout-and-dispatch-v1` 或 low-latency `expert-packed-weighted-combine-v1` 语义。
+
+- `ep-core-v1`:uniform routing;decode T=1..128 的 2 次幂;prefill T=256/512。
+- `ep-routing-v1`:Zipf,EPLB off/on;decode T=128;prefill T=512。
+- `ep-low-latency-v1`:使用 DeepEP V1/UCCL 原生 low-latency API;uniform decode T=1..128 的
+ 2 次幂;capability contract 会拒绝其他后端,不会伪造 low-latency path。
+- 规范矩阵范围:请求 608 个 cases / 1,600 个 token points;364 个可运行 cases / 940 个
+ points,分布在 58 个可执行 workflow shards/allocation cells;244 个 unsupported cases / 660 个
+ points。
+
+| 系统 | EP8 | EP16 |
+|---|---|---|
+| H100/H200/B200/B300 | 1x8 NVLink,scale-up | 2x8 NVLink + RDMA,scale-out |
+| MI325X/MI355X | 1x8 XGMI,scale-up | 2x8 XGMI + RDMA,scale-out |
+| GB200/GB300 | 2x4 MNNVL,scale-up | 4x4 MNNVL,scale-up |
+
+物理主机数量不能定义通信范围。两个 GB 配置都位于同一个 72-GPU MNNVL scale-up domain 内。
+
+Unsupported combinations 是 terminal outcomes,不会被静默跳过。DeepEP V2 指 PR #605
+引入的 `ElasticBuffer`,并固定使用 upstream PR #630 的最小纯 scale-up 修复。V2 的 scale-up
+cases 请求 NCCL Device API LSA;若实际建立的 LSA team 未覆盖整个 EP world,则直接失败。x86
+EP16 scale-out 使用启用 GIN 的 hybrid path,并要求两个逻辑 scale-out domains(由两个物理 RDMA
+ranks 表示)、每个 domain 八个 scale-up ranks。GB EP16 仍是 MNNVL scale-up,因此使用 LSA。
+Source 中声明的 NVIDIA capabilities 在 GPU outcomes 通过 native oracle 和 publisher gates 前仍为
+unvalidated。当前 runner pool 上的 H100 V2 在 v1 中被声明为 unsupported,因为 NCCL 2.30.4
+报告其 EP8 communicator 不具备 Device API symmetric-memory 支持;只有该 pool 恢复全 rank
+CUDA P2P/LSA 支持后才能重新加入。已移除的轴包括 `[cl]`、`[rv]`、quantization、alternate
+activation/routing profiles、uneven allocation、placement
+permutations、model envelopes 和 scaling。
+FlashInfer 因可重复出现的间歇性执行失败而排除在 v1 外;这些失败不会转为 planned-unsupported
+coverage。
+MoRI EP8 在 normal mode 下使用 MI325X AsyncLL 或 MI355X IntraNode。EP16 固定使用 2x8 XGMI +
+RDMA 上的 InterNodeV1,配置为 96 blocks、64 RDMA blocks、8 warps、每个 PE 一个 QP,以及
+external input。MoRI 的 AsyncLL transport 不属于真正的 low-latency suite contract,也绝不会
+以该模式标注。
+
+## Workload 身份
+
+一个 canonical workload 在 global token batch 上生成,再按 source rank 切分。Expert indices
+和 gate weights 会序列化。Activations 使用带版本的整数计数器公式,其 BF16 值在不同 runtime
+中精确一致;完整身份绑定到 manifest。Manifest 还绑定 shape/EP coordinates 和 oracle version。
+SHA-256 覆盖 canonical bytes 和 parameters;重新生成 library RNG 不能证明身份一致。
+
+Routing traffic 区分:
+
+- token-expert assignments,决定 expert compute load;
+- rank-deduplicated token payload copies,决定 EP activation traffic。
+
+Adapters 不得生成 routing,也不得将两种量相互解释。
+
+## 测量
+
+Normal mode 使用 `layout-and-dispatch-v1`:dispatch timing 包括 layout 和 communication,combine
+通过 unweighted rank-sum path 返回 activation payload。Low-latency mode 使用
+`expert-packed-weighted-combine-v1`:DeepEP V1/UCCL 原生 API dispatch token-expert assignments,
+并执行 gate-weighted combine。Expert-output staging 不计入 isolated combine timing,但计入被测
+paired roundtrip。每个 component 声明 availability、origin、start/end states、stage scope 和 sample
+count。仅有 paired API 时,isolated components 报 null。`isolated_sum` 为派生值,不用于
+throughput 或 recommendations。Mode 属于 series identity;normal 和 low-latency evidence 不能
+共用排名 cohort。
+
+每个被测 component 均使用 `fixed-512-v1`:
+
+- 64 trials x 8 timed iterations = 512 observations;
+- 每个 trial/point 的每个可用被测 component 前,执行 32 次同步完整
+ dispatch-stage-combine warmups;
+- 先测 roundtrip,再测 isolated dispatch 和 combine,并使用固定的 per-phase conditioning ladder;
+- 每次 iteration 先取跨 rank 最大 latency,再以 nearest-rank 计算 p50/p90/p95/p99。
+
+被测 roundtrip p99 是 headline latency。Retries 保持为独立 attempts;后续成功不会抹除早期失败。
+Decode 和 prefill 表示一个 MoE-layer collective 所代表的 serving regime;在其他 shape 相同时,
+它们不会改变 timed primitive。
+
+NCCL/RCCL reference 是 end-to-end Python adapter,而不是 bare fabric primitive。其 dispatch
+boundary 包含 layout、count exchange、device-to-host split synchronization、fresh receive
+allocation,以及四次 payload/metadata all-to-all;activation-only combine 还包含一次 all-to-all 和
+scatter/reduction。因此其 p99 测量完整 reference-adapter boundary,可能对 host/scheduler 敏感。
+它可作为 portable system control,但不得标记为 fabric、link、bus 或 single-collective latency。
+
+带版本的 conditioning 和 EPLB planner contracts(reference trace、redundant count 和
+placement/remap version)属于 scheduled 和 evidence identity。
+
+Logical payload bandwidth 为:
+
+`logical_payload_bytes / measured_latency_seconds`
+
+Normal-mode payload bytes 使用按 rank 去重的 token-rank activations;low-latency bytes 使用
+token-expert assignments。两种模式都在命名边界上加入必需 scale bytes,并排除 expert metadata、
+padding 和 backend buffer capacity。若没有定义 primitive model 或 transport counters,不发布
+algorithm bandwidth、bus bandwidth、wire utilization 或 physical-link utilization。Logical
+bandwidth 绝不能标为 physical bandwidth。已发布 payload 和 token rates 命名为
+`rate_at_latency_percentile`:bytes 或 tokens 除以对应 latency percentile。它们是 p99 latency
+下的 lower-tail service rates,不是 inverted rate distribution 的 p99 percentiles。
+
+## 正确性
+
+与实现无关的 oracle 使用 expert-specific deterministic transform,使错误 expert routing 无法
+通过 identity roundtrip。它对每个 rank 和 point 验证:
+
+1. destination rank/expert、source token、multiplicity、gate weight 和 receive counts;
+2. timing 前的 dispatched payload 和 metadata;
+3. timing 前的 combined output;
+4. 所有 timed samples 期间 semantic inputs 不变;
+5. timing 后再次验证 dispatched payload/metadata 和 combined output。
+
+Normal-mode adapters 使用 activation-only、unweighted rank-sum combine。Oracle 在 combine 前
+构造每个 rank 的 gate-weighted expert aggregate,独立计算 `sum(gate * expert(token))`,并检查
+dispatch metadata 和 transformed output。Low-latency adapters 单独验证 expert-packed
+source/expert assignment、原生 gate weights 和 gate-weighted combined output。两个契约都使用
+已记录的 `rtol=0.05` 和 `atol=0.02` 检查每个 element。任一 rank 或 point 失败都会使 case
+不合格。Pre/post dispatch evidence 按
+canonical source-token order 计算 hash。Native receive slots 可能非确定性分配,因此 physical
+receive order 不作为 correctness property。
+
+## Native 结果
+
+单个 raw case document 使用 `format: "collectivex.ep.v1"`,拒绝未知 fields,并包含:
+
+- `case`:稳定 case ID、suite、required tier 和 coordinate;
+- `workload`:canonical identity 和 logical MoE shape;
+- `measurement`:sampling、component states、timing 和 byte accounting;
+- `implementation`:实例化 class/API、固定 source、loaded libraries 和 resources;
+- `topology`:requested 和 realized SKU、devices、placement、scale-up domain 和 transport;
+- `provenance`:source SHA、image/squash hashes、allocation、run 和 attempt;
+- `rows`:point latency、byte accounting、token rate、correctness、load、fanout 和 anomaly evidence;
+- `outcome`:`success`、`failed`、`invalid`、`diagnostic` 或 `unsupported`,以及 reasons。
+
+Raw result documents 和 exact samples 会先经过临时 GitHub delivery artifacts,再由 publisher
+归档到 private bundle;它们不会进入 public tree。Private environment details 只保留在本地
+mode-0600 logs 和忽略的 operator notes 中;不会归档或发布。每个 expected case 有一个 terminal
+selected outcome,同时保留每次 attempt。
+
+## 身份与比较
+
+Canonical JSON 生成三个完整 SHA-256 IDs:
+
+- `series_id`:除 token coordinate 和 repeat allocation 外的所有 locked factors;
+- `point_id`:`series_id` 加 token coordinate;
+- `evidence_id`:`point_id` 加 allocation/run/attempt/sample checksum。
+
+Locked factors 包括 workload bytes、measurement 和 sampling contract、resources、realized
+topology、implementation/build、loaded libraries、image/squash、runtime 和 source SHA。
+Deferred code generation 会在 measurement 前捕获,并在之后再次捕获。DeepEP V2 使用固定的
+NVCC random seed,并绑定最终 cache keys、generated-source hashes 与 executable-SASS hashes;
+raw CUBIN bytes 仅保留为 private diagnostics。Hybrid 绑定实际自动调优配置与完整 kernel-key
+set,同时将各 rank 的 shared-object hashes 仅保留为 private diagnostics。本地构建的 extension
+hashes 属于 diagnostic;其固定 source trees、build recipe、runtime 与 dependencies 仍绑定到
+series。
+Series identity 包含 case ID;case ID 绑定完整 scheduled token ladder,以及固定的 percentile、
+rank-reduction、conditioning、warmup 和 correctness semantics。
+
+Controlled comparison 只声明一个 contrast:
+
+- `library`:backend implementation 及其 tuned resource profile 可以不同;realized system、
+ workload、EP、resource policy、source 和 measurement 必须匹配;
+- `chip`:受控 platform contrast。完整 realized system/topology 和 tuned resource profile 可以不同,
+ 但 workload、EP、placement class、resource policy、backend lineage、source 和 measurement 必须
+ 匹配。它不是 silicon-only comparison;
+- `system`:保留所有 hardware/backend 差异,同时匹配 workload、EP 和 measurement;
+- `routing`:routing distribution/EPLB 可以不同,但 static implementation build/generator、system、
+ model shape、resource profile 和 measurement 必须匹配。未启用 EPLB 的 Uniform 和 Zipf 复用
+ 同一 generated implementation;EPLB 的 physical-expert/JIT configuration 是显式 treatment
+ difference。
+
+任何未声明的 mismatch 都会拒绝 overlay。Chip/system results 描述 measured systems,而非仅描述
+silicon。
+
+## Evidence 策略
+
+Capability declarations 说明可以尝试什么;artifacts 决定 evidence status。Promotion 要求完整的
+expected coverage,不能有 missing、extra、duplicate、malformed 或 heterogeneous case。Public
+coverage 保留每个 matrix disposition;promotion 要求每个 runnable case 在所有 selected runs 中
+成功,且每个 planned-unsupported case 始终为 unsupported。只有固定 canonical full-v1 matrix,
+且具有 decision-grade library、chip、system 和 routing cohort,才能推进 `dev-latest`;partial
+matrices 仍为 diagnostic。Full-matrix digest 有意绑定精确 workflow shard grouping 和 requested
+cases,因此即使 case coverage 不变,修改 `--max-cases` 或 SKU round-robin scheduling order 也只
+会产生 diagnostic-only runs。Superseded retries、planned-unsupported outcomes 和 unstable
+comparison cohorts 可以用于诊断展示,但不能排名或推荐;promoted dataset 中每个成功的 required
+series 都必须保持 decision-grade。Runnable case 的任何 failed、invalid 或 diagnostic retry 都会
+阻止 promotion,即使后续 retry 成功。Routing cohorts 是 comparable-experimental sensitivities,
+不会产生 configuration recommendations;official library/platform/system cohorts 才能产生可执行
+recommendations。
+
+一个 point 只有在三个独立 workflow runs 和 allocation IDs 均通过 correctness、identity、
+provenance、tail gates、p50/p99 repeat-stability thresholds 和 stable ordering 后才成为
+decision-grade。Eligibility、controlled cohorts、sensitivity pairs 和 recommendations 由
+publisher 而非 frontend 计算。
+
+## 执行隔离
+
+每个非 MNNVL scale-out case 都使用 operator 固定的 socket 与 RDMA selectors。Launcher 会拒绝
+缺失或不完整的 profile,并在 backend 初始化前逐个 allocation 节点检查已配置 interface、active
+HCA port 与指定 GID。它不会改用 default route、继承的 runner environment 或 transport
+fallback。Scale-up 和 MNNVL case 会清除该 profile;scale-out NCCL/RCCL 强制设置
+`NCCL_NET=IB` 并精确匹配 HCA。Selector values 只保留在加密配置和 mode-0600 private logs 中。
+
+Repository staging 使用 checkout 与 workflow workspace 外预创建的 shared base;该 base 由
+runner owner 持有,group/world 均不可写。父进程在复制前解析精确 execution child,以
+runner-owned marker 声明所有权,并验证所有 allocation 节点读写的是同一份 bytes。Cleanup 会
+等待 allocation teardown 得到确认,并只删除该 child,包括可安全识别的未完成 claim。同一 run
+的 V2/Hybrid source archive 会在固定 member 数和解压大小上限内完整验证,并且只提取所选 fixed
+root;仅当相对 leaf symlink 指向同一 backend root 内的 regular member 时才允许创建,之后还要
+通过精确 Git tree/submodule 校验。
+
+## 产物验证与即时交付
+
+不使用 self-hosted service、Vercel storage、GCP、Neon、managed database 或 managed object
+store。Publication workflow 仅将 runner 本地临时存储用作可丢弃的 validation 与 promotion
+工作区:
+
+```text
+$COLLECTIVEX_STORE_ROOT/
+ private/incoming/ # write-once downloaded GHA attempts
+ private/bundles/<bundle-id>/ # immutable source archives, native results/samples, matrix, checksums
+ private/quarantine/ # rejected attempts plus machine-readable reasons
+ public/datasets/<dataset-id>/ # immutable sanitized frontend datasets
+ public/channels/ # small atomic pointers: latest-attempt, dev-latest
+ locks/
+```
+
+Private 和 public trees 使用不同 permissions。JSON manifests 和 checksums 是权威记录;可重建
+catalog 仅为 index。Raw sweep artifacts 只是 publisher 的临时输入;只有清理并完成 promotion
+的 NDJSON 会保留为前端 publication artifact。
+
+Container tags 会与固定 registry digests 核对。Enroot imports 使用固定
+`SOURCE_DATE_EPOCH` 和 versioned cache generation;每个 mounted squash 都重新计算 hash 并纳入
+series identity。Image-provided DeepEP 也按精确 per-architecture wheel 和 installed-file
+fingerprints 检查,因此 stale cache 不能继承固定 source identity。
+Source-built DeepEP V2 使用独立的 mode-0700 cluster-local cache,并且只以 `/cx-cache` 挂载。
+其 content key 绑定版本化 build recipe、verified image digest、CPU/GPU architecture、
+upstream source trees 和固定 build dependencies。该 cache 既不是 artifact,也不是 publisher
+input;每次执行的 source/results stage 仍然隔离且可丢弃,并在复用前以 marker 和 runtime probe
+fail closed。Runner UID 属于受信任的 cluster boundary:该 cache 用于防止 stale 或意外修改,
+不防御恶意的同 UID job。只有从未发布的 partial build 才能自动重置;已发布 cache 一旦未通过
+integrity 或 runtime 检查,将保持原样并被拒绝,避免并发 allocation 正在使用的文件被删除。
+
+Publication 采用 fail-closed:
+
+1. 获取 exclusive filesystem lock,并在 destination filesystem 上 stage;
+2. 解析前归档 source bytes;
+3. 要求精确 matrix-declared artifact set,并拒绝每个未消费 archive member;
+4. 验证 strict schemas、privacy、checksums、identities、timing 和精确 matrix outcomes;
+5. 写入 checksums 和 `COMPLETE`,fsync,然后原子 rename private bundle;
+6. 构建并验证 sanitized content-addressed dataset,fsync,然后原子 rename;
+7. 仅在全部 promotion gates 通过后原子替换 `dev-latest.json`。
+
+Rejected attempts 可以更新工作区中的 `latest-attempt`,但不能更新 `dev-latest`。工作区会随
+publication runner 销毁,且绝不连接到前端。只有三个选定 bundles 全部推进 `dev-latest` 后才会
+生成 artifact。
+
+`publisher.py ingest` 接受精确 matrix,并为每个 GitHub artifact 接受一个 `--artifact` directory
+或 ZIP。`promote` 接受显式 immutable bundle IDs。默认 `verify` 要求 `latest-attempt`;若存在
+`dev-latest` 也会验证,而显式 `--channel dev-latest` 则要求其存在。Workflow 只会将通过验证并
+清理后的 dataset 复制到单记录 `collectivex_public_v1_<sha256>.ndjson` artifact。Raw artifacts 和
+private workspace 内容绝不打包进应用。
+
+Sweeps 默认使用 `release_tag=unversioned`。选择 `v1` 时必须匹配固定的完整 matrix digest,并
+生成绑定 run ID、attempt、source SHA 与 matrix SHA-256 的 marker。手动 publication workflow
+只接受三个唯一、成功、来自同一 source SHA 的 `CollectiveX Sweep` run IDs;它会重新校验
+metadata 与精确 markers,下载 immutable artifacts,并将相同 provenance assertions 传给
+`publisher.py ingest`。Partial、filtered、untagged、跨 source、失败或过期的输入都会 fail closed。
+
+前端使用 server-side GitHub read token,即时发现最新成功且按版本隔离的 publication run,并
+下载 publication artifact。它要求 ZIP 根目录只有一个 NDJSON entry,校验 UTF-8、schema、
+promotion 状态及 filename/body SHA-256,随后提供短期缓存的带版本 channel pointer 和 immutable
+带版本 dataset URL。Benchmark-version selector 当前只显示 V1;后续版本必须使用独立的 release
+与 publication identity。前端不会虚构 missing values、选择 retries,或重新计算 decision
+eligibility。
+
+## 历史数据
+
+Numeric schemas 3-5 不在 v1 publisher 和 frontend reader 范围内。它们仍是 historical
+diagnostic evidence,不能作为 `dev-latest` 初始数据或驱动 v1 decisions。
diff --git a/experimental/CollectiveX/identity.py b/experimental/CollectiveX/identity.py
new file mode 100644
index 0000000000..ff5b116996
--- /dev/null
+++ b/experimental/CollectiveX/identity.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python3
+"""Canonical, cross-runtime identities for CollectiveX v1."""
+from __future__ import annotations
+
+import hashlib
+import json
+import re
+from typing import Any
+
+IDENTITY_VERSION = 1
+MAX_SAFE_INTEGER = (1 << 53) - 1
+PREFIXES = {
+ "case": "cxcase-v1-",
+ "workload": "cxwork-v1-",
+ "series": "cxseries-v1-",
+ "point": "cxpoint-v1-",
+ "evidence": "cxevidence-v1-",
+ "allocation": "cxallocation-v1-",
+ "attempt": "cxattempt-v1-",
+}
+V1_NORMAL_CASE_PROFILE = {
+ "activation_generator": "collectivex-activation-counter-v3",
+ "activation_profile": "canonical-counter-source-v3",
+ "combine_dtype": "bf16",
+ "combine_quant_mode": "none",
+ "combine_semantics": "activation-only",
+ "component_order_contract": "roundtrip-dispatch-activation-only-combine-v2",
+ "conditioning_contract": "fixed-phase-ramp-8-roundtrips-v1",
+ "contract": "layout-and-dispatch-v1",
+ "correctness_scope": "dispatch-metadata-and-transformed-combine",
+ "dtype": "bf16",
+ "eplb_planner": "greedy-rank-major-v1",
+ "eplb_redundant_experts": 32,
+ "eplb_reference_tokens_per_rank": 2048,
+ "mode": "normal",
+ "oracle_contract": "expert-specific-transform-v1",
+ "oracle_tolerances": "rtol=0.05,atol=0.02",
+ "payload_unit": "token-rank",
+ "placement": "packed",
+ "percentile_method": "nearest-rank",
+ "rank_reduction": "cross-rank-max-per-iteration",
+ "resource_mode": "tuned",
+ "routing_generator": "collectivex-routing-counter-v3",
+ "sampling_contract": "fixed-512-v1",
+ "seed": 67,
+}
+
+V1_LOW_LATENCY_CASE_PROFILE = {
+ **V1_NORMAL_CASE_PROFILE,
+ "component_order_contract": "roundtrip-dispatch-gate-weighted-combine-v1",
+ "combine_semantics": "gate-weighted",
+ "contract": "expert-packed-weighted-combine-v1",
+ "correctness_scope": "expert-assignment-and-weighted-combine",
+ "mode": "low-latency",
+ "oracle_contract": "expert-assignment-transform-v1",
+ "payload_unit": "token-expert",
+}
+
+# Compatibility alias for normal-mode callers. New scheduling and validation
+# must select a profile from the explicit case mode.
+V1_CASE_PROFILE = V1_NORMAL_CASE_PROFILE
+V1_CASE_PROFILES = {
+ "normal": V1_NORMAL_CASE_PROFILE,
+ "low-latency": V1_LOW_LATENCY_CASE_PROFILE,
+}
+
+
+def case_profile(mode: str) -> dict[str, Any]:
+    """Return the shared (treat as read-only) profile for one scheduled mode."""
+ try:
+ return V1_CASE_PROFILES[mode]
+ except KeyError as exc:
+ raise IdentityError(f"unknown CollectiveX case mode {mode!r}") from exc
+
+
+def profile_for_case(case: dict[str, Any]) -> dict[str, Any]:
+ """Resolve a scheduled case's explicit mode to its identity profile."""
+ mode = case.get("mode")
+ if not isinstance(mode, str):
+ raise IdentityError("scheduled case mode is missing")
+ return case_profile(mode)
+
+
+class IdentityError(ValueError):
+ """An identity payload cannot be represented consistently across runtimes."""
+
+
+def _validate(value: Any, path: str = "$") -> None:
+ if value is None or isinstance(value, bool):
+ return
+ if isinstance(value, str):
+ if any(ord(character) < 0x20 or ord(character) > 0x7E for character in value):
+ raise IdentityError(f"{path}: string must contain printable ASCII only")
+ return
+ if type(value) is int:
+ if abs(value) > MAX_SAFE_INTEGER:
+ raise IdentityError(f"{path}: integer exceeds the cross-runtime safe range")
+ return
+ if isinstance(value, list):
+ for index, item in enumerate(value):
+ _validate(item, f"{path}[{index}]")
+ return
+ if isinstance(value, dict):
+ for key, item in value.items():
+ if not isinstance(key, str):
+ raise IdentityError(f"{path}: object key is not a string")
+ if any(ord(character) < 0x20 or ord(character) > 0x7E for character in key):
+ raise IdentityError(f"{path}: object key must contain printable ASCII only")
+ _validate(item, f"{path}.{key}")
+ return
+ raise IdentityError(f"{path}: unsupported identity value {type(value).__name__}")
+
+
+def canonical_bytes(value: Any) -> bytes:
+ """Return compact UTF-8 JSON after enforcing the portable value subset."""
+ _validate(value)
+ return json.dumps(
+ value,
+ ensure_ascii=False,
+ allow_nan=False,
+ sort_keys=True,
+ separators=(",", ":"),
+ ).encode("utf-8")
+
+
+def digest(kind: str, value: Any) -> str:
+ """Hash a typed v1 identity payload and return its typed identifier."""
+ try:
+ prefix = PREFIXES[kind]
+ except KeyError as exc:
+ raise IdentityError(f"unknown identity kind {kind!r}") from exc
+ body = {"kind": kind, "value": value, "version": IDENTITY_VERSION}
+ return prefix + hashlib.sha256(canonical_bytes(body)).hexdigest()
+
+
+def is_typed_id(value: Any, kind: str) -> bool:
+ prefix = PREFIXES.get(kind)
+ return bool(
+ isinstance(value, str)
+ and prefix
+ and re.fullmatch(re.escape(prefix) + r"[0-9a-f]{64}", value)
+ )
+
+
+def case_id(*, sku: str, profile: dict[str, Any], case: dict[str, Any]) -> str:
+ return digest("case", {"case": case, "profile": profile, "sku": sku})
+
+
+def workload_id(value: dict[str, Any]) -> str:
+ return digest("workload", value)
+
+
+def series_id(value: dict[str, Any]) -> str:
+ return digest("series", value)
+
+
+def point_id(*, series: str, tokens_per_rank: int) -> str:
+ return digest("point", {"series_id": series, "tokens_per_rank": tokens_per_rank})
+
+
+def allocation_id(value: dict[str, Any]) -> str:
+ return digest("allocation", value)
+
+
+def attempt_id(*, allocation: str, case: str, ordinal: int) -> str:
+ return digest(
+ "attempt", {"allocation_id": allocation, "case_id": case, "ordinal": ordinal}
+ )
+
+
+def evidence_id(
+ *, point: str, allocation: str, attempt: str, sample_sha256: str
+) -> str:
+ return digest(
+ "evidence",
+ {
+ "allocation_id": allocation,
+ "attempt_id": attempt,
+ "point_id": point,
+ "sample_sha256": sample_sha256,
+ },
+ )
+
+
+IDENTITY_TEST_VECTOR = {
+ "payload": {"backend": "deepep", "ep": 8, "shape": [7168, 8, 256]},
+ "series_id": "cxseries-v1-a79bf758488e3edd50f5531f3af825f371bf42aae7c4097e461fd2a32615af81",
+}
+
+
+def verify_test_vector() -> None:
+ observed = series_id(IDENTITY_TEST_VECTOR["payload"])
+ if observed != IDENTITY_TEST_VECTOR["series_id"]:
+ raise IdentityError(
+ f"identity implementation differs: {observed} != {IDENTITY_TEST_VECTOR['series_id']}"
+ )
+
+
+if __name__ == "__main__":
+ verify_test_vector()
+ print(IDENTITY_TEST_VECTOR["series_id"])
diff --git a/experimental/CollectiveX/launchers/launch_gb-nv.sh b/experimental/CollectiveX/launchers/launch_gb-nv.sh
new file mode 100644
index 0000000000..21aae4c13c
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_gb-nv.sh
@@ -0,0 +1,110 @@
+#!/usr/bin/env bash
+# CollectiveX shared GB200/GB300 NVL72 (aarch64) launcher.
+# shellcheck disable=SC2016,SC2034
+#
+# EP8/EP16 use one Slurm task per GPU across two or four trays in the same
+# MNNVL scale-up domain.
+#
+# Flow: resolve and validate the product and topology, verify the image,
+# stage the repository, obtain a Slurm allocation, import and hash the
+# container image, run the distributed shard, then collect results. The exit
+# status is the run status, or the collection status when the run succeeded.
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"; REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=../runtime/common.sh
+source "$HERE/../runtime/common.sh"
+
+# Product precedence: CX_SHARD_SKU, then CX_GB_PRODUCT, then CX_PUBLIC_RUNNER.
+# Only gb200 and gb300 are accepted.
+PRODUCT="${CX_SHARD_SKU:-${CX_GB_PRODUCT:-${CX_PUBLIC_RUNNER:-}}}"
+case "$PRODUCT" in
+ gb200|gb300) ;;
+ *) cx_die "set CX_SHARD_SKU or CX_PUBLIC_RUNNER to gb200 or gb300" ;;
+esac
+RUNNER="$PRODUCT"
+export CX_RUNNER="$RUNNER" CX_BENCH="${CX_BENCH:-deepep}"
+export CX_IMAGE_PLATFORM=linux/arm64
+# JOB_ID starts empty; it is checked for a value after cx_salloc_jobid below.
+JOB_ID=""
+cx_install_launcher_fail_safe
+cx_set_failure_stage setup
+cx_load_operator_config
+cx_lock_canonical_gha_env "$RUNNER"
+# Defaults: two trays of four GPUs inside a 72-GPU scale-up domain; the world
+# size defaults to nodes x GPUs per tray.
+NODES="${CX_NODES:-2}"; GPN="${CX_GPUS_PER_NODE:-4}"
+SCALE_UP_DOMAIN="${CX_SCALE_UP_DOMAIN:-72}"
+EXPECTED_WORLD=$((NODES * GPN))
+NGPUS="${CX_NGPUS:-$EXPECTED_WORLD}"
+# Default value handed to salloc --time: 30 for gb200, 90 for gb300.
+if [ "$PRODUCT" = gb200 ]; then default_time=30; else default_time=90; fi
+TIME_MIN="${CX_TIME:-$default_time}"
+# Reject anything other than 2 or 4 trays of 4 GPUs, domain 72, and a world
+# size equal to nodes x GPUs per tray.
+[ "$NODES" = 2 ] || [ "$NODES" = 4 ] \
+ || cx_die "$PRODUCT v1 supports two or four four-GPU trays"
+[ "$GPN" = 4 ] || cx_die "$PRODUCT requires four GPUs per tray"
+[ "$SCALE_UP_DOMAIN" = 72 ] || cx_die "$PRODUCT requires the NVL72 scale-up domain"
+[ "$NGPUS" = "$EXPECTED_WORLD" ] \
+ || cx_die "$PRODUCT world size must equal nodes x GPUs per tray"
+cx_apply_timing_profile
+IMAGE="${CX_IMAGE:-$(cx_default_image "$PRODUCT")}"
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
+# Export the validated topology; this launcher is scale-up only, so any
+# inherited scale-out transport is cleared.
+export CX_RUNNER="$RUNNER" CX_TS="$TS" CX_TOPO="${PRODUCT}-nvl72-mnnvl"
+export CX_SCOPE=scale-up CX_TRANSPORT=mnnvl CX_SCALE_UP_TRANSPORT=mnnvl
+export CX_NODES="$NODES" CX_GPUS_PER_NODE="$GPN" CX_SCALE_UP_DOMAIN="$SCALE_UP_DOMAIN"
+export CX_NGPUS="$NGPUS"
+unset CX_SCALE_OUT_TRANSPORT
+case "$CX_BENCH" in
+ deepep|deepep-v2|deepep-hybrid|nccl-ep) ;;
+ *) cx_die "unsupported $PRODUCT EP backend: $CX_BENCH" ;;
+esac
+cx_validate_shard_control "$CX_DIR"
+cx_require_vars CX_PARTITION CX_ACCOUNT CX_SQUASH_DIR CX_STAGE_DIR
+# CX_ENROOT_CACHE_PATH is mandatory on gb300 and optional on gb200; when set
+# it is exported as ENROOT_CACHE_PATH.
+[ "$PRODUCT" != gb300 ] || cx_require_vars CX_ENROOT_CACHE_PATH
+PARTITION="$CX_PARTITION"; ACCOUNT="$CX_ACCOUNT"; SQUASH_DIR="$CX_SQUASH_DIR"
+[ -z "${CX_ENROOT_CACHE_PATH:-}" ] || export ENROOT_CACHE_PATH="$CX_ENROOT_CACHE_PATH"
+export NCCL_CUMEM_ENABLE=1 NCCL_MNNVL_ENABLE=1 MC_FORCE_MNNVL=1
+cx_apply_network_profile "$NODES" "$CX_TRANSPORT"
+
+cx_log "$PRODUCT nodes=$NODES x ${GPN}gpu world=$NGPUS bench=$CX_BENCH"
+# A dry run stops here, before registry verification, staging, or allocation.
+[ "${CX_DRYRUN:-0}" = 1 ] && { cx_log "DRYRUN"; exit 0; }
+cx_set_failure_stage registry-verification
+cx_verify_registry_image "$IMAGE"
+cx_set_failure_stage repository-stage
+MOUNT_SRC="$(cx_stage_path "$REPO_ROOT" "$CX_STAGE_DIR")"
+cx_stage_repo "$REPO_ROOT" "$MOUNT_SRC"
+cx_prepare_runtime_marker "$MOUNT_SRC"
+# The staged repository is mounted at /ix inside the container.
+CONTAINER_MOUNTS="$MOUNT_SRC:/ix"
+# deepep-v2 and deepep-hybrid need the pinned backend source staged.
+if [ "$CX_BENCH" = deepep-v2 ] || [ "$CX_BENCH" = deepep-hybrid ]; then
+ cx_set_failure_stage backend-setup
+ cx_prepare_backend_source "$MOUNT_SRC" "$CX_BENCH" \
+ || cx_die "cannot stage the pinned backend source"
+ export CX_BACKEND_SOURCE_ROOT=/ix/experimental/CollectiveX/.cx_sources
+fi
+# deepep-v2 additionally mounts the prepared backend cache at /cx-cache.
+if [ "$CX_BENCH" = deepep-v2 ]; then
+ cx_prepare_backend_cache "$CX_SQUASH_DIR" \
+ || cx_die "cannot prepare the isolated backend cache"
+ CONTAINER_MOUNTS="$CONTAINER_MOUNTS,$CX_PREPARED_BACKEND_CACHE:/cx-cache"
+ export CX_BACKEND_CACHE_ROOT=/cx-cache
+fi
+
+cx_set_failure_stage scheduler-allocation
+command -v salloc >/dev/null || cx_die "salloc not found"
+# Exclusive allocation with one task per GPU on every tray.
+cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --nodes="$NODES" \
+ --gres=gpu:"$GPN" --ntasks-per-node="$GPN" --exclusive --mem=0 --cpus-per-task=35 \
+ --time="$TIME_MIN"
+[ -n "$JOB_ID" ] || cx_die "no JOB_ID from salloc"
+cx_set_failure_stage container-import
+SQUASH_FILE="$(cx_ensure_squash_on_job "$JOB_ID" "$SQUASH_DIR" "$IMAGE")"
+cx_set_failure_stage container-hash
+cx_export_squash_identity "$SQUASH_FILE"
+cx_preflight_allocation "$JOB_ID" "$NODES" "$MOUNT_SRC" "$SQUASH_FILE" \
+ "${CX_SHARD_FILE:-}"
+
+# Keep the loader policy here because it is platform/container specific and
+# security tests evaluate this literal independently.
+# The snippet exits 66 unless SLURM_NODEID is all digits, the env directory is
+# a real (non-symlink) directory with mode 700 (optionally with a leading
+# special-bits digit), and the per-node env file is a readable regular
+# non-symlink file with mode 600 owned by the directory's owner. It then
+# sources that file.
+SOURCE_BACKEND_ENV='case "${SLURM_NODEID:-}" in ""|*[!0-9]*) exit 66;; esac; env_file="/ix/experimental/CollectiveX/.cx_backend/env/node-${SLURM_NODEID}.sh"; env_root="${env_file%/*}"; [ -d "$env_root" ] && [ ! -L "$env_root" ] || exit 66; case "$(stat -c "%a" "$env_root")" in 700|[1-7]700) ;; *) exit 66;; esac; [ -f "$env_file" ] && [ -r "$env_file" ] && [ ! -L "$env_file" ] && [ "$(stat -c "%u:%a" "$env_file")" = "$(stat -c "%u" "$env_root"):600" ] || exit 66; . "$env_file" || exit 66'
+# The probe sources the backend env and then imports the module (or checks the
+# attribute) that the selected backend needs.
+BACKEND_PROBE="$SOURCE_BACKEND_ENV"'; case "$CX_BENCH" in deepep) python3 -c "from deep_ep import Buffer";; deepep-v2) python3 -c "import deep_ep; assert hasattr(deep_ep, '\''ElasticBuffer'\'')";; deepep-hybrid) python3 -c "import deep_ep; assert hasattr(deep_ep, '\''HybridEPBuffer'\'')";; nccl-ep) python3 -c "import torch";; esac'
+# The rank wrapper is prefixed with the same env loader, separated by a newline.
+WRAP="${SOURCE_BACKEND_ENV}"$'\n'"$(cx_slurm_rank_wrapper)"
+CX_DISTRIBUTED_CONTAINER_ARGS=(--container-writable --container-remap-root)
+# CX_ALLOW_MNNVL=1 is exported only for the deepep backend.
+[ "$CX_BENCH" != deepep ] || export CX_ALLOW_MNNVL=1
+# Record the run status instead of aborting so results are still collected.
+run_rc=0
+cx_set_failure_stage container-launch
+cx_run_distributed_shard || run_rc=$?
+
+cx_adopt_runtime_stage "$MOUNT_SRC"
+collect_rc=0
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" || collect_rc=$?
+# Blame artifact collection only when the run succeeded but collection failed.
+[ "$run_rc" != 0 ] || [ "$collect_rc" = 0 ] || cx_set_failure_stage artifact-collection
+# A run failure takes precedence over a collection failure in the exit status.
+final_rc="$run_rc"
+[ "$final_rc" != 0 ] || final_rc="$collect_rc"
+exit "$final_rc"
diff --git a/experimental/CollectiveX/launchers/launch_mi-amds.sh b/experimental/CollectiveX/launchers/launch_mi-amds.sh
new file mode 100644
index 0000000000..f66f820f54
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_mi-amds.sh
@@ -0,0 +1,160 @@
+#!/usr/bin/env bash
+# CollectiveX shared MI325X/MI355X AMD Slurm launcher (one or two nodes).
+# shellcheck disable=SC2016,SC2034
+#
+# Flow: resolve and validate the runner and topology, stage the repository,
+# verify the image, obtain a Slurm allocation, import and hash the container
+# image, run the shard (a single srun on one node, the distributed runner on
+# two), then collect results. The exit status is the run status, or the
+# collection status when the run succeeded.
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=../runtime/common.sh
+source "$HERE/../runtime/common.sh"
+
+# Runner precedence: CX_SHARD_SKU, then CX_PUBLIC_RUNNER. Each SKU fixes its
+# CPU budget; only mi325x adds the /dev/kfd and /dev/dri device mounts.
+RUNNER="${CX_SHARD_SKU:-${CX_PUBLIC_RUNNER:-}}"
+case "$RUNNER" in
+ mi325x) CPUS_PER_TASK=256; DEVICE_MOUNTS=",/dev/kfd:/dev/kfd,/dev/dri:/dev/dri" ;;
+ mi355x) CPUS_PER_TASK=128; DEVICE_MOUNTS="" ;;
+ *) cx_die "set CX_SHARD_SKU or CX_PUBLIC_RUNNER to mi325x or mi355x" ;;
+esac
+export CX_RUNNER="$RUNNER" CX_BENCH="${CX_BENCH:-mori}"
+export CX_IMAGE_PLATFORM=linux/amd64
+# JOB_ID starts empty; it is checked for a value after cx_salloc_jobid below.
+JOB_ID=""
+cx_install_launcher_fail_safe
+cx_set_failure_stage setup
+cx_load_operator_config
+cx_lock_canonical_gha_env "$RUNNER"
+
+# Defaults: one node of eight GPUs in an eight-GPU scale-up domain; the world
+# size defaults to nodes x GPUs per node.
+NODES="${CX_NODES:-1}"; GPN="${CX_GPUS_PER_NODE:-8}"
+SCALE_UP_DOMAIN="${CX_SCALE_UP_DOMAIN:-8}"
+EXPECTED_WORLD=$((NODES * GPN))
+NGPUS="${CX_NGPUS:-$EXPECTED_WORLD}"
+TIME_MIN="${CX_TIME:-60}"
+EXCLUDE_NODES="${CX_EXCLUDE_NODES:-}"
+NODELIST="${CX_NODELIST:-}"
+MOUNT_DIR=/ix
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
+# Reject anything other than 1 or 2 nodes of 8 GPUs, domain 8, and a world
+# size equal to nodes x GPUs per node.
+[ "$NODES" = 1 ] || [ "$NODES" = 2 ] \
+ || cx_die "$RUNNER supports one or two nodes"
+[ "$GPN" = 8 ] || cx_die "$RUNNER requires eight GPUs per node"
+[ "$SCALE_UP_DOMAIN" = 8 ] || cx_die "$RUNNER requires an eight-GPU scale-up domain"
+[ "$NGPUS" = "$EXPECTED_WORLD" ] \
+ || cx_die "$RUNNER world size must equal nodes x GPUs per node"
+case "$CX_BENCH" in
+ mori|nccl-ep) ;;
+ *) cx_die "unsupported AMD EP backend: $CX_BENCH" ;;
+esac
+
+# mi325x: provide MORI defaults that the caller may override, and default the
+# image to CX_IMAGE_AMD_MORI_MI325 when the backend is mori.
+if [ "$RUNNER" = mi325x ]; then
+ export MORI_DISABLE_AUTO_XGMI="${MORI_DISABLE_AUTO_XGMI:-0}"
+ export MORI_ENABLE_SDMA="${MORI_ENABLE_SDMA:-1}"
+ export MORI_APP_LOG_LEVEL="${MORI_APP_LOG_LEVEL:-info}"
+ export MORI_SHMEM_LOG_LEVEL="${MORI_SHMEM_LOG_LEVEL:-info}"
+ export MORI_IO_LOG_LEVEL="${MORI_IO_LOG_LEVEL:-info}"
+ [ "$CX_BENCH" != mori ] \
+ || export CX_IMAGE="${CX_IMAGE:-$CX_IMAGE_AMD_MORI_MI325}"
+fi
+# MORI kernel type: forced to internode-v1 on multiple nodes; on one node the
+# default is asyncll for mi325x and intranode otherwise, both overridable.
+if [ "$CX_BENCH" = mori ]; then
+ if [ "$NODES" -gt 1 ]; then
+ export CX_MORI_KERNEL_TYPE=internode-v1
+ elif [ "$RUNNER" = mi325x ]; then
+ export CX_MORI_KERNEL_TYPE="${CX_MORI_KERNEL_TYPE:-asyncll}"
+ else
+ export CX_MORI_KERNEL_TYPE="${CX_MORI_KERNEL_TYPE:-intranode}"
+ fi
+fi
+IMAGE="${CX_IMAGE:-$(cx_default_image "$RUNNER")}"
+export CX_RUNNER="$RUNNER" CX_NGPUS="$NGPUS" CX_NODES="$NODES"
+export CX_GPUS_PER_NODE="$GPN" CX_SCALE_UP_DOMAIN="$SCALE_UP_DOMAIN" CX_TS="$TS"
+export CX_SCALE_UP_TRANSPORT=xgmi
+# Two nodes are scale-out over RDMA; one node is scale-up over XGMI only, so
+# any inherited scale-out transport is cleared.
+if [ "$NODES" -gt 1 ]; then
+ export CX_SCOPE=scale-out CX_SCALE_OUT_TRANSPORT=rdma
+ export CX_TRANSPORT=xgmi-rdma CX_TOPO="${RUNNER}-xgmi-rdma"
+else
+ export CX_SCOPE=scale-up CX_TRANSPORT=xgmi CX_TOPO="${RUNNER}-xgmi"
+ unset CX_SCALE_OUT_TRANSPORT
+fi
+export CX_RUN_TIMEOUT="${CX_RUN_TIMEOUT:-1800}"
+cx_apply_network_profile "$NODES" "$CX_TRANSPORT"
+cx_validate_shard_control "$CX_DIR"
+cx_require_vars CX_PARTITION CX_SQUASH_DIR CX_STAGE_DIR
+PARTITION="$CX_PARTITION"; SQUASH_DIR="$CX_SQUASH_DIR"
+
+cx_log "runner=$RUNNER nodes=$NODES x ${GPN}gpu world=$NGPUS bench=$CX_BENCH"
+cx_set_failure_stage repository-stage
+MOUNT_SRC="$(cx_stage_path "$REPO_ROOT" "$CX_STAGE_DIR")"
+cx_stage_repo "$REPO_ROOT" "$MOUNT_SRC"
+cx_prepare_runtime_marker "$MOUNT_SRC"
+# A dry run exits here: the repository has already been staged, but the image
+# is not verified and nothing is allocated.
+[ "${CX_DRYRUN:-0}" != 1 ] || { cx_log "CX_DRYRUN=1 - not allocating"; exit 0; }
+cx_set_failure_stage registry-verification
+cx_verify_registry_image "$IMAGE"
+cx_set_failure_stage scheduler-allocation
+command -v salloc >/dev/null || cx_die "salloc not found on this runner"
+
+# One node: a single task gets the whole CPU budget. Two nodes: one task per
+# GPU, each with an equal share of that budget.
+allocation=(--partition="$PARTITION" --nodes="$NODES" --gres=gpu:"$GPN" --exclusive
+ --time="$TIME_MIN")
+if [ "$NODES" = 1 ]; then
+ allocation+=(--cpus-per-task="$CPUS_PER_TASK")
+else
+ allocation+=(--ntasks-per-node="$GPN" --cpus-per-task="$((CPUS_PER_TASK / GPN))")
+fi
+# An explicit node pin wins; the exclude list is used only without a pin.
+if [ -n "$NODELIST" ]; then
+ cx_log "using configured node pin"
+ allocation+=(--nodelist="$NODELIST")
+elif [ -n "$EXCLUDE_NODES" ]; then
+ allocation+=(--exclude="$EXCLUDE_NODES")
+fi
+cx_salloc_jobid "${allocation[@]}"
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc"
+cx_set_failure_stage setup
+cx_validate_network_profile_on_job "$JOB_ID" "$NODES" "$CX_TRANSPORT"
+
+cx_set_failure_stage container-import
+SQUASH_FILE="$(cx_ensure_squash_on_job \
+ "$JOB_ID" "$SQUASH_DIR" "$IMAGE" "${CX_LOCK_DIR:-}")"
+cx_set_failure_stage container-hash
+# Hash the squash file inside the allocation; srun stderr is appended to the
+# private log and only the first field of sha256sum's first line is kept.
+import_log="$(cx_private_log_path image-hash)"
+if ! COLLECTIVEX_SQUASH_SHA256="$(
+ srun --jobid="$JOB_ID" --nodes=1 --ntasks=1 --chdir=/tmp \
+ --export="$(cx_host_exports)" sha256sum "$SQUASH_FILE" \
+ 2>>"$import_log" | awk 'NR==1 {print $1}'
+)"; then
+ cx_fail_stage container-hash "$import_log"
+fi
+# Anything that is not exactly 64 lowercase hex characters is a hash failure.
+[[ "$COLLECTIVEX_SQUASH_SHA256" =~ ^[0-9a-f]{64}$ ]] \
+ || cx_fail_stage container-hash "$import_log"
+export COLLECTIVEX_SQUASH_SHA256
+cx_preflight_allocation "$JOB_ID" "$NODES" "$MOUNT_SRC" "$SQUASH_FILE" \
+ "${CX_SHARD_FILE:-}"
+# Staged repository at $MOUNT_DIR plus any SKU-specific device mounts.
+CONTAINER_MOUNTS="$MOUNT_SRC:$MOUNT_DIR$DEVICE_MOUNTS"
+
+if [ "$NODES" = 1 ]; then
+ # Single node: run the in-container script directly and capture all output
+ # in the private runtime log; the exit status is recorded, not fatal.
+ run_rc=0
+ cx_set_failure_stage container-launch
+ runtime_log="$(cx_private_log_path runtime)"
+ srun --jobid="$JOB_ID" --chdir=/tmp --container-image="$SQUASH_FILE" \
+ --container-mounts="$CONTAINER_MOUNTS" --container-writable --container-remap-root \
+ --no-container-mount-home --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \
+ --no-container-entrypoint --export="$(cx_container_exports)" \
+ bash "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh" \
+ >"$runtime_log" 2>&1 || run_rc=$?
+else
+ # The snippet exits 66 unless SLURM_NODEID is all digits, the env directory
+ # is a real (non-symlink) directory with mode 700 (optionally with a leading
+ # special-bits digit), and the per-node env file is a readable regular
+ # non-symlink file with mode 600 owned by the directory's owner. It then
+ # sources that file.
+ SOURCE_BACKEND_ENV='case "${SLURM_NODEID:-}" in ""|*[!0-9]*) exit 66;; esac; env_file="/ix/experimental/CollectiveX/.cx_backend/env/node-${SLURM_NODEID}.sh"; env_root="${env_file%/*}"; [ -d "$env_root" ] && [ ! -L "$env_root" ] || exit 66; case "$(stat -c "%a" "$env_root")" in 700|[1-7]700) ;; *) exit 66;; esac; [ -f "$env_file" ] && [ -r "$env_file" ] && [ ! -L "$env_file" ] && [ "$(stat -c "%u:%a" "$env_file")" = "$(stat -c "%u" "$env_root"):600" ] || exit 66; . "$env_file" || exit 66'
+ # The probe sources the backend env and then imports the selected backend.
+ BACKEND_PROBE="$SOURCE_BACKEND_ENV"'; case "$CX_BENCH" in mori) python3 -c "import mori";; nccl-ep) python3 -c "import torch";; esac'
+ WRAP="${SOURCE_BACKEND_ENV}"$'\n'"$(cx_slurm_rank_wrapper)"
+ CX_DISTRIBUTED_CONTAINER_ARGS=(--container-writable --container-remap-root)
+ run_rc=0
+ cx_set_failure_stage container-launch
+ cx_run_distributed_shard || run_rc=$?
+fi
+
+cx_adopt_runtime_stage "$MOUNT_SRC"
+# A failed single-node run is reported with its runtime log; "|| true" keeps
+# the script going so results are still collected.
+if [ "$NODES" = 1 ] && [ "$run_rc" != 0 ]; then
+ cx_fail_stage "$CX_FAILSAFE_MODE" "$runtime_log" || true
+fi
+collect_rc=0
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" || collect_rc=$?
+# Blame artifact collection only when the run succeeded but collection failed.
+[ "$run_rc" != 0 ] || [ "$collect_rc" = 0 ] || cx_set_failure_stage artifact-collection
+# A run failure takes precedence over a collection failure in the exit status.
+final_rc="$run_rc"
+[ "$final_rc" != 0 ] || final_rc="$collect_rc"
+# Best-effort removal of gpucore.* files left in the staged CollectiveX tree.
+rm -f "$MOUNT_SRC"/experimental/CollectiveX/gpucore.* 2>/dev/null || true
+cx_log "done - result artifacts collected"
+exit "$final_rc"
diff --git a/experimental/CollectiveX/launchers/launch_single-slurm.sh b/experimental/CollectiveX/launchers/launch_single-slurm.sh
new file mode 100644
index 0000000000..eade8fb751
--- /dev/null
+++ b/experimental/CollectiveX/launchers/launch_single-slurm.sh
@@ -0,0 +1,154 @@
+#!/usr/bin/env bash
+# CollectiveX shared standard NVIDIA Slurm launcher (one or two nodes).
+# shellcheck disable=SC2016,SC2034
+#
+# Flow: resolve and validate the runner and topology, verify the image, stage
+# the repository, obtain a Slurm allocation, import and hash the container
+# image, run the shard (a single srun on one node, the distributed runner on
+# two), then collect results. The exit status is the run status, or the
+# collection status when the run succeeded.
+#
+# ALLOC_EXTRA and SRUN_EXTRA are empty for some SKUs. They are expanded as
+# ${arr[@]+"${arr[@]}"} because a plain "${arr[@]}" on an empty array is an
+# unbound-variable error under `set -u` on Bash older than 4.4.
+set -euo pipefail
+
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CX_DIR="$(cd "$HERE/.." && pwd)"
+REPO_ROOT="$(cd "$CX_DIR/../.." && pwd)"
+# shellcheck source=../runtime/common.sh
+source "$HERE/../runtime/common.sh"
+
+# Runner precedence: CX_SHARD_SKU, then CX_PUBLIC_RUNNER. Each SKU fixes its
+# product, single-node topology label, default --time value, whether an
+# account is mandatory, and any extra salloc/srun arguments.
+RUNNER="${CX_SHARD_SKU:-${CX_PUBLIC_RUNNER:-}}"
+ALLOC_EXTRA=(); SRUN_EXTRA=(); LOCAL_IMPORT=0
+case "$RUNNER" in
+  h100-dgxc) PRODUCT=h100; TOPO=h100-nvlink-island; DEFAULT_TIME=45; REQUIRE_ACCOUNT=1 ;;
+  h200-dgxc)
+    PRODUCT=h200; TOPO=h200-nvlink-island; DEFAULT_TIME=45; REQUIRE_ACCOUNT=0
+    SRUN_EXTRA=(--container-remap-root)
+    ;;
+  b200-dgxc)
+    PRODUCT=b200; TOPO=b200-nvlink-island; DEFAULT_TIME=30; REQUIRE_ACCOUNT=1
+    ALLOC_EXTRA=(--mem=0)
+    ;;
+  b300)
+    PRODUCT=b300; TOPO=b300-nvlink-island; DEFAULT_TIME=45; REQUIRE_ACCOUNT=1
+    # Do not restore ALLOC_EXTRA=(-N 1 --mem=0); it blocks two-node B300 jobs.
+    ALLOC_EXTRA=(--mem=0)
+    SRUN_EXTRA=(--mpi=none --container-remap-root)
+    LOCAL_IMPORT=1
+    ;;
+  *) cx_die "set CX_SHARD_SKU or CX_PUBLIC_RUNNER to a registered NVIDIA SKU" ;;
+esac
+export CX_RUNNER="$RUNNER" CX_BENCH="${CX_BENCH:-deepep}"
+export CX_IMAGE_PLATFORM=linux/amd64
+# JOB_ID starts empty; it is checked for a value after cx_salloc_jobid below.
+JOB_ID=""
+cx_install_launcher_fail_safe
+cx_set_failure_stage setup
+cx_load_operator_config
+cx_lock_canonical_gha_env "$RUNNER"
+
+# Defaults: one node of eight GPUs in an eight-GPU scale-up domain; the world
+# size defaults to nodes x GPUs per node.
+NODES="${CX_NODES:-1}"; GPN="${CX_GPUS_PER_NODE:-8}"
+SCALE_UP_DOMAIN="${CX_SCALE_UP_DOMAIN:-8}"
+EXPECTED_WORLD=$((NODES * GPN))
+NGPUS="${CX_NGPUS:-$EXPECTED_WORLD}"
+TIME_MIN="${CX_TIME:-$DEFAULT_TIME}"
+IMAGE="${CX_IMAGE:-$(cx_default_image "$PRODUCT")}"
+TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
+# Reject anything other than 1 or 2 nodes of 8 GPUs, domain 8, and a world
+# size equal to nodes x GPUs per node.
+[ "$NODES" = 1 ] || [ "$NODES" = 2 ] \
+  || cx_die "$RUNNER supports one or two nodes"
+[ "$GPN" = 8 ] || cx_die "$RUNNER requires eight GPUs per node"
+[ "$SCALE_UP_DOMAIN" = 8 ] || cx_die "$RUNNER requires an eight-GPU scale-up domain"
+[ "$NGPUS" = "$EXPECTED_WORLD" ] \
+  || cx_die "$RUNNER world size must equal nodes x GPUs per node"
+case "$CX_BENCH" in
+  deepep|deepep-v2|deepep-hybrid|uccl|nccl-ep) ;;
+  *) cx_die "unsupported $RUNNER EP backend: $CX_BENCH" ;;
+esac
+
+export CX_RUNNER="$RUNNER" CX_NGPUS="$NGPUS" CX_NODES="$NODES"
+export CX_GPUS_PER_NODE="$GPN" CX_SCALE_UP_DOMAIN="$SCALE_UP_DOMAIN"
+export CX_TS="$TS" CX_SCALE_UP_TRANSPORT=nvlink
+# Two nodes are scale-out over RDMA; one node is scale-up over NVLink only, so
+# any inherited scale-out transport is cleared.
+if [ "$NODES" -gt 1 ]; then
+  export CX_SCOPE=scale-out CX_SCALE_OUT_TRANSPORT=rdma
+  export CX_TRANSPORT=nvlink-rdma CX_TOPO="${PRODUCT}-nvlink-rdma"
+else
+  export CX_SCOPE=scale-up CX_TRANSPORT=nvlink CX_TOPO="$TOPO"
+  unset CX_SCALE_OUT_TRANSPORT
+fi
+export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}" NCCL_CUMEM_ENABLE=1
+cx_apply_network_profile "$NODES" "$CX_TRANSPORT"
+cx_validate_shard_control "$CX_DIR"
+cx_require_vars CX_PARTITION CX_SQUASH_DIR
+[ "$REQUIRE_ACCOUNT" = 0 ] || cx_require_vars CX_ACCOUNT
+[ "$RUNNER" != b300 ] || cx_require_vars CX_STAGE_DIR
+
+cx_log "runner=$RUNNER nodes=$NODES x ${GPN}gpu world=$NGPUS bench=$CX_BENCH"
+# A dry run stops here, before registry verification, staging, or allocation.
+[ "${CX_DRYRUN:-0}" != 1 ] || { cx_log "CX_DRYRUN=1 - not allocating"; exit 0; }
+cx_set_failure_stage registry-verification
+cx_verify_registry_image "$IMAGE"
+SQUASH_FILE=""
+cx_set_failure_stage repository-stage
+MOUNT_SRC="$(cx_stage_path "$REPO_ROOT" "${CX_STAGE_DIR:-}")"
+cx_stage_repo "$REPO_ROOT" "$MOUNT_SRC"
+cx_prepare_runtime_marker "$MOUNT_SRC"
+# The staged repository is mounted at /ix inside the container.
+CONTAINER_MOUNTS="$MOUNT_SRC:/ix"
+# deepep-v2 and deepep-hybrid need the pinned backend source staged.
+if [ "$CX_BENCH" = deepep-v2 ] || [ "$CX_BENCH" = deepep-hybrid ]; then
+  cx_set_failure_stage backend-setup
+  cx_prepare_backend_source "$MOUNT_SRC" "$CX_BENCH" \
+    || cx_die "cannot stage the pinned backend source"
+  export CX_BACKEND_SOURCE_ROOT=/ix/experimental/CollectiveX/.cx_sources
+fi
+# deepep-v2 additionally mounts the prepared backend cache at /cx-cache.
+if [ "$CX_BENCH" = deepep-v2 ]; then
+  cx_prepare_backend_cache "$CX_SQUASH_DIR" \
+    || cx_die "cannot prepare the isolated backend cache"
+  CONTAINER_MOUNTS="$CONTAINER_MOUNTS,$CX_PREPARED_BACKEND_CACHE:/cx-cache"
+  export CX_BACKEND_CACHE_ROOT=/cx-cache
+fi
+
+cx_set_failure_stage scheduler-allocation
+command -v salloc >/dev/null || cx_die "salloc not found on this runner"
+# ALLOC_EXTRA is empty for h100-dgxc and h200-dgxc, hence the guarded expansion.
+allocation=(--partition="$CX_PARTITION" --nodes="$NODES" --gres=gpu:"$GPN" --exclusive
+  --time="$TIME_MIN" ${ALLOC_EXTRA[@]+"${ALLOC_EXTRA[@]}"})
+[ "$NODES" = 1 ] || allocation+=(--ntasks-per-node="$GPN")
+[ -z "${CX_ACCOUNT:-}" ] || allocation+=(--account="$CX_ACCOUNT")
+[ -z "${CX_EXCLUDE_NODES:-}" ] || allocation+=(--exclude="$CX_EXCLUDE_NODES")
+cx_salloc_jobid "${allocation[@]}"
+[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc"
+cx_set_failure_stage setup
+cx_validate_network_profile_on_job "$JOB_ID" "$NODES" "$CX_TRANSPORT"
+# LOCAL_IMPORT=1 calls cx_ensure_squash with CX_ENROOT_LOCAL_IMPORT=1 and no
+# job ID; otherwise the import helper is given the allocation's job ID. The
+# stage markers and the identity export are the same on both paths.
+cx_set_failure_stage container-import
+if [ "$LOCAL_IMPORT" = 1 ]; then
+  SQUASH_FILE="$(CX_ENROOT_LOCAL_IMPORT=1 cx_ensure_squash "$CX_SQUASH_DIR" "$IMAGE")"
+else
+  SQUASH_FILE="$(cx_ensure_squash_on_job "$JOB_ID" "$CX_SQUASH_DIR" "$IMAGE")"
+fi
+cx_set_failure_stage container-hash
+cx_export_squash_identity "$SQUASH_FILE"
+cx_preflight_allocation "$JOB_ID" "$NODES" "$MOUNT_SRC" "$SQUASH_FILE" \
+  "${CX_SHARD_FILE:-}"
+
+if [ "$NODES" = 1 ]; then
+  # Single node: run the in-container script directly and capture all output
+  # in the private runtime log; the exit status is recorded, not fatal.
+  # SRUN_EXTRA is empty for h100-dgxc and b200-dgxc, hence the guarded expansion.
+  run_rc=0
+  cx_set_failure_stage container-launch
+  runtime_log="$(cx_private_log_path runtime)"
+  srun --jobid="$JOB_ID" --container-image="$SQUASH_FILE" \
+    --container-mounts="$CONTAINER_MOUNTS" --no-container-mount-home \
+    --container-workdir=/ix/experimental/CollectiveX --no-container-entrypoint \
+    ${SRUN_EXTRA[@]+"${SRUN_EXTRA[@]}"} --export="$(cx_container_exports)" \
+    bash /ix/experimental/CollectiveX/runtime/run_in_container.sh \
+    >"$runtime_log" 2>&1 || run_rc=$?
+else
+  # The snippet exits 66 unless SLURM_NODEID is all digits, the env directory
+  # is a real (non-symlink) directory with mode 700 (optionally with a leading
+  # special-bits digit), and the per-node env file is a readable regular
+  # non-symlink file with mode 600 owned by the directory's owner. It then
+  # sources that file.
+  SOURCE_BACKEND_ENV='case "${SLURM_NODEID:-}" in ""|*[!0-9]*) exit 66;; esac; env_file="/ix/experimental/CollectiveX/.cx_backend/env/node-${SLURM_NODEID}.sh"; env_root="${env_file%/*}"; [ -d "$env_root" ] && [ ! -L "$env_root" ] || exit 66; case "$(stat -c "%a" "$env_root")" in 700|[1-7]700) ;; *) exit 66;; esac; [ -f "$env_file" ] && [ -r "$env_file" ] && [ ! -L "$env_file" ] && [ "$(stat -c "%u:%a" "$env_file")" = "$(stat -c "%u" "$env_root"):600" ] || exit 66; . "$env_file" || exit 66'
+  # The probe sources the backend env and then imports the module (or checks
+  # the attribute) that the selected backend needs.
+  BACKEND_PROBE="$SOURCE_BACKEND_ENV"'; case "$CX_BENCH" in deepep) python3 -c "from deep_ep import Buffer";; deepep-v2) python3 -c "import deep_ep; assert hasattr(deep_ep, '\''ElasticBuffer'\'')";; deepep-hybrid) python3 -c "import deep_ep; assert hasattr(deep_ep, '\''HybridEPBuffer'\'')";; uccl) python3 -c "import torch; from uccl_deepep import Buffer";; nccl-ep) python3 -c "import torch";; esac'
+  WRAP="${SOURCE_BACKEND_ENV}"$'\n'"$(cx_slurm_rank_wrapper)"
+  CX_DISTRIBUTED_CONTAINER_ARGS=(--container-writable ${SRUN_EXTRA[@]+"${SRUN_EXTRA[@]}"})
+  run_rc=0
+  cx_set_failure_stage container-launch
+  cx_run_distributed_shard || run_rc=$?
+fi
+
+cx_adopt_runtime_stage "$MOUNT_SRC"
+# A failed single-node run is reported with its runtime log; "|| true" keeps
+# the script going so results are still collected.
+if [ "$NODES" = 1 ] && [ "$run_rc" != 0 ]; then
+  cx_fail_stage "$CX_FAILSAFE_MODE" "$runtime_log" || true
+fi
+collect_rc=0
+cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" || collect_rc=$?
+# Blame artifact collection only when the run succeeded but collection failed.
+[ "$run_rc" != 0 ] || [ "$collect_rc" = 0 ] || cx_set_failure_stage artifact-collection
+# A run failure takes precedence over a collection failure in the exit status.
+final_rc="$run_rc"
+[ "$final_rc" != 0 ] || final_rc="$collect_rc"
+cx_log "done - result artifacts collected"
+exit "$final_rc"
diff --git a/experimental/CollectiveX/publisher.py b/experimental/CollectiveX/publisher.py
new file mode 100644
index 0000000000..e04ba1f514
--- /dev/null
+++ b/experimental/CollectiveX/publisher.py
@@ -0,0 +1,3284 @@
+#!/usr/bin/env python3
+"""Fail-closed filesystem publisher for CollectiveX EP v1 artifacts."""
+from __future__ import annotations
+
+import argparse
+import contextlib
+import datetime as dt
+import fcntl
+from functools import lru_cache
+import hashlib
+import json
+import math
+import os
+from pathlib import Path, PurePosixPath
+import re
+import shutil
+import stat
+import statistics
+import sys
+import tempfile
+from typing import Any, Iterator, Sequence
+import zipfile
+
+import jsonschema
+
+HERE = Path(__file__).resolve().parent
+sys.path.insert(0, str(HERE))
+
+import artifact_safety # noqa: E402
+import capability # noqa: E402
+import contracts # noqa: E402
+import identity # noqa: E402
+import sweep_matrix # noqa: E402
+
+# Format and policy tags; the values name the private bundle, public dataset,
+# and channel formats.
+FORMAT_BUNDLE = "collectivex.private.bundle.v1"
+FORMAT_PUBLIC = "collectivex.public.v1"
+FORMAT_CHANNEL = "collectivex.channel.v1"
+POLICY = "collectivex-decision-grade-v1"
+PUBLISHER_POLICY = "collectivex-publisher-v1"
+OUTCOMES = ("success", "unsupported", "failed", "invalid", "diagnostic")
+# Minimum number of allocation IDs _eligibility requires for decision grade.
+REQUIRED_ALLOCATIONS = 3
+REQUIRED_COHORT_KINDS = ("library", "chip", "system", "routing")
+REQUIRED_PROMOTION_COHORT_COUNTS = {"library": 76, "system": 12, "routing": 116}
+# Pinned SHA-256 digests for the full V1 matrix and its case catalog.
+CANONICAL_FULL_V1_MATRIX_SHA256 = (
+ "f1ca85f9689922b90edd5767b9ff2a902f6b896f32f68b2ca086dde3fd2157d0"
+)
+CANONICAL_FULL_V1_CASE_CATALOG_SHA256 = (
+ "8e262178f770b0cdde12b7ec71604afd87251fa55685d4594f29717153ad6bbd"
+)
+# Upper bounds _eligibility applies to the p50 and p99 max/min ratios.
+P50_STABILITY_LIMIT = 1.10
+P99_STABILITY_LIMIT = 1.25
+# Archive limits: member count, 2 GiB per member, 16 GiB in total.
+# NOTE(review): the code enforcing these is outside this view - confirm usage.
+MAX_ARCHIVE_MEMBERS = 20_000
+MAX_ARCHIVE_MEMBER_BYTES = 2 * 1024**3
+MAX_ARCHIVE_TOTAL_BYTES = 16 * 1024**3
+# 32 MiB; validate_public_dataset rejects a document whose canonical encoding
+# plus one byte exceeds this.
+MAX_PUBLIC_DATASET_BYTES = 32 * 1024**2
+HEX64 = re.compile(r"[0-9a-f]{64}")
+SAFE_ID = re.compile(r"[a-z0-9][a-z0-9_.-]{0,127}")
+REASON = re.compile(r"[a-z0-9][a-z0-9.-]{0,95}")
+ARTIFACT_NAME = re.compile(
+ r"cx(?:unsupported|shard-[a-z0-9][a-z0-9_.-]{0,127})-[1-9][0-9]*-[1-9][0-9]*"
+)
+# Fields _coverage_topology projects from a case ("ep_size" may fall back to "ep").
+COVERAGE_TOPOLOGY_FIELDS = (
+ "ep_size", "nodes", "gpus_per_node", "scale_up_domain", "scope",
+ "scale_up_transport", "scale_out_transport", "transport", "topology_class",
+)
+# validate_channel requires the captured digest to equal the dataset's sha256.
+CHANNEL_PATH = re.compile(r"datasets/([0-9a-f]{64})/dataset\.json")
+SCHEMA_DIR = HERE / "schemas"
+# Validators keyed by schema file name; filled on first use by _schema.
+_SCHEMAS: dict[str, jsonschema.protocols.Validator] = {}
+
+
+class PublisherError(ValueError):
+    """Raised when an input or stored state breaks the publication contract."""
+
+
+# Local aliases for the contracts helpers used below: _schema loads schema
+# files with strict_load, and _canonical supplies canonical JSON bytes.
+strict_load = contracts.strict_load
+_canonical = contracts.canonical_json_bytes
+
+
+def _sha_bytes(data: bytes) -> str:
+    """Return the lowercase hexadecimal SHA-256 digest of ``data``."""
+    hasher = hashlib.sha256()
+    hasher.update(data)
+    return hasher.hexdigest()
+
+
+def _sha_file(path: Path) -> str:
+    """Return the hexadecimal SHA-256 digest of the file at ``path``.
+
+    The file is read in 1 MiB chunks rather than loaded whole.
+    """
+    hasher = hashlib.sha256()
+    with path.open("rb") as stream:
+        while True:
+            block = stream.read(1024 * 1024)
+            if not block:
+                break
+            hasher.update(block)
+    return hasher.hexdigest()
+
+
+def _latest_timestamp(values: Sequence[str]) -> str:
+    """Pick the most recent evidence timestamp without consulting the wall clock.
+
+    Each value is parsed as ISO-8601 (a ``Z`` is read as ``+00:00``) and
+    compared in UTC; equal instants are ordered by the raw string. The winning
+    input string is returned unchanged.
+
+    Raises:
+        PublisherError: If ``values`` is empty, a value is not ISO-8601, or a
+            value carries no timezone.
+    """
+    if not values:
+        raise PublisherError("cannot derive a timestamp without evidence")
+
+    def sort_key(raw: str) -> tuple[dt.datetime, str]:
+        try:
+            moment = dt.datetime.fromisoformat(raw.replace("Z", "+00:00"))
+        except ValueError as exc:
+            raise PublisherError("evidence timestamp is not ISO-8601") from exc
+        if moment.tzinfo is None:
+            raise PublisherError("evidence timestamp must include a timezone")
+        return moment.astimezone(dt.timezone.utc), raw
+
+    return max(values, key=sort_key)
+
+
+def _schema(name: str, value: Any) -> None:
+    """Validate ``value`` against the named JSON schema, reporting one error.
+
+    The Draft 2020-12 validator for each schema file under ``SCHEMA_DIR`` is
+    built on first use and kept in ``_SCHEMAS``. Errors are ordered by their
+    absolute path and only the first is reported; ``$`` stands for the
+    document root.
+
+    Raises:
+        PublisherError: If ``value`` violates the schema.
+    """
+    validator = _SCHEMAS.get(name)
+    if validator is None:
+        definition = strict_load(SCHEMA_DIR / name)
+        jsonschema.Draft202012Validator.check_schema(definition)
+        validator = _SCHEMAS[name] = jsonschema.Draft202012Validator(
+            definition, format_checker=jsonschema.FormatChecker()
+        )
+    failures = sorted(
+        validator.iter_errors(value),
+        key=lambda failure: list(failure.absolute_path),
+    )
+    if not failures:
+        return
+    first = failures[0]
+    location = ".".join(str(part) for part in first.absolute_path) or "$"
+    raise PublisherError(f"{name}:{location}: {first.message}")
+def _exact(obj: Any, fields: set[str], path: str) -> dict[str, Any]:
+    """Return ``obj`` once it is known to be a dict with exactly ``fields`` as keys.
+
+    Raises:
+        PublisherError: If ``obj`` is not a dict, or its key set differs from
+            ``fields``; the message lists the missing and extra keys, sorted.
+    """
+    if not isinstance(obj, dict):
+        raise PublisherError(f"{path} must be an object")
+    present = set(obj)
+    if present == fields:
+        return obj
+    missing = sorted(fields - present)
+    extra = sorted(present - fields)
+    raise PublisherError(f"{path} fields differ: missing={missing}, extra={extra}")
+def _array(value: Any, path: str, *, nonempty: bool = False) -> list[Any]:
+    """Return ``value`` once it is known to be a list (non-empty if required).
+
+    Raises:
+        PublisherError: If ``value`` is not a list, or is empty while
+            ``nonempty`` is true.
+    """
+    if isinstance(value, list) and (value or not nonempty):
+        return value
+    qualifier = "a nonempty" if nonempty else "an"
+    raise PublisherError(f"{path} must be {qualifier} array")
+
+
+def _integer(value: Any, path: str, *, minimum: int = 0) -> int:
+    """Return ``value`` once it is known to be an ``int`` of at least ``minimum``.
+
+    The check is on the exact type, so ``bool`` and other ``int`` subclasses
+    are rejected.
+
+    Raises:
+        PublisherError: If ``value`` is not exactly ``int`` or is below
+            ``minimum``.
+    """
+    if type(value) is int and value >= minimum:
+        return value
+    raise PublisherError(f"{path} must be an integer >= {minimum}")
+
+
+def _unique(values: Sequence[Any], path: str) -> None:
+    """Fail if two entries of ``values`` share the same canonical encoding.
+
+    Raises:
+        PublisherError: If any canonical encoding occurs more than once.
+    """
+    distinct = {_canonical(entry) for entry in values}
+    if len(distinct) != len(values):
+        raise PublisherError(f"{path} contains duplicates")
+
+def _eligibility(value: dict[str, Any], path: str) -> dict[str, Any]:
+    """Check that an eligibility record's verdict agrees with its own gates.
+
+    ``decision_grade`` must be true exactly when every gate passes and no
+    reasons are listed: enough allocation IDs, the completeness, correctness,
+    and stability flags, and both max/min ratios present and within their
+    limits. A record that is not decision grade must list at least one reason.
+
+    Returns:
+        ``value`` unchanged.
+
+    Raises:
+        PublisherError: If the verdict or the reasons disagree with the gates.
+    """
+    allocation_ids = value["allocation_ids"]
+    p50_ratio = value["p50_max_min_ratio"]
+    p99_ratio = value["p99_max_min_ratio"]
+    gates_pass = all([
+        len(allocation_ids) >= REQUIRED_ALLOCATIONS,
+        value["complete"],
+        value["correct"],
+        value["measured_roundtrip_p99"],
+        value["stable_p50"],
+        value["stable_p99"],
+        value["stable_ordering"],
+        p50_ratio is not None and p50_ratio <= P50_STABILITY_LIMIT,
+        p99_ratio is not None and p99_ratio <= P99_STABILITY_LIMIT,
+    ])
+    verdict = value["decision_grade"]
+    if verdict != (gates_pass and not value["reasons"]):
+        raise PublisherError(f"{path}.decision_grade does not match promotion gates")
+    if verdict == bool(value["reasons"]):
+        raise PublisherError(f"{path}.reasons does not match decision status")
+    return value
+
+
+def validate_channel(doc: Any, *, expected_channel: str | None = None) -> dict[str, Any]:
+    """Validate a channel document and return it unchanged.
+
+    The document must satisfy ``channel-v1.schema.json``, carry the expected
+    channel name when one is given, and point at a dataset path of the form
+    ``datasets/<sha256>/dataset.json`` whose digest equals ``dataset.sha256``.
+
+    Raises:
+        PublisherError: If any of those checks fails.
+    """
+    _schema("channel-v1.schema.json", doc)
+    if expected_channel and doc["channel"] != expected_channel:
+        raise PublisherError("channel name does not match its file")
+    dataset = doc["dataset"]
+    location = dataset["path"]
+    digest_match = CHANNEL_PATH.fullmatch(location) if isinstance(location, str) else None
+    if digest_match is None or digest_match.group(1) != dataset["sha256"]:
+        raise PublisherError("channel dataset path and sha256 do not agree")
+    return doc
+
+
+def _metric_value(series: dict[str, Any], metric: dict[str, Any]) -> tuple[str, float, str]:
+    """Look up the roundtrip reading a decision metric refers to.
+
+    The first point whose ``tokens_per_rank`` matches the metric is used, and
+    the series phase must equal the metric phase.
+
+    Returns:
+        ``(point_id, value, unit)``. The unit is ``"us"`` for the
+        ``latency_us`` measure and ``"GB/s"`` for any other measure, which is
+        read from the logical payload rates.
+
+    Raises:
+        PublisherError: If no point matches, the phases differ, or the logical
+            payload rates are ``None`` for a non-latency measure.
+    """
+    selected = None
+    for candidate in series["points"]:
+        if candidate["tokens_per_rank"] == metric["tokens_per_rank"]:
+            selected = candidate
+            break
+    if selected is None or series["phase"] != metric["phase"]:
+        raise PublisherError("decision metric references an unavailable point")
+    roundtrip = selected["components"]["roundtrip"]
+    if metric["measure"] == "latency_us":
+        reading = roundtrip["latency_us"][metric["statistic"]]
+        return selected["point_id"], reading, "us"
+    rates = roundtrip["logical_payload_rate_gbps_at_latency_percentile"]
+    if rates is None:
+        raise PublisherError("logical bandwidth decision has no logical byte contract")
+    reading = rates[metric["statistic"]]
+    return selected["point_id"], reading, "GB/s"
+
+
+def _validate_metric(metric: dict[str, Any]) -> None:
+    """Require the objective that matches the metric's measure.
+
+    ``latency_us`` must be minimised; every other measure must be maximised.
+
+    Raises:
+        PublisherError: If ``metric["objective"]`` is not the required one.
+    """
+    measure = metric["measure"]
+    required = "max"
+    if measure == "latency_us":
+        required = "min"
+    if metric["objective"] == required:
+        return
+    raise PublisherError(f"{metric['measure']} objective must be {required}")
+
+
+def _metric_label(measure: str, statistic: str) -> str:
+    """Return the display label for a measure at a given statistic."""
+    if measure == "latency_us":
+        return f"{statistic} latency"
+    return f"payload rate at {statistic} latency"
+
+
+def _routing_build_control(build: dict[str, Any]) -> dict[str, Any]:
+    """Project the four build fields that routing cohorts hold constant."""
+    controlled = (
+        "routing_control_sha256", "image_digest", "source_sha", "squash_sha256",
+    )
+    projection: dict[str, Any] = {}
+    for field in controlled:
+        projection[field] = build[field]
+    return projection
+
+
+def _routing_implementation_mismatch(members: Sequence[dict[str, Any]]) -> bool:
+    """Report whether the members without EPLB disagree on their implementation.
+
+    Only members whose ``workload["eplb"]`` is falsy are considered; the result
+    is true when they carry more than one distinct
+    ``implementation_contract_sha256``.
+    """
+    hashes_without_eplb = set()
+    for member in members:
+        if member["workload"]["eplb"]:
+            continue
+        hashes_without_eplb.add(member["build"]["implementation_contract_sha256"])
+    return len(hashes_without_eplb) > 1
+
+
+def _public_case_factors(series: dict[str, Any]) -> dict[str, Any]:
+    """Rebuild the case, profile, and SKU factors of a published series.
+
+    The ``case`` mapping is assembled from the series' backend, workload,
+    system, and measurement sections. ``ladder`` is the space-separated list of
+    the points' ``tokens_per_rank`` and ``timing`` is ``iters:trials:warmups``.
+    The profile comes from ``identity.profile_for_case``.
+    """
+    load = series["workload"]
+    fabric = series["system"]
+    timing_spec = series["measurement"]
+    ep_size = fabric["ep_size"]
+    case = {
+        "backend": series["backend"]["id"],
+        "canonical": True,
+        "eplb": load["eplb"],
+        "ep": ep_size,
+        "experts": load["experts"],
+        "gpus_per_node": fabric["gpus_per_node"],
+        "hidden": load["hidden"],
+        "ladder": " ".join(map(str, (p["tokens_per_rank"] for p in series["points"]))),
+        "mode": series["mode"],
+        "nodes": fabric["nodes"],
+        "phase": series["phase"],
+        "required_publication": series["publication_tier"],
+        "routing": load["routing"],
+        "samples_per_point": timing_spec["samples_per_component"],
+        "scale_out_transport": fabric["scale_out_transport"],
+        "scale_up_domain": fabric["scale_up_domain"],
+        "scale_up_transport": fabric["scale_up_transport"],
+        "scope": fabric["scope"],
+        "suite": series["suite"],
+        "timing": f"{timing_spec['iters']}:{timing_spec['trials']}:{timing_spec['warmups']}",
+        "topk": load["top_k"],
+        "topology_class": fabric["topology_class"],
+        "transport": fabric["transport"],
+        "warmup_semantics": sweep_matrix.ep_harness.WARMUP_SEMANTICS,
+        "workload": series["model"],
+    }
+    profile = identity.profile_for_case(case)
+    return {"case": case, "profile": profile, "sku": fabric["sku"]}
+
+
+def _coverage_topology(case: dict[str, Any]) -> dict[str, Any]:
+    """Return only the fabric-placement fields of ``case``.
+
+    ``ep_size`` comes first and falls back to the ``ep`` key (or ``None``) when
+    absent; the remaining ``COVERAGE_TOPOLOGY_FIELDS`` are copied in order and
+    must be present.
+    """
+    topology: dict[str, Any] = {"ep_size": case.get("ep_size", case.get("ep"))}
+    for field in COVERAGE_TOPOLOGY_FIELDS:
+        if field == "ep_size":
+            continue
+        topology[field] = case[field]
+    return topology
+
+
+def _coverage_coordinates(case: dict[str, Any]) -> dict[str, Any]:
+    """Return the SKU, backend, mode, phase, and topology that locate a case."""
+    coordinates: dict[str, Any] = {
+        name: case[name] for name in ("sku", "backend", "mode", "phase")
+    }
+    coordinates["topology"] = _coverage_topology(case)
+    return coordinates
+
+
+@lru_cache(maxsize=1)
+def _canonical_coverage_cases() -> dict[str, dict[str, Any]]:
+    """Map every requested case ID of the resolved matrix to its coordinates.
+
+    The matrix is resolved for all suites and backends with ``max_cases=128``.
+    The result is cached, so every caller receives the same dict object.
+    """
+    resolved = sweep_matrix.resolve_matrix(suites="all", max_cases=128, backends="all")
+    catalog: dict[str, dict[str, Any]] = {}
+    for entry in resolved["requested_cases"]:
+        identifier = entry["case"]["case_id"]
+        catalog[identifier] = _coverage_coordinates({"sku": entry["sku"], **entry["case"]})
+    return catalog
+
+
+def _public_series_config(series: dict[str, Any]) -> dict[str, Any]:
+    """Return the backend generation/version, resource, and system label of a series."""
+    backend = series["backend"]
+    backend_view = {name: backend[name] for name in ("generation", "version")}
+    resource = series["resource"]
+    system_view = {"label": series["system"]["label"]}
+    return {"backend": backend_view, "resource": resource, "system": system_view}
+
+
+def _public_cohort_factors(kind: str, item: dict[str, Any]) -> tuple[Any, Any]:
+    """Split a series into the factors a cohort controls and the one it varies.
+
+    Returns:
+        ``(controlled, variant)``. ``controlled`` always carries the model,
+        mode, phase, workload shape, measurement, and EP size, plus
+        kind-specific fields. ``variant`` is the backend ID for ``library``,
+        the system mapping for ``chip``, ``[sku, backend id, resource
+        profile]`` for ``system``, and ``[routing, eplb, implementation
+        contract hash]`` for ``routing``.
+
+    Raises:
+        PublisherError: If ``kind`` is not one of the four cohort kinds.
+    """
+    workload = item["workload"]
+    build = item["build"]
+    shape_fields = (
+        "hidden", "top_k", "experts", "dispatch_dtype", "combine_dtype",
+        "activation_profile",
+    )
+    shape = {field: workload[field] for field in shape_fields}
+    shared = {
+        "model": item["model"],
+        "mode": item["mode"],
+        "phase": item["phase"],
+        "shape": shape,
+        "measurement": item["measurement"],
+        "ep_size": item["system"]["ep_size"],
+    }
+    if kind == "library":
+        controlled = {
+            **shared,
+            "system": item["system"],
+            "workload": workload,
+            "resource_mode": item["resource"]["mode"],
+            "source": build["source_sha"],
+        }
+        return controlled, item["backend"]["id"]
+    if kind == "chip":
+        controlled = {
+            **shared,
+            "backend": item["backend"],
+            "workload": workload,
+            "resource_mode": item["resource"]["mode"],
+            "source": build["source_sha"],
+        }
+        return controlled, item["system"]
+    if kind == "system":
+        controlled = {**shared, "workload": workload, "source": build["source_sha"]}
+        variant = [
+            item["system"]["sku"], item["backend"]["id"], item["resource"]["profile"],
+        ]
+        return controlled, variant
+    if kind == "routing":
+        controlled = {
+            **shared,
+            "backend": item["backend"],
+            "system": item["system"],
+            "resource": item["resource"],
+            "build": _routing_build_control(build),
+        }
+        variant = [
+            workload["routing"], workload["eplb"],
+            build["implementation_contract_sha256"],
+        ]
+        return controlled, variant
+    raise PublisherError(f"unknown cohort kind {kind}")
+
+
+def _case_disposition_catalog_sha256(coverage: Sequence[dict[str, Any]]) -> str:
+    """Hash the (case_id, disposition) catalog of ``coverage`` in case-ID order."""
+    ordered = sorted(coverage, key=lambda entry: entry["case_id"])
+    pairs = [
+        {"case_id": entry["case_id"], "disposition": entry["disposition"]}
+        for entry in ordered
+    ]
+    encoded = _canonical(pairs)
+    return _sha_bytes(encoded)
+
+
+def validate_public_dataset(doc: Any) -> dict[str, Any]:
+ """Validate a public dataset document and return it unchanged.
+
+ Runs the public-dataset-v1 schema check, the serving size limit and the
+ artifact safety check, then cross-checks promotion, coverage, attempts,
+ series, cohorts, rankings, recommendations and sensitivities against each
+ other and against the capability and sweep-matrix registries.
+
+ Raises PublisherError on the first failed check performed here; helper
+ calls may raise their own errors.
+ """
+ # Schema check comes first; the key lookups below rely on the document shape
+ # (assumes _schema rejects non-conforming documents -- confirm).
+ _schema("public-dataset-v1.schema.json", doc)
+ # Size is measured on the canonical encoding; the +1 presumably accounts for
+ # a trailing newline -- confirm against the writer.
+ if len(_canonical(doc)) + 1 > MAX_PUBLIC_DATASET_BYTES:
+ raise PublisherError("public dataset exceeds the serving size limit")
+ # Safety failures are re-raised as PublisherError with the same message.
+ try:
+ artifact_safety.assert_publication_safe([doc])
+ except artifact_safety.ArtifactSafetyError as exc:
+ raise PublisherError(str(exc)) from exc
+ # Every catalog must already be sorted by its identity key.
+ if doc["source_bundle_ids"] != sorted(doc["source_bundle_ids"]):
+ raise PublisherError("source bundle IDs are not canonical")
+ for field, key in (
+ ("coverage", "case_id"), ("attempts", "attempt_id"),
+ ("series", "series_id"), ("cohorts", "cohort_id"),
+ ("rankings", "ranking_id"), ("recommendations", "recommendation_id"),
+ ("sensitivities", "sensitivity_id"),
+ ):
+ if doc[field] != sorted(doc[field], key=lambda item: item[key]):
+ raise PublisherError(f"{field} are not in canonical identity order")
+ # Quarantined promotions carry a reason and a null matrix ID; every other
+ # status carries a null reason and a non-null matrix ID.
+ promotion = doc["promotion"]
+ quarantined = promotion["status"] == "quarantined"
+ if quarantined != (promotion["reason"] is not None) or quarantined != (
+ promotion["matrix_id"] is None
+ ):
+ raise PublisherError("promotion reason/matrix identity differs from status")
+ # Index attempts and series by ID; a shorter index than the list means a
+ # duplicate ID. Evidence IDs are handed to _unique (helper not shown here).
+ attempts = {item["attempt_id"]: item for item in doc["attempts"]}
+ if len(attempts) != len(doc["attempts"]):
+ raise PublisherError("dataset has duplicate attempt IDs")
+ evidence = [
+ value["evidence_id"] for item in doc["attempts"] for value in item["evidence"]
+ ]
+ _unique(evidence, "dataset attempt evidence")
+ series = {item["series_id"]: item for item in doc["series"]}
+ if len(series) != len(doc["series"]):
+ raise PublisherError("dataset has duplicate series IDs")
+ allocation_ids = set(promotion["allocation_ids"])
+ case_ids = {item["case_id"] for item in doc["coverage"]}
+ if len(case_ids) != len(doc["coverage"]):
+ raise PublisherError("dataset has duplicate case coverage")
+ coverage_by_case = {item["case_id"]: item for item in doc["coverage"]}
+ series_case_ids = {
+ case_id for item in doc["series"] for case_id in item["case_ids"]
+ }
+ # Coverage rows: the topology must equal the capability registry entry for
+ # (sku, ep_size), and the coordinates must equal the canonical matrix entry
+ # when the case ID is known there. A case ID missing from the canonical
+ # matrix is accepted only when some series references it.
+ canonical_cases = _canonical_coverage_cases()
+ for item in doc["coverage"]:
+ topology = item["topology"]
+ registered = capability.topology_for(item["sku"], topology["ep_size"])
+ if (
+ item["sku"] not in capability.PLATFORMS
+ or item["backend"] not in capability.BACKENDS
+ or registered is None
+ or any(
+ topology[field] != registered[field]
+ for field in COVERAGE_TOPOLOGY_FIELDS if field != "ep_size"
+ )
+ ):
+ raise PublisherError("coverage topology differs from the capability registry")
+ coordinates = {
+ "sku": item["sku"], "backend": item["backend"],
+ "mode": item["mode"], "phase": item["phase"], "topology": topology,
+ }
+ canonical = canonical_cases.get(item["case_id"])
+ if canonical is not None and coordinates != canonical:
+ raise PublisherError("coverage coordinates differ from its case identity")
+ if canonical is None and item["case_id"] not in series_case_ids:
+ raise PublisherError("coverage case identity is outside the v1 catalog")
+ # Per-attempt referential integrity and outcome/reason/series invariants.
+ for item in doc["attempts"]:
+ if item["case_id"] not in case_ids or item["allocation_id"] not in allocation_ids:
+ raise PublisherError("attempt references undeclared coverage or allocation")
+ if item["series_id"] is not None and item["series_id"] not in series:
+ raise PublisherError("attempt references unknown series")
+ if (item["outcome"] == "success") != (item["reason"] is None):
+ raise PublisherError("attempt reason must be null exactly for success")
+ if item["outcome"] == "success" and item["failure_mode"] is not None:
+ raise PublisherError("successful attempt cannot have a failure mode")
+ if (item["outcome"] == "success" and item["selected"]) != (
+ item["series_id"] is not None
+ ):
+ raise PublisherError("attempt series must be present exactly for selected success")
+ if {item["allocation_id"] for item in doc["attempts"]} != allocation_ids:
+ raise PublisherError("promotion allocation catalog differs from attempts")
+ # Retries are grouped per (case, allocation): indexes must be exactly 1..n,
+ # each attempt ID must be re-derivable from case/allocation/index, and only
+ # the highest-index retry may be selected.
+ attempt_groups: dict[tuple[str, str], list[dict[str, Any]]] = {}
+ for item in doc["attempts"]:
+ attempt_groups.setdefault((item["case_id"], item["allocation_id"]), []).append(item)
+ for (case_id, allocation_id), group in attempt_groups.items():
+ ordinals = sorted(item["attempt_index"] for item in group)
+ if ordinals != list(range(1, len(group) + 1)):
+ raise PublisherError("public retries must retain contiguous attempt indexes")
+ if any(
+ item["attempt_id"] != identity.attempt_id(
+ allocation=allocation_id, case=case_id, ordinal=item["attempt_index"]
+ )
+ for item in group
+ ):
+ raise PublisherError("public retry identity differs from its case/allocation/index")
+ selected = [item for item in group if item["selected"]]
+ if len(selected) != 1 or selected[0]["attempt_index"] != ordinals[-1]:
+ raise PublisherError("publisher must select the latest retry per case/allocation")
+ # Selected successful attempts, grouped by the series they feed.
+ selected_by_series: dict[str, list[dict[str, Any]]] = {}
+ for item in doc["attempts"]:
+ if item["selected"] and item["outcome"] == "success":
+ selected_by_series.setdefault(item["series_id"], []).append(item)
+ # Coverage rows must list exactly the attempts of their own case; `terminal`
+ # counts the rows that have a selected attempt.
+ terminal = 0
+ for item in doc["coverage"]:
+ listed = set(item["attempt_ids"])
+ selected = item["selected_attempt_id"]
+ expected_attempts = {
+ attempt_id for attempt_id, attempt in attempts.items()
+ if attempt["case_id"] == item["case_id"]
+ }
+ if listed != expected_attempts:
+ raise PublisherError("coverage references attempts from another case")
+ if selected is not None:
+ terminal += 1
+ if (selected not in listed or not attempts[selected]["selected"]
+ or any(attempts[selected][field] != item[field]
+ for field in ("outcome", "failure_mode", "reason"))):
+ raise PublisherError("coverage selected outcome differs")
+ # The row's selected attempt must be the maximum, among the case's
+ # selected attempts, by (run_id as int, run_attempt, attempt_index,
+ # attempt_id).
+ selected_candidates = [attempts[value] for value in listed if attempts[value]["selected"]]
+ latest = max(
+ selected_candidates,
+ key=lambda attempt: (
+ int(attempt["run_id"]), attempt["run_attempt"],
+ attempt["attempt_index"], attempt["attempt_id"]
+ ),
+ )
+ if selected != latest["attempt_id"]:
+ raise PublisherError("coverage does not select the latest canonical allocation")
+ if promotion["requested_cases"] != len(doc["coverage"]) or promotion["terminal_cases"] != terminal:
+ raise PublisherError("promotion coverage counts differ")
+ # Evidence IDs of selected attempts keyed by (series_id, point_id); compared
+ # with each point's evidence_ids in the series loop below.
+ selected_evidence: dict[tuple[str, str], set[str]] = {}
+ for attempt in doc["attempts"]:
+ if attempt["selected"] and attempt["series_id"] is not None:
+ for value in attempt["evidence"]:
+ selected_evidence.setdefault(
+ (attempt["series_id"], value["point_id"]), set()
+ ).add(value["evidence_id"])
+ # Series validation: workload/suite profile, backend and system projection,
+ # commitments, EPLB descriptor, eligibility, identity, then every point.
+ for item in doc["series"]:
+ eligibility = _eligibility(item["eligibility"], f"series {item['series_id']}")
+ workload = item["workload"]
+ model, hidden, top_k, experts = sweep_matrix.V1_WORKLOAD
+ suite_contract = sweep_matrix.V1_SUITE_CONTRACTS.get(item["suite"])
+ coordinate = (
+ item["mode"], item["phase"], workload["routing"], workload["eplb"]
+ )
+ profile = identity.case_profile(item["mode"])
+ if (
+ item["model"] != model
+ or (workload["hidden"], workload["top_k"], workload["experts"])
+ != (hidden, top_k, experts)
+ or suite_contract is None
+ or coordinate not in suite_contract["coordinates"]
+ or (
+ suite_contract.get("backends") is not None
+ and item["backend"]["id"] not in suite_contract["backends"]
+ )
+ or item["publication_tier"] != suite_contract["publication"]
+ or item["measurement"]["contract"] != profile["contract"]
+ or item["measurement"]["component_order_contract"]
+ != profile["component_order_contract"]
+ or item["measurement"]["combine_semantics"] != profile["combine_semantics"]
+ or item["measurement"]["payload_unit"] != profile["payload_unit"]
+ ):
+ raise PublisherError("series differs from the frozen v1 workload/suite profile")
+ # Only the "nccl-ep" backend has the "reference" role; all others are "library".
+ backend_id = item["backend"]["id"]
+ expected_role = "reference" if backend_id == "nccl-ep" else "library"
+ if (
+ backend_id not in capability.BACKENDS
+ or item["backend"]["label"] != BACKEND_LABELS[backend_id]
+ or item["backend"]["role"] != expected_role
+ or item["backend"]["version"] is None
+ ):
+ raise PublisherError("series backend projection differs from v1")
+ sku = item["system"]["sku"]
+ platform = capability.PLATFORMS.get(sku)
+ ep_size = item["system"]["ep_size"]
+ registered_topology = capability.topology_for(sku, ep_size)
+ if platform is None or registered_topology is None:
+ raise PublisherError("series system projection differs from v1")
+ supported, _ = capability.resolve(
+ sku, backend_id, ep=ep_size, nodes=item["system"]["nodes"],
+ routing=workload["routing"], eplb=workload["eplb"],
+ mode=item["mode"],
+ )
+ # The system must match the registered topology field by field, its
+ # world_size must equal ep_size, and the platform product must appear as a
+ # letters-then-digits token in the lowercased system label.
+ if (
+ not supported
+ or item["system"]["vendor"] != platform["vendor"]
+ or any(
+ item["system"][field] != registered_topology[field]
+ for field in (
+ "nodes", "gpus_per_node", "scale_up_domain", "scope",
+ "scale_up_transport", "scale_out_transport", "transport",
+ "topology_class",
+ )
+ )
+ or item["system"]["world_size"] != ep_size
+ or platform["product"] not in set(
+ re.findall(r"[a-z]+\d+[a-z]*", item["system"]["label"].lower())
+ )
+ ):
+ raise PublisherError("series system projection differs from v1")
+ # The committed public_config_sha256 must be reproducible from the projection.
+ if contracts.public_series_config_sha256(_public_series_config(item)) != item[
+ "build"
+ ]["public_config_sha256"]:
+ raise PublisherError("public series configuration differs from its commitment")
+ # Every case the series names must exist in coverage with the same coordinates.
+ covered = [coverage_by_case.get(case_id) for case_id in item["case_ids"]]
+ if not covered or any(
+ case is None
+ or {
+ "sku": case["sku"], "backend": case["backend"],
+ "mode": case["mode"], "phase": case["phase"],
+ "topology": case["topology"],
+ }
+ != {
+ "sku": sku, "backend": backend_id,
+ "mode": item["mode"], "phase": item["phase"],
+ "topology": _coverage_topology(item["system"]),
+ }
+ for case in covered
+ ):
+ raise PublisherError("series projection differs from its case coverage")
+ # EPLB descriptor: must agree with the workload, then either match the
+ # recomputed plan (enabled) or claim no plan at all (disabled).
+ if (
+ item["eplb"]["enabled"] != item["workload"]["eplb"]
+ or item["eplb"]["logical_experts"] != item["workload"]["experts"]
+ ):
+ raise PublisherError("series EPLB descriptor differs from its workload")
+ eplb = item["eplb"]
+ expected_physical = eplb["logical_experts"] + eplb["redundant_experts"]
+ nullable_eplb = (
+ "planner", "mapping_sha256", "reference_tokens_per_rank", "max_replicas",
+ "imbalance_before", "imbalance_after",
+ )
+ if eplb["enabled"]:
+ if (
+ item["workload"]["routing"] != "zipf"
+ or any(eplb[field] is None for field in nullable_eplb)
+ or eplb["planner"] != "greedy-rank-major-v1"
+ or eplb["reference_tokens_per_rank"] != 2048
+ or eplb["redundant_experts"] != 32
+ or eplb["redundant_experts"] % ep_size != 0
+ or eplb["physical_experts"] != expected_physical
+ or eplb["logical_experts"] % ep_size != 0
+ or eplb["physical_experts"] % ep_size != 0
+ or not 1 <= eplb["replicated_experts"] <= min(
+ eplb["logical_experts"], eplb["redundant_experts"]
+ )
+ or not 2 <= eplb["max_replicas"] <= 1 + eplb["redundant_experts"]
+ or not 1 <= eplb["imbalance_after"] <= eplb["imbalance_before"] <= ep_size
+ ):
+ raise PublisherError("enabled EPLB descriptor is incomplete")
+ # Recompute the plan from the v1 case profile and require an exact match.
+ expected_plan = contracts._expected_eplb_plan(
+ workload["routing"], workload["top_k"],
+ eplb["logical_experts"], eplb["physical_experts"], ep_size,
+ identity.V1_CASE_PROFILE["seed"],
+ identity.V1_CASE_PROFILE["eplb_reference_tokens_per_rank"],
+ )
+ expected_eplb = {
+ "enabled": True,
+ "planner": identity.V1_CASE_PROFILE["eplb_planner"],
+ "mapping_sha256": contracts.eplb_contract.mapping_hash(expected_plan),
+ "logical_experts": eplb["logical_experts"],
+ "physical_experts": eplb["physical_experts"],
+ "redundant_experts": identity.V1_CASE_PROFILE["eplb_redundant_experts"],
+ "reference_tokens_per_rank": identity.V1_CASE_PROFILE[
+ "eplb_reference_tokens_per_rank"
+ ],
+ "replicated_experts": expected_plan["replicated_experts"],
+ "max_replicas": expected_plan["max_replicas"],
+ "imbalance_before": expected_plan["imbalance_before"],
+ "imbalance_after": expected_plan["imbalance_after"],
+ }
+ if eplb != expected_eplb:
+ raise PublisherError("enabled EPLB descriptor differs from deterministic plan")
+ elif (
+ any(eplb[field] is not None for field in nullable_eplb)
+ or eplb["physical_experts"] != expected_physical
+ or eplb["redundant_experts"] != 0
+ or eplb["replicated_experts"] != 0
+ ):
+ raise PublisherError("disabled EPLB descriptor claims a plan")
+ # The reference backend's generation follows the vendor: "nccl" on nvidia,
+ # "rccl" on anything else.
+ if item["backend"]["id"] == "nccl-ep":
+ expected_generation = (
+ "nccl" if item["system"]["vendor"] == "nvidia" else "rccl"
+ )
+ if item["backend"]["generation"] != expected_generation:
+ raise PublisherError("NCCL/RCCL reference generation differs from system vendor")
+ # Status, eligibility and the selected attempts must tell the same story.
+ if (item["status"] == "decision-grade") != eligibility["decision_grade"]:
+ raise PublisherError("series status differs from eligibility")
+ if (
+ set(eligibility["allocation_ids"]) != set(item["allocation_ids"])
+ or eligibility["correct"] != all(point["correct"] for point in item["points"])
+ ):
+ raise PublisherError("series eligibility differs from its evidence")
+ selected_attempts = selected_by_series.get(item["series_id"], [])
+ if (
+ set(item["case_ids"]) != {attempt["case_id"] for attempt in selected_attempts}
+ or set(item["allocation_ids"])
+ != {attempt["allocation_id"] for attempt in selected_attempts}
+ ):
+ raise PublisherError("series case/allocation catalog differs from selected attempts")
+ if item["eligibility"]["decision_grade"] and len(
+ {attempt["run_id"] for attempt in selected_attempts}
+ ) < REQUIRED_ALLOCATIONS:
+ raise PublisherError("decision-grade series lacks independent workflow runs")
+ # Points must be strictly ascending in tokens_per_rank with no repeats.
+ tokens = [point["tokens_per_rank"] for point in item["points"]]
+ if tokens != sorted(set(tokens)):
+ raise PublisherError("series points are not in unique ascending token order")
+ # Identity: exactly one case, and both the case ID and the series ID must
+ # be re-derivable from the published factors.
+ if len(item["case_ids"]) != 1:
+ raise PublisherError("public series must represent exactly one v1 case")
+ case_id = item["case_ids"][0]
+ if identity.digest("case", _public_case_factors(item)) != case_id:
+ raise PublisherError("public series projection differs from its case identity")
+ build = item["build"]
+ expected_series_id = identity.series_id({
+ "backend": backend_id,
+ "case_id": case_id,
+ "image_digest": build["image_digest"],
+ "implementation_contract_sha256": build[
+ "implementation_contract_sha256"
+ ],
+ "public_config_sha256": build["public_config_sha256"],
+ "routing_control_sha256": build["routing_control_sha256"],
+ "runtime_fingerprint_sha256": build["runtime_fingerprint_sha256"],
+ "source_sha": build["source_sha"],
+ "squash_sha256": build["squash_sha256"],
+ "workload_id": workload["workload_id"],
+ })
+ if item["series_id"] != expected_series_id:
+ raise PublisherError("public series identity differs from its committed factors")
+ # Per-point checks: identity, routing/load bounds, evidence, components.
+ for point in item["points"]:
+ if point["point_id"] != identity.point_id(series=item["series_id"], tokens_per_rank=point["tokens_per_rank"]):
+ raise PublisherError("point identity differs")
+ if point["global_tokens"] != point["tokens_per_rank"] * item["system"]["ep_size"]:
+ raise PublisherError("global_tokens must use EP size")
+ routing = point["routing"]
+ # A token fans out to at most min(top_k, ep_size) copies.
+ max_fanout = min(item["workload"]["top_k"], item["system"]["ep_size"])
+ if (
+ routing["routed_copies"] < point["global_tokens"]
+ or routing["routed_copies"] > point["global_tokens"] * max_fanout
+ or routing["recv_tokens_max"] > routing["routed_copies"]
+ or routing["recv_tokens_max"] * item["system"]["ep_size"]
+ < routing["routed_copies"]
+ or not math.isclose(
+ routing["fanout_mean"],
+ routing["routed_copies"] / point["global_tokens"],
+ rel_tol=1e-12,
+ )
+ or routing["hotspot_ratio"] < 1
+ or routing["empty_expert_count"] >= eplb["physical_experts"]
+ or routing["empty_rank_count"] >= item["system"]["ep_size"]
+ ):
+ raise PublisherError("point routing/load facts are internally inconsistent")
+ expected_evidence = selected_evidence.get(
+ (item["series_id"], point["point_id"]), set()
+ )
+ if set(point["evidence_ids"]) != expected_evidence:
+ raise PublisherError("point evidence differs from selected series attempts")
+ # dispatch and combine are published together or not at all.
+ components = point["components"]
+ if (components["dispatch"] is None) != (components["combine"] is None):
+ raise PublisherError("dispatch/combine availability differs")
+ for name, component in components.items():
+ if component is None:
+ continue
+ # isolated_sum is derived and has no sample count or logical
+ # bandwidth; every other component is measured with 512 samples.
+ expected_origin = "derived" if name == "isolated_sum" else "measured"
+ expected_samples = None if name == "isolated_sum" else 512
+ if component["origin"] != expected_origin or component["sample_count"] != expected_samples:
+ raise PublisherError(f"{name} origin or sample count differs")
+ if name == "isolated_sum" and (
+ component["logical_bytes"] is not None
+ or component["logical_payload_rate_gbps_at_latency_percentile"] is not None
+ ):
+ raise PublisherError("isolated_sum cannot publish logical bandwidth")
+ if name != "isolated_sum" and (
+ component["logical_bytes"] is None
+ or component["logical_payload_rate_gbps_at_latency_percentile"] is None
+ ):
+ raise PublisherError(f"{name} measured logical bandwidth is missing")
+ # Percentile values must be non-decreasing in the mapping's own key order.
+ latency = component["latency_us"]
+ if list(latency.values()) != sorted(latency.values()):
+ raise PublisherError("latency percentiles are not ordered")
+ if component["logical_payload_rate_gbps_at_latency_percentile"] is not None:
+ for statistic, rate in component["logical_payload_rate_gbps_at_latency_percentile"].items():
+ # rate = logical_bytes / (latency_us * 1000), i.e. bytes per nanosecond.
+ expected = component["logical_bytes"] / (latency[statistic] * 1000.0)
+ if not math.isclose(rate, expected, rel_tol=1e-9, abs_tol=1e-12):
+ raise PublisherError("logical GB/s formula differs")
+ if components["roundtrip"] is None or components["roundtrip"]["origin"] != "measured":
+ raise PublisherError("roundtrip must be measured")
+ for statistic, throughput in point["roundtrip_token_rate_at_latency_percentile"].items():
+ # token rate = global_tokens / (latency_us * 1e-6), i.e. tokens per second.
+ expected = point["global_tokens"] / (
+ components["roundtrip"]["latency_us"][statistic] * 1e-6
+ )
+ if not math.isclose(throughput, expected, rel_tol=1e-9):
+ raise PublisherError("roundtrip token throughput formula differs")
+ # isolated_sum must equal dispatch + combine at p50/p90/p95/p99, and may
+ # exist only when dispatch/combine were measured.
+ if components["dispatch"] is not None:
+ derived = components["isolated_sum"]
+ if derived is None or any(not math.isclose(
+ derived["latency_us"][statistic],
+ components["dispatch"]["latency_us"][statistic]
+ + components["combine"]["latency_us"][statistic], rel_tol=1e-12
+ ) for statistic in ("p50", "p90", "p95", "p99")):
+ raise PublisherError("isolated_sum is not the component percentile sum")
+ elif components["isolated_sum"] is not None:
+ raise PublisherError("isolated_sum requires measured dispatch/combine components")
+ # Cohort validation: membership, tier, label, roles, controlled/varying
+ # factors, derived ID and allocations.
+ cohorts = {item["cohort_id"]: item for item in doc["cohorts"]}
+ if len(cohorts) != len(doc["cohorts"]):
+ raise PublisherError("dataset has duplicate cohort IDs")
+ for item in doc["cohorts"]:
+ if not set(item["series_ids"]).issubset(series):
+ raise PublisherError("cohort references unknown series")
+ members = [series[series_id] for series_id in item["series_ids"]]
+ # One comparable-experimental member makes the whole cohort experimental.
+ expected_tier = (
+ "comparable-experimental"
+ if any(member["publication_tier"] == "comparable-experimental" for member in members)
+ else "official"
+ )
+ if item["publication_tier"] != expected_tier:
+ raise PublisherError("cohort publication tier differs from its members")
+ if f"/ {members[0]['mode']} /" not in item["label"]:
+ raise PublisherError("cohort label omits its controlled mode")
+ roles = {member["backend"]["role"] for member in members}
+ if item["kind"] == "library" and roles != {"library"}:
+ raise PublisherError("library cohort contains non-library evidence")
+ if item["kind"] == "system" and roles != {"reference"}:
+ raise PublisherError("system cohort is not a portable reference comparison")
+ if item["kind"] in {"chip", "routing"} and len(
+ {_canonical(member["backend"]) for member in members}
+ ) != 1:
+ raise PublisherError(f"{item['kind']} cohort mixes backend implementations")
+ # Element 0 of each factor pair must be identical across members;
+ # element 1 must take at least two distinct values.
+ public_factors = [_public_cohort_factors(item["kind"], member) for member in members]
+ if len({_canonical(value[0]) for value in public_factors}) != 1:
+ raise PublisherError(f"{item['kind']} cohort does not control its public factors")
+ if len({_canonical(value[1]) for value in public_factors}) < 2:
+ raise PublisherError(f"{item['kind']} cohort does not vary its declared contrast")
+ if item["kind"] == "routing":
+ if item["publication_tier"] != "comparable-experimental":
+ raise PublisherError("routing cohort must be experimental")
+ # has_baseline is true only when exactly one member is uniform routing
+ # without EPLB; it must be the opposite of the missing-baseline reason.
+ has_baseline = sum(
+ member["workload"]["routing"] == "uniform"
+ and not member["workload"]["eplb"]
+ for member in members
+ ) == 1
+ missing_reason = "missing-uniform-baseline" in item["eligibility"]["reasons"]
+ if has_baseline == missing_reason:
+ raise PublisherError("routing baseline and eligibility reason disagree")
+ mismatch = _routing_implementation_mismatch(members)
+ mismatch_reason = "implementation-config-mismatch" in item["eligibility"]["reasons"]
+ if mismatch != mismatch_reason:
+ raise PublisherError("routing implementation control and eligibility disagree")
+ expected_id = _derived_id("cxcohort-v1-", {
+ "kind": item["kind"], "series_ids": item["series_ids"],
+ "controlled_factors": item["controlled_factors"],
+ "varying_factors": item["varying_factors"],
+ })
+ if item["cohort_id"] != expected_id:
+ raise PublisherError("cohort ID differs from its public factors")
+ # Fixed (controlled, varying) factor name lists per cohort kind.
+ expected_factors = {
+ "library": (
+ ["system", "workload", "mode", "phase", "measurement", "resource.mode", "source"],
+ ["backend", "resource"],
+ ),
+ "chip": (
+ ["backend", "source", "workload", "mode", "phase", "measurement", "resource.mode"],
+ ["system", "resource"],
+ ),
+ "system": (
+ ["workload", "mode", "phase", "measurement", "source"],
+ ["system", "backend", "resource"],
+ ),
+ "routing": (
+ ["backend", "implementation-static-build", "system", "model-shape", "mode", "phase", "measurement", "resource"],
+ ["workload.routing", "workload.eplb", "implementation-config"],
+ ),
+ }[item["kind"]]
+ member_allocations = {
+ allocation for series_id in item["series_ids"]
+ for allocation in series[series_id]["allocation_ids"]
+ }
+ if (
+ (item["controlled_factors"], item["varying_factors"]) != expected_factors
+ or set(item["eligibility"]["allocation_ids"]) != member_allocations
+ ):
+ raise PublisherError("cohort factors or allocations differ from its members")
+ _eligibility(item["eligibility"], f"cohort {item['cohort_id']}")
+ # Each decision-grade cohort must be ranked for p50 and p99 of both measures
+ # at every token count shared by all of its members.
+ expected_ranking_keys: set[tuple[str, str, str, int]] = set()
+ for cohort in doc["cohorts"]:
+ if not cohort["eligibility"]["decision_grade"]:
+ continue
+ members = [series[series_id] for series_id in cohort["series_ids"]]
+ tokens = set.intersection(*(
+ {point["tokens_per_rank"] for point in member["points"]}
+ for member in members
+ ))
+ expected_ranking_keys.update(
+ (cohort["cohort_id"], measure, statistic, token)
+ for token in tokens
+ for measure in ("latency_us", "logical_payload_rate_gbps_at_latency_percentile")
+ for statistic in ("p50", "p99")
+ )
+ # Ranking validation; ranking_top records the first entry of each ranking,
+ # keyed by (cohort, measure, statistic, tokens_per_rank).
+ ranking_top: dict[tuple[str, str, str, int], dict[str, Any]] = {}
+ ranking_ids: set[str] = set()
+ for ranking in doc["rankings"]:
+ cohort = cohorts.get(ranking["cohort_id"])
+ if (
+ cohort is None
+ or not cohort["eligibility"]["decision_grade"]
+ or ranking["eligibility"] != cohort["eligibility"]
+ or ranking["publication_tier"] != cohort["publication_tier"]
+ ):
+ raise PublisherError("ranking references an ineligible cohort")
+ entries = ranking["entries"]
+ _validate_metric(ranking["metric"])
+ if cohort["kind"] == "library" and any(
+ series[series_id]["backend"]["role"] == "reference"
+ for series_id in cohort["series_ids"]
+ ):
+ raise PublisherError("reference evidence cannot drive a library ranking")
+ if {entry["series_id"] for entry in entries} != set(cohort["series_ids"]):
+ raise PublisherError("ranking does not cover its cohort")
+ for entry in entries:
+ point_id, value, unit = _metric_value(series[entry["series_id"]], ranking["metric"])
+ if entry["point_id"] != point_id or entry["unit"] != unit or not math.isclose(entry["value"], value, rel_tol=1e-12):
+ raise PublisherError("ranking entry differs from series data")
+ # Entries are ordered by (value, series_id), reversed when the objective
+ # is "max", and ranks must be exactly 1..n.
+ reverse = ranking["metric"]["objective"] == "max"
+ expected = sorted(entries, key=lambda entry: (entry["value"], entry["series_id"]), reverse=reverse)
+ if entries != expected or [entry["rank"] for entry in entries] != list(range(1, len(entries) + 1)):
+ raise PublisherError("ranking order differs")
+ metric = ranking["metric"]
+ expected_id = _derived_id("cxranking-v1-", {
+ "cohort_id": ranking["cohort_id"], "metric": metric,
+ })
+ if ranking["ranking_id"] != expected_id or expected_id in ranking_ids:
+ raise PublisherError("ranking ID is duplicate or differs")
+ ranking_ids.add(expected_id)
+ ranking_top[(ranking["cohort_id"], metric["measure"], metric["statistic"], metric["tokens_per_rank"])] = entries[0]
+ if set(ranking_top) != expected_ranking_keys:
+ raise PublisherError("rankings do not cover every eligible cohort metric")
+ # Recommendation objectives mapped to the (measure, statistic) they rank by.
+ objective = {
+ "min-p50-latency": ("latency_us", "p50"), "min-p99-latency": ("latency_us", "p99"),
+ "max-payload-rate-at-p50-latency": (
+ "logical_payload_rate_gbps_at_latency_percentile", "p50"
+ ),
+ "max-payload-rate-at-p99-latency": (
+ "logical_payload_rate_gbps_at_latency_percentile", "p99"
+ ),
+ }
+ # Every recommendation must be the top entry of exactly one matching ranking
+ # and belong to an official cohort.
+ recommendation_ids: set[str] = set()
+ for item in doc["recommendations"]:
+ measure, statistic = objective[item["objective"]]
+ candidates = [top for key, top in ranking_top.items()
+ if key[:3] == (item["cohort_id"], measure, statistic) and top["point_id"] == item["point_id"]]
+ if len(candidates) != 1 or any(item[field] != candidates[0][field] for field in ("series_id", "point_id", "value", "unit")):
+ raise PublisherError("recommendation is not a ranking winner")
+ # The candidate check above guarantees a matching ranking exists here.
+ matching_ranking = next(
+ ranking for ranking in doc["rankings"]
+ if ranking["cohort_id"] == item["cohort_id"]
+ and ranking["metric"]["measure"] == measure
+ and ranking["metric"]["statistic"] == statistic
+ and ranking["entries"][0]["point_id"] == item["point_id"]
+ )
+ expected_id = _derived_id("cxrecommendation-v1-", {
+ "objective": item["objective"], "ranking_id": matching_ranking["ranking_id"],
+ })
+ cohort = cohorts[item["cohort_id"]]
+ if (item["recommendation_id"] != expected_id or expected_id in recommendation_ids
+ or cohort["publication_tier"] != "official"
+ or item["publication_tier"] != "official"
+ or item["eligibility"] != cohort["eligibility"]):
+ raise PublisherError("recommendation ID/eligibility differs")
+ recommendation_ids.add(expected_id)
+ # One recommendation is expected per ranking whose cohort is official.
+ expected_recommendations = sum(
+ cohorts[ranking["cohort_id"]]["publication_tier"] == "official"
+ for ranking in doc["rankings"]
+ )
+ if len(doc["recommendations"]) != expected_recommendations:
+ raise PublisherError("recommendations do not cover every actionable ranking")
+ # Sensitivities compare a candidate series with the uniform/no-EPLB baseline
+ # inside a decision-grade routing cohort.
+ sensitivity_ids: set[str] = set()
+ sensitivity_keys: set[tuple[str, str, str, str, str, int]] = set()
+ for item in doc["sensitivities"]:
+ cohort = cohorts.get(item["cohort_id"])
+ if (
+ cohort is None
+ or cohort["kind"] != "routing"
+ or not cohort["eligibility"]["decision_grade"]
+ or item["publication_tier"] != cohort["publication_tier"]
+ or item["eligibility"] != cohort["eligibility"]
+ ):
+ raise PublisherError("sensitivity references a non-routing cohort")
+ if (
+ item["baseline_series_id"] == item["candidate_series_id"]
+ or not {item["baseline_series_id"], item["candidate_series_id"]}.issubset(cohort["series_ids"])
+ ):
+ raise PublisherError("sensitivity series differ from its routing cohort")
+ _validate_metric(item["metric"])
+ baseline_series = series[item["baseline_series_id"]]
+ if (
+ baseline_series["workload"]["routing"] != "uniform"
+ or baseline_series["workload"]["eplb"]
+ ):
+ raise PublisherError("sensitivity baseline is not uniform without EPLB")
+ # signed_change_ratio = (candidate - baseline) / baseline.
+ _, baseline, _ = _metric_value(series[item["baseline_series_id"]], item["metric"])
+ _, candidate, _ = _metric_value(series[item["candidate_series_id"]], item["metric"])
+ if not math.isclose(item["signed_change_ratio"], (candidate - baseline) / baseline, rel_tol=1e-12):
+ raise PublisherError("sensitivity ratio differs")
+ expected_id = _derived_id("cxsensitivity-v1-", {
+ "baseline": item["baseline_series_id"],
+ "candidate": item["candidate_series_id"],
+ "cohort": item["cohort_id"], "metric": item["metric"],
+ })
+ if item["sensitivity_id"] != expected_id or expected_id in sensitivity_ids:
+ raise PublisherError("sensitivity ID is duplicate or differs")
+ sensitivity_ids.add(expected_id)
+ sensitivity_keys.add((
+ item["cohort_id"], item["baseline_series_id"], item["candidate_series_id"],
+ item["metric"]["measure"], item["metric"]["statistic"],
+ item["metric"]["tokens_per_rank"],
+ ))
+ # Expected sensitivities: for each decision-grade routing cohort with a
+ # baseline, every non-baseline member at every shared token count, for
+ # p50/p99 of both measures. Cohorts without a baseline contribute none.
+ expected_sensitivity_keys: set[tuple[str, str, str, str, str, int]] = set()
+ for cohort in doc["cohorts"]:
+ if cohort["kind"] != "routing" or not cohort["eligibility"]["decision_grade"]:
+ continue
+ members = [series[series_id] for series_id in cohort["series_ids"]]
+ baseline = next((
+ member for member in members
+ if member["workload"]["routing"] == "uniform" and not member["workload"]["eplb"]
+ ), None)
+ if baseline is None:
+ continue
+ tokens = set.intersection(*(
+ {point["tokens_per_rank"] for point in member["points"]}
+ for member in members
+ ))
+ expected_sensitivity_keys.update(
+ (cohort["cohort_id"], baseline["series_id"], candidate["series_id"],
+ measure, statistic, token)
+ for candidate in members if candidate is not baseline
+ for token in tokens
+ for measure in ("latency_us", "logical_payload_rate_gbps_at_latency_percentile")
+ for statistic in ("p50", "p99")
+ )
+ if sensitivity_keys != expected_sensitivity_keys:
+ raise PublisherError("sensitivities do not cover every routing contrast metric")
+ # Extra requirements that apply only to a promoted dataset.
+ if promotion["status"] == "promoted":
+ run_ids = {item["run_id"] for item in doc["attempts"] if item["selected"]}
+ # Every coverage row needs selected attempts from exactly
+ # REQUIRED_ALLOCATIONS distinct run IDs.
+ repeated_cases = all(
+ len({
+ attempts[attempt_id]["run_id"]
+ for attempt_id in coverage["attempt_ids"]
+ if attempts[attempt_id]["selected"]
+ }) == REQUIRED_ALLOCATIONS
+ for coverage in doc["coverage"]
+ )
+ if promotion["matrix_id"] != CANONICAL_FULL_V1_MATRIX_SHA256:
+ raise PublisherError("promotion requires the canonical full-v1 matrix")
+ if (
+ _case_disposition_catalog_sha256(doc["coverage"])
+ != CANONICAL_FULL_V1_CASE_CATALOG_SHA256
+ ):
+ raise PublisherError("promotion requires the canonical case/disposition catalog")
+ if (
+ terminal != len(doc["coverage"])
+ or len(doc["source_bundle_ids"]) != REQUIRED_ALLOCATIONS
+ or len(run_ids) != REQUIRED_ALLOCATIONS
+ or not repeated_cases
+ ):
+ raise PublisherError("promoted dataset lacks complete coverage")
+ # Selected attempts must succeed for "runnable" cases and be
+ # "unsupported" for every other disposition.
+ expected_outcomes = {
+ item["case_id"]: (
+ "success" if item["disposition"] == "runnable" else "unsupported"
+ )
+ for item in doc["coverage"]
+ }
+ if any(
+ item["selected"]
+ and item["outcome"] != expected_outcomes[item["case_id"]]
+ for item in doc["attempts"]
+ ):
+ raise PublisherError("promoted outcomes differ from requested dispositions")
+ # For runnable cases even unselected retries must have succeeded.
+ runnable_cases = {
+ item["case_id"] for item in doc["coverage"]
+ if item["disposition"] == "runnable"
+ }
+ if any(
+ item["case_id"] in runnable_cases and item["outcome"] != "success"
+ for item in doc["attempts"]
+ ):
+ raise PublisherError(
+ "promotion rejects runnable cases with failed, invalid, or diagnostic retries"
+ )
+ _require_promotion_series(doc["series"])
+ _require_promotion_cohorts(doc["cohorts"], doc["series"])
+ if not doc["rankings"] or not doc["recommendations"]:
+ raise PublisherError("promoted dataset lacks eligible decisions")
+ # A quarantined dataset must have every evidence catalog empty.
+ if promotion["status"] == "quarantined" and any((
+ doc["source_bundle_ids"], promotion["allocation_ids"], doc["coverage"],
+ doc["attempts"], doc["series"], doc["cohorts"], doc["rankings"],
+ doc["recommendations"], doc["sensitivities"],
+ )):
+ raise PublisherError("quarantined dataset exposes unvalidated evidence")
+ return doc
+
+
+def _file_record(value: Any, path: str) -> dict[str, Any]:
+    """Check one ``path``/``sha256``/``bytes`` file record and return it.
+
+    ``path`` is the dotted location used as the prefix of error messages.
+    Raises PublisherError when the recorded path is not a string, is absolute,
+    or contains a ``..`` component, and when the digest is not a string that
+    fully matches ``HEX64``.
+    """
+    record = _exact(value, {"path", "sha256", "bytes"}, path)
+    location = record["path"]
+    if not isinstance(location, str):
+        raise PublisherError(f"{path}.path is unsafe")
+    pure = PurePosixPath(location)
+    if pure.is_absolute() or ".." in pure.parts:
+        raise PublisherError(f"{path}.path is unsafe")
+    digest = record["sha256"]
+    if not isinstance(digest, str) or HEX64.fullmatch(digest) is None:
+        raise PublisherError(f"{path}.sha256 is invalid")
+    _integer(record["bytes"], f"{path}.bytes", minimum=1)
+    return record
+
+def validate_bundle_manifest(doc: Any) -> dict[str, Any]:
+    """Validate a private bundle manifest and return it unchanged.
+
+    After the schema check the manifest is cross-checked for internal
+    consistency: unique attempt IDs, unique selected cases, selections that
+    agree with their retained attempts, terminal and per-outcome counts, the
+    ``complete`` flag, and exactly one runtime fingerprint per allocation
+    (attempts without a fingerprint are ignored for that last check).
+
+    Raises PublisherError on the first inconsistency found.
+    """
+    _schema("private-bundle-v1.schema.json", doc)
+    by_attempt_id = {entry["attempt_id"]: entry for entry in doc["attempts"]}
+    if len(by_attempt_id) != len(doc["attempts"]):
+        raise PublisherError("bundle has duplicate attempt IDs")
+    selections = doc["coverage"]["selections"]
+    selected_cases = {entry["case_id"] for entry in selections}
+    if len(selected_cases) != len(selections):
+        raise PublisherError("bundle has duplicate selected cases")
+    tally = dict.fromkeys(OUTCOMES, 0)
+    for selection in selections:
+        retained = by_attempt_id.get(selection["selected_attempt_id"])
+        agrees = (
+            retained is not None
+            and retained["selected"]
+            and retained["case_id"] == selection["case_id"]
+            and retained["outcome"] == selection["outcome"]
+        )
+        if not agrees:
+            raise PublisherError("bundle selection differs from retained attempt")
+        tally[selection["outcome"]] += 1
+    coverage = doc["coverage"]
+    if coverage["terminal_cases"] != len(selections) or coverage["outcome_counts"] != tally:
+        raise PublisherError("bundle terminal counts differ")
+    covers_everything = coverage["expected_cases"] == len(selections)
+    if coverage["complete"] != covers_everything:
+        raise PublisherError("bundle completeness differs from coverage")
+    fingerprints_by_allocation: dict[str, set[str]] = {}
+    for entry in doc["attempts"]:
+        fingerprint = entry["runtime_fingerprint_sha256"]
+        if not fingerprint:
+            continue
+        fingerprints_by_allocation.setdefault(entry["allocation_id"], set()).add(fingerprint)
+    for seen in fingerprints_by_allocation.values():
+        if len(seen) != 1:
+            raise PublisherError("bundle runtime is heterogeneous within an allocation")
+    return doc
+
+
+def _fsync_dir(path: Path) -> None:
+    """Open the directory at *path* read-only, fsync it, and close it again."""
+    flags = os.O_RDONLY | getattr(os, "O_DIRECTORY", 0)
+    handle = os.open(path, flags)
+    try:
+        os.fsync(handle)
+    finally:
+        os.close(handle)
+
+
+def _write_bytes(path: Path, data: bytes, *, mode: int) -> None:
+    """Create *path* exclusively, write *data*, and fsync the file.
+
+    The file must not already exist (``O_EXCL``), and ``O_NOFOLLOW`` is added
+    where the platform defines it. ``fchmod`` is applied right after creation
+    so the permission bits are set to *mode* explicitly.
+    """
+    flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL | getattr(os, "O_NOFOLLOW", 0)
+    fd = os.open(path, flags, mode)
+    try:
+        os.fchmod(fd, mode)
+        # closefd=False: the raw descriptor is closed exactly once, in finally.
+        with os.fdopen(fd, "wb", closefd=False) as stream:
+            stream.write(data)
+            stream.flush()
+            os.fsync(stream.fileno())
+    finally:
+        os.close(fd)
+
+
+def _write_all(descriptor: int, data: bytes) -> None:
+    """Write all of *data* to *descriptor*, continuing after short writes."""
+    remaining = memoryview(data)
+    while len(remaining) > 0:
+        sent = os.write(descriptor, remaining)
+        remaining = remaining[sent:]
+
+
+def _write_json(path: Path, value: Any, *, mode: int) -> bytes:
+    """Write the ``_canonical`` encoding of *value* plus a newline to a new file.
+
+    Returns the exact bytes that were handed to ``_write_bytes``.
+    """
+    payload = _canonical(value) + b"\n"
+    _write_bytes(path, payload, mode=mode)
+    return payload
+
+
+def _file_metadata(path: Path, relative_to: Path) -> dict[str, Any]:
+    """Describe *path* as a ``path``/``sha256``/``bytes`` record.
+
+    ``path`` is stored in POSIX form relative to *relative_to*; the digest comes
+    from ``_sha_file`` and the size from ``stat``.
+    """
+    relative = path.relative_to(relative_to)
+    digest = _sha_file(path)
+    size = path.stat().st_size
+    return {"path": relative.as_posix(), "sha256": digest, "bytes": size}
+
+
+def _tree_files(root: Path) -> list[Path]:
+    """Return the sorted regular, non-symlink files under *root*.
+
+    Entries named ``COMPLETE`` are left out.
+    """
+    found: list[Path] = []
+    for candidate in root.rglob("*"):
+        if candidate.name == "COMPLETE":
+            continue
+        if candidate.is_file() and not candidate.is_symlink():
+            found.append(candidate)
+    found.sort()
+    return found
+
+
+def _verify_regular_file(path: Path, expected_mode: int) -> None:
+    """Require *path* to be a regular file owned by this user with *expected_mode*.
+
+    The parent chain is first passed to ``_reject_symlinked_path``; the file
+    itself is inspected with ``lstat``, so a symlink at *path* fails the
+    regular-file test instead of being followed.
+
+    Raises PublisherError if the file is missing or fails any of the checks.
+    """
+    _reject_symlinked_path(path.parent)
+    try:
+        info = os.lstat(path)
+    except FileNotFoundError as exc:
+        raise PublisherError(f"required file is missing: {path.name}") from exc
+    is_regular = stat.S_ISREG(info.st_mode)
+    is_owned = info.st_uid == os.getuid()
+    has_mode = stat.S_IMODE(info.st_mode) == expected_mode
+    if not (is_regular and is_owned and has_mode):
+        raise PublisherError(
+            f"file is not an owned regular {expected_mode:o} object: {path.name}"
+        )
+
+
+def _verify_frozen_tree(root: Path, *, private: bool) -> None:
+    """Check that *root* is a frozen directory tree owned by this user.
+
+    Every directory must have mode 500 (private) or 555 (public) and every
+    regular file mode 400 (private) or 444 (public). Entries that are neither
+    directories nor regular files under ``lstat`` are rejected.
+
+    Raises PublisherError on the first violation, or when the root cannot be
+    inspected or enumerated.
+    """
+    _reject_symlinked_path(root)
+    directory_mode, file_mode = (0o500, 0o400) if private else (0o555, 0o444)
+    try:
+        root_info = os.lstat(root)
+    except OSError as exc:
+        raise PublisherError(f"cannot inspect immutable object: {root.name}") from exc
+    if not stat.S_ISDIR(root_info.st_mode):
+        raise PublisherError(f"immutable object is not a real directory: {root.name}")
+    try:
+        entries = [root]
+        entries.extend(root.rglob("*"))
+    except OSError as exc:
+        raise PublisherError(f"cannot inspect immutable object: {root.name}") from exc
+    owner = os.getuid()
+    for entry in entries:
+        info = os.lstat(entry)
+        if info.st_uid != owner:
+            raise PublisherError(f"immutable object has the wrong owner: {entry.name}")
+        if stat.S_ISDIR(info.st_mode):
+            required = directory_mode
+        elif stat.S_ISREG(info.st_mode):
+            required = file_mode
+        else:
+            raise PublisherError(f"immutable object contains a linked or special entry: {entry.name}")
+        if stat.S_IMODE(info.st_mode) != required:
+            raise PublisherError(
+                f"immutable object mode differs for {entry.name}: expected {required:o}"
+            )
+
+
+def _freeze_tree(root: Path, *, private: bool) -> None:
+    """Make the tree at *root* read-only, then verify the result.
+
+    Regular files are chmodded first (400 private, 444 public), then the
+    directories deepest-first (500 private, 555 public), each one fsynced
+    right after its chmod. ``_verify_frozen_tree`` re-checks the tree at the end.
+
+    Raises PublisherError if the tree holds anything other than directories
+    and regular files.
+    """
+    file_mode, directory_mode = (0o400, 0o500) if private else (0o444, 0o555)
+    regular: list[Path] = []
+    folders: list[Path] = [root]
+    for entry in root.rglob("*"):
+        kind = os.lstat(entry).st_mode
+        if stat.S_ISDIR(kind):
+            folders.append(entry)
+        elif stat.S_ISREG(kind):
+            regular.append(entry)
+        else:
+            raise PublisherError(f"immutable object contains a linked or special entry: {entry.name}")
+    for entry in regular:
+        os.chmod(entry, file_mode)
+    # Deepest directories first, so a parent is locked only after its children.
+    folders.sort(key=lambda item: len(item.parts), reverse=True)
+    for entry in folders:
+        os.chmod(entry, directory_mode)
+        _fsync_dir(entry)
+    _verify_frozen_tree(root, private=private)
+
+
+def _reject_symlinked_path(path: Path) -> None:
+    """Walk *path* from its anchor and reject symlink or non-directory components.
+
+    The walk stops silently at the first component that does not exist. The
+    first element of ``path.parts`` is always skipped as the anchor, so the
+    check only covers every component when *path* is absolute.
+
+    Raises PublisherError for a symlinked or non-directory component.
+    """
+    prefix = Path(path.anchor)
+    for component in path.parts[1:]:
+        prefix = prefix / component
+        try:
+            mode = os.lstat(prefix).st_mode
+        except FileNotFoundError:
+            return
+        if stat.S_ISLNK(mode):
+            raise PublisherError("COLLECTIVEX_STORE_ROOT must not traverse a symlinked parent")
+        if not stat.S_ISDIR(mode):
+            raise PublisherError(f"store path component is not a directory: {prefix}")
+
+
+class Store:
+    """Atomic private/public directory operations on one operator filesystem.
+
+    Layout created under the root by ``__init__``::
+
+        private/incoming  private/bundles  private/quarantine   (mode 700)
+        public/datasets   public/channels                       (mode 755)
+        locks                                                   (mode 700)
+    """
+
+    def __init__(self, root: str | os.PathLike[str]):
+        """Create or adopt the store at *root* and its fixed subdirectories.
+
+        Raises PublisherError when the root traverses a symlink, is not owned
+        by the current user, is group/world writable, cannot be set to mode
+        750, or when a subdirectory is not a real directory.
+        """
+        candidate = Path(os.path.abspath(os.path.expanduser(root)))
+        _reject_symlinked_path(candidate)
+        candidate.mkdir(parents=True, exist_ok=True, mode=0o750)
+        # Second symlink guard: resolving the path must not change it.
+        resolved = candidate.resolve()
+        if candidate != resolved:
+            raise PublisherError(
+                "COLLECTIVEX_STORE_ROOT must not traverse a symlinked parent"
+            )
+        root_metadata = candidate.stat()
+        if root_metadata.st_uid != os.getuid() or stat.S_IMODE(root_metadata.st_mode) & 0o022:
+            raise PublisherError(
+                "COLLECTIVEX_STORE_ROOT must be owned by this user and not group/world writable"
+            )
+        os.chmod(candidate, 0o750)
+        if stat.S_IMODE(candidate.stat().st_mode) != 0o750:
+            raise PublisherError("COLLECTIVEX_STORE_ROOT mode must be 750")
+        self.root = resolved
+        raw = self.root
+        self.private = raw / "private"
+        self.incoming = self.private / "incoming"
+        self.bundles = self.private / "bundles"
+        self.quarantine = self.private / "quarantine"
+        self.public = raw / "public"
+        self.datasets = self.public / "datasets"
+        self.channels = self.public / "channels"
+        self.locks = raw / "locks"
+        for path, mode in (
+            (self.private, 0o700), (self.incoming, 0o700), (self.bundles, 0o700),
+            (self.quarantine, 0o700), (self.public, 0o755), (self.datasets, 0o755),
+            (self.channels, 0o755), (self.locks, 0o700),
+        ):
+            path.mkdir(parents=True, exist_ok=True, mode=mode)
+            if path.is_symlink() or not path.is_dir():
+                raise PublisherError(f"store path is not a real directory: {path}")
+            # chmod explicitly: an already existing directory keeps its old mode through mkdir.
+            os.chmod(path, mode)
+
+    @contextlib.contextmanager
+    def locked(self) -> Iterator[None]:
+        """Hold an exclusive ``flock`` on ``locks/publisher.lock`` for the block.
+
+        Raises PublisherError if the lock file is not a regular mode-600 file
+        owned by the current user.
+        """
+        lock_path = self.locks / "publisher.lock"
+        descriptor = os.open(
+            lock_path,
+            os.O_RDWR | os.O_CREAT | getattr(os, "O_NOFOLLOW", 0),
+            0o600,
+        )
+        try:
+            os.fchmod(descriptor, 0o600)
+            # Inspect the opened descriptor rather than the path.
+            metadata = os.fstat(descriptor)
+            if (
+                not stat.S_ISREG(metadata.st_mode)
+                or metadata.st_uid != os.getuid()
+                or stat.S_IMODE(metadata.st_mode) != 0o600
+            ):
+                raise PublisherError("publisher lock is not an owned regular 600 file")
+            fcntl.flock(descriptor, fcntl.LOCK_EX)
+            yield
+        finally:
+            # Runs on the error paths above as well, before the lock was ever taken.
+            fcntl.flock(descriptor, fcntl.LOCK_UN)
+            os.close(descriptor)
+
+    @contextlib.contextmanager
+    def staging(self, parent: Path, *, private: bool) -> Iterator[Path]:
+        """Yield a fresh ``.staging-`` directory under *parent* and clean it up.
+
+        The directory starts with mode 700 (private) or 755 (public). On exit,
+        if it still exists, its contents are made writable again and the whole
+        tree is removed; removal errors are ignored.
+        """
+        stage = Path(tempfile.mkdtemp(prefix=".staging-", dir=parent))
+        os.chmod(stage, 0o700 if private else 0o755)
+        try:
+            yield stage
+        finally:
+            # The stage no longer exists here once install() has renamed it away.
+            if stage.exists():
+                # Restore owner write permission so a frozen tree can be deleted.
+                for path in stage.rglob("*"):
+                    metadata = os.lstat(path)
+                    if stat.S_ISDIR(metadata.st_mode):
+                        os.chmod(path, 0o700)
+                    elif stat.S_ISREG(metadata.st_mode):
+                        os.chmod(path, 0o600)
+                os.chmod(stage, 0o700)
+                shutil.rmtree(stage, ignore_errors=True)
+
+    @staticmethod
+    def complete(stage: Path, value: str, *, private: bool) -> None:
+        """Write the ``COMPLETE`` marker (``value`` plus newline) and fsync *stage*."""
+        _write_bytes(stage / "COMPLETE", (value + "\n").encode(), mode=0o600 if private else 0o644)
+        _fsync_dir(stage)
+
+    @staticmethod
+    def install(stage: Path, destination: Path, *, private: bool) -> None:
+        """Freeze *stage* and rename it to *destination*.
+
+        If *destination* already exists it is only verified (frozen tree plus
+        a ``COMPLETE`` marker equal to the directory name) and the stage is not
+        installed. Raises PublisherError when the destination is a symlink, is
+        incomplete, or fails the frozen-tree verification.
+        """
+        if destination.is_symlink():
+            raise PublisherError(f"immutable destination is a symlink: {destination.name}")
+        if destination.exists():
+            _verify_frozen_tree(destination, private=private)
+            marker = destination / "COMPLETE"
+            if not marker.is_file() or marker.read_text().strip() != destination.name:
+                raise PublisherError(f"immutable destination is incomplete: {destination.name}")
+            return
+        _freeze_tree(stage, private=private)
+        os.rename(stage, destination)
+        _fsync_dir(destination.parent)
+        _verify_frozen_tree(destination, private=private)
+
+    def install_dataset(self, dataset: dict[str, Any]) -> tuple[str, int]:
+        """Validate and install *dataset* under ``datasets/<digest>``.
+
+        The digest is ``_sha_bytes`` of the canonical encoding plus a newline.
+        Returns ``(digest, payload size in bytes)``. Raises PublisherError when
+        the payload exceeds ``MAX_PUBLIC_DATASET_BYTES`` or when the stored
+        object does not match the payload after installation.
+        """
+        validate_public_dataset(dataset)
+        payload = _canonical(dataset) + b"\n"
+        if len(payload) > MAX_PUBLIC_DATASET_BYTES:
+            raise PublisherError("public dataset exceeds the serving size limit")
+        digest = _sha_bytes(payload)
+        destination = self.datasets / digest
+        with self.staging(self.datasets, private=False) as stage:
+            _write_bytes(stage / "dataset.json", payload, mode=0o644)
+            self.complete(stage, digest, private=False)
+            self.install(stage, destination, private=False)
+        # Re-read what is on disk: install() may have kept an existing object.
+        stored = destination / "dataset.json"
+        marker = destination / "COMPLETE"
+        if (not marker.is_file() or marker.read_text().strip() != digest
+                or _sha_file(stored) != digest or stored.stat().st_size != len(payload)):
+            raise PublisherError("stored dataset checksum differs after installation")
+        return digest, len(payload)
+
+    def update_channel(self, channel: str, digest: str, size: int, generated_at: str) -> None:
+        """Point ``channels/<channel>.json`` at the installed dataset *digest*.
+
+        The stored dataset is re-verified (frozen tree, marker, checksum, size,
+        ``generated_at``) before the pointer is written to a temporary file
+        and moved into place with ``os.replace``. ``dev-latest`` may only
+        reference a dataset whose promotion status is ``promoted``.
+        Raises PublisherError when any of these checks fails.
+        """
+        if size > MAX_PUBLIC_DATASET_BYTES:
+            raise PublisherError("channel dataset exceeds the serving size limit")
+        _verify_frozen_tree(self.datasets / digest, private=False)
+        marker = self.datasets / digest / "COMPLETE"
+        if not marker.is_file() or marker.read_text().strip() != digest:
+            raise PublisherError("cannot advance a channel to an incomplete dataset")
+        dataset_path = self.datasets / digest / "dataset.json"
+        dataset = validate_public_dataset(strict_load(dataset_path))
+        if (
+            _sha_file(dataset_path) != digest
+            or dataset_path.stat().st_size != size
+            or dataset["generated_at"] != generated_at
+        ):
+            raise PublisherError("channel metadata differs from its stored dataset")
+        if channel == "dev-latest" and dataset["promotion"]["status"] != "promoted":
+            raise PublisherError("dev-latest may only reference a promoted dataset")
+        pointer = {
+            "format": FORMAT_CHANNEL,
+            "channel": channel,
+            "dataset": {
+                "path": f"datasets/{digest}/dataset.json",
+                "sha256": digest,
+                "bytes": size,
+            },
+            "generated_at": generated_at,
+        }
+        validate_channel(pointer, expected_channel=channel)
+        destination = self.channels / f"{channel}.json"
+        # The temporary name is per-process; _write_bytes refuses to reuse an existing file.
+        temporary = self.channels / f".{channel}.tmp-{os.getpid()}"
+        try:
+            data = _canonical(pointer) + b"\n"
+            _write_bytes(temporary, data, mode=0o644)
+            os.replace(temporary, destination)
+            _fsync_dir(self.channels)
+        finally:
+            # No-op after a successful replace; removes the leftover on failure.
+            temporary.unlink(missing_ok=True)
+
+    def verify_channel(self, channel: str) -> dict[str, Any]:
+        """Re-verify a channel pointer and the dataset it references.
+
+        Returns the validated pointer. Raises PublisherError when the pointer
+        file, the dataset tree, its size or checksum, the ``COMPLETE`` marker,
+        ``generated_at``, or (for ``dev-latest``) the promotion status do not
+        hold.
+        """
+        channel_path = self.channels / f"{channel}.json"
+        _verify_regular_file(channel_path, 0o644)
+        pointer = validate_channel(strict_load(channel_path), expected_channel=channel)
+        # NOTE(review): this relies on validate_channel constraining dataset.path — confirm.
+        target = self.public / pointer["dataset"]["path"]
+        _verify_frozen_tree(target.parent, private=False)
+        if target.stat().st_size != pointer["dataset"]["bytes"] or _sha_file(target) != pointer["dataset"]["sha256"]:
+            raise PublisherError(f"channel {channel} dataset checksum differs")
+        marker = target.parent / "COMPLETE"
+        if not marker.is_file() or marker.read_text().strip() != pointer["dataset"]["sha256"]:
+            raise PublisherError(f"channel {channel} dataset is incomplete")
+        dataset = validate_public_dataset(strict_load(target))
+        if pointer["generated_at"] != dataset["generated_at"]:
+            raise PublisherError(f"channel {channel} metadata differs from its dataset")
+        if channel == "dev-latest" and dataset["promotion"]["status"] != "promoted":
+            raise PublisherError("dev-latest points to a non-promoted dataset")
+        return pointer
+
+
+def _copy_source(source: Path, destination: Path) -> None:
+    """Copy a regular, non-symlink file to a newly created mode-600 *destination*.
+
+    The source is opened with ``O_NOFOLLOW`` where available, the destination
+    with ``O_EXCL`` so an existing file is never overwritten, and the copy is
+    fsynced before the descriptors are closed.
+
+    Raises PublisherError if *source* is a symlink or not a regular file.
+    """
+    acceptable = (
+        not source.is_symlink()
+        and source.is_file()
+        and stat.S_ISREG(source.stat().st_mode)
+    )
+    if not acceptable:
+        raise PublisherError(f"source must be a regular non-symlink file: {source}")
+    reader = os.open(source, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0))
+    try:
+        writer = os.open(destination, os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o600)
+        try:
+            while chunk := os.read(reader, 1024 * 1024):
+                _write_all(writer, chunk)
+            os.fsync(writer)
+        finally:
+            os.close(writer)
+    finally:
+        os.close(reader)
+
+
+def _archive_download_directory(source: Path, destination: Path) -> None:
+    """Pack a downloaded artifact directory into a new stored (uncompressed) ZIP.
+
+    Members are written in sorted path order with a fixed 1980-01-01 timestamp
+    and regular-file mode 600. *destination* is created exclusively (``"x"``)
+    and fsynced at the end.
+
+    Raises PublisherError when the directory is a symlink or not a directory,
+    contains a symlink or non-regular entry, has no files or too many, exceeds
+    the per-member or total size limits, or when a member changes type or
+    size while being read.
+    """
+    if source.is_symlink() or not source.is_dir():
+        raise PublisherError(f"artifact directory is invalid: {source}")
+    members: list[Path] = []
+    for entry in source.rglob("*"):
+        if entry.is_symlink():
+            raise PublisherError("artifact directory contains a symlink")
+        if entry.is_dir():
+            continue
+        if not entry.is_file():
+            raise PublisherError("artifact directory contains a non-regular entry")
+        members.append(entry)
+    members.sort()
+    if not members or len(members) > MAX_ARCHIVE_MEMBERS:
+        raise PublisherError("artifact directory has an invalid file count")
+    running_total = 0
+    open_flags = os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)
+    with zipfile.ZipFile(destination, "x", compression=zipfile.ZIP_STORED) as archive:
+        for entry in members:
+            with os.fdopen(os.open(entry, open_flags), "rb") as stream:
+                # Check the opened descriptor: the entry may have been swapped since the scan.
+                opened = os.fstat(stream.fileno())
+                if not stat.S_ISREG(opened.st_mode):
+                    raise PublisherError("artifact directory member changed type")
+                expected = opened.st_size
+                running_total += expected
+                if expected > MAX_ARCHIVE_MEMBER_BYTES or running_total > MAX_ARCHIVE_TOTAL_BYTES:
+                    raise PublisherError("artifact directory exceeds size limits")
+                member_name = entry.relative_to(source).as_posix()
+                _safe_member(member_name)
+                header = zipfile.ZipInfo(member_name, date_time=(1980, 1, 1, 0, 0, 0))
+                header.compress_type = zipfile.ZIP_STORED
+                header.external_attr = (stat.S_IFREG | 0o600) << 16
+                with archive.open(header, "w") as sink:
+                    copied = 0
+                    while block := stream.read(1024 * 1024):
+                        sink.write(block)
+                        copied += len(block)
+                if copied != expected:
+                    raise PublisherError("artifact directory member changed size")
+    sync_fd = os.open(destination, os.O_RDONLY)
+    try:
+        os.fsync(sync_fd)
+    finally:
+        os.close(sync_fd)
+
+
+def _artifact_name(source: Path) -> str:
+    """Return the GHA artifact name for a download directory or ``.zip`` archive.
+
+    A directory is named by its own basename; any other source must carry the
+    ``.zip`` suffix, which is stripped.
+
+    Raises PublisherError if a non-directory lacks the suffix or the resulting
+    name does not fully match ``ARTIFACT_NAME``.
+    """
+    if source.is_dir():
+        name = source.name
+    else:
+        name = source.name.removesuffix(".zip")
+    wrong_suffix = not source.is_dir() and source.suffix != ".zip"
+    if wrong_suffix or ARTIFACT_NAME.fullmatch(name) is None:
+        raise PublisherError(f"artifact source has an invalid GHA name: {source.name}")
+    return name
+
+
+def archive_incoming(
+    store: Store,
+    matrix: Path,
+    artifacts: Sequence[Path],
+    run: dict[str, Any],
+) -> tuple[str, Path, list[dict[str, Any]]]:
+    """Copy exact delivery bytes into immutable incoming before any JSON/ZIP parse.
+
+    The matrix is stored as ``sources/matrix.json`` and each artifact as
+    ``sources/artifact-NNNN.zip``, numbered in sorted GHA-name order. A source
+    that is a directory is packed with ``_archive_download_directory``; any
+    other source is copied with ``_copy_source``.
+
+    Returns ``(ingest_id, installed directory, source records)``, where
+    ``ingest_id`` is the ``_sha_bytes`` digest of the canonical
+    ``{"run", "sources"}`` document and also names the installed directory.
+
+    Raises PublisherError when no artifacts are given, artifact names are
+    invalid or duplicated, an archive exceeds the size limit, or an already
+    installed object with the same ID holds a different manifest.
+    """
+    if not artifacts:
+        raise PublisherError("at least one GitHub artifact archive is required")
+    with store.staging(store.incoming, private=True) as stage:
+        sources = stage / "sources"
+        sources.mkdir(mode=0o700)
+        copied: list[dict[str, Any]] = []
+        # Sort by artifact name so the numbering does not depend on argument order.
+        named_artifacts = sorted(
+            ((_artifact_name(path), path) for path in artifacts), key=lambda item: item[0]
+        )
+        artifact_names = [name for name, _ in named_artifacts]
+        if len(artifact_names) != len(set(artifact_names)):
+            raise PublisherError("artifact delivery contains duplicate GHA names")
+        # Tuples are (stored file name, source path, kind, GHA artifact name or None).
+        inputs = [("matrix.json", matrix, "matrix", None)] + [
+            (f"artifact-{index:04d}.zip", path, "artifact", artifact_name)
+            for index, (artifact_name, path) in enumerate(named_artifacts)
+        ]
+        for name, source, kind, artifact_name in inputs:
+            destination = sources / name
+            if source.is_dir():
+                _archive_download_directory(source, destination)
+            else:
+                # The archive size limit is not applied to the matrix file.
+                if source != matrix and source.stat().st_size > MAX_ARCHIVE_TOTAL_BYTES:
+                    raise PublisherError("artifact archive exceeds the size limit")
+                _copy_source(source, destination)
+            copied.append({
+                **_file_metadata(destination, stage),
+                "kind": kind,
+                "artifact_name": artifact_name,
+            })
+        # The ID covers the run metadata and every copied record.
+        ingest_id = _sha_bytes(_canonical({"run": run, "sources": copied}))
+        incoming_manifest = {
+            "format": "collectivex.incoming.v1",
+            "schema_version": 1,
+            "ingest_id": ingest_id,
+            "run": run,
+            "sources": copied,
+        }
+        _write_json(stage / "incoming.json", incoming_manifest, mode=0o600)
+        store.complete(stage, ingest_id, private=True)
+        destination = store.incoming / ingest_id
+        store.install(stage, destination, private=True)
+    installed = store.incoming / ingest_id
+    # install() keeps an existing object, so compare it with what was just archived.
+    if strict_load(installed / "incoming.json") != incoming_manifest:
+        raise PublisherError("existing incoming object differs from archived delivery")
+    for record in copied:
+        _resolve_bundle_file(installed, record)
+    return ingest_id, installed, copied
+
+
+def _safe_member(name: str) -> PurePosixPath:
+    """Parse an archive member name into a relative POSIX path.
+
+    Raises PublisherError for names containing a backslash or NUL, and for
+    paths that are absolute, empty, or contain a ``.``/``..`` component.
+    """
+    if any(forbidden in name for forbidden in ("\\", "\0")):
+        raise PublisherError("archive member has an unsafe separator")
+    member = PurePosixPath(name)
+    escapes = (
+        member.is_absolute()
+        or not member.parts
+        or not {"", ".", ".."}.isdisjoint(member.parts)
+    )
+    if escapes:
+        raise PublisherError("archive member path escapes its artifact")
+    return member
+
+
+def extract_archive(archive: Path, destination: Path) -> list[Path]:
+    """Extract a bounded regular-file ZIP without trusting member paths or links.
+
+    Every member name is vetted by ``_safe_member``. Duplicate paths, symlink
+    or special members, encrypted members, and members or archives over the
+    size limits are rejected. Files are created exclusively with mode 600 and
+    fsynced; directories are created with mode 700.
+
+    Returns the extracted file paths in archive order.
+
+    Raises PublisherError for every rejection above, and also when a member's
+    stored data is corrupt, truncated, or uses an unsupported compression
+    method, or when member paths collide on disk (for example a file ``a``
+    followed by ``a/b``). These last cases used to escape as ``BadZipFile``,
+    ``zlib.error`` or ``FileExistsError`` instead of a publisher rejection.
+    """
+    import zlib  # local import: only needed to name the corrupt-deflate error
+
+    try:
+        handle = zipfile.ZipFile(archive)
+    except (OSError, zipfile.BadZipFile) as exc:
+        raise PublisherError("artifact is not a valid ZIP archive") from exc
+    extracted: list[Path] = []
+    seen: set[str] = set()
+    total = 0
+    with handle:
+        members = handle.infolist()
+        if not members or len(members) > MAX_ARCHIVE_MEMBERS:
+            raise PublisherError("artifact has an invalid member count")
+        for member in members:
+            path = _safe_member(member.filename.rstrip("/"))
+            key = path.as_posix()
+            if key in seen:
+                raise PublisherError("artifact contains duplicate member paths")
+            seen.add(key)
+            # The high 16 bits of external_attr carry the Unix mode; zero means unset.
+            mode = member.external_attr >> 16
+            if stat.S_ISLNK(mode) or (mode and not (stat.S_ISREG(mode) or stat.S_ISDIR(mode))):
+                raise PublisherError("artifact contains a non-regular member")
+            if member.flag_bits & 0x1:
+                raise PublisherError("encrypted artifact members are not accepted")
+            if member.file_size > MAX_ARCHIVE_MEMBER_BYTES:
+                raise PublisherError("artifact member exceeds the size limit")
+            total += member.file_size
+            if total > MAX_ARCHIVE_TOTAL_BYTES:
+                raise PublisherError("artifact exceeds the expanded size limit")
+            target = destination.joinpath(*path.parts)
+            try:
+                if member.is_dir():
+                    target.mkdir(parents=True, exist_ok=True, mode=0o700)
+                    continue
+                target.parent.mkdir(parents=True, exist_ok=True, mode=0o700)
+                output = os.open(target, os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o600)
+            except (FileExistsError, NotADirectoryError, IsADirectoryError) as exc:
+                # Distinct member names can still clash on disk (file vs. directory).
+                raise PublisherError("artifact member paths collide") from exc
+            try:
+                try:
+                    with handle.open(member, "r") as source:
+                        written = 0
+                        while chunk := source.read(1024 * 1024):
+                            _write_all(output, chunk)
+                            written += len(chunk)
+                except (
+                    zipfile.BadZipFile, zlib.error, EOFError, NotImplementedError,
+                ) as exc:
+                    # Bad CRC, damaged stream, truncation, or unknown compression.
+                    raise PublisherError(
+                        "artifact member data is corrupt or unsupported"
+                    ) from exc
+                if written != member.file_size:
+                    raise PublisherError("artifact member size changed during extraction")
+                os.fsync(output)
+            finally:
+                os.close(output)
+            extracted.append(target)
+    return extracted
+
+
+def validate_matrix(document: Any) -> list[dict[str, Any]]:
+    """Validate a requested-matrix document and flatten its requested cases.
+
+    Each returned dict holds ``sku``, the fields of the case, and the private
+    ``_disposition`` / ``_reason`` keys taken from the requested entry.
+
+    Raises PublisherError when the publication-safety check or the matrix
+    validator rejects the document (``SystemExit`` from the validator is
+    converted too).
+    """
+    try:
+        artifact_safety.assert_publication_safe([document])
+        matrix = sweep_matrix.validate_matrix_document(document)
+    except (SystemExit, ValueError, artifact_safety.ArtifactSafetyError) as exc:
+        raise PublisherError(f"requested matrix is invalid: {exc}") from exc
+    flattened: list[dict[str, Any]] = []
+    for requested in matrix["requested_cases"]:
+        row: dict[str, Any] = {"sku": requested["sku"]}
+        row.update(requested["case"])
+        row["_disposition"] = requested["disposition"]
+        row["_reason"] = requested["reason"]
+        flattened.append(row)
+    return flattened
+
+
+def _expected_deliveries(
+    matrix: dict[str, Any], cases: Sequence[dict[str, Any]], run: dict[str, Any]
+) -> dict[str, tuple[str, str, str]]:
+    """Map each requested case to its expected (artifact name, job, execution ID).
+
+    Unsupported cases map to the ``cxunsupported-…`` artifact of the ``setup``
+    job; every other case maps to the ``cxshard-…`` artifact of the ``sweep``
+    shard that lists it.
+
+    Raises PublisherError if a case appears in two shards or a runnable case
+    appears in none.
+    """
+    owner_shard: dict[str, str] = {}
+    for shard in matrix["include"]:
+        for case_id in shard["case_ids"]:
+            if case_id in owner_shard:
+                raise PublisherError("requested case appears in two runnable shards")
+            owner_shard[case_id] = shard["id"]
+    artifact_suffix = f"{run['run_id']}-{run['run_attempt']}"
+    execution_prefix = f"{run['run_id']}_{run['run_attempt']}"
+    expected: dict[str, tuple[str, str, str]] = {}
+    for case in cases:
+        case_id = case["case_id"]
+        if case["_disposition"] == "unsupported":
+            expected[case_id] = (
+                f"cxunsupported-{artifact_suffix}", "setup",
+                f"{execution_prefix}_unsupported",
+            )
+            continue
+        shard_id = owner_shard.get(case_id)
+        if shard_id is None:
+            raise PublisherError("runnable case has no matrix shard")
+        expected[case_id] = (
+            f"cxshard-{shard_id}-{artifact_suffix}", "sweep",
+            f"{execution_prefix}_{shard_id}",
+        )
+    return expected
+
+
+def _document_git_run(document: dict[str, Any]) -> dict[str, Any] | None:
+    """Return the document's git-run provenance mapping, or None.
+
+    ``provenance["git_run"]`` is used when present, otherwise the
+    ``provenance`` mapping itself. Anything that is not a dict yields None.
+    """
+    provenance = document.get("provenance")
+    if isinstance(provenance, dict):
+        candidate = provenance.get("git_run", provenance)
+        if isinstance(candidate, dict):
+            return candidate
+    return None
+
+
+def _run_matches(document: dict[str, Any], run: dict[str, Any]) -> bool:
+    """Report whether the document's git-run provenance names the publisher *run*.
+
+    Run ID, run attempt, source SHA and repository must all agree. The
+    recorded run ID and attempt are compared in string form, and the
+    repository may be recorded under either ``repo`` or ``repository``.
+    """
+    git_run = _document_git_run(document)
+    if git_run is None:
+        return False
+    if str(git_run.get("run_id")) != run["run_id"]:
+        return False
+    if str(git_run.get("run_attempt")) != str(run["run_attempt"]):
+        return False
+    if git_run.get("source_sha") != run["source_sha"]:
+        return False
+    recorded_repository = git_run.get("repo") or git_run.get("repository")
+    return recorded_repository == run["repository"]
+
+
+def _case_matches(document: dict[str, Any], expected: dict[str, Any]) -> bool:
+    """Report whether the document's case factors equal the scheduled case.
+
+    The scheduled coordinates are *expected* without ``sku``, ``case_id`` and
+    any underscore-prefixed key; they are compared together with the V1 case
+    profile and the expected SKU.
+    """
+    excluded = {"sku", "case_id"}
+    scheduled: dict[str, Any] = {}
+    for key, value in expected.items():
+        if key in excluded or key.startswith("_"):
+            continue
+        scheduled[key] = value
+    recorded = document.get("identity", {}).get("case_factors")
+    wanted = {
+        "case": scheduled,
+        "profile": identity.V1_CASE_PROFILE,
+        "sku": expected["sku"],
+    }
+    return recorded == wanted
+
+
+def _outcome(document: dict[str, Any]) -> tuple[str, str | None]:
+    """Return the document's outcome status and a reason for it.
+
+    Success carries no reason. Otherwise the native reason is kept when it is
+    a string that fully matches ``REASON``; if not, a fixed default for the
+    status is used.
+
+    Raises PublisherError for a non-success status that has no default.
+    """
+    outcome = document["outcome"]
+    status = outcome["status"]
+    if status == "success":
+        return status, None
+    native = outcome.get("reason")
+    if isinstance(native, str) and REASON.fullmatch(native):
+        return status, native
+    fallbacks = {
+        "unsupported": "unsupported-capability", "failed": "execution-failed",
+        "invalid": "validation-failed", "diagnostic": "diagnostic-evidence",
+    }
+    reason = fallbacks.get(status)
+    if reason is None:
+        raise PublisherError(f"unsupported native outcome {status!r}")
+    return status, reason
+
+
+def _attempt_record(
+    document: dict[str, Any], path: Path, root: Path, *, selected: bool
+) -> dict[str, Any]:
+    """Build the bundle-manifest record for one attempt document.
+
+    File records are expressed relative to *root*. Raw-format documents also
+    contribute their samples file record, evidence IDs and series ID, and
+    their declared runtime fingerprint digest must equal the one computed
+    here from the normalized attempt.
+
+    Raises PublisherError when that digest differs.
+    """
+    normalized = contracts.normalize_attempt(document)
+    fingerprint = normalized["runtime_fingerprint"]
+    fingerprint_sha = None if fingerprint is None else _sha_bytes(_canonical(fingerprint))
+    samples = None
+    evidence: list[str] = []
+    series: list[str] = []
+    if document["format"] == contracts.RAW_FORMAT:
+        # The samples file sits next to the raw document.
+        samples = _file_metadata(path.with_name(document["sample_artifact"]["path"]), root)
+        evidence = [row["evidence_id"] for row in document["measurement"]["rows"]]
+        series = [document["identity"]["series_id"]]
+        declared = document["identity"]["series_factors"]["runtime_fingerprint_sha256"]
+        if fingerprint_sha != declared:
+            raise PublisherError("runtime fingerprint checksum differs from series identity")
+    status, reason = _outcome(document)
+    record: dict[str, Any] = {}
+    record["attempt_id"] = normalized["attempt_id"]
+    record["allocation_id"] = normalized["allocation_id"]
+    record["case_id"] = normalized["case_id"]
+    record["outcome"] = status
+    record["reason"] = reason
+    record["selected"] = selected
+    record["document"] = _file_metadata(path, root)
+    record["samples"] = samples
+    record["runtime_fingerprint_sha256"] = fingerprint_sha
+    record["series_ids"] = series
+    record["evidence_ids"] = evidence
+    return record
+
+
+def _validate_delivery_binding(
+    document: dict[str, Any], path: Path, raw_root: Path,
+    artifact_by_root: dict[str, str], expected_by_id: dict[str, dict[str, Any]],
+    expected_deliveries: dict[str, tuple[str, str, str]], run: dict[str, Any],
+) -> str:
+    """Check that an attempt document belongs to the shard that delivered it.
+
+    The document must name a requested case with matching coordinates, report
+    an unsupported outcome exactly when the matrix requested one (with the
+    same reason), carry provenance for *run*, and sit inside the artifact
+    whose name, job and execution ID are expected for that case.
+
+    Returns the case ID. Raises PublisherError on the first mismatch.
+    """
+    case_id = document["identity"]["case_id"]
+    if case_id not in expected_by_id:
+        raise PublisherError("artifact contains an extra case outcome")
+    expected = expected_by_id[case_id]
+    if not _case_matches(document, expected):
+        raise PublisherError("attempt case coordinates differ from the requested matrix")
+    reported_unsupported = document["outcome"]["status"] == "unsupported"
+    requested_unsupported = expected["_disposition"] == "unsupported"
+    if requested_unsupported != reported_unsupported:
+        raise PublisherError("terminal outcome differs from requested capability disposition")
+    if reported_unsupported and document["outcome"]["reason"] != expected["_reason"]:
+        raise PublisherError("unsupported outcome reason differs from requested matrix")
+    if not _run_matches(document, run):
+        raise PublisherError("attempt provenance differs from publisher run metadata")
+    relative = path.relative_to(raw_root)
+    if len(relative.parts) < 2:
+        raise PublisherError("attempt document is outside a delivered artifact")
+    # The first component under raw_root identifies the extracted artifact.
+    delivered_name = artifact_by_root.get(relative.parts[0])
+    expected_name, expected_job, expected_execution = expected_deliveries[case_id]
+    git_run = _document_git_run(document)
+    allocation = document["identity"]["allocation_factors"]
+    bound = (
+        git_run is not None
+        and delivered_name == expected_name
+        and git_run["artifact"] == delivered_name
+        and git_run["job"] == expected_job
+        and allocation["execution_id"] == expected_execution
+    )
+    if not bound:
+        raise PublisherError("attempt provenance differs from its delivered GHA shard")
+    return case_id
+
+
+def _parse_extracted(root: Path) -> tuple[list[tuple[Path, dict[str, Any]]], set[Path]]:
+    """Load and validate every ``*.json`` document extracted under *root*.
+
+    Raw-case and terminal-outcome documents become attempts. A samples
+    document is accepted only if a raw document claims it through
+    ``sample_artifact.path``, resolved next to that raw document.
+
+    Returns ``(attempts, consumed_samples)``: the ``(path, validated
+    document)`` pairs in sorted path order, and the samples paths claimed by
+    raw documents.
+
+    Raises PublisherError when a document is rejected by the native contract,
+    the safety check or a schema, when a JSON document has an unknown format,
+    when a samples document is unclaimed, or when no attempt was found.
+    """
+    attempts: list[tuple[Path, dict[str, Any]]] = []
+    consumed_samples: set[Path] = set()
+    # Samples documents met during the scan. Recording them here replaces a
+    # second read-and-parse of every JSON file for the orphan check below.
+    sample_paths: set[Path] = set()
+    json_paths = sorted(path for path in root.rglob("*.json") if path.is_file())
+    for path in json_paths:
+        if path in consumed_samples:
+            # Already loaded and validated through the raw document claiming it.
+            continue
+        try:
+            document = contracts.strict_load(path)
+            artifact_safety.assert_publication_safe([document])
+            format_name = document.get("format") if isinstance(document, dict) else None
+            if format_name == contracts.SAMPLES_FORMAT:
+                _schema("samples-v1.schema.json", document)
+                # It must be claimed by a raw document; orphan checking happens after the scan.
+                sample_paths.add(path)
+                continue
+            if format_name == contracts.RAW_FORMAT:
+                _schema("raw-case-v1.schema.json", document)
+                sample_path = path.with_name(document["sample_artifact"]["path"])
+                sample_document = contracts.strict_load(sample_path)
+                artifact_safety.assert_publication_safe([sample_document])
+                _schema("samples-v1.schema.json", sample_document)
+                validated = contracts.load_raw_attempt(path)
+                consumed_samples.add(sample_path)
+            elif format_name == contracts.TERMINAL_FORMAT:
+                _schema("terminal-outcome-v1.schema.json", document)
+                validated = contracts.validate_terminal_document(document)
+            else:
+                raise PublisherError(f"artifact contains unknown JSON document {path.name}")
+        except (
+            contracts.ContractError, artifact_safety.ArtifactSafetyError,
+            jsonschema.ValidationError, OSError,
+        ) as exc:
+            raise PublisherError(f"native contract rejected {path.name}: {exc}") from exc
+        attempts.append((path, validated))
+    if sample_paths - consumed_samples:
+        raise PublisherError("artifact contains an orphan samples document")
+    if not attempts:
+        raise PublisherError("artifact contains zero native attempt documents")
+    return attempts, consumed_samples
+
+
+def build_bundle(
+ store: Store,
+ incoming_id: str,
+ incoming_path: Path,
+ run: dict[str, Any],
+) -> tuple[str, dict[str, Any], list[dict[str, Any]]]:
+ """Validate one exact workflow delivery and install its immutable private bundle."""
+ incoming_manifest = strict_load(incoming_path / "incoming.json")
+ _exact(
+ incoming_manifest,
+ {"format", "schema_version", "ingest_id", "run", "sources"},
+ "incoming",
+ )
+ artifact_safety.assert_publication_safe([incoming_manifest])
+ if (
+ incoming_manifest["format"] != "collectivex.incoming.v1"
+ or incoming_manifest["schema_version"] != 1
+ or incoming_manifest["ingest_id"] != incoming_id
+ or incoming_manifest["run"] != run
+ or _sha_bytes(_canonical({"run": run, "sources": incoming_manifest["sources"]}))
+ != incoming_id
+ ):
+ raise PublisherError("incoming manifest identity differs from archived delivery")
+ incoming_sources = _array(incoming_manifest["sources"], "incoming.sources", nonempty=True)
+ for index, record in enumerate(incoming_sources):
+ _exact(
+ record,
+ {"path", "sha256", "bytes", "kind", "artifact_name"},
+ f"incoming.sources[{index}]",
+ )
+ _resolve_bundle_file(incoming_path, record)
+ matrix_records = [record for record in incoming_sources if record["kind"] == "matrix"]
+ artifact_records = [record for record in incoming_sources if record["kind"] == "artifact"]
+ if (
+ len(matrix_records) != 1
+ or matrix_records[0]["artifact_name"] is not None
+ or not artifact_records
+ or any(ARTIFACT_NAME.fullmatch(record["artifact_name"] or "") is None
+ for record in artifact_records)
+ or len({record["artifact_name"] for record in artifact_records}) != len(artifact_records)
+ ):
+ raise PublisherError("incoming source catalog is invalid")
+ matrix_source = _resolve_bundle_file(incoming_path, matrix_records[0])
+ matrix_document = strict_load(matrix_source)
+ expected_cases = validate_matrix(matrix_document)
+ expected_by_id = {case["case_id"]: case for case in expected_cases}
+ expected_deliveries = _expected_deliveries(matrix_document, expected_cases, run)
+ if {record["artifact_name"] for record in artifact_records} != {
+ delivery[0] for delivery in expected_deliveries.values()
+ }:
+ raise PublisherError("incoming artifact archive set differs from requested matrix shards")
+ with store.staging(store.bundles, private=True) as stage:
+ source_copy = stage / "source"
+ raw_root = stage / "raw"
+ source_copy.mkdir(mode=0o700)
+ raw_root.mkdir(mode=0o700)
+ matrix_path = stage / "matrix.json"
+ _copy_source(matrix_source, matrix_path)
+ source_records: list[dict[str, Any]] = []
+ artifact_by_root: dict[str, str] = {}
+ for index, source_record in enumerate(artifact_records):
+ archive = _resolve_bundle_file(incoming_path, source_record)
+ copied = source_copy / f"artifact-{index:04d}.zip"
+ _copy_source(archive, copied)
+ source_records.append({
+ **_file_metadata(copied, stage),
+ "artifact_name": source_record["artifact_name"],
+ })
+ artifact_root = raw_root / f"artifact-{index:04d}"
+ artifact_root.mkdir(mode=0o700)
+ artifact_by_root[artifact_root.name] = source_record["artifact_name"]
+ extract_archive(copied, artifact_root)
+ parsed, consumed_samples = _parse_extracted(raw_root)
+ created_at = _latest_timestamp(
+ [document["generated_at"] for _, document in parsed]
+ )
+ consumed_files = {path for path, _ in parsed} | consumed_samples
+ extracted_files = {
+ path for path in raw_root.rglob("*")
+ if path.is_file() and not path.is_symlink()
+ }
+ if consumed_files != extracted_files:
+ raise PublisherError("artifact contains an unconsumed non-native member")
+ by_case: dict[str, list[tuple[Path, dict[str, Any]]]] = {}
+ for path, document in parsed:
+ case_id = _validate_delivery_binding(
+ document, path, raw_root, artifact_by_root, expected_by_id,
+ expected_deliveries, run,
+ )
+ by_case.setdefault(case_id, []).append((path, document))
+ missing = set(expected_by_id) - set(by_case)
+ if missing:
+ raise PublisherError(f"artifact is missing {len(missing)} requested case outcomes")
+ attempt_records: list[dict[str, Any]] = []
+ selections: list[dict[str, Any]] = []
+ selected_documents: list[dict[str, Any]] = []
+ runtime_hashes: set[str] = set()
+ outcome_counts = {name: 0 for name in OUTCOMES}
+ for case_id in sorted(expected_by_id):
+ case_attempts = by_case[case_id]
+ ordinals = [document["identity"]["attempt_ordinal"] for _, document in case_attempts]
+ allocations_for_case = {
+ document["identity"]["allocation_id"] for _, document in case_attempts
+ }
+ if len(allocations_for_case) != 1 or sorted(ordinals) != list(
+ range(1, len(ordinals) + 1)
+ ):
+ raise PublisherError(
+ "case retries must retain contiguous ordinals in one allocation"
+ )
+ _, selected_document = max(
+ case_attempts, key=lambda item: item[1]["identity"]["attempt_ordinal"]
+ )
+ selected_id = selected_document["identity"]["attempt_id"]
+ selected_documents.append(selected_document)
+ selected_status, _ = _outcome(selected_document)
+ selections.append({
+ "case_id": case_id,
+ "selected_attempt_id": selected_id,
+ "outcome": selected_status,
+ })
+ outcome_counts[selected_status] += 1
+ for path, document in sorted(
+ case_attempts, key=lambda item: item[1]["identity"]["attempt_ordinal"]
+ ):
+ normalized = contracts.normalize_attempt(document)
+ if document["format"] == contracts.RAW_FORMAT:
+ sample_path = path.with_name(document["sample_artifact"]["path"])
+ if sample_path not in consumed_samples:
+ raise PublisherError("validated raw attempt lost its samples document")
+ record = _attempt_record(
+ document, path, stage,
+ selected=normalized["attempt_id"] == selected_id,
+ )
+ if record["runtime_fingerprint_sha256"]:
+ runtime_hashes.add(record["runtime_fingerprint_sha256"])
+ attempt_records.append(record)
+ # Every extracted byte is covered; the bundle manifest anchors this checksum catalog.
+ payload_records = [_file_metadata(path, stage) for path in _tree_files(stage)]
+ checksum_document = {
+ "format": "collectivex.checksums.v1",
+ "files": payload_records,
+ }
+ checksum_path = stage / "checksums.json"
+ _write_json(checksum_path, checksum_document, mode=0o600)
+ bundle = {
+ "format": FORMAT_BUNDLE,
+ "schema_version": 1,
+ "created_at": created_at,
+ "ingest_id": incoming_id,
+ "run": run,
+ "matrix": _file_metadata(matrix_path, stage),
+ "sources": source_records,
+ "attempts": attempt_records,
+ "coverage": {
+ "expected_cases": len(expected_cases),
+ "terminal_cases": len(selections),
+ "complete": len(selections) == len(expected_cases),
+ "outcome_counts": outcome_counts,
+ "selections": selections,
+ },
+ "runtime_fingerprints": sorted(runtime_hashes),
+ "checksums": _file_metadata(checksum_path, stage),
+ "validation": {
+ "policy": PUBLISHER_POLICY,
+ "passed": True,
+ "checks": [
+ "archive-safety", "checksums", "exact-coverage", "identity",
+ "native-schema", "privacy", "runtime-homogeneity", "terminal-outcomes",
+ ],
+ },
+ }
+ validate_bundle_manifest(bundle)
+ # Runtime homogeneity is scoped to a realized allocation, not across unlike SKUs.
+ by_allocation: dict[str, set[str]] = {}
+ for attempt in attempt_records:
+ fingerprint = attempt["runtime_fingerprint_sha256"]
+ if fingerprint:
+ by_allocation.setdefault(attempt["allocation_id"], set()).add(fingerprint)
+ if any(len(values) != 1 for values in by_allocation.values()):
+ raise PublisherError("runtime fingerprint is heterogeneous within an allocation")
+ bundle_bytes = _canonical(bundle) + b"\n"
+ bundle_id = _sha_bytes(bundle_bytes)
+ _write_bytes(stage / "bundle.json", bundle_bytes, mode=0o600)
+ store.complete(stage, bundle_id, private=True)
+ store.install(stage, store.bundles / bundle_id, private=True)
+ installed = load_bundle(store, bundle_id)
+ if installed["manifest"] != bundle:
+ raise PublisherError("existing bundle differs from validated manifest")
+ return bundle_id, bundle, selected_documents
+
+
+ def _slug(value: Any, fallback: str = "unknown") -> str:
+     """Return an identifier-safe slug for *value*, or *fallback*.
+
+     The text form of *value* is lowercased, every run of characters outside
+     ``[a-z0-9_.-]`` collapses to a single ``-``, and leading/trailing ``-``
+     and ``.`` are trimmed. The candidate is capped at 128 characters and is
+     returned only when it is non-empty and fully matches ``SAFE_ID``.
+     """
+     lowered = str(value or "").lower()
+     cleaned = re.sub(r"[^a-z0-9_.-]+", "-", lowered).strip("-.")
+     if not cleaned:
+         return fallback
+     candidate = cleaned[:128]
+     if SAFE_ID.fullmatch(candidate):
+         return candidate
+     return fallback
+
+
+ def _derived_id(prefix: str, value: Any) -> str:
+     """Return *prefix* followed by the digest of the canonical form of *value*."""
+     digest = _sha_bytes(_canonical(value))
+     return f"{prefix}{digest}"
+
+
+ def _git_run(document: dict[str, Any]) -> dict[str, Any]:
+     """Return the document's Git run record, or an empty dict when it is falsy."""
+     found = _document_git_run(document)
+     return found if found else {}
+
+
+ def _public_attempt(document: dict[str, Any], *, selected: bool = False) -> dict[str, Any]:
+     """Project one native attempt document onto its public attempt record.
+
+     Raw-format documents contribute one evidence reference per measurement
+     row; every other format contributes none. ``series_id`` is exposed only
+     for a selected attempt whose outcome status is ``"success"``.
+     """
+     normalized = contracts.normalize_attempt(document)
+     run = _git_run(document)
+     if document["format"] == contracts.RAW_FORMAT:
+         evidence = [
+             {"evidence_id": row["evidence_id"], "point_id": row["point_id"]}
+             for row in document["measurement"]["rows"]
+         ]
+     else:
+         evidence = []
+     status, reason = _outcome(document)
+     succeeded = status == "success"
+     failure_mode = document["outcome"].get("failure_mode")
+     declared = isinstance(failure_mode, str) and REASON.fullmatch(failure_mode) is not None
+     if not declared:
+         # A missing or malformed failure mode falls back to the outcome
+         # reason, except for successes, which carry none.
+         failure_mode = None if succeeded else reason
+     series_id = normalized["series_id"] if succeeded and selected else None
+     return {
+         "attempt_id": normalized["attempt_id"],
+         "evidence": evidence,
+         "case_id": normalized["case_id"],
+         "allocation_id": normalized["allocation_id"],
+         "run_id": str(run["run_id"]),
+         "run_attempt": int(run["run_attempt"]),
+         "attempt_index": document["identity"]["attempt_ordinal"],
+         "selected": selected,
+         "outcome": status,
+         "failure_mode": failure_mode,
+         "reason": reason,
+         "series_id": series_id,
+         "completed_at": document["generated_at"],
+     }
+
+
+ def _ratio(values: Sequence[float]) -> float | None:
+     """Return ``max(values) / min(values)``, or ``None`` when it is undefined.
+
+     The ratio is undefined when there are fewer than ``REQUIRED_ALLOCATIONS``
+     values or when the smallest value is not strictly positive.
+     """
+     if len(values) < REQUIRED_ALLOCATIONS:
+         return None
+     smallest = min(values)
+     if not smallest > 0:
+         return None
+     return max(values) / smallest
+
+
+ def _eligibility_record(
+     allocations: Sequence[str],
+     *,
+     complete: bool,
+     correct: bool,
+     measured: bool,
+     stable_ordering: bool,
+     p50_ratio: float | None,
+     p99_ratio: float | None,
+     extra_reasons: Sequence[str] = (),
+ ) -> dict[str, Any]:
+     """Build the eligibility record for a series or cohort.
+
+     Each failed gate contributes one reason string; the result is
+     decision-grade only when no gate failed and *extra_reasons* is empty.
+     A ratio of ``None`` counts as unstable for its percentile.
+     """
+     allocation_ids = sorted(set(allocations))
+     stable_p50 = p50_ratio is not None and p50_ratio <= P50_STABILITY_LIMIT
+     stable_p99 = p99_ratio is not None and p99_ratio <= P99_STABILITY_LIMIT
+     gates = (
+         (len(allocation_ids) >= REQUIRED_ALLOCATIONS, "insufficient-allocations"),
+         (complete, "incomplete-repeat-coverage"),
+         (correct, "correctness-failed"),
+         (measured, "missing-measured-roundtrip-p99"),
+         (stable_p50, "unstable-p50"),
+         (stable_p99, "unstable-p99"),
+         (stable_ordering, "unstable-ordering"),
+     )
+     failures = {reason for passed, reason in gates if not passed}
+     # Caller-supplied reasons are merged with gate failures and de-duplicated.
+     reasons = sorted(failures.union(extra_reasons))
+     return {
+         "decision_grade": not reasons,
+         "allocation_ids": allocation_ids,
+         "complete": complete,
+         "correct": correct,
+         "measured_roundtrip_p99": measured,
+         "stable_p50": stable_p50,
+         "stable_p99": stable_p99,
+         "stable_ordering": stable_ordering,
+         "p50_max_min_ratio": p50_ratio,
+         "p99_max_min_ratio": p99_ratio,
+         "reasons": reasons,
+     }
+
+
+ def _aggregate_percentiles(values: Sequence[dict[str, Any]]) -> dict[str, float]:
+     """Return the per-percentile median of *values* for p50, p90, p95 and p99."""
+     medians: dict[str, float] = {}
+     for name in ("p50", "p90", "p95", "p99"):
+         samples = [float(entry[name]) for entry in values]
+         medians[name] = float(statistics.median(samples))
+     return medians
+
+
+ def _aggregate_component(
+     rows: Sequence[dict[str, Any]], name: str
+ ) -> dict[str, Any] | None:
+     """Aggregate one named component across the repeat rows of a point.
+
+     Returns ``None`` when every repeat reports the component unavailable and
+     raises ``PublisherError`` when only some do, or when the logical byte
+     count for the component differs between repeats. ``isolated_sum`` is
+     reported as derived, without byte, rate or sample-count figures.
+     """
+     components = [row["components"][name] for row in rows]
+
+     def _unavailable(component: dict[str, Any]) -> bool:
+         return component["availability"] == "unavailable"
+
+     if all(map(_unavailable, components)):
+         return None
+     if any(map(_unavailable, components)):
+         raise PublisherError("component availability differs across repeat allocations")
+     latency = _aggregate_percentiles(
+         [component["percentiles_us"] for component in components]
+     )
+     if name == "isolated_sum":
+         return {
+             "origin": "derived",
+             "latency_us": latency,
+             "logical_bytes": None,
+             "logical_payload_rate_gbps_at_latency_percentile": None,
+             "sample_count": None,
+         }
+     distinct_bytes = {row["logical_bytes"][name] for row in rows}
+     if len(distinct_bytes) != 1:
+         raise PublisherError("logical byte accounting differs across repeat allocations")
+     (logical_bytes,) = distinct_bytes
+     rates = {}
+     for statistic, microseconds in latency.items():
+         rates[statistic] = logical_bytes / (microseconds * 1000.0)
+     return {
+         "origin": "measured",
+         "latency_us": latency,
+         "logical_bytes": logical_bytes,
+         "logical_payload_rate_gbps_at_latency_percentile": rates,
+         "sample_count": 512,
+     }
+
+
+ def _exact_repeat_value(values: Sequence[Any], label: str) -> Any:
+     """Return the first of *values* after checking that all are canonically equal.
+
+     Raises ``PublisherError`` naming *label* when *values* is empty or when
+     the canonical forms of its members are not all identical.
+     """
+     if not values:
+         raise PublisherError(f"{label} differs across repeat allocations")
+     canonical_forms = {_canonical(value) for value in values}
+     if len(canonical_forms) != 1:
+         raise PublisherError(f"{label} differs across repeat allocations")
+     return values[0]
+
+
+ def _eplb_descriptor(document: dict[str, Any]) -> dict[str, Any]:
+     """Return the public EPLB descriptor built from ``document["case"]["eplb"]``."""
+     # (public key, native key) pairs, in public output order.
+     field_map = (
+         ("enabled", "enabled"),
+         ("planner", "planner"),
+         ("mapping_sha256", "mapping_hash"),
+         ("logical_experts", "num_logical_experts"),
+         ("physical_experts", "num_physical_experts"),
+         ("redundant_experts", "num_redundant"),
+         ("reference_tokens_per_rank", "reference_tokens_per_rank"),
+         ("replicated_experts", "replicated_experts"),
+         ("max_replicas", "max_replicas"),
+         ("imbalance_before", "imbalance_before"),
+         ("imbalance_after", "imbalance_after"),
+     )
+     native = document["case"]["eplb"]
+     return {public_key: native[native_key] for public_key, native_key in field_map}
+
+
+ def _routing_facts(row: dict[str, Any]) -> dict[str, Any]:
+     """Return the routing/load facts of one measurement row.
+
+     Every value is copied from ``row["routing"]`` except ``recv_tokens_max``,
+     which is read from ``row["receive"]["max"]``.
+     """
+     routing = row["routing"]
+     facts = {
+         "fanout_mean": routing["fanout_mean"],
+         "recv_tokens_max": row["receive"]["max"],
+     }
+     for key in (
+         "expert_load_cv",
+         "payload_rank_cv",
+         "hotspot_ratio",
+         "empty_expert_count",
+         "empty_rank_count",
+         "routed_copies",
+     ):
+         facts[key] = routing[key]
+     return facts
+
+
+ def _series_extra_reasons(documents: Sequence[dict[str, Any]]) -> list[str]:
+     """Return the sorted extra ineligibility reasons found in *documents*.
+
+     Each document's ``outcome.validity`` flags and measurement rows are
+     inspected; a reason is reported once even if several documents trigger it.
+     """
+     found: set[str] = set()
+     for document in documents:
+         validity = document["outcome"]["validity"]
+         rows = document["measurement"]["rows"]
+         findings = (
+             (validity.get("provenance_complete") is not True, "incomplete-provenance"),
+             (
+                 validity.get("workload_source") != "canonical-serialized",
+                 "noncanonical-workload",
+             ),
+             (
+                 validity.get("anomaly_free") is not True
+                 or any(row["anomalies"] for row in rows),
+                 "unresolved-anomaly",
+             ),
+             (
+                 validity.get("semantic_correctness") != "pass",
+                 "semantic-correctness-failed",
+             ),
+             (
+                 validity.get("measurement_conformance") != "conformant"
+                 or validity.get("sampling_conformance") != "conformant",
+                 "measurement-nonconformant",
+             ),
+         )
+         found.update(reason for failed, reason in findings if failed)
+         # Every row must report exactly the correctness scope of the case's mode profile.
+         profile = identity.case_profile(document["case"]["mode"])
+         observed_scopes = {row["correctness"].get("scope") for row in rows}
+         if observed_scopes != {profile["correctness_scope"]}:
+             found.add("expert-oracle-incomplete")
+     return sorted(found)
+
+
+ # Display labels for backend identifiers. Lookups in this file fall back to the
+ # raw backend identifier when it has no entry here.
+ BACKEND_LABELS = {
+ "deepep": "DeepEP V1",
+ "deepep-v2": "DeepEP V2",
+ "deepep-hybrid": "DeepEP Hybrid",
+ "uccl": "UCCL",
+ "mori": "MoRI",
+ "nccl-ep": "NCCL/RCCL reference",
+ }
+
+
+ def _build_series(
+ series_id: str,
+ documents: Sequence[dict[str, Any]],
+ expected_repeats: int,
+ ) -> tuple[dict[str, Any], dict[str, Any]]:
+ # Aggregate the repeat documents of one series into (public series record,
+ # internal record). The internal record carries the documents, the per-run
+ # roundtrip metrics and the series factors of the first document.
+ # Raises PublisherError when the input is empty, mixes series identities,
+ # reuses an allocation or workflow run, covers different token counts across
+ # repeats, or names a workload ID that is not a typed "workload" ID.
+ if not documents:
+ raise PublisherError("cannot aggregate an empty series")
+ first = documents[0]
+ if any(document["identity"]["series_id"] != series_id for document in documents):
+ raise PublisherError("series aggregation mixed identities")
+ allocations = [document["identity"]["allocation_id"] for document in documents]
+ if len(allocations) != len(set(allocations)):
+ raise PublisherError("series repeats reuse an allocation identity")
+ # Index each document's measurement rows by tokens_per_rank.
+ row_maps = [
+ {row["tokens_per_rank"]: row for row in document["measurement"]["rows"]}
+ for document in documents
+ ]
+ token_sets = {tuple(sorted(rows)) for rows in row_maps}
+ if len(token_sets) != 1:
+ raise PublisherError("series token coverage differs across allocations")
+ tokens = list(next(iter(token_sets)))
+ # Per-token max/min ratio of the roundtrip percentile across repeats; the
+ # largest defined ratio is the series-level stability figure.
+ p50_ratios = [
+ _ratio([rows[token]["components"]["roundtrip"]["percentiles_us"]["p50"] for rows in row_maps])
+ for token in tokens
+ ]
+ p99_ratios = [
+ _ratio([rows[token]["components"]["roundtrip"]["percentiles_us"]["p99"] for rows in row_maps])
+ for token in tokens
+ ]
+ p50_ratio = max((value for value in p50_ratios if value is not None), default=None)
+ p99_ratio = max((value for value in p99_ratios if value is not None), default=None)
+ correct = all(
+ row["correctness"]["passed"]
+ for document in documents for row in document["measurement"]["rows"]
+ )
+ measured = all(
+ row["components"]["roundtrip"]["availability"] == "measured"
+ and row["components"]["roundtrip"]["percentiles_us"].get("p99") is not None
+ for document in documents for row in document["measurement"]["rows"]
+ )
+ eligibility = _eligibility_record(
+ allocations,
+ complete=len(documents) == expected_repeats,
+ correct=correct,
+ measured=measured,
+ # Ordering is defined only across alternatives in a controlled cohort.
+ stable_ordering=True,
+ p50_ratio=p50_ratio,
+ p99_ratio=p99_ratio,
+ extra_reasons=_series_extra_reasons(documents),
+ )
+ # Descriptive fields below are taken from the first document only.
+ # NOTE(review): presumably series identity guarantees these agree across
+ # repeats; only the EPLB descriptor and routing facts are checked here.
+ case = first["case"]
+ shape = case["shape"]
+ topology = first["topology"]
+ runtime = first["runtime_fingerprint"]
+ workload_id = first["workload"]["workload_id"]
+ if not identity.is_typed_id(workload_id, "workload"):
+ raise PublisherError("raw workload is not canonical")
+ backend_id = case["backend"]
+ resource_raw = first["implementation"]["resource_profile"]
+ public_config = contracts.public_series_config(
+ kernel_generation=first["implementation"]["kernel_generation"],
+ provenance=first["implementation"]["provenance"],
+ resource_profile=resource_raw,
+ resource_mode=case["resource_mode"],
+ device_product=topology["device_product"],
+ )
+ resource_profile = public_config["resource"]["profile"]
+ configured_units = public_config["resource"]["configured_units"]
+ units_kind = public_config["resource"]["comm_units_kind"]
+ # Label reads "<units> <KIND>" when both are present, else the profile name.
+ resource_label = (
+ f"{configured_units} {str(units_kind).upper()}"
+ if configured_units is not None and units_kind
+ else resource_profile
+ )
+ eplb = _exact_repeat_value(
+ [_eplb_descriptor(document) for document in documents], "EPLB descriptor"
+ )
+ points: list[dict[str, Any]] = []
+ # run_metrics[run_id][token] holds the p50/p99 roundtrip latency and the
+ # payload rate derived from it (logical bytes / (latency_us * 1000)).
+ run_metrics: dict[str, dict[int, dict[str, float]]] = {}
+ for document, rows in zip(documents, row_maps, strict=True):
+ run_id = str(_git_run(document)["run_id"])
+ if run_id in run_metrics:
+ raise PublisherError("series has two allocations from one workflow run")
+ run_metrics[run_id] = {}
+ for token in tokens:
+ latency = rows[token]["components"]["roundtrip"]["percentiles_us"]
+ logical_bytes = rows[token]["logical_bytes"]["roundtrip"]
+ run_metrics[run_id][token] = {
+ "latency_us": {statistic: latency[statistic] for statistic in ("p50", "p99")},
+ "logical_payload_rate_gbps_at_latency_percentile": {
+ statistic: logical_bytes / (latency[statistic] * 1000.0)
+ for statistic in ("p50", "p99")
+ },
+ }
+ # One public point per token count, aggregated across all repeats.
+ for token in tokens:
+ rows = [row_map[token] for row_map in row_maps]
+ routing = _exact_repeat_value(
+ [_routing_facts(row) for row in rows], "routing/load facts"
+ )
+ components = {
+ name: _aggregate_component(rows, name)
+ for name in ("dispatch", "combine", "roundtrip")
+ }
+ # NOTE(review): only "dispatch" is checked for None. "combine" here and
+ # "roundtrip" below are subscripted unconditionally, although
+ # _aggregate_component returns None when a component is unavailable in
+ # every repeat. Presumably upstream validation rules that out - confirm.
+ if components["dispatch"] is None:
+ components["isolated_sum"] = None
+ else:
+ latency = {
+ statistic: components["dispatch"]["latency_us"][statistic]
+ + components["combine"]["latency_us"][statistic]
+ for statistic in ("p50", "p90", "p95", "p99")
+ }
+ components["isolated_sum"] = {
+ "origin": "derived", "latency_us": latency, "logical_bytes": None,
+ "logical_payload_rate_gbps_at_latency_percentile": None, "sample_count": None,
+ }
+ points.append({
+ "point_id": rows[0]["point_id"],
+ "tokens_per_rank": token,
+ "global_tokens": token * case["ep_size"],
+ "correct": all(row["correctness"]["passed"] for row in rows),
+ "routing": routing,
+ "components": components,
+ "roundtrip_token_rate_at_latency_percentile": {
+ statistic: (token * case["ep_size"])
+ / (components["roundtrip"]["latency_us"][statistic] * 1e-6)
+ for statistic in ("p50", "p90", "p95", "p99")
+ },
+ "evidence_ids": [row["evidence_id"] for row in rows],
+ })
+ # Public series record; free-text identifiers are passed through _slug.
+ series = {
+ "series_id": series_id,
+ "label": (
+ f"{case['runner'].upper()} / {BACKEND_LABELS.get(backend_id, backend_id)} / "
+ f"EP{case['ep_size']} / {topology['nodes']} node"
+ f"{'s' if topology['nodes'] != 1 else ''} / {topology['scope']} / "
+ f"{case['mode']} / {case['phase']} / {shape['routing']}"
+ f"{' + EPLB' if case['eplb']['enabled'] else ''} / {resource_label}"
+ ),
+ "status": "decision-grade" if eligibility["decision_grade"] else "diagnostic",
+ "case_ids": sorted({document["identity"]["case_id"] for document in documents}),
+ "allocation_ids": sorted(allocations),
+ "model": _slug(case["workload_name"]),
+ "suite": _slug(case["suite"]),
+ "mode": case["mode"],
+ "phase": case["phase"],
+ "publication_tier": case["required_publication"],
+ "backend": {
+ "id": _slug(backend_id),
+ "label": BACKEND_LABELS.get(backend_id, backend_id),
+ "role": "reference" if backend_id == "nccl-ep" else "library",
+ **public_config["backend"],
+ },
+ "build": {
+ "implementation_contract_sha256": first["identity"]["series_factors"][
+ "implementation_contract_sha256"
+ ],
+ "public_config_sha256": first["identity"]["series_factors"][
+ "public_config_sha256"
+ ],
+ "routing_control_sha256": first["identity"]["series_factors"][
+ "routing_control_sha256"
+ ],
+ "runtime_fingerprint_sha256": first["identity"]["series_factors"][
+ "runtime_fingerprint_sha256"
+ ],
+ "image_digest": first["identity"]["series_factors"]["image_digest"],
+ "source_sha": first["identity"]["series_factors"]["source_sha"],
+ "squash_sha256": first["identity"]["series_factors"]["squash_sha256"],
+ },
+ "system": {
+ "sku": _slug(case["runner"]),
+ "label": public_config["system"]["label"],
+ "vendor": runtime["vendor"],
+ "topology_class": _slug(topology["topology_class"]),
+ "transport": _slug(topology["transport"]),
+ "scale_up_transport": _slug(topology["scale_up_transport"]),
+ "scale_out_transport": (
+ _slug(topology["scale_out_transport"])
+ if topology["scale_out_transport"] is not None
+ else None
+ ),
+ "scope": topology["scope"],
+ "nodes": topology["nodes"],
+ "gpus_per_node": topology["gpus_per_node"],
+ "scale_up_domain": topology["scale_up_domain"],
+ "world_size": topology["world_size"],
+ "ep_size": case["ep_size"],
+ "placement": topology["placement"],
+ },
+ "workload": {
+ "workload_id": workload_id,
+ "hidden": shape["hidden"],
+ "top_k": shape["topk"],
+ "experts": case["eplb"]["num_logical_experts"],
+ "routing": shape["routing"],
+ "eplb": case["eplb"]["enabled"],
+ "dispatch_dtype": shape["dispatch_dtype"],
+ "combine_dtype": shape["quant"]["combine_output_dtype"],
+ "activation_profile": shape["activation_profile"],
+ },
+ "eplb": eplb,
+ "resource": public_config["resource"],
+ "measurement": {
+ "contract": first["measurement"]["contract"],
+ "component_order_contract": first["measurement"]["component_order_contract"],
+ "combine_semantics": identity.case_profile(case["mode"])["combine_semantics"],
+ "payload_unit": identity.case_profile(case["mode"])["payload_unit"],
+ "sampling_contract": first["measurement"]["sampling"]["contract"],
+ "iters": first["measurement"]["sampling"]["iterations_per_trial"],
+ "trials": first["measurement"]["sampling"]["trials"],
+ "warmups": first["measurement"]["sampling"]["warmup_iterations"],
+ "samples_per_component": first["measurement"]["sampling"]["samples_per_component"],
+ "headline_component": "roundtrip",
+ "headline_percentile": "p99",
+ },
+ "points": points,
+ "eligibility": eligibility,
+ }
+ # Internal companion record returned alongside the public series record.
+ internal = {
+ "documents": list(documents),
+ "run_metrics": run_metrics,
+ "series_factors": first["identity"]["series_factors"],
+ }
+ return series, internal
+
+
+ def _resolve_bundle_file(root: Path, record: dict[str, Any]) -> Path:
+     """Return the verified path under *root* named by a manifest file record.
+
+     Raises ``PublisherError`` when the record's POSIX path is not located
+     under *root*, when the target is missing, is a symlink or does not equal
+     its own resolved form, or when its size or SHA-256 differs from the record.
+     """
+     relative_parts = PurePosixPath(record["path"]).parts
+     path = root.joinpath(*relative_parts)
+     try:
+         path.relative_to(root)
+     except ValueError as exc:
+         raise PublisherError("bundle record escapes its directory") from exc
+     is_plain_file = path.resolve() == path and not path.is_symlink() and path.is_file()
+     if not is_plain_file:
+         raise PublisherError("bundle record points to a missing or linked file")
+     # The size is compared first, so a length mismatch skips the hash computation.
+     size_matches = path.stat().st_size == record["bytes"]
+     if not (size_matches and _sha_file(path) == record["sha256"]):
+         raise PublisherError("bundle file checksum differs from its manifest")
+     return path
+
+
+ def load_bundle(store: Store, bundle_id: str) -> dict[str, Any]:
+ # Load an installed bundle from store.bundles and re-verify it before use.
+ # Returns a dict with the bundle id, root path, validated manifest, matrix
+ # cases, attempt documents keyed by attempt ID, and the selected document
+ # per case. Raises PublisherError on any failed check listed below.
+ if HEX64.fullmatch(bundle_id) is None:
+ raise PublisherError("bundle ID must be a SHA-256 digest")
+ root = store.bundles / bundle_id
+ if root.is_symlink() or not (root / "COMPLETE").is_file():
+ raise PublisherError(f"bundle {bundle_id} is missing or incomplete")
+ _verify_frozen_tree(root, private=True)
+ if (root / "COMPLETE").read_text().strip() != bundle_id:
+ raise PublisherError("bundle COMPLETE marker differs")
+ # The bundle ID must equal the digest of bundle.json itself.
+ manifest_path = root / "bundle.json"
+ if _sha_file(manifest_path) != bundle_id:
+ raise PublisherError("bundle directory digest differs from bundle.json")
+ manifest = validate_bundle_manifest(strict_load(manifest_path))
+ checksum_path = _resolve_bundle_file(root, manifest["checksums"])
+ checksum_document = strict_load(checksum_path)
+ checksum_document = _exact(checksum_document, {"format", "files"}, "checksums")
+ if checksum_document["format"] != "collectivex.checksums.v1":
+ raise PublisherError("bundle checksum format is invalid")
+ records = [_file_record(value, f"checksums.files[{index}]")
+ for index, value in enumerate(_array(checksum_document["files"], "checksums.files"))]
+ _unique([record["path"] for record in records], "checksums.files[].path")
+ for record in records:
+ _resolve_bundle_file(root, record)
+ # The checksum catalog must list exactly the payload files on disk.
+ # NOTE(review): the exclusion matches on file name at any depth, not on the
+ # root-relative path - confirm no payload file can be named bundle.json or
+ # checksums.json.
+ expected_paths = {
+ path.relative_to(root).as_posix() for path in _tree_files(root)
+ if path.name not in {"bundle.json", "checksums.json"}
+ }
+ if {record["path"] for record in records} != expected_paths:
+ raise PublisherError("bundle checksum catalog does not cover its payload exactly")
+ # Source archives must sit at source/artifact-NNNN.zip in manifest order.
+ artifact_by_root: dict[str, str] = {}
+ for index, source in enumerate(manifest["sources"]):
+ _resolve_bundle_file(root, source)
+ archive_key = f"artifact-{index:04d}"
+ if source["path"] != f"source/{archive_key}.zip":
+ raise PublisherError("bundle source catalog order/path differs")
+ artifact_by_root[archive_key] = source["artifact_name"]
+ if len(set(artifact_by_root.values())) != len(artifact_by_root):
+ raise PublisherError("bundle source catalog repeats an artifact name")
+ matrix_path = _resolve_bundle_file(root, manifest["matrix"])
+ matrix_document = strict_load(matrix_path)
+ cases = validate_matrix(matrix_document)
+ expected_by_id = {case["case_id"]: case for case in cases}
+ expected_deliveries = _expected_deliveries(
+ matrix_document, cases, manifest["run"]
+ )
+ if {item["case_id"] for item in manifest["coverage"]["selections"]} != set(expected_by_id):
+ raise PublisherError("bundle selected coverage differs from requested matrix")
+ documents: dict[str, dict[str, Any]] = {}
+ runtime_fingerprints: set[str] = set()
+ # Re-validate every attempt document and rebuild its manifest record.
+ for attempt in manifest["attempts"]:
+ document_path = _resolve_bundle_file(root, attempt["document"])
+ document = contracts.strict_load(document_path)
+ artifact_safety.assert_publication_safe([document])
+ if document.get("format") == contracts.RAW_FORMAT:
+ # Raw attempts must name a samples file located next to the document.
+ _schema("raw-case-v1.schema.json", document)
+ sample_path = document_path.with_name(document["sample_artifact"]["path"])
+ if attempt["samples"] is None:
+ raise PublisherError("raw attempt is missing its sample manifest record")
+ manifest_sample_path = _resolve_bundle_file(root, attempt["samples"])
+ if manifest_sample_path != sample_path:
+ raise PublisherError("sample manifest record points to the wrong raw evidence")
+ sample_document = contracts.strict_load(sample_path)
+ artifact_safety.assert_publication_safe([sample_document])
+ _schema("samples-v1.schema.json", sample_document)
+ document = contracts.load_raw_attempt(document_path)
+ else:
+ if attempt["samples"] is not None:
+ raise PublisherError("terminal attempt unexpectedly names a sample artifact")
+ _schema("terminal-outcome-v1.schema.json", document)
+ document = contracts.validate_terminal_document(document)
+ _validate_delivery_binding(
+ document, document_path, root / "raw", artifact_by_root,
+ expected_by_id, expected_deliveries, manifest["run"],
+ )
+ # The stored attempt record must equal the one rebuilt from the document.
+ expected_record = _attempt_record(
+ document, document_path, root, selected=attempt["selected"]
+ )
+ if expected_record != attempt:
+ raise PublisherError("bundle attempt record differs from native document")
+ if attempt["runtime_fingerprint_sha256"]:
+ runtime_fingerprints.add(attempt["runtime_fingerprint_sha256"])
+ documents[attempt["attempt_id"]] = document
+ if sorted(runtime_fingerprints) != manifest["runtime_fingerprints"]:
+ raise PublisherError("bundle runtime fingerprint catalog differs from attempts")
+ # NOTE(review): a selection naming an attempt ID absent from `documents`
+ # raises KeyError here rather than PublisherError - presumably
+ # validate_bundle_manifest rules this out; confirm.
+ selected = {
+ selection["case_id"]: documents[selection["selected_attempt_id"]]
+ for selection in manifest["coverage"]["selections"]
+ }
+ return {
+ "id": bundle_id,
+ "root": root,
+ "manifest": manifest,
+ "cases": cases,
+ "documents": documents,
+ "selected": selected,
+ }
+
+
+ def _cohort_control(
+     kind: str, series: dict[str, Any], internal: dict[str, Any]
+ ) -> tuple[dict[str, Any], list[str], list[str], Any]:
+     """Return ``(control, controlled, varying, variant)`` for one cohort kind.
+
+     ``control`` is the mapping of factors held fixed for *kind* (one of
+     ``"library"``, ``"chip"``, ``"system"`` or ``"routing"``); ``controlled``
+     and ``varying`` name those factors and the ones allowed to differ;
+     ``variant`` is this series' value of the varying factors. *internal* is
+     accepted for signature compatibility and is not read.
+
+     Raises ``PublisherError`` for any other *kind*.
+     """
+     build = series["build"]
+     source_sha = build["source_sha"]
+     workload = series["workload"]
+     shape = {}
+     for key in (
+         "hidden", "top_k", "experts", "dispatch_dtype", "combine_dtype", "activation_profile",
+     ):
+         shape[key] = workload[key]
+     shared = {
+         "model": series["model"],
+         "mode": series["mode"],
+         "phase": series["phase"],
+         "shape": shape,
+         "measurement": series["measurement"],
+         "ep_size": series["system"]["ep_size"],
+     }
+     if kind == "library":
+         control = shared | {
+             "system": series["system"],
+             "workload": workload,
+             "resource_mode": series["resource"]["mode"],
+             "source": source_sha,
+         }
+         controlled = [
+             "system", "workload", "mode", "phase", "measurement", "resource.mode", "source",
+         ]
+         return control, controlled, ["backend", "resource"], series["backend"]["id"]
+     if kind == "chip":
+         control = shared | {
+             "backend": series["backend"],
+             "source": source_sha,
+             "workload": workload,
+             "resource_mode": series["resource"]["mode"],
+         }
+         controlled = [
+             "backend", "source", "workload", "mode", "phase", "measurement", "resource.mode",
+         ]
+         return control, controlled, ["system", "resource"], series["system"]
+     if kind == "system":
+         control = shared | {"workload": workload, "source": source_sha}
+         variant = [
+             series["system"]["sku"],
+             series["backend"]["id"],
+             series["resource"]["profile"],
+         ]
+         controlled = ["workload", "mode", "phase", "measurement", "source"]
+         return control, controlled, ["system", "backend", "resource"], variant
+     if kind == "routing":
+         control = shared | {
+             "backend": series["backend"],
+             "system": series["system"],
+             "resource": series["resource"],
+             "build": _routing_build_control(build),
+         }
+         variant = [
+             workload["routing"],
+             workload["eplb"],
+             build["implementation_contract_sha256"],
+         ]
+         controlled = [
+             "backend", "implementation-static-build", "system", "model-shape",
+             "mode", "phase", "measurement", "resource",
+         ]
+         varying = ["workload.routing", "workload.eplb", "implementation-config"]
+         return control, controlled, varying, variant
+     raise PublisherError(f"unknown cohort kind {kind}")
+
+
+ def _cohort_ordering(
+     members: Sequence[dict[str, Any]], internals: dict[str, dict[str, Any]], tokens: Sequence[int]
+ ) -> tuple[bool, int]:
+     """Report whether cohort members rank identically in every aligned run.
+
+     Only workflow runs present in every member's ``run_metrics`` are compared.
+     For each token count, measure and statistic, members are ranked per run
+     (latency ascending, payload rate descending, ties broken by series ID);
+     the ordering is stable when every run yields the same ranking.
+
+     Returns ``(stable, aligned_run_count)``. The result is ``(False, 0)`` for
+     an empty cohort and ``(False, n)`` when fewer than
+     ``REQUIRED_ALLOCATIONS`` runs are shared by all members.
+     """
+     if not members:
+         # set.intersection() needs at least one operand; an empty cohort has
+         # no aligned runs to compare.
+         return False, 0
+     run_ids = set.intersection(*(
+         set(internals[member["series_id"]]["run_metrics"]) for member in members
+     ))
+     if len(run_ids) < REQUIRED_ALLOCATIONS:
+         return False, len(run_ids)
+     # Group the per-run rankings by comparison key while they are produced,
+     # instead of rescanning a flat list once per key afterwards.
+     observed: dict[tuple[int, str, str], set[tuple[str, ...]]] = {}
+     for run_id in sorted(run_ids):
+         for token in tokens:
+             for measure in ("latency_us", "logical_payload_rate_gbps_at_latency_percentile"):
+                 # Higher payload rate ranks first; lower latency ranks first.
+                 descending = measure == "logical_payload_rate_gbps_at_latency_percentile"
+                 for statistic in ("p50", "p99"):
+                     ranked = sorted(
+                         members,
+                         key=lambda item: (
+                             internals[item["series_id"]]["run_metrics"][run_id][token][measure][statistic],
+                             item["series_id"],
+                         ),
+                         reverse=descending,
+                     )
+                     observed.setdefault((token, measure, statistic), set()).add(
+                         tuple(member["series_id"] for member in ranked)
+                     )
+     stable = all(len(rankings) == 1 for rankings in observed.values())
+     return stable, len(run_ids)
+
+
+def build_decisions(
+ series: Sequence[dict[str, Any]], internals: dict[str, dict[str, Any]]
+) -> tuple[list[dict[str, Any]], list[dict[str, Any]], list[dict[str, Any]], list[dict[str, Any]]]:
+ cohorts: list[dict[str, Any]] = []
+ for kind in ("library", "chip", "system", "routing"):
+ groups: dict[bytes, list[tuple[dict[str, Any], Any, list[str], list[str]]]] = {}
+ for item in series:
+ if kind == "library" and item["backend"]["role"] != "library":
+ continue
+ if kind == "system" and item["backend"]["role"] != "reference":
+ continue
+ control, controlled, varying, variant = _cohort_control(kind, item, internals[item["series_id"]])
+ groups.setdefault(_canonical(control), []).append((item, variant, controlled, varying))
+ for entries in groups.values():
+ variants = {_canonical(entry[1]) for entry in entries}
+ if len(entries) < 2 or len(variants) < 2:
+ continue
+ members = sorted((entry[0] for entry in entries), key=lambda item: item["series_id"])
+ token_sets = [set(point["tokens_per_rank"] for point in member["points"]) for member in members]
+ tokens = sorted(set.intersection(*token_sets))
+ same_points = len({tuple(sorted(values)) for values in token_sets}) == 1
+ ordering, aligned_runs = _cohort_ordering(members, internals, tokens) if tokens else (False, 0)
+ allocations = sorted({value for member in members for value in member["allocation_ids"]})
+ p50_ratio = max(
+ (member["eligibility"]["p50_max_min_ratio"] for member in members
+ if member["eligibility"]["p50_max_min_ratio"] is not None), default=None
+ )
+ p99_ratio = max(
+ (member["eligibility"]["p99_max_min_ratio"] for member in members
+ if member["eligibility"]["p99_max_min_ratio"] is not None), default=None
+ )
+ extra = {
+ reason for member in members for reason in member["eligibility"]["reasons"]
+ if reason not in {"unstable-ordering"}
+ }
+ if aligned_runs < REQUIRED_ALLOCATIONS:
+ extra.add("incomplete-aligned-repeats")
+ if kind == "routing" and sum(
+ member["workload"]["routing"] == "uniform"
+ and not member["workload"]["eplb"]
+ for member in members
+ ) != 1:
+ extra.add("missing-uniform-baseline")
+ if kind == "routing" and {
+ (member["workload"]["routing"], member["workload"]["eplb"])
+ for member in members
+ } != {("uniform", False), ("zipf", False), ("zipf", True)}:
+ extra.add("incomplete-routing-anchors")
+ if kind == "routing" and _routing_implementation_mismatch(members):
+ extra.add("implementation-config-mismatch")
+ if not tokens or (kind != "routing" and not same_points):
+ extra.add("unmatched-token-coverage")
+ eligibility = _eligibility_record(
+ allocations,
+ complete=all(member["eligibility"]["complete"] for member in members)
+ and bool(tokens) and (kind == "routing" or same_points),
+ correct=all(member["eligibility"]["correct"] for member in members),
+ measured=all(member["eligibility"]["measured_roundtrip_p99"] for member in members),
+ stable_ordering=ordering,
+ p50_ratio=p50_ratio,
+ p99_ratio=p99_ratio,
+ extra_reasons=sorted(extra),
+ )
+ member_ids = [member["series_id"] for member in members]
+ publication_tier = (
+ "comparable-experimental"
+ if any(member["publication_tier"] == "comparable-experimental" for member in members)
+ else "official"
+ )
+ controlled, varying = entries[0][2], entries[0][3]
+ cohort_id = _derived_id("cxcohort-v1-", {
+ "kind": kind, "series_ids": member_ids,
+ "controlled_factors": controlled, "varying_factors": varying,
+ })
+ kind_label = "Platform" if kind == "chip" else kind.title()
+ first = members[0]
+ routing_label = first["workload"]["routing"] + (
+ "+EPLB" if first["workload"]["eplb"] else ""
+ )
+ context = {
+ "library": (
+ f"{first['system']['sku'].upper()} EP{first['system']['ep_size']} / "
+ f"{first['mode']} / {first['phase']} / {routing_label}"
+ ),
+ "chip": (
+ f"{first['backend']['label']} EP{first['system']['ep_size']} / "
+ f"{first['mode']} / {first['phase']} / {routing_label}"
+ ),
+ "system": (
+ f"Reference EP{first['system']['ep_size']} / {first['mode']} / "
+ f"{first['phase']} / {routing_label}"
+ ),
+ "routing": (
+ f"{first['system']['sku'].upper()} / {first['backend']['label']} / "
+ f"EP{first['system']['ep_size']} / {first['mode']} / {first['phase']}"
+ ),
+ }[kind]
+ cohorts.append({
+ "cohort_id": cohort_id,
+ "kind": kind,
+ "label": f"{context} / {kind_label} contrast ({len(members)} series)",
+ "description": (
+ "Publisher-controlled NCCL/RCCL system comparison"
+ if kind == "system"
+ else f"Publisher-controlled {kind_label.lower()} comparison"
+ ),
+ "series_ids": member_ids,
+ "controlled_factors": controlled,
+ "varying_factors": varying,
+ "publication_tier": publication_tier,
+ "eligibility": eligibility,
+ })
+ cohorts.sort(key=lambda item: item["cohort_id"])
+ series_by_id = {item["series_id"]: item for item in series}
+ rankings: list[dict[str, Any]] = []
+ recommendations: list[dict[str, Any]] = []
+ sensitivities: list[dict[str, Any]] = []
+ for cohort in cohorts:
+ if not cohort["eligibility"]["decision_grade"]:
+ continue
+ members = [series_by_id[series_id] for series_id in cohort["series_ids"]]
+ tokens = sorted(set.intersection(*(
+ {point["tokens_per_rank"] for point in member["points"]} for member in members
+ )))
+ for token in tokens:
+ for measure, objective, unit in (
+ ("latency_us", "min", "us"), ("logical_payload_rate_gbps_at_latency_percentile", "max", "GB/s")
+ ):
+ for statistic in ("p50", "p99"):
+ metric = {
+ "operation": "roundtrip", "statistic": statistic,
+ "measure": measure, "objective": objective,
+ "tokens_per_rank": token, "phase": members[0]["phase"],
+ }
+ entries = []
+ for member in members:
+ point_id, value, observed_unit = _metric_value(member, metric)
+ if observed_unit != unit:
+ raise PublisherError("publisher metric unit differs")
+ entries.append({
+ "rank": 0, "series_id": member["series_id"], "point_id": point_id,
+ "value": value, "unit": unit,
+ })
+ entries.sort(key=lambda item: (item["value"], item["series_id"]), reverse=objective == "max")
+ for rank, entry in enumerate(entries, 1):
+ entry["rank"] = rank
+ ranking_id = _derived_id("cxranking-v1-", {
+ "cohort_id": cohort["cohort_id"], "metric": metric,
+ })
+ metric_label = _metric_label(measure, statistic)
+ rankings.append({
+ "ranking_id": ranking_id, "cohort_id": cohort["cohort_id"],
+ "label": f"{cohort['kind'].title()} {metric_label} T={token}",
+ "metric": metric, "entries": entries,
+ "publication_tier": cohort["publication_tier"],
+ "eligibility": cohort["eligibility"],
+ })
+ if cohort["publication_tier"] != "official":
+ continue
+ objective_name = (
+ f"min-{statistic}-latency"
+ if measure == "latency_us"
+ else f"max-payload-rate-at-{statistic}-latency"
+ )
+ top = entries[0]
+ recommendation_id = _derived_id("cxrecommendation-v1-", {
+ "objective": objective_name, "ranking_id": ranking_id,
+ })
+ recommendations.append({
+ "recommendation_id": recommendation_id,
+ "cohort_id": cohort["cohort_id"],
+ "label": f"Best {metric_label} at T={token}",
+ "objective": objective_name,
+ "series_id": top["series_id"], "point_id": top["point_id"],
+ "value": top["value"], "unit": top["unit"],
+ "rationale": "Top stable measured roundtrip result in a controlled cohort",
+ "publication_tier": cohort["publication_tier"],
+ "eligibility": cohort["eligibility"],
+ })
+ if cohort["kind"] == "routing":
+ baseline = next(
+ (member for member in members
+ if member["workload"]["routing"] == "uniform" and not member["workload"]["eplb"]),
+ None,
+ )
+ if baseline:
+ for candidate in members:
+ if candidate is baseline:
+ continue
+ for token in tokens:
+ for measure, objective in (("latency_us", "min"), ("logical_payload_rate_gbps_at_latency_percentile", "max")):
+ for statistic in ("p50", "p99"):
+ metric = {
+ "operation": "roundtrip", "statistic": statistic,
+ "measure": measure, "objective": objective,
+ "tokens_per_rank": token, "phase": baseline["phase"],
+ }
+ _, base_value, _ = _metric_value(baseline, metric)
+ _, candidate_value, _ = _metric_value(candidate, metric)
+ sensitivity_id = _derived_id("cxsensitivity-v1-", {
+ "baseline": baseline["series_id"], "candidate": candidate["series_id"],
+ "cohort": cohort["cohort_id"], "metric": metric,
+ })
+ sensitivities.append({
+ "sensitivity_id": sensitivity_id,
+ "cohort_id": cohort["cohort_id"],
+ "label": (
+ f"Routing sensitivity: "
+ f"{_metric_label(measure, statistic)} T={token}"
+ ),
+ "baseline_series_id": baseline["series_id"],
+ "candidate_series_id": candidate["series_id"],
+ "metric": metric,
+ "signed_change_ratio": (candidate_value - base_value) / base_value,
+ "publication_tier": cohort["publication_tier"],
+ "eligibility": cohort["eligibility"],
+ })
+ rankings.sort(key=lambda item: item["ranking_id"])
+ recommendations.sort(key=lambda item: item["recommendation_id"])
+ sensitivities.sort(key=lambda item: item["sensitivity_id"])
+ return cohorts, rankings, recommendations, sensitivities
+
+
+def _require_runnable_promotion_success(
+ bundles: Sequence[dict[str, Any]], cases: dict[str, dict[str, Any]]
+) -> None:
+ for bundle in bundles:
+ for case_id, case in cases.items():
+ if case["_disposition"] != "runnable":
+ continue
+ status, _ = _outcome(bundle["selected"][case_id])
+ if status != "success":
+ raise PublisherError(
+ "promotion requires every runnable matrix case to succeed "
+ "in every selected bundle"
+ )
+ prior_statuses = {
+ _outcome(document)[0]
+ for document in bundle["documents"].values()
+ if document["identity"]["case_id"] == case_id
+ }
+ if prior_statuses != {"success"}:
+ raise PublisherError(
+ "promotion rejects runnable cases with failed, invalid, or diagnostic retries"
+ )
+
+
+def _expected_chip_cohort_count(series: Sequence[dict[str, Any]]) -> int:
+ groups: dict[bytes, set[bytes]] = {}
+ for item in series:
+ control, variant = _public_cohort_factors("chip", item)
+ groups.setdefault(_canonical(control), set()).add(_canonical(variant))
+ return sum(len(variants) >= 2 for variants in groups.values())
+
+
+def _require_promotion_cohorts(
+ cohorts: Sequence[dict[str, Any]], series: Sequence[dict[str, Any]]
+) -> None:
+ eligible_kinds = {
+ cohort["kind"]
+ for cohort in cohorts
+ if cohort["eligibility"]["decision_grade"]
+ }
+ missing = [kind for kind in REQUIRED_COHORT_KINDS if kind not in eligible_kinds]
+ if missing:
+ raise PublisherError(
+ "promotion lacks decision-grade cohort kinds: " + ", ".join(missing)
+ )
+ for kind, expected in REQUIRED_PROMOTION_COHORT_COUNTS.items():
+ members = [cohort for cohort in cohorts if cohort["kind"] == kind]
+ if len(members) != expected or any(
+ not cohort["eligibility"]["decision_grade"] for cohort in members
+ ):
+ raise PublisherError(
+ f"promotion requires exactly {expected} decision-grade {kind} cohorts"
+ )
+
+ chip_cohorts = [cohort for cohort in cohorts if cohort["kind"] == "chip"]
+ expected_chips = _expected_chip_cohort_count(series)
+ if len(chip_cohorts) != expected_chips or any(
+ not cohort["eligibility"]["decision_grade"] for cohort in chip_cohorts
+ ):
+ raise PublisherError(
+ f"promotion requires all {expected_chips} derived chip cohorts to be decision-grade"
+ )
+
+ by_id = {item["series_id"]: item for item in series}
+ anchors = {("uniform", False), ("zipf", False), ("zipf", True)}
+ for cohort in (
+ item for item in cohorts
+ if item["kind"] == "routing" and item["eligibility"]["decision_grade"]
+ ):
+ observed = {
+ (by_id[series_id]["workload"]["routing"], by_id[series_id]["workload"]["eplb"]):
+ by_id[series_id]
+ for series_id in cohort["series_ids"]
+ }
+ if len(cohort["series_ids"]) != len(anchors) or set(observed) != anchors:
+ raise PublisherError(
+ "promotion routing cohorts require exact uniform, zipf, and zipf+EPLB anchors"
+ )
+ if (
+ observed[("uniform", False)]["build"]["implementation_contract_sha256"]
+ != observed[("zipf", False)]["build"]["implementation_contract_sha256"]
+ ):
+ raise PublisherError(
+ "promotion routing cohorts require identical off-EPLB generated implementation"
+ )
+
+
+def _require_promotion_series(series: Sequence[dict[str, Any]]) -> None:
+ if not series or any(item["status"] != "decision-grade" for item in series):
+ raise PublisherError("promotion has unstable or incomplete required series")
+
+
+def build_dataset(
+ store: Store,
+ bundle_ids: Sequence[str],
+ *,
+ promote: bool,
+) -> dict[str, Any]:
+ # Assemble one public dataset document from the bundles named by bundle_ids.
+ # Each bundle is loaded from store. With promote=True the promotion checks below
+ # are enforced and the promotion status is "promoted"; otherwise it is "diagnostic".
+ # Raises PublisherError when the bundle IDs are empty or repeated, when the bundles
+ # disagree on matrix digest or case IDs, or when a promotion requirement fails.
+ if not bundle_ids or len(bundle_ids) != len(set(bundle_ids)):
+ raise PublisherError("dataset requires unique explicit bundle IDs")
+ loaded = [load_bundle(store, bundle_id) for bundle_id in bundle_ids]
+ # Order bundles by numeric run ID, then run attempt, then bundle ID.
+ loaded.sort(key=lambda bundle: (
+ int(bundle["manifest"]["run"]["run_id"]),
+ bundle["manifest"]["run"]["run_attempt"],
+ bundle["id"],
+ ))
+ # Every bundle must carry the same matrix digest and the same set of case IDs.
+ matrix_ids = {bundle["manifest"]["matrix"]["sha256"] for bundle in loaded}
+ case_sets = [{case["case_id"] for case in bundle["cases"]} for bundle in loaded]
+ if len(matrix_ids) != 1 or len({tuple(sorted(values)) for values in case_sets}) != 1:
+ raise PublisherError("dataset bundles do not share one exact requested matrix")
+ # Promotion needs exactly REQUIRED_ALLOCATIONS bundles, each with a distinct run ID.
+ run_ids = [bundle["manifest"]["run"]["run_id"] for bundle in loaded]
+ if promote and (
+ len(loaded) != REQUIRED_ALLOCATIONS
+ or len(run_ids) != len(set(run_ids))
+ ):
+ raise PublisherError("promotion requires three independent complete workflow runs")
+ if promote and matrix_ids != {CANONICAL_FULL_V1_MATRIX_SHA256}:
+ raise PublisherError("promotion requires the canonical full-v1 matrix")
+ # Case definitions come from the first bundle; the check above already established
+ # that all bundles list the same case IDs.
+ cases = {case["case_id"]: case for case in loaded[0]["cases"]}
+ if promote:
+ _require_runnable_promotion_success(loaded, cases)
+ all_documents = [
+ document for bundle in loaded for document in bundle["documents"].values()
+ ]
+ # Attempt IDs that any bundle's coverage manifest records as the selected attempt.
+ selected_ids = {
+ selection["selected_attempt_id"]
+ for bundle in loaded for selection in bundle["manifest"]["coverage"]["selections"]
+ }
+ public_attempts = [
+ _public_attempt(
+ document, selected=document["identity"]["attempt_id"] in selected_ids
+ )
+ for document in all_documents
+ ]
+ # NOTE(review): presumably rejects duplicate attempt IDs across bundles — confirm
+ # against _unique.
+ _unique([attempt["attempt_id"] for attempt in public_attempts], "dataset attempts")
+ # Per case: the selected document from each bundle, in the sorted bundle order.
+ selected_by_case: dict[str, list[dict[str, Any]]] = {
+ case_id: [bundle["selected"][case_id] for bundle in loaded]
+ for case_id in sorted(cases)
+ }
+ coverage: list[dict[str, Any]] = []
+ for case_id, case in sorted(cases.items()):
+ attempts = sorted(
+ (attempt for attempt in public_attempts if attempt["case_id"] == case_id),
+ key=lambda attempt: (
+ int(attempt["run_id"]), attempt["run_attempt"],
+ attempt["attempt_index"], attempt["attempt_id"],
+ ),
+ )
+ # The coverage row reports the selection from the last bundle in sorted order.
+ selected = _public_attempt(selected_by_case[case_id][-1], selected=True)
+ coverage.append({
+ "case_id": case_id,
+ "label": (
+ f"{case['sku'].upper()} / {case['backend']} / EP{case['ep']} / "
+ f"{case['mode']} / {case['phase']} / {case['routing']}"
+ ),
+ "required": True,
+ "sku": _slug(case["sku"]),
+ "backend": _slug(case["backend"]),
+ "mode": case["mode"],
+ "phase": case["phase"],
+ "topology": _coverage_topology(case),
+ "disposition": case["_disposition"],
+ "selected_attempt_id": selected["attempt_id"],
+ "outcome": selected["outcome"],
+ "failure_mode": selected["failure_mode"],
+ "reason": case["_reason"] if case["_disposition"] == "unsupported" else selected["reason"],
+ "attempt_ids": [attempt["attempt_id"] for attempt in attempts],
+ })
+ # Only selected documents in the raw format with a success outcome feed the series.
+ by_series: dict[str, list[dict[str, Any]]] = {}
+ for case_documents in selected_by_case.values():
+ for document in case_documents:
+ if (
+ document["format"] == contracts.RAW_FORMAT
+ and document["outcome"]["status"] == "success"
+ ):
+ by_series.setdefault(document["identity"]["series_id"], []).append(document)
+ series: list[dict[str, Any]] = []
+ internals: dict[str, dict[str, Any]] = {}
+ for series_id, documents in sorted(by_series.items()):
+ item, internal = _build_series(series_id, documents, len(loaded))
+ series.append(item)
+ internals[series_id] = internal
+ cohorts, rankings, recommendations, sensitivities = build_decisions(series, internals)
+ allocation_ids = sorted({attempt["allocation_id"] for attempt in public_attempts})
+ status = "promoted" if promote else "diagnostic"
+ dataset = {
+ "format": FORMAT_PUBLIC,
+ "schema_version": 1,
+ "generated_at": _latest_timestamp(
+ [bundle["manifest"]["created_at"] for bundle in loaded]
+ ),
+ "source_bundle_ids": sorted(bundle_ids),
+ "promotion": {
+ "status": status,
+ "reason": None,
+ "matrix_id": next(iter(matrix_ids)),
+ "allocation_ids": allocation_ids,
+ "required_allocations": REQUIRED_ALLOCATIONS,
+ "requested_cases": len(coverage),
+ "terminal_cases": len(coverage),
+ "policy": POLICY,
+ },
+ "coverage": coverage,
+ "attempts": sorted(public_attempts, key=lambda attempt: attempt["attempt_id"]),
+ "series": series,
+ "cohorts": cohorts,
+ "rankings": rankings,
+ "recommendations": recommendations,
+ "sensitivities": sensitivities,
+ }
+ # NOTE(review): indentation is collapsed in this patch view, so it cannot be told
+ # here whether validate_public_dataset runs for every dataset or only when
+ # promoting — confirm against the real file.
+ if promote:
+ _require_promotion_series(series)
+ _require_promotion_cohorts(cohorts, series)
+ validate_public_dataset(dataset)
+ return dataset
+
+
+def _quarantine_dataset(reason: str, generated_at: str) -> dict[str, Any]:
+ dataset = {
+ "format": FORMAT_PUBLIC,
+ "schema_version": 1,
+ "generated_at": generated_at,
+ "source_bundle_ids": [],
+ "promotion": {
+ "status": "quarantined",
+ "reason": reason,
+ "matrix_id": None,
+ "allocation_ids": [],
+ "required_allocations": REQUIRED_ALLOCATIONS,
+ "requested_cases": 0,
+ "terminal_cases": 0,
+ "policy": POLICY,
+ },
+ "coverage": [],
+ "attempts": [],
+ "series": [],
+ "cohorts": [],
+ "rankings": [],
+ "recommendations": [],
+ "sensitivities": [],
+ }
+ validate_public_dataset(dataset)
+ return dataset
+
+
+def quarantine_incoming(
+ store: Store, ingest_id: str, reason: str, generated_at: str
+) -> str:
+ # Record a quarantine manifest for a rejected incoming delivery, then install an
+ # empty quarantined dataset and point the "latest-attempt" channel at it.
+ # Returns the digest of the quarantine manifest.
+ # Raises PublisherError when reason, or reason combined with the incoming ID, does
+ # not fully match REASON, or when the stored quarantine.json does not hash to the
+ # expected digest.
+ if REASON.fullmatch(reason) is None:
+ raise PublisherError("quarantine reason must be a machine code")
+ # The public reason embeds the incoming ID and must satisfy the same pattern.
+ public_reason = f"{reason}-{ingest_id}"
+ if REASON.fullmatch(public_reason) is None:
+ raise PublisherError("quarantine reason and incoming ID exceed the public reason contract")
+ manifest = {
+ "format": "collectivex.quarantine.v1",
+ "schema_version": 1,
+ "created_at": generated_at,
+ "incoming_id": ingest_id,
+ "reason": reason,
+ }
+ # The digest of the manifest's canonical bytes names the quarantine object.
+ digest = _sha_bytes(_canonical(manifest))
+ # NOTE(review): indentation is collapsed in this patch view, so the exact extent of
+ # the staging block below cannot be read from here — confirm against the real file.
+ with store.staging(store.quarantine, private=True) as stage:
+ _write_json(stage / "quarantine.json", manifest, mode=0o600)
+ store.complete(stage, digest, private=True)
+ store.install(stage, store.quarantine / digest, private=True)
+ # Re-read the installed manifest and require it to hash to the same digest.
+ if _sha_bytes(_canonical(strict_load(store.quarantine / digest / "quarantine.json"))) != digest:
+ raise PublisherError("existing quarantine object differs")
+ # The incoming digest distinguishes separate rejected deliveries while preserving
+ # byte-identical output when the operator retries the same immutable input.
+ dataset = _quarantine_dataset(public_reason, generated_at)
+ dataset_digest, size = store.install_dataset(dataset)
+ store.update_channel("latest-attempt", dataset_digest, size, generated_at)
+ return digest
+
+
+def _store_from_args(args: argparse.Namespace) -> Store:
+ root = args.store_root or os.environ.get("COLLECTIVEX_STORE_ROOT")
+ if not root:
+ raise PublisherError("COLLECTIVEX_STORE_ROOT or --store-root is required")
+ if not Path(root).is_absolute():
+ raise PublisherError("COLLECTIVEX_STORE_ROOT must be an absolute path")
+ return Store(root)
+
+
+def _run_metadata(args: argparse.Namespace) -> dict[str, Any]:
+ """Validate offline operator assertions about a completed successful GHA run.
+
+ The publisher deliberately performs no network access. The caller must preflight workflow
+ identity and conclusion against GitHub before supplying these values; artifact-internal
+ provenance is then required to match them exactly.
+ """
+ run = {
+ "repository": args.repository,
+ "run_id": args.run_id,
+ "run_attempt": args.run_attempt,
+ "source_sha": args.source_sha,
+ }
+ # Reuse the authoritative private schema constraints before any filesystem mutation.
+ if not re.fullmatch(r"[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+", run["repository"] or ""):
+ raise PublisherError("--repository must be owner/name")
+ if not re.fullmatch(r"[1-9][0-9]*", run["run_id"] or ""):
+ raise PublisherError("--run-id must be a positive decimal string")
+ if type(run["run_attempt"]) is not int or run["run_attempt"] < 1:
+ raise PublisherError("--run-attempt must be positive")
+ if not re.fullmatch(r"[0-9a-f]{40}", run["source_sha"] or ""):
+ raise PublisherError("--source-sha must be a 40-character lowercase Git SHA")
+ return run
+
+
+def _ingest_inputs(
+ args: argparse.Namespace,
+) -> tuple[dict[str, Any], Path, list[Path]]:
+ run = _run_metadata(args)
+ matrix = Path(args.matrix).absolute()
+ if matrix.is_symlink() or not matrix.is_file():
+ raise PublisherError("--matrix must be a regular non-symlink file")
+ artifacts = [Path(value).absolute() for value in args.artifact]
+ if not artifacts:
+ raise PublisherError("at least one --artifact is required")
+ names = [_artifact_name(path) for path in artifacts]
+ if len(names) != len(set(names)):
+ raise PublisherError("--artifact contains duplicate GHA names")
+ for path in artifacts:
+ if path.is_symlink() or not (path.is_dir() or path.is_file()):
+ raise PublisherError("--artifact must be a regular ZIP or real directory")
+ return run, matrix, artifacts
+
+
+def _bundle_ids(values: Sequence[str], *, promote: bool) -> list[str]:
+ bundle_ids = list(values)
+ if (
+ not bundle_ids
+ or len(bundle_ids) != len(set(bundle_ids))
+ or any(HEX64.fullmatch(value) is None for value in bundle_ids)
+ ):
+ raise PublisherError("bundle IDs must be unique SHA-256 digests")
+ if promote and len(bundle_ids) != REQUIRED_ALLOCATIONS:
+ raise PublisherError("promotion requires exactly three explicit bundle IDs")
+ return bundle_ids
+
+
+def ingest_command(args: argparse.Namespace) -> dict[str, Any]:
+ # Archive one run's artifacts, build its bundle and a diagnostic (promote=False)
+ # dataset, and publish that dataset to the "latest-attempt" channel.
+ # Returns a status dict naming the incoming ID, bundle ID, dataset digest and channel.
+ # When one of the validation exception types listed below is raised, the incoming
+ # object is quarantined and a PublisherError chained to the original is raised.
+ # Arguments are validated before the store is resolved.
+ run, matrix, artifacts = _ingest_inputs(args)
+ store = _store_from_args(args)
+ # NOTE(review): indentation is collapsed in this patch view, so how much of the
+ # code below runs while the store lock is held cannot be read from here — confirm
+ # against the real file.
+ with store.locked():
+ ingest_id, incoming, _ = archive_incoming(
+ store, matrix, artifacts, run
+ )
+ try:
+ bundle_id, _, _ = build_bundle(store, ingest_id, incoming, run)
+ dataset = build_dataset(store, [bundle_id], promote=False)
+ dataset_id, size = store.install_dataset(dataset)
+ store.update_channel(
+ "latest-attempt", dataset_id, size, dataset["generated_at"]
+ )
+ store.verify_channel("latest-attempt")
+ return {
+ "status": "accepted", "incoming_id": ingest_id,
+ "bundle_id": bundle_id, "dataset_sha256": dataset_id,
+ "channel": "latest-attempt",
+ }
+ except (
+ PublisherError, contracts.ContractError, artifact_safety.ArtifactSafetyError,
+ jsonschema.ValidationError,
+ ) as exc:
+ # Invalid delivery bytes provide no trusted timestamp. A fixed sentinel keeps
+ # repeated quarantine of the same immutable incoming object content-idempotent.
+ generated_at = "1970-01-01T00:00:00Z"
+ quarantine_id = quarantine_incoming(
+ store, ingest_id, "artifact-validation-failed", generated_at
+ )
+ raise PublisherError(
+ f"incoming {ingest_id} quarantined as {quarantine_id}: {exc}"
+ ) from exc
+
+
+def promote_command(args: argparse.Namespace) -> dict[str, Any]:
+ # Build a dataset with promote=True from the bundle IDs given on the command line,
+ # install it, and point the "dev-latest" channel at it.
+ # Returns a status dict naming the bundle IDs, dataset digest and channel.
+ # Bundle IDs are validated (promote=True) before the store is resolved.
+ bundle_ids = _bundle_ids(args.bundle, promote=True)
+ store = _store_from_args(args)
+ # NOTE(review): indentation is collapsed in this patch view, so the exact extent of
+ # the locked section cannot be read from here — confirm against the real file.
+ with store.locked():
+ dataset = build_dataset(store, bundle_ids, promote=True)
+ digest, size = store.install_dataset(dataset)
+ store.update_channel("dev-latest", digest, size, dataset["generated_at"])
+ store.verify_channel("dev-latest")
+ return {
+ "status": "promoted", "bundle_ids": bundle_ids,
+ "dataset_sha256": digest, "channel": "dev-latest",
+ }
+
+
+def verify_command(args: argparse.Namespace) -> dict[str, Any]:
+ # Verify channel pointers and load any bundles listed on the command line.
+ # Returns a status dict with the verify_channel result per channel and the IDs of
+ # the bundles that were loaded.
+ # Raises PublisherError for a channel other than "latest-attempt" or "dev-latest".
+ bundle_ids = _bundle_ids(args.bundle, promote=False) if args.bundle else []
+ # With no --channel given, the default is "latest-attempt".
+ channels = args.channel or ["latest-attempt"]
+ if any(channel not in {"latest-attempt", "dev-latest"} for channel in channels):
+ raise PublisherError("unknown channel")
+ store = _store_from_args(args)
+ # When no channel was requested, "dev-latest" is also verified if its pointer file
+ # exists in the store's channel directory.
+ if args.channel is None and (store.channels / "dev-latest.json").is_file():
+ channels.append("dev-latest")
+ # NOTE(review): indentation is collapsed in this patch view, so the exact extent of
+ # the locked section cannot be read from here — confirm against the real file.
+ with store.locked():
+ pointers = {channel: store.verify_channel(channel) for channel in channels}
+ bundles = [load_bundle(store, bundle_id)["id"] for bundle_id in bundle_ids]
+ return {"status": "verified", "channels": pointers, "bundle_ids": bundles}
+
+
+def _parser() -> argparse.ArgumentParser:
+ parser = argparse.ArgumentParser(description="CollectiveX isolated filesystem publisher")
+ parser.add_argument("--store-root", help="defaults to COLLECTIVEX_STORE_ROOT")
+ subparsers = parser.add_subparsers(dest="command", required=True)
+ ingest = subparsers.add_parser("ingest", help="archive and validate one complete GHA run")
+ ingest.add_argument("--matrix", required=True)
+ ingest.add_argument("--artifact", action="append", required=True)
+ ingest.add_argument("--repository", required=True)
+ ingest.add_argument("--run-id", required=True)
+ ingest.add_argument("--run-attempt", required=True, type=int)
+ ingest.add_argument("--source-sha", required=True)
+ promote = subparsers.add_parser("promote", help="publish explicit independent bundles")
+ promote.add_argument("--bundle", action="append", required=True)
+ verify = subparsers.add_parser("verify", help="verify immutable targets and pointers")
+ verify.add_argument("--channel", action="append", choices=["latest-attempt", "dev-latest"])
+ verify.add_argument("--bundle", action="append", default=[])
+ return parser
+
+
+def main() -> int:
+ args = _parser().parse_args()
+ try:
+ if args.command == "ingest":
+ result = ingest_command(args)
+ elif args.command == "promote":
+ result = promote_command(args)
+ elif args.command == "verify":
+ result = verify_command(args)
+ else:
+ raise PublisherError(f"unknown command {args.command!r}")
+ except (
+ PublisherError, contracts.ContractError, artifact_safety.ArtifactSafetyError,
+ jsonschema.ValidationError, OSError,
+ ) as exc:
+ print(json.dumps({"status": "error", "error": str(exc)}), file=sys.stderr)
+ return 2
+ print(json.dumps(result, sort_keys=True))
+ return 0
+
+
+# When run as a script, exit the process with main()'s return code.
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/experimental/CollectiveX/requirements.txt b/experimental/CollectiveX/requirements.txt
new file mode 100644
index 0000000000..f68f97d83d
--- /dev/null
+++ b/experimental/CollectiveX/requirements.txt
@@ -0,0 +1,8 @@
+# Host-side matrix generation. GPU libraries are supplied by benchmark images.
+PyYAML==6.0.2
+
+# Canonical workload serialization.
+numpy>=1.26,<3
+
+# Host-only strict artifact publisher schemas (never imported by GPU execution).
+jsonschema==4.25.1
diff --git a/experimental/CollectiveX/runtime/common.sh b/experimental/CollectiveX/runtime/common.sh
new file mode 100644
index 0000000000..1f6a32c2da
--- /dev/null
+++ b/experimental/CollectiveX/runtime/common.sh
@@ -0,0 +1,2339 @@
+# shellcheck shell=bash
+# CollectiveX — shared launcher helpers (sourced, not executed).
+#
+# Cluster-generic scaffolding only (Slurm/container/build/staging); no
+# model-serving. Logging goes to stderr so functions can `echo` a single
+# result on stdout.
+
+# Absolute path of the parent of the directory that contains this sourced file.
+_CX_COMMON_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+# NOTE(review): squash-image format tag and fixed epoch; their consumers are
+# outside this section -- confirm how they are used before changing them.
+CX_SQUASH_FORMAT_VERSION="repro-v1"
+CX_SQUASH_SOURCE_DATE_EPOCH=1
+# Pinned 40-hex identifiers for the DeepEP V2 and hybrid sources. The names
+# suggest commit and tree hashes; the code that checks them is not shown here.
+CX_DEEPEP_V2_COMMIT="fa8a9b16898204afd347c663b89e65ef87dc6ce6" # pragma: allowlist secret
+CX_DEEPEP_V2_TREE="29809e75c5874e6609dac4804e7b651d5226959f" # pragma: allowlist secret
+CX_DEEPEP_V2_FMT_COMMIT="a4c7e17133ee9cb6a2f45545f6e974dd3c393efa" # pragma: allowlist secret
+CX_DEEPEP_HYBRID_COMMIT="e0a5b1d9848ab3e7b4a67842bf06f067bfac67f8" # pragma: allowlist secret
+CX_DEEPEP_HYBRID_TREE="d77aeab7f1bb52b615666fe178d26ced41fae08e" # pragma: allowlist secret
+CX_DEEPEP_HYBRID_NCCL_COMMIT="1e0c869c39bb33f1034cb9920bd2a8a8406f04a3" # pragma: allowlist secret
+# Clear loader state inherited from the environment; cx_load_operator_config
+# only honours COLLECTIVEX_OPERATOR_CONFIG_LOADED when it equals this shell's $$.
+unset COLLECTIVEX_OPERATOR_CONFIG_LOADED COLLECTIVEX_EPHEMERAL_CONFIG_PATH
+
+# Write one "[collectivex] "-prefixed diagnostic line to stderr.
+cx_log() {
+  printf '[collectivex] %s\n' "$*" >&2
+}
+# Write one "[collectivex] FATAL: " line to stderr, then exit the current
+# shell (or subshell) with status 1.
+cx_die() {
+  printf '[collectivex] FATAL: %s\n' "$*" >&2
+  exit 1
+}
+
+# Public failure telemetry is a closed vocabulary. Raw scheduler, container,
+# host, and filesystem diagnostics stay in the mode-0600 private logs.
+#
+# $1: stage name. A name in the vocabulary is exported as CX_FAILSAFE_MODE;
+# any other value is fatal.
+cx_set_failure_stage() {
+  local requested="$1" allowed
+  for allowed in \
+    setup repository-stage registry-verification scheduler-allocation \
+    container-import container-hash container-launch backend-setup \
+    execution artifact-collection; do
+    if [ "$requested" = "$allowed" ]; then
+      export CX_FAILSAFE_MODE="$requested"
+      return 0
+    fi
+  done
+  cx_die "invalid launcher failure stage"
+}
+
+# Record a failure stage and log a coarse diagnostic class for it.
+#
+# $1: stage name, validated and exported by cx_set_failure_stage.
+# $2: optional path of a log to classify.
+#
+# When $2 names a regular file it is searched with case-insensitive extended
+# regular expressions (grep -a treats it as text). The chain is ordered and the
+# first matching class wins, so reordering branches changes the result. A
+# non-empty log that matches nothing is "unclassified", an empty one is
+# "empty-log", and a missing or omitted log leaves "unknown".
+# Only the stage and the class are passed to cx_log. Always returns 1.
+cx_fail_stage() {
+ local stage="$1" log_path="${2:-}" diagnostic="unknown"
+ cx_set_failure_stage "$stage"
+ if [ -n "$log_path" ] && [ -f "$log_path" ]; then
+ if grep -aEqi 'no space left|disk quota|quota exceeded' "$log_path"; then
+ diagnostic="storage-capacity"
+ elif grep -aEqi 'permission denied|operation not permitted|read-only file system|source mount (creation|ownership validation|permission inspection|permission normalization|permission validation) failed' "$log_path"; then
+ diagnostic="storage-permission"
+ # The nccl.cu line-number patterns below are consulted only for CX_BENCH=deepep-v2.
+ elif grep -aEqi 'outside one realized LSA domain|lsa(Size| team| domain).*(mismatch|invalid|expected)|ranks.*not in (one|the same) nvlink.domain' "$log_path" \
+ || { [ "${CX_BENCH:-}" = deepep-v2 ] \
+ && grep -aEqi 'nccl[.]cu:(111|112)([^0-9]|$)' "$log_path"; }; then
+ diagnostic="accelerator-topology"
+ elif grep -aEqi 'cuda driver version is insufficient|call requires newer driver|cudaErrorCallRequiresNewerDriver|CUDA_ERROR_SYSTEM_DRIVER_MISMATCH|unsupported toolchain' "$log_path"; then
+ diagnostic="accelerator-driver"
+ elif grep -aEqi 'ncclDevCommCreate|ncclCommWindowRegister|ncclGetLsa(Device)?Pointer|Communicator does not support symmetric memory|Symmetric memory is not supported' "$log_path" \
+ || { [ "${CX_BENCH:-}" = deepep-v2 ] \
+ && grep -aEqi 'nccl[.]cu:(106|127|128|129|135)([^0-9]|$)' "$log_path"; }; then
+ diagnostic="nccl-device-api"
+ elif grep -aEqi 'NVCC (PTX )?compilation failed|cuobjdump failed|invalid device (kernel )?image|no kernel image is available' "$log_path"; then
+ diagnostic="jit-toolchain"
+ elif grep -aEqi 'cuda out of memory|CUDA_ERROR_OUT_OF_MEMORY|out of memory.*cuda' "$log_path"; then
+ diagnostic="accelerator-memory"
+ elif grep -aEqi 'does not match its pinned image contract|requires the exact pinned|version mismatch' "$log_path"; then
+ diagnostic="backend-version"
+ elif grep -aEqi 'nvshmem is unavailable|build-tool installation failed' "$log_path"; then
+ diagnostic="backend-dependency"
+ elif grep -aEqi 'revision fetch failed|submodule fetch failed|package installation failed|staged source is invalid|source (pin resolution|seed validation|seed copy|checkout creation|publication validation|existing source validation) failed' "$log_path"; then
+ diagnostic="backend-source"
+ elif grep -aEqi 'failed to mount|squashfs|enroot|pyxis|mount.*invalid argument|invalid argument.*mount' "$log_path"; then
+ diagnostic="container-runtime"
+ elif grep -aEqi 'backend preparation failed|build (failed|is incomplete)|cache (mount identity )?validation failed|import failed' "$log_path"; then
+ diagnostic="backend-build"
+ elif grep -aEqi 'command not found|not found on this runner|git lookup failed' "$log_path"; then
+ diagnostic="missing-runtime"
+ elif grep -aEqi 'too many requests|rate.?limit' "$log_path"; then
+ diagnostic="registry-rate-limit"
+ elif grep -aEqi 'timed out|operation timeout|wait timeout after|watchdog.*timeout|timeout: sending signal|connection reset|could not resolve|TLS|certificate' "$log_path"; then
+ diagnostic="network-or-timeout"
+ elif grep -aEqi 'salloc:|srun:.*(unable to create step|step creation|invalid partition|invalid account)|unable to create step|job allocation' "$log_path"; then
+ diagnostic="scheduler"
+ elif grep -aEqi 'SHARD done: [0-9]+/[0-9]+ case\(s\) failed|WARN: .* run failed rc=|completed with invalid semantic evidence' "$log_path"; then
+ diagnostic="benchmark-case-failure"
+ elif [ -s "$log_path" ]; then
+ diagnostic="unclassified"
+ else
+ diagnostic="empty-log"
+ fi
+ fi
+ cx_log "ERROR: failure-class=$stage diagnostic=$diagnostic"
+ # Non-zero on every path, so callers that must continue append `|| true`.
+ return 1
+}
+
+# Runner-local deployment settings are strict JSON kept outside the checkout.
+# Only the selected runner's allowlisted values are exported; the document is
+# never sourced or evaluated as shell.
+cx_load_operator_config() {
+ [ -n "${COLLECTIVEX_OPERATOR_CONFIG_LOADED:-}" ] \
+ && [ "$COLLECTIVEX_OPERATOR_CONFIG_LOADED" = "$$" ] && return 0
+ local config_path generated=0 parsed_path config_log key value
+ unset CX_PARTITION CX_ACCOUNT CX_SQUASH_DIR CX_STAGE_DIR CX_ENROOT_CACHE_PATH
+ unset ENROOT_CACHE_PATH
+ unset CX_EXCLUDE_NODES CX_NODELIST CX_LOCK_DIR CX_MASTER_PORT
+ unset CX_SOCKET_IFNAME CX_RDMA_DEVICES CX_IB_GID_INDEX CX_RDMA_SERVICE_LEVEL
+ unset MASTER_ADDR MASTER_PORT RANK WORLD_SIZE LOCAL_RANK LOCAL_WORLD_SIZE
+ config_path="${COLLECTIVEX_OPERATOR_CONFIG:-${XDG_CONFIG_HOME:-${HOME}/.config}/inferencex/collectivex.json}"
+ if [ -n "${COLLECTIVEX_OPERATOR_CONFIG_CONTENT:-}" ]; then
+ umask 077
+ if [[ "${CX_JOB_ROOT:-}" =~ ^/tmp/inferencex-collectivex-[0-9]+-[0-9]+-[A-Za-z0-9._-]+$ ]] \
+ && [ -d "$CX_JOB_ROOT" ] && [ ! -L "$CX_JOB_ROOT" ] \
+ && [ "$(stat -c '%u:%a' "$CX_JOB_ROOT" 2>/dev/null)" = "$(id -u):700" ]; then
+ config_path="$CX_JOB_ROOT/operator-config.json"
+ (set -C; : > "$config_path") 2>/dev/null \
+ || cx_die "cannot create ephemeral runner configuration"
+ else
+ config_path="$(mktemp /tmp/inferencex-collectivex-config.XXXXXX)" \
+ || cx_die "cannot create ephemeral runner configuration"
+ fi
+ COLLECTIVEX_EPHEMERAL_CONFIG_PATH="$config_path"
+ generated=1
+ if ! printf '%s' "$COLLECTIVEX_OPERATOR_CONFIG_CONTENT" > "$config_path"; then
+ unset COLLECTIVEX_OPERATOR_CONFIG_CONTENT
+ rm -f -- "$config_path"
+ unset COLLECTIVEX_EPHEMERAL_CONFIG_PATH
+ cx_die "cannot materialize runner configuration"
+ fi
+ elif [ "${COLLECTIVEX_OPERATOR_CONFIG_REQUIRED:-0}" = 1 ]; then
+ unset COLLECTIVEX_OPERATOR_CONFIG_CONTENT
+ cx_die "runner configuration is unavailable"
+ fi
+ unset COLLECTIVEX_OPERATOR_CONFIG_CONTENT COLLECTIVEX_OPERATOR_CONFIG_REQUIRED
+ if [ ! -e "$config_path" ]; then
+ COLLECTIVEX_OPERATOR_CONFIG_LOADED="$$"
+ return 0
+ fi
+ umask 077
+ parsed_path="$(mktemp /tmp/inferencex-collectivex-parsed.XXXXXX)" || {
+ [ "$generated" = 0 ] || rm -f -- "$config_path"
+ cx_die "cannot parse runner configuration"
+ }
+ config_log="$(cx_private_log_path operator-config)"
+ if ! python3 - "$config_path" "${CX_RUNNER:-${CX_SHARD_SKU:-${CX_PUBLIC_RUNNER:-}}}" \
+ > "$parsed_path" 2> "$config_log" <<'PY'
+import json
+import os
+import posixpath
+import re
+import stat
+import sys
+
+RUNNERS = {
+ "h100-dgxc", "h200-dgxc", "b200-dgxc", "b300",
+ "gb200", "gb300", "mi325x", "mi355x",
+}
+FIELDS = {
+ "partition": "CX_PARTITION",
+ "account": "CX_ACCOUNT",
+ "squash_dir": "CX_SQUASH_DIR",
+ "stage_dir": "CX_STAGE_DIR",
+ "enroot_cache_path": "CX_ENROOT_CACHE_PATH",
+ "exclude_nodes": "CX_EXCLUDE_NODES",
+ "nodelist": "CX_NODELIST",
+ "lock_dir": "CX_LOCK_DIR",
+ "socket_ifname": "CX_SOCKET_IFNAME",
+ "rdma_devices": "CX_RDMA_DEVICES",
+ "ib_gid_index": "CX_IB_GID_INDEX",
+ "rdma_service_level": "CX_RDMA_SERVICE_LEVEL",
+}
+NETWORK_FIELDS = {
+ "socket_ifname", "rdma_devices", "ib_gid_index", "rdma_service_level",
+}
+REQUIRED = {
+ "h100-dgxc": {"partition", "account", "squash_dir", "stage_dir"},
+ "h200-dgxc": {"partition", "squash_dir", "stage_dir"},
+ "b200-dgxc": {"partition", "account", "squash_dir", "stage_dir"},
+ "b300": {
+ "partition", "account", "squash_dir", "stage_dir",
+ },
+ "gb200": {"partition", "account", "storage_roots"},
+ "gb300": {"partition", "account", "squash_dir", "stage_dir", "enroot_cache_path"},
+ "mi325x": {"partition", "squash_dir", "stage_dir"},
+ "mi355x": {"partition", "squash_dir", "stage_dir"},
+}
+ALLOWED = {
+ "h100-dgxc": REQUIRED["h100-dgxc"] | {"exclude_nodes", "stage_dir"} | NETWORK_FIELDS,
+ "h200-dgxc": REQUIRED["h200-dgxc"] | {"account", "exclude_nodes", "stage_dir"} | NETWORK_FIELDS,
+ "b200-dgxc": REQUIRED["b200-dgxc"] | {"exclude_nodes", "stage_dir"} | NETWORK_FIELDS,
+ "b300": REQUIRED["b300"] | {"exclude_nodes"} | NETWORK_FIELDS,
+ "gb200": REQUIRED["gb200"] | NETWORK_FIELDS,
+ "gb300": REQUIRED["gb300"] | NETWORK_FIELDS,
+ "mi325x": REQUIRED["mi325x"] | {"exclude_nodes", "nodelist", "stage_dir", "lock_dir"} | NETWORK_FIELDS,
+ "mi355x": REQUIRED["mi355x"] | {"exclude_nodes", "nodelist", "stage_dir", "lock_dir"} | NETWORK_FIELDS,
+}
+TOKEN = re.compile(r"^[A-Za-z0-9_.\[\],-]+$")
+PATH = re.compile(r"^/[A-Za-z0-9._/+\-]+$")
+IPV4 = re.compile(r"(? 65536
+ ):
+ raise ValueError
+ flags = os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)
+ descriptor = os.open(path, flags)
+ try:
+ opened = os.fstat(descriptor)
+ if (opened.st_dev, opened.st_ino) != (metadata.st_dev, metadata.st_ino):
+ raise ValueError
+ payload = b""
+ while len(payload) <= 65536:
+ chunk = os.read(descriptor, 65537 - len(payload))
+ if not chunk:
+ break
+ payload += chunk
+ document = json.loads(
+ payload.decode("utf-8"),
+ object_pairs_hook=pairs,
+ parse_constant=lambda _: (_ for _ in ()).throw(ValueError()),
+ )
+ finally:
+ os.close(descriptor)
+ if (
+ set(document) != {"schema_version", "runners"}
+ or type(document["schema_version"]) is not int
+ or document["schema_version"] != 1
+ ):
+ raise ValueError
+ runners = document["runners"]
+ if (
+ not isinstance(runners, dict) or not runners or set(runners) - RUNNERS
+ or runner not in runners
+ ):
+ raise ValueError
+ selected = None
+ for name, config in runners.items():
+ if (
+ not isinstance(config, dict)
+ or (name == runner and not REQUIRED[name].issubset(config))
+ ):
+ raise ValueError
+ if set(config) - ALLOWED[name]:
+ raise ValueError
+ for field, value in config.items():
+ if field == "storage_roots":
+ if (
+ not isinstance(value, list) or not 1 <= len(value) <= 16
+ or len(value) != len(set(value)) or not all(valid_path(item) for item in value)
+ ):
+ raise ValueError
+ elif field == "socket_ifname":
+ if not isinstance(value, str) or not INTERFACES.fullmatch(value):
+ raise ValueError
+ elif field == "rdma_devices":
+ if not isinstance(value, str) or not RDMA_DEVICES.fullmatch(value):
+ raise ValueError
+ elif field == "ib_gid_index":
+ if type(value) is not int or not 0 <= value <= 255:
+ raise ValueError
+ elif field == "rdma_service_level":
+ if type(value) is not int or not 0 <= value <= 15:
+ raise ValueError
+ elif field.endswith(("_dir", "_path")):
+ if not valid_path(value):
+ raise ValueError
+ elif (
+ not isinstance(value, str) or not value or len(value) > 512
+ or not TOKEN.fullmatch(value) or IPV4.search(value)
+ ):
+ raise ValueError
+ if name == runner:
+ selected = dict(config)
+ if selected is None:
+ raise ValueError
+ roots = selected.pop("storage_roots", None)
+ if roots is not None:
+ for root in roots:
+ squash = posixpath.join(root, "collectivex", "containers")
+ stage = posixpath.join(root, "collectivex", "stage")
+ probes = []
+ try:
+ for directory in (squash, stage):
+ os.makedirs(directory, mode=0o700, exist_ok=True)
+ probe = posixpath.join(directory, f".write-probe-{os.getpid()}")
+ fd = os.open(probe, os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o600)
+ os.close(fd)
+ probes.append(probe)
+ selected.update(squash_dir=squash, stage_dir=stage)
+ break
+ except OSError:
+ pass
+ finally:
+ for probe in probes:
+ try:
+ os.unlink(probe)
+ except OSError:
+ pass
+ else:
+ raise ValueError
+ for field, value in selected.items():
+ key = FIELDS[field]
+ sys.stdout.buffer.write(
+ key.encode() + b"\0" + str(value).encode() + b"\0"
+ )
+except (KeyError, OSError, TypeError, UnicodeError, ValueError):
+ raise SystemExit(1)
+PY
+ then
+ rm -f -- "$parsed_path"
+ [ "$generated" = 0 ] || rm -f -- "$config_path"
+ unset COLLECTIVEX_EPHEMERAL_CONFIG_PATH
+ unset COLLECTIVEX_OPERATOR_CONFIG COLLECTIVEX_OPERATOR_CONFIG_EPHEMERAL
+ cx_die "runner-local configuration failed"
+ fi
+ while IFS= read -r -d '' key && IFS= read -r -d '' value; do
+ printf -v "$key" '%s' "$value"
+ export "${key?}"
+ done < "$parsed_path"
+ rm -f -- "$parsed_path"
+ if [ "$generated" = 1 ] || [ "${COLLECTIVEX_OPERATOR_CONFIG_EPHEMERAL:-0}" = 1 ]; then
+ rm -f -- "$config_path" || cx_die "cannot remove ephemeral runner configuration"
+ fi
+ unset COLLECTIVEX_EPHEMERAL_CONFIG_PATH
+ unset COLLECTIVEX_OPERATOR_CONFIG COLLECTIVEX_OPERATOR_CONFIG_EPHEMERAL
+ COLLECTIVEX_OPERATOR_CONFIG_LOADED="$$"
+}
+
+# Create an empty private log file and print its path (no trailing newline).
+#
+# $1: log label. The directory tag is COLLECTIVEX_EXECUTION_ID, or
+# "manual_<pid of this shell>" when that is unset. Both must match
+# [A-Za-z0-9][A-Za-z0-9._-]*.
+#
+# The embedded program works under /tmp/inferencex-collectivex-<uid>:
+# - creates the root and the per-tag directory with mode 0700 when missing and
+# refuses either one unless it is owned by the caller with mode exactly 0700;
+# - removes other directories in the root whose mtime is more than 86400
+# seconds old (errors while pruning are ignored);
+# - creates <label>.log with O_CREAT|O_EXCL and mode 0600, so asking for the
+# same label twice under one tag fails.
+# Any failure ends in cx_die. Callers capture the path with $(...), so that
+# exit only terminates the command substitution and they must check its status.
+cx_private_log_path() {
+ local label="$1" tag="${COLLECTIVEX_EXECUTION_ID:-manual_$$}" path
+ # The program's own stderr is discarded; only its exit status is used.
+ path="$(python3 - "$tag" "$label" <<'PY' 2>/dev/null
+import os
+import re
+import shutil
+import stat
+import sys
+import time
+
+tag, label = sys.argv[1:]
+if not all(re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9._-]*", value) for value in (tag, label)):
+ raise SystemExit(1)
+root = f"/tmp/inferencex-collectivex-{os.getuid()}"
+old_umask = os.umask(0o077)
+flags = os.O_RDONLY | os.O_DIRECTORY | getattr(os, "O_NOFOLLOW", 0)
+try:
+ try:
+ os.mkdir(root, 0o700)
+ except FileExistsError:
+ pass
+ root_fd = os.open(root, flags)
+ try:
+ metadata = os.fstat(root_fd)
+ if metadata.st_uid != os.getuid() or stat.S_IMODE(metadata.st_mode) != 0o700:
+ raise OSError("unsafe root")
+ cutoff = time.time() - 86400
+ for entry in os.scandir(root):
+ try:
+ if (
+ entry.name != tag and entry.is_dir(follow_symlinks=False)
+ and entry.stat(follow_symlinks=False).st_mtime < cutoff
+ ):
+ shutil.rmtree(entry.path)
+ except OSError:
+ pass
+ try:
+ os.mkdir(tag, 0o700, dir_fd=root_fd)
+ except FileExistsError:
+ pass
+ directory_fd = os.open(tag, flags, dir_fd=root_fd)
+ try:
+ metadata = os.fstat(directory_fd)
+ if metadata.st_uid != os.getuid() or stat.S_IMODE(metadata.st_mode) != 0o700:
+ raise OSError("unsafe directory")
+ log_flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL | getattr(os, "O_NOFOLLOW", 0)
+ log_fd = os.open(f"{label}.log", log_flags, 0o600, dir_fd=directory_fd)
+ os.close(log_fd)
+ finally:
+ os.close(directory_fd)
+ finally:
+ os.close(root_fd)
+finally:
+ os.umask(old_umask)
+print(f"{root}/{tag}/{label}.log", end="")
+PY
+)" || cx_die "cannot create private runtime log"
+ printf '%s' "$path"
+}
+
+# Manual successes delete diagnostics immediately. Canonical workflow logs survive
+# until artifact upload succeeds; failed logs remain private for debugging, and a
+# later run prunes abandoned directories older than 24 hours.
+#
+# $1: exit status of the run. Any non-zero status leaves the logs in place.
+# For status 0 the per-tag directory under /tmp/inferencex-collectivex-<uid> is
+# removed, but only after the root is confirmed to be owned by the caller with
+# mode exactly 0700. Cleanup is best effort: all output is discarded and the
+# embedded program's failure is ignored.
+cx_cleanup_private_logs() {
+ local rc="$1" tag="${COLLECTIVEX_EXECUTION_ID:-manual_$$}"
+ # Keep diagnostics whenever the run did not succeed.
+ [ "$rc" = 0 ] || return 0
+ python3 - "$tag" <<'PY' >/dev/null 2>&1 || true
+import os
+import re
+import shutil
+import stat
+import sys
+
+tag = sys.argv[1]
+if not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9._-]*", tag):
+ raise SystemExit(1)
+root = f"/tmp/inferencex-collectivex-{os.getuid()}"
+flags = os.O_RDONLY | os.O_DIRECTORY | getattr(os, "O_NOFOLLOW", 0)
+root_fd = os.open(root, flags)
+try:
+ metadata = os.fstat(root_fd)
+ if metadata.st_uid != os.getuid() or stat.S_IMODE(metadata.st_mode) != 0o700:
+ raise SystemExit(1)
+finally:
+ os.close(root_fd)
+path = os.path.join(root, tag)
+if os.path.isdir(path) and not os.path.islink(path):
+ shutil.rmtree(path)
+PY
+}
+
+# Explicit Slurm export boundary. Operator config, runner credentials, HOME,
+# workspace paths, and unrelated service secrets never enter the container.
+#
+# Prints one comma-separated list of variable names with no trailing newline.
+# The second printf appends MORI_COMMIT to the same list, so the output is a
+# single token. Adding a name here is what allows a variable through this list.
+cx_container_exports() {
+ printf '%s' 'COLLECTIVEX_SOURCE_SHA,COLLECTIVEX_ARTIFACT_NAME,COLLECTIVEX_EXECUTION_ID,COLLECTIVEX_CONTROL_SHA256,COLLECTIVEX_IMAGE,COLLECTIVEX_IMAGE_DIGEST,COLLECTIVEX_IMAGE_DIGEST_VERIFIED,COLLECTIVEX_SQUASH_SHA256,GITHUB_REF_NAME,GITHUB_REF,GITHUB_REPOSITORY,GITHUB_JOB,GITHUB_RUN_ID,GITHUB_RUN_ATTEMPT,GITHUB_SHA,CX_RUNNER,CX_BENCH,CX_NODES,CX_GPUS_PER_NODE,CX_SCALE_UP_DOMAIN,CX_SHARD_FILE,CX_SHARD_SKU,CX_NGPUS,CX_TS,CX_TOPO,CX_SCOPE,CX_TRANSPORT,CX_SCALE_UP_TRANSPORT,CX_SCALE_OUT_TRANSPORT,CX_MODE,CX_PHASE,CX_ROUTING,CX_EPLB,CX_CASE_ID,CX_SUITE,CX_WORKLOAD_NAME,CX_REQUIRED_PUBLICATION,CX_HIDDEN,CX_TOPK,CX_EXPERTS,CX_TOKENS_LADDER,CX_CANONICAL,CX_ITERS,CX_TRIALS,CX_WARMUP,CX_SAMPLES_PER_POINT,CX_WARMUP_SEMANTICS,CX_SEED,CX_RUN_TIMEOUT,CX_NCCL_HOME,CX_ALLOW_MNNVL,CX_ATTEMPT_ID,CX_RUNTIME_MARKER,CX_MORI_KERNEL_TYPE,CX_WORKLOAD_DIR,CX_BACKEND_CACHE_ROOT,CX_BACKEND_CACHE_SENTINEL_SHA256,CX_BACKEND_SOURCE_ROOT,MASTER_ADDR,MASTER_PORT,RANK,WORLD_SIZE,LOCAL_RANK,LOCAL_WORLD_SIZE,NCCL_NET,NCCL_SOCKET_IFNAME,GLOO_SOCKET_IFNAME,NCCL_IB_HCA,NCCL_IB_GID_INDEX,NCCL_IB_SL,NVSHMEM_HCA_LIST,NVSHMEM_IB_GID_INDEX,NVSHMEM_IB_SL,NVSHMEM_IB_ENABLE_IBGDA,NVSHMEM_IBGDA_NIC_HANDLER,EP_NIC_NAME,EP_OVERRIDE_RDMA_SL,UCCL_SOCKET_IFNAME,UCCL_IB_GID_INDEX,UCCL_IB_SL,MORI_RDMA_DEVICES,HYBRID_EP_MULTINODE,USE_NIXL,RDMA_CORE_HOME,DEEPEP_HYBRID_BUILD_MODE,NCCL_CUMEM_ENABLE,NCCL_MNNVL_ENABLE,MC_FORCE_MNNVL,MORI_DISABLE_AUTO_XGMI,MORI_ENABLE_SDMA,MORI_APP_LOG_LEVEL,MORI_SHMEM_LOG_LEVEL,MORI_IO_LOG_LEVEL'
+ printf '%s' ',MORI_COMMIT'
+}
+
+# Host-side utility steps need only the basic login paths. They never receive
+# the complete Actions or runner environment.
+#
+# Prints the comma-separated variable names with no trailing newline.
+cx_host_exports() {
+  local -a names=(HOME PATH USER XDG_CACHE_HOME ENROOT_CACHE_PATH)
+  local IFS=,
+  printf '%s' "${names[*]}"
+}
+
+# Reserve the runtime stage marker for this execution.
+#
+# $1: staged checkout root. The marker name is derived from
+# COLLECTIVEX_EXECUTION_ID (falling back to CX_TS); any stale marker file is
+# removed and the relative marker path is exported as CX_RUNTIME_MARKER.
+cx_prepare_runtime_marker() {
+  local mount_src="$1" tag="${COLLECTIVEX_EXECUTION_ID:-${CX_TS:-}}" marker
+  local collectivex_root="$mount_src/experimental/CollectiveX"
+  if ! [[ "$tag" =~ ^[A-Za-z0-9][A-Za-z0-9._-]*$ ]]; then
+    cx_die "cannot create runtime stage marker"
+  fi
+  marker=".shards/runtime-stage-${tag}.txt"
+  if ! mkdir -p "$collectivex_root/.shards" >/dev/null 2>&1; then
+    cx_die "cannot create runtime stage marker"
+  fi
+  if ! rm -f -- "$collectivex_root/$marker" >/dev/null 2>&1; then
+    cx_die "cannot reset runtime stage marker"
+  fi
+  export CX_RUNTIME_MARKER="$marker"
+}
+
+# Record the current runtime stage in the marker named by CX_RUNTIME_MARKER.
+#
+# $1: "backend-setup" or "execution". Returns 0 without writing when no marker
+# is configured, 1 for a malformed marker path or any other stage name. The
+# marker path is relative, so the write lands under the current directory.
+cx_write_runtime_stage() {
+  local stage="$1" marker="${CX_RUNTIME_MARKER:-}"
+  if [ -z "$marker" ]; then
+    return 0
+  fi
+  if ! [[ "$marker" =~ ^\.shards/runtime-stage-[A-Za-z0-9][A-Za-z0-9._-]*\.txt$ ]]; then
+    return 1
+  fi
+  if [ "$stage" != backend-setup ] && [ "$stage" != execution ]; then
+    return 1
+  fi
+  printf '%s\n' "$stage" > "$marker"
+}
+
+# Read back the stage written to the runtime marker, delete the marker, and
+# adopt the stage as the failure stage when it is one of the two names the
+# marker may contain.
+#
+# $1: staged checkout root. A missing, malformed, or absent marker is not an error.
+cx_adopt_runtime_stage() {
+  local mount_src="$1" marker="${CX_RUNTIME_MARKER:-}" stage="" marker_file
+  [ -n "$marker" ] || return 0
+  [[ "$marker" =~ ^\.shards/runtime-stage-[A-Za-z0-9][A-Za-z0-9._-]*\.txt$ ]] \
+    || return 0
+  marker_file="$mount_src/experimental/CollectiveX/$marker"
+  [ -f "$marker_file" ] || return 0
+  # Only the first line is read; a missing trailing newline is tolerated.
+  IFS= read -r stage < "$marker_file" || true
+  rm -f -- "$marker_file" >/dev/null 2>&1 || true
+  if [ "$stage" = backend-setup ] || [ "$stage" = execution ]; then
+    cx_set_failure_stage "$stage"
+  fi
+}
+
+# Die unless every named variable is set to a non-empty value.
+#
+# $@: variable names, resolved by indirect expansion. All missing names are
+# reported together in one fatal message.
+cx_require_vars() {
+  local name
+  local -a missing=()
+  for name in "$@"; do
+    if [ -z "${!name:-}" ]; then
+      missing+=("$name")
+    fi
+  done
+  if [ "${#missing[@]}" -ne 0 ]; then
+    cx_die \
+      "missing runner-local configuration: ${missing[*]} (set them in COLLECTIVEX_OPERATOR_CONFIG)"
+  fi
+}
+
+# Succeed when $1 is "1", "true", or "yes" in any letter case; fail otherwise.
+cx_bool_enabled() {
+  local lowered
+  lowered="$(printf '%s' "$1" | tr '[:upper:]' '[:lower:]')"
+  [ "$lowered" = 1 ] || [ "$lowered" = true ] || [ "$lowered" = yes ]
+}
+
+# Die when any argument contains a pipe, newline, or carriage return, the
+# characters this check treats as record delimiters.
+cx_require_record_safe() {
+  local value
+  for value in "$@"; do
+    if [[ "$value" == *'|'* || "$value" == *$'\n'* || "$value" == *$'\r'* ]]; then
+      cx_die "manual case field contains a record delimiter"
+    fi
+  done
+}
+
+# Die unless CX_NODES is "1" (an unset CX_NODES counts as 1).
+# $1: backend label used in the fatal message.
+cx_require_single_node() {
+  if [ "${CX_NODES:-1}" != "1" ]; then
+    cx_die "$1 supports one-node EP only"
+  fi
+}
+
+# Convert private, runner-local network selectors into the public library
+# variables needed inside the container. Values are interface/HCA identifiers,
+# never addresses; the rendezvous hostname is derived from the allocation.
+#
+# $1: node count (positive integer). $2: transport name.
+# Every derived variable is unset first, so a single-node or "mnnvl" call
+# leaves none of them behind. Otherwise CX_SOCKET_IFNAME and CX_RDMA_DEVICES
+# are both mandatory, while CX_IB_GID_INDEX and CX_RDMA_SERVICE_LEVEL are
+# applied only when set. Any malformed selector is fatal.
+cx_apply_network_profile() {
+ local nodes="$1" transport="$2" selector rdma_name rdma_names="" ep_nic=""
+ local -a selectors
+ [[ "$nodes" =~ ^[1-9][0-9]*$ ]] || cx_die "invalid network placement"
+ # Start from a clean slate so values inherited from the caller cannot leak through.
+ unset NCCL_NET NCCL_SOCKET_IFNAME GLOO_SOCKET_IFNAME NCCL_IB_HCA
+ unset NCCL_IB_GID_INDEX NCCL_IB_SL
+ unset NVSHMEM_HCA_LIST NVSHMEM_IB_GID_INDEX NVSHMEM_IB_SL
+ unset NVSHMEM_IB_ENABLE_IBGDA NVSHMEM_IBGDA_NIC_HANDLER
+ unset EP_NIC_NAME EP_OVERRIDE_RDMA_SL
+ unset UCCL_SOCKET_IFNAME UCCL_IB_GID_INDEX UCCL_IB_SL MORI_RDMA_DEVICES
+ # Nothing to export for one node or for the mnnvl transport.
+ [ "$nodes" -gt 1 ] && [ "$transport" != mnnvl ] || return 0
+ [ -n "${CX_SOCKET_IFNAME:-}" ] && [ -n "${CX_RDMA_DEVICES:-}" ] \
+ || cx_die "multi-node execution requires private socket and RDMA selectors"
+ if [ -n "${CX_SOCKET_IFNAME:-}" ]; then
+ # Comma-separated interface names, each at most 32 characters.
+ [[ "$CX_SOCKET_IFNAME" =~ ^[A-Za-z][A-Za-z0-9_.-]{0,31}(,[A-Za-z][A-Za-z0-9_.-]{0,31})*$ ]] \
+ || cx_die "invalid private socket interface selector"
+ export NCCL_SOCKET_IFNAME="$CX_SOCKET_IFNAME" GLOO_SOCKET_IFNAME="$CX_SOCKET_IFNAME"
+ export UCCL_SOCKET_IFNAME="$CX_SOCKET_IFNAME"
+ fi
+ if [ -n "${CX_RDMA_DEVICES:-}" ]; then
+ # Comma-separated device names, each optionally followed by ":<port>".
+ [[ "$CX_RDMA_DEVICES" =~ ^[A-Za-z][A-Za-z0-9_.-]{0,31}(:[1-9][0-9]*)?(,[A-Za-z][A-Za-z0-9_.-]{0,31}(:[1-9][0-9]*)?)*$ ]] \
+ || cx_die "invalid private RDMA device selector"
+ IFS=, read -r -a selectors <<< "$CX_RDMA_DEVICES"
+ # Build the port-less device list; the first device also becomes EP_NIC_NAME.
+ for selector in "${selectors[@]}"; do
+ rdma_name="${selector%%:*}"
+ rdma_names="${rdma_names}${rdma_names:+,}${rdma_name}"
+ [ -n "$ep_nic" ] || ep_nic="$rdma_name"
+ done
+ # NCCL and NVSHMEM receive the selectors with ports; NCCL's value gains a
+ # leading "=" (documented by NCCL as exact-match -- confirm against the NCCL version in use).
+ export NCCL_NET=IB NCCL_IB_HCA="=$CX_RDMA_DEVICES"
+ export NVSHMEM_HCA_LIST="$CX_RDMA_DEVICES"
+ export MORI_RDMA_DEVICES="$rdma_names" EP_NIC_NAME="$ep_nic"
+ fi
+ if [ -n "${CX_IB_GID_INDEX:-}" ]; then
+ [[ "$CX_IB_GID_INDEX" =~ ^[0-9]+$ ]] && [ "$CX_IB_GID_INDEX" -le 255 ] \
+ || cx_die "invalid private IB GID index"
+ export NCCL_IB_GID_INDEX="$CX_IB_GID_INDEX" NVSHMEM_IB_GID_INDEX="$CX_IB_GID_INDEX"
+ export UCCL_IB_GID_INDEX="$CX_IB_GID_INDEX"
+ fi
+ if [ -n "${CX_RDMA_SERVICE_LEVEL:-}" ]; then
+ [[ "$CX_RDMA_SERVICE_LEVEL" =~ ^[0-9]+$ ]] && [ "$CX_RDMA_SERVICE_LEVEL" -le 15 ] \
+ || cx_die "invalid private RDMA service level"
+ export NCCL_IB_SL="$CX_RDMA_SERVICE_LEVEL" NVSHMEM_IB_SL="$CX_RDMA_SERVICE_LEVEL"
+ export UCCL_IB_SL="$CX_RDMA_SERVICE_LEVEL"
+ export EP_OVERRIDE_RDMA_SL="$CX_RDMA_SERVICE_LEVEL"
+ fi
+ export NVSHMEM_IB_ENABLE_IBGDA=1 NVSHMEM_IBGDA_NIC_HANDLER=gpu
+}
+
+# Prove that the operator-pinned scale-out fabric exists on every allocated
+# node before image import or backend initialization. Selector values and node
+# diagnostics stay in the runner-private log.
+#
+# $1: Slurm job id. $2: node count. $3: transport name.
+# Returns 0 when no check is needed (one node, or the mnnvl transport) or when
+# every node passes; 1 for invalid arguments or missing selectors; otherwise
+# the failing srun status after recording the "setup" failure stage.
+cx_validate_network_profile_on_job() {
+  local job_id="$1" nodes="$2" transport="$3" log rc=0
+  # Validate the count before using it arithmetically: with a non-numeric value
+  # `[ -gt ]` fails, and the skip guard below would report that as success.
+  [[ "$nodes" =~ ^[1-9][0-9]*$ ]] || return 1
+  [ "$nodes" -gt 1 ] && [ "$transport" != mnnvl ] || return 0
+  [[ "$job_id" =~ ^[1-9][0-9]*$ ]] || return 1
+  [ -n "${CX_SOCKET_IFNAME:-}" ] && [ -n "${CX_RDMA_DEVICES:-}" ] \
+    || return 1
+  log="$(cx_private_log_path network-profile)" || return 1
+  # One task per node; the script arrives on stdin and sees only the host
+  # allowlist plus the three selectors it validates.
+  srun --jobid="$job_id" --nodes="$nodes" --ntasks="$nodes" --ntasks-per-node=1 \
+    --chdir=/tmp --input=all \
+    --export="$(cx_host_exports),CX_SOCKET_IFNAME,CX_RDMA_DEVICES,CX_IB_GID_INDEX" \
+    bash -s > "$log" 2>&1 <<'BASH' || rc=$?
+set -euo pipefail
+[[ "$CX_SOCKET_IFNAME" =~ ^[A-Za-z][A-Za-z0-9_.-]{0,31}(,[A-Za-z][A-Za-z0-9_.-]{0,31})*$ ]]
+[[ "$CX_RDMA_DEVICES" =~ ^[A-Za-z][A-Za-z0-9_.-]{0,31}(:[1-9][0-9]*)?(,[A-Za-z][A-Za-z0-9_.-]{0,31}(:[1-9][0-9]*)?)*$ ]]
+if [ -n "${CX_IB_GID_INDEX:-}" ]; then
+  # errexit ignores a failure on the left of `&&`, so reject a malformed
+  # index explicitly instead of relying on `set -e`.
+  { [[ "$CX_IB_GID_INDEX" =~ ^[0-9]+$ ]] && [ "$CX_IB_GID_INDEX" -le 255 ]; } || exit 1
+fi
+# Every configured socket interface must exist and be up (or report "unknown").
+IFS=, read -r -a interfaces <<< "$CX_SOCKET_IFNAME"
+for interface in "${interfaces[@]}"; do
+  [ -d "/sys/class/net/$interface" ]
+  state="$(cat "/sys/class/net/$interface/operstate")"
+  [ "$state" = up ] || [ "$state" = unknown ]
+done
+# A port passes when its state field is "4:" and, if a GID index is configured,
+# that GID entry is readable and not all zeros.
+check_port() {
+  local port_path="$1" state gid
+  [ -d "$port_path" ] || return 1
+  read -r state _ < "$port_path/state"
+  [ "$state" = 4: ] || return 1
+  if [ -n "${CX_IB_GID_INDEX:-}" ]; then
+    [ -r "$port_path/gids/$CX_IB_GID_INDEX" ] || return 1
+    gid="$(tr -d ':0[:space:]' < "$port_path/gids/$CX_IB_GID_INDEX")"
+    [ -n "$gid" ] || return 1
+  fi
+}
+# A selector with an explicit port checks that port; otherwise any passing
+# port on the device is enough.
+IFS=, read -r -a devices <<< "$CX_RDMA_DEVICES"
+for selector in "${devices[@]}"; do
+  device="${selector%%:*}"
+  configured_port=""
+  [ "$selector" = "$device" ] || configured_port="${selector#*:}"
+  ports="/sys/class/infiniband/$device/ports"
+  [ -d "$ports" ]
+  if [ -n "$configured_port" ]; then
+    check_port "$ports/$configured_port"
+  else
+    active=0
+    for port_path in "$ports"/*; do
+      if check_port "$port_path"; then
+        active=1
+        break
+      fi
+    done
+    [ "$active" = 1 ]
+  fi
+done
+BASH
+  if [ "$rc" != 0 ]; then
+    cx_fail_stage setup "$log" || true
+    return "$rc"
+  fi
+}
+
+# Export MASTER_ADDR and MASTER_PORT for a Slurm allocation.
+#
+# $1: Slurm job id. The address is the first hostname of the job's node list;
+# the port is CX_MASTER_PORT, defaulting to 29551. Any invalid value is fatal.
+cx_resolve_slurm_rendezvous() {
+  local job_id="$1" nodes master_addr master_port
+  if ! [[ "$job_id" =~ ^[1-9][0-9]*$ ]]; then
+    cx_die "invalid rendezvous allocation"
+  fi
+  nodes="$(squeue -j "$job_id" -h -o %N 2>/dev/null)"
+  master_addr="$(scontrol show hostnames "$nodes" 2>/dev/null | head -n1)"
+  master_port="${CX_MASTER_PORT:-29551}"
+  if ! [[ "$master_addr" =~ ^[A-Za-z0-9][A-Za-z0-9._-]*$ ]]; then
+    cx_die "could not resolve the allocated primary node"
+  fi
+  if ! { [[ "$master_port" =~ ^[1-9][0-9]*$ ]] && [ "$master_port" -le 65535 ]; }; then
+    cx_die "invalid distributed rendezvous port"
+  fi
+  export MASTER_ADDR="$master_addr" MASTER_PORT="$master_port"
+}
+
+# Printed into `bash -c` for one Slurm task per GPU. Every rank derives its
+# identity from Slurm rather than accepting caller-supplied rank values.
+#
+# The emitted script exits 67 when any of SLURM_PROCID, SLURM_NTASKS,
+# SLURM_LOCALID, or SLURM_NODEID is empty or not purely digits, when
+# SLURM_NTASKS differs from CX_NGPUS, or when SLURM_LOCALID is not below
+# CX_GPUS_PER_NODE. It then exports RANK, WORLD_SIZE, LOCAL_RANK, and
+# LOCAL_WORLD_SIZE and replaces itself with `python3 tests/run_ep.py "$@"`.
+# The heredoc is quoted, so nothing in it is expanded by this function.
+cx_slurm_rank_wrapper() {
+ cat <<'BASH'
+case "${SLURM_PROCID:-}:${SLURM_NTASKS:-}:${SLURM_LOCALID:-}:${SLURM_NODEID:-}" in
+ *[!0-9:]*|:*|*::*|*:) exit 67 ;;
+esac
+[ "$SLURM_NTASKS" = "$CX_NGPUS" ] || exit 67
+[ "$SLURM_LOCALID" -lt "$CX_GPUS_PER_NODE" ] || exit 67
+export RANK="$SLURM_PROCID" WORLD_SIZE="$SLURM_NTASKS"
+export LOCAL_RANK="$SLURM_LOCALID" LOCAL_WORLD_SIZE="$CX_GPUS_PER_NODE"
+exec python3 tests/run_ep.py "$@"
+BASH
+}
+
+# A set shard path is an execution contract, never a hint. Validate it before
+# staging/allocation and again in-container so a missing or stale control file
+# cannot silently fall back to a manual single-case run.
+#
+# $1: CollectiveX root directory. Returns 0 untouched when CX_SHARD_FILE is
+# unset; otherwise every failed check is fatal and, on success, the control
+# file's SHA-256 is exported as COLLECTIVEX_CONTROL_SHA256.
+cx_validate_shard_control() {
+  local cx_root="$1" shard="${CX_SHARD_FILE:-}" path expected_sku control_sha256
+  if [ -z "$shard" ]; then
+    return 0
+  fi
+  expected_sku="${CX_SHARD_SKU:-}"
+  if [ -z "$expected_sku" ]; then
+    cx_die "CX_SHARD_SKU is required with CX_SHARD_FILE"
+  fi
+  if [ -z "${CX_BENCH:-}" ]; then
+    cx_die "CX_BENCH is required with CX_SHARD_FILE"
+  fi
+  if ! [[ "${CX_NODES:-}" =~ ^[1-9][0-9]*$ ]]; then
+    cx_die "positive CX_NODES is required with CX_SHARD_FILE"
+  fi
+  # Try the path as given first, then relative to the CollectiveX root.
+  if [ -f "$shard" ]; then
+    path="$shard"
+  else
+    path="${cx_root%/}/$shard"
+  fi
+  if [ ! -f "$path" ]; then
+    cx_die "shard control does not exist"
+  fi
+  if [ ! -s "$path" ]; then
+    cx_die "shard control is empty"
+  fi
+  if ! python3 "${cx_root%/}/sweep_matrix.py" \
+    --validate-control "$path" --expect-sku "$expected_sku" \
+    --expect-backend "$CX_BENCH" --expect-nodes "$CX_NODES" >/dev/null 2>&1; then
+    cx_die "invalid shard control"
+  fi
+  control_sha256="$(sha256sum "$path" | awk '{print $1}')"
+  if ! [[ "$control_sha256" =~ ^[0-9a-f]{64}$ ]]; then
+    cx_die "cannot hash shard control"
+  fi
+  export COLLECTIVEX_CONTROL_SHA256="$control_sha256"
+}
+
+# Split CX_TIMING ("iters:trials:warmup") into CX_ITERS, CX_TRIALS, and
+# CX_WARMUP. Does nothing when CX_TIMING is unset or empty; a fourth field or
+# any field that is not a positive integer is fatal.
+cx_apply_timing_profile() {
+  local iters trials warmup extra field
+  if [ -z "${CX_TIMING:-}" ]; then
+    return 0
+  fi
+  IFS=: read -r iters trials warmup extra <<< "$CX_TIMING"
+  for field in "$iters" "$trials" "$warmup"; do
+    [[ "$field" =~ ^[1-9][0-9]*$ ]] \
+      || cx_die "CX_TIMING must be positive iters:trials:warmup"
+  done
+  [ -z "$extra" ] || cx_die "CX_TIMING must be positive iters:trials:warmup"
+  export CX_ITERS="$iters" CX_TRIALS="$trials" CX_WARMUP="$warmup"
+}
+
+# Use an opaque, execution-bound name so a missing grant message can be
+# reconciled without exposing runner or shard details in public logs.
+#
+# Prints "cx-" followed by the first 24 hex digits of the SHA-256 of
+# COLLECTIVEX_EXECUTION_ID (or "manual-<pid>" when unset). Returns 1 when the
+# digest cannot be produced.
+cx_scheduler_job_name() {
+  local execution_id="${COLLECTIVEX_EXECUTION_ID:-manual-$$}" digest
+  if ! digest="$(printf '%s' "$execution_id" | sha256sum | awk '{print $1}')"; then
+    return 1
+  fi
+  if ! [[ "$digest" =~ ^[0-9a-f]{64}$ ]]; then
+    return 1
+  fi
+  printf 'cx-%s' "${digest:0:24}"
+}
+
+# Return 0 after recovering one allocation ID, 2 after three successful empty
+# observations, and 1 for every ambiguous or failed lookup. Callers inspect the
+# state variables rather than the status because all missing-ID paths still fail.
+#
+# $1: scheduler job name in the "cx-<24 hex>" form.
+# On status 0 the ID is stored in JOB_ID; on status 0 and 2
+# CX_ALLOCATION_UNCERTAIN is set to 0. Status 1 leaves both untouched.
+cx_reconcile_salloc_jobid() {
+ local job_name="$1" scheduler_user queue_output line delay attempt
+ local -a ids=()
+ scheduler_user="$(id -un 2>/dev/null)" || return 1
+ [[ "$scheduler_user" =~ ^[A-Za-z0-9_.-]+$ \
+ && "$job_name" =~ ^cx-[0-9a-f]{24}$ ]] || return 1
+ for attempt in 1 2 3; do
+ ids=()
+ if ! queue_output="$(
+ squeue -h --user="$scheduler_user" --name="$job_name" -o %A 2>/dev/null
+ )"; then
+ return 1
+ fi
+ # Collect job IDs; blank lines are skipped and any other text is ambiguous.
+ while IFS= read -r line; do
+ [[ "$line" =~ ^[[:space:]]*$ ]] && continue
+ if [[ "$line" =~ ^[[:space:]]*([1-9][0-9]*)[[:space:]]*$ ]]; then
+ ids+=("${BASH_REMATCH[1]}")
+ else
+ return 1
+ fi
+ done <<< "$queue_output"
+ if [ "${#ids[@]}" -eq 1 ]; then
+ JOB_ID="${ids[0]}"
+ CX_ALLOCATION_UNCERTAIN=0
+ return 0
+ fi
+ # More than one job under this name cannot be attributed safely.
+ [ "${#ids[@]}" -eq 0 ] || return 1
+ if [ "$attempt" -eq 3 ]; then
+ CX_ALLOCATION_UNCERTAIN=0
+ return 2
+ fi
+ # Back off 1s, then 2s, between the three observations.
+ delay=$((1 << (attempt - 1)))
+ sleep "$delay" || return 1
+ done
+ return 1
+}
+
+# Allocate via salloc's stable grant message and assign JOB_ID in this shell.
+# Raw scheduler output remains in the bounded private execution log.
+#
+# $@: salloc arguments. Job-name options are rejected because the name is
+# generated here. Returns 0 only when salloc succeeds and its output yields a
+# job ID; every other path returns 1. CX_ALLOCATION_UNCERTAIN is set to 1
+# before salloc runs and back to 0 once an ID is known or reconciliation
+# observes no allocation.
+#
+# Whether an ID is known is tracked in a local flag rather than read from the
+# global JOB_ID, so a value left over from the caller or the environment can
+# neither suppress reconciliation nor be mistaken for this allocation, and an
+# unset JOB_ID is safe under `set -u`.
+cx_salloc_jobid() {
+  local log job_id job_name argument salloc_rc=0 granted=0
+  # Without a log there is nowhere to capture the grant message; stop before
+  # asking the scheduler for anything.
+  log="$(cx_private_log_path scheduler-allocation)" || return 1
+  for argument in "$@"; do
+    case "$argument" in
+      --job-name|--job-name=*|-J|-J*)
+        cx_log "ERROR: scheduler job names are managed by CollectiveX"
+        return 1
+        ;;
+    esac
+  done
+  job_name="$(cx_scheduler_job_name)" || return 1
+  CX_ALLOCATION_UNCERTAIN=1
+  # salloc has no portable --parsable option. Parse the stable grant message
+  # used by the production launchers, while also accepting a bare ID from
+  # site wrappers.
+  salloc "$@" --job-name="$job_name" --no-shell > "$log" 2>&1 || salloc_rc=$?
+  job_id="$(sed -nE \
+    -e 's/^([0-9]+)(;[^[:space:]]+)?$/\1/p' \
+    -e 's/.*Granted job allocation ([0-9]+).*/\1/p' \
+    "$log" | head -n1)"
+  if [ -n "$job_id" ]; then
+    [[ "$job_id" =~ ^[0-9]+$ ]] || return 1
+    JOB_ID="$job_id"
+    CX_ALLOCATION_UNCERTAIN=0
+    granted=1
+  fi
+  if [ "$salloc_rc" != 0 ]; then
+    # Status >= 128 with no ID: fail without reconciling, leaving the
+    # allocation marked uncertain for the caller.
+    if [ "$salloc_rc" -ge 128 ] && [ "$granted" = 0 ]; then
+      cx_fail_stage scheduler-allocation "$log"
+      return 1
+    fi
+    # Reconciliation may still recover an ID (stored in JOB_ID) for cleanup.
+    [ "$granted" = 1 ] || cx_reconcile_salloc_jobid "$job_name" || true
+    cx_fail_stage scheduler-allocation "$log"
+    return 1
+  fi
+  if [ "$granted" = 0 ]; then
+    # salloc reported success but printed no recognizable ID: still a failure.
+    cx_reconcile_salloc_jobid "$job_name" || true
+    cx_fail_stage scheduler-allocation "$log"
+    return 1
+  fi
+}
+
+# Cancel a Slurm job and wait until the queue no longer lists it.
+#
+# $1: numeric job id. Returns 0 as soon as a successful squeue lookup prints
+# nothing. A failed lookup is retried like a still-active job, with waits of
+# 1, 2, 4, 8, 16, and 32 seconds. Returns 1 for a non-numeric id or when the
+# job is still listed after the last wait.
+cx_cancel_job() {
+  local job_id="$1" active delay
+  if ! [[ "$job_id" =~ ^[0-9]+$ ]]; then
+    return 1
+  fi
+  scancel "$job_id" >/dev/null 2>&1 || true
+  for delay in 1 2 4 8 16 32; do
+    if active="$(squeue -h -j "$job_id" -o %A 2>/dev/null)" && [ -z "$active" ]; then
+      return 0
+    fi
+    sleep "$delay"
+  done
+  cx_log "ERROR: scheduled allocation did not terminate during cleanup"
+  return 1
+}
+
+# Flip the cleanup guard files inside CX_JOB_ROOT.
+#
+# $1: "safe" or "unsafe". Exactly one of cleanup-safe / cleanup-unsafe is left
+# behind: the new marker is created and the opposite one removed. Returns 0
+# without touching anything unless CX_JOB_ROOT matches the expected /tmp name,
+# is a real directory (not a symlink), and is owned by the caller with mode
+# 700. Returns 1 for any other state name.
+cx_write_cleanup_guard() {
+  local state="$1" root="${CX_JOB_ROOT:-}" safe unsafe
+  if ! [[ "$root" =~ ^/tmp/inferencex-collectivex-[0-9]+-[0-9]+-[A-Za-z0-9._-]+$ ]]; then
+    return 0
+  fi
+  if [ ! -d "$root" ] || [ -L "$root" ]; then
+    return 0
+  fi
+  if [ "$(stat -c '%u:%a' "$root" 2>/dev/null)" != "$(id -u):700" ]; then
+    return 0
+  fi
+  safe="$root/cleanup-safe"
+  unsafe="$root/cleanup-unsafe"
+  umask 077
+  if [ "$state" = safe ]; then
+    : > "$safe" && rm -f -- "$unsafe"
+  elif [ "$state" = unsafe ]; then
+    rm -f -- "$safe" && : > "$unsafe"
+  else
+    return 1
+  fi
+}
+
+# Container image pins. Every image tag below is paired with the registry digest
+# that the tag is expected to resolve to (looked up by cx_default_image_digest).
+#
+# Single multi-arch container for ALL NVIDIA SKUs: tag `v0.5.11-cu130` is an OCI
+# image index covering linux/amd64 (B200) + linux/arm64 (GB200); enroot import
+# pulls the matching arch. (cu130 = CUDA 13, system nccl.h in /usr/include, torch 2.9.x.)
+# Import remains tag-based because Enroot cannot reliably import a digest-qualified
+# Docker Hub reference non-interactively. The registry digest is resolved and checked
+# immediately before import, then recorded as verified provenance.
+CX_IMAGE_MULTIARCH_DIGEST="sha256:061fb71f838e82000a1768c159654d526c2f17ebe751c21e7fc48ca53c8ef975"
+# (v0.5.12-cu130 was rejected: its 62 layers overflow enroot's overlay-based
+# squash creation on these nodes — "failed to mount overlay ... Invalid argument".
+# v0.5.11-cu130 imports cleanly.)
+# Runtime setup verifies the image-bundled DeepEP build for the detected GPU target.
+CX_IMAGE_MULTIARCH="lmsysorg/sglang:v0.5.11-cu130"
+
+# AMD (ROCm/CDNA): separate single-arch images bundle MoRI.
+# The CX_MORI_COMMIT_* values are exported as MORI_COMMIT for canonical runs on
+# the matching runner (see cx_lock_canonical_gha_env).
+CX_IMAGE_AMD_MORI="rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2"
+CX_IMAGE_AMD_MORI_DIGEST="sha256:24c3b30d64475937abbb6498e3b29528649adcb836dde7a468979f767809b0e8"
+CX_MORI_COMMIT_MI355="99bc0a3a6e7a70aacc6372cd9a4275ccfb4de567" # pragma: allowlist secret
+CX_IMAGE_AMD_MORI_MI325="rocm/sgl-dev:sglang-0.5.14-rocm720-mi35x-mori-0701"
+CX_IMAGE_AMD_MORI_MI325_DIGEST="sha256:ea42375343c2ef8f73b3bdb9e1b7b435556e3ca92aba5e3f74ada29ba217fabc"
+CX_MORI_COMMIT_MI325="bf99bdf18fc69887a346913ca01c315c2aa9bd4c" # pragma: allowlist secret
+cx_default_image() {
+ case "$1" in
+ mi325x*) echo "$CX_IMAGE_AMD_MORI_MI325" ;;
+ mi355x*) echo "$CX_IMAGE_AMD_MORI" ;;
+ b200*|gb200*|b300*|gb300*|h100*|h200*) echo "$CX_IMAGE_MULTIARCH" ;;
+ *) cx_die "no default image for runner prefix: $1" ;;
+ esac
+}
+
+# Resolve a Docker Hub image tag to its current manifest digest and print it.
+# $1: image reference without a digest (name[:tag], optionally prefixed with
+#     docker.io/ or registry-1.docker.io/). A reference without ':' gets the tag
+#     "latest", and a name without '/' is placed under "library/".
+# Requests a pull token without credentials, issues a HEAD request for the
+# manifest and reads the Docker-Content-Digest response header.
+# Calls cx_die for digest-qualified references, other registries, malformed
+# references, token or lookup failures, and digests that are not sha256:<64 hex>.
+cx_resolve_registry_digest() {
+ local image="$1" repository reference token digest registry
+ if [[ "$image" == *@* ]]; then
+ cx_die "digest-qualified image overrides are unsupported; configure a tag and pinned digest"
+ fi
+ # The first path component counts as a registry host only if it contains a
+ # dot or a colon, or is exactly "localhost".
+ registry="${image%%/*}"
+ if [[ "$image" == */* && ( "$registry" == *.* || "$registry" == *:* || "$registry" = localhost ) ]]; then
+ case "$registry" in
+ docker.io|registry-1.docker.io) image="${image#*/}" ;;
+ *) cx_die "only Docker Hub images are supported by the registry verifier" ;;
+ esac
+ fi
+ # Split on the last ':'; when there is none both expansions return the whole
+ # string, which the next line detects.
+ repository="${image%:*}"
+ reference="${image##*:}"
+ [ "$repository" != "$image" ] || { repository="$image"; reference=latest; }
+ [ -n "$repository" ] && [ -n "$reference" ] \
+ || cx_die "configured image reference is malformed"
+ [[ "$repository" == */* ]] || repository="library/$repository"
+ token="$(curl -fsSLG --connect-timeout 10 --max-time 30 --retry 2 \
+ --retry-delay 1 --retry-all-errors 'https://auth.docker.io/token' \
+ --data-urlencode 'service=registry.docker.io' \
+ --data-urlencode "scope=repository:${repository}:pull" \
+ | python3 -c 'import json,sys; print(json.load(sys.stdin)["token"])')" \
+ || cx_die "cannot authenticate to the image registry"
+ # -I sends HEAD, so only headers are returned; the Accept list covers OCI and
+ # Docker manifest and index media types.
+ digest="$(curl -fsSI --connect-timeout 10 --max-time 30 --retry 2 \
+ --retry-delay 1 --retry-all-errors \
+ -H "Authorization: Bearer $token" \
+ -H 'Accept: application/vnd.oci.image.index.v1+json, application/vnd.oci.image.manifest.v1+json, application/vnd.docker.distribution.manifest.list.v2+json, application/vnd.docker.distribution.manifest.v2+json' \
+ "https://registry-1.docker.io/v2/${repository}/manifests/${reference}" \
+ | tr -d '\r' | awk 'tolower($1)=="docker-content-digest:" {print $2; exit}')" \
+ || cx_die "cannot resolve the configured image digest"
+ # An empty or malformed header value is rejected by this format check.
+ [[ "$digest" =~ ^sha256:[0-9a-f]{64}$ ]] \
+ || cx_die "registry returned an invalid image digest"
+ printf '%s' "$digest"
+}
+
+cx_verify_registry_image() {
+ local image="$1" expected actual
+ expected="${CX_IMAGE_DIGEST:-$(cx_default_image_digest "$image")}"
+ [[ "$expected" =~ ^sha256:[0-9a-f]{64}$ ]] \
+ || cx_die "a pinned digest is required for the configured image"
+ actual="$(cx_resolve_registry_digest "$image")"
+ [ "$actual" = "$expected" ] \
+ || cx_die "configured image tag no longer matches its pinned digest"
+ export COLLECTIVEX_IMAGE="$image" COLLECTIVEX_IMAGE_DIGEST="$actual"
+ export COLLECTIVEX_IMAGE_DIGEST_VERIFIED=1
+}
+
+cx_default_image_digest() {
+ case "$1" in
+ "$CX_IMAGE_MULTIARCH") printf '%s' "$CX_IMAGE_MULTIARCH_DIGEST" ;;
+ "$CX_IMAGE_AMD_MORI") printf '%s' "$CX_IMAGE_AMD_MORI_DIGEST" ;;
+ "$CX_IMAGE_AMD_MORI_MI325") printf '%s' "$CX_IMAGE_AMD_MORI_MI325_DIGEST" ;;
+ esac
+}
+
+# Canonical workflow runs must not inherit benchmark controls from a persistent
+# self-hosted runner service. Manual/SSH diagnostics retain their explicit
+# overrides by leaving COLLECTIVEX_CANONICAL_GHA unset.
+# Print GITHUB_WORKSPACE (without a trailing newline) when it passes validation;
+# exit status 1 otherwise.
+# The path must be absolute, already canonical (realpath equals the path itself),
+# an existing directory owned by the current user, and not world-writable.
+cx_gha_workspace_stage_root() {
+ local workspace="${GITHUB_WORKSPACE:-}"
+ python3 - "$workspace" <<'PY'
+import os
+import stat
+import sys
+
+workspace = sys.argv[1]
+try:
+ if (
+ not os.path.isabs(workspace)
+ or os.path.realpath(workspace) != workspace
+ or not os.path.isdir(workspace)
+ ):
+ raise OSError
+ metadata = os.stat(workspace, follow_symlinks=False)
+ # GitHub runner workspaces are runner-owned but commonly writable by the
+ # trusted runner-service group. Keep the child mode 0700 and reject world write.
+ if metadata.st_uid != os.getuid() or stat.S_IMODE(metadata.st_mode) & stat.S_IWOTH:
+ raise OSError
+except OSError:
+ raise SystemExit(1)
+print(workspace, end="")
+PY
+}
+
+# Create a per-UID cache under validated cluster-local storage. Only the fixed
+# /cx-cache mount enters the container; the operator host path does not.
+# $1: parent directory for the cache (absolute, no newline or carriage return).
+# On success exports CX_PREPARED_BACKEND_CACHE (host path of the cache directory)
+# and CX_BACKEND_CACHE_SENTINEL_SHA256 (SHA-256 of the 32-byte sentinel file inside
+# it); returns 1 on any validation or I/O failure.
+# The embedded Python first creates and removes a probe directory to learn which
+# owner UID new entries actually receive, then tries cache generation v3 and falls
+# back to v4 when v3 cannot be opened or validated.
+cx_prepare_backend_cache() {
+ local stage_parent="$1" cache info sentinel_sha256
+ unset CX_PREPARED_BACKEND_CACHE CX_BACKEND_CACHE_SENTINEL_SHA256
+ info="$(python3 - "$stage_parent" <<'PY'
+import hashlib
+import os
+import secrets
+import stat
+import sys
+
+configured_parent = sys.argv[1]
+try:
+ if (
+ not os.path.isabs(configured_parent)
+ or "\n" in configured_parent
+ or "\r" in configured_parent
+ ):
+ raise OSError
+ parent = os.path.realpath(configured_parent)
+ if not os.path.isdir(parent):
+ raise OSError
+ flags = os.O_RDONLY | os.O_DIRECTORY | getattr(os, "O_NOFOLLOW", 0)
+ parent_fd = os.open(parent, flags)
+ try:
+ # Probe: the UID recorded on a freshly created directory becomes the
+ # reference owner for every later ownership check.
+ probe_name = f".collectivex-owner-probe-{os.getpid()}-{secrets.token_hex(8)}"
+ os.mkdir(probe_name, 0o700, dir_fd=parent_fd)
+ try:
+ probe_fd = os.open(probe_name, flags, dir_fd=parent_fd)
+ try:
+ probe = os.fstat(probe_fd)
+ if stat.S_IMODE(probe.st_mode) & 0o777 != 0o700:
+ raise OSError
+ realized_owner = probe.st_uid
+ finally:
+ os.close(probe_fd)
+ finally:
+ os.rmdir(probe_name, dir_fd=parent_fd)
+ for generation in (3, 4):
+ name = f".collectivex-backend-cache-v{generation}-{os.getuid()}"
+ try:
+ os.mkdir(name, 0o700, dir_fd=parent_fd)
+ except FileExistsError:
+ pass
+ try:
+ cache_fd = os.open(name, flags, dir_fd=parent_fd)
+ try:
+ metadata = os.fstat(cache_fd)
+ if (
+ metadata.st_uid != realized_owner
+ or stat.S_IMODE(metadata.st_mode) & 0o777 != 0o700
+ ):
+ raise OSError
+ # Write a random payload to a unique temporary file, then hard-link
+ # it to the sentinel name; an existing sentinel is left untouched.
+ sentinel_name = ".collectivex-mount-sentinel-v1"
+ temporary_name = (
+ f"{sentinel_name}.tmp.{os.getpid()}.{secrets.token_hex(8)}"
+ )
+ create_flags = (
+ os.O_WRONLY | os.O_CREAT | os.O_EXCL
+ | getattr(os, "O_NOFOLLOW", 0)
+ )
+ payload = secrets.token_bytes(32)
+ temporary_fd = os.open(
+ temporary_name, create_flags, 0o600, dir_fd=cache_fd
+ )
+ try:
+ try:
+ view = memoryview(payload)
+ try:
+ while view:
+ written = os.write(temporary_fd, view)
+ if written <= 0:
+ raise OSError
+ view = view[written:]
+ os.fsync(temporary_fd)
+ finally:
+ view.release()
+ finally:
+ os.close(temporary_fd)
+ try:
+ os.link(
+ temporary_name,
+ sentinel_name,
+ src_dir_fd=cache_fd,
+ dst_dir_fd=cache_fd,
+ follow_symlinks=False,
+ )
+ except FileExistsError:
+ pass
+ finally:
+ try:
+ os.unlink(temporary_name, dir_fd=cache_fd)
+ except FileNotFoundError:
+ pass
+ # Re-read whichever sentinel is now in place (new or pre-existing)
+ # and hash its content; reading 33 bytes exposes an oversized file.
+ sentinel_fd = os.open(
+ sentinel_name,
+ os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0),
+ dir_fd=cache_fd,
+ )
+ try:
+ sentinel = os.fstat(sentinel_fd)
+ payload = os.read(sentinel_fd, 33)
+ if (
+ not stat.S_ISREG(sentinel.st_mode)
+ or sentinel.st_uid != realized_owner
+ or stat.S_IMODE(sentinel.st_mode) & 0o777 != 0o600
+ or sentinel.st_size != 32
+ or len(payload) != 32
+ ):
+ raise OSError
+ sentinel_sha256 = hashlib.sha256(payload).hexdigest()
+ finally:
+ os.close(sentinel_fd)
+ finally:
+ os.close(cache_fd)
+ except OSError:
+ if generation == 3:
+ continue
+ raise
+ break
+ finally:
+ os.close(parent_fd)
+except OSError:
+ raise SystemExit(1)
+print(sentinel_sha256, os.path.join(parent, name), end="")
+PY
+)" || return 1
+ # The helper prints "<sha256> <path>"; split on the first space and re-validate
+ # both halves before exporting them.
+ sentinel_sha256="${info%% *}"
+ cache="${info#* }"
+ [ "$cache" != "$info" ] && [[ "$sentinel_sha256" =~ ^[0-9a-f]{64}$ ]] \
+ && [[ "$cache" = /* ]] || return 1
+ export CX_PREPARED_BACKEND_CACHE="$cache"
+ export CX_BACKEND_CACHE_SENTINEL_SHA256="$sentinel_sha256"
+}
+
+# Confirm that CX_BACKEND_CACHE_ROOT contains the sentinel file whose SHA-256
+# equals CX_BACKEND_CACHE_SENTINEL_SHA256.
+# The root must be an absolute, canonical directory with mode 0700; the sentinel
+# must be a 32-byte regular file with mode 0600 owned by the same UID as the root.
+# Exit status is 1 on any mismatch or I/O error; nothing is printed.
+cx_verify_backend_cache_mount() {
+ python3 - "${CX_BACKEND_CACHE_ROOT:-}" \
+ "${CX_BACKEND_CACHE_SENTINEL_SHA256:-}" <<'PY'
+import hashlib
+import os
+import re
+import stat
+import sys
+
+root, expected = sys.argv[1:]
+try:
+ if (
+ not os.path.isabs(root)
+ or os.path.realpath(root) != root
+ or re.fullmatch(r"[0-9a-f]{64}", expected) is None
+ ):
+ raise OSError
+ flags = os.O_RDONLY | os.O_DIRECTORY | getattr(os, "O_NOFOLLOW", 0)
+ root_fd = os.open(root, flags)
+ try:
+ root_item = os.fstat(root_fd)
+ if (
+ not stat.S_ISDIR(root_item.st_mode)
+ or stat.S_IMODE(root_item.st_mode) & 0o777 != 0o700
+ ):
+ raise OSError
+ sentinel_fd = os.open(
+ ".collectivex-mount-sentinel-v1",
+ os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0),
+ dir_fd=root_fd,
+ )
+ try:
+ sentinel = os.fstat(sentinel_fd)
+ # Read one byte more than expected so an oversized file is detected.
+ payload = os.read(sentinel_fd, 33)
+ if (
+ not stat.S_ISREG(sentinel.st_mode)
+ or sentinel.st_uid != root_item.st_uid
+ or stat.S_IMODE(sentinel.st_mode) & 0o777 != 0o600
+ or sentinel.st_size != 32
+ or len(payload) != 32
+ or hashlib.sha256(payload).hexdigest() != expected
+ ):
+ raise OSError
+ finally:
+ os.close(sentinel_fd)
+ finally:
+ os.close(root_fd)
+except OSError:
+ raise SystemExit(1)
+PY
+}
+
+# Run git with system and global configuration ignored, terminal prompts
+# disabled and no credential helper. All arguments are forwarded to git.
+cx_git() {
+  env GIT_CONFIG_NOSYSTEM=1 GIT_CONFIG_GLOBAL=/dev/null GIT_TERMINAL_PROMPT=0 \
+    git -c credential.helper= "$@"
+}
+
+cx_git_in_tree() {
+ local directory="$1" canonical
+ shift
+ [[ "$directory" = /* ]] && [ -d "$directory" ] && [ ! -L "$directory" ] \
+ || return 1
+ [[ "$directory" != *'*'* && "$directory" != *$'\n'* && "$directory" != *$'\r'* ]] \
+ || return 1
+ canonical="$(cd -P -- "$directory" && pwd -P)" || return 1
+ cx_git -c "safe.directory=$canonical" -C "$canonical" "$@"
+}
+
+cx_fetch_revision() {
+ local repository="$1" revision="$2" destination="$3" attempt
+ for attempt in 1 2 3; do
+ rm -rf -- "$destination"
+ if cx_git init -q "$destination" \
+ && cx_git_in_tree "$destination" remote add origin "$repository" \
+ && cx_git_in_tree "$destination" fetch -q --no-tags --depth 1 origin "$revision" \
+ && cx_git_in_tree "$destination" -c advice.detachedHead=false \
+ checkout -q --detach FETCH_HEAD \
+ && [ "$(cx_git_in_tree "$destination" rev-parse HEAD)" = "$revision" ]; then
+ return 0
+ fi
+ [ "$attempt" = 3 ] || sleep $((attempt * 5))
+ done
+ return 1
+}
+
+cx_backend_source_pin() {
+ case "$1" in
+ deepep-v2)
+ printf '%s|%s|%s' \
+ "$CX_DEEPEP_V2_COMMIT" "$CX_DEEPEP_V2_TREE" "$CX_DEEPEP_V2_FMT_COMMIT"
+ ;;
+ deepep-hybrid)
+ printf '%s|%s||%s' "$CX_DEEPEP_HYBRID_COMMIT" "$CX_DEEPEP_HYBRID_TREE" \
+ "$CX_DEEPEP_HYBRID_NCCL_COMMIT"
+ ;;
+ *) return 1 ;;
+ esac
+}
+
+cx_backend_source_path() {
+ local root="$1" backend="$2" revision tree fmt nccl pin
+ pin="$(cx_backend_source_pin "$backend")" || return 1
+ IFS='|' read -r revision tree fmt nccl <<< "$pin"
+ printf '%s/%s-%s' "$root" "$backend" "$revision"
+}
+
+# Check that a checkout matches the pinned identity of a backend exactly.
+# $1: backend name, $2: checkout directory (must not be a symlink).
+# Returns 0 only when HEAD and its tree hash equal the pin, the work tree has no
+# modified, untracked or ignored files, and each pinned submodule (fmt and/or
+# nccl under third-party/) is at its pinned commit.
+cx_backend_source_is_valid() {
+ local backend="$1" source="$2" revision tree fmt nccl pin status ignored
+ pin="$(cx_backend_source_pin "$backend")" || return 1
+ IFS='|' read -r revision tree fmt nccl <<< "$pin"
+ [ -d "$source" ] && [ ! -L "$source" ] \
+ && [ "$(cx_git_in_tree "$source" rev-parse HEAD 2>/dev/null)" = "$revision" ] \
+ && [ "$(cx_git_in_tree "$source" rev-parse 'HEAD^{tree}' 2>/dev/null)" = "$tree" ] \
+ || return 1
+ # Any porcelain output means tracked changes, untracked files or dirty submodules.
+ status="$(cx_git_in_tree "$source" status --porcelain --untracked-files=all \
+ --ignore-submodules=none 2>/dev/null)" || return 1
+ [ -z "$status" ] || return 1
+ # Ignored files are not reported by the status call above, so list them separately.
+ ignored="$(cx_git_in_tree "$source" ls-files --others --ignored --exclude-standard \
+ 2>/dev/null)" || return 1
+ [ -z "$ignored" ] || return 1
+ # An empty pin field means that submodule is not pinned for this backend.
+ [ -z "$fmt" ] \
+ || [ "$(cx_git_in_tree "$source/third-party/fmt" rev-parse HEAD 2>/dev/null)" = "$fmt" ] \
+ || return 1
+ [ -z "$nccl" ] \
+ || [ "$(cx_git_in_tree "$source/third-party/nccl" rev-parse HEAD 2>/dev/null)" = "$nccl" ]
+}
+
+# Print one SHA-256 that identifies a pair of built extension files.
+# $1: directory to search (must not be a symlink); $2, $3: glob patterns, each of
+# which must match exactly one regular, non-symlink file under that directory.
+# For each file, in argument order, the outer digest absorbs the file name, its
+# size and the SHA-256 of its content. Exit status is 1 on any violation.
+cx_extension_pair_sha256() {
+ python3 - "$1" "$2" "$3" <<'PY'
+import hashlib
+import os
+from pathlib import Path
+import stat
+import sys
+
+root = Path(sys.argv[1])
+digest = hashlib.sha256()
+try:
+ if root.is_symlink() or not root.is_dir():
+ raise OSError
+ for pattern in sys.argv[2:]:
+ matches = list(root.glob(pattern))
+ if len(matches) != 1 or matches[0].is_symlink():
+ raise OSError
+ path = matches[0]
+ descriptor = os.open(path, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0))
+ try:
+ metadata = os.fstat(descriptor)
+ if not stat.S_ISREG(metadata.st_mode):
+ raise OSError
+ file_digest = hashlib.sha256()
+ # closefd=False: the descriptor is closed by the finally clause below.
+ with os.fdopen(descriptor, "rb", closefd=False) as stream:
+ for chunk in iter(lambda: stream.read(1024 * 1024), b""):
+ file_digest.update(chunk)
+ digest.update(path.name.encode("utf-8") + b"\0")
+ digest.update(str(metadata.st_size).encode("ascii") + b"\0")
+ digest.update(file_digest.digest())
+ finally:
+ os.close(descriptor)
+except (OSError, UnicodeError):
+ raise SystemExit(1)
+print(digest.hexdigest(), end="")
+PY
+}
+
+# Acquire source before compute allocation, preferring the verified same-run GHA seed.
+# $1: staged repository root; $2: backend name (see cx_backend_source_pin).
+# CX_BACKEND_SOURCE_STEP is updated before each phase so the caller can report
+# which step failed. Sources are tried in this order: a checkout already present
+# at the pinned path (validated as is), a copy from CX_BACKEND_SOURCE_SEED_ROOT,
+# and, outside canonical GHA runs only, a fresh fetch from GitHub. New trees are
+# built in a temporary directory and moved into place only after validation.
+_cx_prepare_backend_source() {
+ local mount_src="$1" backend="$2" root source temporary revision tree fmt nccl pin
+ local root_mode stage_mode root_owner stage_owner
+ local seed_root="${CX_BACKEND_SOURCE_SEED_ROOT:-}" seed seed_mode
+ root="$mount_src/experimental/CollectiveX/.cx_sources"
+ CX_BACKEND_SOURCE_STEP="source mount creation"
+ if [ ! -e "$root" ] && [ ! -L "$root" ]; then
+ mkdir -m 700 -- "$root" || return 1
+ fi
+ CX_BACKEND_SOURCE_STEP="source mount ownership validation"
+ [ -d "$mount_src" ] && [ ! -L "$mount_src" ] \
+ && [ -d "$root" ] && [ ! -L "$root" ] || return 1
+ # The sources directory must have the same owner as the staged tree.
+ stage_owner="$(stat -c '%u' "$mount_src" 2>/dev/null)" || return 1
+ root_owner="$(stat -c '%u' "$root" 2>/dev/null)" || return 1
+ [ "$root_owner" = "$stage_owner" ] || return 1
+ stage_mode="$(stat -c '%a' "$mount_src" 2>/dev/null)" || return 1
+ case "$stage_mode" in 700|[1-7]700) ;; *) return 1 ;; esac
+ # Shared stage parents may retain harmless special bits despite mkdir -m.
+ CX_BACKEND_SOURCE_STEP="source mount permission inspection"
+ root_mode="$(stat -c '%a' "$root" 2>/dev/null)" || return 1
+ case "$root_mode" in
+ 700|[1-7]700) ;;
+ *)
+ CX_BACKEND_SOURCE_STEP="source mount permission normalization"
+ chmod 700 "$root" || return 1
+ CX_BACKEND_SOURCE_STEP="source mount permission validation"
+ root_mode="$(stat -c '%a' "$root" 2>/dev/null)" || return 1
+ case "$root_mode" in 700|[1-7]700) ;; *) return 1 ;; esac
+ ;;
+ esac
+ CX_BACKEND_SOURCE_STEP="git lookup"
+ command -v git >/dev/null || return 1
+ CX_BACKEND_SOURCE_STEP="source pin resolution"
+ source="$(cx_backend_source_path "$root" "$backend")" || return 1
+ # Anything already at the pinned path is validated and never repaired.
+ if [ -e "$source" ] || [ -L "$source" ]; then
+ CX_BACKEND_SOURCE_STEP="existing source validation"
+ cx_backend_source_is_valid "$backend" "$source"
+ return
+ fi
+ if [ -n "$seed_root" ]; then
+ CX_BACKEND_SOURCE_STEP="source seed validation"
+ [[ "$seed_root" = /* ]] && [ -d "$seed_root" ] && [ ! -L "$seed_root" ] \
+ || return 1
+ seed_mode="$(stat -c '%a' "$seed_root" 2>/dev/null)" || return 1
+ case "$seed_mode" in 700|[1-7]700) ;; *) return 1 ;; esac
+ seed="$(cx_backend_source_path "$seed_root" "$backend")" || return 1
+ cx_backend_source_is_valid "$backend" "$seed" || return 1
+ CX_BACKEND_SOURCE_STEP="source seed copy"
+ temporary="$(mktemp -d "$root/.${backend}.XXXXXX")" || return 1
+ # The copy is validated again before it is moved to the pinned path.
+ if ! cp -R -- "$seed/." "$temporary/" \
+ || ! cx_backend_source_is_valid "$backend" "$temporary" \
+ || ! mv -- "$temporary" "$source"; then
+ rm -rf -- "$temporary"
+ return 1
+ fi
+ return
+ fi
+ # Canonical GHA runs never fetch from the network: a missing seed is a failure.
+ if [ "${COLLECTIVEX_CANONICAL_GHA:-0}" = 1 ]; then
+ CX_BACKEND_SOURCE_STEP="source seed validation"
+ return 1
+ fi
+ CX_BACKEND_SOURCE_STEP="source checkout creation"
+ temporary="$(mktemp -d "$root/.${backend}.XXXXXX")" || return 1
+ CX_BACKEND_SOURCE_STEP="source pin resolution"
+ pin="$(cx_backend_source_pin "$backend")" || {
+ rm -rf -- "$temporary"
+ return 1
+ }
+ IFS='|' read -r revision tree fmt nccl <<< "$pin"
+ CX_BACKEND_SOURCE_STEP="revision fetch"
+ if ! cx_fetch_revision \
+ https://github.com/deepseek-ai/DeepEP "$revision" "$temporary"; then
+ rm -rf -- "$temporary"
+ return 1
+ fi
+ CX_BACKEND_SOURCE_STEP="submodule fetch"
+ # Only the submodules named in the pin are initialised.
+ if [ -n "$fmt" ] && ! cx_git_in_tree "$temporary" \
+ -c "safe.directory=$temporary/third-party/fmt" \
+ submodule update -q --init --depth 1 third-party/fmt; then
+ rm -rf -- "$temporary"
+ return 1
+ fi
+ if [ -n "$nccl" ] && ! cx_git_in_tree "$temporary" \
+ -c "safe.directory=$temporary/third-party/nccl" \
+ submodule update -q --init --depth 1 third-party/nccl; then
+ rm -rf -- "$temporary"
+ return 1
+ fi
+ CX_BACKEND_SOURCE_STEP="source publication validation"
+ if ! cx_backend_source_is_valid "$backend" "$temporary" \
+ || ! mv -- "$temporary" "$source"; then
+ rm -rf -- "$temporary"
+ return 1
+ fi
+}
+
+cx_prepare_backend_source() {
+ local log backend="$2" CX_BACKEND_SOURCE_STEP="initialization"
+ log="$(cx_private_log_path "backend-source-$backend")" || return 1
+ if _cx_prepare_backend_source "$@" > "$log" 2>&1; then
+ return 0
+ fi
+ printf '%s failed\n' "$CX_BACKEND_SOURCE_STEP" >> "$log"
+ cx_log "ERROR: backend-source-step=${CX_BACKEND_SOURCE_STEP// /-}"
+ cx_fail_stage backend-setup "$log"
+}
+
+# Copy the validated backend source from CX_BACKEND_SOURCE_ROOT to a destination.
+# $1: backend name, $2: destination path (its parent must be an existing,
+# non-symlink directory). Any existing destination is removed and replaced.
+# The tree is validated at the source, after copying to a temporary sibling, and
+# again at the destination. Returns 0 on success, 1 on any failure.
+cx_materialize_backend_source() {
+ local backend="$1" destination="$2" source parent temporary
+ [ -n "${CX_BACKEND_SOURCE_ROOT:-}" ] || return 1
+ source="$(cx_backend_source_path "$CX_BACKEND_SOURCE_ROOT" "$backend")" || return 1
+ cx_backend_source_is_valid "$backend" "$source" || return 1
+ # A destination without any '/' leaves parent equal to destination and is rejected.
+ parent="${destination%/*}"
+ [ "$parent" != "$destination" ] && [ -d "$parent" ] && [ ! -L "$parent" ] \
+ || return 1
+ temporary="$(mktemp -d "$parent/.collectivex-source.XXXXXX")" || return 1
+ if ! cp -R -- "$source/." "$temporary/" \
+ || ! cx_backend_source_is_valid "$backend" "$temporary"; then
+ rm -rf -- "$temporary"
+ return 1
+ fi
+ if ! rm -rf -- "$destination" || ! mv -- "$temporary" "$destination"; then
+ rm -rf -- "$temporary"
+ return 1
+ fi
+ # A destination that fails the final check is removed rather than left behind.
+ if ! cx_backend_source_is_valid "$backend" "$destination"; then
+ rm -rf -- "$destination"
+ return 1
+ fi
+ return 0
+}
+
+# Pin the environment of a canonical GitHub Actions run to registered values.
+# $1: runner name; must equal CX_SHARD_SKU. Returns 0 without doing anything
+# unless COLLECTIVEX_CANONICAL_GHA=1.
+# Dies unless GITHUB_ACTIONS is "true", a shard file is set, the run ID and
+# attempt are positive integers, and COLLECTIVEX_SOURCE_SHA is 40-64 hex digits.
+# Inherited transport and backend variables are unset; lock, stage and network
+# settings are restored only when the operator-config marker equals this shell's
+# PID. Image, digest, GPU count, seed and timeout are then fixed per runner, and
+# CX_NODES / CX_GPUS_PER_NODE must match the values allowed for that runner.
+cx_lock_canonical_gha_env() {
+ local runner="$1" expected_nodes expected_gpn expected_world trusted_lock_dir=""
+ local trusted_stage_dir=""
+ local trusted_socket_ifname="" trusted_rdma_devices=""
+ local trusted_ib_gid_index="" trusted_rdma_service_level=""
+ [ "${COLLECTIVEX_CANONICAL_GHA:-0}" = 1 ] || return 0
+ [ "${GITHUB_ACTIONS:-}" = true ] \
+ || cx_die "canonical CollectiveX execution requires GitHub Actions"
+ [ -n "${CX_SHARD_FILE:-}" ] && [ "${CX_SHARD_SKU:-}" = "$runner" ] \
+ || cx_die "canonical CollectiveX execution requires a matched shard"
+ [[ "${GITHUB_RUN_ID:-}" =~ ^[1-9][0-9]*$ \
+ && "${GITHUB_RUN_ATTEMPT:-}" =~ ^[1-9][0-9]*$ \
+ && "${COLLECTIVEX_SOURCE_SHA:-}" =~ ^[0-9a-f]{40,64}$ ]] \
+ || cx_die "canonical CollectiveX workflow identity is incomplete"
+
+ # cx_load_operator_config clears inherited values before setting this process marker.
+ # Preserve only values parsed from that private strict document.
+ if [ "${COLLECTIVEX_OPERATOR_CONFIG_LOADED:-}" = "$$" ]; then
+ trusted_lock_dir="${CX_LOCK_DIR:-}"
+ trusted_stage_dir="${CX_STAGE_DIR:-}"
+ trusted_socket_ifname="${CX_SOCKET_IFNAME:-}"
+ trusted_rdma_devices="${CX_RDMA_DEVICES:-}"
+ trusted_ib_gid_index="${CX_IB_GID_INDEX:-}"
+ trusted_rdma_service_level="${CX_RDMA_SERVICE_LEVEL:-}"
+ fi
+ # Drop everything that could have been inherited from the runner service.
+ unset CX_NCCL_HOME CX_MASTER_PORT CX_MORI_KERNEL_TYPE CX_LOCK_DIR CX_STAGE_DIR
+ unset MASTER_ADDR MASTER_PORT RANK WORLD_SIZE LOCAL_RANK LOCAL_WORLD_SIZE
+ unset CX_SOCKET_IFNAME CX_RDMA_DEVICES CX_IB_GID_INDEX CX_RDMA_SERVICE_LEVEL
+ unset NCCL_NET NCCL_SOCKET_IFNAME GLOO_SOCKET_IFNAME NCCL_IB_HCA
+ unset NCCL_IB_GID_INDEX NCCL_IB_SL
+ unset NVSHMEM_HCA_LIST NVSHMEM_IB_GID_INDEX NVSHMEM_IB_SL
+ unset NVSHMEM_IB_ENABLE_IBGDA NVSHMEM_IBGDA_NIC_HANDLER
+ unset EP_NIC_NAME EP_OVERRIDE_RDMA_SL
+ unset UCCL_SOCKET_IFNAME UCCL_IB_GID_INDEX UCCL_IB_SL MORI_RDMA_DEVICES
+ unset HYBRID_EP_MULTINODE USE_NIXL RDMA_CORE_HOME DEEPEP_HYBRID_BUILD_MODE
+ unset MORI_COMMIT MORI_DISABLE_AUTO_XGMI MORI_ENABLE_SDMA
+ unset MORI_APP_LOG_LEVEL MORI_SHMEM_LOG_LEVEL MORI_IO_LOG_LEVEL
+ unset NCCL_CUMEM_ENABLE NCCL_MNNVL_ENABLE MC_FORCE_MNNVL
+ unset CX_BACKEND_CACHE_ROOT CX_BACKEND_CACHE_SENTINEL_SHA256
+ unset CX_PREPARED_BACKEND_CACHE CX_BACKEND_SOURCE_ROOT
+
+ [ -n "${CX_SQUASH_DIR:-}" ] \
+ || cx_die "canonical CollectiveX execution requires shared container storage"
+ [ -n "$trusted_stage_dir" ] \
+ || cx_die "canonical CollectiveX execution requires a configured shared stage directory"
+
+ # Per-runner registry: allowed node counts, GPUs per node, image and digest,
+ # plus backend-specific settings.
+ case "$runner" in
+ h100-dgxc|h200-dgxc|b200-dgxc|b300)
+ expected_nodes="${CX_NODES:-}"; expected_gpn=8
+ [ "$expected_nodes" = 1 ] || [ "$expected_nodes" = 2 ] \
+ || cx_die "canonical NVIDIA execution requires one or two nodes"
+ CX_IMAGE="$CX_IMAGE_MULTIARCH"
+ CX_IMAGE_DIGEST="$CX_IMAGE_MULTIARCH_DIGEST"
+ CX_NCCL_HOME=/usr
+ ;;
+ gb200|gb300)
+ expected_nodes="${CX_NODES:-}"; expected_gpn=4
+ [ "$expected_nodes" = 2 ] || [ "$expected_nodes" = 4 ] \
+ || cx_die "canonical GB execution requires two or four trays"
+ CX_IMAGE="$CX_IMAGE_MULTIARCH"
+ CX_IMAGE_DIGEST="$CX_IMAGE_MULTIARCH_DIGEST"
+ CX_NCCL_HOME=/usr
+ CX_MASTER_PORT=29551
+ ;;
+ mi325x)
+ expected_nodes="${CX_NODES:-}"; expected_gpn=8
+ [ "$expected_nodes" = 1 ] || [ "$expected_nodes" = 2 ] \
+ || cx_die "canonical AMD execution requires one or two nodes"
+ CX_IMAGE="$CX_IMAGE_AMD_MORI_MI325"
+ CX_IMAGE_DIGEST="$CX_IMAGE_AMD_MORI_MI325_DIGEST"
+ if [ "$expected_nodes" = 2 ]; then
+ CX_MORI_KERNEL_TYPE=internode-v1
+ else
+ CX_MORI_KERNEL_TYPE=asyncll
+ fi
+ MORI_COMMIT="$CX_MORI_COMMIT_MI325"
+ MORI_DISABLE_AUTO_XGMI=0
+ MORI_ENABLE_SDMA=1
+ MORI_APP_LOG_LEVEL=info
+ MORI_SHMEM_LOG_LEVEL=info
+ MORI_IO_LOG_LEVEL=info
+ ;;
+ mi355x)
+ expected_nodes="${CX_NODES:-}"; expected_gpn=8
+ [ "$expected_nodes" = 1 ] || [ "$expected_nodes" = 2 ] \
+ || cx_die "canonical AMD execution requires one or two nodes"
+ CX_IMAGE="$CX_IMAGE_AMD_MORI"
+ CX_IMAGE_DIGEST="$CX_IMAGE_AMD_MORI_DIGEST"
+ if [ "$expected_nodes" = 2 ]; then
+ CX_MORI_KERNEL_TYPE=internode-v1
+ else
+ CX_MORI_KERNEL_TYPE=intranode
+ fi
+ MORI_COMMIT="$CX_MORI_COMMIT_MI355"
+ ;;
+ *) cx_die "canonical CollectiveX runner is not registered" ;;
+ esac
+ # Only the AMD runners keep an operator-provided lock directory.
+ case "$runner:$trusted_lock_dir" in
+ mi325x:?*|mi355x:?*) export CX_LOCK_DIR="$trusted_lock_dir" ;;
+ esac
+ CX_STAGE_DIR="$trusted_stage_dir"
+ [ -z "$trusted_socket_ifname" ] \
+ || export CX_SOCKET_IFNAME="$trusted_socket_ifname"
+ [ -z "$trusted_rdma_devices" ] \
+ || export CX_RDMA_DEVICES="$trusted_rdma_devices"
+ [ -z "$trusted_ib_gid_index" ] \
+ || export CX_IB_GID_INDEX="$trusted_ib_gid_index"
+ [ -z "$trusted_rdma_service_level" ] \
+ || export CX_RDMA_SERVICE_LEVEL="$trusted_rdma_service_level"
+ export CX_STAGE_DIR
+ [ "${CX_NODES:-}" = "$expected_nodes" ] \
+ && [ "${CX_GPUS_PER_NODE:-}" = "$expected_gpn" ] \
+ || cx_die "canonical CollectiveX placement differs from the shard"
+ expected_world=$((expected_nodes * expected_gpn))
+ CX_NGPUS="$expected_world"
+ CX_SEED=67
+ case "$runner" in mi325x|mi355x) CX_RUN_TIMEOUT=1800 ;; *) CX_RUN_TIMEOUT=900 ;; esac
+ # Diagnostic-only switches and any previously recorded image identity are cleared.
+ unset CX_PUBLIC_RUNNER CX_GB_PRODUCT CX_DRYRUN CX_TIMING CX_ALLOW_MNNVL
+ unset CX_ENROOT_LOCAL_IMPORT COLLECTIVEX_IMAGE COLLECTIVEX_IMAGE_DIGEST
+ unset COLLECTIVEX_IMAGE_DIGEST_VERIFIED COLLECTIVEX_SQUASH_SHA256
+ export CX_IMAGE CX_IMAGE_DIGEST CX_NGPUS CX_SEED CX_RUN_TIMEOUT
+ case "$runner" in
+ h100-dgxc|h200-dgxc|b200-dgxc|b300) export CX_NCCL_HOME ;;
+ gb200|gb300) export CX_NCCL_HOME CX_MASTER_PORT ;;
+ mi325x)
+ export CX_MORI_KERNEL_TYPE MORI_COMMIT MORI_DISABLE_AUTO_XGMI MORI_ENABLE_SDMA
+ export MORI_APP_LOG_LEVEL MORI_SHMEM_LOG_LEVEL MORI_IO_LOG_LEVEL
+ ;;
+ mi355x) export CX_MORI_KERNEL_TYPE MORI_COMMIT ;;
+ esac
+}
+
+cx_reverify_registry_image() {
+ local image="$1" actual
+ [[ "${COLLECTIVEX_IMAGE_DIGEST:-}" =~ ^sha256:[0-9a-f]{64}$ ]] \
+ && [ "${COLLECTIVEX_IMAGE_DIGEST_VERIFIED:-0}" = 1 ] || return 1
+ actual="$(cx_resolve_registry_digest "$image")" || return 1
+ [ "$actual" = "$COLLECTIVEX_IMAGE_DIGEST" ] || {
+ cx_log "ERROR: configured image tag changed during container import"
+ return 1
+ }
+}
+
+cx_export_squash_identity() {
+ local image="$1" digest log
+ log="$(cx_private_log_path container-hash)"
+ digest="$(sha256sum "$image" 2>> "$log" | awk '{print $1}')"
+ [[ "$digest" =~ ^[0-9a-f]{64}$ ]] \
+ || { cx_fail_stage container-hash "$log"; return 1; }
+ export COLLECTIVEX_SQUASH_SHA256="$digest"
+}
+
+cx_squash_path() {
+ local squash_dir="$1" image="$2" key platform
+ [[ "${COLLECTIVEX_IMAGE_DIGEST:-}" =~ ^sha256:[0-9a-f]{64}$ ]] \
+ || return 1
+ case "${CX_IMAGE_PLATFORM:-}" in
+ linux/amd64) platform="" ;;
+ linux/arm64) platform="_linux_arm64" ;;
+ *) return 1 ;;
+ esac
+ key="${CX_SQUASH_FORMAT_VERSION}${platform}_${COLLECTIVEX_IMAGE_DIGEST#sha256:}_$(
+ printf '%s' "$image" | sed 's#[/:@#]#_#g'
+ )"
+ printf '%s' "$squash_dir/${key}.sqsh"
+}
+
+# cx_ensure_squash -> echoes the squash file path.
+# Imports via Enroot only if a valid squash is not already present, under a lock.
+# $1: squash directory (created if missing), $2: image reference.
+# Fails the container-import stage and returns 1 when CX_IMAGE_PLATFORM does not
+# match this machine, when the lock cannot be taken within 900 s, when the import
+# or the unsquashfs listing fails, or when the tag no longer resolves to the
+# verified digest after the import.
+cx_ensure_squash() {
+ local squash_dir="$1" image="$2" key sq locks lock_fd log
+ local enroot_local="" import_rc=0 machine
+ log="$(cx_private_log_path container-import)"
+ machine="$(uname -m)"
+ case "${CX_IMAGE_PLATFORM:-}:$machine" in
+ linux/amd64:x86_64|linux/amd64:amd64|linux/arm64:aarch64|linux/arm64:arm64) ;;
+ *) cx_fail_stage container-import "$log"; return 1 ;;
+ esac
+ mkdir -p "$squash_dir" 2>> "$log" \
+ || { cx_fail_stage container-import "$log"; return 1; }
+ sq="$(cx_squash_path "$squash_dir" "$image")" \
+ || { cx_fail_stage container-import "$log"; return 1; }
+ # The lock file shares its base name with the squash file.
+ key="${sq##*/}"
+ key="${key%.sqsh}"
+ locks="$squash_dir/.locks"
+ mkdir -p "$locks" 2>> "$log" \
+ || { cx_fail_stage container-import "$log"; return 1; }
+ { exec {lock_fd}>"$locks/${key}.lock"; } 2>> "$log" \
+ || { cx_fail_stage container-import "$log"; return 1; }
+ flock -w 900 "$lock_fd" 2>> "$log" \
+ || { cx_fail_stage container-import "$log"; return 1; }
+ # A squash that unsquashfs can list is treated as valid and reused.
+ if unsquashfs -l "$sq" >/dev/null 2>&1; then
+ cx_log "container squash ready"
+ else
+ cx_log "importing configured container image"
+ rm -f "$sq" 2>> "$log" \
+ || { cx_fail_stage container-import "$log"; return 1; }
+ # NOTE(review): the statement that opens the local-import branch is not visible
+ # here, and the next line looks like the tail of a truncated import command;
+ # confirm this region against the upstream file before relying on it.
+ # > "$log" 2>&1 || import_rc=$?
+ rm -rf -- "$enroot_local" >/dev/null 2>&1 || true
+ [ "$import_rc" = 0 ] \
+ || { cx_fail_stage container-import "$log"; return 1; }
+ else
+ SOURCE_DATE_EPOCH="$CX_SQUASH_SOURCE_DATE_EPOCH" \
+ enroot import -o "$sq" "docker://$image" > "$log" 2>&1 \
+ || { cx_fail_stage container-import "$log"; return 1; }
+ fi
+ unsquashfs -l "$sq" >> "$log" 2>&1 \
+ || { cx_fail_stage container-import "$log"; return 1; }
+ fi
+ # Re-check the tag while still holding the lock, so a tag that moved during the
+ # import is reported as a failure.
+ if ! cx_reverify_registry_image "$image" >> "$log" 2>&1; then
+ flock -u "$lock_fd" >/dev/null 2>&1 || true
+ exec {lock_fd}>&-
+ cx_fail_stage container-import "$log"
+ return 1
+ fi
+ flock -u "$lock_fd"
+ exec {lock_fd}>&-
+ echo "$sq"
+}
+
+# Import on an allocated compute node so multiarch tags resolve for the target
+# architecture. The squash directory must be shared with the submit host.
+cx_ensure_squash_on_job() {
+ local job_id="$1" squash_dir="$2" image="$3" lock_dir="${4:-}" sq key lock log
+ [[ "$job_id" =~ ^[0-9]+$ ]] || return 1
+ sq="$(cx_squash_path "$squash_dir" "$image")" || return 1
+ key="${sq##*/}"
+ key="${key%.sqsh}"
+ [ -n "$lock_dir" ] || lock_dir="$squash_dir/.locks"
+ lock="$lock_dir/${key}.lock"
+ log="$(cx_private_log_path container-import)"
+ if ! srun --jobid="$job_id" --nodes=1 --ntasks=1 --chdir=/tmp \
+ --export="$(cx_host_exports)" \
+ bash -s -- "$sq" "$lock" "$image" "$CX_SQUASH_SOURCE_DATE_EPOCH" \
+ "$CX_IMAGE_PLATFORM" \
+ > "$log" 2>&1 <<'BASH'
+set -euo pipefail
+sq="$1"; lock="$2"; image="$3"; source_date_epoch="$4"; platform="$5"
+machine="$(uname -m)"
+case "$platform:$machine" in
+ linux/amd64:x86_64|linux/amd64:amd64|linux/arm64:aarch64|linux/arm64:arm64) ;;
+ *) exit 13 ;;
+esac
+compute_home="$(mktemp -d /tmp/inferencex-collectivex-home.XXXXXX)"
+trap 'rm -rf -- "$compute_home"' EXIT
+export HOME="$compute_home" XDG_CACHE_HOME="$compute_home/.cache"
+export ENROOT_TEMP_PATH="$compute_home/enroot-tmp"
+export ENROOT_CACHE_PATH="$compute_home/enroot-cache"
+export ENROOT_DATA_PATH="$compute_home/enroot-data"
+export ENROOT_RUNTIME_PATH="$compute_home/enroot-run"
+mkdir -p "$(dirname "$sq")" "$(dirname "$lock")" \
+ "$ENROOT_TEMP_PATH" "$ENROOT_CACHE_PATH" "$ENROOT_DATA_PATH" "$ENROOT_RUNTIME_PATH"
+exec 9>"$lock"
+flock -w 900 9
+if unsquashfs -l "$sq" >/dev/null 2>&1; then
+ echo 'container squash ready'
+else
+ rm -f -- "$sq"
+ SOURCE_DATE_EPOCH="$source_date_epoch" \
+ enroot import -o "$sq" "docker://$image" /dev/null 2>&1
+fi
+BASH
+ then
+ cx_fail_stage container-import "$log"
+ return 1
+ fi
+ if ! cx_reverify_registry_image "$image" >> "$log" 2>&1; then
+ cx_fail_stage container-import "$log"
+ return 1
+ fi
+ printf '%s' "$sq"
+}
+
+# Check from every allocated node that the staged repository and the squash file
+# are visible, and that files written on each node are visible to the submit host.
+# $1: Slurm job ID, $2: node count, $3: staged repository root, $4: squash file,
+# $5: optional shard path relative to experimental/CollectiveX.
+# A probe directory with a token file is created under the staged root; each node
+# copies the token to node-<SLURM_NODEID>, and the copies are compared on the
+# submit host afterwards. The probe directory is always removed.
+# Remote exit codes map to stages: 10 and 11 -> repository-stage, 12 ->
+# container-hash, anything else -> container-launch. Returns 0 on success, else 1.
+cx_preflight_allocation() {
+ local job_id="$1" nodes="$2" mount_src="$3" squash="$4" shard="${5:-}"
+ local log rc=0 runtime shard_path="" probe_root probe_token index
+ runtime="$mount_src/experimental/CollectiveX/runtime/run_in_container.sh"
+ [ -z "$shard" ] || shard_path="$mount_src/experimental/CollectiveX/$shard"
+ log="$(cx_private_log_path allocation-preflight)"
+ probe_root="$mount_src/.collectivex-preflight"
+ probe_token="$probe_root/source"
+ # A pre-existing probe path (of any type) is treated as a failure, not reused.
+ if [ -e "$probe_root" ] || [ -L "$probe_root" ] \
+ || ! mkdir -m 700 "$probe_root"; then
+ cx_fail_stage repository-stage "$log"
+ return 1
+ fi
+ if ! printf '%s\n' "${COLLECTIVEX_EXECUTION_ID:-manual-$$}" > "$probe_token" \
+ || ! chmod 600 "$probe_token"; then
+ chmod 700 "$probe_root" >/dev/null 2>&1 || true
+ rm -rf -- "$probe_root" >/dev/null 2>&1 || true
+ cx_fail_stage repository-stage "$log"
+ return 1
+ fi
+ # One task per node; the script below receives runtime path, shard path, squash
+ # path, platform and probe directory as $1..$5.
+ srun --jobid="$job_id" --nodes="$nodes" --ntasks="$nodes" --ntasks-per-node=1 \
+ --chdir=/tmp --input=all \
+ --export="$(cx_host_exports)" bash -s -- "$runtime" "$shard_path" "$squash" \
+ "$CX_IMAGE_PLATFORM" "$probe_root" \
+ > "$log" 2>&1 <<'BASH' || rc=$?
+set -euo pipefail
+machine="$(uname -m)"
+case "$4:$machine" in
+ linux/amd64:x86_64|linux/amd64:amd64|linux/arm64:aarch64|linux/arm64:arm64) ;;
+ *) exit 13 ;;
+esac
+test -r "$1" || exit 10
+[ -z "$2" ] || test -r "$2" || exit 11
+test -r "$3" || exit 12
+unsquashfs -s "$3" >/dev/null 2>&1 || exit 12
+case "${SLURM_NODEID:-}" in ""|*[!0-9]*) exit 10 ;; esac
+[ -d "$5" ] && [ ! -L "$5" ] && [ -r "$5/source" ] || exit 10
+(set -C; cat "$5/source" > "$5/node-$SLURM_NODEID") || exit 10
+cmp -s -- "$5/source" "$5/node-$SLURM_NODEID" || exit 10
+BASH
+ # Confirm from the submit host that every node's copy of the token arrived intact.
+ if [ "$rc" = 0 ]; then
+ for ((index = 0; index < nodes; index++)); do
+ if ! cmp -s -- "$probe_token" "$probe_root/node-$index"; then
+ rc=10
+ break
+ fi
+ done
+ fi
+ # A cleanup failure is itself reported as a repository-stage failure.
+ if [ -d "$probe_root" ] && [ ! -L "$probe_root" ]; then
+ chmod 700 "$probe_root" >/dev/null 2>&1 || rc=10
+ fi
+ rm -rf -- "$probe_root" >/dev/null 2>&1 || rc=10
+ [ "$rc" = 0 ] && return 0
+ case "$rc" in
+ 10|11) cx_fail_stage repository-stage "$log" ;;
+ 12) cx_fail_stage container-hash "$log" ;;
+ *) cx_fail_stage container-launch "$log" ;;
+ esac
+ return 1
+}
+
+# Resolve the exact per-execution child before any copy starts, so the parent
+# EXIT trap can remove an interrupted partial stage. The configured base must
+# already exist on compute-visible storage and must not traverse symlinks.
+#   $1 repo_root   absolute, already-resolved path of the checkout
+#   $2 stage_base  optional staging base; when empty or equal to repo_root the
+#                  stage is placed under CX_SQUASH_DIR instead, which is
+#                  rejected when COLLECTIVEX_CANONICAL_GHA=1
+# Prints the stage path without a trailing newline. Dies on an invalid
+# execution identity or a missing CX_SQUASH_DIR; returns non-zero when the
+# embedded check rejects the base.
+cx_stage_path() {
+  local repo_root="$1" stage_base="${2:-}" tag safe_tag stage_path
+  tag="${COLLECTIVEX_EXECUTION_ID:-${GITHUB_RUN_ID:-manual-$$}}"
+  [[ "$tag" =~ ^[A-Za-z0-9][A-Za-z0-9._-]*$ ]] \
+    || cx_die "invalid staging execution identity"
+  safe_tag="$(printf '%s' "$tag" | tr -c 'A-Za-z0-9._-' '_')"
+  if [ -z "$stage_base" ] || [ "$stage_base" = "$repo_root" ]; then
+    [ "${COLLECTIVEX_CANONICAL_GHA:-0}" != 1 ] \
+      || cx_die "canonical CollectiveX execution requires compute-visible staging"
+    [ -n "${CX_SQUASH_DIR:-}" ] \
+      || cx_die "manual CollectiveX staging requires CX_SQUASH_DIR"
+    stage_base="$CX_SQUASH_DIR"
+    stage_path="${stage_base%/}/.collectivex-stage-$safe_tag"
+  else
+    stage_path="${stage_base%/}/job_$safe_tag"
+  fi
+  # The check below accepts the child only when: repo and base are absolute and
+  # equal to their own realpath; the child is a direct, not yet existing entry
+  # of base; base is neither equal to nor inside repo_root, CX_JOB_ROOT or
+  # GITHUB_WORKSPACE; and base is a directory owned by the current user that is
+  # not group/world writable and is writable and searchable by this process.
+  python3 - "$repo_root" "$stage_base" "$stage_path" \
+    "${CX_JOB_ROOT:-}" "${GITHUB_WORKSPACE:-}" <<'PY'
+import os
+import stat
+import sys
+
+repo, base, child, job_root, workspace = sys.argv[1:]
+try:
+    if (
+        not os.path.isabs(repo)
+        or os.path.realpath(repo) != repo
+        or not os.path.isabs(base)
+        or os.path.realpath(base) != base
+        or not os.path.isabs(child)
+        or os.path.dirname(child) != base.rstrip("/")
+        or os.path.lexists(child)
+    ):
+        raise OSError
+    metadata = os.stat(base, follow_symlinks=False)
+    excluded = [repo]
+    excluded.extend(path for path in (job_root, workspace) if path)
+    for path in excluded:
+        resolved = os.path.realpath(path)
+        if os.path.commonpath((base, resolved)) == resolved:
+            raise OSError
+    if (
+        not stat.S_ISDIR(metadata.st_mode)
+        or metadata.st_uid != os.getuid()
+        or stat.S_IMODE(metadata.st_mode) & (stat.S_IWGRP | stat.S_IWOTH)
+        or not os.access(base, os.W_OK | os.X_OK)
+    ):
+        raise OSError
+except OSError:
+    raise SystemExit(1)
+print(child, end="")
+PY
+}
+
+# Stage only the public benchmark tree into a pre-resolved, private execution
+# child. A runner-owned marker makes recursive cleanup an explicit capability.
+#   $1 repo_root  checkout that contains experimental/CollectiveX
+#   $2 stage_dir  stage path; must equal what cx_stage_path derives from
+#                 CX_STAGE_DIR for this execution
+# Dies when the stage base is unsafe, the path differs from the derived one,
+# the stage already exists, or the directory/marker cannot be created.
+# Returns 1, after removing the partial stage, when rsync fails.
+# Side effect: the shell umask is left at 077.
+cx_stage_repo() {
+  local repo_root="$1" stage_dir="$2" expected log tag marker
+  cx_validate_shard_control "$repo_root/experimental/CollectiveX"
+  expected="$(cx_stage_path "$repo_root" "${CX_STAGE_DIR:-}")" \
+    || cx_die "configured stage base is unavailable or unsafe"
+  [ "$stage_dir" = "$expected" ] \
+    || cx_die "execution stage differs from the configured stage base"
+  tag="${COLLECTIVEX_EXECUTION_ID:-${GITHUB_RUN_ID:-manual-$$}}"
+  if [ -e "$stage_dir" ] || [ -L "$stage_dir" ]; then
+    cx_die "refusing to reuse a pre-existing execution stage"
+  fi
+  mkdir -m 700 "$stage_dir" 2>/dev/null \
+    || cx_die "cannot create the configured stage directory"
+  chmod 700 "$stage_dir" 2>/dev/null \
+    || cx_die "cannot protect the configured stage directory"
+  marker="$stage_dir/.collectivex-stage-v1"
+  umask 077
+  # noclobber (set -C) makes the redirect fail if the marker already exists.
+  (set -C; printf 'collectivex-stage-v1\n%s\n' "$tag" > "$marker") 2>/dev/null \
+    || cx_die "cannot claim the configured stage directory"
+  chmod 600 "$marker" 2>/dev/null \
+    || cx_die "cannot protect the configured stage directory"
+  mkdir -m 700 "$stage_dir/experimental" 2>/dev/null \
+    || cx_die "cannot create the configured stage directory"
+  cx_log "staging CollectiveX on compute-visible storage"
+  log="$(cx_private_log_path repository-stage)"
+  # Caches, results, generated workloads/backends/sources and the listed
+  # private files are excluded from the staged copy.
+  if ! rsync -a --delete --delete-excluded \
+    --exclude='__pycache__/' --exclude='results/' --exclude='.cx_workloads/' \
+    --exclude='.cx_backend/' --exclude='.cx_sources/' \
+    --exclude='configs/platforms.yaml' --exclude='private-infra.md' \
+    --exclude='goal.md' --exclude='notes.md' \
+    "$repo_root/experimental/CollectiveX" "$stage_dir/experimental/" > "$log" 2>&1; then
+    rm -rf -- "$stage_dir" >/dev/null 2>&1 \
+      || cx_log "ERROR: cannot remove the incomplete execution stage"
+    cx_fail_stage repository-stage "$log" || true
+    return 1
+  fi
+}
+
+# cx_collect_results MOUNT_SRC REPO_ROOT
+# Copy the result JSON files of a staged run from
+# MOUNT_SRC/experimental/CollectiveX/results into the same directory of the
+# original checkout, which is where the workflow's upload-artifact step looks.
+# Returns 0 immediately when no staging was used (MOUNT_SRC equals REPO_ROOT).
+# Returns 1 when the destination cannot be created, the staged run produced no
+# *.json file, or the copy fails.
+cx_collect_results() {
+  local mount_src="$1" repo_root="$2"
+  local checkout_results capture
+  local -a staged
+  if [ "$mount_src" = "$repo_root" ]; then
+    return 0
+  fi
+  capture="$(cx_private_log_path "artifact-collection-$$-${RANDOM}")"
+  checkout_results="$repo_root/experimental/CollectiveX/results"
+  if ! mkdir -p "$checkout_results" 2>> "$capture"; then
+    cx_log "ERROR: cannot create checkout result directory"
+    return 1
+  fi
+  # nullglob makes an unmatched pattern expand to an empty array.
+  shopt -s nullglob
+  staged=("$mount_src/experimental/CollectiveX/results/"*.json)
+  shopt -u nullglob
+  if [ "${#staged[@]}" -eq 0 ]; then
+    cx_log "ERROR: staged run produced no result JSON"
+    return 1
+  fi
+  if ! cp -- "${staged[@]}" "$checkout_results/" >> "$capture" 2>&1; then
+    cx_log "ERROR: staged result collection failed"
+    return 1
+  fi
+  cx_log "collected staged results for artifact validation"
+}
+
+# Remove the per-execution stage directory created for this run.
+#   $1 mount_src  directory the run used; nothing happens when it equals repo_root
+#   $2 repo_root  original checkout
+# The path must equal the name derived from CX_STAGE_DIR (or CX_SQUASH_DIR) and
+# the execution tag, and must pass the ownership check below, before rm -rf runs.
+# Returns 0 when nothing had to be removed or removal succeeded, 1 otherwise.
+cx_cleanup_stage() {
+  local mount_src="$1" repo_root="$2" base="${CX_STAGE_DIR:-}" tag safe_tag expected
+  tag="${COLLECTIVEX_EXECUTION_ID:-${GITHUB_RUN_ID:-manual-$$}}"
+  safe_tag="$(printf '%s' "$tag" | tr -c 'A-Za-z0-9._-' '_')"
+  [ "$mount_src" != "$repo_root" ] || return 0
+  # Recompute the expected stage name with the same two layouts cx_stage_path uses.
+  if [ -n "$base" ] && [ "$base" != "$repo_root" ]; then
+    expected="${base%/}/job_$safe_tag"
+  else
+    [ -n "${CX_SQUASH_DIR:-}" ] \
+      || { cx_log "ERROR: cannot identify the generated stage directory"; return 1; }
+    expected="${CX_SQUASH_DIR%/}/.collectivex-stage-$safe_tag"
+  fi
+  # Never remove anything other than the derived child: not "/", not the base.
+  if [ "$mount_src" != "$expected" ] || [ "$mount_src" = / ] \
+    || { [ -n "$base" ] && [ "$mount_src" = "$base" ]; }; then
+    cx_log "ERROR: refusing to remove an unrecognized stage directory"
+    return 1
+  fi
+  # Ownership check: the stage must be a real directory owned by the current
+  # user with mode 0700. If the marker exists it must be a regular 0600 file
+  # owned by the current user, and either name this execution or be the only
+  # entry. Without a marker only an empty directory is accepted.
+  if ! python3 - "$mount_src" "$tag" <<'PY'
+import os
+from pathlib import Path
+import stat
+import sys
+
+root = Path(sys.argv[1])
+expected = f"collectivex-stage-v1\n{sys.argv[2]}\n"
+try:
+    metadata = os.stat(root, follow_symlinks=False)
+    marker = root / ".collectivex-stage-v1"
+    if (
+        not stat.S_ISDIR(metadata.st_mode)
+        or metadata.st_uid != os.getuid()
+        or (stat.S_IMODE(metadata.st_mode) & 0o777) != 0o700
+    ):
+        raise OSError
+    entries = list(root.iterdir())
+    if marker.exists():
+        marker_metadata = os.stat(marker, follow_symlinks=False)
+        if (
+            not stat.S_ISREG(marker_metadata.st_mode)
+            or marker_metadata.st_uid != os.getuid()
+            or stat.S_IMODE(marker_metadata.st_mode) != 0o600
+        ):
+            raise OSError
+        marker_content = marker.read_text()
+        if marker_content != expected and entries != [marker]:
+            raise OSError
+    elif entries:
+        raise OSError
+except (OSError, UnicodeError):
+    raise SystemExit(1)
+PY
+  then
+    cx_log "ERROR: refusing to remove an unowned stage directory"
+    return 1
+  fi
+  rm -rf -- "$mount_src" >/dev/null 2>&1 || {
+    cx_log "ERROR: cannot remove generated stage directory"
+    return 1
+  }
+  cx_log "removed generated per-execution stage directory"
+}
+
+# Succeed only when "contracts.py probe" accepts the file at $1 as a complete
+# JSON result object; all probe output is discarded. Callers check this before
+# synthesizing a terminal outcome, so that an emitted invalid result is not
+# shadowed by a second record for the same attempt.
+cx_has_result_doc() {
+  local doc="$1"
+  python3 "$_CX_COMMON_ROOT/contracts.py" probe "$doc" &> /dev/null
+}
+
+# Succeed only when "contracts.py probe" accepts the result document at $1 with
+# the status given in $2 (passed as --status); all probe output is discarded.
+cx_result_doc_is() {
+  local doc="$1" wanted="$2"
+  local -a probe=(python3 "$_CX_COMMON_ROOT/contracts.py" probe "$doc" --status "$wanted")
+  "${probe[@]}" > /dev/null 2>&1
+}
+
+# Demote the result document at $1 using the command's return code $2.
+# Rank zero can write its result before another rank or the backend teardown
+# fails; the measurements are kept, but the nonzero terminal status of the
+# distributed command is made authoritative.
+cx_demote_result_doc() {
+  local doc="$1" exit_code="$2"
+  python3 "$_CX_COMMON_ROOT/contracts.py" demote "$doc" \
+    --return-code "$exit_code"
+}
+
+# Hand the result path $1 to "contracts.py quarantine-invalid" and return its
+# exit status.
+cx_quarantine_result_doc() {
+  local doc="$1"
+  python3 "$_CX_COMMON_ROOT/contracts.py" quarantine-invalid "$doc"
+}
+
+# cx_emit_ep_failed_case OUT BACKEND PHASE RC
+# Write a terminal-outcome record to OUT through "contracts.py emit-terminal".
+# Used for rack launchers that invoke run_ep.py directly and therefore cannot
+# rely on the emitter in run_in_container.sh. Case identity is read from the
+# exported CX_* variables; a non-empty CX_FAILURE_MODE is forwarded as
+# --failure-mode. Returns 1 after logging when the record cannot be written.
+cx_emit_ep_failed_case() {
+  local out="$1" backend="$2" phase="$3" rc="$4"
+  local -a emit_cmd=(python3 "$_CX_COMMON_ROOT/contracts.py" emit-terminal
+    --out "$out" --backend "$backend" --phase "$phase" --return-code "$rc")
+  if [ -n "${CX_FAILURE_MODE:-}" ]; then
+    emit_cmd+=(--failure-mode "$CX_FAILURE_MODE")
+  fi
+  "${emit_cmd[@]}" && return 0
+  cx_log "ERROR: could not preserve terminal outcome"
+  return 1
+}
+
+# Report whether a valid attempt for a case is already recorded.
+#   $1 out_dir  directory whose *.json files are scanned
+#   $2 case_id  identity.case_id to look for
+# Returns 0 when a raw-attempt or terminal document that passes contract
+# validation carries that case id, 1 otherwise.
+# Side effects: a *.json file that fails loading or validation is renamed with
+# a ".quarantine" suffix, together with the sample file it names when that name
+# is a plain file name; sample documents that no valid raw attempt references
+# are quarantined as well. Documents of any other format are ignored.
+cx_case_attempt_exists() {
+  local out_dir="$1" case_id="$2"
+  python3 - "$_CX_COMMON_ROOT" "$out_dir" "$case_id" <<'PY'
+import pathlib, sys
+
+sys.path.insert(0, sys.argv[1])
+import contracts
+
+sample_paths = set()
+referenced_samples = set()
+found = False
+
+def quarantine(path, document):
+    sample = document.get("sample_artifact") if isinstance(document, dict) else None
+    if (
+        isinstance(sample, dict)
+        and isinstance(sample.get("path"), str)
+        and pathlib.Path(sample["path"]).name == sample["path"]
+    ):
+        sample_path = path.with_name(sample["path"])
+        if sample_path.is_file():
+            sample_path.replace(sample_path.with_name(sample_path.name + ".quarantine"))
+    if path.is_file():
+        path.replace(path.with_name(path.name + ".quarantine"))
+
+for path in pathlib.Path(sys.argv[2]).glob("*.json"):
+    document = None
+    try:
+        document = contracts.strict_load(path)
+        if not isinstance(document, dict):
+            continue
+        if document.get("format") == contracts.RAW_FORMAT:
+            document = contracts.load_raw_attempt(path)
+            referenced_samples.add(path.with_name(document["sample_artifact"]["path"]))
+        elif document.get("format") == contracts.TERMINAL_FORMAT:
+            document = contracts.validate_terminal_document(document)
+        elif document.get("format") == contracts.SAMPLES_FORMAT:
+            contracts.validate_samples_document(document)
+            sample_paths.add(path)
+            continue
+        else:
+            continue
+    except (contracts.ContractError, OSError, ValueError):
+        quarantine(path, document)
+        continue
+    if document["identity"]["case_id"] == sys.argv[3]:
+        found = True
+for orphan in sample_paths - referenced_samples:
+    quarantine(orphan, {})
+raise SystemExit(0 if found else 1)
+PY
+}
+
+# Emit one setup-failure record per requested case. Rack launchers call this when
+# backend preparation fails before rank processes can start.
+#   $1 root     directory used to resolve a relative CX_SHARD_FILE
+#   $2 out_dir  directory that receives the failed_*.json records (created if absent)
+#   $3 backend  backend name recorded in every terminal document
+#   $4 rc       return code recorded in every terminal document
+# Without CX_SHARD_FILE one record is written per phase selected by CX_PHASE.
+# With a shard, each case's identity is exported through CX_* variables before
+# its record is written; a case that already has a valid attempt in out_dir is
+# counted as covered instead of being emitted again.
+# CX_FAILURE_MODE is exported only while this function runs and the temporary
+# case list is removed on every exit path, including emitter failures.
+# CX_ATTEMPT_ID=1 and the per-case CX_* exports remain set afterwards.
+# Returns 0 when every requested case is emitted or covered, 1 otherwise.
+cx_emit_setup_failures() {
+  local root="$1" out_dir="$2" backend="$3" rc="$4" shard="${CX_SHARD_FILE:-}" path
+  local phase case_id suite workload required routing eplb ep hidden topk experts nodes
+  local gpn domain ladder canonical timing mode scope scale_up_transport scale_out_transport
+  local warmup_semantics
+  local transport topology_class
+  local cases_file expected emitted=0 covered=0
+  mkdir -p "$out_dir" || return 1
+  export CX_FAILURE_MODE="${CX_FAILSAFE_MODE:-setup}" CX_ATTEMPT_ID=1
+  if [ -z "$shard" ]; then
+    # Manual (shard-less) run: one record per requested phase.
+    local phases="${CX_PHASE:-decode}"
+    [ "$phases" = both ] && phases="decode prefill"
+    for phase in $phases; do
+      if [ -n "${CX_CASE_ID:-}" ] && cx_case_attempt_exists "$out_dir" "$CX_CASE_ID"; then
+        continue
+      fi
+      cx_emit_ep_failed_case "$out_dir/failed_${backend}_${phase}_${CX_TS:-setup}-a01.json" \
+        "$backend" "$phase" "$rc" || { unset CX_FAILURE_MODE; return 1; }
+    done
+    unset CX_FAILURE_MODE
+    return 0
+  fi
+  # Accept the shard path as given, or relative to root.
+  path="$shard"
+  [ -f "$path" ] || path="${root%/}/$shard"
+  [ -f "$path" ] || {
+    unset CX_FAILURE_MODE
+    cx_log "ERROR: cannot emit setup failures without shard control"
+    return 1
+  }
+  export COLLECTIVEX_CONTROL_SHA256
+  COLLECTIVEX_CONTROL_SHA256="$(sha256sum "$path" | awk '{print $1}')"
+  [[ "$COLLECTIVEX_CONTROL_SHA256" =~ ^[0-9a-f]{64}$ ]] || {
+    unset CX_FAILURE_MODE COLLECTIVEX_CONTROL_SHA256
+    cx_log "ERROR: cannot hash shard for setup-failure records"
+    return 1
+  }
+  cases_file="$(mktemp)" || { unset CX_FAILURE_MODE; return 1; }
+  # Flatten every shard case into one '|'-separated line; the field order must
+  # match the read statement in the loop below.
+  if ! python3 - "$path" > "$cases_file" <<'PY'
+import json, sys
+
+with open(sys.argv[1]) as handle:
+    cases = json.load(handle)["cases"]
+for case in cases:
+    fields = (
+        case["phase"], case["mode"], case["case_id"], case["suite"], case["workload"],
+        case["required_publication"], case["routing"], "1" if case["eplb"] else "",
+        case["ep"], case["hidden"], case["topk"], case["experts"], case["nodes"],
+        case["gpus_per_node"], case["scale_up_domain"], case["scope"],
+        case["scale_up_transport"], case.get("scale_out_transport") or "",
+        case["transport"], case["topology_class"], case["ladder"],
+        case["warmup_semantics"],
+        "1" if case["canonical"] else "", case["timing"],
+    )
+    print("|".join(map(str, fields)))
+PY
+  then
+    rm -f "$cases_file"
+    unset CX_FAILURE_MODE
+    return 1
+  fi
+  expected="$(wc -l < "$cases_file" | tr -d ' ')"
+  [ "$expected" -gt 0 ] || { rm -f "$cases_file"; unset CX_FAILURE_MODE; return 1; }
+  while IFS='|' read -r phase mode case_id suite workload required routing eplb ep hidden topk \
+    experts nodes gpn domain scope scale_up_transport scale_out_transport transport \
+    topology_class ladder warmup_semantics canonical timing; do
+    # The emitter reads case identity from the environment, so export it first.
+    export CX_CASE_ID="$case_id" CX_SUITE="$suite" CX_WORKLOAD_NAME="$workload"
+    export CX_REQUIRED_PUBLICATION="$required" CX_ROUTING="$routing" CX_EPLB="$eplb"
+    export CX_EP="$ep" CX_NGPUS="$ep" CX_HIDDEN="$hidden" CX_TOPK="$topk" CX_EXPERTS="$experts"
+    export CX_MODE="$mode" CX_NODES="$nodes" CX_GPUS_PER_NODE="$gpn"
+    export CX_SCALE_UP_DOMAIN="$domain" CX_SCOPE="$scope"
+    export CX_SCALE_UP_TRANSPORT="$scale_up_transport"
+    export CX_SCALE_OUT_TRANSPORT="$scale_out_transport"
+    export CX_TRANSPORT="$transport" CX_TOPO="$topology_class"
+    export CX_TOKENS_LADDER="$ladder" CX_CANONICAL="$canonical"
+    export CX_WARMUP_SEMANTICS="$warmup_semantics"
+    # timing is "iters:trials:warmup".
+    IFS=: read -r CX_ITERS CX_TRIALS CX_WARMUP <<< "$timing"
+    export CX_ITERS CX_TRIALS CX_WARMUP CX_SAMPLES_PER_POINT="$((CX_ITERS * CX_TRIALS))"
+    if cx_case_attempt_exists "$out_dir" "$case_id"; then
+      covered=$((covered + 1))
+      continue
+    fi
+    # On emitter failure, clean up exactly like the other error paths do.
+    cx_emit_ep_failed_case "$out_dir/failed_${case_id}-a01.json" "$backend" "$phase" "$rc" \
+      || { rm -f "$cases_file"; unset CX_FAILURE_MODE; return 1; }
+    emitted=$((emitted + 1))
+  done < "$cases_file"
+  rm -f "$cases_file"
+  unset CX_FAILURE_MODE
+  # Every enumerated case must be either newly emitted or already covered.
+  [ "$((emitted + covered))" -eq "$expected" ] || {
+    cx_log "ERROR: covered $((emitted + covered))/$expected terminal cases"
+    return 1
+  }
+}
+
+# Run one validated shard with one Slurm task per GPU. Launchers provide only
+# allocation/container policy through globals and CX_DISTRIBUTED_CONTAINER_ARGS.
+# Globals read: NODES NGPUS GPN SCALE_UP_DOMAIN JOB_ID SQUASH_FILE
+# CONTAINER_MOUNTS SOURCE_BACKEND_ENV BACKEND_PROBE WRAP MOUNT_SRC CX_DIR
+# CX_BENCH RUNNER TS, plus CX_SHARD_FILE or the manual CX_* case knobs.
+# Steps: prepare and probe the backend once per node, enumerate the cases,
+# then run each case and reconcile its result document with the exit status.
+# Every srun reads stdin from /dev/null and writes to its private log; inside
+# the case loop this also keeps srun from consuming the case list on stdin.
+# Returns 0 when every case succeeded, the preparation status when backend
+# setup fails, and 1 when at least one case failed. Dies on an incomplete
+# launcher configuration or an unreadable/empty case list.
+# shellcheck disable=SC2153
+cx_run_distributed_shard() {
+  local build_log build_rc cases_file expected_cases ci=0 failed_cases=0
+  local ph mode routing eplb hidden topk experts ladder suite workload required_pub
+  local canonical case_id ep timing case_iters case_trials case_warmup case_stem
+  local scope scale_up_transport scale_out_transport transport topology_class nodes gpn domain
+  local workload_dir workload_ladder workload_log stage_rc attempt_tag out failure_out
+  local runtime_log run_rc expected_out case_ok summary_log
+  local -a container_args workload_args ep_args
+  [ "${NODES:-0}" -gt 1 ] && [ "${NGPUS:-0}" = "$((NODES * GPN))" ] \
+    || cx_die "invalid distributed launcher placement"
+  [ -n "${JOB_ID:-}" ] && [ -n "${SQUASH_FILE:-}" ] \
+    && [ -n "${CONTAINER_MOUNTS:-}" ] || cx_die "distributed launcher is incomplete"
+  [ -n "${SOURCE_BACKEND_ENV:-}" ] && [ -n "${BACKEND_PROBE:-}" ] \
+    && [ -n "${WRAP:-}" ] || cx_die "distributed rank wrapper is incomplete"
+
+  cx_resolve_slurm_rendezvous "$JOB_ID"
+  mkdir -p "$MOUNT_SRC/experimental/CollectiveX/results"
+  container_args=(--container-mounts="$CONTAINER_MOUNTS" --no-container-mount-home
+    --container-workdir=/ix/experimental/CollectiveX --no-container-entrypoint)
+  if declare -p CX_DISTRIBUTED_CONTAINER_ARGS >/dev/null 2>&1; then
+    container_args+=("${CX_DISTRIBUTED_CONTAINER_ARGS[@]}")
+  fi
+  local container_name="cxep_${JOB_ID}"
+
+  cx_log "distributed backend preparation: bench=$CX_BENCH nodes=$NODES"
+  cx_set_failure_stage backend-setup
+  build_log="$(cx_private_log_path backend-prepare)"
+  # errexit is suspended so that the srun status can be inspected below.
+  set +e
+  srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks-per-node=1 --chdir=/tmp \
+    --container-name="$container_name" --container-image="$SQUASH_FILE" \
+    "${container_args[@]}" --export="$(cx_container_exports),CX_BUILD_ONLY=1" \
+    bash /ix/experimental/CollectiveX/runtime/run_in_container.sh \
+    < /dev/null > "$build_log" 2>&1
+  build_rc=$?
+  if [ "$build_rc" = 0 ]; then
+    # The probe reuses the named container; its output is appended so that the
+    # preparation output stays available when the probe fails.
+    srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks-per-node=1 --chdir=/tmp \
+      --container-name="$container_name" "${container_args[@]}" \
+      --export="$(cx_container_exports)" bash -c "$BACKEND_PROBE" \
+      < /dev/null >> "$build_log" 2>&1
+    build_rc=$?
+  fi
+  set -e
+  if [ "$build_rc" != 0 ]; then
+    cx_fail_stage backend-setup "$build_log" || true
+    cx_emit_setup_failures "$CX_DIR" "$MOUNT_SRC/experimental/CollectiveX/results" \
+      "$CX_BENCH" "$build_rc"
+    return "$build_rc"
+  fi
+  cx_set_failure_stage execution
+
+  # Enumerate the cases as '|'-separated lines, from the shard when one is
+  # configured and from the manual CX_* knobs otherwise.
+  cases_file="$(mktemp)" || return 1
+  local shard="${CX_SHARD_FILE:-}"
+  [ -z "$shard" ] || [ -f "$shard" ] || shard="$CX_DIR/$shard"
+  if [ -n "$shard" ]; then
+    if [ ! -f "$shard" ] || ! python3 - "$shard" > "$cases_file" <<'PY'
+import json
+import sys
+
+with open(sys.argv[1]) as handle:
+    cases = json.load(handle)["cases"]
+for case in cases:
+    get = lambda key, default="": str(case.get(key) or default)
+    fields = (
+        get("phase", "decode"), get("mode", "normal"), get("routing", "uniform"),
+        "1" if case.get("eplb") else "", get("hidden", "7168"),
+        get("topk", "8"), get("experts", "256"), get("ladder"),
+        get("suite"), get("workload"), get("required_publication"),
+        "1" if case.get("canonical") else "", get("case_id"), get("ep"),
+        get("timing", "8:64:32"), get("nodes"), get("gpus_per_node"),
+        get("scale_up_domain"), get("scope"), get("scale_up_transport"),
+        get("scale_out_transport"), get("transport"), get("topology_class"),
+    )
+    print("|".join(fields))
+PY
+    then
+      rm -f "$cases_file"
+      cx_die "could not enumerate validated shard cases"
+    fi
+  else
+    local phases="${CX_PHASE:-decode}" phase
+    [ "$phases" = both ] && phases="decode prefill"
+    cx_require_record_safe "$phases" "${CX_MODE:-normal}" "${CX_ROUTING:-uniform}" \
+      "${CX_EPLB:-}" "${CX_HIDDEN:-7168}" "${CX_TOPK:-8}" "${CX_EXPERTS:-256}" \
+      "${CX_TOKENS_LADDER:-}" "${CX_SUITE:-}" "${CX_WORKLOAD_NAME:-}" \
+      "${CX_REQUIRED_PUBLICATION:-}" "${CX_CANONICAL:-}" "${CX_CASE_ID:-}" \
+      "${CX_ITERS:-8}" "${CX_TRIALS:-64}" "${CX_WARMUP:-32}" \
+      "${CX_SCOPE:-scale-up}" \
+      "${CX_SCALE_UP_TRANSPORT:-unknown}" "${CX_SCALE_OUT_TRANSPORT:-}" \
+      "${CX_TRANSPORT:-unknown}" "${CX_TOPO:-manual}"
+    for phase in $phases; do
+      (IFS='|'; printf '%s\n' "$phase|${CX_MODE:-normal}|${CX_ROUTING:-uniform}|${CX_EPLB:-}|${CX_HIDDEN:-7168}|${CX_TOPK:-8}|${CX_EXPERTS:-256}|${CX_TOKENS_LADDER:-}|${CX_SUITE:-}|${CX_WORKLOAD_NAME:-}|${CX_REQUIRED_PUBLICATION:-}|${CX_CANONICAL:-}|${CX_CASE_ID:-}|$NGPUS|${CX_ITERS:-8}:${CX_TRIALS:-64}:${CX_WARMUP:-32}|$NODES|$GPN|$SCALE_UP_DOMAIN|${CX_SCOPE:-scale-up}|${CX_SCALE_UP_TRANSPORT:-unknown}|${CX_SCALE_OUT_TRANSPORT:-}|${CX_TRANSPORT:-unknown}|${CX_TOPO:-manual}")
+    done > "$cases_file"
+  fi
+  expected_cases="$(wc -l < "$cases_file" | tr -d ' ')"
+  [ "$expected_cases" -gt 0 ] \
+    || { rm -f "$cases_file"; cx_die "distributed case list is empty"; }
+
+  while IFS='|' read -r ph mode routing eplb hidden topk experts ladder suite workload \
+    required_pub canonical case_id ep timing nodes gpn domain scope scale_up_transport \
+    scale_out_transport transport topology_class; do
+    [ -n "$ph" ] || continue
+    ci=$((ci + 1))
+    case_stem="${RUNNER}_${CX_BENCH}_${ph}_${TS}-c$(printf '%03d' "$ci")"
+    # timing is "iters:trials:warmup"; missing parts fall back to 8:64:32.
+    IFS=: read -r case_iters case_trials case_warmup <<< "${timing:-8:64:32}"
+    case_iters="${case_iters:-8}"
+    case_trials="${case_trials:-64}"
+    case_warmup="${case_warmup:-32}"
+    ep="${ep:-$NGPUS}"
+    # Terminal-record emitters read case identity from these exports.
+    export CX_MODE="$mode" CX_CASE_ID="$case_id" CX_SUITE="$suite" CX_WORKLOAD_NAME="$workload"
+    export CX_REQUIRED_PUBLICATION="$required_pub" CX_CANONICAL="$canonical" CX_EP="$ep"
+    export CX_ROUTING="$routing" CX_EPLB="$eplb" CX_TOKENS_LADDER="$ladder"
+    export CX_HIDDEN="$hidden" CX_TOPK="$topk" CX_EXPERTS="$experts"
+    export CX_NODES="$nodes" CX_GPUS_PER_NODE="$gpn" CX_SCALE_UP_DOMAIN="$domain"
+    export CX_SCOPE="$scope" CX_SCALE_UP_TRANSPORT="$scale_up_transport"
+    export CX_SCALE_OUT_TRANSPORT="$scale_out_transport"
+    export CX_TRANSPORT="$transport" CX_TOPO="$topology_class"
+    export CX_ITERS="$case_iters" CX_TRIALS="$case_trials" CX_WARMUP="$case_warmup"
+    export CX_SAMPLES_PER_POINT="$((case_iters * case_trials))"
+    export CX_WARMUP_SEMANTICS="full-roundtrip-before-each-component-trial-point-v1"
+    cx_log "EP${NGPUS}[$ci] id=${case_id:-manual} $mode/$ph $CX_BENCH"
+    # A case whose placement differs from this allocation is recorded as
+    # failed (return code 5) without being launched.
+    if [ "$ep" != "$NGPUS" ] || [ "$nodes" != "$NODES" ] || [ "$gpn" != "$GPN" ] \
+      || [ "$domain" != "$SCALE_UP_DOMAIN" ]; then
+      export CX_ATTEMPT_ID=1
+      failure_out="$MOUNT_SRC/experimental/CollectiveX/results/failed_${case_stem}-a01.json"
+      cx_emit_ep_failed_case "$failure_out" "$CX_BENCH" "$ph" 5
+      failed_cases=$((failed_cases + 1))
+      continue
+    fi
+
+    # Canonical cases get their workload generated in the container first.
+    workload_dir=""
+    if cx_bool_enabled "$canonical"; then
+      workload_dir=".cx_workloads/c$(printf '%03d' "$ci")"
+      workload_ladder="$ladder"
+      [ -n "$workload_ladder" ] \
+        || workload_ladder="1 2 4 8 16 32 64 128 256 512 1024 2048 4096"
+      workload_args=(python3 tests/make_workloads.py --out-dir "$workload_dir"
+        --routing "$routing" --ep "$ep" --hidden "$hidden" --topk "$topk"
+        --experts "$experts" --seed "${CX_SEED:-67}" --tokens-ladder "$workload_ladder")
+      workload_log="$(cx_private_log_path "workload-c$(printf '%03d' "$ci")")"
+      set +e
+      srun --jobid="$JOB_ID" --nodes=1 --ntasks=1 --chdir=/tmp \
+        --container-name="$container_name" "${container_args[@]}" \
+        --export="$(cx_container_exports)" "${workload_args[@]}" \
+        < /dev/null > "$workload_log" 2>&1
+      stage_rc=$?
+      set -e
+      if [ "$stage_rc" != 0 ]; then
+        export CX_ATTEMPT_ID=1
+        failure_out="$MOUNT_SRC/experimental/CollectiveX/results/failed_${case_stem}-a01.json"
+        cx_emit_ep_failed_case "$failure_out" "$CX_BENCH" "$ph" "$stage_rc"
+        failed_cases=$((failed_cases + 1))
+        continue
+      fi
+    fi
+
+    ep_args=(--backend "$CX_BENCH" --mode "$mode" --phase "$ph" --routing "$routing"
+      --gpus-per-node "$gpn" --scale-up-domain "$domain" --scope "$scope"
+      --scale-up-transport "$scale_up_transport" --scale-out-transport "$scale_out_transport"
+      --tokens-ladder "$ladder" --hidden "$hidden" --topk "$topk" --experts "$experts"
+      --warmup "$case_warmup" --iters "$case_iters" --trials "$case_trials"
+      --seed "${CX_SEED:-67}" --runner "$RUNNER" --topology-class "$topology_class"
+      --transport "$transport" --case-id "$case_id" --suite "$suite"
+      --workload-name "$workload" --required-publication "$required_pub")
+    cx_bool_enabled "$eplb" && ep_args+=(--eplb)
+    [ -z "$workload_dir" ] || ep_args+=(--workload-dir "$workload_dir")
+    export CX_ATTEMPT_ID=1
+    attempt_tag=a01
+    # "out" is relative to the container workdir; expected_out is the same
+    # file as seen from the host through MOUNT_SRC.
+    out="results/${case_stem}_${attempt_tag}.json"
+    failure_out="$MOUNT_SRC/experimental/CollectiveX/results/failed_${case_stem}-${attempt_tag}.json"
+    runtime_log="$(cx_private_log_path "runtime-c$(printf '%03d' "$ci")-$attempt_tag")"
+    set +e
+    timeout -k 30 "${CX_RUN_TIMEOUT:-900}" srun --jobid="$JOB_ID" --nodes="$NODES" \
+      --ntasks="$NGPUS" --ntasks-per-node="$GPN" --chdir=/tmp \
+      --container-name="$container_name" "${container_args[@]}" \
+      --export="$(cx_container_exports)" \
+      bash -c "$WRAP" _ "${ep_args[@]}" --out "$out" \
+      < /dev/null > "$runtime_log" 2>&1
+    run_rc=$?
+    set -e
+    expected_out="$MOUNT_SRC/experimental/CollectiveX/$out"
+    case_ok=0
+    if [ "$run_rc" = 0 ] && cx_result_doc_is "$expected_out" success; then
+      case_ok=1
+    elif [ "$run_rc" = 0 ] && cx_result_doc_is "$expected_out" invalid; then
+      cx_log "ERROR: EP${NGPUS}[$ci] completed with invalid semantic evidence"
+    else
+      # Either the command failed or it exited 0 without a usable result; in
+      # both situations the case is recorded with a nonzero status.
+      [ "$run_rc" != 0 ] || run_rc=1
+      if cx_has_result_doc "$expected_out"; then
+        cx_demote_result_doc "$expected_out" "$run_rc" \
+          || { cx_quarantine_result_doc "$expected_out"; cx_emit_ep_failed_case "$failure_out" "$CX_BENCH" "$ph" "$run_rc"; }
+      else
+        cx_quarantine_result_doc "$expected_out"
+        cx_emit_ep_failed_case "$failure_out" "$CX_BENCH" "$ph" "$run_rc"
+      fi
+    fi
+    if [ "$case_ok" = 0 ]; then
+      [ "$run_rc" = 0 ] || cx_fail_stage execution "$runtime_log" || true
+      failed_cases=$((failed_cases + 1))
+    fi
+  done < "$cases_file"
+  rm -f "$cases_file"
+  # Guard against a truncated loop: every enumerated case must have been visited.
+  [ "$ci" -eq "$expected_cases" ] \
+    || cx_die "enumerated $expected_cases cases but executed $ci"
+  if [ "$failed_cases" -ne 0 ]; then
+    summary_log="$(cx_private_log_path shard-summary)"
+    printf 'SHARD done: %s/%s case(s) failed\n' "$failed_cases" "$expected_cases" \
+      > "$summary_log"
+    cx_fail_stage execution "$summary_log" || true
+    return 1
+  fi
+  return 0
+}
+
+# EXIT-trap handler for launchers: stop the allocation, preserve terminal
+# failure records, remove the stage directory, then exit.
+#   $1 rc  exit status observed when the trap fired
+# Reads JOB_ID, MOUNT_SRC, REPO_ROOT, CX_BENCH, CX_ALLOCATION_UNCERTAIN,
+# CX_FAILSAFE_MODE, COLLECTIVEX_EPHEMERAL_CONFIG_PATH and
+# COLLECTIVEX_CANONICAL_GHA. Never returns: it always ends with exit, using a
+# nonzero status when cancellation or stage cleanup fails.
+cx_launcher_cleanup() {
+  local rc="$1" stage_root="${MOUNT_SRC:-}" source_root out_dir allocation_stopped=1
+  source_root="${stage_root:-${REPO_ROOT:-}}"
+  # Disarm the trap so the exit at the end does not re-enter this handler.
+  trap - EXIT
+  if [ -n "${COLLECTIVEX_EPHEMERAL_CONFIG_PATH:-}" ]; then
+    rm -f -- "$COLLECTIVEX_EPHEMERAL_CONFIG_PATH" >/dev/null 2>&1 || true
+    unset COLLECTIVEX_EPHEMERAL_CONFIG_PATH
+  fi
+  # The allocation counts as stopped unless cancellation fails, or no job id is
+  # known while the allocation state was flagged as uncertain.
+  if [ -n "${JOB_ID:-}" ]; then
+    if ! cx_cancel_job "$JOB_ID"; then
+      allocation_stopped=0
+      [ "$rc" != 0 ] || rc=1
+    fi
+  elif [ "${CX_ALLOCATION_UNCERTAIN:-0}" = 1 ]; then
+    allocation_stopped=0
+    [ "$rc" != 0 ] || rc=1
+  fi
+  if [ "$allocation_stopped" = 1 ]; then
+    cx_write_cleanup_guard safe || true
+  else
+    cx_write_cleanup_guard unsafe || true
+  fi
+  # While the allocation may still be running, write records into the checkout
+  # rather than the stage.
+  [ "$allocation_stopped" = 1 ] || source_root="${REPO_ROOT:-$source_root}"
+  if [ "$rc" != 0 ] && [ -n "${REPO_ROOT:-}" ] && [ -n "${CX_BENCH:-}" ]; then
+    cx_log "ERROR: terminal-failure-class=${CX_FAILSAFE_MODE:-setup}"
+    [ -d "$source_root/experimental/CollectiveX" ] || source_root="$REPO_ROOT"
+    out_dir="$source_root/experimental/CollectiveX/results"
+    cx_emit_setup_failures \
+      "$source_root/experimental/CollectiveX" "$out_dir" "$CX_BENCH" "$rc" || true
+    [ "$source_root" = "$REPO_ROOT" ] \
+      || cx_collect_results "$source_root" "$REPO_ROOT" || true
+  fi
+  # The stage is removed only once the allocation is known to be stopped.
+  if [ "$allocation_stopped" = 1 ] && [ -n "${REPO_ROOT:-}" ] \
+    && [ -n "$stage_root" ] && [ "$stage_root" != "$REPO_ROOT" ]; then
+    if ! cx_cleanup_stage "$stage_root" "$REPO_ROOT"; then
+      [ "$rc" != 0 ] || rc=1
+    fi
+  fi
+  # Private-log cleanup is skipped when COLLECTIVEX_CANONICAL_GHA=1.
+  [ "${COLLECTIVEX_CANONICAL_GHA:-0}" = 1 ] || cx_cleanup_private_logs "$rc"
+  exit "$rc"
+}
+
+# Arm the launcher fail-safe: reset the allocation-uncertainty flag and route
+# every shell exit, together with its status, through cx_launcher_cleanup.
+cx_install_launcher_fail_safe() {
+  CX_ALLOCATION_UNCERTAIN=0
+  # Single quotes defer "$?" until the trap fires, so it is the exiting status.
+  trap 'cx_launcher_cleanup "$?"' EXIT
+}
diff --git a/experimental/CollectiveX/runtime/run_in_container.sh b/experimental/CollectiveX/runtime/run_in_container.sh
new file mode 100644
index 0000000000..63396c48a0
--- /dev/null
+++ b/experimental/CollectiveX/runtime/run_in_container.sh
@@ -0,0 +1,1087 @@
+#!/usr/bin/env bash
+# CollectiveX — generic in-container benchmark dispatcher (single-node).
+#
+# Runs INSIDE the container under `srun` for single-node shards. The GB EP8 launcher invokes
+# run_ep.py directly across nodes. The SKU adapter handles allocation/container/transport-env;
+# this script selects one EP backend from CX_BENCH and writes result JSON under results/.
+#
+# Required env (exported by the adapter): CX_RUNNER CX_NGPUS CX_TS CX_TOPO
+# Selector: CX_BENCH = deepep | deepep-v2 | mori | uccl | nccl-ep | deepep-hybrid
+# EP knobs passed to tests/run_ep.py:
+# CX_PHASE = decode | prefill | both (default decode) <- picks the token sweep
+# CX_TOKENS_LADDER (space/comma sep; blank = phase default)
+# CX_HIDDEN CX_TOPK CX_EXPERTS CX_ROUTING CX_SEED CX_ITERS
+set -euo pipefail
+
+cd /ix/experimental/CollectiveX
+# shellcheck source=../runtime/common.sh
+source runtime/common.sh
+mkdir -p results
+cx_write_runtime_stage backend-setup || cx_die "cannot record runtime stage"
+
+: "${CX_RUNNER:?CX_RUNNER not set}"
+: "${CX_NGPUS:?CX_NGPUS not set}"
+: "${CX_TS:?CX_TS not set}"
+: "${CX_TOPO:?CX_TOPO not set}"
+CX_BENCH="${CX_BENCH:-deepep}"
+CX_TRANSPORT="${CX_TRANSPORT:-}"
+
+cx_apply_timing_profile
+
+cx_log "in-container: runner=$CX_RUNNER ngpus=$CX_NGPUS bench=$CX_BENCH topo=$CX_TOPO"
+
+# Blank ladders use the phase default in tests/run_ep.py.
+cx_ep_ladder() {
+ printf '%s' "${CX_TOKENS_LADDER:-}"
+}
+
+# Canonical workload staging. Every SKU/backend generates identical canonical array bytes and
+# content IDs in-container; the NPZ container bytes themselves are not an identity boundary. When CX_CANONICAL=1
+# (and CX_WORKLOAD_DIR not already provided) we generate routing traces for the run's ladder
+# into a NON-results dir (.cx_workloads/ — so the *.manifest.json never pollute the results glob) and
+# point run_ep at it. Raw attempts remain diagnostic until the publisher validates full coverage.
+cx_stage_canonical() {
+ cx_bool_enabled "${CX_CANONICAL:-0}" || return 0
+ [ -n "${CX_WORKLOAD_DIR:-}" ] && return 0
+ local dir="$PWD/.cx_workloads"
+ local ladder; ladder="$(cx_ep_ladder)"
+ # cover both phase ladders when none is given, so either phase finds its files.
+ [ -z "$ladder" ] && ladder="1 2 4 8 16 32 64 128 256 512 1024 2048 4096"
+ cx_log "staging canonical workloads (routing=${CX_ROUTING:-uniform} ep=$CX_NGPUS ladder='$ladder')"
+ python3 tests/make_workloads.py --out-dir "$dir" --routing "${CX_ROUTING:-uniform}" \
+ --ep "$CX_NGPUS" --hidden "${CX_HIDDEN:-7168}" --topk "${CX_TOPK:-8}" \
+ --experts "${CX_EXPERTS:-256}" --seed "${CX_SEED:-67}" --tokens-ladder "$ladder" \
+ || { cx_log "ERROR: canonical workload staging failed"; return 1; }
+ export CX_WORKLOAD_DIR="$dir"
+ cx_log "canonical workloads staged at $dir"
+}
+
+# run_ep_suite <backend> (defined after the emit_failed_case helper below):
+# one tests/run_ep.py invocation per phase (decode, prefill, or both in turn); dispatch and
+# combine are timed separately inside it. One JSON per (backend, phase).
+# Record a failed case under results/ with its full scheduled identity, so a case that
+# produced no usable output does not silently disappear.
+# Args: $1 backend, $2 phase, $3 exit status of the failed attempt.
+# Always returns 0: a failure to write the record is ignored.
+emit_failed_case() {
+  local failed_backend="$1" failed_phase="$2" failed_rc="$3"
+  local failed_doc="results/failed_${CX_RUNNER}_${failed_backend}_${failed_phase}_${CX_TS}.json"
+  cx_emit_ep_failed_case "$failed_doc" "$failed_backend" "$failed_phase" "$failed_rc" || :
+}
+
+# Run tests/run_ep.py under torchrun once per requested phase for a single backend.
+# $1: backend name, forwarded as --backend and used in the result file names.
+# Returns 0 only when every phase ends with a success result document; returns 1 when
+# canonical staging fails or any phase is invalid, fails, or times out.
+run_ep_suite() {
+ local backend="$1" phase phases ladder failure_kind rc=0 rc_run
+ ladder="$(cx_ep_ladder)"
+ phases="${CX_PHASE:-decode}"
+ # "both" is shorthand for running decode and then prefill.
+ [ "$phases" = "both" ] && phases="decode prefill"
+ # Without staged workloads no phase can run: record each scheduled phase as failed (rc 2).
+ if ! cx_stage_canonical; then
+ for phase in $phases; do
+ emit_failed_case "$backend" "$phase" 2
+ done
+ return 1
+ fi
+ for phase in $phases; do
+ cx_log "ep backend=$backend phase=$phase ngpus=$CX_NGPUS ladder='${ladder:-}'"
+ local out="results/${CX_RUNNER}_${backend}_${phase}_${CX_TS}.json"
+ local -a EPARGS=(--backend "$backend" --mode "${CX_MODE:-normal}" --phase "$phase"
+ --tokens-ladder "$ladder"
+ --hidden "${CX_HIDDEN:-7168}" --topk "${CX_TOPK:-8}" --experts "${CX_EXPERTS:-256}"
+ --routing "${CX_ROUTING:-uniform}" --seed "${CX_SEED:-67}" --iters "${CX_ITERS:-8}"
+ --trials "${CX_TRIALS:-64}" --warmup "${CX_WARMUP:-32}"
+ --gpus-per-node "${CX_GPUS_PER_NODE:-0}" --scale-up-domain "${CX_SCALE_UP_DOMAIN:-0}"
+ --scope "${CX_SCOPE:-scale-up}" --scale-up-transport "${CX_SCALE_UP_TRANSPORT:-unknown}"
+ --scale-out-transport "${CX_SCALE_OUT_TRANSPORT:-}"
+ --case-id "${CX_CASE_ID:-}" --suite "${CX_SUITE:-}" --workload-name "${CX_WORKLOAD_NAME:-}"
+ --required-publication "${CX_REQUIRED_PUBLICATION:-}"
+ --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT"
+ --out "$out")
+ # Optional flags are appended only when enabled / available.
+ cx_bool_enabled "${CX_EPLB:-0}" && EPARGS+=(--eplb)
+ [ -n "${CX_WORKLOAD_DIR:-}" ] && EPARGS+=(--workload-dir "$CX_WORKLOAD_DIR")
+ cx_write_runtime_stage execution || cx_die "cannot record runtime stage"
+ # Bound the run to CX_RUN_TIMEOUT seconds (default 900); -k 30 follows up with SIGKILL
+ # 30s after the first signal. The if/else captures the status without tripping `set -e`.
+ if timeout -k 30 "${CX_RUN_TIMEOUT:-900}" \
+ torchrun --nproc_per_node="$CX_NGPUS" tests/run_ep.py "${EPARGS[@]}"; then
+ rc_run=0
+ else
+ rc_run=$?
+ fi
+ # A clean exit whose document is classified invalid fails the suite; the document is
+ # left as written (no demotion or quarantine on this path).
+ if [ "$rc_run" = 0 ] && cx_result_doc_is "$out" invalid; then
+ cx_log "WARN: $backend $phase completed with invalid semantic evidence"
+ rc=1
+ continue
+ fi
+ # A clean exit that did not leave a success document is treated as a failure.
+ if [ "$rc_run" = 0 ] && ! cx_result_doc_is "$out" success; then
+ rc_run=1
+ fi
+ if [ "$rc_run" != 0 ]; then
+ failure_kind=failed
+ # Exit codes 124 and 137 are reported as timeouts (timeout(1) expiry / SIGKILL).
+ [ "$rc_run" != 124 ] && [ "$rc_run" != 137 ] || failure_kind="timed out"
+ if [ "$failure_kind" = "timed out" ]; then
+ cx_log "WARN: $backend $phase run timed out rc=$rc_run (limit=${CX_RUN_TIMEOUT:-900}s)"
+ else
+ cx_log "WARN: $backend $phase run failed rc=$rc_run"
+ fi
+ # Keep whatever evidence exists: demote an existing document to a failed attempt, or
+ # quarantine it and emit a synthetic failed case when that is not possible.
+ if cx_has_result_doc "$out"; then
+ cx_demote_result_doc "$out" "$rc_run" \
+ || { cx_quarantine_result_doc "$out"; emit_failed_case "$backend" "$phase" "$rc_run"; }
+ # NOTE(review): this message is also logged when demotion failed and the document was
+ # quarantined instead — confirm that is intended.
+ cx_log "preserved benchmark output as a failed attempt"
+ else
+ cx_quarantine_result_doc "$out"
+ emit_failed_case "$backend" "$phase" "$rc_run"
+ fi
+ rc=1
+ fi
+ done
+ return "$rc"
+}
+
+# Resolve and verify the actual CUDA target before compiling source kernels.
+cx_cuda_arch() {
+ local expected detected
+ case "$CX_RUNNER" in
+ h100*|h200*) expected="9.0" ;;
+ b200*|gb200*) expected="10.0" ;;
+ b300*|gb300*) expected="10.3" ;;
+ *) cx_log "ERROR: no CUDA target registered for $CX_RUNNER"; return 1 ;;
+ esac
+ detected="$(python3 - <<'PY'
+import torch
+
+major, minor = torch.cuda.get_device_capability()
+print(f"{major}.{minor}")
+PY
+)" || return 1
+ [ "$detected" = "$expected" ] || {
+ cx_log "ERROR: $CX_RUNNER expected CUDA target $expected, detected $detected"
+ return 1
+ }
+ printf '%s' "$detected"
+}
+
+# Print the resolved directory nvidia/<component> installed by a Python distribution.
+# $1: distribution name (e.g. nvidia-nccl-cu13); $2: component directory under nvidia/.
+# Prints the path without a trailing newline. Exits non-zero with no output when the
+# distribution is not installed, records no file under nvidia/<component>/, or the
+# directory does not exist.
+cx_nvidia_package_root() {
+ local package="$1" component="$2"
+ python3 - "$package" "$component" <<'PY'
+from importlib import metadata
+from pathlib import Path, PurePosixPath
+import sys
+
+package, component = sys.argv[1:]
+try:
+ distribution = metadata.distribution(package)
+ prefix = f"nvidia/{component}/"
+ # Normalise backslashes so the prefix test works on either path separator.
+ entries = [str(entry).replace("\\", "/") for entry in distribution.files or ()]
+ # The distribution must actually own files under nvidia/<component>/.
+ if not any(entry.startswith(prefix) for entry in entries):
+ raise ValueError
+ root = Path(distribution.locate_file(PurePosixPath("nvidia") / component)).resolve()
+ if not root.is_dir():
+ raise ValueError
+except (metadata.PackageNotFoundError, OSError, TypeError, ValueError):
+ # Every expected failure collapses into a silent exit status of 1.
+ raise SystemExit(1)
+print(root, end="")
+PY
+}
+
+cx_prepare_cuda_cccl() {
+ local cccl="" candidate cuda_home nvcc
+ nvcc="$(command -v nvcc)" \
+ || { cx_log "ERROR: CUDA nvcc is unavailable"; return 1; }
+ nvcc="$(readlink -f -- "$nvcc")" \
+ || { cx_log "ERROR: CUDA nvcc cannot be resolved"; return 1; }
+ case "$nvcc" in
+ */bin/nvcc) cuda_home="${nvcc%/bin/nvcc}" ;;
+ *) cx_log "ERROR: CUDA nvcc has an unexpected path"; return 1 ;;
+ esac
+ [ -x "$cuda_home/bin/nvcc" ] && [ -d "$cuda_home/include" ] \
+ && [ -d "$cuda_home/lib64" ] \
+ || { cx_log "ERROR: CUDA toolkit root is incomplete"; return 1; }
+ for candidate in "$cuda_home"/targets/*/include/cccl; do
+ if [ -d "$candidate" ]; then
+ cccl="$candidate"
+ break
+ fi
+ done
+ [ -n "$cccl" ] || { cx_log "ERROR: CUDA CCCL headers are unavailable"; return 1; }
+ export CUDA_HOME="$cuda_home" CX_CUDA_CCCL="$cccl"
+ export CPATH="$cccl:${CPATH:-}"
+ export NVCC_PREPEND_FLAGS="-I$cccl ${NVCC_PREPEND_FLAGS:-}"
+}
+
+# Prepare the NVSHMEM/CUDA toolchain used to build and run DeepEP V2.
+# Builds (once, under an exclusive flock) a symlink overlay of the packaged NVSHMEM at
+# <cache root>/nvshmem-overlay that adds an unversioned libnvshmem_host.so, validates it,
+# then exports NVSHMEM_DIR, the CUDA/CCCL variables set by cx_prepare_cuda_cccl, and
+# prepends the overlay's lib directory to LD_LIBRARY_PATH.
+# Returns 1, after logging, when the NVSHMEM package or cache root is unavailable, the
+# overlay cannot be created or fails validation, or the CUDA toolkit is unusable.
+cx_prepare_deepep_toolchain() {
+  local packaged overlay path root temporary
+  packaged="$(cx_nvidia_package_root nvidia-nvshmem-cu12 nvshmem)" \
+    || { cx_log "ERROR: nvidia.nvshmem is unavailable"; return 1; }
+  root="$(cx_deepep_v2_root)" || return 1
+  overlay="$root/nvshmem-overlay"
+  # The subshell scopes the umask and the lock: fd 8 is released when it exits.
+  if ! (
+    umask 077
+    exec 8>"$root/nvshmem-overlay.lock" || exit 1
+    flock 8 || exit 1
+    if [ ! -d "$overlay" ]; then
+      # Assemble in a temporary directory and rename it into place, so a partially built
+      # overlay is never visible under its final name.
+      temporary="$root/.nvshmem-overlay.$$"
+      rm -rf "$temporary" || exit 1
+      mkdir -p "$temporary/lib" || exit 1
+      ln -s "$packaged/include" "$temporary/include" || exit 1
+      for path in "$packaged"/lib/*; do
+        ln -s "$path" "$temporary/lib/${path##*/}" || exit 1
+      done
+      # Provide the unversioned library name when only the .so.3 file is packaged.
+      [ ! -e "$packaged/lib/libnvshmem_host.so.3" ] \
+        || ln -sf "$packaged/lib/libnvshmem_host.so.3" \
+          "$temporary/lib/libnvshmem_host.so" || exit 1
+      mv "$temporary" "$overlay" || exit 1
+    fi
+    # Validate both a fresh and a pre-existing overlay: a real directory whose include
+    # link resolves to the packaged headers and whose two required libraries resolve.
+    [ ! -L "$overlay" ] \
+      && [ "$(readlink -f "$overlay/include")" = "$(readlink -f "$packaged/include")" ] \
+      && [ -e "$overlay/lib/libnvshmem_host.so" ] \
+      && [ -e "$overlay/lib/libnvshmem_device.a" ]
+  ); then
+    cx_log "ERROR: DeepEP V2 NVSHMEM overlay is invalid"
+    return 1
+  fi
+  NVSHMEM_DIR="$overlay"
+  export NVSHMEM_DIR
+  cx_prepare_cuda_cccl || return 1
+  # Append the previous value only when it is non-empty: an empty LD_LIBRARY_PATH element
+  # makes the dynamic loader search the current working directory.
+  export LD_LIBRARY_PATH="$NVSHMEM_DIR/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+}
+
+# Verify that the DeepEP build shipped in the container matches its pinned image contract.
+# Requires COLLECTIVEX_IMAGE and COLLECTIVEX_IMAGE_DIGEST to equal CX_IMAGE_MULTIARCH and
+# CX_IMAGE_MULTIARCH_DIGEST, COLLECTIVEX_IMAGE_DIGEST_VERIFIED=1, and a CUDA target accepted
+# by cx_cuda_arch. Exports DEEPEP_COMMIT for the selected pin.
+# Returns 1, after logging, when the image or the installed distribution does not match.
+cx_probe_deepep() {
+ local expected_record_sha256 expected_version expected_wheel_sha256
+ if [ "${COLLECTIVEX_IMAGE:-}" != "$CX_IMAGE_MULTIARCH" ] \
+ || [ "${COLLECTIVEX_IMAGE_DIGEST:-}" != "$CX_IMAGE_MULTIARCH_DIGEST" ] \
+ || [ "${COLLECTIVEX_IMAGE_DIGEST_VERIFIED:-0}" != 1 ]; then
+ cx_log "ERROR: DeepEP V1 requires the exact pinned multi-architecture image"
+ return 1
+ fi
+ cx_cuda_arch >/dev/null || return 1
+ # Select the pinned version, wheel digest, RECORD digest and commit for this runner.
+ # NOTE(review): these are exact matches, while cx_cuda_arch matches gb200*/gb300*
+ # prefixes — confirm that suffixed runner names are meant to take the default branch.
+ case "$CX_RUNNER" in
+ gb200|gb300)
+ expected_version="1.1.0+814e508"
+ expected_wheel_sha256="784dabec0877b6cf72619b7e93eda7e2f365648487bd37fc3ff6960e53669313"
+ expected_record_sha256="2671cff7baf8c2c214ff4bac721af875d513130670bec57601998bd1aae82882"
+ DEEPEP_COMMIT="814e508537c6ffc775d59f6f1b9ba43f3a65968c"
+ ;;
+ *)
+ expected_version="1.2.1"
+ expected_wheel_sha256="7c02c29306ea0fe2dd474618e72e0f310f260187a9c0700a656d2f6964e8c307"
+ expected_record_sha256="6548e9c504a12b2471af4b7f4d9546321210a57a456b5dc55bd4a8dad0f932ac"
+ DEEPEP_COMMIT="9af0e0d0e74f3577af1979c9b9e1ac2cad0104ee"
+ ;;
+ esac
+ export DEEPEP_COMMIT
+ # The heredoc is the probe program; the `|| { ... }` group that closes after it runs
+ # when the probe exits non-zero (a failed assertion or any uncaught exception).
+ python3 - "$expected_version" "$expected_wheel_sha256" "$expected_record_sha256" <<'PY' || {
+import base64
+import csv
+import hashlib
+import importlib.metadata as metadata
+import io
+import json
+from pathlib import Path
+import sys
+
+import deep_ep
+from deep_ep import Buffer
+
+# argv[1]: expected version, argv[2]: expected wheel sha256, argv[3]: expected RECORD sha256.
+distribution = metadata.distribution("deep_ep")
+assert distribution.version == sys.argv[1]
+assert Buffer.__name__ == "Buffer"
+recorded_files = {
+ Path(distribution.locate_file(entry)).resolve() for entry in distribution.files or ()
+}
+# The imported package and the module defining Buffer must both be files that this
+# distribution recorded, i.e. not shadowed by another copy on the import path.
+buffer_module = sys.modules.get(Buffer.__module__)
+assert Path(deep_ep.__file__).resolve() in recorded_files
+assert buffer_module is not None and Path(buffer_module.__file__).resolve() in recorded_files
+# The install must record the archive digest it came from, and it must match the pin.
+direct_url = json.loads(distribution.read_text("direct_url.json"))
+assert direct_url["archive_info"]["hashes"]["sha256"] == sys.argv[2]
+record_entry = next(
+ entry for entry in distribution.files or ()
+ if str(entry).endswith(".dist-info/RECORD")
+)
+record = distribution.locate_file(record_entry).read_bytes()
+assert hashlib.sha256(record).hexdigest() == sys.argv[3]
+# Re-hash every file that RECORD lists with a digest; rows without a digest are skipped.
+for path, encoded_digest, size in csv.reader(io.StringIO(record.decode())):
+ if not encoded_digest:
+ continue
+ algorithm, expected = encoded_digest.split("=", 1)
+ assert algorithm == "sha256"
+ payload = distribution.locate_file(path).read_bytes()
+ # Compare against the urlsafe-base64 digest with its '=' padding removed.
+ observed = base64.urlsafe_b64encode(hashlib.sha256(payload).digest()).decode().rstrip("=")
+ assert observed == expected
+ assert not size or len(payload) == int(size)
+PY
+ cx_log "ERROR: container DeepEP build does not match its pinned image contract"
+ return 1
+ }
+ cx_log "DeepEP image build ready ($DEEPEP_COMMIT)"
+}
+
+# DeepEP V2 is PR #605's ElasticBuffer implementation with upstream PR #630's pure scale-up
+# initialization fix. Canonical launchers stage the pinned source and mount a private cluster-local
+# build cache at /cx-cache.
+cx_deepep_v2_root() {
+ local arch cpu base identity key image_digest
+ arch="$(cx_cuda_arch)" || return 1
+ cpu="$(uname -m)"
+ [[ "$cpu" =~ ^[A-Za-z0-9._-]+$ ]] || return 1
+ base="${CX_BACKEND_CACHE_ROOT:-}"
+ [[ "$base" = /* ]] || return 1
+ image_digest="${COLLECTIVEX_IMAGE_DIGEST:-manual-unverified}"
+ [[ "$image_digest" = manual-unverified || "$image_digest" =~ ^sha256:[0-9a-f]{64}$ ]] \
+ || return 1
+ # Bump the recipe generation whenever the build procedure changes. Benchmark-only
+ # source revisions must reuse the same immutable environment instead of leaking GBs.
+ identity="deepep-v2-cache-v2|$cpu|sm${arch/./}|image=$image_digest|recipe=aot-persistent-nvshmem-active-cuda-maxjobs16-v2|$CX_DEEPEP_V2_COMMIT|$CX_DEEPEP_V2_TREE|$CX_DEEPEP_V2_FMT_COMMIT|pip=26.1.2|setuptools=82.0.1|wheel=0.47.0|ninja=1.13.0|numpy=2.2.6|torch=2.10.0+cu130|nccl=2.30.4|nvshmem=3.3.9|max-jobs=16"
+ key="$(printf '%s' "$identity" | sha256sum | awk '{print $1}')"
+ [[ "$key" =~ ^[0-9a-f]{64}$ ]] || return 1
+ printf '%s/deepep-v2-%s' "$base" "$key"
+}
+
+cx_activate_deepep_v2() {
+ local root venv stage_root
+ root="$(cx_deepep_v2_root)" || return 1
+ venv="$root/venv"
+ [ -x "$venv/bin/python" ] \
+ || { cx_log "ERROR: DeepEP V2 venv interpreter is unavailable"; return 1; }
+ export VIRTUAL_ENV="$venv"
+ export PATH="$venv/bin:${PATH#"$venv/bin:"}"
+ EP_NCCL_ROOT_DIR="$(cx_nvidia_package_root nvidia-nccl-cu13 nccl)" \
+ || { cx_log "ERROR: DeepEP V2 NCCL package root is unavailable"; return 1; }
+ EP_NVSHMEM_ROOT_DIR="$(cx_nvidia_package_root nvidia-nvshmem-cu12 nvshmem)" \
+ || { cx_log "ERROR: DeepEP V2 NVSHMEM package root is unavailable"; return 1; }
+ export EP_NCCL_ROOT_DIR EP_NVSHMEM_ROOT_DIR
+ export LD_LIBRARY_PATH="$EP_NCCL_ROOT_DIR/lib:$EP_NVSHMEM_ROOT_DIR/lib:${LD_LIBRARY_PATH:-}"
+ case "${CX_BACKEND_SOURCE_ROOT:-}" in
+ /*/.cx_sources) stage_root="${CX_BACKEND_SOURCE_ROOT%/.cx_sources}" ;;
+ *) cx_log "ERROR: DeepEP V2 job-local source root is unavailable"; return 1 ;;
+ esac
+ [ -d "$stage_root" ] && [ ! -L "$stage_root" ] \
+ || { cx_log "ERROR: DeepEP V2 job-local stage is invalid"; return 1; }
+ # JIT CUBINs are evidence from this shard, not part of the persistent AOT environment.
+ # Keeping them on the isolated staged tree prevents a prior driver/topology attempt
+ # from seeding a later run; all ranks and cases in this shard still share one cold build.
+ export EP_JIT_CACHE_DIR="$stage_root/.cx_backend/deepep-v2-jit"
+ export EP_REUSE_NCCL_COMM=1
+ export DEEPEP_V2_PR=605 DEEPEP_V2_FIX_PR=630
+ DEEPEP_V2_COMMIT="$CX_DEEPEP_V2_COMMIT"
+ DEEPEP_V2_TREE="$CX_DEEPEP_V2_TREE"
+ DEEPEP_V2_FMT_COMMIT="$CX_DEEPEP_V2_FMT_COMMIT"
+ export DEEPEP_V2_COMMIT DEEPEP_V2_TREE DEEPEP_V2_FMT_COMMIT
+ [ ! -L "$stage_root/.cx_backend" ] && [ ! -L "$EP_JIT_CACHE_DIR" ] \
+ || { cx_log "ERROR: DeepEP V2 JIT cache path is unsafe"; return 1; }
+ if ! mkdir -p "$EP_JIT_CACHE_DIR" \
+ || ! chmod 700 "$stage_root/.cx_backend" "$EP_JIT_CACHE_DIR"; then
+ cx_log "ERROR: DeepEP V2 JIT cache is unavailable"
+ return 1
+ fi
+ unset EP_SUPPRESS_NCCL_CHECK
+}
+
+cx_enable_deepep_v2_jit_reproducibility() {
+ local seed="collectivex-deepep-v2-fa8a9b1" cccl
+ [ -n "${CUDA_HOME:-}" ] \
+ || { cx_log "ERROR: active CUDA toolkit is unavailable"; return 1; }
+ cccl="${CX_CUDA_CCCL:-}"
+ case "$cccl" in
+ "$CUDA_HOME"/targets/*/include/cccl) ;;
+ *) cx_log "ERROR: CUDA CCCL headers differ from the active toolkit"; return 1 ;;
+ esac
+ [ -d "$cccl" ] || { cx_log "ERROR: CUDA CCCL headers are unavailable"; return 1; }
+ CPATH="$cccl"
+ NVCC_PREPEND_FLAGS="--frandom-seed=$seed -I$cccl"
+ DEEPEP_V2_JIT_RANDOM_SEED="$seed"
+ EP_JIT_DUMP_SASS=1
+ unset EP_JIT_DEBUG EP_JIT_DUMP_ASM EP_JIT_DUMP_PTX EP_JIT_WITH_LINEINFO
+ unset EP_JIT_PTXAS_VERBOSE EP_JIT_PRINT_COMPILER_COMMAND EP_JIT_NVCC_COMPILER
+ unset EP_JIT_CPP_STANDARD EP_JIT_PTXAS_CHECK EP_GIN_GDAKI_DEBUG EP_NUM_TOPK_IDX_BITS
+ export CPATH DEEPEP_V2_JIT_RANDOM_SEED EP_JIT_DUMP_SASS NVCC_PREPEND_FLAGS
+}
+
+# Probe the active Python environment for the pinned DeepEP V2 runtime.
+# Asserts the torch, NCCL, NVSHMEM, numpy and deep_ep versions, that
+# deep_ep.ElasticBuffer is a class, that EP_SUPPRESS_NCCL_CHECK is not set, and that
+# exactly one libnccl.so is mapped into the process and reports version code 23004.
+# Returns python's exit status: non-zero on any failed assertion or uncaught exception.
+cx_probe_deepep_v2() {
+ python3 - <<'PY'
+import ctypes
+import importlib.metadata as metadata
+import inspect
+import os
+
+import torch
+
+assert torch.__version__ == "2.10.0+cu130", torch.__version__
+assert metadata.version("nvidia-nccl-cu13") == "2.30.4"
+assert metadata.version("nvidia-nvshmem-cu12") == "3.3.9"
+assert metadata.version("numpy") == "2.2.6"
+
+import deep_ep
+assert deep_ep.__version__ == "2.0.0", deep_ep.__version__
+assert metadata.version("deep_ep") == "2.0.0+fa8a9b1"
+assert inspect.isclass(deep_ep.ElasticBuffer)
+assert deep_ep.ElasticBuffer.__name__ == "ElasticBuffer"
+assert os.environ.get("EP_SUPPRESS_NCCL_CHECK") is None
+# Collect the real paths of every libnccl.so currently mapped into this process.
+with open("/proc/self/maps", encoding="utf-8") as handle:
+ loaded_nccl = {
+ os.path.realpath(line.rstrip().split()[-1])
+ for line in handle
+ if "libnccl.so" in line and os.path.isfile(line.rstrip().split()[-1])
+ }
+# Exactly one copy may be loaded; ask that copy for its runtime version.
+assert len(loaded_nccl) == 1
+runtime_version = ctypes.c_int()
+assert ctypes.CDLL(loaded_nccl.pop()).ncclGetVersion(ctypes.byref(runtime_version)) == 0
+# 23004 is presumably the version code for the 2.30.4 wheel asserted above — TODO confirm.
+assert runtime_version.value == 23004, runtime_version.value
+PY
+}
+
+# Print a SHA-256 over the installed deep_ep distribution's files, without a trailing
+# newline. Files are visited in sorted POSIX-path order; each contributes its relative
+# name, its size and its own SHA-256 digest. Reads VIRTUAL_ENV from the environment.
+# Exits 1 when the distribution lists no files, VIRTUAL_ENV is a symlink or not a
+# directory, an entry lies outside deep_ep/ or deep_ep-*.dist-info/, a path or its
+# resolved target escapes the venv, a parent directory is a symlink, a file is not a
+# regular file or changes identity between lstat and open, or no deep_ep/*.so is found.
+cx_deepep_v2_content_sha256() {
+ python3 - <<'PY'
+import hashlib
+from importlib import metadata
+import os
+from pathlib import Path, PurePosixPath
+import stat
+
+distribution = metadata.distribution("deep_ep")
+# Sort so that the combined digest does not depend on RECORD ordering.
+entries = sorted(distribution.files or (), key=lambda entry: entry.as_posix())
+if not entries:
+ raise SystemExit(1)
+venv_path = Path(os.environ["VIRTUAL_ENV"]).absolute()
+if venv_path.is_symlink() or not venv_path.is_dir():
+ raise SystemExit(1)
+venv = venv_path.resolve(strict=True)
+digest = hashlib.sha256()
+extension = False
+for entry in entries:
+ relative = PurePosixPath(entry.as_posix())
+ # Accept only relative entries under deep_ep/ or deep_ep-*.dist-info/.
+ if (
+ relative.is_absolute()
+ or ".." in relative.parts
+ or not relative.parts
+ or not (
+ relative.parts[0] == "deep_ep"
+ or relative.parts[0].startswith("deep_ep-")
+ and relative.parts[0].endswith(".dist-info")
+ )
+ ):
+ raise SystemExit(1)
+ path = Path(distribution.locate_file(entry)).absolute()
+ resolved = path.resolve(strict=True)
+ # Both the literal path and its fully resolved target must stay inside the venv.
+ try:
+ path.relative_to(venv_path)
+ resolved.relative_to(venv)
+ except ValueError:
+ raise SystemExit(1)
+ # No directory between the file and the venv root may be a symlink.
+ parent = path.parent
+ while parent != venv_path:
+ if parent.is_symlink():
+ raise SystemExit(1)
+ parent = parent.parent
+ item = os.lstat(path)
+ if not stat.S_ISREG(item.st_mode):
+ raise SystemExit(1)
+ descriptor = os.open(path, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0))
+ try:
+ # The opened file must be the same inode that lstat examined.
+ opened = os.fstat(descriptor)
+ if (opened.st_dev, opened.st_ino) != (item.st_dev, item.st_ino):
+ raise SystemExit(1)
+ file_digest = hashlib.sha256()
+ while chunk := os.read(descriptor, 1024 * 1024):
+ file_digest.update(chunk)
+ finally:
+ os.close(descriptor)
+ name = relative.as_posix()
+ # Remember whether at least one compiled extension (deep_ep/*.so) was seen.
+ extension |= name.startswith("deep_ep/") and name.endswith(".so")
+ # Fold in name, size and content digest, NUL-separated.
+ digest.update(name.encode())
+ digest.update(b"\0")
+ digest.update(str(item.st_size).encode())
+ digest.update(b"\0")
+ digest.update(file_digest.digest())
+if not extension:
+ raise SystemExit(1)
+print(digest.hexdigest(), end="")
+PY
+}
+
+# Validate the DeepEP V2 cache completion marker and print the content digest it stores
+# (no trailing newline).
+# Args: $1 cache root, $2 marker path, $3 revision, $4 tree, $5 fmt revision, $6 cache key.
+# Exits 1 unless: the root is a directory with mode exactly 0700; the marker is a regular
+# file with the root's owner, mode exactly 0600 and at most 1024 bytes; source/ and venv/
+# are directories with the root's owner and no group/other write bit; and the marker holds
+# exactly five ASCII lines: the four expected identifiers followed by a 64-hex digest.
+cx_deepep_v2_marker_content_sha256() {
+ local root="$1" marker="$2" revision="$3" tree="$4" fmt_revision="$5" cache_key="$6"
+ python3 - "$root" "$marker" "$revision" "$tree" "$fmt_revision" "$cache_key" <<'PY'
+import os
+import re
+import stat
+import sys
+
+root, marker, revision, tree, fmt_revision, cache_key = sys.argv[1:]
+try:
+ # lstat throughout, so a symlink never passes the directory / regular-file checks.
+ root_item = os.lstat(root)
+ marker_item = os.lstat(marker)
+ children = [os.lstat(os.path.join(root, name)) for name in ("source", "venv")]
+ if (
+ not stat.S_ISDIR(root_item.st_mode)
+ or stat.S_IMODE(root_item.st_mode) & 0o777 != 0o700
+ or not stat.S_ISREG(marker_item.st_mode)
+ or marker_item.st_uid != root_item.st_uid
+ or stat.S_IMODE(marker_item.st_mode) & 0o777 != 0o600
+ or marker_item.st_size > 1024
+ or any(
+ not stat.S_ISDIR(child.st_mode)
+ or child.st_uid != root_item.st_uid
+ or stat.S_IMODE(child.st_mode) & 0o022
+ for child in children
+ )
+ ):
+ raise OSError
+ descriptor = os.open(marker, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0))
+ try:
+ # The opened file must be the same inode that lstat examined.
+ opened = os.fstat(descriptor)
+ if (opened.st_dev, opened.st_ino) != (marker_item.st_dev, marker_item.st_ino):
+ raise OSError
+ payload = os.read(descriptor, 1025)
+ finally:
+ os.close(descriptor)
+ lines = payload.decode("ascii").splitlines()
+ if lines[:4] != [revision, tree, fmt_revision, cache_key] or len(lines) != 5:
+ raise ValueError
+ if not re.fullmatch(r"[0-9a-f]{64}", lines[4]):
+ raise ValueError
+except (OSError, UnicodeError, ValueError):
+ # Every validation failure collapses into a silent exit status of 1.
+ raise SystemExit(1)
+print(lines[4], end="")
+PY
+}
+
+cx_deepep_v2_cache_is_valid() {
+ local root="$1" marker="$2" revision="$3" tree="$4" fmt_revision="$5" cache_key="$6"
+ local expected_content actual_content
+ expected_content="$(
+ cx_deepep_v2_marker_content_sha256 \
+ "$root" "$marker" "$revision" "$tree" "$fmt_revision" "$cache_key"
+ )" || return 1
+ [ -d "$root/source" ] && [ ! -L "$root/source" ] \
+ && [ "$(cx_git_in_tree "$root/source" rev-parse 'HEAD^{tree}' 2>/dev/null)" = "$tree" ] \
+ && [ "$(cx_git_in_tree "$root/source/third-party/fmt" rev-parse HEAD 2>/dev/null)" = "$fmt_revision" ] \
+ || return 1
+ cx_activate_deepep_v2 || return 1
+ actual_content="$(cx_deepep_v2_content_sha256)" || return 1
+ [ "$actual_content" = "$expected_content" ]
+}
+
+# Build, or reuse, the shared DeepEP V2 environment and activate it in the current shell.
+# The cache directory comes from cx_deepep_v2_root; all cache mutation happens in a
+# subshell holding an exclusive flock on "<root>.lock". A cache is published by atomically
+# renaming a marker file that records the pins, the cache key and the installed-content digest.
+# Returns 1, after logging, on any validation, build, probe or activation failure.
+# NOTE(review): revision/tree/fmt_revision are hard-coded here, while cx_deepep_v2_root
+# and cx_activate_deepep_v2 read CX_DEEPEP_V2_COMMIT/_TREE/_FMT_COMMIT — confirm the two
+# sets of pins are kept identical.
+cx_build_deepep_v2() {
+ local root venv source marker marker_tmp lock_path arch cache_key cache_ready content_sha256
+ local revision="fa8a9b16898204afd347c663b89e65ef87dc6ce6"
+ local tree="29809e75c5874e6609dac4804e7b651d5226959f"
+ local fmt_revision="a4c7e17133ee9cb6a2f45545f6e974dd3c393efa"
+ cx_verify_backend_cache_mount \
+ || { cx_log "ERROR: DeepEP V2 cache mount identity validation failed"; return 1; }
+ arch="$(cx_cuda_arch)" || return 1
+ root="$(cx_deepep_v2_root)" || return 1
+ # The cache key is the 64-hex suffix of the root directory name.
+ cache_key="${root##*/deepep-v2-}"
+ [[ "$cache_key" =~ ^[0-9a-f]{64}$ ]] || return 1
+ venv="$root/venv"; source="$root/source"; marker="$root/.collectivex-complete"
+ # The lock lives beside the root, so it survives the `rm -rf "$root"` reset below.
+ lock_path="${root}.lock"
+ command -v flock >/dev/null || { cx_log "ERROR: flock is required for DeepEP V2"; return 1; }
+ mkdir -p "${root%/*}" || return 1
+ cx_log "DeepEP V2: preparing PR #605 implementation with upstream PR #630 fix ($revision)"
+ # Everything in this subshell runs under the lock; fd 9 is released when it exits.
+ if ! (
+ [ ! -L "$lock_path" ] \
+ || { cx_log "ERROR: DeepEP V2 cache lock is unsafe"; exit 1; }
+ (umask 077; : >> "$lock_path") && chmod 600 "$lock_path" \
+ || { cx_log "ERROR: DeepEP V2 cache-lock-create failed"; exit 1; }
+ exec 9<>"$lock_path" \
+ || { cx_log "ERROR: DeepEP V2 cache-lock-open failed"; exit 1; }
+ flock 9 \
+ || { cx_log "ERROR: DeepEP V2 cache-lock-acquire failed"; exit 1; }
+ cache_ready=0
+ # A marker means the cache was published. If it no longer validates, stop rather than
+ # rebuild over it. Validation runs in its own subshell so that the activation it
+ # performs does not change this shell's environment.
+ if [ -e "$marker" ] || [ -L "$marker" ]; then
+ if (
+ cx_deepep_v2_cache_is_valid \
+ "$root" "$marker" "$revision" "$tree" "$fmt_revision" "$cache_key"
+ ); then
+ cache_ready=1
+ else
+ cx_log "ERROR: published DeepEP V2 cache failed integrity validation; refusing reset"
+ exit 1
+ fi
+ fi
+ if [ "$cache_ready" != 1 ]; then
+ # No marker: anything already under the root is an incomplete build and is discarded.
+ if [ -e "$root" ] || [ -L "$root" ]; then
+ rm -rf "$root" \
+ || { cx_log "ERROR: incomplete DeepEP V2 cache-reset failed"; exit 1; }
+ fi
+ mkdir -m 700 "$root" \
+ || { cx_log "ERROR: DeepEP V2 cache-create failed"; exit 1; }
+ python3 -m venv "$venv" \
+ || { cx_log "ERROR: DeepEP V2 venv creation failed"; exit 1; }
+ # All pip output below is routed to stderr (`>&2 2>&1`).
+ "$venv/bin/python" -m pip install -q --disable-pip-version-check --no-input \
+ "pip==26.1.2" "setuptools==82.0.1" "wheel==0.47.0" "ninja==1.13.0" \
+ "numpy==2.2.6" "nvidia-nvshmem-cu12==3.3.9" >&2 2>&1 \
+ || { cx_log "ERROR: DeepEP V2 build-tool installation failed"; exit 1; }
+ "$venv/bin/python" -m pip install -q --disable-pip-version-check --no-input \
+ --index-url https://download.pytorch.org/whl/cu130 \
+ --extra-index-url https://pypi.org/simple "torch==2.10.0" >&2 2>&1 \
+ || { cx_log "ERROR: torch 2.10.0+cu130 installation failed"; exit 1; }
+ # Torch pins NCCL 2.28.9; the PR #605 ElasticBuffer implementation requires 2.30.4.
+ "$venv/bin/python" -m pip install -q --disable-pip-version-check --no-input \
+ --force-reinstall --no-deps "nvidia-nccl-cu13==2.30.4" >&2 2>&1 \
+ || { cx_log "ERROR: NCCL 2.30.4 installation failed"; exit 1; }
+ cx_activate_deepep_v2 \
+ || { cx_log "ERROR: DeepEP V2 environment activation failed"; exit 1; }
+ cx_prepare_deepep_toolchain \
+ || { cx_log "ERROR: DeepEP V2 toolchain preparation failed"; exit 1; }
+ # Point the build at the NVSHMEM overlay rather than the raw packaged directory.
+ EP_NVSHMEM_ROOT_DIR="$NVSHMEM_DIR"
+ export EP_NVSHMEM_ROOT_DIR
+ cx_materialize_backend_source deepep-v2 "$source" \
+ || { cx_log "ERROR: DeepEP V2 staged source is invalid"; exit 1; }
+ # Build for the verified CUDA target only, with SOURCE_DATE_EPOCH set to the HEAD commit time.
+ (cd "$source" && SOURCE_DATE_EPOCH="$(cx_git_in_tree "$source" show -s --format=%ct HEAD)" \
+ TORCH_CUDA_ARCH_LIST="$arch" MAX_JOBS=16 \
+ python3 -m pip install -q --no-build-isolation --no-deps --force-reinstall .) >&2 2>&1 \
+ || { cx_log "ERROR: DeepEP V2 build failed"; exit 1; }
+ cx_probe_deepep_v2 \
+ || { cx_log "ERROR: DeepEP V2 ElasticBuffer/runtime probe failed"; exit 1; }
+ content_sha256="$(cx_deepep_v2_content_sha256)" \
+ || { cx_log "ERROR: DeepEP V2 installed-content hashing failed"; exit 1; }
+ # Write the marker to a temporary file and rename it into place, so that the marker
+ # only ever appears fully written.
+ marker_tmp="$(mktemp "$root/.collectivex-complete.tmp.XXXXXX")" \
+ || { cx_log "ERROR: DeepEP V2 cache-marker-create failed"; exit 1; }
+ chmod 600 "$marker_tmp" \
+ || { cx_log "ERROR: DeepEP V2 cache-marker-permission failed"; exit 1; }
+ printf '%s\n%s\n%s\n%s\n%s\n' \
+ "$revision" "$tree" "$fmt_revision" "$cache_key" "$content_sha256" > "$marker_tmp" \
+ || { cx_log "ERROR: DeepEP V2 cache-marker-write failed"; exit 1; }
+ mv -f -- "$marker_tmp" "$marker" \
+ || { cx_log "ERROR: DeepEP V2 cache-marker-publish failed"; exit 1; }
+ fi
+ # Re-validate in every case, including a cache that was just built.
+ cx_deepep_v2_cache_is_valid \
+ "$root" "$marker" "$revision" "$tree" "$fmt_revision" "$cache_key" \
+ || { cx_log "ERROR: DeepEP V2 cache validation failed"; exit 1; }
+ ); then
+ cx_log "ERROR: shared DeepEP V2 environment is incomplete"
+ return 1
+ fi
+ # Environment changes made inside the subshell are gone; repeat them in this shell.
+ cx_activate_deepep_v2 || return 1
+ cx_prepare_deepep_toolchain || return 1
+ cx_enable_deepep_v2_jit_reproducibility || return 1
+ EP_NVSHMEM_ROOT_DIR="$NVSHMEM_DIR"
+ export EP_NVSHMEM_ROOT_DIR
+ cx_probe_deepep_v2 || { cx_log "ERROR: DeepEP V2 shared runtime probe failed"; return 1; }
+ cx_log "DeepEP V2 ready ($DEEPEP_V2_COMMIT, ElasticBuffer, NCCL Device API; LSA/Gin selected by adapter)"
+}
+
+# Build the pinned DeepEP `hybrid-ep` implementation. MNNVL remains one scale-up
+# domain; true x86 scale-out uses the upstream DOCA/RDMA build explicitly.
+#
+# Select the hybrid-ep build mode and export it as DEEPEP_HYBRID_BUILD_MODE:
+#   intradomain     when CX_NODES <= 1 (default 1) or CX_TRANSPORT is mnnvl;
+#   multinode-doca  otherwise, additionally exporting HYBRID_EP_MULTINODE=1, USE_NIXL=0
+#                   and RDMA_CORE_HOME=/usr.
+# Any inherited values of those four variables are cleared first.
+# Returns 1, after logging, when the multinode prerequisites are not met: x86_64 host,
+# GLOO_SOCKET_IFNAME and NCCL_IB_HCA set and naming existing interfaces/devices, `make`,
+# the verbs/mlx5dv headers under /usr/include/infiniband, and the ibverbs and mlx5 libraries.
+cx_configure_deepep_hybrid_build() {
+ local interface device rdma_name
+ local -a interfaces devices
+ unset HYBRID_EP_MULTINODE USE_NIXL RDMA_CORE_HOME DEEPEP_HYBRID_BUILD_MODE
+ if [ "${CX_NODES:-1}" -le 1 ] || [ "${CX_TRANSPORT:-}" = mnnvl ]; then
+ export DEEPEP_HYBRID_BUILD_MODE=intradomain
+ return 0
+ fi
+ [ "$(uname -m)" = x86_64 ] \
+ || { cx_log "ERROR: hybrid-ep scale-out is registered only on x86_64"; return 1; }
+ [ -n "${GLOO_SOCKET_IFNAME:-}" ] && [ -n "${NCCL_IB_HCA:-}" ] \
+ || { cx_log "ERROR: hybrid-ep scale-out network selectors are unavailable"; return 1; }
+ # Every comma-separated socket interface must exist on this host.
+ IFS=, read -r -a interfaces <<< "$GLOO_SOCKET_IFNAME"
+ for interface in "${interfaces[@]}"; do
+ [ -d "/sys/class/net/$interface" ] \
+ || { cx_log "ERROR: configured hybrid-ep socket interface is absent"; return 1; }
+ done
+ # Every comma-separated HCA must exist; anything from the first ':' on is ignored.
+ IFS=, read -r -a devices <<< "$NCCL_IB_HCA"
+ for device in "${devices[@]}"; do
+ rdma_name="${device%%:*}"
+ [ -d "/sys/class/infiniband/$rdma_name" ] \
+ || { cx_log "ERROR: configured hybrid-ep RDMA device is absent"; return 1; }
+ done
+ command -v make >/dev/null \
+ || { cx_log "ERROR: make is required for hybrid-ep scale-out"; return 1; }
+ [ -r /usr/include/infiniband/verbs.h ] && [ -r /usr/include/infiniband/mlx5dv.h ] \
+ || { cx_log "ERROR: pinned hybrid-ep RDMA headers are unavailable"; return 1; }
+ # The heredoc is the library probe; the `|| { ... }` group that closes after it runs
+ # when the probe exits non-zero.
+ python3 - <<'PY' >/dev/null 2>&1 || {
+import ctypes.util
+import sys
+sys.exit(0 if all(ctypes.util.find_library(name) for name in ("ibverbs", "mlx5")) else 1)
+PY
+ cx_log "ERROR: pinned hybrid-ep RDMA libraries are unavailable"
+ return 1
+ }
+ export HYBRID_EP_MULTINODE=1 USE_NIXL=0 RDMA_CORE_HOME=/usr
+ export DEEPEP_HYBRID_BUILD_MODE=multinode-doca
+}
+
+# Validate the hybrid-ep build completion marker and print the content digest it stores
+# (no trailing newline).
+# Args: $1 build root, $2 marker path, $3 revision, $4 tree, $5 build mode (optional).
+# Exits 1 unless: the root is a directory with mode exactly 0700; the marker is a regular
+# file with the root's owner, mode exactly 0600 and at most 512 bytes; and the marker
+# holds the expected identifier lines (revision, tree, plus build mode when one is given)
+# followed by exactly one line containing a 64-hex digest.
+cx_deepep_hybrid_marker_content_sha256() {
+ python3 - "$1" "$2" "$3" "$4" "${5:-}" <<'PY'
+import os
+import re
+import stat
+import sys
+
+root, marker, revision, tree, build_mode = sys.argv[1:]
+try:
+ # lstat, so a symlinked root or marker never passes the type checks.
+ root_item = os.lstat(root)
+ marker_item = os.lstat(marker)
+ if (
+ not stat.S_ISDIR(root_item.st_mode)
+ or stat.S_IMODE(root_item.st_mode) & 0o777 != 0o700
+ or not stat.S_ISREG(marker_item.st_mode)
+ or marker_item.st_uid != root_item.st_uid
+ or stat.S_IMODE(marker_item.st_mode) & 0o777 != 0o600
+ or marker_item.st_size > 512
+ ):
+ raise OSError
+ descriptor = os.open(marker, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0))
+ try:
+ # The opened file must be the same inode that lstat examined.
+ opened = os.fstat(descriptor)
+ if (opened.st_dev, opened.st_ino) != (marker_item.st_dev, marker_item.st_ino):
+ raise OSError
+ payload = os.read(descriptor, 513)
+ finally:
+ os.close(descriptor)
+ lines = payload.decode("ascii").splitlines()
+ # An empty build mode selects the two-identifier layout (revision and tree only).
+ expected = [revision, tree, build_mode] if build_mode else [revision, tree]
+ if len(lines) != len(expected) + 1 or lines[:-1] != expected:
+ raise ValueError
+ if not re.fullmatch(r"[0-9a-f]{64}", lines[-1]):
+ raise ValueError
+except (OSError, UnicodeError, ValueError):
+ # Every validation failure collapses into a silent exit status of 1.
+ raise SystemExit(1)
+print(lines[-1], end="")
+PY
+}
+
+cx_deepep_hybrid_cache_is_valid() {
+ local root="$1" marker="$2" revision="$3" tree="$4" build_mode="${5:-}"
+ local expected actual status extra
+ expected="$(cx_deepep_hybrid_marker_content_sha256 \
+ "$root" "$marker" "$revision" "$tree" "$build_mode")" || return 1
+ [ "$(cx_git_in_tree "$root" rev-parse HEAD 2>/dev/null)" = "$revision" ] \
+ && [ "$(cx_git_in_tree "$root" rev-parse 'HEAD^{tree}' 2>/dev/null)" = "$tree" ] \
+ || return 1
+ status="$(cx_git_in_tree "$root" status --porcelain --untracked-files=no \
+ --ignore-submodules=none 2>/dev/null)" || return 1
+ [ -z "$status" ] || return 1
+ extra="$(cx_git_in_tree "$root" ls-files --others --exclude-standard -- \
+ 'deep_ep/*.py' 'deep_ep/*.so' 2>/dev/null)" || return 1
+ [ -z "$extra" ] || return 1
+ extra="$(cx_git_in_tree "$root" ls-files --others --ignored --exclude-standard -- \
+ 'deep_ep/*.py' 'deep_ep/*.so' 2>/dev/null)" || return 1
+ [ -z "$extra" ] || return 1
+ actual="$(cx_extension_pair_sha256 "$root" 'deep_ep_cpp*.so' 'hybrid_ep_cpp*.so')" \
+ || return 1
+ [ "$actual" = "$expected" ]
+}
+
+# Build, or reuse, the pinned DeepEP hybrid-ep extensions in place and put them on PYTHONPATH.
+# The build tree is $PWD/.cx_backend/deepep-hybrid-<arch without dot>-<build mode>; all
+# cache mutation happens in a subshell holding an exclusive flock on "<build root>.lock".
+# Exports DEEPEP_COMMIT and DEEPEP_TREE from CX_DEEPEP_HYBRID_COMMIT / CX_DEEPEP_HYBRID_TREE.
+# Returns 1 when the CUDA target, build configuration, toolchain, build, marker
+# validation or the final import check fails.
+cx_build_deepep_hybrid() {
+ local arch revision="$CX_DEEPEP_HYBRID_COMMIT" tree="$CX_DEEPEP_HYBRID_TREE"
+ local build_root marker marker_tmp lock_path content_sha256 cache_ready build_mode
+ export DEEPEP_COMMIT="$revision" DEEPEP_TREE="$tree"
+ arch="$(cx_cuda_arch)" || return 1
+ cx_configure_deepep_hybrid_build || return 1
+ build_mode="$DEEPEP_HYBRID_BUILD_MODE"
+ build_root="$PWD/.cx_backend/deepep-hybrid-${arch/./}-${build_mode}"
+ marker="$build_root/.collectivex-complete"
+ lock_path="${build_root}.lock"
+ cx_log "DeepEP hybrid-ep: building $revision for CUDA target $arch"
+ # Any inherited NVSHMEM_DIR is dropped before the toolkit is prepared.
+ unset NVSHMEM_DIR
+ cx_prepare_cuda_cccl || return 1
+ command -v flock >/dev/null || { cx_log "ERROR: flock is required for hybrid-ep"; return 1; }
+ mkdir -p "$PWD/.cx_backend" || return 1
+ # Everything in this subshell runs under the lock; fd 9 is released when it exits.
+ if ! (
+ [ ! -L "$lock_path" ] || exit 1
+ (umask 077; : >> "$lock_path") && chmod 600 "$lock_path" || exit 1
+ exec 9<>"$lock_path" || exit 1
+ flock 9 || exit 1
+ cache_ready=0
+ # An existing marker must validate; an invalid published build is never rebuilt over.
+ if [ -e "$marker" ] || [ -L "$marker" ]; then
+ cx_deepep_hybrid_cache_is_valid \
+ "$build_root" "$marker" "$revision" "$tree" "$build_mode" \
+ || exit 1
+ cache_ready=1
+ fi
+ if [ "$cache_ready" != 1 ]; then
+ cx_materialize_backend_source deepep-hybrid "$build_root" \
+ || { cx_log "ERROR: hybrid-ep staged source is invalid"; exit 1; }
+ # The multinode build additionally needs the pinned NCCL source under third-party/nccl.
+ if [ "$build_mode" = multinode-doca ]; then
+ [ "$(cx_git_in_tree "$build_root/third-party/nccl" rev-parse HEAD 2>/dev/null)" \
+ = "$CX_DEEPEP_HYBRID_NCCL_COMMIT" ] \
+ || { cx_log "ERROR: pinned hybrid-ep NCCL transport source is absent"; exit 1; }
+ fi
+ # Build in place for the verified CUDA target only, with SOURCE_DATE_EPOCH set to the
+ # HEAD commit time; build output is routed to stderr.
+ (cd "$build_root" && \
+ SOURCE_DATE_EPOCH="$(cx_git_in_tree "$build_root" show -s --format=%ct HEAD)" \
+ TORCH_CUDA_ARCH_LIST="$arch" MAX_JOBS=16 \
+ python3 setup.py build_ext --inplace) >&2 2>&1 \
+ || { cx_log "ERROR: hybrid-ep build failed"; exit 1; }
+ content_sha256="$(cx_extension_pair_sha256 \
+ "$build_root" 'deep_ep_cpp*.so' 'hybrid_ep_cpp*.so')" || exit 1
+ # Write the marker to a temporary file and rename it into place, so that the marker
+ # only ever appears fully written.
+ marker_tmp="$(mktemp "$build_root/.collectivex-complete.tmp.XXXXXX")" || exit 1
+ chmod 600 "$marker_tmp" || exit 1
+ printf '%s\n%s\n%s\n%s\n' \
+ "$revision" "$tree" "$build_mode" "$content_sha256" > "$marker_tmp" \
+ || exit 1
+ mv -f -- "$marker_tmp" "$marker" || exit 1
+ fi
+ # Re-validate in every case; this status is the subshell's exit status.
+ cx_deepep_hybrid_cache_is_valid \
+ "$build_root" "$marker" "$revision" "$tree" "$build_mode"
+ ); then
+ cx_log "ERROR: shared hybrid-ep build is incomplete"
+ return 1
+ fi
+ # NOTE(review): when PYTHONPATH is unset this leaves a trailing ':' (an empty entry) —
+ # confirm that is acceptable.
+ export PYTHONPATH="$build_root:${PYTHONPATH:-}"
+ python3 -c "import deep_ep; assert hasattr(deep_ep,'HybridEPBuffer'); print('built hybrid-ep deep_ep', getattr(deep_ep,'__version__','?'))" >&2 \
+ || { cx_log "ERROR: hybrid-ep import / HybridEPBuffer missing after build"; return 1; }
+ cx_log "DeepEP hybrid-ep ready ($DEEPEP_COMMIT, mode=$build_mode)"
+}
+
+# UCCL EP (uccl.ep.Buffer is a DeepEP-API clone). The prebuilt wheel is cu12; on a cu13
+# image its kernels need a cu12 CUDA runtime on LD_LIBRARY_PATH (probe-confirmed). PEP-668 images
+# need PIP_BREAK_SYSTEM_PACKAGES. Only the cu12 lib-dir lookup is best-effort; other failures return 1.
+cx_build_uccl() {
+ if [ -f /tmp/.cx_built_uccl ]; then  # marker written at the end of a successful run of this function
+ cx_log "UCCL EP already prepared this allocation — skip rebuild"
+ python3 -c "import torch; from uccl_deepep import Buffer" 2>/dev/null || return 1  # the marker alone is not trusted: the wrapper must import in this shell
+ return 0
+ fi
+ local version="0.1.1" tag="v0.1.1"
+ local wheel_sha256="390c1320918972206546e44d79b132988f2818ec07e23afcd0595f7183916cec"
+ cx_log "UCCL EP: installing uccl==$version + cu12 runtime shim"
+ export PIP_BREAK_SYSTEM_PACKAGES=1
+ pip install -q --no-deps "sortedcontainers==2.4.0" "intervaltree==3.1.0" >&2 2>&1 \
+ || { cx_log "ERROR: UCCL support dependency installation failed"; return 1; }
+ printf 'uccl==%s --hash=sha256:%s\n' "$version" "$wheel_sha256" \
+ | pip install -q --no-deps --only-binary=:all: --require-hashes -r /dev/stdin >&2 2>&1 \
+ || { cx_log "ERROR: pip install uccl==$version failed"; return 1; }
+ pip install -q --no-deps "nvidia-cuda-runtime-cu12==12.9.79" >&2 2>&1 \
+ || { cx_log "ERROR: CUDA 12 runtime shim install failed"; return 1; }
+ local cu12lib
+ cu12lib="$(python3 -c "import nvidia.cuda_runtime as m, os; print(os.path.join(os.path.dirname(m.__file__),'lib'))" 2>/dev/null)"
+ [ -n "$cu12lib" ] && export LD_LIBRARY_PATH="$cu12lib:${LD_LIBRARY_PATH:-}"  # skipped silently when the lookup printed nothing
+ local installed
+ installed="$(python3 -c 'import importlib.metadata as m; print(m.version("uccl"))')" \
+ || { cx_log "ERROR: cannot read installed UCCL version"; return 1; }
+ [ "$installed" = "$version" ] \
+ || { cx_log "ERROR: expected UCCL $version, installed $installed"; return 1; }
+ UCCL_COMMIT="pkg-$installed"  # the wheel has no git checkout, so the identity is the installed package version
+ export UCCL_COMMIT
+ # import torch FIRST: uccl.ep's C extension links libc10.so (torch), which is only on the loader
+ # path once torch is imported (rpath). The adapter (ep_uccl.py) imports torch before uccl.ep too.
+ python3 -c "import torch; from uccl.ep import Buffer; print('uccl.ep ready')" >&2 \
+ || { cx_log "ERROR: uccl.ep import failed (cu12 runtime on LD_LIBRARY_PATH?)"; return 1; }
+ # Vendor UCCL's DeepEP-API wrapper (ep/deep_ep_wrapper/deep_ep) under a NON-conflicting name
+ # (uccl_deepep) so it doesn't shadow the container's real deep_ep. Its Buffer(group, num_nvl_bytes,
+ # ...) takes a torch ProcessGroup (matching DeepEP + ep_uccl.py's calls) and runs the full
+ # proxy/IPC-handle/runtime.sync bootstrap that the low-level uccl.ep.Buffer(rank,num_ranks) lacks.
+ rm -rf /tmp/uccl_src /tmp/uccl_deepep_pkg  # start from clean clone and package directories
+ # Pin the wrapper to the SAME tag as the installed wheel (pkg-0.1.1 -> v0.1.1): the wrapper's
+ # dispatch calls into uccl.ep (get_rdma_buffer etc.), so a main-branch wrapper vs a 0.1.1 wheel
+ # mismatches signatures. Match them.
+ if git clone --depth 1 --branch "$tag" https://github.com/uccl-project/uccl /tmp/uccl_src >&2 2>&1 \
+ && [ "$(git -C /tmp/uccl_src rev-parse HEAD)" = "73ee4f12ba71717d6de34ba06806e1baaabe3f42" ] \
+ && [ -d /tmp/uccl_src/ep/deep_ep_wrapper/deep_ep ]; then  # the tag must resolve to the pinned commit
+ mkdir -p /tmp/uccl_deepep_pkg/uccl_deepep
+ cp /tmp/uccl_src/ep/deep_ep_wrapper/deep_ep/*.py /tmp/uccl_deepep_pkg/uccl_deepep/ 2>/dev/null  # a failed copy surfaces in the import check below
+ export PYTHONPATH="/tmp/uccl_deepep_pkg:${PYTHONPATH:-}"
+ python3 -c "import torch; from uccl_deepep import Buffer; print('uccl_deepep wrapper ready')" >&2 \
+ || { cx_log "ERROR: uccl_deepep wrapper import failed"; return 1; }
+ export CX_UCCL_WRAPPER=1
+ export UCCL_WRAPPER_COMMIT="73ee4f12ba71717d6de34ba06806e1baaabe3f42"  # same commit the clone was checked against
+ else
+ cx_log "ERROR: uccl deep_ep_wrapper not available"
+ return 1
+ fi
+ : > /tmp/.cx_built_uccl  # create the marker tested at the top of this function
+ cx_log "UCCL EP ready ($UCCL_COMMIT, wrapper=${CX_UCCL_WRAPPER:-0})"
+}
+
+# Rack build and rank steps may enter different container instances. Persist each node's
+# loader/import path and build identity on the shared staged mount, then require it from every rank.
+cx_persist_backend_env() {
+ local root="$PWD/.cx_backend/env" node_id="${SLURM_NODEID:-0}" path temporary name
+ local -a names=(PATH VIRTUAL_ENV LD_LIBRARY_PATH PYTHONPATH CUDA_HOME CPATH NVCC_PREPEND_FLAGS
+ NVSHMEM_DIR DEEPEP_COMMIT DEEPEP_TREE
+ EP_NCCL_ROOT_DIR EP_NVSHMEM_ROOT_DIR EP_JIT_CACHE_DIR EP_REUSE_NCCL_COMM
+ EP_JIT_DUMP_SASS
+ DEEPEP_V2_PR DEEPEP_V2_FIX_PR DEEPEP_V2_COMMIT DEEPEP_V2_TREE DEEPEP_V2_FMT_COMMIT
+ DEEPEP_V2_JIT_RANDOM_SEED
+ HYBRID_EP_MULTINODE USE_NIXL RDMA_CORE_HOME DEEPEP_HYBRID_BUILD_MODE
+ UCCL_COMMIT UCCL_WRAPPER_COMMIT CX_UCCL_WRAPPER)
+ [[ "$node_id" =~ ^[0-9]+$ ]] || return 1
+ mkdir -p "$root" || return 1
+ chmod 700 "$root" || return 1
+ temporary="$(mktemp "$root/.node-${node_id}.XXXXXX")" || return 1
+ chmod 600 "$temporary" || { rm -f "$temporary"; return 1; }
+ for name in "${names[@]}"; do
+ if declare -p "$name" >/dev/null 2>&1; then
+ printf 'export %s=%q\n' "$name" "${!name}" >> "$temporary" \
+ || { rm -f "$temporary"; return 1; }
+ fi
+ done
+ path="$root/node-${node_id}.sh"
+ mv -f -- "$temporary" "$path" || { rm -f "$temporary"; return 1; }
+}
+
+# Validate private scale-out selectors on every allocated compute node before a backend can
+# initialize or build transport code. Returns 0 for single-node or mnnvl runs, 1 on a bad selector.
+cx_probe_scaleout_network() {
+  local interface device rdma_name name_re='^[^/.][^/]*$'  # non-empty, no "/", no leading "."
+  local -a interfaces devices
+  if [ "${CX_NODES:-1}" -le 1 ] || [ "${CX_TRANSPORT:-}" = mnnvl ]; then
+    return 0
+  fi
+  [ -n "${GLOO_SOCKET_IFNAME:-}" ] && [ -n "${NCCL_IB_HCA:-}" ] \
+    || { cx_log "ERROR: scale-out network selectors are unavailable"; return 1; }
+  IFS=, read -r -a interfaces <<< "$GLOO_SOCKET_IFNAME"
+  for interface in "${interfaces[@]}"; do  # an empty element would otherwise match the /sys/class/net directory itself
+    [[ "$interface" =~ $name_re ]] && [ -d "/sys/class/net/$interface" ] \
+      || { cx_log "ERROR: configured scale-out socket interface is absent"; return 1; }
+  done
+  IFS=, read -r -a devices <<< "$NCCL_IB_HCA"
+  for device in "${devices[@]}"; do
+    rdma_name="${device%%:*}"  # drop an optional ":port" suffix; a bare ":1" leaves an empty name, rejected below
+    [[ "$rdma_name" =~ $name_re ]] && [ -d "/sys/class/infiniband/$rdma_name" ] \
+      || { cx_log "ERROR: configured scale-out RDMA device is absent"; return 1; }
+  done
+}
+
+# Prepare and probe one backend without running a benchmark. The same hook is used
+# by normal in-container runs and by rack launchers' persistent build-only step.
+cx_prepare_backend() {
+ local backend="${1:-}"
+ case "$backend" in
+ deepep)
+ cx_probe_deepep || return 1
+ ;;
+ deepep-v2)
+ cx_build_deepep_v2 || return 1
+ ;;
+ deepep-hybrid)
+ cx_build_deepep_hybrid || return 1
+ ;;
+ uccl)
+ cx_build_uccl || return 1
+ ;;
+ mori)
+ python3 -c "import mori" 2>/dev/null || return 1
+ ;;
+ nccl-ep)
+ ;;
+ *)
+ cx_log "ERROR: unknown backend preparation request"
+ return 1
+ ;;
+ esac
+}
+
+prepare_backend_or_record() {
+ local backend="$1" phases="${CX_PHASE:-decode}" phase
+ cx_write_runtime_stage backend-setup || return 1
+ if cx_prepare_backend "$backend"; then
+ return 0
+ fi
+ cx_log "WARN: $backend preparation failed"
+ [ "$phases" = "both" ] && phases="decode prefill"
+ for phase in $phases; do
+ CX_FAILURE_MODE=backend-setup emit_failed_case "$backend" "$phase" 6
+ done
+ return 1
+}
+
+# Run the benchmark selected by the CURRENT CX_BENCH (plus the CX_* config env) exactly once. In
+# SHARD mode the sweep workflow calls this many times inside one container and its built backend.
+dispatch_bench() {
+  case "$CX_BENCH" in
+    deepep|deepep-v2|deepep-hybrid|mori|uccl)
+      # Prepare first; when preparation fails its status is returned and the suite is skipped.
+      prepare_backend_or_record "$CX_BENCH" || return
+      run_ep_suite "$CX_BENCH"
+      ;;
+    nccl-ep) run_ep_suite "$CX_BENCH" ;;
+    *)
+      cx_die "unknown CX_BENCH=$CX_BENCH (want deepep|deepep-v2|mori|uccl|nccl-ep|deepep-hybrid)"
+      ;;
+  esac
+}
+
+rc=0
+cx_validate_shard_control "$PWD"
+# Build-only mode: rack launchers run the shared backend preparation hook once per
+# node inside a persistent named container, then direct rank processes reuse it.
+if [ -n "${CX_BUILD_ONLY:-}" ]; then
+  if cx_probe_scaleout_network && cx_prepare_backend "${CX_BENCH:-}"; then
+    cx_persist_backend_env || rc=1
+  else
+    rc=1
+  fi
+  cx_log "backend preparation: bench=${CX_BENCH:-unknown} rc=$rc"
+  exit "$rc"
+fi
+if [ -n "${CX_SHARD_FILE:-}" ]; then
+  # SHARD/SWEEP mode (collectivex-sweep.yml): run EVERY case of this shard in THIS one allocation.
+  # All cases share (sku, backend, nodes), so backend preparation is paid once and cached.
+  # The shard path is passed as argv, never spliced into Python source; a bad count fails the shard.
+  ncases="$(python3 -c 'import json, sys; print(len(json.load(open(sys.argv[1]))["cases"]))' "$CX_SHARD_FILE")" || ncases=""
+  [[ "$ncases" =~ ^[0-9]+$ ]] || { cx_log "ERROR: cannot read the case count from shard=$CX_SHARD_FILE"; ncases=0; rc=1; }
+  cx_log "SHARD mode: $ncases case(s) in one allocation (shard=$CX_SHARD_FILE)"
+  _cx_ts_base="$CX_TS"  # per-case CX_TS suffix keeps result files UNIQUE (cases sharing backend+phase would collide)
+  ci=0 failed_cases=0
+  while [ "$ci" -lt "$ncases" ]; do
+    CX_TS="${_cx_ts_base}-c$(printf '%03d' "$ci")"
+    export CX_TS
+    # Map varying case fields plus the frozen v1 defaults into CX_* env; a failed mapping fails the case.
+    _exports="$(python3 - "$CX_SHARD_FILE" "$ci" <<'PY'
+import json, sys, shlex
+c = json.load(open(sys.argv[1]))["cases"][int(sys.argv[2])]
+def g(k, d=""):
+    v = c.get(k, d); return "" if v is None else str(v)
+env = {
+    "CX_BENCH": g("backend"),
+    "CX_MODE": g("mode", "normal"),
+    "CX_ROUTING": g("routing", "uniform"), "CX_PHASE": g("phase", "decode"),
+    "CX_EP": g("ep", "1"),
+    "CX_EPLB": "1" if c.get("eplb") else "",
+    "CX_CASE_ID": g("case_id"), "CX_SUITE": g("suite"), "CX_WORKLOAD_NAME": g("workload"),
+    "CX_REQUIRED_PUBLICATION": g("required_publication"),
+    "CX_HIDDEN": g("hidden"), "CX_TOPK": g("topk"), "CX_EXPERTS": g("experts"),
+    "CX_TOKENS_LADDER": g("ladder"), "CX_CANONICAL": ("1" if c.get("canonical") else ""),
+    "CX_NODES": g("nodes"), "CX_GPUS_PER_NODE": g("gpus_per_node"),
+    "CX_SCALE_UP_DOMAIN": g("scale_up_domain"), "CX_SCOPE": g("scope"),
+    "CX_SCALE_UP_TRANSPORT": g("scale_up_transport"),
+    "CX_SCALE_OUT_TRANSPORT": g("scale_out_transport"),
+    "CX_TRANSPORT": g("transport"), "CX_TOPO": g("topology_class"),
+    "CX_SAMPLES_PER_POINT": g("samples_per_point"),
+    "CX_WARMUP_SEMANTICS": g("warmup_semantics"),
+}
+lines = [f"export {k}={shlex.quote(v)}" for k, v in env.items()]
+# Per-case timing "iters:trials:warmup" (fixed-512-v1 requires 8:64:32 everywhere). UNSET all three
+# first, then export only the non-empty parts: a missing or partial timing falls back to the harness
+# defaults instead of keeping the previous case value (an empty CX_ITERS would defeat the 8-iter default
+# and break the run_ep argparse; NOTE no apostrophes in this heredoc: bash chokes on unbalanced quotes).
+lines.append("unset CX_ITERS CX_TRIALS CX_WARMUP 2>/dev/null || true")
+parts = (g("timing").split(":") + ["", "", ""])[:3]
+for k, v in zip(("CX_ITERS", "CX_TRIALS", "CX_WARMUP"), parts):
+    if v:
+        lines.append(f"export {k}={shlex.quote(v)}")
+print("\n".join(lines))
+PY
+)" || {
+      failed_cases=$((failed_cases+1)); ci=$((ci + 1))  # else eval of "" would re-run the previous case env
+      cx_log "  [$ci/$ncases] case env mapping FAILED; case skipped"
+      continue
+    }
+    eval "$_exports"
+    # Each case has its OWN routing/dims -> its own canonical workload manifest. cx_stage_canonical
+    # short-circuits when CX_WORKLOAD_DIR is already set, so without this unset later cases reuse the
+    # first staged dir and run_ep.py fails (FileNotFoundError .cx_workloads/.manifest.json).
+    unset CX_WORKLOAD_DIR 2>/dev/null || true
+    cx_log "  [$((ci+1))/$ncases] $CX_BENCH $CX_MODE/$CX_PHASE routing=$CX_ROUTING eplb=${CX_EPLB:-0}"
+    _cx_case_ts="$CX_TS"
+    CX_TS="${_cx_case_ts}-a01"
+    export CX_ATTEMPT_ID=1 CX_TS
+    dispatch_bench || {
+      failed_cases=$((failed_cases+1))
+      cx_log "  [$((ci+1))/$ncases] $CX_BENCH case FAILED; failed-case record preserved"
+    }
+    export CX_TS="$_cx_case_ts"
+    ci=$((ci + 1))
+  done
+  if [ "${failed_cases:-0}" -gt 0 ]; then
+    cx_log "SHARD done: $failed_cases/$ncases case(s) failed"
+    rc=1
+  fi
+  # The base timestamp matches every per-case file, so the final summary covers the whole shard.
+  export CX_TS="$_cx_ts_base"
+else
+  _cx_single_ts="$CX_TS"
+  CX_TS="${_cx_single_ts}-a01"
+  export CX_ATTEMPT_ID=1 CX_TS
+  dispatch_bench || rc=1
+fi
+
+# Summary table for the log; also fails the job if no valid results were produced.
+python3 summarize.py --results-dir results --runner "$CX_RUNNER" --ts "$CX_TS" || rc=1
+exit "$rc"
diff --git a/experimental/CollectiveX/schemas/channel-v1.schema.json b/experimental/CollectiveX/schemas/channel-v1.schema.json
new file mode 100644
index 0000000000..663e22914b
--- /dev/null
+++ b/experimental/CollectiveX/schemas/channel-v1.schema.json
@@ -0,0 +1,23 @@
+{
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "$id": "https://inferencex.com/schemas/collectivex/channel-v1.schema.json",
+ "title": "CollectiveX public channel v1",
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["format","channel","dataset","generated_at"],
+ "properties": {
+ "format": {"const": "collectivex.channel.v1"},
+ "channel": {"enum": ["latest-attempt","dev-latest"]},
+ "dataset": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["path","sha256","bytes"],
+ "properties": {
+ "path": {"type": "string","pattern": "^datasets/[0-9a-f]{64}/dataset\\.json$"},
+ "sha256": {"type": "string","pattern": "^[0-9a-f]{64}$"},
+ "bytes": {"type": "integer","minimum": 1,"maximum": 33554432}
+ }
+ },
+ "generated_at": {"type": "string","format": "date-time"}
+ }
+}
diff --git a/experimental/CollectiveX/schemas/private-bundle-v1.schema.json b/experimental/CollectiveX/schemas/private-bundle-v1.schema.json
new file mode 100644
index 0000000000..166c808930
--- /dev/null
+++ b/experimental/CollectiveX/schemas/private-bundle-v1.schema.json
@@ -0,0 +1,162 @@
+{
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "$id": "https://inferencex.com/schemas/collectivex/private-bundle-v1.schema.json",
+ "title": "CollectiveX private attempt bundle v1",
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "format",
+ "schema_version",
+ "created_at",
+ "ingest_id",
+ "run",
+ "matrix",
+ "sources",
+ "attempts",
+ "coverage",
+ "runtime_fingerprints",
+ "checksums",
+ "validation"
+ ],
+ "properties": {
+ "format": {"const": "collectivex.private.bundle.v1"},
+ "schema_version": {"const": 1},
+ "created_at": {"type": "string","format": "date-time"},
+ "ingest_id": {"$ref": "#/$defs/sha256"},
+ "run": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["repository","run_id","run_attempt","source_sha"],
+ "properties": {
+ "repository": {"type": "string","pattern": "^[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+$"},
+ "run_id": {"type": "string","pattern": "^[1-9][0-9]*$"},
+ "run_attempt": {"type": "integer","minimum": 1},
+ "source_sha": {"type": "string","pattern": "^[0-9a-f]{40}$"}
+ }
+ },
+ "matrix": {"$ref": "#/$defs/file"},
+ "sources": {"type": "array","minItems": 1,"uniqueItems": true,"items": {"$ref": "#/$defs/source"}},
+ "attempts": {
+ "type": "array",
+ "minItems": 1,
+ "items": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "attempt_id",
+ "allocation_id",
+ "case_id",
+ "outcome",
+ "reason",
+ "selected",
+ "document",
+ "samples",
+ "runtime_fingerprint_sha256",
+ "series_ids",
+ "evidence_ids"
+ ],
+ "properties": {
+ "attempt_id": {"$ref": "#/$defs/attemptId"},
+ "allocation_id": {"$ref": "#/$defs/allocationId"},
+ "case_id": {"$ref": "#/$defs/caseId"},
+ "outcome": {"$ref": "#/$defs/outcome"},
+ "reason": {"$ref": "#/$defs/reason"},
+ "selected": {"type": "boolean"},
+ "document": {"$ref": "#/$defs/file"},
+ "samples": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/file"}]},
+ "runtime_fingerprint_sha256": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/sha256"}]},
+ "series_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/seriesId"}},
+ "evidence_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/evidenceId"}}
+ }
+ }
+ },
+ "coverage": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["expected_cases","terminal_cases","complete","outcome_counts","selections"],
+ "properties": {
+ "expected_cases": {"type": "integer","minimum": 1},
+ "terminal_cases": {"type": "integer","minimum": 0},
+ "complete": {"type": "boolean"},
+ "outcome_counts": {"$ref": "#/$defs/outcomeCounts"},
+ "selections": {
+ "type": "array",
+ "minItems": 1,
+ "items": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["case_id","selected_attempt_id","outcome"],
+ "properties": {
+ "case_id": {"$ref": "#/$defs/caseId"},
+ "selected_attempt_id": {"$ref": "#/$defs/attemptId"},
+ "outcome": {"$ref": "#/$defs/outcome"}
+ }
+ }
+ }
+ }
+ },
+ "runtime_fingerprints": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/sha256"}},
+ "checksums": {"$ref": "#/$defs/file"},
+ "validation": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["policy","passed","checks"],
+ "properties": {
+ "policy": {"const": "collectivex-publisher-v1"},
+ "passed": {"const": true},
+ "checks": {
+ "type": "array",
+ "minItems": 1,
+ "uniqueItems": true,
+ "items": {"type": "string","pattern": "^[a-z0-9][a-z0-9.-]*$"}
+ }
+ }
+ }
+ },
+ "$defs": {
+ "sha256": {"type": "string","pattern": "^[0-9a-f]{64}$"},
+ "caseId": {"type": "string","pattern": "^cxcase-v1-[0-9a-f]{64}$"},
+ "seriesId": {"type": "string","pattern": "^cxseries-v1-[0-9a-f]{64}$"},
+ "evidenceId": {"type": "string","pattern": "^cxevidence-v1-[0-9a-f]{64}$"},
+ "allocationId": {"type": "string","pattern": "^cxallocation-v1-[0-9a-f]{64}$"},
+ "attemptId": {"type": "string","pattern": "^cxattempt-v1-[0-9a-f]{64}$"},
+ "reason": {"oneOf": [{"type": "null"},{"type": "string","pattern": "^[a-z0-9][a-z0-9.-]*$","maxLength": 96}]},
+ "outcome": {"enum": ["success","unsupported","failed","invalid","diagnostic"]},
+ "outcomeCounts": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["success","unsupported","failed","invalid","diagnostic"],
+ "properties": {
+ "success": {"type": "integer","minimum": 0},
+ "unsupported": {"type": "integer","minimum": 0},
+ "failed": {"type": "integer","minimum": 0},
+ "invalid": {"type": "integer","minimum": 0},
+ "diagnostic": {"type": "integer","minimum": 0}
+ }
+ },
+ "file": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["path","sha256","bytes"],
+ "properties": {
+ "path": {"type": "string","pattern": "^[A-Za-z0-9_.-]+(?:/[A-Za-z0-9_.-]+)*$"},
+ "sha256": {"$ref": "#/$defs/sha256"},
+ "bytes": {"type": "integer","minimum": 1}
+ }
+ },
+ "source": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["path","sha256","bytes","artifact_name"],
+ "properties": {
+ "path": {"type": "string","pattern": "^[A-Za-z0-9_.-]+(?:/[A-Za-z0-9_.-]+)*$"},
+ "sha256": {"$ref": "#/$defs/sha256"},
+ "bytes": {"type": "integer","minimum": 1},
+ "artifact_name": {
+ "type": "string",
+ "pattern": "^cx(?:unsupported|shard-[a-z0-9][a-z0-9_.-]{0,127})-[1-9][0-9]*-[1-9][0-9]*$"
+ }
+ }
+ }
+ }
+}
diff --git a/experimental/CollectiveX/schemas/public-dataset-v1.schema.json b/experimental/CollectiveX/schemas/public-dataset-v1.schema.json
new file mode 100644
index 0000000000..1b51a8c579
--- /dev/null
+++ b/experimental/CollectiveX/schemas/public-dataset-v1.schema.json
@@ -0,0 +1,606 @@
+{
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "$id": "https://inferencex.com/schemas/collectivex/public-dataset-v1.schema.json",
+ "title": "CollectiveX sanitized public dataset v1",
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "format",
+ "schema_version",
+ "generated_at",
+ "source_bundle_ids",
+ "promotion",
+ "coverage",
+ "attempts",
+ "series",
+ "cohorts",
+ "rankings",
+ "recommendations",
+ "sensitivities"
+ ],
+ "properties": {
+ "format": {"const": "collectivex.public.v1"},
+ "schema_version": {"const": 1},
+ "generated_at": {"type": "string","format": "date-time"},
+ "source_bundle_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/sha256"}},
+ "promotion": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "status",
+ "reason",
+ "matrix_id",
+ "allocation_ids",
+ "required_allocations",
+ "requested_cases",
+ "terminal_cases",
+ "policy"
+ ],
+ "properties": {
+ "status": {"enum": ["promoted","diagnostic","quarantined"]},
+ "reason": {"$ref": "#/$defs/reason"},
+ "matrix_id": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/sha256"}]},
+ "allocation_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/allocationId"}},
+ "required_allocations": {"const": 3},
+ "requested_cases": {"type": "integer","minimum": 0},
+ "terminal_cases": {"type": "integer","minimum": 0},
+ "policy": {"const": "collectivex-decision-grade-v1"}
+ }
+ },
+ "coverage": {"type": "array","items": {"$ref": "#/$defs/coverage"}},
+ "attempts": {"type": "array","items": {"$ref": "#/$defs/attempt"}},
+ "series": {"type": "array","items": {"$ref": "#/$defs/series"}},
+ "cohorts": {"type": "array","items": {"$ref": "#/$defs/cohort"}},
+ "rankings": {"type": "array","items": {"$ref": "#/$defs/ranking"}},
+ "recommendations": {"type": "array","items": {"$ref": "#/$defs/recommendation"}},
+ "sensitivities": {"type": "array","items": {"$ref": "#/$defs/sensitivity"}}
+ },
+ "$defs": {
+ "sha256": {"type": "string","pattern": "^[0-9a-f]{64}$"},
+ "caseId": {"type": "string","pattern": "^cxcase-v1-[0-9a-f]{64}$"},
+ "workloadId": {"type": "string","pattern": "^cxwork-v1-[0-9a-f]{64}$"},
+ "seriesId": {"type": "string","pattern": "^cxseries-v1-[0-9a-f]{64}$"},
+ "pointId": {"type": "string","pattern": "^cxpoint-v1-[0-9a-f]{64}$"},
+ "evidenceId": {"type": "string","pattern": "^cxevidence-v1-[0-9a-f]{64}$"},
+ "allocationId": {"type": "string","pattern": "^cxallocation-v1-[0-9a-f]{64}$"},
+ "attemptId": {"type": "string","pattern": "^cxattempt-v1-[0-9a-f]{64}$"},
+ "safeId": {"type": "string","pattern": "^[a-z0-9][a-z0-9_.-]*$","maxLength": 128},
+ "publicationTier": {"enum": ["official","comparable-experimental"]},
+ "label": {"type": "string","minLength": 1,"maxLength": 160},
+ "nullableLabel": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/label"}]},
+ "reason": {"oneOf": [{"type": "null"},{"type": "string","pattern": "^[a-z0-9][a-z0-9.-]*$","maxLength": 96}]},
+ "outcome": {"enum": ["success","unsupported","failed","invalid","diagnostic"]},
+ "coverageTopology": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "ep_size",
+ "nodes",
+ "gpus_per_node",
+ "scale_up_domain",
+ "scope",
+ "scale_up_transport",
+ "scale_out_transport",
+ "transport",
+ "topology_class"
+ ],
+ "properties": {
+ "ep_size": {"type": "integer","minimum": 1},
+ "nodes": {"type": "integer","minimum": 1},
+ "gpus_per_node": {"type": "integer","minimum": 1},
+ "scale_up_domain": {"type": "integer","minimum": 1},
+ "scope": {"enum": ["scale-up","scale-out"]},
+ "scale_up_transport": {"$ref": "#/$defs/safeId"},
+ "scale_out_transport": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/safeId"}]},
+ "transport": {"$ref": "#/$defs/safeId"},
+ "topology_class": {"$ref": "#/$defs/safeId"}
+ }
+ },
+ "coverage": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "case_id",
+ "label",
+ "required",
+ "sku",
+ "backend",
+ "mode",
+ "phase",
+ "topology",
+ "disposition",
+ "selected_attempt_id",
+ "outcome",
+ "failure_mode",
+ "reason",
+ "attempt_ids"
+ ],
+ "properties": {
+ "case_id": {"$ref": "#/$defs/caseId"},
+ "label": {"$ref": "#/$defs/label"},
+ "required": {"type": "boolean"},
+ "sku": {"$ref": "#/$defs/safeId"},
+ "backend": {"$ref": "#/$defs/safeId"},
+ "mode": {"enum": ["normal","low-latency"]},
+ "phase": {"enum": ["decode","prefill"]},
+ "topology": {"$ref": "#/$defs/coverageTopology"},
+ "disposition": {"enum": ["runnable","unsupported"]},
+ "selected_attempt_id": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/attemptId"}]},
+ "outcome": {"$ref": "#/$defs/outcome"},
+ "failure_mode": {"$ref": "#/$defs/reason"},
+ "reason": {"$ref": "#/$defs/reason"},
+ "attempt_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/attemptId"}}
+ }
+ },
+ "attempt": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "attempt_id",
+ "evidence",
+ "case_id",
+ "allocation_id",
+ "run_id",
+ "run_attempt",
+ "attempt_index",
+ "selected",
+ "outcome",
+ "failure_mode",
+ "reason",
+ "series_id",
+ "completed_at"
+ ],
+ "properties": {
+ "attempt_id": {"$ref": "#/$defs/attemptId"},
+ "evidence": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["evidence_id","point_id"],
+ "properties": {"evidence_id": {"$ref": "#/$defs/evidenceId"},"point_id": {"$ref": "#/$defs/pointId"}}
+ }
+ },
+ "case_id": {"$ref": "#/$defs/caseId"},
+ "allocation_id": {"$ref": "#/$defs/allocationId"},
+ "run_id": {"type": "string","pattern": "^[1-9][0-9]*$"},
+ "run_attempt": {"type": "integer","minimum": 1},
+ "attempt_index": {"type": "integer","minimum": 1},
+ "selected": {"type": "boolean"},
+ "outcome": {"$ref": "#/$defs/outcome"},
+ "failure_mode": {"$ref": "#/$defs/reason"},
+ "reason": {"$ref": "#/$defs/reason"},
+ "series_id": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/seriesId"}]},
+ "completed_at": {"oneOf": [{"type": "null"},{"type": "string","format": "date-time"}]}
+ }
+ },
+ "eligibility": {
+ "type": "object",
+ "additionalProperties": false,
+ "allOf": [{
+ "if": {"properties": {"decision_grade": {"const": true}},"required": ["decision_grade"]},
+ "then": {"properties": {"reasons": {"maxItems": 0}}},
+ "else": {"properties": {"reasons": {"minItems": 1}}}
+ }],
+ "required": [
+ "decision_grade",
+ "allocation_ids",
+ "complete",
+ "correct",
+ "measured_roundtrip_p99",
+ "stable_p50",
+ "stable_p99",
+ "stable_ordering",
+ "p50_max_min_ratio",
+ "p99_max_min_ratio",
+ "reasons"
+ ],
+ "properties": {
+ "decision_grade": {"type": "boolean"},
+ "allocation_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/allocationId"}},
+ "complete": {"type": "boolean"},
+ "correct": {"type": "boolean"},
+ "measured_roundtrip_p99": {"type": "boolean"},
+ "stable_p50": {"type": "boolean"},
+ "stable_p99": {"type": "boolean"},
+ "stable_ordering": {"type": "boolean"},
+ "p50_max_min_ratio": {"oneOf": [{"type": "null"},{"type": "number","minimum": 1}]},
+ "p99_max_min_ratio": {"oneOf": [{"type": "null"},{"type": "number","minimum": 1}]},
+ "reasons": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {"type": "string","pattern": "^[a-z0-9][a-z0-9.-]*$","maxLength": 96}
+ }
+ }
+ },
+ "percentiles": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["p50","p90","p95","p99"],
+ "properties": {
+ "p50": {"type": "number","exclusiveMinimum": 0},
+ "p90": {"type": "number","exclusiveMinimum": 0},
+ "p95": {"type": "number","exclusiveMinimum": 0},
+ "p99": {"type": "number","exclusiveMinimum": 0}
+ }
+ },
+ "component": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["origin","latency_us","logical_bytes","logical_payload_rate_gbps_at_latency_percentile","sample_count"],
+ "properties": {
+ "origin": {"enum": ["measured","derived"]},
+ "latency_us": {"$ref": "#/$defs/percentiles"},
+ "logical_bytes": {"oneOf": [{"type": "null"},{"type": "integer","minimum": 1}]},
+ "logical_payload_rate_gbps_at_latency_percentile": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/percentiles"}]},
+ "sample_count": {"oneOf": [{"type": "null"},{"type": "integer","minimum": 1}]}
+ }
+ },
+ "nullableComponent": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/component"}]},
+ "point": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "point_id",
+ "tokens_per_rank",
+ "global_tokens",
+ "correct",
+ "routing",
+ "components",
+ "roundtrip_token_rate_at_latency_percentile",
+ "evidence_ids"
+ ],
+ "properties": {
+ "point_id": {"$ref": "#/$defs/pointId"},
+ "tokens_per_rank": {"type": "integer","minimum": 1},
+ "global_tokens": {"type": "integer","minimum": 1},
+ "correct": {"type": "boolean"},
+ "routing": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "fanout_mean",
+ "recv_tokens_max",
+ "expert_load_cv",
+ "payload_rank_cv",
+ "hotspot_ratio",
+ "empty_expert_count",
+ "empty_rank_count",
+ "routed_copies"
+ ],
+ "properties": {
+ "fanout_mean": {"type": "number","minimum": 0},
+ "recv_tokens_max": {"type": "integer","minimum": 0},
+ "expert_load_cv": {"type": "number","minimum": 0},
+ "payload_rank_cv": {"type": "number","minimum": 0},
+ "hotspot_ratio": {"type": "number","minimum": 0},
+ "empty_expert_count": {"type": "integer","minimum": 0},
+ "empty_rank_count": {"type": "integer","minimum": 0},
+ "routed_copies": {"type": "integer","minimum": 1}
+ }
+ },
+ "components": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["dispatch","combine","roundtrip","isolated_sum"],
+ "properties": {
+ "dispatch": {"$ref": "#/$defs/nullableComponent"},
+ "combine": {"$ref": "#/$defs/nullableComponent"},
+ "roundtrip": {"$ref": "#/$defs/nullableComponent"},
+ "isolated_sum": {"$ref": "#/$defs/nullableComponent"}
+ }
+ },
+ "roundtrip_token_rate_at_latency_percentile": {"$ref": "#/$defs/percentiles"},
+ "evidence_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/evidenceId"}}
+ }
+ },
+ "series": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "series_id",
+ "label",
+ "status",
+ "case_ids",
+ "allocation_ids",
+ "model",
+ "suite",
+ "mode",
+ "phase",
+ "publication_tier",
+ "backend",
+ "build",
+ "system",
+ "workload",
+ "eplb",
+ "resource",
+ "measurement",
+ "points",
+ "eligibility"
+ ],
+ "properties": {
+ "series_id": {"$ref": "#/$defs/seriesId"},
+ "label": {"$ref": "#/$defs/label"},
+ "status": {"enum": ["decision-grade","diagnostic"]},
+ "case_ids": {"type": "array","minItems": 1,"uniqueItems": true,"items": {"$ref": "#/$defs/caseId"}},
+ "allocation_ids": {"type": "array","minItems": 1,"uniqueItems": true,"items": {"$ref": "#/$defs/allocationId"}},
+ "model": {"$ref": "#/$defs/safeId"},
+ "suite": {"$ref": "#/$defs/safeId"},
+ "mode": {"enum": ["normal","low-latency"]},
+ "phase": {"enum": ["decode","prefill"]},
+ "publication_tier": {"$ref": "#/$defs/publicationTier"},
+ "backend": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["id","label","role","generation","version"],
+ "properties": {
+ "id": {"$ref": "#/$defs/safeId"},
+ "label": {"$ref": "#/$defs/label"},
+ "role": {"enum": ["library","reference"]},
+ "generation": {"$ref": "#/$defs/nullableLabel"},
+ "version": {"$ref": "#/$defs/nullableLabel"}
+ }
+ },
+ "build": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["implementation_contract_sha256","public_config_sha256","routing_control_sha256","runtime_fingerprint_sha256","image_digest","source_sha","squash_sha256"],
+ "properties": {
+ "implementation_contract_sha256": {"$ref": "#/$defs/sha256"},
+ "public_config_sha256": {"$ref": "#/$defs/sha256"},
+ "routing_control_sha256": {"$ref": "#/$defs/sha256"},
+ "runtime_fingerprint_sha256": {"$ref": "#/$defs/sha256"},
+ "image_digest": {"type": "string","pattern": "^sha256:[0-9a-f]{64}$"},
+ "source_sha": {"type": "string","pattern": "^[0-9a-f]{40,64}$"},
+ "squash_sha256": {"$ref": "#/$defs/sha256"}
+ }
+ },
+ "system": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["sku","label","vendor","topology_class","transport","scale_up_transport","scale_out_transport","scope","nodes","gpus_per_node","scale_up_domain","world_size","ep_size","placement"],
+ "properties": {
+ "sku": {"$ref": "#/$defs/safeId"},
+ "label": {"$ref": "#/$defs/label"},
+ "vendor": {"enum": ["nvidia","amd"]},
+ "topology_class": {"$ref": "#/$defs/safeId"},
+ "transport": {"$ref": "#/$defs/safeId"},
+ "scale_up_transport": {"$ref": "#/$defs/safeId"},
+ "scale_out_transport": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/safeId"}]},
+ "scope": {"enum": ["scale-up","scale-out"]},
+ "nodes": {"type": "integer","minimum": 1},
+ "gpus_per_node": {"type": "integer","minimum": 1},
+ "scale_up_domain": {"type": "integer","minimum": 1},
+ "world_size": {"type": "integer","minimum": 1},
+ "ep_size": {"type": "integer","minimum": 1},
+ "placement": {"enum": ["packed"]}
+ }
+ },
+ "workload": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "workload_id",
+ "hidden",
+ "top_k",
+ "experts",
+ "routing",
+ "eplb",
+ "dispatch_dtype",
+ "combine_dtype",
+ "activation_profile"
+ ],
+ "properties": {
+ "workload_id": {"$ref": "#/$defs/workloadId"},
+ "hidden": {"type": "integer","minimum": 1},
+ "top_k": {"type": "integer","minimum": 1},
+ "experts": {"type": "integer","minimum": 1},
+ "routing": {"enum": ["uniform","zipf"]},
+ "eplb": {"type": "boolean"},
+ "dispatch_dtype": {"const": "bf16"},
+ "combine_dtype": {"const": "bf16"},
+ "activation_profile": {"const": "canonical-counter-source-v3"}
+ }
+ },
+ "eplb": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "enabled",
+ "planner",
+ "mapping_sha256",
+ "logical_experts",
+ "physical_experts",
+ "redundant_experts",
+ "reference_tokens_per_rank",
+ "replicated_experts",
+ "max_replicas",
+ "imbalance_before",
+ "imbalance_after"
+ ],
+ "properties": {
+ "enabled": {"type": "boolean"},
+ "planner": {"$ref": "#/$defs/nullableLabel"},
+ "mapping_sha256": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/sha256"}]},
+ "logical_experts": {"type": "integer","minimum": 1},
+ "physical_experts": {"type": "integer","minimum": 1},
+ "redundant_experts": {"type": "integer","minimum": 0},
+ "reference_tokens_per_rank": {"oneOf": [{"type": "null"},{"type": "integer","minimum": 1}]},
+ "replicated_experts": {"type": "integer","minimum": 0},
+ "max_replicas": {"oneOf": [{"type": "null"},{"type": "integer","minimum": 0}]},
+ "imbalance_before": {"oneOf": [{"type": "null"},{"type": "number","minimum": 0}]},
+ "imbalance_after": {"oneOf": [{"type": "null"},{"type": "number","minimum": 0}]}
+ }
+ },
+ "resource": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["mode","profile","comm_units_kind","configured_units"],
+ "properties": {
+ "mode": {"const": "tuned"},
+ "profile": {"$ref": "#/$defs/safeId"},
+ "comm_units_kind": {"$ref": "#/$defs/nullableLabel"},
+ "configured_units": {"oneOf": [{"type": "null"},{"type": "integer","minimum": 1}]}
+ }
+ },
+ "measurement": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "contract",
+ "component_order_contract",
+ "combine_semantics",
+ "payload_unit",
+ "sampling_contract",
+ "iters",
+ "trials",
+ "warmups",
+ "samples_per_component",
+ "headline_component",
+ "headline_percentile"
+ ],
+ "properties": {
+ "contract": {"enum": ["layout-and-dispatch-v1","expert-packed-weighted-combine-v1"]},
+ "component_order_contract": {"enum": ["roundtrip-dispatch-activation-only-combine-v2","roundtrip-dispatch-gate-weighted-combine-v1"]},
+ "combine_semantics": {"enum": ["activation-only","gate-weighted"]},
+ "payload_unit": {"enum": ["token-rank","token-expert"]},
+ "sampling_contract": {"const": "fixed-512-v1"},
+ "iters": {"const": 8},
+ "trials": {"const": 64},
+ "warmups": {"const": 32},
+ "samples_per_component": {"const": 512},
+ "headline_component": {"const": "roundtrip"},
+ "headline_percentile": {"const": "p99"}
+ }
+ },
+ "points": {"type": "array","minItems": 1,"items": {"$ref": "#/$defs/point"}},
+ "eligibility": {"$ref": "#/$defs/eligibility"}
+ }
+ },
+ "cohort": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "cohort_id",
+ "kind",
+ "label",
+ "description",
+ "series_ids",
+ "controlled_factors",
+ "varying_factors",
+ "publication_tier",
+ "eligibility"
+ ],
+ "properties": {
+ "cohort_id": {"type": "string","pattern": "^cxcohort-v1-[0-9a-f]{64}$"},
+ "kind": {"enum": ["library","chip","system","routing"]},
+ "label": {"$ref": "#/$defs/label"},
+ "description": {"$ref": "#/$defs/label"},
+ "series_ids": {"type": "array","minItems": 2,"uniqueItems": true,"items": {"$ref": "#/$defs/seriesId"}},
+ "controlled_factors": {"type": "array","minItems": 1,"uniqueItems": true,"items": {"$ref": "#/$defs/safeId"}},
+ "varying_factors": {"type": "array","minItems": 1,"uniqueItems": true,"items": {"$ref": "#/$defs/safeId"}},
+ "publication_tier": {"$ref": "#/$defs/publicationTier"},
+ "eligibility": {"$ref": "#/$defs/eligibility"}
+ }
+ },
+ "metric": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["operation","statistic","measure","objective","tokens_per_rank","phase"],
+ "properties": {
+ "operation": {"const": "roundtrip"},
+ "statistic": {"enum": ["p50","p99"]},
+ "measure": {"enum": ["latency_us","logical_payload_rate_gbps_at_latency_percentile"]},
+ "objective": {"enum": ["min","max"]},
+ "tokens_per_rank": {"type": "integer","minimum": 1},
+ "phase": {"enum": ["decode","prefill"]}
+ }
+ },
+ "ranking": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["ranking_id","cohort_id","label","metric","entries","publication_tier","eligibility"],
+ "properties": {
+ "ranking_id": {"type": "string","pattern": "^cxranking-v1-[0-9a-f]{64}$"},
+ "cohort_id": {"type": "string","pattern": "^cxcohort-v1-[0-9a-f]{64}$"},
+ "label": {"$ref": "#/$defs/label"},
+ "metric": {"$ref": "#/$defs/metric"},
+ "entries": {
+ "type": "array",
+ "minItems": 2,
+ "items": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["rank","series_id","point_id","value","unit"],
+ "properties": {
+ "rank": {"type": "integer","minimum": 1},
+ "series_id": {"$ref": "#/$defs/seriesId"},
+ "point_id": {"$ref": "#/$defs/pointId"},
+ "value": {"type": "number","exclusiveMinimum": 0},
+ "unit": {"enum": ["us","GB/s"]}
+ }
+ }
+ },
+ "publication_tier": {"$ref": "#/$defs/publicationTier"},
+ "eligibility": {"$ref": "#/$defs/eligibility"}
+ }
+ },
+ "recommendation": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "recommendation_id",
+ "cohort_id",
+ "label",
+ "objective",
+ "series_id",
+ "point_id",
+ "value",
+ "unit",
+ "rationale",
+ "publication_tier",
+ "eligibility"
+ ],
+ "properties": {
+ "recommendation_id": {"type": "string","pattern": "^cxrecommendation-v1-[0-9a-f]{64}$"},
+ "cohort_id": {"type": "string","pattern": "^cxcohort-v1-[0-9a-f]{64}$"},
+ "label": {"$ref": "#/$defs/label"},
+ "objective": {"enum": ["min-p50-latency","min-p99-latency","max-payload-rate-at-p50-latency","max-payload-rate-at-p99-latency"]},
+ "series_id": {"$ref": "#/$defs/seriesId"},
+ "point_id": {"$ref": "#/$defs/pointId"},
+ "value": {"type": "number","exclusiveMinimum": 0},
+ "unit": {"enum": ["us","GB/s"]},
+ "rationale": {"$ref": "#/$defs/label"},
+ "publication_tier": {"const": "official"},
+ "eligibility": {"$ref": "#/$defs/eligibility"}
+ }
+ },
+ "sensitivity": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "sensitivity_id",
+ "cohort_id",
+ "label",
+ "baseline_series_id",
+ "candidate_series_id",
+ "metric",
+ "signed_change_ratio",
+ "publication_tier",
+ "eligibility"
+ ],
+ "properties": {
+ "sensitivity_id": {"type": "string","pattern": "^cxsensitivity-v1-[0-9a-f]{64}$"},
+ "cohort_id": {"type": "string","pattern": "^cxcohort-v1-[0-9a-f]{64}$"},
+ "label": {"$ref": "#/$defs/label"},
+ "baseline_series_id": {"$ref": "#/$defs/seriesId"},
+ "candidate_series_id": {"$ref": "#/$defs/seriesId"},
+ "metric": {"$ref": "#/$defs/metric"},
+ "signed_change_ratio": {"type": "number"},
+ "publication_tier": {"$ref": "#/$defs/publicationTier"},
+ "eligibility": {"$ref": "#/$defs/eligibility"}
+ }
+ }
+ }
+}
diff --git a/experimental/CollectiveX/schemas/raw-case-v1.schema.json b/experimental/CollectiveX/schemas/raw-case-v1.schema.json
new file mode 100644
index 0000000000..be0d98d76c
--- /dev/null
+++ b/experimental/CollectiveX/schemas/raw-case-v1.schema.json
@@ -0,0 +1,1199 @@
+{
+ "$id": "https://inferencex.com/schemas/collectivex/raw-case-v1.schema.json",
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "$defs": {
+ "deepep_v2_jit_cubin": {
+ "additionalProperties": false,
+ "properties": {
+ "cache_key": {
+ "pattern":"^kernel\\.[A-Za-z0-9_+-]+\\.[0-9a-f]{32}$",
+ "type":"string"
+ },
+ "cubin_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "sass_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "source_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"}
+ },
+ "required": ["cache_key","cubin_sha256","sass_sha256","source_sha256"],
+ "type": "object"
+ },
+ "hybrid_jit_rank_artifact": {
+ "additionalProperties": false,
+ "properties": {
+ "bytes": {"minimum":1,"type":"integer"},
+ "rank": {"minimum":0,"type":"integer"},
+ "sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"}
+ },
+ "required": ["bytes","rank","sha256"],
+ "type": "object"
+ },
+ "hybrid_realized_config": {
+ "additionalProperties": false,
+ "properties": {
+ "backward_combine_api": {"type":"boolean"},
+ "device_side_sync_combine_api": {"type":"boolean"},
+ "device_side_sync_dispatch_api": {"type":"boolean"},
+ "forward_dispatch_api": {"type":"boolean"},
+ "hidden_dim": {"minimum":1,"type":"integer"},
+ "max_num_of_tokens_per_rank": {"minimum":1,"type":"integer"},
+ "num_of_additional_in_flight_s2g_combine_api": {"minimum":0,"type":"integer"},
+ "num_of_additional_in_flight_s2g_dispatch_api": {"minimum":0,"type":"integer"},
+ "num_of_blocks_combine_api": {"minimum":0,"type":"integer"},
+ "num_of_blocks_dispatch_api": {"minimum":0,"type":"integer"},
+ "num_of_blocks_permute": {"minimum":0,"type":"integer"},
+ "num_of_blocks_preprocessing_api": {"minimum":0,"type":"integer"},
+ "num_of_blocks_unpermute": {"minimum":0,"type":"integer"},
+ "num_of_experts_per_rank": {"minimum":1,"type":"integer"},
+ "num_of_in_flight_s2g_dispatch_api": {"minimum":0,"type":"integer"},
+ "num_of_in_flight_s2g_permute_block_dispatch_api": {"minimum":0,"type":"integer"},
+ "num_of_nodes": {"minimum":1,"type":"integer"},
+ "num_of_ranks_per_node": {"minimum":1,"type":"integer"},
+ "num_of_stages_dispatch_api": {"minimum":0,"type":"integer"},
+ "num_of_stages_g2s_combine_api": {"minimum":0,"type":"integer"},
+ "num_of_stages_permute_block_dispatch_api": {"minimum":0,"type":"integer"},
+ "num_of_stages_s2g_combine_api": {"minimum":0,"type":"integer"},
+ "num_of_threads_per_block_preprocessing_api": {"minimum":0,"type":"integer"},
+ "num_of_tokens_per_chunk_combine_api": {"minimum":0,"type":"integer"},
+ "num_of_tokens_per_chunk_dispatch_api": {"minimum":0,"type":"integer"},
+ "num_of_tokens_per_chunk_preprocessing_api": {"minimum":0,"type":"integer"},
+ "num_of_tokens_per_group_combine_api": {"minimum":0,"type":"integer"},
+ "pad_multiple": {"minimum":0,"type":"integer"},
+ "token_data_type": {"enum":["UINT8","UINT16"]}
+ },
+ "required": [
+ "backward_combine_api","device_side_sync_combine_api","device_side_sync_dispatch_api",
+ "forward_dispatch_api","hidden_dim","max_num_of_tokens_per_rank",
+ "num_of_additional_in_flight_s2g_combine_api",
+ "num_of_additional_in_flight_s2g_dispatch_api","num_of_blocks_combine_api",
+ "num_of_blocks_dispatch_api","num_of_blocks_permute","num_of_blocks_preprocessing_api",
+ "num_of_blocks_unpermute","num_of_experts_per_rank",
+ "num_of_in_flight_s2g_dispatch_api","num_of_in_flight_s2g_permute_block_dispatch_api",
+ "num_of_nodes","num_of_ranks_per_node","num_of_stages_dispatch_api",
+ "num_of_stages_g2s_combine_api","num_of_stages_permute_block_dispatch_api",
+ "num_of_stages_s2g_combine_api","num_of_threads_per_block_preprocessing_api",
+ "num_of_tokens_per_chunk_combine_api","num_of_tokens_per_chunk_dispatch_api",
+ "num_of_tokens_per_chunk_preprocessing_api","num_of_tokens_per_group_combine_api",
+ "pad_multiple","token_data_type"
+ ],
+ "type": "object"
+ },
+ "nullable_sha256": {"oneOf":[{"type":"null"},{"pattern":"^[0-9a-f]{64}$","type":"string"}]},
+ "oracle": {
+ "additionalProperties": false,
+ "properties": {
+ "checks": {
+ "additionalProperties": false,
+ "properties": {
+ "combine_values": {"type":"boolean"},
+ "counts": {"type":"boolean"},
+ "metadata": {"type":"boolean"},
+ "multiplicity": {"type":"boolean"},
+ "payload": {"type":"boolean"},
+ "source_set": {"type":"boolean"},
+ "weights": {"type":"boolean"}
+ },
+ "required": ["combine_values","counts","metadata","multiplicity","payload","source_set","weights"],
+ "type": "object"
+ },
+ "atol": {"const":0.02},
+ "combine_weight_semantics": {"enum":["unweighted-rank-sum","gate-weighted-sum"]},
+ "contract": {"enum":["expert-specific-transform-v1","expert-assignment-transform-v1"]},
+ "dispatch_sha256": {"$ref":"#/$defs/nullable_sha256"},
+ "max_absolute_error": {"oneOf":[{"type":"null"},{"minimum":0,"type":"number"}]},
+ "max_elementwise_relative_error": {"oneOf":[{"type":"null"},{"minimum":0,"type":"number"}]},
+ "max_relative_error": {"oneOf":[{"type":"null"},{"minimum":0,"type":"number"}]},
+ "max_weight_error": {"oneOf":[{"type":"null"},{"minimum":0,"type":"number"}]},
+ "order_sha256": {"$ref":"#/$defs/nullable_sha256"},
+ "ordering_contract": {"minLength":1,"type":"string"},
+ "passed": {"type":"boolean"},
+ "receive_count": {"minimum":0,"type":"integer"},
+ "rtol": {"const":0.05}
+ },
+ "required": [
+ "atol",
+ "checks",
+ "combine_weight_semantics",
+ "contract",
+ "dispatch_sha256",
+ "max_absolute_error",
+ "max_elementwise_relative_error",
+ "max_relative_error",
+ "max_weight_error",
+ "order_sha256",
+ "ordering_contract",
+ "passed",
+ "receive_count",
+ "rtol"
+ ],
+ "type": "object"
+ },
+ "percentiles": {
+ "additionalProperties": false,
+ "properties": {
+ "p50": {"minimum":0,"type":"number"},
+ "p90": {"minimum":0,"type":"number"},
+ "p95": {"minimum":0,"type":"number"},
+ "p99": {"minimum":0,"type":"number"}
+ },
+ "required": ["p50","p90","p95","p99"],
+ "type": "object"
+ },
+ "component": {
+ "additionalProperties": false,
+ "properties": {
+ "availability": {"enum":["measured","derived","unavailable"]},
+ "origin": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "percentiles_us": {"oneOf":[{"type":"null"},{"$ref":"#/$defs/percentiles"}]},
+ "sample_count": {"minimum":0,"type":"integer"}
+ },
+ "required": ["availability","origin","percentiles_us","sample_count"],
+ "type": "object"
+ },
+ "histogram": {
+ "additionalProperties": false,
+ "properties": {
+ "bins": {"minimum":1,"type":"integer"},
+ "counts": {"items":{"minimum":0,"type":"integer"},"minItems":1,"type":"array"},
+ "max": {"minimum":0,"type":"number"},
+ "min": {"minimum":0,"type":"number"},
+ "n": {"minimum":1,"type":"integer"}
+ },
+ "required": ["n","min","max","bins","counts"],
+ "type": "object"
+ },
+ "scheduled_case": {
+ "additionalProperties": false,
+ "properties": {
+ "backend": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "canonical": {"const":true},
+ "ep": {"minimum":1,"type":"integer"},
+ "eplb": {"type":"boolean"},
+ "experts": {"minimum":1,"type":"integer"},
+ "gpus_per_node": {"minimum":1,"type":"integer"},
+ "hidden": {"minimum":1,"type":"integer"},
+ "ladder": {"pattern":"^[1-9][0-9]*( [1-9][0-9]*)*$","type":"string"},
+ "mode": {"enum":["normal","low-latency"]},
+ "nodes": {"minimum":1,"type":"integer"},
+ "phase": {"enum":["decode","prefill"]},
+ "required_publication": {"enum":["official","comparable-experimental"]},
+ "routing": {"enum":["uniform","zipf"]},
+ "samples_per_point": {"const":512},
+ "scale_out_transport": {"oneOf":[{"type":"null"},{"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}]},
+ "scale_up_domain": {"minimum":1,"type":"integer"},
+ "scale_up_transport": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "scope": {"enum":["scale-up","scale-out"]},
+ "suite": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "timing": {"const":"8:64:32"},
+ "topk": {"minimum":1,"type":"integer"},
+ "topology_class": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "transport": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "warmup_semantics": {"const":"full-roundtrip-before-each-component-trial-point-v1"},
+ "workload": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}
+ },
+ "required": [
+ "backend",
+ "canonical",
+ "eplb",
+ "ep",
+ "experts",
+ "gpus_per_node",
+ "hidden",
+ "ladder",
+ "mode",
+ "nodes",
+ "phase",
+ "required_publication",
+ "routing",
+ "samples_per_point",
+ "scale_out_transport",
+ "scale_up_domain",
+ "scale_up_transport",
+ "scope",
+ "suite",
+ "timing",
+ "topk",
+ "topology_class",
+ "transport",
+ "warmup_semantics",
+ "workload"
+ ],
+ "type": "object"
+ },
+ "git_run": {
+ "additionalProperties": false,
+ "properties": {
+ "artifact": {"minLength":1,"type":"string"},
+ "job": {"minLength":1,"type":"string"},
+ "ref": {"minLength":1,"type":"string"},
+ "repo": {"pattern":"^[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+$","type":"string"},
+ "run_attempt": {"pattern":"^[1-9][0-9]*$","type":"string"},
+ "run_id": {"pattern":"^[1-9][0-9]*$","type":"string"},
+ "source_sha": {"pattern":"^[0-9a-f]{40}$","type":"string"}
+ },
+ "required": ["artifact","job","ref","repo","run_attempt","run_id","source_sha"],
+ "type": "object"
+ }
+ },
+ "additionalProperties": false,
+ "properties": {
+ "case": {
+ "additionalProperties": false,
+ "properties": {
+ "attempt_ordinal": {"minimum":1,"type":"integer"},
+ "backend": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "ep_size": {"minimum":1,"type":"integer"},
+ "eplb": {
+ "additionalProperties": false,
+ "properties": {
+ "enabled": {"type":"boolean"},
+ "imbalance_after": {"oneOf":[{"type":"null"},{"minimum":0,"type":"number"}]},
+ "imbalance_before": {"oneOf":[{"type":"null"},{"minimum":0,"type":"number"}]},
+ "mapping_hash": {"oneOf":[{"type":"null"},{"pattern":"^[0-9a-f]{64}$","type":"string"}]},
+ "max_replicas": {"oneOf":[{"type":"null"},{"minimum":0,"type":"integer"}]},
+ "num_logical_experts": {"minimum":1,"type":"integer"},
+ "num_physical_experts": {"minimum":1,"type":"integer"},
+ "num_redundant": {"minimum":0,"type":"integer"},
+ "planner": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "reference_tokens_per_rank": {"oneOf":[{"type":"null"},{"minimum":1,"type":"integer"}]},
+ "replicated_experts": {"minimum":0,"type":"integer"}
+ },
+ "required": [
+ "enabled",
+ "imbalance_after",
+ "imbalance_before",
+ "mapping_hash",
+ "max_replicas",
+ "num_logical_experts",
+ "num_physical_experts",
+ "num_redundant",
+ "planner",
+ "reference_tokens_per_rank",
+ "replicated_experts"
+ ],
+ "type": "object"
+ },
+ "mode": {"enum":["normal","low-latency"]},
+ "phase": {"enum":["decode","prefill"]},
+ "required_publication": {"enum":["official","comparable-experimental"]},
+ "resource_mode": {"const":"tuned"},
+ "runner": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "shape": {
+ "additionalProperties": false,
+ "properties": {
+ "activation_profile": {"const":"canonical-counter-source-v3"},
+ "dispatch_dtype": {"const":"bf16"},
+ "eplb": {"type":"boolean"},
+ "experts": {"minimum":1,"type":"integer"},
+ "experts_per_rank": {"minimum":1,"type":"integer"},
+ "hidden": {"minimum":1,"type":"integer"},
+ "kernel_gen": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "num_logical_experts": {"minimum":1,"type":"integer"},
+ "quant": {
+ "additionalProperties": false,
+ "properties": {
+ "combine_accum_dtype": {"minLength":1,"type":"string"},
+ "combine_input_dtype": {"const":"bf16"},
+ "combine_output_dtype": {"const":"bf16"},
+ "combine_quant_mode": {"const":"none"},
+ "scale_layout": {"type":"null"}
+ },
+ "required": [
+ "combine_accum_dtype",
+ "combine_input_dtype",
+ "combine_output_dtype",
+ "combine_quant_mode",
+ "scale_layout"
+ ],
+ "type": "object"
+ },
+ "routing": {"enum":["uniform","zipf"]},
+ "topk": {"minimum":1,"type":"integer"}
+ },
+ "required": [
+ "activation_profile",
+ "dispatch_dtype",
+ "eplb",
+ "experts",
+ "experts_per_rank",
+ "hidden",
+ "kernel_gen",
+ "num_logical_experts",
+ "quant",
+ "routing",
+ "topk"
+ ],
+ "type": "object"
+ },
+ "suite": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "workload_name": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}
+ },
+ "required": [
+ "attempt_ordinal",
+ "backend",
+ "eplb",
+ "ep_size",
+ "mode",
+ "phase",
+ "required_publication",
+ "resource_mode",
+ "runner",
+ "shape",
+ "suite",
+ "workload_name"
+ ],
+ "type": "object"
+ },
+ "format": {"const":"collectivex.ep.v1"},
+ "generated_at": {"format":"date-time","type":"string"},
+ "identity": {
+ "additionalProperties": false,
+ "properties": {
+ "allocation_factors": {
+ "additionalProperties": false,
+ "properties": {
+ "artifact": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "execution_id": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "job": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "repo": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "run_attempt": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "run_id": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "runner": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "source_sha": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}
+ },
+ "required": ["artifact","execution_id","job","repo","run_attempt","run_id","runner","source_sha"],
+ "type": "object"
+ },
+ "allocation_id": {"pattern":"^cxallocation-v1-[0-9a-f]{64}$","type":"string"},
+ "attempt_id": {"pattern":"^cxattempt-v1-[0-9a-f]{64}$","type":"string"},
+ "attempt_ordinal": {"minimum":1,"type":"integer"},
+ "case_factors": {
+ "additionalProperties": false,
+ "properties": {
+ "case": {"$ref":"#/$defs/scheduled_case"},
+ "profile": {
+ "oneOf": [
+ {"const": {
+ "activation_generator": "collectivex-activation-counter-v3",
+ "activation_profile": "canonical-counter-source-v3",
+ "combine_dtype": "bf16",
+ "combine_quant_mode": "none",
+ "combine_semantics": "activation-only",
+ "component_order_contract": "roundtrip-dispatch-activation-only-combine-v2",
+ "conditioning_contract": "fixed-phase-ramp-8-roundtrips-v1",
+ "contract": "layout-and-dispatch-v1",
+ "correctness_scope": "dispatch-metadata-and-transformed-combine",
+ "dtype": "bf16",
+ "eplb_planner": "greedy-rank-major-v1",
+ "eplb_redundant_experts": 32,
+ "eplb_reference_tokens_per_rank": 2048,
+ "mode": "normal",
+ "oracle_contract": "expert-specific-transform-v1",
+ "oracle_tolerances": "rtol=0.05,atol=0.02",
+ "payload_unit": "token-rank",
+ "placement": "packed",
+ "percentile_method": "nearest-rank",
+ "rank_reduction": "cross-rank-max-per-iteration",
+ "resource_mode": "tuned",
+ "routing_generator": "collectivex-routing-counter-v3",
+ "sampling_contract": "fixed-512-v1",
+ "seed": 67
+ }},
+ {"const": {
+ "activation_generator": "collectivex-activation-counter-v3",
+ "activation_profile": "canonical-counter-source-v3",
+ "combine_dtype": "bf16",
+ "combine_quant_mode": "none",
+ "combine_semantics": "gate-weighted",
+ "component_order_contract": "roundtrip-dispatch-gate-weighted-combine-v1",
+ "conditioning_contract": "fixed-phase-ramp-8-roundtrips-v1",
+ "contract": "expert-packed-weighted-combine-v1",
+ "correctness_scope": "expert-assignment-and-weighted-combine",
+ "dtype": "bf16",
+ "eplb_planner": "greedy-rank-major-v1",
+ "eplb_redundant_experts": 32,
+ "eplb_reference_tokens_per_rank": 2048,
+ "mode": "low-latency",
+ "oracle_contract": "expert-assignment-transform-v1",
+ "oracle_tolerances": "rtol=0.05,atol=0.02",
+ "payload_unit": "token-expert",
+ "placement": "packed",
+ "percentile_method": "nearest-rank",
+ "rank_reduction": "cross-rank-max-per-iteration",
+ "resource_mode": "tuned",
+ "routing_generator": "collectivex-routing-counter-v3",
+ "sampling_contract": "fixed-512-v1",
+ "seed": 67
+ }}]
+ },
+ "sku": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}
+ },
+ "required": ["case","profile","sku"],
+ "type": "object"
+ },
+ "case_id": {"pattern":"^cxcase-v1-[0-9a-f]{64}$","type":"string"},
+ "series_factors": {
+ "additionalProperties": false,
+ "properties": {
+ "backend": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "implementation_contract_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "public_config_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "routing_control_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "case_id": {"pattern":"^cxcase-v1-[0-9a-f]{64}$","type":"string"},
+ "image_digest": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "runtime_fingerprint_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "source_sha": {"oneOf":[{"type":"null"},{"pattern":"^[0-9a-f]{40}$","type":"string"}]},
+ "squash_sha256": {"oneOf":[{"type":"null"},{"pattern":"^[0-9a-f]{64}$","type":"string"}]},
+ "workload_id": {"pattern":"^cxwork-v1-[0-9a-f]{64}$","type":"string"}
+ },
+ "required": [
+ "backend",
+ "implementation_contract_sha256",
+ "public_config_sha256",
+ "routing_control_sha256",
+ "case_id",
+ "image_digest",
+ "runtime_fingerprint_sha256",
+ "source_sha",
+ "squash_sha256",
+ "workload_id"
+ ],
+ "type": "object"
+ },
+ "series_id": {"pattern":"^cxseries-v1-[0-9a-f]{64}$","type":"string"}
+ },
+ "required": [
+ "allocation_factors",
+ "allocation_id",
+ "attempt_id",
+ "attempt_ordinal",
+ "case_factors",
+ "case_id",
+ "series_factors",
+ "series_id"
+ ],
+ "type": "object"
+ },
+ "implementation": {
+ "additionalProperties": false,
+ "properties": {
+ "kernel_generation": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "name": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "provenance": {
+ "properties": {
+ "allow_hybrid_mode": {"type":"boolean"},
+ "communication_backend": {"enum":["nccl-device-lsa","nccl-gin"]},
+ "deepep_fix_pr": {"const":630},
+ "deepep_pr": {"const":605},
+ "deterministic": {"type": "boolean"},
+ "gin_enabled": {"type":"boolean"},
+ "jit_cubins": {
+ "items": {"$ref":"#/$defs/deepep_v2_jit_cubin"},
+ "maxItems": 5,
+ "minItems": 5,
+ "type": "array",
+ "uniqueItems": true
+ },
+ "jit_kernel_keys": {
+ "items": {"maxLength":512,"pattern":"^[A-Za-z0-9][A-Za-z0-9_.+-]*$","type":"string"},
+ "maxItems": 3,
+ "minItems": 3,
+ "type": "array",
+ "uniqueItems": true
+ },
+ "jit_random_seed": {"const":"collectivex-deepep-v2-fa8a9b1"},
+ "jit_shared_objects": {
+ "items": {
+ "additionalProperties": false,
+ "properties": {
+ "kernel_key": {"maxLength":512,"pattern":"^[A-Za-z0-9][A-Za-z0-9_.+-]*$","type":"string"},
+ "rank_artifacts": {
+ "items": {"$ref":"#/$defs/hybrid_jit_rank_artifact"},
+ "minItems": 1,
+ "type": "array"
+ }
+ },
+ "required": ["kernel_key","rank_artifacts"],
+ "type": "object"
+ },
+ "maxItems": 3,
+ "minItems": 3,
+ "type": "array"
+ },
+ "num_experts": {"minimum": 1, "type": "integer"},
+ "num_nvl_bytes": {"minimum": 0, "type": "integer"},
+ "num_qps_per_rank": {"minimum": 1, "type": "integer"},
+ "num_rdma_bytes": {"minimum": 0, "type": "integer"},
+ "rdma_block_num": {"minimum": 0, "type": "integer"},
+ "realized_config": {"$ref":"#/$defs/hybrid_realized_config"},
+ "tuning_num_experts": {"minimum": 1, "type": "integer"},
+ "uccl_dependency_versions": {
+ "additionalProperties": false,
+ "properties": {
+ "intervaltree": {"const":"3.1.0"},
+ "nvidia-cuda-runtime-cu12": {"const":"12.9.79"},
+ "sortedcontainers": {"const":"2.4.0"}
+ },
+ "required": ["intervaltree","nvidia-cuda-runtime-cu12","sortedcontainers"],
+ "type": "object"
+ },
+ "use_external_inp_buf": {"type": "boolean"}
+ },
+ "type": "object",
+ "propertyNames": {
+ "enum": [
+ "allocated_qps",
+ "allow_hybrid_mode",
+ "allow_mnnvl",
+ "allow_multiple_reduction",
+ "api",
+ "api_signature_sha256",
+ "backend",
+ "backend_lineage",
+ "block_num",
+ "block_num_floored",
+ "block_num_target",
+ "branch",
+ "collective_library",
+ "combine_dtype",
+ "combine_warps",
+ "communication_backend",
+ "cuda_version",
+ "deepep_commit",
+ "deepep_distribution_version",
+ "deepep_fix_pr",
+ "deepep_pr",
+ "deepep_tree",
+ "deepep_version",
+ "deterministic",
+ "device_cus",
+ "device_sms",
+ "dispatch_dtype",
+ "dispatch_warps",
+ "enable_sdma",
+ "fmt_commit",
+ "gpus_per_node",
+ "gin_enabled",
+ "heap_size",
+ "impl",
+ "jit_cache_key",
+ "jit_cubins",
+ "jit_kernel_keys",
+ "jit_random_seed",
+ "jit_shared_objects",
+ "kernel_type",
+ "loaded_libraries",
+ "local_experts",
+ "logical_scaleout_ranks",
+ "logical_scaleup_ranks",
+ "mapping_variant",
+ "max_num_inp_token_per_rank",
+ "max_num_tokens",
+ "max_total_recv_tokens",
+ "mnnvl_comm",
+ "mode",
+ "mori_commit",
+ "nccl_communicator",
+ "nccl_package_version",
+ "nccl_version",
+ "num_experts",
+ "num_max_tokens_per_rank",
+ "num_nvl_bytes",
+ "num_qps",
+ "num_qps_per_rank",
+ "num_rdma_bytes",
+ "num_sms",
+ "nvshmem_package_version",
+ "path",
+ "physical_nvlink_ranks",
+ "physical_rdma_ranks",
+ "prefer_overlap_with_compute",
+ "rdma_block_num",
+ "reference_semantics",
+ "realized_config",
+ "requested_num_sms",
+ "resource_mode",
+ "routing_factor",
+ "routing_metadata",
+ "sm_fraction",
+ "top_k",
+ "torch_git_version",
+ "torch_version",
+ "transport",
+ "trtllm",
+ "tuned_source",
+ "tuning_num_experts",
+ "uccl_commit",
+ "uccl_dependency_versions",
+ "uccl_version",
+ "uccl_wrapper_commit",
+ "use_external_inp_buf",
+ "workspace"
+ ]
+ }
+ },
+ "resource_profile": {
+ "additionalProperties": false,
+ "properties": {
+ "achieved_fraction": {},
+ "comm_units_kind": {},
+ "configured_units": {},
+ "conformance_class": {},
+ "device_units": {},
+ "fixed_kernel": {},
+ "nonconforming": {},
+ "pareto_eligible": {},
+ "persistent_bytes": {},
+ "qps_per_rank": {},
+ "requested_fraction": {},
+ "tuned_source": {},
+ "target_achieved_within_tol": {},
+ "tolerance": {},
+ "resource_class": {},
+ "warps_combine": {},
+ "warps_dispatch": {}
+ },
+ "required": [
+ "comm_units_kind",
+ "requested_fraction",
+ "configured_units",
+ "device_units",
+ "achieved_fraction",
+ "warps_dispatch",
+ "warps_combine",
+ "qps_per_rank",
+ "persistent_bytes",
+ "tuned_source",
+ "resource_class",
+ "conformance_class",
+ "tolerance",
+ "target_achieved_within_tol",
+ "nonconforming",
+ "fixed_kernel",
+ "pareto_eligible"
+ ],
+ "type": "object"
+ }
+ },
+ "required": ["kernel_generation","name","provenance","resource_profile"],
+ "type": "object"
+ },
+ "measurement": {
+ "additionalProperties": false,
+ "properties": {
+ "component_order_contract": {"enum":["roundtrip-dispatch-activation-only-combine-v2","roundtrip-dispatch-gate-weighted-combine-v1"]},
+ "conditioning": {
+ "additionalProperties": false,
+ "properties": {
+ "contract": {"const":"fixed-phase-ramp-8-roundtrips-v1"},
+ "ladder": {"items":{"minimum":1,"type":"integer"},"minItems":1,"type":"array"},
+ "roundtrips_per_shape": {"const":8}
+ },
+ "required": ["contract","ladder","roundtrips_per_shape"],
+ "type": "object"
+ },
+ "contract": {"enum":["layout-and-dispatch-v1","expert-packed-weighted-combine-v1"]},
+ "rows": {
+ "items": {
+ "additionalProperties": false,
+ "properties": {
+ "anomalies": {
+ "items": {
+ "additionalProperties": false,
+ "properties": {
+ "T": {"minimum":1,"type":"integer"},
+ "component_floor_p50": {"minimum":0,"type":"number"},
+ "isolated_sum_p99": {"minimum":0,"type":"number"},
+ "ratio": {"minimum":0,"type":"number"},
+ "roundtrip_p50": {"minimum":0,"type":"number"},
+ "roundtrip_p99": {"minimum":0,"type":"number"},
+ "threshold": {"minimum":0,"type":"number"},
+ "type": {"enum":["roundtrip_gt_isolated_sum","roundtrip_lt_component_floor"]}
+ },
+ "required": ["type","T"],
+ "type": "object"
+ },
+ "type": "array"
+ },
+ "components": {
+ "additionalProperties": false,
+ "properties": {
+ "combine": {"$ref":"#/$defs/component"},
+ "dispatch": {"$ref":"#/$defs/component"},
+ "isolated_sum": {"$ref":"#/$defs/component"},
+ "roundtrip": {"$ref":"#/$defs/component"}
+ },
+ "required": ["combine","dispatch","isolated_sum","roundtrip"],
+ "type": "object"
+ },
+ "correctness": {
+ "additionalProperties": false,
+ "properties": {
+ "contract": {"enum":["expert-specific-transform-v1","expert-assignment-transform-v1"]},
+ "max_relative_error": {"minimum":0,"type":"number"},
+ "passed": {"type":"boolean"},
+ "rank_evidence": {
+ "items": {
+ "additionalProperties": false,
+ "properties": {
+ "input_unchanged": {"type":"boolean"},
+ "order_stable": {"type":"boolean"},
+ "post_timing": {"$ref":"#/$defs/oracle"},
+ "pre_timing": {"$ref":"#/$defs/oracle"},
+ "rank": {"minimum":0,"type":"integer"}
+ },
+ "required": ["input_unchanged","order_stable","post_timing","pre_timing","rank"],
+ "type": "object"
+ },
+ "minItems": 1,
+ "type": "array"
+ },
+ "scope": {"enum":["dispatch-metadata-and-transformed-combine","expert-assignment-and-weighted-combine"]}
+ },
+ "required": ["contract","max_relative_error","passed","rank_evidence","scope"],
+ "type": "object"
+ },
+ "evidence_id": {"pattern":"^cxevidence-v1-[0-9a-f]{64}$","type":"string"},
+ "global_tokens": {"minimum":1,"type":"integer"},
+ "logical_bytes": {
+ "additionalProperties": false,
+ "properties": {
+ "combine": {"minimum":1,"type":"integer"},
+ "dispatch": {"minimum":1,"type":"integer"},
+ "roundtrip": {"minimum":1,"type":"integer"}
+ },
+ "required": ["combine","dispatch","roundtrip"],
+ "type": "object"
+ },
+ "point_id": {"pattern":"^cxpoint-v1-[0-9a-f]{64}$","type":"string"},
+ "receive": {
+ "additionalProperties": false,
+ "properties": {
+ "max": {"minimum":0,"type":"integer"},
+ "mean": {"minimum":0,"type":"number"},
+ "min": {"minimum":0,"type":"integer"},
+ "total": {"minimum":0,"type":"integer"}
+ },
+ "required": ["max","mean","min","total"],
+ "type": "object"
+ },
+ "routing": {
+ "additionalProperties": false,
+ "properties": {
+ "empty_expert_count": {"minimum":0,"type":"integer"},
+ "empty_rank_count": {"minimum":0,"type":"integer"},
+ "expert_assignment_rank_cv": {"minimum":0,"type":"number"},
+ "expert_assignments_per_rank": {"items":{"minimum":0,"type":"integer"},"type":"array"},
+ "expert_load_cv": {"minimum":0,"type":"number"},
+ "expert_load_max": {"minimum":0,"type":"integer"},
+ "expert_load_mean": {"minimum":0,"type":"number"},
+ "expert_load_min": {"minimum":0,"type":"integer"},
+ "fanout_histogram": {"items":{"minimum":0,"type":"integer"},"type":"array"},
+ "fanout_max": {"minimum":1,"type":"integer"},
+ "fanout_mean": {"minimum":0,"type":"number"},
+ "fanout_min": {"minimum":1,"type":"integer"},
+ "hash": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "hotspot_ratio": {"minimum":0,"type":"number"},
+ "locality": {
+ "oneOf": [
+ {"type":"null"},
+ {
+ "additionalProperties": false,
+ "properties": {
+ "copies": {"minimum":0,"type":"integer"},
+ "cross_domain_fraction": {"minimum":0,"type":"number"},
+ "cross_node_fraction": {"minimum":0,"type":"number"},
+ "gpus_per_node": {"minimum":1,"type":"integer"},
+ "local_rank_fraction": {"minimum":0,"type":"number"},
+ "placement": {"const":"packed"},
+ "same_node_fraction": {"minimum":0,"type":"number"},
+ "same_scaleup_domain_fraction": {"minimum":0,"type":"number"},
+ "scale_up_domain": {"minimum":1,"type":"integer"}
+ },
+ "required": [
+ "placement",
+ "local_rank_fraction",
+ "same_node_fraction",
+ "same_scaleup_domain_fraction",
+ "cross_node_fraction",
+ "cross_domain_fraction",
+ "gpus_per_node",
+ "scale_up_domain",
+ "copies"
+ ],
+ "type": "object"
+ }
+ ]
+ },
+ "payload_copies_per_rank": {"items":{"minimum":0,"type":"integer"},"type":"array"},
+ "payload_rank_cv": {"minimum":0,"type":"number"},
+ "routed_copies": {"minimum":1,"type":"integer"},
+ "source_token_stats": {
+ "oneOf": [
+ {"type":"null"},
+ {
+ "additionalProperties": false,
+ "properties": {
+ "cv": {"minimum":0,"type":"number"},
+ "empty_ranks": {"minimum":0,"type":"integer"},
+ "max": {"minimum":0,"type":"integer"},
+ "mean": {"minimum":0,"type":"number"},
+ "min": {"minimum":0,"type":"integer"},
+ "ranks": {"minimum":1,"type":"integer"},
+ "total": {"minimum":0,"type":"integer"}
+ },
+ "required": ["min","mean","max","cv","empty_ranks","total","ranks"],
+ "type": "object"
+ }
+ ]
+ }
+ },
+ "required": [
+ "empty_expert_count",
+ "empty_rank_count",
+ "expert_assignment_rank_cv",
+ "expert_assignments_per_rank",
+ "expert_load_cv",
+ "expert_load_max",
+ "expert_load_mean",
+ "expert_load_min",
+ "fanout_histogram",
+ "fanout_max",
+ "fanout_mean",
+ "fanout_min",
+ "hash",
+ "hotspot_ratio",
+ "locality",
+ "payload_copies_per_rank",
+ "payload_rank_cv",
+ "routed_copies",
+ "source_token_stats"
+ ],
+ "type": "object"
+ },
+ "sample_histograms": {
+ "additionalProperties": false,
+ "properties": {
+ "combine": {"oneOf":[{"type":"null"},{"$ref":"#/$defs/histogram"}]},
+ "dispatch": {"oneOf":[{"type":"null"},{"$ref":"#/$defs/histogram"}]},
+ "roundtrip": {"$ref":"#/$defs/histogram"}
+ },
+ "required": ["dispatch","combine","roundtrip"],
+ "type": "object"
+ },
+ "sample_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "token_rate_at_latency_percentile": {"$ref":"#/$defs/percentiles"},
+ "tokens_per_rank": {"minimum":1,"type":"integer"}
+ },
+ "required": [
+ "anomalies",
+ "components",
+ "correctness",
+ "evidence_id",
+ "global_tokens",
+ "logical_bytes",
+ "point_id",
+ "receive",
+ "routing",
+ "sample_histograms",
+ "sample_sha256",
+ "token_rate_at_latency_percentile",
+ "tokens_per_rank"
+ ],
+ "type": "object"
+ },
+ "minItems": 1,
+ "type": "array"
+ },
+ "sampling": {
+ "additionalProperties": false,
+ "properties": {
+ "contract": {"const":"fixed-512-v1"},
+ "iterations_per_trial": {"const":8},
+ "percentile_method": {"const":"nearest-rank"},
+ "reduction": {"const":"cross-rank-max-per-iteration"},
+ "samples_per_component": {"const":512},
+ "trials": {"const":64},
+ "warmup_iterations": {"const":32},
+ "warmup_semantics": {"const":"full-roundtrip-before-each-component-trial-point-v1"}
+ },
+ "required": [
+ "contract",
+ "iterations_per_trial",
+ "percentile_method",
+ "reduction",
+ "samples_per_component",
+ "trials",
+ "warmup_iterations",
+ "warmup_semantics"
+ ],
+ "type": "object"
+ },
+ "source_allocation": {"const":"even"}
+ },
+ "required": [
+ "component_order_contract",
+ "conditioning",
+ "contract",
+ "rows",
+ "sampling",
+ "source_allocation"
+ ],
+ "type": "object"
+ },
+ "outcome": {
+ "additionalProperties": false,
+ "properties": {
+ "publication_status": {"enum":["diagnostic","invalid"]},
+ "reasons": {"items":{"type":"string"},"type":"array"},
+ "status": {"enum":["success","invalid"]},
+ "validity": {
+ "additionalProperties": false,
+ "properties": {
+ "anomaly_free": {"type":"boolean"},
+ "execution_status": {"enum":["complete","failed"]},
+ "measurement_conformance": {"enum":["conformant","nonconformant"]},
+ "provenance_complete": {"type":"boolean"},
+ "resource_conformance": {"minLength":1,"type":"string"},
+ "sampling_conformance": {"enum":["conformant","nonconformant"]},
+ "semantic_correctness": {"enum":["pass","fail"]},
+ "workload_identity": {"enum":["consistent-across-ranks","inconsistent"]},
+ "workload_source": {"enum":["canonical-serialized","seeded-runtime"]}
+ },
+ "required": [
+ "execution_status",
+ "semantic_correctness",
+ "workload_identity",
+ "workload_source",
+ "measurement_conformance",
+ "sampling_conformance",
+ "resource_conformance",
+ "provenance_complete",
+ "anomaly_free"
+ ],
+ "type": "object"
+ }
+ },
+ "required": ["publication_status","reasons","status","validity"],
+ "type": "object"
+ },
+ "provenance": {
+ "additionalProperties": false,
+ "properties": {
+ "command": {"minLength":1,"type":"string"},
+ "distributed_launcher": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "git_run": {"oneOf":[{"type":"null"},{"$ref":"#/$defs/git_run"}]},
+ "image": {
+ "additionalProperties": false,
+ "properties": {
+ "arch": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "digest": {
+ "oneOf": [{"type":"null"},{"pattern":"^sha256:[0-9a-f]{64}$","type":"string"}]
+ },
+ "digest_verified": {"type":"boolean"},
+ "reference": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "squash_sha256": {"oneOf":[{"type":"null"},{"pattern":"^[0-9a-f]{64}$","type":"string"}]}
+ },
+ "required": ["arch","digest","digest_verified","reference","squash_sha256"],
+ "type": "object"
+ },
+ "redaction": {"const":"sanitized-v1"}
+ },
+ "required": ["command","distributed_launcher","git_run","image","redaction"],
+ "type": "object"
+ },
+ "record_type": {"const":"case-attempt"},
+ "runtime_fingerprint": {
+ "additionalProperties": false,
+ "properties": {
+ "accelerator_runtime": {
+ "additionalProperties": false,
+ "properties": {
+ "kind": {"enum":["cuda","hip"]},
+ "version": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}
+ },
+ "required": ["kind","version"],
+ "type": "object"
+ },
+ "collective_library": {
+ "additionalProperties": false,
+ "properties": {
+ "kind": {"enum":["nccl","rccl"]},
+ "version": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}
+ },
+ "required": ["kind","version"],
+ "type": "object"
+ },
+ "device": {
+ "additionalProperties": false,
+ "properties": {
+ "arch": {"minLength":1,"type":"string"},
+ "compute_units": {"minimum":1,"type":"integer"},
+ "memory_bytes": {"minimum":1,"type":"integer"},
+ "product": {"minLength":1,"type":"string"},
+ "warp_size": {"minimum":1,"type":"integer"}
+ },
+ "required": ["arch","compute_units","memory_bytes","product","warp_size"],
+ "type": "object"
+ },
+ "driver_version": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]},
+ "framework": {
+ "additionalProperties": false,
+ "properties": {"kind":{"const":"torch"},"version":{"minLength":1,"type":"string"}},
+ "required": ["kind","version"],
+ "type": "object"
+ },
+ "machine": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "python_version": {"minLength":1,"type":"string"},
+ "vendor": {"enum":["nvidia","amd"]}
+ },
+ "required": [
+ "accelerator_runtime",
+ "collective_library",
+ "device",
+ "driver_version",
+ "framework",
+ "machine",
+ "python_version",
+ "vendor"
+ ],
+ "type": "object"
+ },
+ "sample_artifact": {
+ "additionalProperties": false,
+ "properties": {
+ "bytes": {"minimum":1,"type":"integer"},
+ "format": {"const":"collectivex.samples.v1"},
+ "path": {"pattern":"^[A-Za-z0-9_.-]+$","type":"string"},
+ "sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"}
+ },
+ "required": ["bytes","format","path","sha256"],
+ "type": "object"
+ },
+ "schema_version": {"const":1},
+ "topology": {
+ "additionalProperties": false,
+ "properties": {
+ "device_count": {"minimum":1,"type":"integer"},
+ "device_product": {"minLength":1,"type":"string"},
+ "gpus_per_node": {"minimum":1,"type":"integer"},
+ "nodes": {"minimum":1,"type":"integer"},
+ "placement": {"const":"packed"},
+ "realized_placement": {
+ "additionalProperties": false,
+ "properties": {
+ "gpus_per_node": {"minimum":1,"type":"integer"},
+ "nodes": {"minimum":1,"type":"integer"},
+ "ranks_per_node": {"minimum":1,"type":"integer"},
+ "unique_local_ranks": {"const":true},
+ "valid": {"const":true}
+ },
+ "required": ["gpus_per_node","nodes","ranks_per_node","unique_local_ranks","valid"],
+ "type": "object"
+ },
+ "scale_out_transport": {"oneOf":[{"type":"null"},{"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}]},
+ "scale_up_domain": {"minimum":1,"type":"integer"},
+ "scale_up_transport": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "scope": {"enum":["scale-up","scale-out"]},
+ "topology_class": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "transport": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"},
+ "world_size": {"minimum":1,"type":"integer"}
+ },
+ "required": [
+ "device_count",
+ "device_product",
+ "gpus_per_node",
+ "nodes",
+ "placement",
+ "realized_placement",
+ "scale_out_transport",
+ "scale_up_domain",
+ "scale_up_transport",
+ "scope",
+ "topology_class",
+ "transport",
+ "world_size"
+ ],
+ "type": "object"
+ },
+ "workload": {
+ "additionalProperties": false,
+ "properties": {
+ "activation_generator": {"const":"collectivex-activation-counter-v3"},
+ "activation_identity": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "activation_profile": {"const":"canonical-counter-source-v3"},
+ "cross_rank_consistent": {"const":true},
+ "manifest_checksums": {
+ "oneOf": [
+ {"type":"null"},
+ {
+ "additionalProperties": {
+ "additionalProperties": false,
+ "properties": {
+ "topk_idx": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "topk_weights": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "trace": {"pattern":"^[0-9a-f]{64}$","type":"string"}
+ },
+ "required": ["topk_idx", "topk_weights", "trace"],
+ "type": "object"
+ },
+ "type": "object"
+ }
+ ]
+ },
+ "members": {
+ "oneOf": [
+ {"type":"null"},
+ {
+ "items": {"pattern":"^cxwork-v1-[0-9a-f]{64}$","type":"string"},
+ "minItems": 1,
+ "uniqueItems": true,
+ "type": "array"
+ }
+ ]
+ },
+ "routing_generator": {"const":"collectivex-routing-counter-v3"},
+ "source": {"enum":["canonical-serialized","seeded-runtime"]},
+ "trace_hashes": {
+ "items": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "minItems": 1,
+ "type": "array"
+ },
+ "trace_signature": {"pattern":"^[0-9a-f]{64}$","type":"string"},
+ "workload_id": {
+ "oneOf": [{"type":"null"},{"pattern":"^cxwork-v1-[0-9a-f]{64}$","type":"string"}]
+ }
+ },
+ "required": [
+ "activation_generator",
+ "activation_identity",
+ "activation_profile",
+ "cross_rank_consistent",
+ "manifest_checksums",
+ "members",
+ "routing_generator",
+ "source",
+ "trace_hashes",
+ "trace_signature",
+ "workload_id"
+ ],
+ "type": "object"
+ }
+ },
+ "required": [
+ "case",
+ "format",
+ "generated_at",
+ "identity",
+ "implementation",
+ "measurement",
+ "outcome",
+ "provenance",
+ "record_type",
+ "runtime_fingerprint",
+ "sample_artifact",
+ "schema_version",
+ "topology",
+ "workload"
+ ],
+ "title": "CollectiveX raw case attempt v1",
+ "type": "object"
+}
diff --git a/experimental/CollectiveX/schemas/samples-v1.schema.json b/experimental/CollectiveX/schemas/samples-v1.schema.json
new file mode 100644
index 0000000000..b9a1df0541
--- /dev/null
+++ b/experimental/CollectiveX/schemas/samples-v1.schema.json
@@ -0,0 +1,80 @@
+{
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "$id": "https://inferencex.com/schemas/collectivex/samples-v1.schema.json",
+ "title": "CollectiveX exact private samples v1",
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["allocation_id","attempt_id","case_id","format","points","sampling","schema_version","series_id"],
+ "properties": {
+ "allocation_id": {"$ref": "#/$defs/allocationId"},
+ "attempt_id": {"$ref": "#/$defs/attemptId"},
+ "case_id": {"$ref": "#/$defs/caseId"},
+ "format": {"const": "collectivex.samples.v1"},
+ "points": {
+ "type": "array",
+ "minItems": 1,
+ "items": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["components","evidence_id","point_id","sample_sha256","tokens_per_rank"],
+ "properties": {
+ "components": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["combine","dispatch","roundtrip"],
+ "properties": {
+ "combine": {"$ref": "#/$defs/component"},
+ "dispatch": {"$ref": "#/$defs/component"},
+ "roundtrip": {"$ref": "#/$defs/component"}
+ }
+ },
+ "evidence_id": {"$ref": "#/$defs/evidenceId"},
+ "point_id": {"$ref": "#/$defs/pointId"},
+ "sample_sha256": {"$ref": "#/$defs/sha256"},
+ "tokens_per_rank": {"type": "integer","minimum": 1}
+ }
+ }
+ },
+ "sampling": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["iterations_per_trial","reduction","trials"],
+ "properties": {
+ "iterations_per_trial": {"const": 8},
+ "reduction": {"const": "cross-rank-max-per-iteration"},
+ "trials": {"const": 64}
+ }
+ },
+ "schema_version": {"const": 1},
+ "series_id": {"$ref": "#/$defs/seriesId"}
+ },
+ "$defs": {
+ "sha256": {"type": "string","pattern": "^[0-9a-f]{64}$"},
+ "caseId": {"type": "string","pattern": "^cxcase-v1-[0-9a-f]{64}$"},
+ "seriesId": {"type": "string","pattern": "^cxseries-v1-[0-9a-f]{64}$"},
+ "pointId": {"type": "string","pattern": "^cxpoint-v1-[0-9a-f]{64}$"},
+ "evidenceId": {"type": "string","pattern": "^cxevidence-v1-[0-9a-f]{64}$"},
+ "allocationId": {"type": "string","pattern": "^cxallocation-v1-[0-9a-f]{64}$"},
+ "attemptId": {"type": "string","pattern": "^cxattempt-v1-[0-9a-f]{64}$"},
+ "component": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["availability","sample_count","trials"],
+ "properties": {
+ "availability": {"enum": ["measured","unavailable"]},
+ "sample_count": {"type": "integer","minimum": 0,"maximum": 512},
+ "trials": {
+ "oneOf": [
+ {"type": "null"},
+ {
+ "type": "array",
+ "minItems": 64,
+ "maxItems": 64,
+ "items": {"type": "array","minItems": 8,"maxItems": 8,"items": {"type": "number","minimum": 0}}
+ }
+ ]
+ }
+ }
+ }
+ }
+}
diff --git a/experimental/CollectiveX/schemas/terminal-outcome-v1.schema.json b/experimental/CollectiveX/schemas/terminal-outcome-v1.schema.json
new file mode 100644
index 0000000000..dede78a43c
--- /dev/null
+++ b/experimental/CollectiveX/schemas/terminal-outcome-v1.schema.json
@@ -0,0 +1,289 @@
+{
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "$id": "https://inferencex.com/schemas/collectivex/terminal-outcome-v1.schema.json",
+ "title": "CollectiveX terminal outcome v1",
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["case","format","generated_at","identity","outcome","provenance","record_type","schema_version"],
+ "properties": {
+ "case": {"$ref": "#/$defs/case"},
+ "format": {"const": "collectivex.terminal.v1"},
+ "generated_at": {"type": "string","format": "date-time"},
+ "identity": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["allocation_factors","allocation_id","attempt_id","attempt_ordinal","case_factors","case_id"],
+ "properties": {
+ "allocation_factors": {"$ref": "#/$defs/allocationFactors"},
+ "allocation_id": {"$ref": "#/$defs/allocationId"},
+ "attempt_id": {"$ref": "#/$defs/attemptId"},
+ "attempt_ordinal": {"type": "integer","minimum": 1},
+ "case_factors": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["case","profile","sku"],
+ "properties": {
+ "case": {"$ref": "#/$defs/case"},
+ "profile": {
+ "oneOf": [
+ {"const": {
+ "activation_generator": "collectivex-activation-counter-v3",
+ "activation_profile": "canonical-counter-source-v3",
+ "combine_dtype": "bf16",
+ "combine_quant_mode": "none",
+ "combine_semantics": "activation-only",
+ "component_order_contract": "roundtrip-dispatch-activation-only-combine-v2",
+ "conditioning_contract": "fixed-phase-ramp-8-roundtrips-v1",
+ "contract": "layout-and-dispatch-v1",
+ "correctness_scope": "dispatch-metadata-and-transformed-combine",
+ "dtype": "bf16",
+ "eplb_planner": "greedy-rank-major-v1",
+ "eplb_redundant_experts": 32,
+ "eplb_reference_tokens_per_rank": 2048,
+ "mode": "normal",
+ "oracle_contract": "expert-specific-transform-v1",
+ "oracle_tolerances": "rtol=0.05,atol=0.02",
+ "payload_unit": "token-rank",
+ "placement": "packed",
+ "percentile_method": "nearest-rank",
+ "rank_reduction": "cross-rank-max-per-iteration",
+ "resource_mode": "tuned",
+ "routing_generator": "collectivex-routing-counter-v3",
+ "sampling_contract": "fixed-512-v1",
+ "seed": 67
+ }},
+ {"const": {
+ "activation_generator": "collectivex-activation-counter-v3",
+ "activation_profile": "canonical-counter-source-v3",
+ "combine_dtype": "bf16",
+ "combine_quant_mode": "none",
+ "combine_semantics": "gate-weighted",
+ "component_order_contract": "roundtrip-dispatch-gate-weighted-combine-v1",
+ "conditioning_contract": "fixed-phase-ramp-8-roundtrips-v1",
+ "contract": "expert-packed-weighted-combine-v1",
+ "correctness_scope": "expert-assignment-and-weighted-combine",
+ "dtype": "bf16",
+ "eplb_planner": "greedy-rank-major-v1",
+ "eplb_redundant_experts": 32,
+ "eplb_reference_tokens_per_rank": 2048,
+ "mode": "low-latency",
+ "oracle_contract": "expert-assignment-transform-v1",
+ "oracle_tolerances": "rtol=0.05,atol=0.02",
+ "payload_unit": "token-expert",
+ "placement": "packed",
+ "percentile_method": "nearest-rank",
+ "rank_reduction": "cross-rank-max-per-iteration",
+ "resource_mode": "tuned",
+ "routing_generator": "collectivex-routing-counter-v3",
+ "sampling_contract": "fixed-512-v1",
+ "seed": 67
+ }}
+ ]
+ },
+ "sku": {"$ref": "#/$defs/safeId"}
+ }
+ },
+ "case_id": {"$ref": "#/$defs/caseId"}
+ }
+ },
+ "outcome": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["failure_mode","reason","return_code","status"],
+ "properties": {
+ "failure_mode": {"$ref": "#/$defs/safeId"},
+ "reason": {"type": "string","minLength": 1,"maxLength": 240},
+ "return_code": {"type": "integer","minimum": 0},
+ "status": {"enum": ["failed","invalid","unsupported"]}
+ }
+ },
+ "provenance": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["git_run","control_sha256","redaction","source"],
+ "properties": {
+ "git_run": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/gitRun"}]},
+ "control_sha256": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/sha256"}]},
+ "redaction": {"const": "sanitized-v1"},
+ "source": {
+ "enum": [
+ "runtime-emitter",
+ "post-emit-command",
+ "matrix-capability-resolver"
+ ]
+ }
+ }
+ },
+ "record_type": {"const": "terminal-outcome"},
+ "schema_version": {"const": 1}
+ },
+ "allOf": [
+ {
+ "oneOf": [
+ {
+ "properties": {
+ "provenance": {
+ "properties": {"source": {"const": "runtime-emitter"}}
+ },
+ "outcome": {"$ref": "#/$defs/runtimeOutcome"}
+ }
+ },
+ {
+ "properties": {
+ "provenance": {
+ "properties": {"source": {"const": "post-emit-command"}}
+ },
+ "outcome": {"$ref": "#/$defs/postEmitOutcome"}
+ }
+ },
+ {
+ "properties": {
+ "provenance": {
+ "properties": {"source": {"const": "matrix-capability-resolver"}}
+ },
+ "outcome": {"$ref": "#/$defs/capabilityOutcome"}
+ }
+ }
+ ]
+ }
+ ],
+ "$defs": {
+ "runtimeOutcome": {
+ "type": "object",
+ "properties": {"status": {"const": "failed"}},
+ "allOf": [
+ {
+ "oneOf": [
+ {"properties": {"failure_mode": {"const": "setup"}, "reason": {"const": "launcher-setup-failed"}}},
+ {"properties": {"failure_mode": {"const": "repository-stage"}, "reason": {"const": "repository-staging-failed"}}},
+ {"properties": {"failure_mode": {"const": "registry-verification"}, "reason": {"const": "container-registry-verification-failed"}}},
+ {"properties": {"failure_mode": {"const": "scheduler-allocation"}, "reason": {"const": "scheduler-allocation-failed"}}},
+ {"properties": {"failure_mode": {"const": "container-import"}, "reason": {"const": "container-image-preparation-failed"}}},
+ {"properties": {"failure_mode": {"const": "container-hash"}, "reason": {"const": "container-image-identity-failed"}}},
+ {"properties": {"failure_mode": {"const": "container-launch"}, "reason": {"const": "container-runtime-launch-failed"}}},
+ {"properties": {"failure_mode": {"const": "backend-setup"}, "reason": {"const": "backend-setup-failed"}}},
+ {"properties": {"failure_mode": {"const": "artifact-collection"}, "reason": {"const": "artifact-collection-failed"}}},
+ {"properties": {"failure_mode": {"const": "runtime-identity"}, "reason": {"const": "runtime-identity-mismatch"}}},
+ {"properties": {"failure_mode": {"const": "timeout"}, "reason": {"const": "execution-timeout"}}},
+ {"properties": {"failure_mode": {"const": "deadlock"}, "reason": {"const": "execution-deadlock"}}},
+ {"properties": {"failure_mode": {"const": "execution"}, "reason": {"const": "distributed-command-failed"}}}
+ ]
+ }
+ ]
+ },
+ "postEmitOutcome": {
+ "type": "object",
+ "properties": {
+ "status": {"const": "failed"},
+ "failure_mode": {"enum": ["runtime-identity", "timeout", "deadlock", "execution"]},
+ "reason": {"const": "post-emit-distributed-command-failed"}
+ }
+ },
+ "capabilityOutcome": {
+ "type": "object",
+ "properties": {
+ "status": {"const": "unsupported"},
+ "failure_mode": {"const": "capability"},
+ "reason": {
+ "enum": [
+ "backend-platform-unsupported",
+ "backend-token-capacity"
+ ]
+ }
+ }
+ },
+ "sha256": {"type": "string","pattern": "^[0-9a-f]{64}$"},
+ "safeId": {"type": "string","pattern": "^[a-z0-9][a-z0-9_.-]*$","maxLength": 128},
+ "caseId": {"type": "string","pattern": "^cxcase-v1-[0-9a-f]{64}$"},
+ "allocationId": {"type": "string","pattern": "^cxallocation-v1-[0-9a-f]{64}$"},
+ "attemptId": {"type": "string","pattern": "^cxattempt-v1-[0-9a-f]{64}$"},
+ "nullableText": {"oneOf": [{"type": "null"},{"type": "string","minLength": 1}]},
+ "allocationFactors": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["artifact","execution_id","job","repo","run_attempt","run_id","runner","source_sha"],
+ "properties": {
+ "artifact": {"$ref": "#/$defs/nullableText"},
+ "execution_id": {"$ref": "#/$defs/nullableText"},
+ "job": {"$ref": "#/$defs/nullableText"},
+ "repo": {"$ref": "#/$defs/nullableText"},
+ "run_attempt": {"$ref": "#/$defs/nullableText"},
+ "run_id": {"$ref": "#/$defs/nullableText"},
+ "runner": {"$ref": "#/$defs/nullableText"},
+ "source_sha": {"$ref": "#/$defs/nullableText"}
+ }
+ },
+ "gitRun": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": ["artifact","job","ref","repo","run_attempt","run_id","source_sha"],
+ "properties": {
+ "artifact": {"type": "string","minLength": 1},
+ "job": {"type": "string","minLength": 1},
+ "ref": {"type": "string","minLength": 1},
+ "repo": {"type": "string","pattern": "^[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+$"},
+ "run_attempt": {"type": "string","pattern": "^[1-9][0-9]*$"},
+ "run_id": {"type": "string","pattern": "^[1-9][0-9]*$"},
+ "source_sha": {"type": "string","pattern": "^[0-9a-f]{40}$"}
+ }
+ },
+ "case": {
+ "type": "object",
+ "additionalProperties": false,
+ "required": [
+ "backend",
+ "canonical",
+ "eplb",
+ "ep",
+ "experts",
+ "gpus_per_node",
+ "hidden",
+ "ladder",
+ "mode",
+ "nodes",
+ "phase",
+ "required_publication",
+ "routing",
+ "samples_per_point",
+ "scale_out_transport",
+ "scale_up_domain",
+ "scale_up_transport",
+ "scope",
+ "suite",
+ "timing",
+ "topk",
+ "topology_class",
+ "transport",
+ "warmup_semantics",
+ "workload"
+ ],
+ "properties": {
+ "backend": {"$ref": "#/$defs/safeId"},
+ "canonical": {"type": "boolean"},
+ "eplb": {"type": "boolean"},
+ "ep": {"type": "integer","minimum": 1},
+ "experts": {"type": "integer","minimum": 1},
+ "gpus_per_node": {"type": "integer","minimum": 1},
+ "hidden": {"type": "integer","minimum": 1},
+ "ladder": {"type": "string","pattern": "^[1-9][0-9]*( [1-9][0-9]*)*$"},
+ "mode": {"enum": ["normal","low-latency"]},
+ "nodes": {"type": "integer","minimum": 1},
+ "phase": {"enum": ["decode","prefill"]},
+ "required_publication": {"enum": ["official","comparable-experimental","diagnostic"]},
+ "routing": {"enum": ["uniform","zipf"]},
+ "samples_per_point": {"const": 512},
+ "scale_out_transport": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/safeId"}]},
+ "scale_up_domain": {"type": "integer","minimum": 1},
+ "scale_up_transport": {"$ref": "#/$defs/safeId"},
+ "scope": {"enum": ["scale-up","scale-out"]},
+ "suite": {"$ref": "#/$defs/safeId"},
+ "timing": {"const": "8:64:32"},
+ "topk": {"type": "integer","minimum": 1},
+ "topology_class": {"$ref": "#/$defs/safeId"},
+ "transport": {"$ref": "#/$defs/safeId"},
+ "warmup_semantics": {"const": "full-roundtrip-before-each-component-trial-point-v1"},
+ "workload": {"$ref": "#/$defs/safeId"}
+ }
+ }
+ }
+}
diff --git a/experimental/CollectiveX/source_archive.py b/experimental/CollectiveX/source_archive.py
new file mode 100644
index 0000000000..c027490a65
--- /dev/null
+++ b/experimental/CollectiveX/source_archive.py
@@ -0,0 +1,349 @@
+#!/usr/bin/env python3
+"""Validate and extract one pinned backend from a shared source tar."""
+from __future__ import annotations
+
+import argparse
+import os
+from pathlib import Path, PurePosixPath
+import stat
+import tarfile
+from typing import Optional, Sequence
+
+
+# Components of an archive member path, e.g. (".cx_sources", "<backend>", ...).
+PathParts = tuple[str, ...]
+# Directories are opened read-only and never through a symbolic link.
+_DIRECTORY_FLAGS = os.O_RDONLY | os.O_DIRECTORY | os.O_NOFOLLOW | os.O_CLOEXEC
+# Extracted files must be newly created (O_EXCL) and never follow a symlink.
+_FILE_FLAGS = os.O_WRONLY | os.O_CREAT | os.O_EXCL | os.O_NOFOLLOW | os.O_CLOEXEC
+# Upper bound on members collected from tarfile iteration.
+MAX_ARCHIVE_MEMBERS = 20_000
+# Upper bound on the size of one regular file member.
+MAX_MEMBER_BYTES = 512 * 1024 * 1024
+# Upper bound on the summed size of all regular file members.
+MAX_EXPANDED_BYTES = 2 * 1024 * 1024 * 1024
+# Upper bound on the on-disk archive size, checked before any header is read.
+MAX_ARCHIVE_BYTES = 4 * 1024 * 1024 * 1024
+# Upper bound on raw headers (extension headers included) in the preflight scan.
+MAX_ARCHIVE_HEADERS = 40_000
+# Upper bound on the summed payload size of all extension headers.
+MAX_EXTENSION_BYTES = 64 * 1024 * 1024
+# Upper bound on the payload size of a single extension header.
+MAX_EXTENSION_MEMBER_BYTES = 1024 * 1024
+# Upper bound on consecutive extension headers before a non-extension header.
+MAX_EXTENSION_CHAIN = 8
+# Size of one tar header/payload block in bytes.
+_TAR_BLOCK = 512
+# Type flags the preflight scan treats as extension metadata; these match the
+# GNU long-name/long-link ("L", "K") and pax/Solaris extended ("x", "g", "X")
+# header types defined by the tarfile module.
+_EXTENSION_TYPES = {b"L", b"K", b"x", b"g", b"X"}
+
+
+class SourceArchiveError(ValueError):
+    """Raised when a backend source archive fails a safety check.
+
+    It subclasses ``ValueError`` so a rejected archive can be handled as
+    invalid input.
+    """
+
+
+def _tar_size(field: bytes) -> int:
+    """Decode a raw tar header numeric field into an integer.
+
+    A first byte of 0o200 or 0o377 selects the binary (base-256) encoding,
+    where 0o377 marks a negative value. Any other field is read as
+    NUL-terminated ASCII octal, with an empty field meaning zero.
+
+    Raises:
+        SourceArchiveError: The field is neither valid ASCII nor valid octal.
+    """
+    marker = field[0]
+    if marker == 0o200 or marker == 0o377:
+        magnitude = int.from_bytes(field[1:], "big")
+        if marker == 0o377:
+            # Negative numbers are stored offset by the width of the payload.
+            return magnitude - 256 ** (len(field) - 1)
+        return magnitude
+    try:
+        digits = field.partition(b"\0")[0].decode("ascii").strip()
+        return int(digits or "0", 8)
+    except (UnicodeDecodeError, ValueError) as exc:
+        raise SourceArchiveError("archive contains an invalid size field") from exc
+
+
+def _preflight_archive(descriptor: int, archive_size: int) -> None:
+    """Walk the raw tar headers and reject oversized or sparse archives.
+
+    The scan reads 512-byte headers directly from ``descriptor`` with
+    ``os.pread`` and enforces the module limits on raw archive size, header
+    count, and extension metadata.
+
+    Args:
+        descriptor: Open file descriptor of the archive.
+        archive_size: Size of the archive in bytes.
+
+    Raises:
+        SourceArchiveError: A limit is exceeded, a header or payload is
+            truncated, a size is negative, or sparse metadata is present.
+    """
+    if archive_size <= 0 or archive_size > MAX_ARCHIVE_BYTES:
+        raise SourceArchiveError("backend source archive exceeds the raw size limit")
+    offset = headers = extension_bytes = extension_chain = 0
+    while offset < archive_size:
+        header = os.pread(descriptor, _TAR_BLOCK, offset)
+        if len(header) != _TAR_BLOCK:
+            raise SourceArchiveError("archive header is truncated")
+        # An all-zero block ends the scan; nothing after it is inspected.
+        if not any(header):
+            return
+        headers += 1
+        if headers > MAX_ARCHIVE_HEADERS:
+            raise SourceArchiveError("archive has too many physical headers")
+        # Bytes 124..135 hold the size field; byte 156 holds the type flag.
+        size = _tar_size(header[124:136])
+        if size < 0:
+            raise SourceArchiveError("archive contains a negative payload size")
+        type_flag = header[156:157]
+        if type_flag in _EXTENSION_TYPES:
+            extension_chain += 1
+            extension_bytes += size
+            if (
+                extension_chain > MAX_EXTENSION_CHAIN
+                or size > MAX_EXTENSION_MEMBER_BYTES
+                or extension_bytes > MAX_EXTENSION_BYTES
+            ):
+                raise SourceArchiveError("archive extension metadata exceeds its limit")
+            # Only the "x", "g", and "X" payloads are searched for sparse keys.
+            if type_flag in {b"x", b"g", b"X"}:
+                payload = os.pread(descriptor, size, offset + _TAR_BLOCK)
+                if len(payload) != size:
+                    raise SourceArchiveError("archive extension metadata is truncated")
+                if b"GNU.sparse." in payload:
+                    raise SourceArchiveError("archive contains sparse extension metadata")
+        else:
+            # A non-extension header ends the current run of extension headers.
+            extension_chain = 0
+            if type_flag == b"S":
+                raise SourceArchiveError("archive contains a sparse member")
+        # Payloads are padded to whole blocks; skip the header plus its payload.
+        blocks = (size + _TAR_BLOCK - 1) // _TAR_BLOCK
+        offset += _TAR_BLOCK + blocks * _TAR_BLOCK
+        if offset > archive_size:
+            raise SourceArchiveError("archive payload is truncated")
+
+
+def _member_parts(name: str) -> PathParts:
+    """Split a member name into components, accepting only canonical paths.
+
+    A canonical path is relative, round-trips unchanged through
+    ``PurePosixPath``, starts with ``.cx_sources``, and has no empty, ``.``,
+    or ``..`` component, backslash, or NUL byte.
+
+    Raises:
+        SourceArchiveError: The name is not canonical.
+    """
+    message = "archive contains a noncanonical member path"
+    if not name or "\\" in name or "\0" in name:
+        raise SourceArchiveError(message)
+    candidate = PurePosixPath(name)
+    components = candidate.parts
+    canonical = (
+        not candidate.is_absolute()
+        and candidate.as_posix() == name
+        and bool(components)
+        and components[0] == ".cx_sources"
+        and not any(item in {"", ".", ".."} for item in components)
+    )
+    if not canonical:
+        raise SourceArchiveError(message)
+    return components
+
+
+def _root_parts(root_basename: str) -> PathParts:
+    """Return the archive path of the selected backend root.
+
+    ``root_basename`` must be a single canonical path component: non-empty,
+    relative, not ``.`` or ``..``, and free of separators, backslashes, and
+    NUL bytes.
+
+    Raises:
+        SourceArchiveError: The basename is not a single safe component.
+    """
+    candidate = PurePosixPath(root_basename)
+    single_component = (
+        bool(root_basename)
+        and "\\" not in root_basename
+        and "\0" not in root_basename
+        and not candidate.is_absolute()
+        and candidate.as_posix() == root_basename
+        and len(candidate.parts) == 1
+        and candidate.parts[0] not in {"", ".", ".."}
+    )
+    if not single_component:
+        raise SourceArchiveError("invalid backend source root")
+    return (".cx_sources", root_basename)
+
+
+def _read_members(archive: tarfile.TarFile) -> list[tarfile.TarInfo]:
+    """Collect every member of ``archive``, enforcing the member limit.
+
+    Raises:
+        SourceArchiveError: Iteration yields more than MAX_ARCHIVE_MEMBERS
+            members.
+    """
+    collected: list[tarfile.TarInfo] = []
+    for index, info in enumerate(archive):
+        # Fail as soon as one member beyond the limit is seen.
+        if index >= MAX_ARCHIVE_MEMBERS:
+            raise SourceArchiveError("archive has an invalid member count")
+        collected.append(info)
+    return collected
+
+
+def _validate_members(
+    members: list[tarfile.TarInfo], selected_root: PathParts
+) -> dict[PathParts, tarfile.TarInfo]:
+    """Validate every archive member and index the members by path.
+
+    All members are checked, not only those under ``selected_root``.
+
+    Args:
+        members: Members read from the archive.
+        selected_root: Path components of the backend root to extract.
+
+    Returns:
+        A mapping from path components to the validated member.
+
+    Raises:
+        SourceArchiveError: A member has an unsafe path, type, size, parent,
+            or link target, or a required directory is missing.
+    """
+    if not members or len(members) > MAX_ARCHIVE_MEMBERS:
+        raise SourceArchiveError("archive has an invalid member count")
+    entries: dict[PathParts, tarfile.TarInfo] = {}
+    expanded_bytes = 0
+    for member in members:
+        parts = _member_parts(member.name)
+        if parts in entries:
+            raise SourceArchiveError("archive contains duplicate member paths")
+        if member.sparse is not None:
+            raise SourceArchiveError("archive contains a sparse member")
+        # Only directories, regular files, and symbolic links are accepted.
+        if member.isdir():
+            if member.size != 0:
+                raise SourceArchiveError("archive contains an invalid directory")
+        elif member.isfile():
+            if member.size < 0 or member.size > MAX_MEMBER_BYTES:
+                raise SourceArchiveError("archive member exceeds the size limit")
+            expanded_bytes += member.size
+            if expanded_bytes > MAX_EXPANDED_BYTES:
+                raise SourceArchiveError("archive exceeds the expanded size limit")
+        elif member.issym():
+            if member.size != 0:
+                raise SourceArchiveError("archive contains an invalid symbolic link")
+        else:
+            raise SourceArchiveError("archive contains a non-file member")
+        entries[parts] = member
+
+    source_parent = entries.get((".cx_sources",))
+    selected = entries.get(selected_root)
+    if source_parent is None or not source_parent.isdir():
+        raise SourceArchiveError("archive is missing its source directory")
+    if selected is None or not selected.isdir():
+        raise SourceArchiveError("archive is missing the selected backend source")
+
+    # Every ancestor of every member must itself be an explicit directory
+    # member, so no path can pass through a file or a symbolic link.
+    for parts in entries:
+        for depth in range(1, len(parts)):
+            parent = entries.get(parts[:depth])
+            if parent is None or not parent.isdir():
+                raise SourceArchiveError("archive member has an unsafe parent")
+
+    for parts, member in entries.items():
+        if not member.issym():
+            continue
+        target_name = member.linkname
+        target_path = PurePosixPath(target_name)
+        if (
+            not target_name
+            or "\\" in target_name
+            or "\0" in target_name
+            or target_path.is_absolute()
+            or target_path.as_posix() != target_name
+        ):
+            raise SourceArchiveError("archive contains an unsafe symbolic link")
+        # Resolve the link lexically, starting from the link's directory.
+        target = list(parts[:-1])
+        for component in target_path.parts:
+            if component == "..":
+                # The first two components are ".cx_sources/<backend>";
+                # ".." may never climb above them.
+                if len(target) <= 2:
+                    raise SourceArchiveError("symbolic link escapes its backend source")
+                target.pop()
+            else:
+                target.append(component)
+        resolved = tuple(target)
+        if resolved[:2] != parts[:2]:
+            raise SourceArchiveError("symbolic link crosses backend sources")
+        # Links must point at a regular file member, not a directory or link.
+        target_member = entries.get(resolved)
+        if target_member is None or not target_member.isfile():
+            raise SourceArchiveError("symbolic link target is not a regular archive file")
+    return entries
+
+
+def _open_directory(root_fd: int, parts: PathParts) -> int:
+    """Open the directory at ``parts`` below ``root_fd`` and return its descriptor.
+
+    Each component is opened relative to the previous descriptor with
+    ``_DIRECTORY_FLAGS``. ``root_fd`` is duplicated first, so the caller owns
+    and must close the returned descriptor, even when ``parts`` is empty.
+    """
+    current = os.dup(root_fd)
+    try:
+        for component in parts:
+            deeper = os.open(component, _DIRECTORY_FLAGS, dir_fd=current)
+            os.close(current)
+            current = deeper
+    except BaseException:
+        os.close(current)
+        raise
+    return current
+
+
+def _create_directory(root_fd: int, parts: PathParts) -> None:
+    """Create the directory named by ``parts`` below ``root_fd`` with mode 0o700.
+
+    The parent directory must already exist.
+    """
+    container_fd = _open_directory(root_fd, parts[:-1])
+    try:
+        os.mkdir(parts[-1], mode=0o700, dir_fd=container_fd)
+    finally:
+        os.close(container_fd)
+
+
+def _extract_file(
+    archive: tarfile.TarFile, root_fd: int, parts: PathParts, member: tarfile.TarInfo
+) -> None:
+    """Copy one regular file member into a newly created file below ``root_fd``.
+
+    The file gets mode 0o700 when the member has any execute bit set and
+    0o600 otherwise. Exactly ``member.size`` bytes are copied.
+
+    Raises:
+        SourceArchiveError: The member has no payload or the payload ends early.
+    """
+    directory_fd = _open_directory(root_fd, parts[:-1])
+    output_fd = -1
+    payload = None
+    try:
+        permissions = 0o700 if member.mode & 0o111 else 0o600
+        output_fd = os.open(parts[-1], _FILE_FLAGS, permissions, dir_fd=directory_fd)
+        payload = archive.extractfile(member)
+        if payload is None:
+            raise SourceArchiveError("archive file has no readable payload")
+        outstanding = member.size
+        while outstanding:
+            block = payload.read(min(1024 * 1024, outstanding))
+            if not block:
+                raise SourceArchiveError("archive file payload is truncated")
+            pending = memoryview(block)
+            # os.write may accept fewer bytes than offered; drain the block.
+            while pending:
+                pending = pending[os.write(output_fd, pending):]
+            outstanding -= len(block)
+        # Set the final mode explicitly, independent of the process umask.
+        os.fchmod(output_fd, permissions)
+    finally:
+        if payload is not None:
+            payload.close()
+        if output_fd >= 0:
+            os.close(output_fd)
+        os.close(directory_fd)
+
+
+def _extract_symlink(root_fd: int, parts: PathParts, member: tarfile.TarInfo) -> None:
+    """Create the symbolic link for ``member`` at ``parts`` below ``root_fd``.
+
+    The link text is ``member.linkname``, used verbatim.
+    """
+    container_fd = _open_directory(root_fd, parts[:-1])
+    try:
+        os.symlink(member.linkname, parts[-1], dir_fd=container_fd)
+    finally:
+        os.close(container_fd)
+
+
+def _extract_selected(
+    archive: tarfile.TarFile,
+    destination_fd: int,
+    entries: dict[PathParts, tarfile.TarInfo],
+    selected_root: PathParts,
+) -> None:
+    """Materialize the members under ``selected_root`` below ``destination_fd``.
+
+    Directories are created first, shallowest first, then regular files, then
+    symbolic links.
+
+    Raises:
+        SourceArchiveError: ``.cx_sources`` already exists in the destination.
+    """
+    try:
+        os.stat(".cx_sources", dir_fd=destination_fd, follow_symlinks=False)
+    except FileNotFoundError:
+        output_exists = False
+    else:
+        output_exists = True
+    if output_exists:
+        raise SourceArchiveError("backend source output already exists")
+
+    prefix_length = len(selected_root)
+    chosen = {
+        path: info
+        for path, info in entries.items()
+        if path[:prefix_length] == selected_root
+    }
+    _create_directory(destination_fd, (".cx_sources",))
+    directory_paths = [path for path, info in chosen.items() if info.isdir()]
+    directory_paths.sort(key=lambda path: (len(path), path))
+    for path in directory_paths:
+        _create_directory(destination_fd, path)
+    # Paths are unique, so sorting the items never compares two members.
+    ordered = sorted(chosen.items())
+    for path, info in ordered:
+        if info.isfile():
+            _extract_file(archive, destination_fd, path, info)
+    for path, info in ordered:
+        if info.issym():
+            _extract_symlink(destination_fd, path, info)
+
+
+def extract_source_archive(
+    archive_path: Path, destination: Path, root_basename: str
+) -> None:
+    """Validate the complete tar, then safely extract one backend source root.
+
+    Args:
+        archive_path: Path of the uncompressed tar archive.
+        destination: Existing directory that receives ``.cx_sources``.
+        root_basename: Name of the backend directory under ``.cx_sources``.
+
+    Raises:
+        SourceArchiveError: The archive, its members, or the destination
+            fail a safety check.
+        OSError: A file-system operation fails.
+        tarfile.TarError: The tarfile module rejects the archive.
+    """
+    selected_root = _root_parts(root_basename)
+    archive_fd = os.open(archive_path, os.O_RDONLY | os.O_NOFOLLOW | os.O_CLOEXEC)
+    try:
+        metadata = os.fstat(archive_fd)
+        # The archive must be a regular file owned by this user and not
+        # writable by group or others.
+        if (
+            not stat.S_ISREG(metadata.st_mode)
+            or metadata.st_uid != os.getuid()
+            or stat.S_IMODE(metadata.st_mode) & 0o022
+        ):
+            raise SourceArchiveError("backend source archive has unsafe metadata")
+        _preflight_archive(archive_fd, metadata.st_size)
+        # tarfile reads through a duplicate, so archive_fd is closed
+        # independently in the outer finally.
+        with os.fdopen(os.dup(archive_fd), "rb") as stream:
+            try:
+                # Mode "r:" accepts only an uncompressed tar stream.
+                with tarfile.open(fileobj=stream, mode="r:") as archive:
+                    entries = _validate_members(_read_members(archive), selected_root)
+                    destination_fd = os.open(destination, _DIRECTORY_FLAGS)
+                    try:
+                        destination_metadata = os.fstat(destination_fd)
+                        # The destination must be owned by this user with
+                        # mode exactly 0o700.
+                        if (
+                            destination_metadata.st_uid != os.getuid()
+                            or stat.S_IMODE(destination_metadata.st_mode) != 0o700
+                        ):
+                            raise SourceArchiveError("backend source destination is unsafe")
+                        # Restrict default permissions while extracting and
+                        # restore the previous umask afterwards.
+                        previous_umask = os.umask(0o077)
+                        try:
+                            _extract_selected(
+                                archive, destination_fd, entries, selected_root
+                            )
+                        finally:
+                            os.umask(previous_umask)
+                    finally:
+                        os.close(destination_fd)
+            except RecursionError as exc:
+                raise SourceArchiveError("archive extension metadata is recursive") from exc
+    finally:
+        os.close(archive_fd)
+
+
+def main(argv: Optional[Sequence[str]] = None) -> int:
+    """Parse the command line and extract one backend source root.
+
+    Returns 0 on success. A rejected archive is reported through
+    ``parser.error``.
+    """
+    parser = argparse.ArgumentParser(
+        description="Safely install one pinned backend source archive"
+    )
+    for name, options in (
+        ("archive", {"type": Path}),
+        ("destination", {"type": Path}),
+        ("root_basename", {}),
+    ):
+        parser.add_argument(name, **options)
+    options = parser.parse_args(argv)
+    try:
+        extract_source_archive(
+            options.archive, options.destination, options.root_basename
+        )
+    except (OSError, SourceArchiveError, tarfile.TarError) as exc:
+        parser.error(f"backend source archive rejected: {exc}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/summarize.py b/experimental/CollectiveX/summarize.py
new file mode 100644
index 0000000000..3752db6b9d
--- /dev/null
+++ b/experimental/CollectiveX/summarize.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+"""Render a small native-v1 shard summary and gate on a successful case."""
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+import contracts
+
+
+def load_results(directory: str, runner: str | None, timestamp: str | None) -> list[dict]:
+    """Load the valid outcome documents from ``directory``.
+
+    ``*.json`` files are visited in sorted order. When ``runner`` is given,
+    only names starting with ``"<runner>_"`` are considered; when
+    ``timestamp`` is given, only names containing it. Files that raise
+    ``ContractError`` or ``OSError``, or whose format is neither raw nor
+    terminal, are skipped.
+    """
+    prefix = f"{runner}_" if runner else None
+    loaded: list[dict] = []
+    for candidate in sorted(Path(directory).glob("*.json")):
+        filename = candidate.name
+        if prefix is not None and not filename.startswith(prefix):
+            continue
+        if timestamp and timestamp not in filename:
+            continue
+        try:
+            parsed = contracts.strict_load(candidate)
+            kind = parsed.get("format")
+            if kind == contracts.RAW_FORMAT:
+                loaded.append(contracts.load_raw_attempt(candidate))
+            elif kind == contracts.TERMINAL_FORMAT:
+                loaded.append(contracts.validate_terminal_document(parsed))
+        except (contracts.ContractError, OSError):
+            continue
+    return loaded
+
+
+def _identity(document: dict) -> tuple[str, str, str, str, bool, str, int]:
+    """Return the sort key and display fields of an outcome document.
+
+    The tuple is (sku, suite, routing, phase, eplb, required publication,
+    EP size). Raw documents keep routing under ``case.shape`` and the EPLB
+    flag under ``case.eplb.enabled``; other documents keep both directly on
+    the case.
+    """
+    case = document["case"]
+    is_raw = document["format"] == contracts.RAW_FORMAT
+    routing = case["shape"]["routing"] if is_raw else case["routing"]
+    eplb = case["eplb"]["enabled"] if is_raw else case["eplb"]
+    sku = document["identity"]["case_factors"]["sku"]
+    ep_size = case.get("ep_size", case.get("ep", 0))
+    return (
+        sku,
+        case["suite"],
+        routing,
+        case["phase"],
+        eplb,
+        case["required_publication"],
+        ep_size,
+    )
+
+
+def _headline(document: dict) -> tuple[int | str, float | str, float | str]:
+    """Return (tokens per rank, roundtrip p50, roundtrip p99) for a document.
+
+    Documents that are not raw attempts yield three ``"-"`` placeholders. The
+    headline row is the first one with 64 tokens per rank, or the middle row
+    when no row has that value.
+    """
+    if document["format"] != contracts.RAW_FORMAT:
+        return "-", "-", "-"
+    rows = document["measurement"]["rows"]
+    for chosen in rows:
+        if chosen["tokens_per_rank"] == 64:
+            break
+    else:
+        chosen = rows[len(rows) // 2]
+    percentiles = chosen["components"]["roundtrip"]["percentiles_us"]
+    return chosen["tokens_per_rank"], percentiles["p50"], percentiles["p99"]
+
+
+def render(documents: list[dict], markdown: bool) -> str:
+    """Format outcome documents as a Markdown table or a plain-text listing.
+
+    Documents are ordered by their ``_identity`` tuple. The Markdown form
+    adds a note when there are no documents; the plain-text form then
+    contains only its two heading lines.
+    """
+    ordered = sorted(documents, key=_identity)
+    if not markdown:
+        text_lines = ["CollectiveX EP results", "======================"]
+        for document in ordered:
+            sku, suite, routing, phase, eplb, tier, ep = _identity(document)
+            backend = document["case"]["backend"]
+            token, _, p99 = _headline(document)
+            routing_label = f"{routing}{'+eplb' if eplb else ''}"
+            status = document["outcome"]["status"]
+            text_lines.append(
+                f" {sku:<10} {backend:<16} {suite:<13} {phase:<7} "
+                f"{routing_label} {tier} ep{ep} "
+                f"{status} T={token} roundtrip_p99_us={p99}"
+            )
+        return "\n".join(text_lines)
+
+    table = [
+        "## CollectiveX EP results",
+        "",
+        "| sku | backend | suite | phase | routing | tier | ep | outcome | T* | p50 us | p99 us |",
+        "|---|---|---|---|---|---|--:|---|--:|--:|--:|",
+    ]
+    for document in ordered:
+        sku, suite, routing, phase, eplb, tier, ep = _identity(document)
+        backend = document["case"]["backend"]
+        token, p50, p99 = _headline(document)
+        routing_label = f"{routing}{'+eplb' if eplb else ''}"
+        status = document["outcome"]["status"]
+        table.append(
+            f"| {sku} | `{backend}` | {suite} | {phase} | "
+            f"{routing_label} | {tier} | {ep} | "
+            f"{status} | {token} | {p50} | {p99} |"
+        )
+    if not ordered:
+        table.append("\n> No valid native v1 outcome documents found.")
+    return "\n".join(table)
+
+
+def main() -> int:
+    """Print the summary and return the process exit status.
+
+    Markdown mode always returns 0. Plain-text mode returns 0 only when at
+    least one raw document has a ``success`` outcome, and 1 otherwise.
+    """
+    parser = argparse.ArgumentParser(description="Summarize CollectiveX native v1 outcomes")
+    parser.add_argument("--results-dir", default="results")
+    parser.add_argument("--runner")
+    parser.add_argument("--ts")
+    parser.add_argument("--markdown", action="store_true")
+    options = parser.parse_args()
+    documents = load_results(options.results_dir, options.runner, options.ts)
+    print(render(documents, options.markdown))
+    if options.markdown:
+        return 0
+    for document in documents:
+        if (
+            document["format"] == contracts.RAW_FORMAT
+            and document["outcome"]["status"] == "success"
+        ):
+            return 0
+    return 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/sweep_matrix.py b/experimental/CollectiveX/sweep_matrix.py
new file mode 100644
index 0000000000..fcf1ffc233
--- /dev/null
+++ b/experimental/CollectiveX/sweep_matrix.py
@@ -0,0 +1,1031 @@
+#!/usr/bin/env python3
+"""Resolve CollectiveX v1 suites and extract validated execution shards.
+
+The promoted v1 profile contains normal and explicit low-latency BF16 contracts.
+Mode changes measurement semantics and therefore participates in case identity;
+resource mode and quantization remain fixed rather than becoming matrix axes.
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as dt
+import hashlib
+import itertools
+import json
+import os
+from pathlib import Path
+import re
+import sys
+from typing import Any
+
+HERE = Path(__file__).resolve().parent
+sys.path.insert(0, str(HERE))
+sys.path.insert(0, str(HERE / "tests"))
+
+try: # Shard extraction on GPU runners is intentionally stdlib-only.
+ import yaml # type: ignore
+except ModuleNotFoundError: # pragma: no cover - exercised by the workflow environment
+ yaml = None
+
+import capability as cap # noqa: E402
+import contracts # noqa: E402
+import ep_harness # noqa: E402
+import identity # noqa: E402
+
+
+# Colon-joined timing profile recorded on every case:
+# "<timed iters per trial>:<trials per point>:<warmup iters per trial>".
+EP_TIMING_PROFILE = (
+    f"{ep_harness.TIMED_ITERS_PER_TRIAL}:{ep_harness.TRIALS_PER_POINT}:"
+    f"{ep_harness.WARMUP_ITERS_PER_TRIAL}"
+)
+# The only accepted workload, as (workload name, hidden, topk, experts).
+V1_WORKLOAD = ("deepseek-v3-v1", 7168, 8, 256)
+# Frozen v1 catalog keyed by suite name. Each "coordinates" entry is a
+# (mode, phase, routing, eplb) tuple; "ladders" maps a phase to its token
+# points; an optional "backends" entry pins the exact backend set.
+V1_SUITE_CONTRACTS = {
+    "ep-core-v1": {
+        "mode": "normal",
+        "publication": "official",
+        "coordinates": {
+            ("normal", "decode", "uniform", False),
+            ("normal", "prefill", "uniform", False),
+        },
+        "ladders": {
+            "decode": tuple(ep_harness.DECODE_LADDER),
+            "prefill": (256, 512),
+        },
+    },
+    "ep-routing-v1": {
+        "mode": "normal",
+        "publication": "comparable-experimental",
+        "coordinates": {
+            ("normal", "decode", "zipf", False),
+            ("normal", "decode", "zipf", True),
+            ("normal", "prefill", "zipf", False),
+            ("normal", "prefill", "zipf", True),
+        },
+        "ladders": {"decode": (128,), "prefill": (512,)},
+    },
+    "ep-low-latency-v1": {
+        "mode": "low-latency",
+        "publication": "official",
+        "backends": {"deepep", "uccl"},
+        "coordinates": {("low-latency", "decode", "uniform", False)},
+        "ladders": {"decode": tuple(ep_harness.DECODE_LADDER)},
+    },
+}
+# Pattern that suite and workload names must match in full.
+IDENTIFIER = re.compile(r"[a-z0-9][a-z0-9.-]*")
+# Every field a suite entry may contain.
+SUITE_FIELDS = {
+    "backends", "ep_degrees", "eplb", "mode", "phases", "platforms",
+    "required_publication", "routings", "token_points", "token_points_decode",
+    "token_points_prefill", "workloads",
+}
+# Fields every suite entry must contain.
+SUITE_REQUIRED = {
+    "ep_degrees", "mode", "phases", "platforms", "required_publication", "routings",
+    "workloads",
+}
+# Topology keys copied from the platform topology into each case and shard.
+TOPOLOGY_FIELDS = (
+    "nodes", "gpus_per_node", "scale_up_domain", "scope", "scale_up_transport",
+    "scale_out_transport", "transport", "topology_class",
+)
+
+
+class MatrixError(ValueError):
+    """Raised when a matrix or shard-control document breaks the execution contract."""
+
+
+# The loader is defined only when PyYAML imported successfully.
+if yaml is not None:
+    class _UniqueKeyLoader(yaml.SafeLoader):
+        # SafeLoader variant whose mappings reject duplicate keys.
+        pass
+
+    def _unique_mapping(loader: Any, node: Any, deep: bool = False) -> dict[Any, Any]:
+        # Build a dict from a YAML mapping node, exiting on a repeated key
+        # instead of letting the later value silently replace the earlier one.
+        result: dict[Any, Any] = {}
+        for key_node, value_node in node.value:
+            key = loader.construct_object(key_node, deep=deep)
+            if key in result:
+                # start_mark.line is zero-based; report a one-based line.
+                raise SystemExit(f"duplicate YAML key {key!r} at line {key_node.start_mark.line + 1}")
+            result[key] = loader.construct_object(value_node, deep=deep)
+        return result
+
+    # Route every default-tagged mapping through the duplicate check.
+    _UniqueKeyLoader.add_constructor(
+        yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, _unique_mapping
+    )
+
+
+def _load(name: str) -> dict[str, Any]:
+    """Load ``configs/<name>`` as a YAML mapping with duplicate keys rejected.
+
+    The file is decoded as UTF-8 rather than with the locale default, so the
+    parsed configuration does not depend on the runner's environment.
+
+    Raises:
+        SystemExit: PyYAML is unavailable, the file is not valid YAML, or the
+            top-level value is not a mapping.
+    """
+    if yaml is None:
+        raise SystemExit("matrix generation requires PyYAML; shard extraction does not")
+    try:
+        with (HERE / "configs" / name).open(encoding="utf-8") as fh:
+            document = yaml.load(fh, Loader=_UniqueKeyLoader)
+    except yaml.YAMLError as exc:
+        raise SystemExit(f"configs/{name} is not valid YAML: {exc}") from exc
+    if not isinstance(document, dict):
+        raise SystemExit(f"configs/{name} must contain a YAML object")
+    return document
+
+
+def _workload_registry(workloads: dict[str, Any]) -> dict[str, dict[str, Any]]:
+    """Merge the ``synthetic`` and ``model_derived`` sections into one mapping.
+
+    A missing or empty section contributes nothing. When both sections define
+    the same name, the ``model_derived`` entry wins.
+    """
+    merged: dict[str, dict[str, Any]] = {}
+    for section in ("synthetic", "model_derived"):
+        for name, cfg in (workloads.get(section) or {}).items():
+            merged[name] = cfg
+    return merged
+
+
+def _fields(value: Any, path: str, allowed: set[str], required: set[str]) -> dict[str, Any]:
+    """Return ``value`` after checking it is a mapping with the right keys.
+
+    Raises:
+        SystemExit: ``value`` is not a dict, a key is not a string, a key is
+            outside ``allowed``, or a ``required`` key is absent.
+    """
+    if not isinstance(value, dict):
+        raise SystemExit(f"{path} must be an object")
+    if not all(isinstance(key, str) for key in value):
+        raise SystemExit(f"{path} field names must be strings")
+    present = set(value)
+    unknown = present - allowed
+    missing = required - present
+    if unknown or missing:
+        raise SystemExit(f"{path} fields: unknown={sorted(unknown)}, missing={sorted(missing)}")
+    return value
+
+
+def _list(value: Any, path: str, item_type: type, allowed: set[Any] | None = None) -> list[Any]:
+    """Return ``value`` after checking it is a non-empty list of unique items.
+
+    Every item must have exactly the type ``item_type`` (subclasses such as
+    ``bool`` for ``int`` are rejected) and, when ``allowed`` is given, must be
+    a member of it.
+
+    Raises:
+        SystemExit: Any of the checks fails.
+    """
+    acceptable = (
+        isinstance(value, list)
+        and bool(value)
+        and all(type(item) is item_type for item in value)
+        and len(set(value)) == len(value)
+        and (allowed is None or all(item in allowed for item in value))
+    )
+    if not acceptable:
+        raise SystemExit(f"{path} must be a non-empty unique list of valid {item_type.__name__}s")
+    return value
+
+
+def validate_config_documents(
+    suites_document: dict[str, Any], workloads: dict[str, Any]
+) -> None:
+    """Reject configuration that is ambiguous, unused, or outside the v1 grid.
+
+    Args:
+        suites_document: Parsed ``configs/suites.yaml``.
+        workloads: Parsed ``configs/workloads.yaml``.
+
+    Raises:
+        SystemExit: Any field, workload, or suite violates the checks below,
+            or a suite differs from its ``V1_SUITE_CONTRACTS`` entry.
+    """
+    _fields(
+        suites_document, "configs/suites.yaml",
+        {"schema_version", "suites"}, {"schema_version", "suites"},
+    )
+    _fields(
+        workloads, "configs/workloads.yaml",
+        {"schema_version", "synthetic", "model_derived"}, {"schema_version"},
+    )
+    # An exact type check keeps ``True`` from passing as schema version 1.
+    if type(suites_document["schema_version"]) is not int or suites_document["schema_version"] != 1:
+        raise SystemExit("configs/suites.yaml schema_version must be integer 1")
+    if type(workloads["schema_version"]) is not int or workloads["schema_version"] != 1:
+        raise SystemExit("configs/workloads.yaml schema_version must be integer 1")
+    # Workload names share one namespace across both sections.
+    registry: dict[str, dict[str, Any]] = {}
+    for section, expert_field in (
+        ("synthetic", "experts"),
+        ("model_derived", "routed_experts"),
+    ):
+        entries = workloads.get(section, {})
+        if not isinstance(entries, dict):
+            raise SystemExit(f"workloads.{section} must be an object")
+        for name, value in entries.items():
+            if not isinstance(name, str) or not IDENTIFIER.fullmatch(name) or name in registry:
+                raise SystemExit(f"workloads.{section} has invalid or duplicate name {name!r}")
+            fields = {"hidden", "topk", expert_field, "verified_against"}
+            config = _fields(value, f"workload {name}", fields, fields - {"verified_against"})
+            # dimensions is [hidden, topk, expert count].
+            dimensions = [config[key] for key in ("hidden", "topk", expert_field)]
+            if any(type(item) is not int or item <= 0 for item in dimensions):
+                raise SystemExit(f"workload {name} dimensions must be positive integers")
+            if dimensions[1] > dimensions[2]:
+                raise SystemExit(f"workload {name}.topk exceeds its expert count")
+            source = config.get("verified_against")
+            if source is not None and (not isinstance(source, str) or not source.strip()):
+                raise SystemExit(f"workload {name}.verified_against must be a non-empty string")
+            registry[name] = config
+    if not registry:
+        raise SystemExit("configs/workloads.yaml must define at least one workload")
+
+    suites = suites_document["suites"]
+    if not isinstance(suites, dict) or not suites:
+        raise SystemExit("configs/suites.yaml suites must be a non-empty object")
+    referenced: set[str] = set()
+    for name, value in suites.items():
+        if not isinstance(name, str) or not IDENTIFIER.fullmatch(name):
+            raise SystemExit(f"invalid suite name {name!r}")
+        suite = _fields(value, f"suite {name}", SUITE_FIELDS, SUITE_REQUIRED)
+        contract = V1_SUITE_CONTRACTS.get(name)
+        if contract is None:
+            raise SystemExit(f"suite {name} is outside the frozen v1 catalog")
+        mode = suite["mode"]
+        if mode not in identity.V1_CASE_PROFILES or mode != contract["mode"]:
+            raise SystemExit(f"suite {name}.mode differs from the frozen v1 catalog")
+        # A suite that omits "backends" is validated against all sweep backends.
+        suite_backends = _list(
+            suite.get("backends", list(cap.SWEEP_BACKENDS)),
+            f"suite {name}.backends",
+            str,
+            set(cap.SWEEP_BACKENDS),
+        )
+        # "backends" must match exactly when the catalog pins a set and must
+        # be absent when it does not.
+        expected_backends = contract.get("backends")
+        if expected_backends is not None and set(suite_backends) != expected_backends:
+            raise SystemExit(f"suite {name}.backends differs from the frozen v1 catalog")
+        if expected_backends is None and "backends" in suite:
+            raise SystemExit(f"suite {name}.backends must be omitted")
+        suite_workloads = _list(suite["workloads"], f"suite {name}.workloads", str)
+        unknown = sorted(set(suite_workloads) - set(registry))
+        if unknown:
+            raise SystemExit(f"suite {name}: unknown workloads {unknown}")
+        referenced.update(suite_workloads)
+        platforms = _list(
+            suite["platforms"], f"suite {name}.platforms", str, set(cap.PLATFORMS)
+        )
+        phases = _list(suite["phases"], f"suite {name}.phases", str, {"decode", "prefill"})
+        routings = _list(suite["routings"], f"suite {name}.routings", str, {"uniform", "zipf"})
+        eplb = _list(suite.get("eplb", [False]), f"suite {name}.eplb", bool)
+        if True in eplb and routings != ["zipf"]:
+            raise SystemExit(f"suite {name}: EPLB is only valid for Zipf routing")
+        if suite["required_publication"] not in {"official", "comparable-experimental"}:
+            raise SystemExit(f"suite {name}.required_publication is invalid")
+        if suite["required_publication"] == "official":
+            unverified = [item for item in suite_workloads if not registry[item].get("verified_against")]
+            if unverified:
+                raise SystemExit(f"suite {name}: official workloads need verified_against: {unverified}")
+        degrees = _list(suite["ep_degrees"], f"suite {name}.ep_degrees", int)
+        if degrees != [8, 16]:
+            raise SystemExit(f"suite {name}.ep_degrees must be exactly [8, 16]")
+        for platform in platforms:
+            if not set(degrees).issubset(cap.PLATFORMS[platform]["ep_degrees"]):
+                raise SystemExit(f"suite {name}: invalid EP degree for {platform}")
+        # A ladder for a phase the suite does not run can never be used.
+        for phase in {"decode", "prefill"} - set(phases):
+            if f"token_points_{phase}" in suite:
+                raise SystemExit(f"suite {name}.token_points_{phase} is unreachable")
+        # The shared ladder is dead when every phase has its own ladder.
+        if "token_points" in suite and all(
+            f"token_points_{phase}" in suite for phase in phases
+        ):
+            raise SystemExit(f"suite {name}.token_points is unreachable")
+        # Called for its validation side effect; the result is discarded here.
+        for phase in phases:
+            _ladder(suite, phase)
+        coordinates = {
+            (mode, phase, routing, enabled)
+            for phase, routing, enabled in itertools.product(phases, routings, eplb)
+        }
+        if coordinates != contract["coordinates"] or any(
+            tuple(map(int, _ladder(suite, phase).split())) != contract["ladders"][phase]
+            for phase in phases
+        ):
+            raise SystemExit(f"suite {name} coordinates differ from the frozen v1 catalog")
+    unused = sorted(set(registry) - referenced)
+    if unused:
+        raise SystemExit(f"unreferenced workloads: {unused}")
+
+
+def _dims(workloads: dict[str, Any], name: str) -> tuple[int, int, int]:
+    """Return (hidden, topk, experts) for the workload called ``name``.
+
+    The expert count comes from ``experts`` when that key is present and from
+    ``routed_experts`` otherwise.
+    """
+    config = _workload_registry(workloads)[name]
+    hidden = config.get("hidden")
+    topk = config.get("topk")
+    experts = config.get("experts", config.get("routed_experts"))
+    return (hidden, topk, experts)  # type: ignore[return-value]
+
+
+def _ladder(suite: dict[str, Any], phase: str) -> str:
+    """Return the token ladder for ``phase`` as a space-separated string.
+
+    The phase-specific ``token_points_<phase>`` wins over the shared
+    ``token_points``; when neither gives a value, the harness default for
+    the phase is used. A ladder must be a non-empty list of positive,
+    strictly increasing integers.
+
+    Raises:
+        SystemExit: The ladder is not valid.
+    """
+    points = suite.get(f"token_points_{phase}", suite.get("token_points"))
+    if points is None:
+        if phase == "decode":
+            points = ep_harness.DECODE_LADDER
+        else:
+            points = ep_harness.PREFILL_LADDER
+    well_formed = (
+        isinstance(points, list)
+        and bool(points)
+        and all(
+            not isinstance(point, bool) and isinstance(point, int) and point > 0
+            for point in points
+        )
+        and points == sorted(set(points))
+    )
+    if not well_formed:
+        raise SystemExit(f"invalid {phase} token ladder: {points!r}")
+    return " ".join(str(point) for point in points)
+
+
+def _v1_requested_ladder(case: dict[str, Any]) -> str:
+    """Return the frozen v1 ladder for ``case`` as a space-separated string.
+
+    The case must name a catalog suite, use one of that suite's
+    (mode, phase, routing, eplb) coordinates, carry the suite's publication
+    tier, and match ``V1_WORKLOAD``.
+
+    Raises:
+        MatrixError: The case differs from the catalog in any of those ways.
+    """
+    contract = V1_SUITE_CONTRACTS.get(case.get("suite"))
+    if contract is not None:
+        coordinate = (
+            case.get("mode"), case.get("phase"), case.get("routing"), case.get("eplb")
+        )
+        workload = (
+            case.get("workload"), case.get("hidden"), case.get("topk"), case.get("experts")
+        )
+        if (
+            coordinate in contract["coordinates"]
+            and case.get("required_publication") == contract["publication"]
+            and workload == V1_WORKLOAD
+        ):
+            return " ".join(str(point) for point in contract["ladders"][case["phase"]])
+    raise MatrixError("case differs from the frozen v1 suite/workload catalog")
+
+
+def _expected_disposition(
+    sku: str, case: dict[str, Any]
+) -> tuple[str, str | None, str | None]:
+    """Return the (disposition, reason, detail) expected for ``case`` on ``sku``.
+
+    The result is ``("runnable", None, None)`` when the capability resolver
+    accepts the case, and ``("unsupported", "backend-platform-unsupported",
+    detail)`` otherwise.
+
+    Raises:
+        MatrixError: The case is outside the frozen catalog or its ladder
+            differs from the catalog ladder.
+    """
+    requested = _v1_requested_ladder(case)
+    supported, detail = cap.resolve(
+        sku, case["backend"], ep=case["ep"], nodes=case["nodes"],
+        routing=case["routing"], eplb=case["eplb"], mode=case["mode"],
+    )
+    if case["ladder"] != requested:
+        subject = "case ladder" if supported else "unsupported case ladder"
+        raise MatrixError(f"{subject} differs from the frozen v1 suite catalog")
+    if supported:
+        return "runnable", None, None
+    return "unsupported", "backend-platform-unsupported", detail
+
+
+def _case_id(sku: str, case: dict[str, Any]) -> str:
+    """Return the identity module's case id for ``case`` on ``sku``."""
+    profile = identity.profile_for_case(case)
+    return identity.case_id(sku=sku, profile=profile, case=case)
+
+
+def _semantic_points(sku: str, case: dict[str, Any]) -> list[str]:
+    """Return one canonical JSON signature per ladder point of ``case``.
+
+    A signature combines the SKU, the token count, and every case field
+    except the labelling fields (canonical, case_id, ladder,
+    required_publication, suite, workload). Keys are sorted and separators
+    are compact.
+    """
+    labelling = {
+        "canonical", "case_id", "ladder", "required_publication", "suite", "workload",
+    }
+    execution = {key: value for key, value in case.items() if key not in labelling}
+    signatures: list[str] = []
+    for point in case["ladder"].split():
+        record = {"sku": sku, "tokens_per_rank": int(point), **execution}
+        signatures.append(json.dumps(record, sort_keys=True, separators=(",", ":")))
+    return signatures
+
+
+def _select_backends(backend: str, backends: str) -> list[str]:
+    """Resolve the ``--backend`` / ``--backends`` options into backend names.
+
+    ``backends`` is either ``"all"`` or a comma-separated list; ``backend``
+    names a single backend. With neither given the selection is
+    ``["deepep"]``.
+
+    Raises:
+        SystemExit: Both options are given, the selection is empty (for
+            example ``--backends ","``), a name is unknown, or a name repeats.
+    """
+    available = list(cap.SWEEP_BACKENDS)
+    if backend and backends:
+        raise SystemExit("--backend and --backends are mutually exclusive")
+    if backends:
+        names = available if backends == "all" else [
+            value.strip() for value in backends.split(",") if value.strip()
+        ]
+    else:
+        names = [backend or "deepep"]
+    # Reject an empty selection, as the suite selection does, instead of
+    # silently resolving to an empty matrix.
+    if not names:
+        raise SystemExit("backend selection must be non-empty")
+    unknown = sorted(set(names) - set(available))
+    if unknown:
+        raise SystemExit(f"unknown backend values {unknown}; have {available}")
+    if len(names) != len(set(names)):
+        raise SystemExit("backend selection contains duplicates")
+    return names
+
+
+def resolve_matrix(
+ suites: str = "all",
+ backend: str = "",
+ backends: str = "",
+ only_sku: str = "",
+ min_nodes: int = 0,
+ max_nodes: int = 0,
+ max_cases: int = 128,
+) -> dict[str, Any]:
+ """Resolve suite configuration into allocation-sized workflow shards."""
+ if max_cases <= 0:
+ raise SystemExit("--max-cases must be positive")
+ if min_nodes < 0 or max_nodes < 0 or (min_nodes and max_nodes and min_nodes > max_nodes):
+ raise SystemExit("invalid node bounds")
+ if only_sku and only_sku not in cap.PLATFORMS:
+ raise SystemExit(f"unknown --only-sku {only_sku!r}; have {sorted(cap.PLATFORMS)}")
+
+ workloads = _load("workloads.yaml")
+ suites_document = _load("suites.yaml")
+ validate_config_documents(suites_document, workloads)
+ registry = suites_document["suites"]
+ names = list(registry) if suites == "all" else [
+ value.strip() for value in suites.split(",") if value.strip()
+ ]
+ if not names or len(names) != len(set(names)):
+ raise SystemExit("suite selection must be non-empty and unique")
+ unknown = sorted(set(names) - set(registry))
+ if unknown:
+ raise SystemExit(f"unknown suites {unknown}; have {sorted(registry)}")
+ targets = _select_backends(backend, backends)
+
+ shards: dict[tuple[str, str, int], list[dict[str, Any]]] = {}
+ requested_cases: list[dict[str, Any]] = []
+ scheduled: set[str] = set()
+ for suite_name in names:
+ suite = registry[suite_name]
+ mode = suite["mode"]
+ phases = suite["phases"]
+ routings = suite["routings"]
+ eplb_values = suite.get("eplb", [False])
+ suite_backends = set(suite.get("backends", cap.SWEEP_BACKENDS))
+ suite_targets = [target for target in targets if target in suite_backends]
+ if not suite_targets:
+ continue
+ for platform_name in suite["platforms"]:
+ if only_sku and platform_name != only_sku:
+ continue
+ ep_degrees = suite["ep_degrees"]
+ for workload, ep, phase, routing, eplb, target in itertools.product(
+ suite["workloads"], ep_degrees, phases, routings, eplb_values, suite_targets
+ ):
+ topology = cap.topology_for(platform_name, ep)
+ if topology is None:
+ raise SystemExit(
+ f"suite {suite_name}: {platform_name} EP{ep} is not registered"
+ )
+ nodes = int(topology["nodes"])
+ if min_nodes and nodes < min_nodes:
+ continue
+ if max_nodes and nodes > max_nodes:
+ continue
+ ok, capability_detail = cap.resolve(
+ platform_name,
+ target,
+ ep=ep,
+ nodes=nodes,
+ routing=routing,
+ eplb=bool(eplb),
+ mode=mode,
+ )
+ hidden, topk, experts = _dims(workloads, workload)
+
+ def add_case(
+ case_ladder: str,
+ disposition: str,
+ reason: str | None,
+ detail: str | None,
+ ) -> None:
+ case: dict[str, Any] = {
+ "suite": suite_name,
+ "workload": workload,
+ "required_publication": suite["required_publication"],
+ "backend": target,
+ "routing": routing,
+ "phase": phase,
+ "ep": ep,
+ "eplb": eplb,
+ "hidden": hidden,
+ "topk": topk,
+ "experts": experts,
+ "samples_per_point": ep_harness.TIMED_SAMPLES_PER_POINT,
+ "warmup_semantics": ep_harness.WARMUP_SEMANTICS,
+ "ladder": case_ladder,
+ "mode": mode,
+ "timing": EP_TIMING_PROFILE,
+ "canonical": True,
+ **{field: topology[field] for field in TOPOLOGY_FIELDS},
+ }
+ for signature in _semantic_points(platform_name, case):
+ if signature in scheduled:
+ raise SystemExit(
+ f"suite {suite_name}: duplicate semantic point for {platform_name}"
+ )
+ scheduled.add(signature)
+ case["case_id"] = _case_id(platform_name, case)
+ requested_cases.append(
+ {
+ "sku": platform_name,
+ "case": case,
+ "disposition": disposition,
+ "reason": reason,
+ "detail": detail,
+ }
+ )
+ if disposition == "runnable":
+ shards.setdefault((platform_name, target, nodes), []).append(case)
+
+ requested_ladder = _ladder(suite, phase)
+ if not ok:
+ add_case(
+ requested_ladder,
+ "unsupported",
+ "backend-platform-unsupported",
+ capability_detail,
+ )
+ continue
+ add_case(requested_ladder, "runnable", None, None)
+
+ shards_by_sku: dict[str, list[dict[str, Any]]] = {}
+ for (sku, target, nodes), cases in sorted(shards.items()):
+ chunk_size = max_cases
+ for offset in range(0, len(cases), chunk_size):
+ chunk = cases[offset:offset + chunk_size]
+ part = offset // chunk_size
+ shard_id = f"{sku}-{target}-n{nodes}"
+ if len(cases) > chunk_size:
+ shard_id += f"-p{part}"
+ shards_by_sku.setdefault(sku, []).append({
+ "id": shard_id,
+ "sku": sku,
+ "backend": target,
+ "launcher": cap.PLATFORMS[sku]["launcher"],
+ **{field: chunk[0][field] for field in TOPOLOGY_FIELDS},
+ "n": len(chunk),
+ "case_ids": [case["case_id"] for case in chunk],
+ })
+ include = [
+ shards_by_sku[sku][round_index]
+ for round_index in range(max(map(len, shards_by_sku.values()), default=0))
+ for sku in sorted(shards_by_sku)
+ if round_index < len(shards_by_sku[sku])
+ ]
+ return {
+ "format": "collectivex.matrix.v1",
+ "schema_version": 1,
+ "requested_cases": requested_cases,
+ "include": include,
+ }
+
+
+def _strict_json_load(path: Path) -> Any:
+    """Load a JSON file strictly, reporting every failure as ``MatrixError``.
+
+    The path must be an existing, non-empty regular file. Non-finite number
+    constants (``NaN``, ``Infinity``, ``-Infinity``) and duplicate object keys
+    are rejected. The file is always decoded as UTF-8 so the result does not
+    depend on the locale of the process that runs the resolver.
+
+    Args:
+        path: location of the JSON document.
+
+    Returns:
+        The decoded JSON value.
+
+    Raises:
+        MatrixError: if the file is missing or empty, cannot be read or
+            decoded, is not valid JSON, is nested too deeply to parse, or
+            contains a non-finite constant or a duplicate key.
+    """
+    def reject_constant(value: str) -> None:
+        raise MatrixError(f"non-finite JSON number {value}")
+
+    def reject_duplicates(pairs: list[tuple[str, Any]]) -> dict[str, Any]:
+        result: dict[str, Any] = {}
+        for key, value in pairs:
+            if key in result:
+                raise MatrixError(f"duplicate JSON key {key!r}")
+            result[key] = value
+        return result
+
+    if not path.is_file():
+        raise MatrixError(f"matrix does not exist: {path}")
+    if path.stat().st_size == 0:
+        raise MatrixError(f"matrix is empty: {path}")
+    try:
+        with path.open(encoding="utf-8") as fh:
+            return json.load(
+                fh, parse_constant=reject_constant, object_pairs_hook=reject_duplicates
+            )
+    except MatrixError:
+        # Raised by the hooks above; keep their precise message untouched.
+        raise
+    except (OSError, ValueError, RecursionError) as exc:
+        # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
+        # RecursionError is what the decoder raises for pathological nesting.
+        raise MatrixError(f"matrix is not valid JSON: {exc}") from exc
+
+
+def _positive_int(value: Any, field: str) -> int:
+    """Return ``value`` unchanged when it is a strictly positive ``int``.
+
+    The check is on the exact type, so ``bool`` and ``int`` subclasses are
+    rejected. ``field`` is used only to label the error message.
+
+    Raises:
+        MatrixError: if ``value`` is not exactly an ``int`` or is not positive.
+    """
+    is_exact_int = type(value) is int
+    if not (is_exact_int and value > 0):
+        raise MatrixError(f"{field} must be a positive integer")
+    return value
+
+
+def validate_shard_control(
+    shard: dict[str, Any],
+    *,
+    sku: str,
+    backend: str,
+    nodes: int,
+    require_runnable: bool = True,
+) -> None:
+    """Validate one shard against the workflow cell that requested it.
+
+    Args:
+        shard: decoded shard control document.
+        sku: platform the requesting cell expects; must be registered in
+            ``cap.PLATFORMS``.
+        backend: backend the requesting cell expects; must be listed in
+            ``cap.SWEEP_BACKENDS``.
+        nodes: node count the requesting cell expects.
+        require_runnable: when true, every case must resolve to the
+            ``"runnable"`` disposition; when false, the case is passed to
+            ``_v1_requested_ladder`` instead.
+
+    Raises:
+        MatrixError: on the first violation found. Checks run in a fixed
+            order, so the reported error is the earliest failing check.
+    """
+    if not isinstance(shard, dict):
+        raise MatrixError("shard must be a JSON object")
+    if sku not in cap.PLATFORMS or backend not in cap.SWEEP_BACKENDS:
+        raise MatrixError("shard platform/backend is not registered")
+    # The shard envelope is closed: exactly these keys, schema_version exactly int 1.
+    top_fields = {"schema_version", "id", "sku", "backend", "nodes", "n", "cases"}
+    if (
+        set(shard) != top_fields
+        or type(shard.get("schema_version")) is not int
+        or shard["schema_version"] != 1
+    ):
+        raise MatrixError("shard fields or schema version differ from v1 contract")
+    if not isinstance(shard.get("id"), str) or not IDENTIFIER.fullmatch(shard["id"]):
+        raise MatrixError("shard has invalid id")
+    # The shard must describe the same cell the caller asked for.
+    for field, expected in (("sku", sku), ("backend", backend)):
+        if shard.get(field) != expected:
+            raise MatrixError(
+                f"shard {field} mismatch: expected {expected!r}, got {shard.get(field)!r}"
+            )
+    if _positive_int(shard.get("nodes"), "shard.nodes") != nodes:
+        raise MatrixError(
+            f"shard nodes mismatch: expected {nodes}, got {shard.get('nodes')!r}"
+        )
+    cases = shard.get("cases")
+    if not isinstance(cases, list) or not cases:
+        raise MatrixError("shard must contain at least one case")
+    if _positive_int(shard.get("n"), "shard.n") != len(cases):
+        raise MatrixError("shard.n does not match the number of cases")
+    # case_ids already seen in this shard, used to reject duplicates.
+    seen: set[str] = set()
+    # Each case is a closed record: these keys plus the topology fields, nothing else.
+    required = {
+        "case_id", "suite", "workload", "required_publication", "backend", "routing",
+        "mode", "phase", "ep", "eplb", "hidden", "topk", "experts",
+        "samples_per_point",
+        "warmup_semantics", "ladder", "timing", "canonical",
+    } | set(TOPOLOGY_FIELDS)
+    for index, case in enumerate(cases):
+        if not isinstance(case, dict):
+            raise MatrixError(f"case {index} must be a JSON object")
+        fields = set(case)
+        if fields != required:
+            raise MatrixError(
+                f"case {index} fields differ from v1 contract: "
+                f"missing={sorted(required - fields)}, extra={sorted(fields - required)}"
+            )
+        case_id = case["case_id"]
+        if not identity.is_typed_id(case_id, "case"):
+            raise MatrixError(f"case {index} has invalid case_id")
+        if case_id in seen:
+            raise MatrixError(f"duplicate case_id {case_id}")
+        seen.add(case_id)
+        # String-typed fields must be non-empty strings ...
+        for field in (
+            "suite", "workload", "required_publication", "backend", "mode", "routing",
+            "phase", "warmup_semantics", "ladder", "timing",
+        ):
+            if not isinstance(case[field], str) or not case[field]:
+                raise MatrixError(f"case {index}.{field} must be a non-empty string")
+        # ... and this subset must additionally match the IDENTIFIER pattern.
+        for field in ("suite", "workload", "required_publication", "backend", "routing", "phase"):
+            if not IDENTIFIER.fullmatch(case[field]):
+                raise MatrixError(f"case {index}.{field} is not a safe identifier")
+        if case["required_publication"] not in {"official", "comparable-experimental"}:
+            raise MatrixError(f"case {index} has invalid publication requirement")
+        # The case_id must be reproducible from the case contents (everything but case_id).
+        case_identity = {key: value for key, value in case.items() if key != "case_id"}
+        if case_id != _case_id(sku, case_identity):
+            raise MatrixError(f"case {index} case_id does not match its contents")
+        if case["backend"] != backend:
+            raise MatrixError(f"case {index} backend does not match shard")
+        if case["mode"] not in identity.V1_CASE_PROFILES:
+            raise MatrixError(f"case {index} mode is invalid")
+        if _positive_int(case["nodes"], f"case {index}.nodes") != nodes:
+            raise MatrixError(f"case {index} nodes does not match shard")
+        ep = _positive_int(case["ep"], f"case {index}.ep")
+        gpus_per_node = _positive_int(
+            case["gpus_per_node"], f"case {index}.gpus_per_node"
+        )
+        # Every topology field must equal what the platform registry reports for this EP degree.
+        topology = cap.topology_for(sku, ep)
+        if topology is None or any(case[field] != topology[field] for field in TOPOLOGY_FIELDS):
+            raise MatrixError(f"case {index} differs from the platform registry")
+        if ep != nodes * gpus_per_node:
+            raise MatrixError(f"case {index} ep does not equal nodes * gpus_per_node")
+        # Sampling, timing and warmup settings are pinned to the harness constants.
+        if case["samples_per_point"] != ep_harness.TIMED_SAMPLES_PER_POINT:
+            raise MatrixError(f"case {index} violates fixed-512-v1")
+        if case["timing"] != EP_TIMING_PROFILE:
+            raise MatrixError(f"case {index} has invalid timing profile")
+        if case["warmup_semantics"] != ep_harness.WARMUP_SEMANTICS:
+            raise MatrixError(f"case {index} has invalid warmup semantics")
+        if case["phase"] not in {"decode", "prefill"}:
+            raise MatrixError(f"case {index} has invalid phase")
+        if case["routing"] not in {"uniform", "zipf"}:
+            raise MatrixError(f"case {index} has invalid routing")
+        # EPLB must be a real bool and may only be enabled together with zipf routing.
+        if not isinstance(case["eplb"], bool) or (case["eplb"] and case["routing"] != "zipf"):
+            raise MatrixError(f"case {index} has invalid EPLB setting")
+        if not isinstance(case["canonical"], bool) or not case["canonical"]:
+            raise MatrixError(f"case {index} must use a canonical workload")
+        # All numeric dimensions must be positive integers; bool is rejected explicitly.
+        for field in ("ep", "nodes", "gpus_per_node", "hidden", "topk", "experts",
+                      "samples_per_point", "scale_up_domain"):
+            if isinstance(case[field], bool) or not isinstance(case[field], int):
+                raise MatrixError(f"case {index}.{field} must be an integer")
+            _positive_int(case[field], f"case {index}.{field}")
+        scale_up_domain = _positive_int(
+            case["scale_up_domain"], f"case {index}.scale_up_domain"
+        )
+        # An EP group that fits in one scale-up domain is "scale-up"; otherwise it is
+        # "scale-out" and must span a whole number of scale-up domains.
+        expected_scope = "scale-up" if ep <= scale_up_domain else "scale-out"
+        if case["scope"] != expected_scope or (
+            expected_scope == "scale-out" and ep % scale_up_domain
+        ):
+            raise MatrixError(f"case {index} has invalid scale-up/scale-out geometry")
+        # The ladder is a whitespace-separated list of integers.
+        try:
+            ladder = [int(value) for value in case["ladder"].split()]
+        except (AttributeError, ValueError) as exc:
+            raise MatrixError(f"case {index} has invalid token ladder") from exc
+        # It must be non-empty, positive, strictly increasing without duplicates, and
+        # spelled canonically (re-joining the parsed integers reproduces the string).
+        if (not ladder or any(value <= 0 for value in ladder)
+                or ladder != sorted(set(ladder))
+                or case["ladder"] != " ".join(map(str, ladder))):
+            raise MatrixError(f"case {index} has invalid token ladder")
+        if require_runnable:
+            # Only the first two elements of the expected disposition triple are used here.
+            disposition, reason, _ = _expected_disposition(sku, case)
+            if disposition != "runnable":
+                raise MatrixError(f"case {index} violates capability registry: {reason}")
+        else:
+            # NOTE(review): return value is discarded; presumably called so it can raise
+            # when the ladder is not the v1 requested ladder -- confirm against its definition.
+            _v1_requested_ladder(case)
+
+
+def validate_matrix_document(document: Any) -> dict[str, Any]:
+    """Validate the complete requested grid and its runnable shard partition.
+
+    The document must carry exactly ``format``, ``schema_version``,
+    ``requested_cases`` and ``include``. Every requested case is validated as
+    a one-case shard, must be unique by ``case_id`` and by semantic token
+    point, and its disposition/reason/detail must equal what
+    ``_expected_disposition`` returns. Every ``include`` shard must reference
+    only runnable requested cases with matching coordinates, and the shards
+    together must cover each runnable case exactly once.
+
+    Values decoded from JSON may be arrays or objects where a string is
+    expected; those are type-checked before any membership test or ``set()``
+    call so malformed input is reported as ``MatrixError`` rather than an
+    unhashable-type ``TypeError``.
+
+    Args:
+        document: decoded matrix JSON value.
+
+    Returns:
+        ``document`` itself, unchanged, once it has passed every check.
+
+    Raises:
+        MatrixError: on the first violation found.
+    """
+    if not isinstance(document, dict) or set(document) != {
+        "format", "schema_version", "requested_cases", "include"
+    }:
+        raise MatrixError("matrix fields differ from the v1 contract")
+    if (
+        document["format"] != "collectivex.matrix.v1"
+        or type(document["schema_version"]) is not int
+        or document["schema_version"] != 1
+    ):
+        raise MatrixError("matrix format/schema differs from v1")
+    requested = document["requested_cases"]
+    include = document["include"]
+    if not isinstance(requested, list) or not requested:
+        raise MatrixError("matrix.requested_cases must be non-empty")
+    if not isinstance(include, list):
+        raise MatrixError("matrix.include must be an array")
+
+    # Requested-case wrappers by case_id, the runnable subset, and every
+    # semantic token point seen so far (for duplicate detection).
+    cases_by_id: dict[str, dict[str, Any]] = {}
+    runnable_ids: set[str] = set()
+    semantic_points: set[str] = set()
+    for index, value in enumerate(requested):
+        path = f"matrix.requested_cases[{index}]"
+        if not isinstance(value, dict) or set(value) != {
+            "sku", "case", "disposition", "reason", "detail"
+        }:
+            raise MatrixError(f"{path} fields differ from the v1 contract")
+        sku = value["sku"]
+        case = value["case"]
+        disposition = value["disposition"]
+        # isinstance first: a JSON array/object here is unhashable and would
+        # make the membership test raise TypeError.
+        if not isinstance(sku, str) or sku not in cap.PLATFORMS:
+            raise MatrixError(f"{path}.sku is unknown")
+        if not isinstance(disposition, str) or disposition not in {"runnable", "unsupported"}:
+            raise MatrixError(f"{path}.disposition is invalid")
+        if disposition == "runnable":
+            if value["reason"] is not None or value["detail"] is not None:
+                raise MatrixError(f"{path} runnable cases cannot have a reason")
+        else:
+            if (
+                not isinstance(value["reason"], str)
+                or not IDENTIFIER.fullmatch(value["reason"])
+                or not isinstance(value["detail"], str)
+                or not value["detail"]
+            ):
+                raise MatrixError(f"{path} unsupported cases need a public reason and detail")
+        if not isinstance(case, dict):
+            raise MatrixError(f"{path}.case must be an object")
+        backend = case.get("backend")
+        nodes = case.get("nodes")
+        if not isinstance(backend, str) or type(nodes) is not int:
+            raise MatrixError(f"{path}.case backend/nodes are invalid")
+        # Reuse the shard validator by wrapping the case in a synthetic
+        # single-case shard that claims the case's own coordinates.
+        validate_shard_control(
+            {
+                "schema_version": 1,
+                "id": "requested-case",
+                "sku": sku,
+                "backend": backend,
+                "nodes": nodes,
+                "n": 1,
+                "cases": [case],
+            },
+            sku=sku,
+            backend=backend,
+            nodes=nodes,
+            require_runnable=disposition == "runnable",
+        )
+        case_id = case["case_id"]
+        if case_id in cases_by_id:
+            raise MatrixError(f"duplicate requested case_id {case_id}")
+        for signature in _semantic_points(sku, case):
+            if signature in semantic_points:
+                raise MatrixError(f"{path} duplicates a semantic token point")
+            semantic_points.add(signature)
+        cases_by_id[case_id] = value
+        # The recorded disposition triple must be exactly what the catalog derives.
+        expected = _expected_disposition(sku, case)
+        if (disposition, value["reason"], value["detail"]) != expected:
+            raise MatrixError(f"{path} disposition differs from the frozen v1 catalog")
+        if disposition == "runnable":
+            runnable_ids.add(case_id)
+
+    shard_ids: set[str] = set()
+    # Every case_id referenced by any shard, in order, kept as a list so
+    # double assignment across shards can be detected afterwards.
+    assigned: list[str] = []
+    for index, shard in enumerate(include):
+        path = f"matrix.include[{index}]"
+        expected = {
+            "id", "sku", "backend", "launcher", "n", "case_ids",
+        } | set(TOPOLOGY_FIELDS)
+        if not isinstance(shard, dict) or set(shard) != expected:
+            raise MatrixError(f"{path} fields differ from the v1 contract")
+        shard_id = shard["id"]
+        if not isinstance(shard_id, str) or not IDENTIFIER.fullmatch(shard_id):
+            raise MatrixError(f"{path}.id is invalid")
+        if shard_id in shard_ids:
+            raise MatrixError(f"duplicate shard id {shard_id}")
+        shard_ids.add(shard_id)
+        sku = shard["sku"]
+        if not isinstance(sku, str) or sku not in cap.PLATFORMS:
+            raise MatrixError(f"{path}.sku is unknown")
+        platform = cap.PLATFORMS[sku]
+        if shard["launcher"] != platform["launcher"]:
+            raise MatrixError(f"{path}.launcher differs from the platform registry")
+        case_ids = shard["case_ids"]
+        if not isinstance(case_ids, list) or not case_ids:
+            raise MatrixError(f"{path}.case_ids must be a non-empty unique array")
+        # Guard set()/dict lookups below against unhashable JSON values.
+        if not all(isinstance(item, str) for item in case_ids):
+            raise MatrixError(f"{path}.case_ids must contain only strings")
+        if len(case_ids) != len(set(case_ids)):
+            raise MatrixError(f"{path}.case_ids must be a non-empty unique array")
+        if _positive_int(shard["n"], f"{path}.n") != len(case_ids):
+            raise MatrixError(f"{path}.n differs from case_ids")
+        nodes = _positive_int(shard["nodes"], f"{path}.nodes")
+        for case_id in case_ids:
+            wrapper = cases_by_id.get(case_id)
+            if wrapper is None or wrapper["disposition"] != "runnable":
+                raise MatrixError(f"{path} references a missing or unsupported case")
+            case = wrapper["case"]
+            if (
+                wrapper["sku"] != sku
+                or case["backend"] != shard["backend"]
+                or case["nodes"] != nodes
+                or any(shard[field] != case[field] for field in TOPOLOGY_FIELDS)
+            ):
+                raise MatrixError(f"{path} case does not match shard coordinates")
+            assigned.append(case_id)
+    if len(assigned) != len(set(assigned)):
+        raise MatrixError("a runnable case is assigned to more than one shard")
+    if set(assigned) != runnable_ids:
+        raise MatrixError("runnable requested cases and shard assignments differ")
+    return document
+
+
+def extract_shard(
+    matrix_path: str | os.PathLike[str],
+    shard_id: str,
+    output_path: str | os.PathLike[str],
+    *,
+    sku: str,
+    backend: str,
+    nodes: int,
+) -> dict[str, Any]:
+    """Extract one strictly matched shard control file, writing it atomically.
+
+    The matrix is loaded and fully validated, the single ``include`` entry
+    whose id equals ``shard_id`` is expanded into a shard control document
+    (its ``case_ids`` replaced by the full requested cases), and that
+    document is validated against the expected cell before being written.
+
+    The output is written to a sibling temporary file, flushed and fsynced,
+    and then renamed over ``output_path`` so readers never observe a partial
+    file and the rename is not persisted ahead of the data.
+
+    Args:
+        matrix_path: path of the resolved matrix JSON.
+        shard_id: id of the shard to extract.
+        output_path: destination of the shard control file; parent
+            directories are created as needed.
+        sku: platform the calling cell expects.
+        backend: backend the calling cell expects.
+        nodes: node count the calling cell expects.
+
+    Returns:
+        The shard control document that was written.
+
+    Raises:
+        MatrixError: if the matrix is invalid, the shard id does not match
+            exactly one shard, or the shard does not match the expected cell.
+    """
+    document = validate_matrix_document(_strict_json_load(Path(matrix_path)))
+    include = document["include"]
+    matches = [item for item in include if isinstance(item, dict) and item.get("id") == shard_id]
+    if len(matches) != 1:
+        raise MatrixError(f"expected exactly one shard {shard_id!r}, found {len(matches)}")
+    source = matches[0]
+    requested = {
+        item["case"]["case_id"]: item
+        for item in document["requested_cases"]
+    }
+    cases = [requested[case_id]["case"] for case_id in source["case_ids"]]
+    control = {
+        "schema_version": 1,
+        "id": source.get("id"),
+        "sku": source.get("sku"),
+        "backend": source.get("backend"),
+        "nodes": source.get("nodes"),
+        "n": source.get("n"),
+        "cases": cases,
+    }
+    validate_shard_control(control, sku=sku, backend=backend, nodes=nodes)
+    output = Path(output_path)
+    output.parent.mkdir(parents=True, exist_ok=True)
+    temporary = output.with_name(f".{output.name}.tmp-{os.getpid()}")
+    try:
+        with temporary.open("w", encoding="utf-8") as fh:
+            json.dump(control, fh, allow_nan=False, sort_keys=True, separators=(",", ":"))
+            fh.write("\n")
+            # Push the bytes to disk before the rename makes them visible.
+            fh.flush()
+            os.fsync(fh.fileno())
+        os.replace(temporary, output)
+    finally:
+        # After a successful replace the temporary name is already gone.
+        temporary.unlink(missing_ok=True)
+    return control
+
+
+def emit_unsupported(
+    matrix_path: str | os.PathLike[str], output_dir: str | os.PathLike[str]
+) -> list[Path]:
+    """Materialize one strict terminal outcome for each unsupported requested case.
+
+    The matrix is loaded and validated, then every requested case whose
+    disposition is ``"unsupported"`` gets one ``unsupported_<case_id>.json``
+    file under ``output_dir`` (created if missing). Run metadata is taken
+    from environment variables and is ``None`` for any variable that is unset.
+
+    Args:
+        matrix_path: path of the resolved matrix JSON.
+        output_dir: directory that receives the terminal outcome files.
+
+    Returns:
+        Paths of the files written, in requested-case order.
+
+    Raises:
+        MatrixError: if the matrix is invalid or a recomputed case identity
+            differs from the ``case_id`` stored in the matrix.
+    """
+    source = Path(matrix_path)
+    document = validate_matrix_document(_strict_json_load(source))
+    # Digest of the raw matrix bytes; the file is read a second time here.
+    control_sha256 = hashlib.sha256(source.read_bytes()).hexdigest()
+    # One timestamp shared by every record emitted in this call.
+    generated_at = dt.datetime.now(dt.timezone.utc).isoformat()
+    git_run = {
+        "run_id": os.environ.get("GITHUB_RUN_ID"),
+        "run_attempt": os.environ.get("GITHUB_RUN_ATTEMPT"),
+        "ref": os.environ.get("GITHUB_REF_NAME") or os.environ.get("GITHUB_REF"),
+        "source_sha": os.environ.get("COLLECTIVEX_SOURCE_SHA") or os.environ.get("GITHUB_SHA"),
+        "repo": os.environ.get("GITHUB_REPOSITORY"),
+        "job": os.environ.get("GITHUB_JOB"),
+        "artifact": os.environ.get("COLLECTIVEX_ARTIFACT_NAME"),
+    }
+    # Mostly mirrors git_run, plus the execution id and a fixed runner label.
+    allocation_factors = {
+        "artifact": git_run["artifact"],
+        "execution_id": os.environ.get("COLLECTIVEX_EXECUTION_ID"),
+        "job": git_run["job"],
+        "repo": git_run["repo"],
+        "run_attempt": git_run["run_attempt"],
+        "run_id": git_run["run_id"],
+        "runner": "capability-resolver",
+        "source_sha": git_run["source_sha"],
+    }
+    destination = Path(output_dir)
+    destination.mkdir(parents=True, exist_ok=True)
+    written: list[Path] = []
+    for wrapper in document["requested_cases"]:
+        if wrapper["disposition"] != "unsupported":
+            continue
+        scheduled = wrapper["case"]
+        # The identity is computed over the case without its own case_id.
+        case = {key: value for key, value in scheduled.items() if key != "case_id"}
+        case_factors = {
+            "case": case,
+            "profile": identity.profile_for_case(case),
+            "sku": wrapper["sku"],
+        }
+        # Recompute the case id and insist it equals the one stored in the matrix.
+        case_id = identity.digest("case", case_factors)
+        if case_id != scheduled["case_id"]:
+            raise MatrixError(f"unsupported case identity differs for {scheduled['case_id']}")
+        attempt_ordinal = 1
+        record = contracts.make_terminal_document(
+            allocation_factors=allocation_factors,
+            attempt_ordinal=attempt_ordinal,
+            case=case,
+            case_factors=case_factors,
+            control_sha256=control_sha256,
+            failure_mode="capability",
+            generated_at=generated_at,
+            git_run=git_run,
+            reason=wrapper["reason"],
+            return_code=5,
+            source="matrix-capability-resolver",
+            status="unsupported",
+            expected_case_id=case_id,
+        )
+        path = destination / f"unsupported_{case_id}.json"
+        temporary = path.with_name(f".{path.name}.tmp-{os.getpid()}")
+        try:
+            # "x" creates the temporary file exclusively and fails if it already exists.
+            with temporary.open("x") as handle:
+                json.dump(record, handle, allow_nan=False, sort_keys=True, separators=(",", ":"))
+                handle.write("\n")
+                # Flush and fsync before the rename publishes the file.
+                handle.flush()
+                os.fsync(handle.fileno())
+            os.replace(temporary, path)
+        finally:
+            # No-op after a successful replace; removes the leftover on failure.
+            temporary.unlink(missing_ok=True)
+        written.append(path)
+    return written
+
+
+def main() -> int:
+    """Run the CollectiveX v1 matrix resolver command line.
+
+    Exactly one mode runs per invocation, chosen in this order:
+
+    1. ``--emit-unsupported-from MATRIX`` writes terminal outcomes for the
+       unsupported cases into ``--out-dir``.
+    2. ``--validate-control SHARD`` validates a shard control file against
+       ``--expect-sku``, ``--expect-backend`` and ``--expect-nodes``.
+    3. ``--extract-from MATRIX`` extracts ``--shard-id`` into ``--out`` and
+       prints the control document on stdout.
+    4. Otherwise the matrix is resolved from the suite/backend filters,
+       validated, optionally written to ``--out``, and printed on stdout.
+
+    Progress summaries go to stderr. Argument and validation problems are
+    reported through ``parser.error``, which exits the process with status 2.
+
+    Returns:
+        ``0`` on success.
+    """
+    parser = argparse.ArgumentParser(description="CollectiveX v1 matrix resolver")
+    parser.add_argument("--suites", default="all", help="'all' or comma-list of suites")
+    parser.add_argument("--backend", default="", help="select one EP backend")
+    parser.add_argument("--backends", default="", help="'all' or comma-list of EP backends")
+    parser.add_argument("--only-sku", default="")
+    parser.add_argument("--min-nodes", type=int, default=0)
+    parser.add_argument("--max-nodes", type=int, default=0)
+    parser.add_argument("--max-cases", type=int, default=128)
+    parser.add_argument("--extract-from", default="", metavar="MATRIX")
+    parser.add_argument("--validate-control", default="", metavar="SHARD")
+    parser.add_argument("--emit-unsupported-from", default="", metavar="MATRIX")
+    parser.add_argument("--out-dir", default="")
+    parser.add_argument("--shard-id", default="")
+    parser.add_argument("--expect-sku", default="")
+    parser.add_argument("--expect-backend", default="")
+    parser.add_argument("--expect-nodes", type=int, default=0)
+    parser.add_argument("--out", default="")
+    args = parser.parse_args()
+
+    if args.emit_unsupported_from:
+        if not args.out_dir:
+            parser.error("unsupported outcome emission requires --out-dir")
+        try:
+            written = emit_unsupported(args.emit_unsupported_from, args.out_dir)
+        except MatrixError as exc:
+            parser.error(str(exc))
+        print(f"emitted {len(written)} unsupported terminal outcomes", file=sys.stderr)
+        return 0
+
+    if args.validate_control:
+        if not all((args.expect_sku, args.expect_backend, args.expect_nodes)):
+            parser.error(
+                "control validation requires --expect-sku, --expect-backend, and --expect-nodes"
+            )
+        try:
+            control = _strict_json_load(Path(args.validate_control))
+            validate_shard_control(
+                control,
+                sku=args.expect_sku,
+                backend=args.expect_backend,
+                nodes=args.expect_nodes,
+            )
+        except MatrixError as exc:
+            parser.error(str(exc))
+        print(f"validated {control.get('id')}: {control['n']} cases", file=sys.stderr)
+        return 0
+
+    if args.extract_from:
+        if not all((args.shard_id, args.expect_sku, args.expect_backend, args.expect_nodes, args.out)):
+            parser.error(
+                "shard extraction requires --shard-id, --expect-sku, --expect-backend, "
+                "--expect-nodes, and --out"
+            )
+        try:
+            control = extract_shard(
+                args.extract_from,
+                args.shard_id,
+                args.out,
+                sku=args.expect_sku,
+                backend=args.expect_backend,
+                nodes=args.expect_nodes,
+            )
+        except MatrixError as exc:
+            parser.error(str(exc))
+        print(f"extracted {control['id']}: {control['n']} cases", file=sys.stderr)
+        print(json.dumps(control, separators=(",", ":")))
+        return 0
+
+    # The resolver chunks each shard with range(0, len(cases), max_cases); a
+    # zero step raises ValueError and a negative one silently yields no
+    # shards, so reject both up front with a usable message.
+    if args.max_cases <= 0:
+        parser.error("--max-cases must be a positive integer")
+    matrix = resolve_matrix(
+        suites=args.suites,
+        backend=args.backend,
+        backends=args.backends,
+        only_sku=args.only_sku,
+        min_nodes=args.min_nodes,
+        max_nodes=args.max_nodes,
+        max_cases=args.max_cases,
+    )
+    try:
+        validate_matrix_document(matrix)
+    except MatrixError as exc:
+        parser.error(str(exc))
+    if args.out:
+        with open(args.out, "w") as fh:
+            json.dump(matrix, fh, sort_keys=True, separators=(",", ":"))
+            fh.write("\n")
+    runnable = sum(
+        item["disposition"] == "runnable" for item in matrix["requested_cases"]
+    )
+    unsupported = len(matrix["requested_cases"]) - runnable
+    print(
+        f"resolved {len(matrix['include'])} shard-cells, "
+        f"{runnable} runnable and {unsupported} unsupported cases",
+        file=sys.stderr,
+    )
+    print(json.dumps(matrix))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/ep_deepep.py b/experimental/CollectiveX/tests/ep_deepep.py
new file mode 100644
index 0000000000..26cc889b5d
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_deepep.py
@@ -0,0 +1,294 @@
+#!/usr/bin/env python3
+"""CollectiveX DeepEP adapter for the v1 BF16 normal-mode workload."""
+from __future__ import annotations
+
+import inspect
+import os
+import sys
+import types
+
+import torch
+import torch.distributed as dist
+import contracts
+
+try:
+ import deep_ep
+ from deep_ep import Buffer # type: ignore
+except Exception as exc: # pragma: no cover - requires the benchmark image
+ print(f"ERROR: deep_ep import failed: {exc!r}", file=sys.stderr)
+ raise
+
+
+def _deepep_version() -> str:
+    """Report the DeepEP version string.
+
+    The installed distribution metadata for ``deep_ep`` is consulted first.
+    If that lookup fails for any reason, the module's ``__version__``
+    attribute is used, and ``"unknown"`` when that is absent too.
+    """
+    try:
+        from importlib import metadata
+
+        resolved = metadata.version("deep_ep")
+    except Exception:
+        resolved = getattr(deep_ep, "__version__", "unknown")
+    return resolved
+
+
+def _mnnvl_buffer_configuration() -> tuple[dict[str, bool], str]:
+    """Resolve the explicit DeepEP MNNVL API contract.
+
+    ``CX_ALLOW_MNNVL`` may be unset, empty, ``0`` or ``1``; only ``1``
+    requests MNNVL. When it is requested, the parameters of
+    ``Buffer.__init__`` are inspected and handed to
+    ``contracts.resolve_deepep_mnnvl`` together with ``DEEPEP_COMMIT``;
+    otherwise the resolver is called with an empty parameter tuple.
+
+    Returns:
+        Whatever ``contracts.resolve_deepep_mnnvl`` returns (annotated as
+        Buffer keyword arguments plus a descriptive string).
+
+    Raises:
+        RuntimeError: for an invalid ``CX_ALLOW_MNNVL`` value, when the
+            Buffer signature cannot be inspected, or when the contract
+            resolver rejects the requested configuration.
+    """
+    raw_setting = os.environ.get("CX_ALLOW_MNNVL")
+    if raw_setting is not None and raw_setting not in ("", "0", "1"):
+        raise RuntimeError("CX_ALLOW_MNNVL must be unset, 0, or 1")
+
+    if raw_setting != "1":
+        # Not requested: no signature inspection is needed.
+        return contracts.resolve_deepep_mnnvl(
+            requested=False, signature_parameters=(),
+            deepep_commit=os.environ.get("DEEPEP_COMMIT"),
+        )
+
+    try:
+        buffer_signature = inspect.signature(Buffer.__init__)
+    except (TypeError, ValueError) as exc:
+        raise RuntimeError("cannot inspect DeepEP Buffer MNNVL API") from exc
+
+    try:
+        return contracts.resolve_deepep_mnnvl(
+            requested=True, signature_parameters=buffer_signature.parameters,
+            deepep_commit=os.environ.get("DEEPEP_COMMIT"),
+        )
+    except contracts.ContractError as exc:
+        # Surface contract violations with the same exception type as the other failures here.
+        raise RuntimeError(str(exc)) from exc
+
+
+def _normal_buffer_sizes(hidden: int, world_size: int) -> tuple[int, int]:
+    """Apply DeepEP's dispatch/combine buffer sizing contract for this EP world.
+
+    The per-token byte width is ``hidden`` times the bfloat16 element size
+    reported by torch. Both the dispatch and the combine configuration for
+    ``world_size`` are asked for their NVL and RDMA size hints, and the
+    larger hint of each kind is used.
+
+    Returns:
+        ``(num_nvl_bytes, num_rdma_bytes)``.
+
+    Raises:
+        RuntimeError: if the NVL size is not positive or the RDMA size is negative.
+    """
+    bf16_element_bytes = torch.tensor([], dtype=torch.bfloat16).element_size()
+    hidden_bytes = hidden * bf16_element_bytes
+    dispatch_config = Buffer.get_dispatch_config(world_size)
+    combine_config = Buffer.get_combine_config(world_size)
+    both_configs = (dispatch_config, combine_config)
+
+    nvl_hints = [
+        int(config.get_nvl_buffer_size_hint(hidden_bytes, world_size))
+        for config in both_configs
+    ]
+    rdma_hints = [
+        int(config.get_rdma_buffer_size_hint(hidden_bytes, world_size))
+        for config in both_configs
+    ]
+    num_nvl_bytes = max(nvl_hints)
+    num_rdma_bytes = max(rdma_hints)
+
+    sizes_are_valid = num_nvl_bytes > 0 and num_rdma_bytes >= 0
+    if not sizes_are_valid:
+        raise RuntimeError("DeepEP returned invalid normal-mode buffer size hints")
+    return num_nvl_bytes, num_rdma_bytes
+
+
+class DeepEPBackend:
+    """Adapter that drives a DeepEP ``Buffer`` in ``normal`` or ``low-latency`` mode.
+
+    The class-level attributes below describe the normal-mode contract;
+    ``__init__`` overrides them on the instance when low-latency mode is
+    selected. ``backend_provenance`` records the configuration that was
+    actually applied.
+    """
+
+    name = "deepep"
+    combine_needs_redispatch = False
+    # DeepEP reduces activations and top-k weights independently. The activation
+    # tensor must therefore carry the complete local weighted expert sum.
+    combine_weight_semantics = "unweighted-rank-sum"
+    oracle_layout = "token-rank"
+    payload_unit = "token-rank"
+
+    def __init__(self, args, rank, world_size, local_rank, device):
+        """Create the DeepEP buffer for the requested mode and record its provenance.
+
+        ``args`` supplies ``mode`` (defaults to ``"normal"`` when absent),
+        ``phase``, ``experts``, ``hidden`` and, in normal mode,
+        ``scale_up_domain`` and ``num_sms``. ``local_rank`` is accepted but
+        not used here.
+
+        Raises:
+            ValueError: for an unknown mode, or in low-latency mode when the
+                phase is not ``decode`` or ``experts`` is not a multiple of
+                ``world_size``.
+            RuntimeError: when a scale-out normal-mode world gets no RDMA
+                buffer, or when DeepEP does not apply the requested SM count.
+        """
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.mode = getattr(args, "mode", "normal")
+        if self.mode not in {"normal", "low-latency"}:
+            raise ValueError(f"unsupported DeepEP mode {self.mode!r}")
+
+        self.group = dist.group.WORLD
+        device_sms = torch.cuda.get_device_properties(device).multi_processor_count
+        # Extra Buffer keyword arguments and a descriptive string for provenance.
+        mnnvl_kwargs, mnnvl_comm = _mnnvl_buffer_configuration()
+        if self.mode == "low-latency":
+            if args.phase != "decode":
+                raise ValueError("DeepEP low-latency mode only supports the decode ladder")
+            if args.experts % world_size:
+                raise ValueError("DeepEP low-latency experts must divide the EP group")
+            # Low-latency mode uses a different combine/oracle contract than the class defaults.
+            self.combine_needs_redispatch = True
+            self.combine_weight_semantics = "gate-weighted-sum"
+            self.oracle_layout = "expert-packed"
+            self.payload_unit = "token-expert"
+            # Fixed per-rank token capacity; also reported by buffer_cap().
+            self.max_tokens_per_rank = 128
+            # Equals the number of experts hosted per rank.
+            num_qps_per_rank = args.experts // world_size
+            num_rdma_bytes = Buffer.get_low_latency_rdma_size_hint(
+                self.max_tokens_per_rank, args.hidden, world_size, args.experts
+            )
+            self.buffer = Buffer(
+                self.group,
+                num_nvl_bytes=0,
+                num_rdma_bytes=num_rdma_bytes,
+                low_latency_mode=True,
+                num_qps_per_rank=num_qps_per_rank,
+                allow_nvlink_for_low_latency_mode=True,
+                explicitly_destroy=True,
+                **mnnvl_kwargs,
+            )
+            self.buffer.clean_low_latency_buffer(
+                self.max_tokens_per_rank, args.hidden, args.experts
+            )
+            # SM fields are recorded as None: no SM count is configured on this path.
+            resource_provenance = {
+                "requested_num_sms": None,
+                "num_sms": None,
+                "sm_fraction": None,
+                "tuned_source": "deepep-low-latency-fixed-kernel",
+                "num_max_tokens_per_rank": self.max_tokens_per_rank,
+                "num_nvl_bytes": 0,
+                "num_rdma_bytes": num_rdma_bytes,
+                "num_qps_per_rank": num_qps_per_rank,
+            }
+        else:
+            num_nvl_bytes, num_rdma_bytes = _normal_buffer_sizes(args.hidden, world_size)
+            # A world larger than one scale-up domain needs a non-empty RDMA buffer.
+            if world_size > args.scale_up_domain and num_rdma_bytes == 0:
+                raise RuntimeError("DeepEP scale-out configuration returned no RDMA buffer")
+            self.buffer = Buffer(
+                self.group, num_nvl_bytes, num_rdma_bytes, **mnnvl_kwargs
+            )
+            # Prefer the SM count DeepEP already exposes; fall back to args.num_sms.
+            num_sms = int(getattr(Buffer, "num_sms", args.num_sms))
+            try:
+                Buffer.set_num_sms(num_sms)
+            except Exception as exc:  # pragma: no cover - version dependent
+                raise RuntimeError(
+                    f"DeepEP did not apply requested num_sms={num_sms}: {exc!r}"
+                ) from exc
+            # Read the value back so provenance reflects what was actually applied.
+            applied_num_sms = int(getattr(Buffer, "num_sms", num_sms))
+            if applied_num_sms != num_sms:
+                raise RuntimeError(
+                    f"DeepEP num_sms mismatch: requested={num_sms} applied={applied_num_sms}"
+                )
+            resource_provenance = {
+                "requested_num_sms": num_sms,
+                "num_sms": applied_num_sms,
+                "sm_fraction": applied_num_sms / device_sms,
+                "tuned_source": "deepep-default-num_sms",
+                "num_nvl_bytes": num_nvl_bytes,
+                "num_rdma_bytes": num_rdma_bytes,
+            }
+        version = _deepep_version()
+        self.backend_provenance = {
+            "deepep_version": version,
+            # Falls back to a package-version label when DEEPEP_COMMIT is unset or empty.
+            "deepep_commit": os.environ.get("DEEPEP_COMMIT") or f"pkg-{version}",
+            "backend_lineage": "deepep-v1",
+            "mode": self.mode,
+            "dispatch_dtype": "bf16",
+            "combine_dtype": "bf16",
+            "resource_mode": "tuned",
+            "device_sms": device_sms,
+            "allow_mnnvl": bool(mnnvl_kwargs),
+            "mnnvl_comm": mnnvl_comm,
+            **resource_provenance,
+        }
+
+    def buffer_cap(self, args):
+        """Return the per-rank token cap in low-latency mode, else ``None``."""
+        return self.max_tokens_per_rank if self.mode == "low-latency" else None
+
+    def make_problem(self, T, idx, weights, x):
+        """Bundle one problem: ``idx`` cast to int64 and ``weights`` to float32."""
+        return types.SimpleNamespace(
+            T=T,
+            x=x,
+            topk_idx=idx.to(torch.int64),
+            topk_weights=weights.to(torch.float32),
+        )
+
+    def dispatch(self, p):
+        """Dispatch problem ``p`` and return a handle namespace for later stages.
+
+        Low-latency mode returns ``recv_x``, ``recv_counts`` and ``handle``;
+        normal mode additionally returns ``recv_topk_idx`` and
+        ``recv_topk_weights``.
+        """
+        if self.mode == "low-latency":
+            recv_x, recv_counts, handle, _, _ = self.buffer.low_latency_dispatch(
+                p.x,
+                p.topk_idx,
+                self.max_tokens_per_rank,
+                self.args.experts,
+                use_fp8=False,
+                async_finish=False,
+                return_recv_hook=False,
+            )
+            return types.SimpleNamespace(
+                recv_x=recv_x,
+                recv_counts=recv_counts,
+                handle=handle,
+            )
+        # Normal mode: compute the routing layout first, then dispatch with it.
+        (
+            num_tokens_per_rank,
+            num_tokens_per_rdma_rank,
+            num_tokens_per_expert,
+            is_token_in_rank,
+            _,
+        ) = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts)
+        recv_x, recv_topk_idx, recv_topk_weights, recv_counts, handle, _ = self.buffer.dispatch(
+            p.x,
+            topk_idx=p.topk_idx,
+            topk_weights=p.topk_weights,
+            num_tokens_per_rank=num_tokens_per_rank,
+            num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
+            is_token_in_rank=is_token_in_rank,
+            num_tokens_per_expert=num_tokens_per_expert,
+        )
+        return types.SimpleNamespace(
+            recv_x=recv_x,
+            recv_topk_idx=recv_topk_idx,
+            recv_topk_weights=recv_topk_weights,
+            recv_counts=recv_counts,
+            handle=handle,
+        )
+
+    def stage(self, p, h):
+        """Use the received payload unchanged as the combine input."""
+        h.combine_input = h.recv_x
+
+    def combine(self, p, h):
+        """Combine ``h.combine_input`` back to the source ranks and return the result."""
+        if self.mode == "low-latency":
+            combined_x, _, _ = self.buffer.low_latency_combine(
+                h.combine_input,
+                p.topk_idx,
+                p.topk_weights,
+                h.handle,
+                async_finish=False,
+                return_recv_hook=False,
+            )
+            return combined_x
+        combined_x, _, _ = self.buffer.combine(h.combine_input, h.handle)
+        return combined_x
+
+    def inspect_dispatch(self, p, h):
+        """Expose the token-rank dispatch result for inspection.
+
+        Reads ``h.recv_topk_idx`` and ``h.recv_topk_weights``, which only the
+        normal-mode ``dispatch`` path sets on the handle.
+        """
+        # NOTE(review): negative indices presumably mark top-k slots not routed to this
+        # rank -- confirm against the DeepEP dispatch contract.
+        valid = h.recv_topk_idx >= 0
+        # Shift valid indices by this rank's expert offset; invalid entries stay unchanged.
+        expert_ids = torch.where(
+            valid,
+            h.recv_topk_idx + self.rank * (self.args.experts // self.world_size),
+            h.recv_topk_idx,
+        )
+        return types.SimpleNamespace(
+            payload=h.recv_x,
+            expert_ids=expert_ids,
+            # Weights of invalid slots are zeroed.
+            weights=h.recv_topk_weights.masked_fill(~valid, 0),
+            local_expert_counts=torch.tensor(h.recv_counts, device=self.device, dtype=torch.int64),
+            ordering_contract="source-rank-major-stable-v1",
+        )
+
+    def inspect_expert_dispatch(self, p, h):
+        """Expose the expert-packed dispatch result; low-latency mode only.
+
+        Raises:
+            RuntimeError: when the backend is not in low-latency mode.
+        """
+        if self.mode != "low-latency":
+            raise RuntimeError("expert-packed inspection requires low-latency mode")
+        return types.SimpleNamespace(
+            payload=h.recv_x,
+            local_expert_counts=h.recv_counts,
+            # The first two handle elements are surfaced under these names.
+            source_info=h.handle[0],
+            layout_range=h.handle[1],
+        )
+
+    def combine_transformed(self, p, h, transformed):
+        """Combine an externally transformed payload instead of ``h.combine_input``.
+
+        In low-latency mode ``transformed`` is scattered into a zero tensor
+        shaped like ``h.recv_x`` at the positions stored on the handle as
+        ``oracle_local_expert_slots`` / ``oracle_packed_positions`` (set
+        elsewhere). In normal mode it is cast to ``h.recv_x``'s dtype and
+        combined directly.
+        """
+        if self.mode == "low-latency":
+            packed = torch.zeros_like(h.recv_x)
+            packed[h.oracle_local_expert_slots, h.oracle_packed_positions] = transformed.to(
+                h.recv_x.dtype
+            )
+            combined, _, _ = self.buffer.low_latency_combine(
+                packed,
+                p.topk_idx,
+                p.topk_weights,
+                h.handle,
+                async_finish=False,
+                return_recv_hook=False,
+            )
+            return combined
+        combined, _, _ = self.buffer.combine(transformed.to(h.recv_x.dtype), h.handle)
+        return combined
+
+    def recv_tokens(self, h):
+        """Return the number of received tokens as a Python int.
+
+        Low-latency mode sums ``h.recv_counts``; normal mode uses the first
+        dimension of ``h.recv_x``.
+        """
+        if self.mode == "low-latency":
+            return int(h.recv_counts.to(torch.int64).sum().item())
+        return int(h.recv_x.shape[0])
+
+    def finalize(self, rc):
+        """Best-effort teardown; always returns ``rc`` unchanged.
+
+        Any exception raised during teardown is deliberately swallowed. The
+        first failing step skips the remaining ones.
+        """
+        try:
+            dist.barrier()
+            # Only the low-latency buffer is created with explicitly_destroy=True.
+            if self.mode == "low-latency":
+                self.buffer.destroy()
+            dist.destroy_process_group()
+        except Exception:
+            pass
+        return rc
diff --git a/experimental/CollectiveX/tests/ep_deepep_hybrid.py b/experimental/CollectiveX/tests/ep_deepep_hybrid.py
new file mode 100644
index 0000000000..70cd17b005
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_deepep_hybrid.py
@@ -0,0 +1,457 @@
+#!/usr/bin/env python3
+"""CollectiveX EP backend adapter — DeepEP `hybrid-ep` branch (NVIDIA TMA-based HybridEPBuffer).
+
+The hybrid-ep branch (https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep) is NVIDIA's TMA +
+warp-pipeline implementation of expert-parallel all-to-all, exposing `deep_ep.HybridEPBuffer`
+(distinct from the mainline `deep_ep.Buffer`). HybridEP is NVIDIA's MoE backend built for NVL72
+rack-scale (Megatron `moe_flex_dispatcher_backend="hybridep"`). This adapter binds the API's
+"ranks per node" field to active ranks per NVLink/MNNVL communication domain, not physical host
+GPUs: x86 EP16 is two 8-rank domains, while GB EP8/EP16 is one 8/16-rank MNNVL domain across hosts.
+The container build is done by runtime/run_in_container.sh `cx_build_deepep_hybrid` (CUDA-13 CCCL
+include path, without the V2 NVSHMEM overlay).
+
+API (pinned on B300, branch e0a5b1d):
+ HybridEPBuffer(group, hidden_dim, max_num_of_tokens_per_rank, num_local_experts, use_fp8=False, ...)
+ .dispatch(hidden, topk_idx=, topk_weights=, num_of_experts=) -> (recv_hidden, recv_x2, None, handle)
+ .combine(hidden, handle=) -> [T, hidden]
+
+CORRECTNESS: identity expert (no expert compute), combine WITHOUT probs -> each source token is
+reconstructed as x * (distinct ranks among its top_k experts) — verified: an 8-rank uniform top_k=8
+round trip gives relerr(combined, x) = 4.28, consistent with E[distinct ranks] - 1 ~ 5.26 - 1 = 4.26.
+So this uses the same per-rank-sum combine contract (no gate re-weight). BF16 tolerance is 5e-2.
+
+STATUS: bf16 / normal / layout-and-dispatch-v1. The v1 scope covers one MNNVL domain or x86
+scale-out between two eight-GPU NVLink domains; fp8 and cross-NVL72 scale-out remain out of scope.
+"""
+from __future__ import annotations
+
+import hashlib
+import importlib
+import json
+import os
+from pathlib import Path
+import re
+import shutil
+import sys
+import tempfile
+import types
+
+import torch
+import torch.distributed as dist
+import contracts
+
+try:
+ import deep_ep
+ HybridEPBuffer = deep_ep.HybridEPBuffer
+except Exception as exc: # pragma: no cover - needs the hybrid-ep build
+ print("ERROR: deep_ep.HybridEPBuffer import failed — the hybrid-ep branch must be built at job "
+ "setup (cx_build_deepep_hybrid). "
+ f"{exc!r}", file=sys.stderr)
+ raise
+
+
+def _deepep_hybrid_version() -> str:  # DEEPEP_COMMIT env wins, then deep_ep.__version__
+    return os.environ.get("DEEPEP_COMMIT", getattr(deep_ep, "__version__", "hybrid-ep"))
+
+
+def _hybrid_build_evidence() -> list[dict[str, str]]:
+    """Return content-manifest evidence for both compiled HybridEP extension modules.
+
+    Raises RuntimeError when an imported module reports no ``__file__``.
+    """
+    extensions = {"deep_ep_cpp": "deepep-extension", "hybrid_ep_cpp": "deepep-hybrid-extension"}
+    evidence = []
+    for name, role in extensions.items():
+        location = getattr(importlib.import_module(name), "__file__", None)
+        if not location:
+            raise RuntimeError(f"{name} has no loaded extension path")
+        manifest_files = [(os.path.basename(location), location)]
+        evidence.append(
+            contracts.content_manifest_evidence(role=role, name=name, files=manifest_files)
+        )
+    return sorted(evidence, key=lambda record: (record["role"], record["name"]))
+
+
+HYBRID_CONFIG_FIELDS = (
+ "hidden_dim", "max_num_of_tokens_per_rank", "num_of_experts_per_rank",
+ "num_of_ranks_per_node", "num_of_nodes", "pad_multiple",
+ "num_of_tokens_per_chunk_preprocessing_api",
+ "num_of_threads_per_block_preprocessing_api", "num_of_blocks_preprocessing_api",
+ "num_of_blocks_permute", "num_of_blocks_unpermute", "token_data_type",
+ "num_of_stages_dispatch_api", "num_of_stages_permute_block_dispatch_api",
+ "num_of_in_flight_s2g_dispatch_api",
+ "num_of_in_flight_s2g_permute_block_dispatch_api",
+ "num_of_additional_in_flight_s2g_dispatch_api",
+ "num_of_tokens_per_chunk_dispatch_api", "num_of_blocks_dispatch_api",
+ "forward_dispatch_api", "device_side_sync_dispatch_api",
+ "num_of_stages_g2s_combine_api", "num_of_stages_s2g_combine_api",
+ "num_of_tokens_per_chunk_combine_api", "num_of_tokens_per_group_combine_api",
+ "num_of_blocks_combine_api", "num_of_additional_in_flight_s2g_combine_api",
+ "backward_combine_api", "device_side_sync_combine_api",
+)
+
+
+def _hybrid_realized_config(config) -> dict[str, str | int | bool]:
+    """Flatten the post-autotune HybridEP config into JSON-safe str/int/bool values."""
+    projected: dict[str, str | int | bool] = {}
+    for name in HYBRID_CONFIG_FIELDS:
+        try:
+            raw = getattr(config, name)
+        except AttributeError as exc:
+            raise RuntimeError(f"HybridEP realized config omits {name}") from exc
+        if name == "token_data_type":
+            # Prefer an enum-style ``.name``; otherwise map the C type spelling.
+            label = getattr(raw, "name", None)
+            if label not in {"UINT8", "UINT16"}:
+                label = {"uint8_t": "UINT8", "uint16_t": "UINT16"}.get(str(raw))
+            if label is None:
+                raise RuntimeError("HybridEP realized token_data_type is invalid")
+            projected[name] = label
+        elif type(raw) is bool:
+            projected[name] = raw
+        else:
+            try:
+                projected[name] = int(raw)
+            except (TypeError, ValueError) as exc:
+                raise RuntimeError(f"HybridEP realized config {name} is not integral") from exc
+    return projected
+
+
+def _sha256_with_size(path: Path) -> tuple[str, int]:
+    """Stream ``path`` in 1 MiB reads; return its SHA-256 hex digest and byte length."""
+    hasher, total = hashlib.sha256(), 0
+    with path.open("rb") as stream:
+        while block := stream.read(1024 * 1024):
+            hasher.update(block)
+            total += len(block)
+    return hasher.hexdigest(), total
+
+
+def _hybrid_jit_evidence(root: Path) -> list[dict[str, str | int]]:
+    """Hash final JIT libraries without exposing rank-specific cache paths.
+
+    Only direct ``*.so`` children of ``root`` are examined, in name order. Each must be a
+    regular, non-empty file with a well-formed stem, and exactly three must exist.
+    """
+    if not root.is_dir():
+        raise RuntimeError("DeepEP Hybrid produced no JIT cache directory")
+    valid_key = re.compile(r"[A-Za-z0-9][A-Za-z0-9_.+-]{0,511}")
+    libraries = sorted(
+        (entry for entry in root.iterdir() if entry.suffix == ".so"),
+        key=lambda entry: entry.name,
+    )
+    records = []
+    for library in libraries:
+        if library.is_symlink() or not library.is_file():
+            raise RuntimeError("DeepEP Hybrid JIT artifact is not a regular file")
+        if not valid_key.fullmatch(library.stem):
+            raise RuntimeError("DeepEP Hybrid JIT kernel key is invalid")
+        sha256, byte_count = _sha256_with_size(library)
+        if byte_count <= 0:
+            raise RuntimeError("DeepEP Hybrid JIT artifact is empty")
+        records.append({"bytes": byte_count, "kernel_key": library.stem, "sha256": sha256})
+    if len(records) != 3:
+        raise RuntimeError(f"DeepEP Hybrid expected 3 final JIT libraries, found {len(records)}")
+    return records
+
+
+def _require_cross_rank_equal(value, label: str) -> None:
+    """Raise unless every rank contributes the same canonical-JSON encoding of ``value``."""
+    per_rank = [None for _ in range(dist.get_world_size())]
+    dist.all_gather_object(per_rank, value)
+    if len({json.dumps(v, sort_keys=True, separators=(",", ":")) for v in per_rank}) != 1:
+        raise RuntimeError(f"DeepEP Hybrid {label} differs across ranks")
+
+
+def _hybrid_topology(args, world_size: int) -> dict[str, int | str]:
+    """Map a case's physical placement onto HybridEP communication-domain geometry."""
+    supported = {  # case signature -> (ranks per domain, communication domains)
+        (8, 8, 8, "scale-up", "nvlink", None, "nvlink"): (8, 1),
+        (16, 8, 8, "scale-out", "nvlink", "rdma", "nvlink-rdma"): (8, 2),
+        (8, 4, 72, "scale-up", "mnnvl", None, "mnnvl"): (8, 1),
+        (16, 4, 72, "scale-up", "mnnvl", None, "mnnvl"): (16, 1),
+    }
+    node_gpus = int(args.gpus_per_node or world_size)
+    domain_size = int(args.scale_up_domain or node_gpus)
+    signature = (
+        world_size, node_gpus, domain_size, args.scope,
+        args.scale_up_transport, args.scale_out_transport or None, args.transport,
+    )
+    geometry = supported.get(signature)
+    if geometry is None:
+        raise RuntimeError("DeepEP Hybrid topology is outside the fixed v1 matrix")
+    domain_ranks, communication_domains = geometry
+    return {
+        "communication_domains": communication_domains,
+        "domain_ranks": domain_ranks,
+        "physical_nodes": world_size // node_gpus,
+        "transport": str(args.transport),
+    }
+
+
+class DeepEPHybridBackend:
+    """CollectiveX EP backend adapter built on ``deep_ep.HybridEPBuffer`` (normal mode only)."""
+
+    name = "deepep-hybrid"
+    # HybridEPBuffer.combine consumes the recv payload + the dispatch handle (no re-dispatch needed
+    # before a timed combine); the harness times dispatch and combine separately (like ep_deepep).
+    combine_needs_redispatch = False
+    combine_weight_semantics = "unweighted-rank-sum"
+
+    def __init__(self, args, rank, world_size, local_rank, device):
+        """Validate the fixed v1 topology and build mode, then construct the HybridEPBuffer."""
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.mode = "normal"
+        self.group = dist.group.WORLD
+        self.tolerance = 5e-2
+        self.top_k = int(args.topk)
+        self.num_experts = int(args.experts)
+        self.hidden = int(args.hidden)
+        self.local_experts = max(1, self.num_experts // world_size)
+        topology = _hybrid_topology(args, world_size)
+        self.domain_ranks = int(topology["domain_ranks"])
+        self.communication_domains = int(topology["communication_domains"])
+        build_mode = os.environ.get("DEEPEP_HYBRID_BUILD_MODE", "")
+        if self.communication_domains > 1:
+            if (
+                os.environ.get("HYBRID_EP_MULTINODE") != "1"
+                or build_mode != "multinode-doca"
+                or os.environ.get("USE_NIXL", "0") != "0"
+            ):
+                raise RuntimeError("DeepEP Hybrid scale-out build mode is not realized")
+        elif build_mode != "intradomain":
+            raise RuntimeError("DeepEP Hybrid scale-up requires the intradomain build")
+        if args.scale_up_transport == "mnnvl" and any(
+            os.environ.get(name) != "1"
+            for name in ("NCCL_CUMEM_ENABLE", "NCCL_MNNVL_ENABLE", "MC_FORCE_MNNVL")
+        ):
+            raise RuntimeError("DeepEP Hybrid MNNVL runtime enablement is incomplete")
+        # Token cap (per rank) for the symmetric buffer; the sweep is capped here (buffer_cap).
+        self.max_tokens = 4096
+        dev_sms = torch.cuda.get_device_properties(device).multi_processor_count
+        ver = _deepep_hybrid_version()
+        loaded_libraries = _hybrid_build_evidence()
+        _require_cross_rank_equal(loaded_libraries, "loaded extension identities")
+
+        # HybridEP's compiler uses a process-specific child of HYBRID_EP_CACHE_DIR. Give every
+        # rank a fresh private base so stale kernels cannot enter this attempt's evidence.
+        self._previous_jit_cache_dir = os.environ.get("HYBRID_EP_CACHE_DIR")
+        self._previous_domain_ranks = os.environ.get("NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN")
+        self._jit_cache_dir = tempfile.mkdtemp(prefix=f"collectivex-hybrid-r{rank}-")
+        os.environ["HYBRID_EP_CACHE_DIR"] = self._jit_cache_dir
+        os.environ["NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN"] = str(self.domain_ranks)
+        jit_base = Path(self._jit_cache_dir) / ".deepep" / "hybrid_ep" / "jit"
+        self._jit_root = jit_base / f"proc-{os.getpid()}"
+        self._realized_config = None
+        self._deferred_semantic_snapshot = None
+        self._deferred_jit_diagnostics = None
+
+        try:
+            self.buffer = HybridEPBuffer(
+                self.group, hidden_dim=self.hidden,
+                max_num_of_tokens_per_rank=self.max_tokens,
+                num_local_experts=self.local_experts, use_fp8=False)
+            realized_geometry = (
+                int(self.buffer.num_of_hybrid_ep_ranks_per_nvlink_domain),
+                int(self.buffer.num_of_nodes),
+                int(self.buffer.local_rank),
+                int(self.buffer.node_rank),
+            )
+            expected_geometry = (
+                self.domain_ranks,
+                self.communication_domains,
+                rank % self.domain_ranks,
+                rank // self.domain_ranks,
+            )
+            buffer_config = self.buffer.configurer.buffer_config
+            if realized_geometry != expected_geometry or (
+                int(buffer_config.num_of_ranks_per_node) != self.domain_ranks
+                or int(buffer_config.num_of_nodes) != self.communication_domains
+            ):
+                raise RuntimeError(
+                    "HybridEPBuffer communication-domain geometry differs from the case"
+                )
+        except Exception as exc:
+            # Undo the cache/environment overrides before surfacing the failure.
+            self._restore_environment()
+            raise RuntimeError(
+                f"HybridEPBuffer construction failed (hidden={self.hidden} max_tokens={self.max_tokens} "
+                f"local_experts={self.local_experts} world={world_size}): {exc!r}") from exc
+        update_template_config = self.buffer.update_template_config
+
+        def tracked_update_template_config(*call_args, **call_kwargs):
+            config = update_template_config(*call_args, **call_kwargs)
+            realized = _hybrid_realized_config(config)
+            if (
+                realized["num_of_ranks_per_node"] != self.domain_ranks
+                or realized["num_of_nodes"] != self.communication_domains
+            ):
+                raise RuntimeError("DeepEP Hybrid realized topology changed within one case")
+            if self._realized_config is not None and realized != self._realized_config:
+                raise RuntimeError("DeepEP Hybrid realized autotune config changed within one case")
+            self._realized_config = realized
+            return config
+
+        self.buffer.update_template_config = tracked_update_template_config
+        self.domain_rank = int(self.buffer.local_rank)
+        if rank == 0:
+            print(
+                "[deepep-hybrid] HybridEPBuffer constructed "
+                f"(domains={self.communication_domains} ranks_per_domain={self.domain_ranks} "
+                f"world={world_size} local_experts={self.local_experts} hidden={self.hidden})",
+                file=sys.stderr,
+            )
+
+        self.backend_provenance = {
+            "deepep_commit": ver, "branch": "hybrid-ep",
+            "deepep_tree": os.environ.get("DEEPEP_TREE"),
+            "backend_lineage": "deepep-hybrid",
+            "loaded_libraries": loaded_libraries,
+            "impl": "deep_ep.HybridEPBuffer (NVIDIA TMA + warp-pipeline)",
+            "mode": "normal", "transport": topology["transport"],
+            "resource_mode": "tuned",
+            "num_sms": None, "device_sms": dev_sms,
+            "tuned_source": "deepep-hybrid-configurer-autotune-v1",
+            "realized_config": None, "jit_kernel_keys": [], "jit_shared_objects": [],
+            "max_num_tokens": self.max_tokens, "top_k": self.top_k,
+            "num_experts": self.num_experts, "local_experts": self.local_experts,
+            "routing_factor": "ranks",
+        }
+
+    def _restore_environment(self):
+        """Delete this rank's private JIT cache and put both overridden variables back."""
+        shutil.rmtree(self._jit_cache_dir, ignore_errors=True)
+        for name, previous in (
+            ("HYBRID_EP_CACHE_DIR", self._previous_jit_cache_dir),
+            ("NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN", self._previous_domain_ranks),
+        ):
+            if previous is None:
+                os.environ.pop(name, None)
+            else:
+                os.environ[name] = previous
+
+    def buffer_cap(self, args):
+        """Return the fixed per-rank token cap chosen in ``__init__``."""
+        return self.max_tokens
+
+    def make_problem(self, T, idx, weights, x):
+        """Bundle one routing problem, casting indices to int64 and weights to float32."""
+        return types.SimpleNamespace(
+            T=int(T), x=x,
+            topk_idx=idx.to(torch.int64),
+            topk_weights=weights.to(torch.float32),
+        )
+
+    def dispatch(self, p):
+        """Run ``HybridEPBuffer.dispatch`` and wrap its outputs for the stage/combine steps."""
+        recv, recv_probs, _scales, handle = self.buffer.dispatch(
+            p.x,
+            topk_idx=p.topk_idx,
+            topk_weights=p.topk_weights,
+            num_of_experts=self.num_experts,
+        )
+        return types.SimpleNamespace(
+            recv=recv,
+            recv_payload=recv,
+            recv_probs=recv_probs,
+            handle=handle,
+            combine_input=None,
+        )
+
+    def stage(self, p, h):
+        # Identity expert: the recv hidden IS the "expert output". combine reduces it per source token.
+        h.combine_input = h.recv_payload
+        return None
+
+    def combine(self, p, h):
+        # combine(hidden, handle=) -> [T, H] per-source-token reduction (no gate re-weight: "ranks").
+        comb = self.buffer.combine(h.combine_input, handle=h.handle)
+        return comb[0] if isinstance(comb, (tuple, list)) else comb
+
+    def capture_deferred_provenance(self):
+        """Record the realized config and JIT evidence; repeat calls must see the same data."""
+        torch.cuda.synchronize()
+        dist.barrier()
+        if self._realized_config is None:
+            raise RuntimeError("DeepEP Hybrid autotune config was not materialized")
+        local_artifacts = _hybrid_jit_evidence(self._jit_root)
+        semantic = {
+            "jit_kernel_keys": [item["kernel_key"] for item in local_artifacts],
+            "realized_config": dict(self._realized_config),
+        }
+        # NVCC may embed each rank's timestamped source basename in its ELF, so raw .so hashes are
+        # diagnostics rather than a cross-rank identity. Stable kernel keys encode every codegen
+        # input, including HybridEpConfigInstance fields that the Python binding does not expose.
+        _require_cross_rank_equal(semantic, "realized config/JIT kernel keys")
+        gathered_artifacts = [None] * dist.get_world_size()
+        dist.all_gather_object(gathered_artifacts, local_artifacts)
+        diagnostics = []
+        for artifact_index, kernel_key in enumerate(semantic["jit_kernel_keys"]):
+            diagnostics.append({
+                "kernel_key": kernel_key,
+                "rank_artifacts": [
+                    {
+                        "bytes": rank_artifacts[artifact_index]["bytes"],
+                        "rank": artifact_rank,
+                        "sha256": rank_artifacts[artifact_index]["sha256"],
+                    }
+                    for artifact_rank, rank_artifacts in enumerate(gathered_artifacts)
+                ],
+            })
+        if self._deferred_semantic_snapshot is not None and semantic != self._deferred_semantic_snapshot:
+            raise RuntimeError("DeepEP Hybrid config/JIT kernel set changed after measurement")
+        if self._deferred_jit_diagnostics is not None and diagnostics != self._deferred_jit_diagnostics:
+            raise RuntimeError("DeepEP Hybrid rank-local JIT artifacts changed after measurement")
+        self._deferred_semantic_snapshot = semantic
+        self._deferred_jit_diagnostics = diagnostics
+        self.backend_provenance.update(semantic)
+        self.backend_provenance["jit_shared_objects"] = diagnostics
+
+    def inspect_dispatch(self, p, h):
+        """Rebuild per-token global expert ids and weights from the handle's routing map."""
+        count = self.recv_tokens(h)
+        routing_map = h.handle[4][:count]
+        rows, local_expert_ids = routing_map.nonzero(as_tuple=True)
+        positions = routing_map.to(torch.int64).cumsum(dim=1)[rows, local_expert_ids] - 1
+        probability_columns = self.domain_rank * self.local_experts + local_expert_ids
+        if h.recv_probs.shape[1] < (self.domain_rank + 1) * self.local_experts:
+            raise RuntimeError("HybridEPBuffer probability tensor omits this NVLink-domain rank")
+        expert_ids = torch.full(
+            (count, self.top_k), -1, dtype=torch.int64, device=self.device
+        )
+        weights = torch.zeros(
+            (count, self.top_k), dtype=torch.float32, device=self.device
+        )
+        expert_ids[rows, positions] = local_expert_ids + self.rank * self.local_experts
+        weights[rows, positions] = h.recv_probs[:count][rows, probability_columns]
+        return types.SimpleNamespace(
+            payload=h.recv_payload[:count],
+            expert_ids=expert_ids,
+            weights=weights,
+            local_expert_counts=routing_map.sum(dim=0, dtype=torch.int64),
+            ordering_contract="global-source-filter-stable-v1",
+        )
+
+    def combine_transformed(self, p, h, transformed):
+        """Combine caller-transformed rows after casting them to the recv payload dtype."""
+        combined = self.buffer.combine(
+            transformed.to(h.recv_payload.dtype), handle=h.handle
+        )
+        return combined[0] if isinstance(combined, (tuple, list)) else combined
+
+    def recv_tokens(self, h):
+        """Read the received-token count stored at index 3 of the dispatch handle."""
+        return int(h.handle[3].item())
+
+    def finalize(self, rc):
+        """Tear down the process group (best effort), restore the environment, return rc."""
+        try:
+            dist.barrier()
+            dist.destroy_process_group()
+        except Exception:
+            pass
+        self._restore_environment()
+        return rc
diff --git a/experimental/CollectiveX/tests/ep_deepep_v2.py b/experimental/CollectiveX/tests/ep_deepep_v2.py
new file mode 100644
index 0000000000..c629b0294e
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_deepep_v2.py
@@ -0,0 +1,528 @@
+#!/usr/bin/env python3
+"""DeepEP PR #605 adapter with PR #630's pure scale-up initialization fix."""
+
+from __future__ import annotations
+
+import ctypes
+import hashlib
+import importlib.metadata
+import inspect
+import json
+import os
+import re
+import sys
+import types
+from pathlib import Path
+
+import torch
+import torch.distributed as dist
+import contracts
+import ep_harness
+
+try:
+ import deep_ep
+ from deep_ep import ElasticBuffer # type: ignore
+except Exception as exc: # pragma: no cover - requires the benchmark image
+ print(f"ERROR: DeepEP V2 import failed: {exc!r}", file=sys.stderr)
+ raise
+
+
+DEEPEP_V2_PR = 605
+DEEPEP_V2_FIX_PR = 630
+DEEPEP_V2_COMMIT = "fa8a9b16898204afd347c663b89e65ef87dc6ce6"
+DEEPEP_V2_TREE = "29809e75c5874e6609dac4804e7b651d5226959f"
+DEEPEP_V2_FMT_COMMIT = "a4c7e17133ee9cb6a2f45545f6e974dd3c393efa"
+DEEPEP_V2_VERSION = "2.0.0"
+DEEPEP_V2_DISTRIBUTION = "2.0.0+fa8a9b1"
+DEEPEP_V2_JIT_RANDOM_SEED = "collectivex-deepep-v2-fa8a9b1"
+TORCH_VERSION = "2.10.0+cu130"
+NCCL_VERSION = "2.30.4"
+NVSHMEM_VERSION = "3.3.9"
+DEEPEP_V2_JIT_KERNELS = contracts.DEEPEP_V2_JIT_KERNELS
+
+
+def _sha256(path: str) -> str:  # streaming SHA-256 hex digest of one file
+    hasher = hashlib.sha256()
+    with open(path, "rb") as stream:
+        while block := stream.read(1 << 20):
+            hasher.update(block)
+    return hasher.hexdigest()
+
+
+def _api_sha256() -> str:
+    """Fingerprint the ElasticBuffer ``__init__``/``dispatch``/``combine`` signatures."""
+    signatures = {
+        f"ElasticBuffer.{method}": str(inspect.signature(getattr(ElasticBuffer, method)))
+        for method in ("__init__", "dispatch", "combine")
+    }
+    # Sorted, compact JSON gives the digest one canonical byte representation.
+    canonical = json.dumps(signatures, sort_keys=True, separators=(",", ":"))
+    return hashlib.sha256(canonical.encode()).hexdigest()
+
+
+def _loaded_library_paths() -> set[str]:
+    """Return real paths of the DeepEP extension plus mapped NCCL/NVSHMEM host libraries."""
+    extension = getattr(getattr(deep_ep, "_C", None), "__file__", None)
+    if not extension or not os.path.isfile(extension):
+        raise RuntimeError("DeepEP V2 extension library is not loaded")
+    found = {os.path.realpath(extension)}
+    wanted = ("libnccl.so", "libnvshmem_host.so")
+    try:
+        with open("/proc/self/maps", encoding="utf-8") as maps:
+            for mapped in (entry.rstrip().split()[-1] for entry in maps):
+                if any(tag in os.path.basename(mapped) for tag in wanted) and os.path.isfile(mapped):
+                    found.add(os.path.realpath(mapped))
+    except OSError as exc:  # pragma: no cover - benchmark runtime is Linux
+        raise RuntimeError("cannot inspect loaded communication libraries") from exc
+    return found
+
+
+def _loaded_nccl_version() -> str:
+    """Ask the single loaded NCCL library for its version via ``ncclGetVersion``."""
+    nccl_paths = [p for p in _loaded_library_paths() if "libnccl.so" in os.path.basename(p)]
+    if len(nccl_paths) != 1:
+        raise RuntimeError("expected exactly one loaded NCCL library")
+    (library_path,) = nccl_paths
+    raw_version = ctypes.c_int()
+    status = ctypes.CDLL(library_path).ncclGetVersion(ctypes.byref(raw_version))
+    if status != 0:
+        raise RuntimeError("loaded NCCL version query failed")
+    return ep_harness.format_collective_version(raw_version.value)
+
+
+def _loaded_library_evidence() -> list[dict[str, str]]:
+    """Return content identities, never private library paths.
+
+    Exactly one NCCL and one NVSHMEM host library must be loaded; records come back sorted.
+    """
+    markers = {"nccl": "libnccl.so", "nvshmem": "libnvshmem_host.so"}
+    paths = _loaded_library_paths()
+    basenames = {path: os.path.basename(path) for path in paths}
+    counts = {
+        role: sum(marker in name for name in basenames.values())
+        for role, marker in markers.items()
+    }
+    mismatches = [f"{role}={count}" for role, count in counts.items() if count != 1]
+    if mismatches:
+        raise RuntimeError("expected one loaded library for each dependency: " + ", ".join(mismatches))
+
+    records = []
+    for path, basename in basenames.items():
+        # First matching marker wins; anything else is the DeepEP extension module.
+        role = next((r for r, marker in markers.items() if marker in basename), "deepep-extension")
+        label = "deep_ep._C" if role == "deepep-extension" else basename
+        records.append({"role": role, "name": label, "sha256": _sha256(path)})
+    return sorted(
+        records,
+        key=lambda record: (record["role"], record["name"], record["sha256"]),
+    )
+
+
+def _jit_artifact_evidence() -> list[dict[str, str]]:
+    """Validate the DeepEP V2 JIT cache layout and hash each kernel's source/SASS/cubin.
+
+    Raises RuntimeError on any unexpected entry, file set, file type, or kernel set.
+    """
+    root = Path(os.environ["EP_JIT_CACHE_DIR"]) / "cache"
+    if root.is_symlink() or not root.is_dir():
+        raise RuntimeError("DeepEP V2 produced no JIT cache evidence")
+    entry_name = re.compile(r"kernel\.([A-Za-z0-9_+-]+)\.([0-9a-f]{32})")
+    expected_files = ("kernel.cu", "kernel.cubin", "kernel.sass")
+    evidence = []
+    seen_kernels = set()
+    for entry in sorted(root.iterdir(), key=lambda item: item.name):
+        parsed = entry_name.fullmatch(entry.name)
+        if entry.is_symlink() or not entry.is_dir() or parsed is None:
+            raise RuntimeError("DeepEP V2 JIT cache contains an invalid entry")
+        if {child.name for child in entry.iterdir()} != set(expected_files):
+            raise RuntimeError("DeepEP V2 JIT kernel evidence is incomplete")
+        source, cubin, sass = (entry / filename for filename in expected_files)
+        if any(member.is_symlink() or not member.is_file() for member in (source, cubin, sass)):
+            raise RuntimeError("DeepEP V2 JIT evidence is not a regular file")
+        if any(member.stat().st_size <= 0 for member in (source, cubin, sass)):
+            raise RuntimeError("DeepEP V2 JIT evidence is empty")
+        seen_kernels.add(parsed.group(1))
+        evidence.append({
+            "cache_key": entry.name,
+            "source_sha256": _sha256(str(source)),
+            "sass_sha256": _sha256(str(sass)),
+            "cubin_sha256": _sha256(str(cubin)),
+        })
+    # Both the number of cache entries and the set of kernel names are pinned by the contract.
+    if len(evidence) != len(DEEPEP_V2_JIT_KERNELS) or seen_kernels != DEEPEP_V2_JIT_KERNELS:
+        raise RuntimeError("DeepEP V2 JIT kernel set differs from the v1 contract")
+    return sorted(evidence, key=lambda item: item["cache_key"])
+
+
+def _jit_cache_key(
+    args,
+    world_size: int,
+    max_tokens: int,
+    allow_hybrid_mode: bool,
+    realized: dict[str, int | bool],
+) -> str:
+    """Derive the JIT cache key from codegen inputs only (no routing data, no case id)."""
+    fixed = {
+        "contract": "deepep-v2-jit-config-v3",
+        "dispatch_dtype": "bf16", "combine_dtype": "bf16",
+        "input_layout": "bf16-no-sf",
+        "expert_alignment": 1,
+        "do_cpu_sync": True,
+        "cached_mode": False,
+        "do_expand": False, "use_expanded_layout": False,
+        "allow_multiple_reduction": True,
+        "prefer_overlap_with_compute": True,
+        "deterministic": False,
+    }
+    per_case = {
+        "runner": args.runner,
+        "world_size": world_size,
+        "hidden": args.hidden,
+        "topk": args.topk,
+        "physical_experts": args.experts,
+        "tuning_experts": getattr(args, "num_logical_experts", args.experts),
+        "max_tokens": max_tokens,
+        "allow_hybrid_mode": allow_hybrid_mode,
+    }
+    # ``realized`` is merged last, so its entries win over same-named keys above.
+    payload = {**fixed, **per_case, **realized}
+    encoded = json.dumps(payload, sort_keys=True, separators=(",", ":")).encode()
+    return "jitcfg-v3-" + hashlib.sha256(encoded).hexdigest()
+
+
+def _require_cross_rank_equal(value, label: str) -> None:
+    """Gather ``value`` from all ranks and raise if their canonical JSON forms differ."""
+    contributions = [None for _ in range(dist.get_world_size())]
+    dist.all_gather_object(contributions, value)
+    if len({json.dumps(c, sort_keys=True, separators=(",", ":")) for c in contributions}) != 1:
+        raise RuntimeError(f"DeepEP V2 {label} differs across ranks")
+
+
+def _configure_gin_mode(args, world_size: int) -> bool:
+    """Set or clear ``EP_DISABLE_GIN`` and report whether hybrid (GIN) mode is allowed.
+
+    GIN stays enabled only when the world holds more ranks than one scale-up domain.
+    """
+    domain = getattr(args, "scale_up_domain", None) or getattr(args, "gpus_per_node", None)
+    spans_domains = world_size > int(domain or world_size)
+    if not spans_domains:
+        os.environ["EP_DISABLE_GIN"] = "1"
+        return spans_domains
+    os.environ.pop("EP_DISABLE_GIN", None)
+    return spans_domains
+
+
+def _lsa_topology_is_valid(
+    gin_enabled: bool,
+    world_size: int,
+    scale_up_domain: int,
+    config: dict[str, int | bool],
+) -> bool:
+    """Check the realized rank layout against the requested communication mode.
+
+    With GIN, the world must split evenly into more than one scale-up domain; without
+    it, every rank must sit in a single NVLink scale-up domain.
+    """
+    if gin_enabled:
+        scale_out, scale_up = world_size // scale_up_domain, scale_up_domain
+        if world_size % scale_up_domain != 0 or scale_out <= 1:
+            return False
+    else:
+        scale_out, scale_up = 1, world_size
+    return (
+        config["physical_rdma_ranks"] == scale_out
+        and config["physical_nvlink_ranks"] == scale_up
+        and config["logical_scaleout_ranks"] == scale_out
+        and config["logical_scaleup_ranks"] == scale_up
+        and config["is_scaleup_nvlink"] is True
+    )
+
+
+def _require_runtime() -> tuple[str, str]:
+    """Verify the pinned DeepEP V2 environment, package versions, and loaded NCCL.
+
+    Returns ``(torch version, loaded NCCL version)``; raises RuntimeError that lists
+    every mismatch found.
+    """
+    pinned_env = {
+        "DEEPEP_V2_PR": str(DEEPEP_V2_PR),
+        "DEEPEP_V2_FIX_PR": str(DEEPEP_V2_FIX_PR),
+        "DEEPEP_V2_COMMIT": DEEPEP_V2_COMMIT,
+        "DEEPEP_V2_TREE": DEEPEP_V2_TREE,
+        "DEEPEP_V2_FMT_COMMIT": DEEPEP_V2_FMT_COMMIT,
+        "DEEPEP_V2_JIT_RANDOM_SEED": DEEPEP_V2_JIT_RANDOM_SEED,
+        "EP_JIT_DUMP_SASS": "1",
+    }
+    problems = [
+        f"{name}={os.environ.get(name)!r}, expected {value!r}"
+        for name, value in pinned_env.items()
+        if os.environ.get(name) != value
+    ]
+    torch_version = str(torch.__version__)
+    nccl_package = importlib.metadata.version("nvidia-nccl-cu13")
+    nvshmem_package = importlib.metadata.version("nvidia-nvshmem-cu12")
+    # (label, installed value, pinned value), reported in this order.
+    package_pins = (
+        ("deep_ep", str(getattr(deep_ep, "__version__", "")), DEEPEP_V2_VERSION),
+        ("deep_ep distribution", importlib.metadata.version("deep_ep"), DEEPEP_V2_DISTRIBUTION),
+        ("torch", torch_version, TORCH_VERSION),
+        ("nvidia-nccl-cu13", nccl_package, NCCL_VERSION),
+        ("nvidia-nvshmem-cu12", nvshmem_package, NVSHMEM_VERSION),
+    )
+    problems.extend(
+        f"{label}={installed!r}, expected {pinned!r}"
+        for label, installed, pinned in package_pins
+        if installed != pinned
+    )
+    if not inspect.isclass(ElasticBuffer) or ElasticBuffer.__name__ != "ElasticBuffer":
+        problems.append("deep_ep.ElasticBuffer is absent")
+    if os.environ.get("EP_SUPPRESS_NCCL_CHECK"):
+        problems.append("EP_SUPPRESS_NCCL_CHECK must be unset")
+    # The library actually loaded, not only the installed wheel, must report the pin.
+    nccl_runtime_version = _loaded_nccl_version()
+    if nccl_runtime_version != NCCL_VERSION:
+        problems.append(
+            f"loaded NCCL={nccl_runtime_version!r}, expected {NCCL_VERSION!r}"
+        )
+    if problems:
+        raise RuntimeError("invalid DeepEP V2 runtime: " + "; ".join(problems))
+    return torch_version, nccl_runtime_version
+
+
+class DeepEPV2Backend:
+ name = "deepep-v2"
+ combine_needs_redispatch = False
+ combine_weight_semantics = "unweighted-rank-sum"
+
+ def __init__(self, args, rank, world_size, local_rank, device):
+ self.args = args
+ self.rank = rank
+ self.world_size = world_size
+ self.device = device
+ self.mode = "normal"
+ self.group = dist.group.WORLD
+ torch_version, nccl_runtime_version = _require_runtime()
+ ladder, _ = ep_harness.token_ladder(args.tokens_ladder, args.phase, None)
+ conditioning = ep_harness.CONDITIONING_LADDERS[args.phase]
+ self.max_tokens = max([*ladder, *conditioning])
+ jit_root = Path(os.environ["EP_JIT_CACHE_DIR"])
+ scale_up_domain = int(
+ getattr(args, "scale_up_domain", None)
+ or getattr(args, "gpus_per_node", None)
+ or world_size
+ )
+ allow_hybrid_mode = _configure_gin_mode(args, world_size)
+ gin_enabled = allow_hybrid_mode
+ communication_backend = "nccl-gin" if gin_enabled else "nccl-device-lsa"
+ self._deferred_jit_snapshot = None
+ self.buffer = ElasticBuffer(
+ self.group,
+ num_max_tokens_per_rank=self.max_tokens,
+ hidden=args.hidden,
+ num_topk=args.topk,
+ use_fp8_dispatch=False,
+ deterministic=False,
+ allow_hybrid_mode=allow_hybrid_mode,
+ allow_multiple_reduction=True,
+ prefer_overlap_with_compute=True,
+ num_gpu_timeout_secs=100,
+ explicitly_destroy=True,
+ )
+ tuning_num_experts = int(getattr(args, "num_logical_experts", args.experts))
+ self.num_sms = int(
+ self.buffer.get_theoretical_num_sms(tuning_num_experts, args.topk)
+ )
+ self.num_qps = int(self.buffer.get_theoretical_num_qps(self.num_sms))
+ properties = torch.cuda.get_device_properties(device)
+ device_sms = int(properties.multi_processor_count)
+ jit_config = {
+ "num_sms": self.num_sms,
+ "num_qps": self.num_qps,
+ "allocated_qps": int(self.buffer.num_allocated_qps),
+ "logical_scaleout_ranks": int(self.buffer.num_scaleout_ranks),
+ "logical_scaleup_ranks": int(self.buffer.num_scaleup_ranks),
+ "physical_rdma_ranks": int(self.buffer.num_rdma_ranks),
+ "physical_nvlink_ranks": int(self.buffer.num_nvlink_ranks),
+ "is_scaleup_nvlink": self.buffer.num_scaleup_ranks == self.buffer.num_nvlink_ranks,
+ "device_arch_major": int(properties.major),
+ "device_arch_minor": int(properties.minor),
+ "device_sms": device_sms,
+ "device_smem_bytes": int(properties.shared_memory_per_block_optin),
+ "gpu_timeout_cycles": 100 * int(properties.clock_rate) * 1000,
+ }
+ _require_cross_rank_equal(jit_config, "JIT configuration")
+ if not _lsa_topology_is_valid(
+ gin_enabled, world_size, scale_up_domain, jit_config
+ ):
+ raise RuntimeError("DeepEP V2 realized communication domains differ from topology")
+ self.jit_cache_key = _jit_cache_key(
+ args, world_size, self.max_tokens, allow_hybrid_mode, jit_config
+ )
+ os.environ["EP_JIT_CACHE_DIR"] = str(jit_root / self.jit_cache_key)
+ realized_config = {
+ "jit_cache_key": self.jit_cache_key,
+ "num_max_tokens_per_rank": self.max_tokens,
+ **jit_config,
+ }
+ _require_cross_rank_equal(realized_config, "realized tuning/topology")
+ comm = getattr(self.buffer, "nccl_comm_handle", None)
+ communicator = (
+ "deepep-managed" if getattr(comm, "managed", True) else "pytorch-reused"
+ )
+
+ loaded_libraries = _loaded_library_evidence()
+ _require_cross_rank_equal(loaded_libraries, "loaded libraries")
+ self.backend_provenance = {
+ "deepep_version": DEEPEP_V2_VERSION,
+ "deepep_distribution_version": importlib.metadata.version("deep_ep"),
+ "deepep_commit": DEEPEP_V2_COMMIT,
+ "deepep_tree": DEEPEP_V2_TREE,
+ "deepep_pr": DEEPEP_V2_PR,
+ "deepep_fix_pr": DEEPEP_V2_FIX_PR,
+ "fmt_commit": DEEPEP_V2_FMT_COMMIT,
+ "api": "deep_ep.ElasticBuffer",
+ "api_signature_sha256": _api_sha256(),
+ "communication_backend": communication_backend,
+ "gin_enabled": gin_enabled,
+ "nccl_communicator": communicator,
+ "torch_version": torch_version,
+ "torch_git_version": str(torch.version.git_version),
+ "cuda_version": str(torch.version.cuda),
+ "nccl_package_version": importlib.metadata.version("nvidia-nccl-cu13"),
+ "nccl_version": nccl_runtime_version,
+ "nvshmem_package_version": importlib.metadata.version("nvidia-nvshmem-cu12"),
+ "loaded_libraries": loaded_libraries,
+ "jit_cache_key": self.jit_cache_key,
+ "jit_cubins": [],
+ "jit_random_seed": DEEPEP_V2_JIT_RANDOM_SEED,
+ "num_experts": int(args.experts),
+ "mode": "normal",
+ "dispatch_dtype": "bf16",
+ "combine_dtype": "bf16",
+ "deterministic": False,
+ "resource_mode": "tuned",
+ "requested_num_sms": self.num_sms,
+ "tuning_num_experts": tuning_num_experts,
+ "num_sms": self.num_sms,
+ "num_qps": self.num_qps,
+ "allocated_qps": int(self.buffer.num_allocated_qps),
+ "device_sms": device_sms,
+ "sm_fraction": self.num_sms / device_sms,
+ "tuned_source": "deepep-v2-analytical-sm-qp-logical-experts-v1",
+ "num_max_tokens_per_rank": self.max_tokens,
+ "allow_hybrid_mode": bool(self.buffer.allow_hybrid_mode),
+ "allow_multiple_reduction": bool(self.buffer.allow_multiple_reduction),
+ "prefer_overlap_with_compute": bool(
+ self.buffer.prefer_overlap_with_compute
+ ),
+ "logical_scaleout_ranks": int(self.buffer.num_scaleout_ranks),
+ "logical_scaleup_ranks": int(self.buffer.num_scaleup_ranks),
+ "physical_rdma_ranks": int(self.buffer.num_rdma_ranks),
+ "physical_nvlink_ranks": int(self.buffer.num_nvlink_ranks),
+ }
+
+    def buffer_cap(self, args):
+        """Return the per-rank token ceiling stored on this adapter.
+
+        ``args`` is part of the backend protocol signature and is not consulted here.
+        """
+        ceiling = self.max_tokens
+        return ceiling
+
+    def make_problem(self, T, idx, weights, x):
+        """Bundle one rank's trace slice into the namespace the adapter methods read.
+
+        ``T`` and ``x`` are stored as given; ``idx`` is cast to ``deep_ep.topk_idx_t``
+        and ``weights`` to float32.
+        """
+        problem = types.SimpleNamespace(T=T, x=x)
+        problem.topk_idx = idx.to(deep_ep.topk_idx_t)
+        problem.topk_weights = weights.to(torch.float32)
+        return problem
+
+    def dispatch(self, p):
+        """Run one buffer dispatch for problem ``p`` and wrap what was received.
+
+        Returns a namespace with ``recv_x``, ``recv_topk_idx``, ``recv_topk_weights``
+        and ``handle``; the fifth value returned by the buffer is discarded.
+        """
+        options = {
+            "topk_idx": p.topk_idx,
+            "topk_weights": p.topk_weights,
+            "num_experts": self.args.experts,
+            "num_max_tokens_per_rank": self.max_tokens,
+            "expert_alignment": 1,
+            "num_sms": self.num_sms,
+            "num_qps": self.num_qps,
+            "async_with_compute_stream": False,
+            "do_handle_copy": True,
+            "do_cpu_sync": True,
+            "do_expand": False,
+        }
+        outputs = self.buffer.dispatch(p.x, **options)
+        # Exactly five values are expected; the last one is unused.
+        payload, expert_slots, gate_weights, handle, _ = outputs
+        received = types.SimpleNamespace(recv_x=payload)
+        received.recv_topk_idx = expert_slots
+        received.recv_topk_weights = gate_weights
+        received.handle = handle
+        return received
+
+    def stage(self, p, h):
+        """Point the combine input at the payload received by dispatch."""
+        received_payload = h.recv_x
+        h.combine_input = received_payload
+
+    def combine(self, p, h):
+        """Combine the staged input through the buffer and return the first output.
+
+        The buffer is expected to return exactly three values; the last two are dropped.
+        """
+        outputs = self.buffer.combine(
+            h.combine_input,
+            handle=h.handle,
+            num_sms=self.num_sms,
+            num_qps=self.num_qps,
+            async_with_compute_stream=False,
+        )
+        result, _, _ = outputs
+        return result
+
+    def capture_deferred_provenance(self):
+        """Record the JIT CUBIN evidence once every kernel used here has been built.
+
+        Raises RuntimeError when an earlier snapshot exists and the new evidence differs.
+        """
+        # destroy() issues this same barrier, so run it now: its JIT kernel must exist
+        # before the implementation is hashed, otherwise the first routing case would
+        # record different evidence from later ones.
+        self.buffer.barrier(use_comm_stream=True, with_cpu_sync=True)
+        torch.cuda.synchronize()
+        evidence = _jit_artifact_evidence()
+        _require_cross_rank_equal(evidence, "JIT CUBINs")
+        previous = self._deferred_jit_snapshot
+        if previous is not None and evidence != previous:
+            raise RuntimeError("DeepEP V2 JIT CUBIN set changed after measurement")
+        self._deferred_jit_snapshot = evidence
+        self.backend_provenance["jit_cubins"] = evidence
+
+    def inspect_dispatch(self, p, h):
+        """Return a normalized view of the rows this rank received from dispatch.
+
+        Only the first ``recv_tokens(h)`` rows are exposed. Non-negative local expert
+        slots are shifted by ``rank * experts_per_rank``; negative slots are left
+        unchanged and their weights are zeroed.
+        """
+        received = self.recv_tokens(h)
+        experts_per_rank = self.args.experts // self.world_size
+        local_slots = h.recv_topk_idx[:received]
+        is_valid = local_slots >= 0
+        shifted = local_slots + self.rank * experts_per_rank
+        global_ids = torch.where(is_valid, shifted, local_slots)
+        valid_slots = local_slots[is_valid].to(torch.int64)
+        view = types.SimpleNamespace(payload=h.recv_x[:received])
+        view.expert_ids = global_ids
+        view.weights = h.recv_topk_weights[:received].masked_fill(~is_valid, 0)
+        view.local_expert_counts = torch.bincount(valid_slots, minlength=experts_per_rank)
+        view.ordering_contract = "elastic-source-metadata-v1"
+        return view
+
+    def combine_transformed(self, p, h, transformed):
+        """Combine ``transformed`` rows after placing them in a zeroed receive-shaped buffer.
+
+        The buffer has the shape and dtype of ``h.recv_x``; rows beyond
+        ``transformed.shape[0]`` stay zero.
+        """
+        staged = torch.zeros_like(h.recv_x)
+        row_count = transformed.shape[0]
+        staged[:row_count].copy_(transformed.to(staged.dtype))
+        outputs = self.buffer.combine(
+            staged,
+            handle=h.handle,
+            num_sms=self.num_sms,
+            num_qps=self.num_qps,
+            async_with_compute_stream=False,
+        )
+        result, _, _ = outputs
+        return result
+
+    def recv_tokens(self, h):
+        """Return the last entry of the handle's per-scale-up-rank receive counter as an int."""
+        # NOTE(review): the field name suggests a prefix sum, so the last entry would be
+        # the total received by this rank — confirm against the DeepEP handle layout.
+        counters = h.handle.psum_num_recv_tokens_per_scaleup_rank
+        return int(counters[-1].item())
+
+    def finalize(self, rc):
+        """Tear down the buffer and the process group, then return the exit code.
+
+        Returns ``rc`` when teardown succeeds and ``1`` when any teardown step raises.
+        Teardown is best-effort, so the exception is not propagated, but it is reported
+        on stderr so that a failing exit code can be traced to its cause.
+        """
+        import sys
+
+        try:
+            dist.barrier()
+            self.buffer.destroy()
+            dist.barrier()
+            dist.destroy_process_group()
+        except Exception as error:
+            print(
+                f"DeepEP V2 teardown failed: {type(error).__name__}: {error}",
+                file=sys.stderr,
+                flush=True,
+            )
+            return 1
+        return rc
diff --git a/experimental/CollectiveX/tests/ep_harness.py b/experimental/CollectiveX/tests/ep_harness.py
new file mode 100644
index 0000000000..c9a022a484
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_harness.py
@@ -0,0 +1,1780 @@
+#!/usr/bin/env python3
+"""CollectiveX — shared EP (expert-parallel) dispatch/combine benchmark harness.
+
+Backend-agnostic core. The per-backend adapters (`ep_deepep.py`, `ep_mori.py`)
+implement a small duck-typed protocol; this module owns the source-tokens-per-rank
+sweep, the timing, the correctness gate, and the provenance-tagged JSON doc.
+
+Fair-comparison contract (see docs/methodology.md):
+ * **Deterministic shared routing trace** (`routing.py`): the per-token expert IDs +
+ gate weights are generated once from a fixed seed over the *global* batch and are
+ identical on every SKU; each rank materializes its slice. So every platform runs
+ the *same* problem (no per-rank/per-platform RNG in the adapters).
+ * **Explicit measurement contract**: layout-and-dispatch-v1 includes routing-layout
+ generation in dispatch timing. Combine excludes staging.
+ Isolated sum is derived independently at each percentile and is not a measured chained op.
+ * **Correct collective percentile**: each iteration's latency is reduced MAX across
+ ranks first (a collective finishes with its slowest rank), THEN percentiled —
+ `median_i(max_r)`, not `max_r(median_i)`.
+ * **One line = one fixed config**; only T varies. Both `tokens_per_rank` and
+ `global_tokens = T * ep_size` are recorded as explicit chart coordinates.
+
+No third-party imports at module top — only the stdlib plus the sibling `contracts`,
+`identity`, and `workload` modules (torch is passed in by the entrypoint; `routing` is
+imported lazily inside run_sweep) — so this file `py_compile`s without torch.
+
+Backend protocol:
+ name, mode, combine_needs_redispatch, backend_provenance(dict)
+ buffer_cap(args) -> int|None
+ make_problem(T, idx, weights, x) -> problem # materialize this rank's trace slice
+ dispatch(problem) -> handle # pure dispatch comm (timed)
+ stage(problem, handle) # untimed expert-output placement
+ combine(problem, handle) -> tensor # pure combine comm (timed)
+ inspect_dispatch(problem, handle) -> view # normalized payload/expert/weight metadata
+ combine_transformed(problem, handle, tensor) -> tensor
+ recv_tokens(handle) -> int # realized tokens received this rank
+ finalize(rc) -> int|NoReturn
+"""
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import hashlib
+import json
+import math
+import os
+import types
+
+import contracts
+import identity
+import workload as workload_contract
+
+# Raw v1 result emitted by one benchmark case. Publication uses a separate contract.
+SCHEMA_VERSION = 1
+
+# Every comparison-grade EP point uses the same literal timing profile on every SKU/backend.
+# Eight timed iterations keep each MoRI burst well below its sustained-iteration wedge, 64 trials
+# provide 512 observations per operation, and 32 warmups meet Blackwell's measured clock-ramp floor.
+SAMPLING_CONTRACT = identity.V1_CASE_PROFILE["sampling_contract"]
+# 512 = TIMED_ITERS_PER_TRIAL (8) x TRIALS_PER_POINT (64).
+TIMED_SAMPLES_PER_POINT = 512
+TIMED_ITERS_PER_TRIAL = 8
+TRIALS_PER_POINT = 64
+WARMUP_ITERS_PER_TRIAL = 32
+WARMUP_SEMANTICS = "full-roundtrip-before-each-component-trial-point-v1"
+# Default for the --seed CLI option.
+ROUTING_SEED = 67
+ROUTING_GENERATOR = workload_contract.GENERATOR_VERSION
+ACTIVATION_PROFILE = "canonical-counter-source-v3"
+ACTIVATION_GENERATOR = workload_contract.ACTIVATION_GENERATOR
+PLACEMENT = "packed"
+COMPONENT_ORDER_CONTRACT = "roundtrip-dispatch-activation-only-combine-v2"
+# Low-latency mode: the alternate --mode choice and the contract labels named after it.
+LOW_LATENCY_MODE = "low-latency"
+LOW_LATENCY_MAX_TOKENS_PER_RANK = 128
+LOW_LATENCY_MEASUREMENT_CONTRACT = "expert-packed-weighted-combine-v1"
+LOW_LATENCY_COMPONENT_ORDER_CONTRACT = "roundtrip-dispatch-gate-weighted-combine-v1"
+LOW_LATENCY_ORACLE_CONTRACT = "expert-assignment-transform-v1"
+LOW_LATENCY_CORRECTNESS_SCOPE = "expert-assignment-and-weighted-combine"
+
+# Phase-default sweeps — token-size regimes, NOT distinct kernels (both run normal
+# mode; "decode"/"prefill" name the small/large-token regime). Powers of two for a
+# clean log x-axis; clamped to the backend buffer ceiling (MoRI's registerable heap).
+DECODE_LADDER = [1, 2, 4, 8, 16, 32, 64, 128]
+PREFILL_LADDER = [128, 256, 512, 1024, 2048, 4096]
+# Each ladder is copied with list(), so this module holds its own list objects rather
+# than the ones stored in contracts.
+CONDITIONING_LADDERS = {
+ phase: list(ladder) for phase, ladder in contracts.V1_CONDITIONING_LADDERS.items()
+}
+CONDITIONING_ROUNDS_PER_SHAPE = contracts.V1_CONDITIONING_ROUNDS_PER_SHAPE
+CONDITIONING_CONTRACT = identity.V1_CASE_PROFILE["conditioning_contract"]
+ORACLE_CONTRACT = identity.V1_CASE_PROFILE["oracle_contract"]
+# Relative/absolute tolerances handed to torch.allclose when the oracle compares
+# combined outputs against the expected values.
+ORACLE_RTOL = 5e-2
+ORACLE_ATOL = 2e-2
+
+# Size of one bf16 element in bytes.
+BF16_BYTES = 2
+EPLB_REDUNDANT_EXPERTS = 32
+EPLB_REFERENCE_TOKENS_PER_RANK = 2048
+EPLB_PLANNER = "greedy-rank-major-v1"
+# Fixed v1 profile values; add_common_args installs them as argparse defaults instead
+# of exposing each one as a CLI flag.
+V1_PROFILE = {
+ "dispatch_dtype": "bf16",
+ "combine_dtype": "bf16",
+ "combine_quant_mode": "none",
+ "mode": "normal",
+ "measurement_contract": "layout-and-dispatch-v1",
+ "resource_mode": "tuned",
+ "placement": PLACEMENT,
+ "activation_profile": ACTIVATION_PROFILE,
+ "activation_generator": ACTIVATION_GENERATOR,
+ "routing_generator": ROUTING_GENERATOR,
+ "component_order_contract": COMPONENT_ORDER_CONTRACT,
+ "conditioning_contract": CONDITIONING_CONTRACT,
+ "eplb_reference_tokens_per_rank": EPLB_REFERENCE_TOKENS_PER_RANK,
+ "eplb_redundant_experts": EPLB_REDUNDANT_EXPERTS,
+ "eplb_planner": EPLB_PLANNER,
+ # DeepEP/UCCL use this only as the fallback when their tuned default is not exported.
+ "num_sms": 24,
+}
+
+def format_collective_version(raw) -> str:
+    """Render a collective-library version as dotted text.
+
+    Integers are decoded as packed codes: values below 10_000 as
+    ``major*1000 + minor*100 + patch``, larger ones as
+    ``major*10_000 + minor*100 + patch``. Tuples and lists are joined with dots,
+    ``None`` and ``""`` become ``"unknown"``, and anything else goes through ``str``.
+    """
+    if isinstance(raw, (tuple, list)):
+        return ".".join(str(part) for part in raw)
+    if not isinstance(raw, int):
+        return "unknown" if raw in (None, "") else str(raw)
+    patch = raw % 100
+    if raw >= 10_000:
+        major, minor = raw // 10_000, raw // 100 % 100
+    else:
+        major, minor = raw // 1000, raw // 100 % 10
+    return f"{major}.{minor}.{patch}"
+
+
+def add_common_args(ap: argparse.ArgumentParser) -> None:
+    """Register the per-case CLI inputs on ``ap``.
+
+    The fixed v1 profile is installed through ``set_defaults`` rather than exposed as
+    flags; only the inputs that vary between cases become options.
+    """
+    add = ap.add_argument
+    ap.set_defaults(**V1_PROFILE)
+
+    # Workload shape and routing.
+    add("--mode", default="normal", choices=["normal", LOW_LATENCY_MODE])
+    add(
+        "--phase",
+        default="decode",
+        choices=["decode", "prefill"],
+        help="token-size regime: decode (small T) / prefill (large T) — picks the default ladder",
+    )
+    add(
+        "--tokens-ladder",
+        default="",
+        help="space/comma-separated source-tokens-per-rank sweep; blank = phase default",
+    )
+    add("--hidden", type=int, default=7168)
+    add("--topk", type=int, default=8)
+    add("--experts", type=int, default=256, help="TOTAL experts (fixed across EP degrees)")
+    add("--routing", default="uniform", choices=["uniform", "zipf"])
+    # EPLB (Expert-Parallel Load Balancer) replicates hot experts onto redundant physical
+    # slots and places them so per-rank load equalizes. It is a pure routing-trace
+    # transform (tests/eplb.py); experts becomes num_logical+redundant. It is the remedy
+    # for `zipf` skew.
+    add(
+        "--eplb",
+        action="store_true",
+        help="apply EPLB expert replication/placement to the routing trace",
+    )
+    # Canonical workloads read pre-generated trace bytes instead of running the seeded
+    # generator, so a checksum match proves two machines ran the SAME workload. The
+    # option points at a dir of .npz/.manifest.json files (make_workloads.py).
+    add(
+        "--workload-dir",
+        default="",
+        help="dir of canonical workload traces; empty = seeded runtime generation (dev)",
+    )
+    for flag in ("--case-id", "--suite", "--workload-name", "--required-publication"):
+        add(flag, default="")
+    add("--seed", type=int, default=ROUTING_SEED)
+
+    # Sampling profile. Warmup is 32 because B300/Blackwell needs ~30 untimed iters to
+    # reach steady-state GPU clocks and establish NVLink/NVSHMEM connections: at warmup=8
+    # its dispatch read ~1787us (cold), at warmup>=30 it settles to ~85us (faster than
+    # H100, reproducible within ~2.5%). H100/MI355X reach steady state much sooner; the
+    # extra iters are harmless.
+    warmup_help = (
+        "untimed full roundtrips before each trial/point; fixed by "
+        f"{SAMPLING_CONTRACT} to {WARMUP_ITERS_PER_TRIAL}"
+    )
+    iters_help = (
+        f"timed iterations per trial; fixed by {SAMPLING_CONTRACT} to "
+        f"{TIMED_ITERS_PER_TRIAL}"
+    )
+    trials_help = f"timed trials; fixed by {SAMPLING_CONTRACT} to {TRIALS_PER_POINT}"
+    add("--warmup", type=int, default=WARMUP_ITERS_PER_TRIAL, help=warmup_help)
+    add("--iters", type=int, default=TIMED_ITERS_PER_TRIAL, help=iters_help)
+    add("--trials", type=int, default=TRIALS_PER_POINT, help=trials_help)
+
+    # Provenance and output.
+    add("--runner", required=True)
+    add("--topology-class", required=True)
+    add("--transport", default="")
+    add("--scope", required=True, choices=["scale-up", "scale-out"])
+    add("--scale-up-transport", required=True)
+    add("--scale-out-transport", default="")
+    # gpus-per-node=0 means one node containing the whole EP group.
+    add("--gpus-per-node", type=int, default=0)
+    add(
+        "--scale-up-domain",
+        type=int,
+        default=0,
+        help="0 = gpus_per_node*ep (one domain)",
+    )
+    add("--timestamp")
+    add("--out", required=True)
+
+
+def token_ladder(spec: str, phase: str, cap: int | None) -> tuple[list[int], list[int]]:
+    """Resolve the source-tokens-per-rank sweep and report what the cap removed.
+
+    A non-blank ``spec`` (space/comma separated ints) takes priority over the phase
+    default. Non-positive values are discarded and the rest is deduplicated and
+    sorted. Returns ``(kept, dropped)``, where ``dropped`` holds the points above
+    ``cap``; with ``cap=None`` nothing is dropped.
+    """
+    if spec and spec.strip():
+        requested = [int(token) for token in spec.replace(",", " ").split() if token]
+    elif phase == "decode":
+        requested = DECODE_LADDER
+    else:
+        requested = PREFILL_LADDER
+    ladder = sorted({tokens for tokens in requested if tokens > 0})
+    if cap is None:
+        return ladder, []
+    kept = [tokens for tokens in ladder if tokens <= cap]
+    dropped = [tokens for tokens in ladder if tokens > cap]
+    return kept, dropped
+
+
+def sampling_contract_error(iters: int, trials: int, warmup: int) -> str | None:
+    """Describe how the given timing profile departs from the fixed cross-SKU one.
+
+    Returns ``None`` when iters/trials/warmup match the module constants exactly,
+    otherwise a user-facing message naming the required and the observed values.
+    """
+    required = (TIMED_ITERS_PER_TRIAL, TRIALS_PER_POINT, WARMUP_ITERS_PER_TRIAL)
+    if (iters, trials, warmup) == required:
+        return None
+    timed_samples = iters * trials if iters > 0 and trials > 0 else "invalid"
+    return (
+        f"{SAMPLING_CONTRACT} requires exactly iters:trials:warmup="
+        f"{required[0]}:{required[1]}:{required[2]} on every SKU/backend; got "
+        f"{iters}:{trials}:{warmup} "
+        f"({timed_samples} timed samples)"
+    )
+
+
+def _stats_vec(xs: list[int]) -> dict:
+    """Summarize a per-rank count vector without emitting the vector itself.
+
+    Reports min, mean, max, coefficient of variation (population form), the number
+    of zero entries, the total, and the rank count. An empty vector is summarized
+    with a divisor of 1, so it reports zeros and ``ranks == 1``.
+    """
+    ranks = len(xs) or 1
+    total = sum(xs)
+    mean = total / ranks
+    variance = sum((value - mean) ** 2 for value in xs) / ranks
+    spread = variance ** 0.5 / mean if mean > 0 else 0.0
+    lowest, highest = (min(xs), max(xs)) if xs else (0, 0)
+    return {
+        "min": lowest,
+        "mean": round(mean, 3),
+        "max": highest,
+        "cv": round(spread, 4),
+        "empty_ranks": len([value for value in xs if value == 0]),
+        "total": total,
+        "ranks": ranks,
+    }
+
+
+def percentile(xs: list[float], q: float) -> float:
+    """Return the nearest-rank ``q``-th percentile of ``xs`` (NaN when ``xs`` is empty).
+
+    The rank is ``ceil(q/100 * n)`` (1-based) clamped to the available samples, so
+    the result is always one of the observed values.
+    """
+    if not xs:
+        return float("nan")
+    ordered = sorted(xs)
+    last = len(ordered) - 1
+    position = math.ceil(q / 100.0 * len(ordered)) - 1
+    if position < 0:
+        position = 0
+    elif position > last:
+        position = last
+    return ordered[position]
+
+
+def _sha256_json(value) -> str:
+    """Hash ``value`` through a canonical JSON encoding and return the hex digest.
+
+    Keys are sorted, separators are compact, non-ASCII text is kept literal, and
+    NaN/Infinity are rejected by ``json.dumps`` (ValueError).
+    """
+    canonical = json.dumps(
+        value,
+        sort_keys=True,
+        separators=(",", ":"),
+        ensure_ascii=False,
+        allow_nan=False,
+    )
+    return hashlib.sha256(canonical.encode()).hexdigest()
+
+
+def _series_provenance(provenance: dict) -> dict:
+    """Return the series-level view of ``provenance`` built by ``contracts.series_provenance``.
+
+    The intent recorded for this helper is to keep the stable semantic build identity
+    while leaving raw binaries as diagnostic detail; the reduction itself lives in
+    ``contracts``.
+    """
+    reduced = contracts.series_provenance(provenance)
+    return reduced
+
+
+def _write_bytes_atomic(path: str, payload: bytes) -> tuple[str, int]:
+    """Write ``payload`` to ``path`` through a temporary sibling file and ``os.replace``.
+
+    Missing parent directories are created. The temporary file is flushed and
+    fsynced before the rename and removed afterwards if it still exists. Returns
+    the payload's SHA-256 hex digest and its length in bytes.
+    """
+    parent = os.path.dirname(os.path.abspath(path))
+    os.makedirs(parent, exist_ok=True)
+    scratch = f"{path}.tmp-{os.getpid()}"
+    try:
+        with open(scratch, "wb") as stream:
+            stream.write(payload)
+            stream.flush()
+            os.fsync(stream.fileno())
+        os.replace(scratch, path)
+    finally:
+        # After a successful replace the scratch name no longer exists; it only
+        # remains when a step above failed.
+        try:
+            os.unlink(scratch)
+        except FileNotFoundError:
+            pass
+    digest = hashlib.sha256(payload).hexdigest()
+    return digest, len(payload)
+
+
+def _write_json_atomic(path: str, value) -> tuple[str, int]:
+    """Serialize ``value`` as indented JSON with a trailing newline and write it atomically.
+
+    NaN/Infinity are rejected. Returns the digest and size reported by
+    ``_write_bytes_atomic``.
+    """
+    text = json.dumps(value, allow_nan=False, ensure_ascii=False, indent=2)
+    return _write_bytes_atomic(path, f"{text}\n".encode())
+
+
+def time_us(torch, fn, warmup: int, iters: int, pre=None) -> list[float]:
+    """Measure per-iteration CUDA-event latencies (µs) on THIS rank.
+
+    With no ``pre`` the timed call is ``fn()``. With ``pre``, every iteration first
+    runs ``pre()`` untimed and synchronizes so its GPU work cannot leak into the
+    measurement, then times ``fn(pre_result)`` — this is how combine is isolated when
+    it consumes dispatch state and needs a fresh untimed dispatch+stage per sample.
+    The raw series is returned; reducing across ranks and taking percentiles is the
+    caller's job.
+    """
+    staged = pre is not None
+
+    def prepare():
+        """Run the untimed pre step and drain it before anything else is queued."""
+        state = pre()
+        torch.cuda.synchronize()
+        return state
+
+    for _ in range(max(0, warmup)):
+        if staged:
+            fn(prepare())
+        else:
+            fn()
+        # Synchronize after EVERY warmup iteration rather than once at the end: the
+        # measured roundtrip interleaves dispatch+combine on a backend's persistent comm
+        # buffer, so un-synced back-to-back warmups let iteration N+1's dispatch race
+        # iteration N's combine (CUDA abort on a rank -> NCCL-watchdog SIGABRT). Warmup is
+        # small, so this is cheap; timed samples synchronize anyway.
+        torch.cuda.synchronize()
+
+    latencies = []
+    for _ in range(iters):
+        state = prepare() if staged else None
+        begin = torch.cuda.Event(enable_timing=True)
+        finish = torch.cuda.Event(enable_timing=True)
+        begin.record()
+        if staged:
+            fn(state)
+        else:
+            fn()
+        finish.record()
+        torch.cuda.synchronize()
+        latencies.append(begin.elapsed_time(finish) * 1000.0)  # ms -> us
+    return latencies
+
+
+def kernel_generation(backend) -> str:
+    """Name the kernel family behind ``backend``.
+
+    A truthy ``kernel_generation`` attribute on the adapter wins; otherwise the family
+    is looked up from ``backend.name``, with ``"n-a"`` for names not in the table.
+    """
+    explicit = getattr(backend, "kernel_generation", None)
+    families = {
+        "deepep": "v1",
+        "deepep-v2": "v2-elastic-buffer",
+        "deepep-hybrid": "hybrid",
+    }
+    return explicit or families.get(backend.name, "n-a")
+
+
+def _reduce_vec(torch, dist, device, vals, op):
+    """All-reduce ``vals`` across ranks as float64 and return the result as Python floats."""
+    buffer = torch.tensor(vals, device=device, dtype=torch.float64)
+    dist.all_reduce(buffer, op=op)
+    return list(map(float, buffer.tolist()))
+
+
+def _reduce_int(torch, dist, device, v: int, op) -> int:
+    """All-reduce one integer across ranks (as int64) and return the reduced value."""
+    cell = torch.tensor([int(v)], device=device, dtype=torch.int64)
+    dist.all_reduce(cell, op=op)
+    reduced = cell.item()
+    return int(reduced)
+
+
+def _same_hash_across_ranks(torch, dist, device, digest: str) -> bool:
+    """Report whether every rank holds the same 64-hex-character digest.
+
+    The digest is split into eight 32-bit words; the words are reduced with MIN and
+    with MAX across ranks, and the two results agree only when all ranks contributed
+    identical words.
+    """
+    words = [int(digest[start:start + 8], 16) for start in range(0, 64, 8)]
+    minimum = torch.tensor(words, device=device, dtype=torch.int64)
+    maximum = minimum.clone()
+    dist.all_reduce(minimum, op=dist.ReduceOp.MIN)
+    dist.all_reduce(maximum, op=dist.ReduceOp.MAX)
+    agreed = torch.equal(minimum, maximum)
+    return bool(agreed)
+
+
+def _tensor_sha256(*tensors) -> str:
+    """Return the SHA-256 hex digest of the tensors' raw bytes, fed in argument order.
+
+    Each tensor is detached, made contiguous, moved to the CPU and converted through
+    NumPy; shapes and dtypes are not part of the hash.
+    """
+    hasher = hashlib.sha256()
+    for item in tensors:
+        raw = item.detach().contiguous().cpu().numpy().tobytes()
+        hasher.update(raw)
+    return hasher.hexdigest()
+
+
+def _normalized_expert_metadata(torch, expert_ids, weights):
+    """Order each row by global expert ID, pushing negative sentinels to the end.
+
+    Returns ``(ids, weights)``: IDs as int64 with every negative entry rewritten to
+    -1, and float32 weights permuted the same way with sentinel positions zeroed.
+    """
+    ids64 = expert_ids.to(torch.int64)
+    # Sentinels sort after every real ID by borrowing a large key.
+    sentinel_key = torch.full_like(expert_ids, 1 << 30)
+    sort_keys = torch.where(expert_ids >= 0, ids64, sentinel_key)
+    permutation = torch.argsort(sort_keys, dim=1, stable=True)
+    ordered_ids = torch.gather(ids64, 1, permutation)
+    ordered_weights = torch.gather(weights.to(torch.float32), 1, permutation)
+    padding = ordered_ids < 0
+    normalized_ids = torch.where(padding, torch.full_like(ordered_ids, -1), ordered_ids)
+    return normalized_ids, ordered_weights.masked_fill(padding, 0)
+
+
+def expert_packed_slot_map(
+    counts,
+    src_info,
+    layout_range,
+    *,
+    tokens_per_rank: int,
+    experts_per_rank: int,
+    world_size: int,
+) -> list[tuple[int, int, int]]:
+    """Decode and validate DeepEP's expert-packed receive metadata.
+
+    ``src_info`` holds a source-local token index; the source rank comes from the
+    ``layout_range`` interval that covers the packed position, so the two fields
+    must be read together to identify a source token. Each ``layout_range`` entry
+    packs the interval start in its high 32 bits and the interval length in its
+    low 32 bits.
+
+    Returns ``(local_expert, packed_position, source_id)`` triples, where
+    ``source_id = source_rank * tokens_per_rank + local_source``. Raises
+    ``ValueError`` on any malformed, overlapping, incomplete, or duplicated
+    metadata.
+    """
+    if any(dim <= 0 for dim in (tokens_per_rank, experts_per_rank, world_size)):
+        raise ValueError("expert-packed dimensions must be positive")
+    if len(counts) != experts_per_rank:
+        raise ValueError("expert-packed count shape differs from local experts")
+    if len(src_info) != experts_per_rank or len(layout_range) != experts_per_rank:
+        raise ValueError("expert-packed metadata shape differs from local experts")
+
+    low_word = 0xFFFFFFFF
+    assignments: list[tuple[int, int, int]] = []
+    seen: set[tuple[int, int]] = set()
+    for expert_slot in range(experts_per_rank):
+        count = counts[expert_slot]
+        if type(count) is not int or count < 0:
+            raise ValueError("expert-packed receive count is invalid")
+        intervals = layout_range[expert_slot]
+        if len(intervals) != world_size:
+            raise ValueError("expert-packed layout rank dimension is invalid")
+        sources = src_info[expert_slot]
+        if len(sources) < count:
+            raise ValueError("expert-packed source metadata is truncated")
+
+        filled = [False] * count
+        for source_rank, encoded in enumerate(intervals):
+            if type(encoded) is not int or encoded < 0:
+                raise ValueError("expert-packed layout range is invalid")
+            begin = encoded >> 32
+            end = begin + (encoded & low_word)
+            if begin > count or end > count:
+                raise ValueError("expert-packed layout range exceeds valid slots")
+            for position in range(begin, end):
+                if filled[position]:
+                    raise ValueError("expert-packed layout ranges overlap")
+                filled[position] = True
+                token = sources[position]
+                if type(token) is not int or not 0 <= token < tokens_per_rank:
+                    raise ValueError("expert-packed source token index is invalid")
+                source_id = source_rank * tokens_per_rank + token
+                key = (source_id, expert_slot)
+                if key in seen:
+                    raise ValueError("expert-packed source/expert assignment is duplicated")
+                seen.add(key)
+                assignments.append((expert_slot, position, source_id))
+        if False in filled:
+            raise ValueError("expert-packed layout ranges omit valid receive slots")
+    return assignments
+
+
+def expert_packed_dispatch_view(
+    torch,
+    packed_payload,
+    packed_counts,
+    packed_src_info,
+    packed_layout_range,
+    *,
+    rank: int,
+    tokens_per_rank: int,
+    experts_per_rank: int,
+    world_size: int,
+):
+    """Return the valid expert-packed rows together with their exact global source IDs.
+
+    The tensor shapes are validated first, then the metadata is decoded through
+    ``expert_packed_slot_map``. The resulting namespace carries the gathered payload
+    rows, the source and global expert IDs, the per-expert counts, and the
+    (local expert, packed position) coordinates of every row. Raises ``ValueError``
+    when a shape or the metadata is inconsistent.
+    """
+    if packed_payload.ndim != 3:
+        raise ValueError("expert-packed payload must have shape [experts, slots, hidden]")
+    if packed_payload.shape[0] != experts_per_rank:
+        raise ValueError("expert-packed payload expert dimension is invalid")
+    per_expert = (experts_per_rank,)
+    if tuple(packed_counts.shape) != per_expert:
+        raise ValueError("expert-packed count tensor shape is invalid")
+    if tuple(packed_src_info.shape[:1]) != per_expert:
+        raise ValueError("expert-packed source tensor shape is invalid")
+    if tuple(packed_layout_range.shape) != (experts_per_rank, world_size):
+        raise ValueError("expert-packed layout tensor shape is invalid")
+    slot_capacity = packed_payload.shape[1]
+    if packed_src_info.ndim != 2 or packed_src_info.shape[1] < slot_capacity:
+        raise ValueError("expert-packed source tensor capacity is invalid")
+
+    counts = [int(value) for value in packed_counts.detach().cpu().tolist()]
+    for count in counts:
+        if count > slot_capacity:
+            raise ValueError("expert-packed receive count exceeds payload capacity")
+    slots = expert_packed_slot_map(
+        counts,
+        packed_src_info.detach().cpu().tolist(),
+        packed_layout_range.detach().cpu().tolist(),
+        tokens_per_rank=tokens_per_rank,
+        experts_per_rank=experts_per_rank,
+        world_size=world_size,
+    )
+
+    def column(field: int):
+        """Lift one field of every decoded slot onto the payload's device as int64."""
+        return torch.tensor(
+            [slot[field] for slot in slots], device=packed_payload.device, dtype=torch.int64
+        )
+
+    local_expert_slots = column(0)
+    packed_positions = column(1)
+    source_ids = column(2)
+    return types.SimpleNamespace(
+        payload=packed_payload[local_expert_slots, packed_positions],
+        source_ids=source_ids,
+        expert_ids=local_expert_slots + rank * experts_per_rank,
+        local_expert_counts=packed_counts.to(torch.int64),
+        local_expert_slots=local_expert_slots,
+        packed_positions=packed_positions,
+        ordering_contract="expert-major/layout-addressed-packed-slot-v1",
+    )
+
+
+def _expert_transform(torch, payload, expert_ids, weights, combine_weight_semantics):
+    """Build one local expert aggregate for the v1 unweighted combine contract.
+
+    Every valid (non-negative) expert slot contributes gate-weighted scale and offset
+    coefficients derived from its expert ID; negative slots contribute nothing. The
+    result has ``payload``'s dtype. Raises ``ValueError`` unless
+    ``combine_weight_semantics`` is ``"unweighted-rank-sum"``.
+    """
+    if combine_weight_semantics != "unweighted-rank-sum":
+        raise ValueError("v1 requires unweighted rank-sum combine")
+    ids = expert_ids.clamp(min=0).to(torch.int64)
+    gates = weights.to(torch.float32).masked_fill(expert_ids < 0, 0)
+
+    def gated_sum(coefficient):
+        """Sum one per-expert coefficient over a row's slots, weighted by the gates."""
+        return (gates * coefficient).sum(dim=1, keepdim=True)
+
+    scale_sum = gated_sum(((ids * 17 + 5) % 31 + 1).to(torch.float32) / 32)
+    offset_a_sum = gated_sum((((ids * 29 + 7) % 37) - 18).to(torch.float32) / 64)
+    offset_b_sum = gated_sum((((ids * 43 + 11) % 41) - 20).to(torch.float32) / 128)
+    hidden = torch.arange(payload.shape[1], device=payload.device, dtype=torch.int64)
+    pattern = (((hidden * 13) % 17) - 8).to(torch.float32) / 8
+    result = payload.float() * scale_sum + offset_a_sum + offset_b_sum * pattern.unsqueeze(0)
+    return result.to(payload.dtype)
+
+
+def _expert_transform_expanded(torch, payload, expert_ids):
+    """Apply the oracle transform to one row per token/expert assignment.
+
+    Each row is scaled and offset by coefficients derived from its own expert ID;
+    no gate weight is applied. The result has ``payload``'s dtype.
+    """
+    ids = expert_ids.to(torch.int64)
+
+    def per_row(values, divisor):
+        """Turn integer coefficients into a float32 column that broadcasts over hidden."""
+        return (values.to(torch.float32) / divisor).unsqueeze(1)
+
+    scale = per_row((ids * 17 + 5) % 31 + 1, 32)
+    offset_a = per_row(((ids * 29 + 7) % 37) - 18, 64)
+    offset_b = per_row(((ids * 43 + 11) % 41) - 20, 128)
+    hidden = torch.arange(payload.shape[1], device=payload.device, dtype=torch.int64)
+    pattern = (((hidden * 13) % 17) - 8).to(torch.float32) / 8
+    result = payload.float() * scale + offset_a + offset_b * pattern.unsqueeze(0)
+    return result.to(payload.dtype)
+
+
+def _expected_transformed_combine(torch, problem):
+    """Independently derive sum_i gate_i * expert_i(x) for each source token.
+
+    Reads ``problem.x``, ``problem.topk_idx`` and ``problem.topk_weights`` and
+    accumulates, one top-k slot at a time, the gate-weighted expert output in
+    float32.
+    """
+    source = problem.x.float()
+    expert_ids = problem.topk_idx.to(torch.int64)
+    gates = problem.topk_weights.to(torch.float32)
+    hidden = torch.arange(problem.x.shape[1], device=problem.x.device, dtype=torch.int64)
+    pattern = ((((hidden * 13) % 17) - 8).to(torch.float32) / 8).unsqueeze(0)
+
+    def per_row(values, divisor):
+        """Turn integer coefficients into a float32 column that broadcasts over hidden."""
+        return (values.to(torch.float32) / divisor).unsqueeze(1)
+
+    total = torch.zeros_like(problem.x, dtype=torch.float32)
+    for slot in range(expert_ids.shape[1]):
+        ids = expert_ids[:, slot]
+        scale = per_row((ids * 17 + 5) % 31 + 1, 32)
+        offset_a = per_row(((ids * 29 + 7) % 37) - 18, 64)
+        offset_b = per_row(((ids * 43 + 11) % 41) - 20, 128)
+        expert_output = source * scale + offset_a + offset_b * pattern
+        total.add_(gates[:, slot].unsqueeze(1) * expert_output)
+    return total
+
+
+def _run_expert_packed_oracle(
+ torch,
+ routing,
+ backend,
+ problem,
+ global_idx,
+ global_weights,
+ rank: int,
+ experts_per_rank: int,
+ seed: int,
+):
+ """Verify an expert-packed dispatch and native gate-weighted combine."""
+ contract = LOW_LATENCY_ORACLE_CONTRACT
+ handle = backend.dispatch(problem)
+ torch.cuda.synchronize()
+ try:
+ packed = backend.inspect_expert_dispatch(problem, handle)
+ view = expert_packed_dispatch_view(
+ torch,
+ packed.payload,
+ packed.local_expert_counts,
+ packed.source_info,
+ packed.layout_range,
+ rank=rank,
+ tokens_per_rank=problem.T,
+ experts_per_rank=experts_per_rank,
+ world_size=backend.world_size,
+ )
+ decoded_source_ids = routing.decode_source_ids(view.payload, seed)
+ except Exception as inspection_error:
+ try:
+ problem.recv_tokens = backend.recv_tokens(handle)
+ backend.stage(problem, handle)
+ backend.combine(problem, handle)
+ torch.cuda.synchronize()
+ except Exception as cleanup_error:
+ raise inspection_error from cleanup_error
+ return {
+ "contract": contract,
+ "passed": False,
+ "ordering_contract": "adapter-inspection-failed",
+ "order_sha256": None,
+ "dispatch_sha256": None,
+ "combine_weight_semantics": getattr(
+ backend, "combine_weight_semantics", "undeclared"
+ ),
+ "receive_count": 0,
+ "atol": ORACLE_ATOL,
+ "max_absolute_error": None,
+ "max_elementwise_relative_error": None,
+ "max_relative_error": None,
+ "max_weight_error": None,
+ "rtol": ORACLE_RTOL,
+ "checks": {
+ "combine_values": False,
+ "counts": False,
+ "metadata": False,
+ "multiplicity": False,
+ "payload": False,
+ "source_set": False,
+ "weights": False,
+ },
+ }
+
+ device = problem.x.device
+ world_size = backend.world_size
+ total_experts = experts_per_rank * world_size
+ global_idx_device = global_idx.to(device=device, dtype=torch.int64)
+ global_weights_device = global_weights.to(device=device, dtype=torch.float32)
+ source_grid = torch.arange(
+ global_idx_device.shape[0], device=device, dtype=torch.int64
+ ).unsqueeze(1).expand_as(global_idx_device)
+ local_mask = (global_idx_device // experts_per_rank) == rank
+ expected_sources = source_grid[local_mask]
+ expected_experts = global_idx_device[local_mask]
+ expected_pair_weights = global_weights_device[local_mask]
+
+ receive_count = int(view.payload.shape[0])
+ shape_ok = (
+ view.payload.ndim == 2
+ and view.source_ids.shape == (receive_count,)
+ and view.expert_ids.shape == (receive_count,)
+ and view.local_expert_counts.shape == (experts_per_rank,)
+ )
+ source_range = bool(
+ receive_count == 0
+ or (
+ (view.source_ids >= 0)
+ & (view.source_ids < global_idx_device.shape[0])
+ ).all().item()
+ )
+ expected_payload = (
+ routing.activations_for_source_ids(
+ view.source_ids, problem.x.shape[1], seed, problem.x.dtype
+ )
+ if source_range
+ else torch.empty_like(view.payload)
+ )
+ payload_ok = bool(
+ source_range
+ and torch.equal(decoded_source_ids.to(torch.int64), view.source_ids)
+ and torch.equal(view.payload, expected_payload)
+ )
+
+ actual_keys = view.source_ids * total_experts + view.expert_ids
+ expected_keys = expected_sources * total_experts + expected_experts
+ actual_order = torch.argsort(actual_keys, stable=True)
+ expected_order = torch.argsort(expected_keys, stable=True)
+ canonical_sources = view.source_ids.index_select(0, actual_order)
+ canonical_experts = view.expert_ids.index_select(0, actual_order)
+ canonical_expected_weights = expected_pair_weights.index_select(0, expected_order)
+ expected_local_idx = global_idx_device[
+ rank * problem.T:(rank + 1) * problem.T
+ ]
+ metadata_ok = bool(
+ shape_ok
+ and torch.equal(problem.topk_idx.to(torch.int64), expected_local_idx)
+ and torch.equal(
+ actual_keys.index_select(0, actual_order),
+ expected_keys.index_select(0, expected_order),
+ )
+ )
+ expected_counts = torch.bincount(
+ expected_experts - rank * experts_per_rank, minlength=experts_per_rank
+ )
+ counts_ok = torch.equal(
+ view.local_expert_counts.to(torch.int64), expected_counts.to(torch.int64)
+ )
+ actual_multiplicity = torch.bincount(
+ view.source_ids, minlength=global_idx_device.shape[0]
+ )
+ expected_multiplicity = torch.bincount(
+ expected_sources, minlength=global_idx_device.shape[0]
+ )
+ multiplicity_ok = torch.equal(actual_multiplicity, expected_multiplicity)
+ source_set_ok = torch.equal(
+ torch.sort(torch.unique(view.source_ids)).values,
+ torch.sort(torch.unique(expected_sources)).values,
+ )
+
+ expected_local_weights = global_weights_device[
+ rank * problem.T:(rank + 1) * problem.T
+ ]
+ if problem.topk_weights.shape == expected_local_weights.shape:
+ max_weight_error = (
+ float((problem.topk_weights.float() - expected_local_weights).abs().max().item())
+ if expected_local_weights.numel()
+ else 0.0
+ )
+ else:
+ max_weight_error = None
+ weights_ok = max_weight_error == 0.0
+ ordering_contract = f"canonical-source-expert-v1/{view.ordering_contract}"
+ order_sha256 = _tensor_sha256(canonical_sources, canonical_experts)
+ dispatch_sha256 = _tensor_sha256(
+ canonical_sources, canonical_experts, canonical_expected_weights
+ )
+
+ handle.oracle_local_expert_slots = view.local_expert_slots
+ handle.oracle_packed_positions = view.packed_positions
+ problem.recv_tokens = receive_count
+ transformed = _expert_transform_expanded(torch, view.payload, view.expert_ids)
+ combined = backend.combine_transformed(problem, handle, transformed)
+ torch.cuda.synchronize()
+ expected_combined = _expected_transformed_combine(torch, problem)
+ if combined.shape == expected_combined.shape and combined.numel():
+ absolute_error = (combined.float() - expected_combined).abs()
+ max_absolute_error = float(absolute_error.max().item())
+ max_relative_error = max_absolute_error / (
+ float(expected_combined.abs().max().item()) + 1e-6
+ )
+ max_elementwise_relative_error = float(
+ (absolute_error / expected_combined.abs().clamp_min(ORACLE_ATOL)).max().item()
+ )
+ combine_values_ok = bool(torch.allclose(
+ combined.float(), expected_combined, rtol=ORACLE_RTOL, atol=ORACLE_ATOL
+ ))
+ elif combined.shape == expected_combined.shape:
+ max_absolute_error = 0.0
+ max_elementwise_relative_error = 0.0
+ max_relative_error = 0.0
+ combine_values_ok = True
+ else:
+ max_absolute_error = None
+ max_elementwise_relative_error = None
+ max_relative_error = None
+ combine_values_ok = False
+ tolerance = float(getattr(backend, "tolerance", ORACLE_RTOL))
+ checks = {
+ "combine_values": combine_values_ok,
+ "counts": counts_ok,
+ "metadata": metadata_ok,
+ "multiplicity": multiplicity_ok,
+ "payload": payload_ok,
+ "source_set": source_set_ok,
+ "weights": weights_ok,
+ }
+ return {
+ "contract": contract,
+ "passed": bool(
+ all(checks.values())
+ and ordering_contract
+ and max_relative_error is not None
+ and max_relative_error < tolerance
+ ),
+ "atol": ORACLE_ATOL,
+ "combine_weight_semantics": backend.combine_weight_semantics,
+ "ordering_contract": ordering_contract,
+ "order_sha256": order_sha256,
+ "dispatch_sha256": dispatch_sha256,
+ "receive_count": receive_count,
+ "max_absolute_error": max_absolute_error,
+ "max_elementwise_relative_error": max_elementwise_relative_error,
+ "max_relative_error": max_relative_error,
+ "max_weight_error": max_weight_error,
+ "rtol": ORACLE_RTOL,
+ "checks": checks,
+ }
+
+
+def _run_expert_oracle(
+ torch,
+ routing,
+ backend,
+ problem,
+ global_idx,
+ global_weights,
+ rank: int,
+ experts_per_rank: int,
+ seed: int,
+):
+ """Verify one real dispatch/transform/combine without entering a timed region.
+
+ Backends whose ``oracle_layout`` attribute equals "expert-packed" are delegated to
+ ``_run_expert_packed_oracle``; a backend that does not declare the attribute is treated
+ as "token-rank" and verified here.
+
+ The token-rank path dispatches ``problem`` once, inspects what this rank received,
+ rebuilds the expected payload and expert metadata from the global routing trace, then
+ combines a transformed payload and compares it with ``_expected_transformed_combine``.
+
+ Args:
+ torch: The torch module, injected by the caller.
+ routing: Helper providing ``decode_source_ids`` and ``activations_for_source_ids``.
+ backend: Adapter under test (dispatch / inspect_dispatch / combine_transformed ...).
+ problem: Per-rank problem; its ``recv_tokens`` attribute is overwritten here.
+ global_idx: Global routing expert ids, indexed by source id along dim 0.
+ global_weights: Global routing weights, indexed like ``global_idx``.
+ rank: This rank; expert ``e`` is treated as local when ``e // experts_per_rank == rank``.
+ experts_per_rank: Number of experts owned by each rank.
+ seed: Forwarded unchanged to the ``routing`` helpers.
+
+ Returns:
+ A dict with "contract", "passed", "atol"/"rtol", the ordering contract and its
+ hashes, "receive_count", the error metrics and the per-check booleans under "checks".
+ When inspection fails but the cleanup roundtrip succeeds, the dict has
+ ``passed=False``, every check False and ``None`` hashes/metrics.
+
+ Raises:
+ Exception: The inspection error, chained from the cleanup error, when both the
+ inspection and the cleanup roundtrip fail.
+ """
+ if getattr(backend, "oracle_layout", "token-rank") == "expert-packed":
+ return _run_expert_packed_oracle(
+ torch,
+ routing,
+ backend,
+ problem,
+ global_idx,
+ global_weights,
+ rank,
+ experts_per_rank,
+ seed,
+ )
+ handle = backend.dispatch(problem)
+ torch.cuda.synchronize()
+ try:
+ view = backend.inspect_dispatch(problem, handle)
+ source_ids = routing.decode_source_ids(view.payload, seed)
+ except Exception as inspection_error:
+ # Inspection failed after a real dispatch was issued. Finish the roundtrip
+ # (recv_tokens -> stage -> combine -> sync) before reporting the failure.
+ # NOTE(review): presumably this drains the in-flight dispatch so the backend stays
+ # usable for later points -- confirm against the adapters.
+ try:
+ problem.recv_tokens = backend.recv_tokens(handle)
+ backend.stage(problem, handle)
+ backend.combine(problem, handle)
+ torch.cuda.synchronize()
+ except Exception as cleanup_error:
+ # Surface the original inspection error; the cleanup failure becomes its cause.
+ raise inspection_error from cleanup_error
+ # Cleanup succeeded: report a failed oracle with every check False instead of raising.
+ return {
+ "contract": ORACLE_CONTRACT,
+ "passed": False,
+ "ordering_contract": "adapter-inspection-failed",
+ "order_sha256": None,
+ "dispatch_sha256": None,
+ "combine_weight_semantics": getattr(
+ backend, "combine_weight_semantics", "undeclared"
+ ),
+ "receive_count": 0,
+ "atol": ORACLE_ATOL,
+ "max_absolute_error": None,
+ "max_elementwise_relative_error": None,
+ "max_relative_error": None,
+ "max_weight_error": None,
+ "rtol": ORACLE_RTOL,
+ "checks": {
+ "combine_values": False,
+ "counts": False,
+ "metadata": False,
+ "multiplicity": False,
+ "payload": False,
+ "source_set": False,
+ "weights": False,
+ },
+ }
+
+ # Number of received rows is taken from the inspected payload's leading dimension.
+ receive_count = int(view.payload.shape[0])
+ # Expert ids and weights must both be (receive_count, topk) and the payload 2-D.
+ shape_ok = (
+ view.payload.ndim == 2
+ and view.expert_ids.shape == (receive_count, problem.topk_idx.shape[1])
+ and view.weights.shape == view.expert_ids.shape
+ )
+ # True when nothing was received or every decoded id lies in [0, global_idx.shape[0]).
+ source_range = bool(
+ receive_count == 0
+ or ((source_ids >= 0) & (source_ids < global_idx.shape[0])).all().item()
+ )
+ if source_range:
+ # Look up the global routing row of every received source id, then mask entries
+ # whose expert is not owned by this rank to id -1 / weight 0.
+ expected_idx = global_idx.to(problem.x.device).index_select(0, source_ids)
+ expected_weights = global_weights.to(problem.x.device).index_select(0, source_ids)
+ local = (expected_idx // experts_per_rank) == rank
+ expected_ids = torch.where(local, expected_idx, torch.full_like(expected_idx, -1))
+ expected_weights = expected_weights.masked_fill(~local, 0)
+ expected_payload = routing.activations_for_source_ids(
+ source_ids, problem.x.shape[1], seed, problem.x.dtype
+ )
+ else:
+ # Out-of-range ids cannot be looked up; use placeholders so the code below still
+ # runs. expected_payload is uninitialised here, but payload_ok and source_set_ok
+ # both short-circuit on source_range, so it is never compared.
+ expected_ids = torch.full_like(view.expert_ids, -1)
+ expected_weights = torch.zeros_like(view.weights)
+ expected_payload = torch.empty_like(view.payload)
+ # Actual and expected metadata go through the same normalisation before comparison.
+ actual_ids, actual_weights = _normalized_expert_metadata(
+ torch, view.expert_ids, view.weights
+ )
+ expected_ids, expected_weights = _normalized_expert_metadata(
+ torch, expected_ids, expected_weights
+ )
+ # Global token indices that route at least one expert to this rank.
+ expected_sources = (
+ ((global_idx // experts_per_rank) == rank).any(dim=1).nonzero(as_tuple=True)[0]
+ ).to(problem.x.device)
+ # In range, no duplicate source ids, and the sorted ids equal the expected set.
+ source_set_ok = (
+ source_range
+ and source_ids.numel() == torch.unique(source_ids).numel()
+ and torch.equal(torch.sort(source_ids).values, expected_sources)
+ )
+ payload_ok = source_range and torch.equal(view.payload, expected_payload)
+ metadata_ok = shape_ok and torch.equal(actual_ids, expected_ids)
+ max_weight_error = (
+ float((actual_weights - expected_weights).abs().max().item())
+ if actual_weights.numel()
+ else 0.0
+ )
+ # Weights must match exactly; any non-zero difference fails the check.
+ weights_ok = max_weight_error == 0.0
+ # Per-local-expert assignment counts derived from the expected (valid, >= 0) ids.
+ valid_expected = expected_ids >= 0
+ expected_local = expected_ids[valid_expected] - rank * experts_per_rank
+ expected_counts = torch.bincount(expected_local, minlength=experts_per_rank)
+ counts_ok = torch.equal(
+ view.local_expert_counts.to(torch.int64), expected_counts.to(torch.int64)
+ )
+ # Per received row: number of valid (>= 0) expert ids must agree.
+ multiplicity_ok = torch.equal(
+ (actual_ids >= 0).sum(dim=1), (expected_ids >= 0).sum(dim=1)
+ )
+ # Receive-slot assignment may use atomics and is not a semantic EP guarantee. Compare
+ # pre/post dispatch evidence in canonical source-token order without changing the native path.
+ canonical_order = torch.argsort(source_ids.to(torch.int64), stable=True)
+ canonical_sources = source_ids.to(torch.int64).index_select(0, canonical_order)
+ canonical_ids = actual_ids.to(torch.int64).index_select(0, canonical_order)
+ canonical_weights = actual_weights.index_select(0, canonical_order)
+ ordering_contract = f"canonical-source-id-v1/{view.ordering_contract}"
+ order_sha256 = _tensor_sha256(canonical_sources)
+ dispatch_sha256 = _tensor_sha256(
+ canonical_sources, canonical_ids, canonical_weights
+ )
+
+ # The receive count is stored on the problem before the combine call.
+ # NOTE(review): presumably adapters read problem.recv_tokens during combine -- confirm.
+ problem.recv_tokens = receive_count
+ combine_weight_semantics = backend.combine_weight_semantics
+ transformed = _expert_transform(
+ torch, view.payload, actual_ids, actual_weights, combine_weight_semantics
+ )
+ combined = backend.combine_transformed(problem, handle, transformed)
+ torch.cuda.synchronize()
+ expected_combined = _expected_transformed_combine(torch, problem)
+ if combined.shape == expected_combined.shape and combined.numel():
+ # Non-empty result of the right shape: compute the error metrics.
+ # max_relative_error normalises by max |expected| + 1e-6; the elementwise variant
+ # divides by |expected| clamped to at least ORACLE_ATOL.
+ absolute_error = (combined.float() - expected_combined).abs()
+ max_absolute_error = float(absolute_error.max().item())
+ max_relative_error = max_absolute_error / (
+ float(expected_combined.abs().max().item()) + 1e-6
+ )
+ max_elementwise_relative_error = float(
+ (absolute_error / expected_combined.abs().clamp_min(ORACLE_ATOL)).max().item()
+ )
+ combine_values_ok = bool(torch.allclose(
+ combined.float(), expected_combined, rtol=ORACLE_RTOL, atol=ORACLE_ATOL
+ ))
+ elif combined.shape == expected_combined.shape:
+ # Matching shape but zero elements: nothing to compare, so it passes trivially.
+ max_absolute_error = 0.0
+ max_elementwise_relative_error = 0.0
+ max_relative_error = 0.0
+ combine_values_ok = True
+ else:
+ # Shape mismatch: metrics are undefined and the combine check fails.
+ max_absolute_error = None
+ max_elementwise_relative_error = None
+ max_relative_error = None
+ combine_values_ok = False
+ # NOTE(review): the default here is 5e-2 while the expert-packed oracle defaults to
+ # ORACLE_RTOL -- confirm the two defaults are meant to differ.
+ tolerance = float(getattr(backend, "tolerance", 5e-2))
+ checks = {
+ "combine_values": combine_values_ok,
+ "counts": counts_ok,
+ "metadata": metadata_ok,
+ "multiplicity": multiplicity_ok,
+ "payload": payload_ok,
+ "source_set": source_set_ok,
+ "weights": weights_ok,
+ }
+ return {
+ "contract": ORACLE_CONTRACT,
+ # NOTE(review): ordering_contract is a non-empty f-string, so that term is always
+ # truthy; the verdict is decided by the checks and the relative-error bound.
+ "passed": bool(
+ all(checks.values())
+ and ordering_contract
+ and max_relative_error is not None
+ and max_relative_error < tolerance
+ ),
+ "atol": ORACLE_ATOL,
+ "combine_weight_semantics": combine_weight_semantics,
+ "ordering_contract": ordering_contract,
+ "order_sha256": order_sha256,
+ "dispatch_sha256": dispatch_sha256,
+ "receive_count": receive_count,
+ "max_absolute_error": max_absolute_error,
+ "max_elementwise_relative_error": max_elementwise_relative_error,
+ "max_relative_error": max_relative_error,
+ "max_weight_error": max_weight_error,
+ "rtol": ORACLE_RTOL,
+ "checks": checks,
+ }
+
+
+def _histogram(xs: list[float], nbins: int = 40) -> dict:
+    """Summarise the exact private cross-rank-max samples as an equal-width histogram.
+
+    An empty input yields ``{"n": 0}``. When every sample is identical the result carries
+    the unrounded min/max, ``bins`` set to ``nbins`` and a single count holding all samples.
+    Otherwise the samples are spread over ``nbins`` equal-width bins between min and max
+    (the maximum lands in the last bin) and min/max are rounded to three decimals.
+    """
+    if not xs:
+        return {"n": 0}
+    total = len(xs)
+    lo = min(xs)
+    hi = max(xs)
+    if lo >= hi:
+        # Degenerate range: no width to divide, so report one bucket with everything.
+        return {"n": total, "min": lo, "max": hi, "bins": nbins, "counts": [total]}
+    width = hi - lo
+    last = nbins - 1
+    tally = [0 for _ in range(nbins)]
+    for sample in xs:
+        slot = int((sample - lo) / width * nbins)
+        # Clamp so the maximum sample falls into the final bin instead of one past it.
+        tally[slot if slot < last else last] += 1
+    return {
+        "n": total,
+        "min": round(lo, 3),
+        "max": round(hi, 3),
+        "bins": nbins,
+        "counts": tally,
+    }
+
+
+def _derive_publication_status(v: dict) -> str:
+    """Classify one raw attempt as "failed", "invalid" or "diagnostic".
+
+    An attempt that did not complete is "failed". A completed attempt is "invalid" when it
+    fails semantic correctness, is not measurement-conformant, or has an inconsistent
+    workload identity. Everything else is "diagnostic"; this function never returns a
+    promoted status, because only the isolated coverage publisher may promote evidence.
+    """
+    completed = v["execution_status"] == "complete"
+    if not completed:
+        return "failed"
+    sound = (
+        v["semantic_correctness"] == "pass"
+        and v["measurement_conformance"] == "conformant"
+        and v["workload_identity"] != "inconsistent"
+    )
+    # Per-case producers cannot prove exact matrix coverage, repeat stability, or controlled
+    # cohorts, so even a sound attempt stays diagnostic until the isolated publisher
+    # validates it.
+    return "diagnostic" if sound else "invalid"
+
+
+def run_sweep(args, backend, torch, dist, device, rank: int, world_size: int) -> int:
+ """Drive the source-tokens-per-rank sweep for one fully-specified line."""
+ mode = getattr(args, "mode", "normal")
+ try:
+ case_profile = identity.case_profile(mode)
+ except identity.IdentityError as exc:
+ if rank == 0:
+ print(f"ERROR: {exc}")
+ return 2
+ sampling_error = sampling_contract_error(args.iters, args.trials, args.warmup)
+ if sampling_error:
+ if rank == 0:
+ print(f"ERROR: {sampling_error}")
+ return 2
+ import routing # torch-based; imported lazily so the module byte-compiles without torch
+ import eplb # stdlib planner + torch remap (the EPLB transform)
+
+ ep_size = world_size
+ # EPLB (if on): run_ep.py already bumped args.experts to the PHYSICAL count and stashed the
+ # logical count, so experts_per_rank below is physical. The trace is built over LOGICAL
+ # experts then remapped to physical (build_trace), so the whole sweep runs over the
+ # balanced physical placement with no adapter change.
+ eplb_on = getattr(args, "eplb", False)
+ num_logical = getattr(args, "num_logical_experts", args.experts)
+ if args.experts % ep_size != 0:
+ if rank == 0:
+ print(f"ERROR: experts ({args.experts}) must divide ep_size ({ep_size})")
+ return 2
+ experts_per_rank = args.experts // ep_size
+ if getattr(backend, "mode", None) != mode:
+ if rank == 0:
+ print(f"ERROR: backend mode {getattr(backend, 'mode', None)!r} != {mode!r}")
+ return 2
+ expected_weight_semantics = (
+ "gate-weighted-sum"
+ if case_profile["combine_semantics"] == "gate-weighted"
+ else "unweighted-rank-sum"
+ )
+ if getattr(backend, "combine_weight_semantics", None) != expected_weight_semantics:
+ if rank == 0:
+ print(
+ f"ERROR: {mode} requires combine semantics {expected_weight_semantics}"
+ )
+ return 2
+ if mode == LOW_LATENCY_MODE and (
+ args.phase != "decode"
+ or getattr(backend, "oracle_layout", None) != "expert-packed"
+ or getattr(backend, "payload_unit", None) != "token-expert"
+ ):
+ if rank == 0:
+ print("ERROR: low-latency requires decode expert-packed token-expert execution")
+ return 2
+
+ cap = backend.buffer_cap(args)
+ conditioning_ladder = CONDITIONING_LADDERS[args.phase]
+ if cap is not None and cap < conditioning_ladder[-1]:
+ if rank == 0:
+ print(f"ERROR: {backend.name} buffer cap {cap} cannot run the v1 conditioning ladder")
+ return 2
+ ladder, dropped = token_ladder(args.tokens_ladder, args.phase, cap)
+ if rank == 0 and dropped:
+ print(f"NOTE: dropped tokens/rank {dropped} — exceed {backend.name} buffer cap {cap} "
+ f"(hidden={args.hidden}); not silently truncated.")
+ if not ladder:
+ if rank == 0:
+ print(f"ERROR: empty token ladder (phase={args.phase}, cap={cap})")
+ return 2
+ MAX, MIN, SUM = dist.ReduceOp.MAX, dist.ReduceOp.MIN, dist.ReduceOp.SUM
+
+ # EPLB plan (once): estimate logical load from the global logical trace at the largest
+ # ladder T (most samples), then replicate+place. Held fixed across all T (as real EPLB
+ # plans from an observed load estimate). build_trace builds the LOGICAL trace and remaps
+ # to physical when the plan is present; otherwise it's the identity (logical == physical).
+ eplb_plan = None
+ if eplb_on:
+ ref_idx, _ = routing.build_global_routing(
+ EPLB_REFERENCE_TOKENS_PER_RANK * ep_size,
+ num_logical,
+ args.topk,
+ args.routing,
+ args.seed,
+ )
+ load = torch.bincount(ref_idx.reshape(-1), minlength=num_logical).float().tolist()
+ eplb_plan = eplb.build_plan(load, args.experts, ep_size)
+ if rank == 0:
+ print(f"NOTE: EPLB {num_logical}->{args.experts} experts ({ep_size}x{experts_per_rank}); "
+ f"per-rank load imbalance {eplb_plan['imbalance_before']:.2f}x -> "
+ f"{eplb_plan['imbalance_after']:.2f}x; {eplb_plan['replicated_experts']} experts "
+ f"replicated (hottest {eplb_plan['max_replicas']}x)")
+
+ canonical = bool(getattr(args, "workload_dir", ""))
+ loaded_workload_ids, loaded_checksums = [], {}
+ if canonical:
+ import workload as _wl
+
+ def build_trace(gt):
+ # canonical: load pre-serialized trace bytes (verified by checksum) so this run is
+ # provably the SAME workload as any other consuming the same files. else: seeded gen.
+ if canonical:
+ wid = _wl.compute_workload_id(
+ args.routing, args.hidden, args.topk, num_logical, ep_size, gt, args.seed
+ )
+ idx_np, w_np, man = _wl.load_workload(os.path.join(args.workload_dir, f"{wid}.npz"), verify=True)
+ idx_l = torch.from_numpy(idx_np).to(torch.int64)
+ w = torch.from_numpy(w_np).to(torch.float32)
+ if wid not in loaded_workload_ids:
+ loaded_workload_ids.append(wid)
+ loaded_checksums[wid] = man.get("checksums")
+ else:
+ idx_l, w = routing.build_global_routing(
+ gt, num_logical, args.topk, args.routing, args.seed
+ )
+ return (eplb.remap_idx(idx_l, eplb_plan) if eplb_plan is not None else idx_l), w
+
+ # Fabric/clock warm-up BEFORE any timed point (review: H200 had an anomalous cold
+ # first point and a 40% decode-vs-prefill mismatch at the shared T=128). Gradually
+ # ramp through the small ladder shapes untimed — warms clocks/fabric for everyone
+ # and is also cold-jump-safe for MoRI.
+ def warm_roundtrips(problem, count):
+ for _ in range(count):
+ handle = backend.dispatch(problem)
+ if not hasattr(problem, "recv_tokens"):
+ # Dynamic receive cardinality is stable for this fixed routing trace. Cache it
+ # during untimed conditioning so adapters never read a device scalar in timing.
+ problem.recv_tokens = backend.recv_tokens(handle)
+ backend.stage(problem, handle)
+ backend.combine(problem, handle)
+ torch.cuda.synchronize()
+
+ for wt in conditioning_ladder:
+ # Warm-only shapes need not have canonical manifests: they are never measured or emitted.
+ wi, ww = routing.build_global_routing(
+ wt * ep_size, num_logical, args.topk, args.routing, args.seed,
+ )
+ if eplb_plan is not None:
+ wi = eplb.remap_idx(wi, eplb_plan)
+ wsi, wsw = routing.rank_slice(wi, ww, rank, wt)
+ wx = routing.rank_activations(wt, args.hidden, args.seed, rank, device, torch.bfloat16)
+ wp = backend.make_problem(wt, wsi.to(device), wsw.to(device), wx)
+ warm_roundtrips(wp, CONDITIONING_ROUNDS_PER_SHAPE)
+ torch.cuda.synchronize()
+ dist.barrier()
+ # Setup may materialize deferred provenance such as DeepEP V2 JIT CUBINs.
+ # Resolve it after conditioning but before correctness or timed measurements.
+ capture_deferred_provenance = getattr(backend, "capture_deferred_provenance", None)
+ if capture_deferred_provenance is not None:
+ capture_deferred_provenance()
+ provenance_issues = contracts.backend_provenance_issues(
+ backend.name, backend.backend_provenance
+ )
+ if provenance_issues:
+ if rank == 0:
+ print(
+ f"ERROR: unpinned provenance {provenance_issues} "
+ f"in {backend.backend_provenance}"
+ )
+ return 4
+ elem_dispatch = BF16_BYTES
+
+ # ---- Pass 1: build each deterministic problem and run the expert oracle. ----
+ problems, gate, gts, global_traces, input_snapshots = {}, {}, {}, {}, {}
+ routing_hashes = set()
+ for T in ladder:
+ counts = [T] * ep_size
+ gt = T * ep_size
+ gts[T] = gt
+ idx_g, w_g = build_trace(gt)
+ rstats = routing.routing_stats(idx_g, args.experts, experts_per_rank, weights=w_g)
+ gpn = args.gpus_per_node or ep_size
+ rstats["locality"] = routing.routing_locality(idx_g, experts_per_rank, ep_size, max(1, T),
+ gpn, args.scale_up_domain or None)
+ rstats["source_token_stats"] = _stats_vec(counts)
+ routing_hashes.add(rstats["routing_hash"])
+ my_off, my_cnt = rank * T, T
+ idx_s = idx_g[my_off:my_off + my_cnt].contiguous()
+ w_s = w_g[my_off:my_off + my_cnt].contiguous()
+ x = routing.rank_activations(my_cnt, args.hidden, args.seed, rank, device, torch.bfloat16)
+ problem = backend.make_problem(my_cnt, idx_s.to(device), w_s.to(device), x)
+ input_snapshots[T] = (
+ problem.x.clone(), problem.topk_idx.clone(), problem.topk_weights.clone()
+ )
+ oracle = _run_expert_oracle(
+ torch, routing, backend, problem, idx_g, w_g, rank, experts_per_rank,
+ args.seed,
+ )
+ before_x, before_idx, before_weights = input_snapshots[T]
+ pre_input_unchanged = (
+ torch.equal(problem.x, before_x)
+ and torch.equal(problem.topk_idx, before_idx)
+ and torch.equal(problem.topk_weights, before_weights)
+ )
+ problems[T] = problem
+ global_traces[T] = (idx_g, w_g)
+ gate[T] = {
+ "rstats": rstats,
+ "recv_local": oracle["receive_count"],
+ "max_rel": oracle["max_relative_error"] or 0.0,
+ "local_ok": int(oracle["passed"]),
+ "oracle_pre": oracle,
+ "pre_input_unchanged": pre_input_unchanged,
+ }
+
+ # ---- Pass 2: every backend uses the same ascending point order and conditioning ramp.
+ # Per-iteration cross-rank MAX samples are pooled across trials. ----
+ disp_pool = {T: [] for T in ladder} # pooled per-iteration cross-rank MAX (dispatch)
+ comb_pool = {T: [] for T in ladder} # ... combine
+ rt_pool = {T: [] for T in ladder} # independently measured round trip
+ disp_trials = {T: [] for T in ladder}
+ comb_trials = {T: [] for T in ladder}
+ rt_trials = {T: [] for T in ladder}
+ order = list(ladder)
+ for _trial in range(args.trials):
+ for T in order:
+ problem = problems[T]
+ # Stateful paired APIs may expose only a measured round trip.
+ # Do not synthesize component latency from that measurement.
+ roundtrip_only = getattr(backend, "roundtrip_only", False)
+
+ def rt_once(p=problem):
+ hh = backend.dispatch(p)
+ backend.stage(p, hh)
+ return backend.combine(p, hh)
+
+ # Every available component starts after the same synchronized full-roundtrip warmup.
+ # Roundtrip is first on every backend because it is the comparison headline.
+ warm_roundtrips(problem, args.warmup)
+ rt_iters = time_us(torch, lambda p=problem: rt_once(p), 0, args.iters)
+ if roundtrip_only:
+ disp_iters = comb_iters = []
+ else:
+ warm_roundtrips(problem, args.warmup)
+ disp_iters = time_us(torch, lambda p=problem: backend.dispatch(p),
+ 0, args.iters)
+
+ def prep(p=problem):
+ hh = backend.dispatch(p)
+ backend.stage(p, hh)
+ return hh
+ warm_roundtrips(problem, args.warmup)
+ if backend.combine_needs_redispatch:
+ comb_iters = time_us(torch, lambda hh, p=problem: backend.combine(p, hh),
+ 0, args.iters, pre=prep)
+ else:
+ hh = prep()
+ torch.cuda.synchronize()
+ comb_iters = time_us(torch, lambda p=problem, hx=hh: backend.combine(p, hx),
+ 0, args.iters)
+ # per-iteration cross-rank MAX (the distributed-op latency per iter), pooled.
+ if disp_iters:
+ reduced_dispatch = _reduce_vec(torch, dist, device, disp_iters, MAX)
+ reduced_combine = _reduce_vec(torch, dist, device, comb_iters, MAX)
+ disp_trials[T].append(reduced_dispatch)
+ comb_trials[T].append(reduced_combine)
+ disp_pool[T] += reduced_dispatch
+ comb_pool[T] += reduced_combine
+ reduced_roundtrip = _reduce_vec(torch, dist, device, rt_iters, MAX)
+ rt_trials[T].append(reduced_roundtrip)
+ rt_pool[T] += reduced_roundtrip
+
+ # ---- Pass 3: prove timed inputs were immutable and repeat the full oracle. ----
+ for T in ladder:
+ problem = problems[T]
+ before_x, before_idx, before_weights = input_snapshots[T]
+ input_unchanged = gate[T]["pre_input_unchanged"] and (
+ torch.equal(problem.x, before_x)
+ and torch.equal(problem.topk_idx, before_idx)
+ and torch.equal(problem.topk_weights, before_weights)
+ )
+ idx_g, w_g = global_traces[T]
+ post = _run_expert_oracle(
+ torch, routing, backend, problem, idx_g, w_g, rank, experts_per_rank,
+ args.seed,
+ )
+ pre = gate[T]["oracle_pre"]
+ order_stable = (
+ pre["ordering_contract"] == post["ordering_contract"]
+ and pre["order_sha256"] == post["order_sha256"]
+ and pre["dispatch_sha256"] == post["dispatch_sha256"]
+ )
+ gate[T].update({
+ "input_unchanged": input_unchanged,
+ "local_ok": int(pre["passed"] and post["passed"] and input_unchanged and order_stable),
+ "max_rel": max(pre["max_relative_error"] or 0.0, post["max_relative_error"] or 0.0),
+ "oracle_post": post,
+ "order_stable": order_stable,
+ })
+
+ # ---- Pass 4: percentiles (p50/p90/p95/p99, nearest-rank) from pooled samples + bytes + row ----
+ def pcts(xs):
+ return ({"p50": percentile(xs, 50), "p90": percentile(xs, 90),
+ "p95": percentile(xs, 95), "p99": percentile(xs, 99)} if xs else None)
+
+ def component(percentiles, count, *, derived=False):
+ if percentiles is None:
+ return {"availability": "unavailable", "origin": None,
+ "percentiles_us": None, "sample_count": 0}
+ return {
+ "availability": "derived" if derived else "measured",
+ "origin": "derived-percentile-sum" if derived else "measured",
+ "percentiles_us": percentiles,
+ "sample_count": 0 if derived else count,
+ }
+ rows = []
+ all_anomalies = []
+ thr_rt = 3.0
+ for T in ladder:
+ gt = gts[T]
+ g = gate[T]
+ rstats = g["rstats"]
+ d, c, rt = disp_pool[T], comb_pool[T], rt_pool[T]
+ dp, cp, rtp = pcts(d), pcts(c), pcts(rt)
+ # isolated_sum = SUM of the isolated dispatch+combine percentiles. NOT a measured op
+ # (can't reveal shared sync / launch amortization / overlap) — do NOT use for throughput
+ # or SLO capacity. The MEASURED round trip (rtp) is the real chained latency.
+ isum = {key: dp[key] + cp[key] for key in dp} if dp and cp else None
+ recv_total = _reduce_int(torch, dist, device, g["recv_local"], SUM)
+ recv_max = _reduce_int(torch, dist, device, g["recv_local"], MAX)
+ recv_min = _reduce_int(torch, dist, device, g["recv_local"], MIN)
+ global_ok = _reduce_int(torch, dist, device, g["local_ok"], MIN)
+ max_rel = _reduce_vec(torch, dist, device, [g["max_rel"]], MAX)[0]
+ point_ok = bool(global_ok) and recv_total > 0
+ rank_evidence = [None] * world_size
+ dist.all_gather_object(
+ rank_evidence,
+ {
+ "input_unchanged": g["input_unchanged"],
+ "order_stable": g["order_stable"],
+ "post_timing": g["oracle_post"],
+ "pre_timing": g["oracle_pre"],
+ "rank": rank,
+ },
+ )
+ # Canonical LOGICAL payload byte contracts (from the routing trace, NOT backend recv
+ # tensors): token-rank = one copy per unique (token,dest-rank); token-expert = one copy
+ # per routed (token,expert). routed_copies = token-rank copies; gt*topk = token-expert.
+ token_rank_copies = rstats["routed_copies"]
+ logical_copies = (
+ sum(rstats["expert_assignments_per_rank"])
+ if case_profile["payload_unit"] == "token-expert"
+ else token_rank_copies
+ )
+ H = args.hidden
+ throughput = {
+ percentile_name: gt / (latency_us * 1e-6)
+ for percentile_name, latency_us in rtp.items()
+ }
+ disp_bytes_l = logical_copies * H * elem_dispatch
+ comb_bytes_l = logical_copies * H * BF16_BYTES
+ # Contract-level anomalies are attached to the row and rolled into validity.
+ # roundtrip_gt_isolated_sum: measured RT p99 >> Σ(isolated dispatch+combine) p99.
+ # roundtrip_lt_component_floor: measured RT p50 < max(dispatch,combine) p50 — a chained
+ # op can't finish faster than its slowest required component (sync semantics violated).
+ row_anoms = []
+ if isum and isum["p99"] > 0 and rtp["p99"] > thr_rt * isum["p99"]:
+ row_anoms.append({"type": "roundtrip_gt_isolated_sum", "T": T,
+ "roundtrip_p99": round(rtp["p99"], 2), "isolated_sum_p99": round(isum["p99"], 2),
+ "ratio": round(rtp["p99"] / isum["p99"], 2), "threshold": thr_rt})
+ floor = max(dp["p50"], cp["p50"]) if dp and cp else None
+ if floor and rtp["p50"] > 0 and rtp["p50"] < 0.95 * floor:
+ row_anoms.append({"type": "roundtrip_lt_component_floor", "T": T,
+ "roundtrip_p50": round(rtp["p50"], 2), "component_floor_p50": round(floor, 2)})
+ all_anomalies.extend(row_anoms)
+ rows.append({
+ "anomalies": row_anoms,
+ "components": {
+ "combine": component(cp, len(c)),
+ "dispatch": component(dp, len(d)),
+ "isolated_sum": component(isum, 0, derived=True),
+ "roundtrip": component(rtp, len(rt)),
+ },
+ "correctness": {
+ "contract": case_profile["oracle_contract"],
+ "max_relative_error": max_rel,
+ "passed": point_ok,
+ "rank_evidence": rank_evidence,
+ "scope": case_profile["correctness_scope"],
+ },
+ "global_tokens": gt,
+ "logical_bytes": {
+ "combine": comb_bytes_l,
+ "dispatch": disp_bytes_l,
+ "roundtrip": disp_bytes_l + comb_bytes_l,
+ },
+ "receive": {
+ "max": recv_max,
+ "mean": recv_total / world_size,
+ "min": recv_min,
+ "total": recv_total,
+ },
+ "routing": {
+ "empty_expert_count": rstats["empty_expert_count"],
+ "empty_rank_count": rstats["empty_rank_count"],
+ "expert_assignment_rank_cv": rstats["expert_assignment_rank_cv"],
+ "expert_assignments_per_rank": rstats["expert_assignments_per_rank"],
+ "expert_load_cv": rstats["expert_load_cv"],
+ "expert_load_max": rstats["expert_load_max"],
+ "expert_load_mean": rstats["expert_load_mean"],
+ "expert_load_min": rstats["expert_load_min"],
+ "fanout_histogram": rstats["fanout_hist"],
+ "fanout_max": rstats["fanout_max"],
+ "fanout_mean": rstats["fanout_mean"],
+ "fanout_min": rstats["fanout_min"],
+ "hash": rstats["routing_hash"],
+ "hotspot_ratio": rstats["hotspot_ratio"],
+ "locality": rstats.get("locality"),
+ "payload_copies_per_rank": rstats["payload_copies_per_rank"],
+ "payload_rank_cv": rstats["payload_rank_cv"],
+ "routed_copies": rstats["routed_copies"],
+ "source_token_stats": rstats.get("source_token_stats"),
+ },
+ "sample_histograms": {
+ "dispatch": _histogram(d) if d else None,
+ "combine": _histogram(c) if c else None,
+ "roundtrip": _histogram(rt),
+ },
+ "token_rate_at_latency_percentile": throughput,
+ "tokens_per_rank": T,
+ })
+ if rank == 0:
+ component_log = (f"disp p50/p99={dp['p50']:7.1f}/{dp['p99']:7.1f} "
+ f"comb {cp['p50']:6.1f}/{cp['p99']:6.1f} " if dp and cp
+ else "components=unavailable ")
+ print(f" T={T:<5} {component_log}"
+ f"RT p50/p99={rtp['p50']:7.1f}/{rtp['p99']:7.1f}us n={len(rt)} fanout={rstats['fanout_mean']:.2f} "
+ f"recv[min/mean/max]={recv_min}/{recv_total // world_size}/{recv_max} "
+ f"correct={point_ok}")
+
+ # Cross-rank workload-identity proof: every rank must have built the SAME global routing
+ # (one hash per T here); confirm all ranks agree by hashing the per-T hash set and
+ # MIN/MAX-reducing it — a mismatch means NVIDIA and AMD did NOT run identical routing.
+ trace_sig = hashlib.sha256("|".join(sorted(routing_hashes)).encode()).hexdigest()
+ routing_consistent = _same_hash_across_ranks(torch, dist, device, trace_sig)
+
+ # Capture again after correctness and timing so no lazily generated kernel can escape
+ # the implementation identity recorded in the artifact.
+ if capture_deferred_provenance is not None:
+ capture_deferred_provenance()
+
+ if rank != 0:
+ return 0
+
+ # status=valid requires correctness AND a proven-identical routing trace across ranks.
+ all_ok = bool(rows) and all(r["correctness"]["passed"] for r in rows) and routing_consistent
+
+ # Adapters never self-label official; status is derived from these gates.
+ prov = backend.backend_provenance
+ provenance_complete = contracts.provenance_complete(
+ prov,
+ backend.name,
+ getattr(args, "git_run", None),
+ image_digest=getattr(args, "image_digest", None),
+ image_verified=getattr(args, "image_digest_verified", False),
+ squash_sha256=getattr(args, "squash_sha256", None),
+ )
+ resource_profile = contracts.project_resource_profile(prov)
+ resource_conformance = resource_profile["conformance_class"]
+ # record the canonical workload identity consumed (one trace per T -> set of ids/checksums).
+ if canonical and loaded_workload_ids:
+ args.workload_id = identity.workload_id(
+ {
+ "members": [
+ {"checksums": loaded_checksums[member], "workload_id": member}
+ for member in sorted(loaded_workload_ids)
+ ]
+ }
+ )
+ args.workload_members = sorted(loaded_workload_ids)
+ args.workload_checksums = loaded_checksums
+ canonical_workload = bool(getattr(args, "workload_id", None))
+ activation_identity = workload_contract.compute_activation_identity(args.seed, args.hidden)
+ # EPLB identity covers replica placement, not only counts.
+ eplb_mapping_hash = None
+ if eplb_plan is not None:
+ eplb_mapping_hash = eplb.mapping_hash(eplb_plan)
+ anomaly_free = len(all_anomalies) == 0
+ validity = {
+ "execution_status": "complete" if rows else "failed",
+ "semantic_correctness": (
+ "pass" if rows and all(r["correctness"]["passed"] for r in rows) else "fail"
+ ),
+ "workload_identity": "consistent-across-ranks" if routing_consistent else "inconsistent",
+ "workload_source": "canonical-serialized" if canonical_workload else "seeded-runtime",
+ "measurement_conformance": "conformant", # run_ep gate rejects nonconformant pre-run
+ "sampling_conformance": "conformant", # fixed-512-v1 gate rejects any other profile
+ "resource_conformance": resource_conformance,
+ "provenance_complete": provenance_complete,
+ # anomaly-free unless a contract-level timing anomaly fired (then diagnostic, see above).
+ "anomaly_free": anomaly_free,
+ }
+ publication_status = _derive_publication_status(validity)
+
+ shape = { # FIXED line identity (no T, no per-backend resource knobs)
+ "hidden": args.hidden, "topk": args.topk, "experts": args.experts,
+ "experts_per_rank": experts_per_rank, "dispatch_dtype": "bf16",
+ "routing": args.routing, "eplb": bool(eplb_plan), "num_logical_experts": num_logical,
+ # V2 is reserved for the PR #605 ElasticBuffer adapter; package versions never imply it.
+ "kernel_gen": kernel_generation(backend),
+ "activation_profile": ACTIVATION_PROFILE,
+ "quant": {
+ "combine_input_dtype": "bf16",
+ "combine_accum_dtype": getattr(backend, "combine_accum_dtype", "fp32"),
+ "combine_output_dtype": "bf16", "combine_quant_mode": "none",
+ "scale_layout": None,
+ },
+ }
+ generated_at = args.timestamp or _dt.datetime.now().astimezone().isoformat()
+ realized_placement = getattr(args, "realized_placement", None)
+ nodes = (
+ realized_placement["nodes"]
+ if realized_placement is not None
+ else int(os.environ.get("SLURM_NNODES", "1"))
+ )
+ case_factors = {
+ "case": {
+ "backend": backend.name,
+ "canonical": canonical,
+ "eplb": bool(eplb_plan),
+ "ep": ep_size,
+ "experts": num_logical,
+ "gpus_per_node": args.gpus_per_node or ep_size,
+ "hidden": args.hidden,
+ "ladder": " ".join(map(str, ladder)),
+ "mode": mode,
+ "nodes": nodes,
+ "phase": args.phase,
+ "required_publication": args.required_publication or "diagnostic",
+ "routing": args.routing,
+ "samples_per_point": TIMED_SAMPLES_PER_POINT,
+ "scale_up_domain": args.scale_up_domain or (args.gpus_per_node or ep_size),
+ "scale_up_transport": args.scale_up_transport,
+ "scale_out_transport": args.scale_out_transport or None,
+ "scope": args.scope,
+ "suite": args.suite or "manual",
+ "timing": f"{args.iters}:{args.trials}:{args.warmup}",
+ "topk": args.topk,
+ "topology_class": args.topology_class,
+ "transport": args.transport,
+ "warmup_semantics": WARMUP_SEMANTICS,
+ "workload": args.workload_name or "manual",
+ },
+ "profile": case_profile,
+ "sku": args.runner,
+ }
+ computed_case_id = identity.digest("case", case_factors)
+ if args.case_id and args.case_id != computed_case_id:
+ raise ValueError(
+ f"scheduled case ID does not match realized factors: {args.case_id} != {computed_case_id}"
+ )
+ case_identifier = args.case_id or computed_case_id
+ git_run = getattr(args, "git_run", None) or {}
+ allocation_factors = {
+ "artifact": git_run.get("artifact"),
+ "execution_id": getattr(args, "allocation_execution_id", None),
+ "job": git_run.get("job"),
+ "repo": git_run.get("repo"),
+ "run_attempt": git_run.get("run_attempt"),
+ "run_id": git_run.get("run_id"),
+ "runner": args.runner,
+ "source_sha": git_run.get("source_sha"),
+ }
+ allocation_identifier = identity.allocation_id(allocation_factors)
+ try:
+ attempt_ordinal = int(os.environ.get("CX_ATTEMPT_ID", "1"))
+ except ValueError:
+ attempt_ordinal = 0
+ if attempt_ordinal <= 0:
+ raise ValueError("CX_ATTEMPT_ID must be a positive integer")
+ attempt_identifier = identity.attempt_id(
+ allocation=allocation_identifier, case=case_identifier, ordinal=attempt_ordinal
+ )
+ runtime_fingerprint = getattr(args, "runtime_fingerprint", None) or {}
+ implementation_contract = {
+ "kernel_generation": kernel_generation(backend),
+ "name": backend.name,
+ "provenance": _series_provenance(backend.backend_provenance),
+ "resource_profile": resource_profile,
+ }
+ public_config = contracts.public_series_config(
+ kernel_generation=implementation_contract["kernel_generation"],
+ provenance=backend.backend_provenance,
+ resource_profile=resource_profile,
+ resource_mode=args.resource_mode,
+ device_product=getattr(args, "runtime_device_product", None),
+ )
+ series_factors = {
+ "backend": backend.name,
+ "implementation_contract_sha256": _sha256_json(implementation_contract),
+ "public_config_sha256": contracts.public_series_config_sha256(public_config),
+ "routing_control_sha256": contracts.routing_implementation_control_sha256(
+ implementation_contract
+ ),
+ "case_id": case_identifier,
+ "image_digest": getattr(args, "image_digest", None),
+ "runtime_fingerprint_sha256": _sha256_json(runtime_fingerprint),
+ "source_sha": git_run.get("source_sha"),
+ "squash_sha256": getattr(args, "squash_sha256", None),
+ "workload_id": getattr(args, "workload_id", None) or trace_sig,
+ }
+ series_identifier = identity.series_id(series_factors)
+
+ sample_points = []
+ for row in rows:
+ token_count = row["tokens_per_rank"]
+
+ def sampled_component(trials):
+ return {
+ "availability": "measured" if trials else "unavailable",
+ "sample_count": sum(len(trial) for trial in trials),
+ "trials": trials if trials else None,
+ }
+
+ sample_point = {
+ "components": {
+ "combine": sampled_component(comb_trials[token_count]),
+ "dispatch": sampled_component(disp_trials[token_count]),
+ "roundtrip": sampled_component(rt_trials[token_count]),
+ },
+ "tokens_per_rank": token_count,
+ }
+ sample_sha256 = _sha256_json(sample_point)
+ point_identifier = identity.point_id(
+ series=series_identifier, tokens_per_rank=token_count
+ )
+ evidence_identifier = identity.evidence_id(
+ point=point_identifier,
+ allocation=allocation_identifier,
+ attempt=attempt_identifier,
+ sample_sha256=sample_sha256,
+ )
+ sample_point.update(
+ {
+ "evidence_id": evidence_identifier,
+ "point_id": point_identifier,
+ "sample_sha256": sample_sha256,
+ }
+ )
+ sample_points.append(sample_point)
+ row.update({
+ "evidence_id": evidence_identifier,
+ "point_id": point_identifier,
+ "sample_sha256": sample_sha256,
+ })
+
+ samples_path = args.out[:-5] + ".samples.json" if args.out.endswith(".json") else args.out + ".samples.json"
+ samples_document = {
+ "allocation_id": allocation_identifier,
+ "attempt_id": attempt_identifier,
+ "case_id": case_identifier,
+ "format": "collectivex.samples.v1",
+ "points": sample_points,
+ "sampling": {
+ "iterations_per_trial": args.iters,
+ "reduction": case_profile["rank_reduction"],
+ "trials": args.trials,
+ },
+ "schema_version": 1,
+ "series_id": series_identifier,
+ }
+ samples_payload = contracts.canonical_json_bytes(samples_document)
+ samples_sha256 = hashlib.sha256(samples_payload).hexdigest()
+ samples_bytes = len(samples_payload)
+ sample_artifact = {
+ "bytes": samples_bytes,
+ "format": "collectivex.samples.v1",
+ "path": os.path.basename(samples_path),
+ "sha256": samples_sha256,
+ }
+ headline = next((r for r in rows if r["tokens_per_rank"] == 64), rows[len(rows) // 2])
+ eplb_record = (
+ {
+ "enabled": True,
+ "imbalance_after": eplb_plan["imbalance_after"],
+ "imbalance_before": eplb_plan["imbalance_before"],
+ "mapping_hash": eplb_mapping_hash,
+ "max_replicas": eplb_plan["max_replicas"],
+ "num_logical_experts": num_logical,
+ "num_physical_experts": args.experts,
+ "num_redundant": args.experts - num_logical,
+ "planner": EPLB_PLANNER,
+ "reference_tokens_per_rank": EPLB_REFERENCE_TOKENS_PER_RANK,
+ "replicated_experts": eplb_plan["replicated_experts"],
+ }
+ if eplb_plan
+ else {
+ "enabled": False,
+ "imbalance_after": None,
+ "imbalance_before": None,
+ "mapping_hash": None,
+ "max_replicas": None,
+ "num_logical_experts": num_logical,
+ "num_physical_experts": args.experts,
+ "num_redundant": 0,
+ "planner": None,
+ "reference_tokens_per_rank": None,
+ "replicated_experts": 0,
+ }
+ )
+ doc = {
+ "format": "collectivex.ep.v1",
+ "schema_version": SCHEMA_VERSION,
+ "record_type": "case-attempt",
+ "generated_at": generated_at,
+ "identity": {
+ "allocation_factors": allocation_factors,
+ "allocation_id": allocation_identifier,
+ "attempt_id": attempt_identifier,
+ "attempt_ordinal": attempt_ordinal,
+ "case_factors": case_factors,
+ "case_id": case_identifier,
+ "series_factors": series_factors,
+ "series_id": series_identifier,
+ },
+ "case": {
+ "attempt_ordinal": attempt_ordinal,
+ "backend": backend.name,
+ "eplb": eplb_record,
+ "ep_size": ep_size,
+ "mode": mode,
+ "phase": args.phase,
+ "required_publication": args.required_publication or "diagnostic",
+ "resource_mode": "tuned",
+ "runner": args.runner,
+ "shape": shape,
+ "suite": args.suite or "manual",
+ "workload_name": args.workload_name or "manual",
+ },
+ "workload": {
+ "activation_generator": ACTIVATION_GENERATOR,
+ "activation_identity": activation_identity,
+ "activation_profile": ACTIVATION_PROFILE,
+ "cross_rank_consistent": routing_consistent,
+ "manifest_checksums": getattr(args, "workload_checksums", None),
+ "members": getattr(args, "workload_members", None),
+ "routing_generator": ROUTING_GENERATOR,
+ "source": validity["workload_source"],
+ "trace_hashes": sorted(routing_hashes),
+ "trace_signature": trace_sig,
+ "workload_id": getattr(args, "workload_id", None),
+ },
+ "measurement": {
+ "component_order_contract": case_profile["component_order_contract"],
+ "conditioning": {
+ "contract": CONDITIONING_CONTRACT,
+ "ladder": conditioning_ladder,
+ "roundtrips_per_shape": CONDITIONING_ROUNDS_PER_SHAPE,
+ },
+ "contract": case_profile["contract"],
+ "rows": rows,
+ "sampling": {
+ "contract": SAMPLING_CONTRACT,
+ "iterations_per_trial": args.iters,
+ "percentile_method": case_profile["percentile_method"],
+ "reduction": case_profile["rank_reduction"],
+ "samples_per_component": TIMED_SAMPLES_PER_POINT,
+ "trials": args.trials,
+ "warmup_iterations": args.warmup,
+ "warmup_semantics": WARMUP_SEMANTICS,
+ },
+ "source_allocation": "even",
+ },
+ "implementation": {
+ "kernel_generation": kernel_generation(backend),
+ "name": backend.name,
+ "provenance": backend.backend_provenance,
+ "resource_profile": resource_profile,
+ },
+ "topology": {
+ "device_count": getattr(args, "runtime_device_count", None),
+ "device_product": getattr(args, "runtime_device_product", None),
+ "gpus_per_node": args.gpus_per_node or ep_size,
+ "nodes": nodes,
+ "placement": "packed",
+ "realized_placement": realized_placement,
+ "scale_up_domain": args.scale_up_domain or (args.gpus_per_node or ep_size),
+ "scale_up_transport": args.scale_up_transport,
+ "scale_out_transport": args.scale_out_transport or None,
+ "scope": args.scope,
+ "topology_class": args.topology_class,
+ "transport": args.transport,
+ "world_size": world_size,
+ },
+ "runtime_fingerprint": runtime_fingerprint,
+ "provenance": {
+ "command": getattr(args, "reproduction_command", ""),
+ "distributed_launcher": getattr(args, "distributed_launcher", None),
+ "git_run": getattr(args, "git_run", None),
+ "image": {
+ "arch": getattr(args, "image_arch", None),
+ "digest": getattr(args, "image_digest", "") or None,
+ "digest_verified": getattr(args, "image_digest_verified", False),
+ "reference": getattr(args, "image", "") or None,
+ "squash_sha256": getattr(args, "squash_sha256", None),
+ },
+ "redaction": "sanitized-v1",
+ },
+ "sample_artifact": sample_artifact,
+ "outcome": {
+ "publication_status": publication_status,
+ "reasons": [] if all_ok else ["semantic correctness or routing identity failed"],
+ "status": "success" if all_ok else "invalid",
+ "validity": validity,
+ },
+ }
+ contracts.validate_raw_document(doc, samples_document)
+ _write_bytes_atomic(samples_path, samples_payload)
+ _write_json_atomic(args.out, doc)
+ dispatch_percentiles = headline["components"]["dispatch"]["percentiles_us"]
+ dispatch_p99 = dispatch_percentiles["p99"] if dispatch_percentiles else None
+ component_summary = (f"disp_p99={dispatch_p99:.1f}us "
+ if dispatch_p99 is not None
+ else "components=unavailable ")
+ print(f"{backend.name} ep-dispatch-combine [{args.phase}/{mode}/{case_profile['contract']}]: "
+ f"status={doc['outcome']['status']} {len(rows)} pts, routing_consistent={routing_consistent}, "
+ f"headline T={headline['tokens_per_rank']} {component_summary}"
+ f"-> {args.out}")
+ # A complete invalid document is still a successfully captured terminal outcome. Launchers
+ # inspect its status to fail the case without conflating it with an execution failure.
+ return 0
diff --git a/experimental/CollectiveX/tests/ep_mori.py b/experimental/CollectiveX/tests/ep_mori.py
new file mode 100644
index 0000000000..936425c5af
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_mori.py
@@ -0,0 +1,359 @@
+#!/usr/bin/env python3
+"""CollectiveX MoRI adapter for the v1 BF16 normal-mode workload."""
+from __future__ import annotations
+
+import os
+from pathlib import Path
+import re
+import sys
+import types
+
+# MoRI registers the whole symmetric heap at import time. The pinned upstream
+# inter-node benchmark uses 6 GiB for its InterNodeV1 staging and signal buffers.
+os.environ["MORI_SHMEM_HEAP_SIZE"] = "6G"
+
+import torch
+import torch.distributed as dist
+
+try:
+ import mori # type: ignore
+except Exception as exc: # pragma: no cover - requires the benchmark image
+ print(f"ERROR: mori import failed: {exc!r}", file=sys.stderr)
+ raise
+
+
+def _project_local_metadata(torch_module, raw_expert_ids, raw_weights, rank, experts_per_rank):
+    """Mask routing metadata down to the expert range owned by ``rank``.
+
+    Non-local IDs become -1 and their weights 0; the third result holds the local IDs rebased to 0.
+    """
+    first = rank * experts_per_rank
+    owned = (raw_expert_ids >= first) & (raw_expert_ids < first + experts_per_rank)
+    masked_ids = torch_module.where(owned, raw_expert_ids, torch_module.full_like(raw_expert_ids, -1))
+    masked_weights = torch_module.where(owned, raw_weights, torch_module.zeros_like(raw_weights))
+    return masked_ids, masked_weights, raw_expert_ids[owned] - first
+
+
+def _mori_source_commit() -> str:
+    """Return the 40-hex commit from the nearest small regular ``.git/HEAD`` above ``mori``."""
+    for head in (root / ".git" / "HEAD" for root in Path(mori.__file__).resolve().parents):
+        if head.is_symlink() or not head.is_file() or head.stat().st_size > 128:
+            continue  # not a usable HEAD file here; keep walking towards the filesystem root
+        commit = head.read_text(encoding="ascii").strip()
+        if not re.fullmatch(r"[0-9a-f]{40}", commit):
+            raise RuntimeError("MoRI image source is not pinned to a detached commit")
+        return commit
+    raise RuntimeError("MoRI image source revision is unavailable")
+
+
+class MoRIBackend:  # CollectiveX backend adapter built on mori.ops.EpDispatchCombineOp
+    name = "mori"
+    combine_needs_redispatch = True
+    combine_weight_semantics = "unweighted-rank-sum"
+
+    def __init__(self, args, rank, world_size, local_rank, device):
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.mode = "normal"
+
+        self.ep_size = world_size
+        self.experts_per_rank = args.experts // self.ep_size  # NOTE(review): floors silently; confirm callers enforce divisibility
+        device_cus = torch.cuda.get_device_properties(device).multi_processor_count
+        gpus_per_node = int(args.gpus_per_node or world_size)  # unset or 0 falls back to world_size
+        scale_up_domain = int(args.scale_up_domain or gpus_per_node)  # unset or 0 falls back to gpus_per_node
+        scale_out = world_size > scale_up_domain  # more ranks than one scale-up domain holds
+        if (
+            gpus_per_node <= 0
+            or scale_up_domain <= 0
+            or world_size % gpus_per_node
+            or world_size % scale_up_domain
+        ):
+            raise RuntimeError("MoRI placement is not divisible into complete domains")
+        if scale_out != (args.scope == "scale-out"):  # requested scope must agree with the derived topology
+            raise RuntimeError("MoRI requested scope differs from the EP topology")
+        if not scale_out and (  # scale-up runs: exactly EP8 inside one 8-GPU XGMI domain
+            world_size != 8
+            or gpus_per_node != 8
+            or scale_up_domain != 8
+            or args.scale_up_transport != "xgmi"
+            or args.scale_out_transport
+            or args.transport != "xgmi"
+        ):
+            raise RuntimeError("MoRI scale-up is pinned to EP8 over one XGMI domain")
+        if scale_out and (  # scale-out runs: exactly EP16 with 8 GPUs per node, XGMI plus RDMA
+            world_size != 16
+            or gpus_per_node != 8
+            or scale_up_domain != 8
+            or args.scale_up_transport != "xgmi"
+            or args.scale_out_transport != "rdma"
+            or args.transport != "xgmi-rdma"
+        ):
+            raise RuntimeError(
+                "MoRI InterNodeV1 is pinned to EP16 over two 8-GPU XGMI/RDMA nodes"
+            )
+        self.block_num = self._block_target = 80  # launch defaults; the AsyncLL/InterNodeV1 branches override them
+        self.rdma_block_num = 0
+        self.num_qps = 1
+        self._block_floored = False
+        self._tuned_source = "default-80"
+        self.dispatch_warps = 16
+        self.combine_warps = 8
+
+        # MI355X uses the direct intranode kernel. MI325X uses MoRI's split
+        # AsyncLL send/receive kernel as its normal-mode XGMI transport.
+        kernel_request = os.environ.get("CX_MORI_KERNEL_TYPE", "intranode").strip().lower()  # unset or empty selects intranode
+        self._kernel_type = None  # stays None for intranode, so no kernel_type is passed to the config
+        self._kernel_type_label = "IntraNode"
+        self._async_ll = False
+        self._inter_node = False
+        if kernel_request in ("asyncll", "async_ll", "async-ll"):
+            if scale_out:
+                raise RuntimeError("MoRI EP16 must use InterNodeV1, not AsyncLL")
+            kernel_enum = getattr(mori.ops, "EpDispatchCombineKernelType", None)
+            if kernel_enum is None or not hasattr(kernel_enum, "AsyncLL"):
+                raise RuntimeError(
+                    "CX_MORI_KERNEL_TYPE=asyncll requires "
+                    "EpDispatchCombineKernelType.AsyncLL"
+                )
+            self._kernel_type = kernel_enum.AsyncLL
+            self._kernel_type_label = "AsyncLL"
+            self._async_ll = True
+            self.block_num = self._block_target = 64
+            self.dispatch_warps = self.combine_warps = 8
+            self._tuned_source = "upstream-asyncll-64x8-external-input"
+        elif kernel_request in ("internode-v1", "internode_v1", "internodev1"):
+            if not scale_out:
+                raise RuntimeError("MoRI InterNodeV1 is valid only for scale-out EP16")
+            kernel_enum = getattr(mori.ops, "EpDispatchCombineKernelType", None)
+            if kernel_enum is None or not hasattr(kernel_enum, "InterNodeV1"):
+                raise RuntimeError(
+                    "CX_MORI_KERNEL_TYPE=internode-v1 requires "
+                    "EpDispatchCombineKernelType.InterNodeV1"
+                )
+            self._kernel_type = kernel_enum.InterNodeV1
+            self._kernel_type_label = "InterNodeV1"
+            self._inter_node = True
+            self.block_num = self._block_target = 96
+            self.rdma_block_num = 64
+            self.dispatch_warps = self.combine_warps = 8
+            self._tuned_source = "upstream-internode-v1-96-64x8-qps1"
+        elif kernel_request not in ("intranode", "intra_node", "intra-node", ""):
+            raise RuntimeError(
+                f"unknown CX_MORI_KERNEL_TYPE={kernel_request!r} "
+                "(expected intranode|asyncll|internode-v1)"
+            )
+        elif scale_out:  # reached only for an intranode request, which cannot span nodes
+            raise RuntimeError("MoRI scale-out EP16 requires CX_MORI_KERNEL_TYPE=internode-v1")
+        self.kernel_generation = (
+            "inter-node-v1" if self._inter_node
+            else "async-ll" if self._async_ll
+            else "intranode"
+        )
+        self._external_input = self._async_ll or self._inter_node  # both non-default kernels use the external input buffer
+
+        world_group = torch.distributed.group.WORLD
+        torch._C._distributed_c10d._register_process_group("default", world_group)  # private torch API; registers WORLD under the name used below
+        mori.shmem.shmem_torch_process_group_init("default")
+        realized_qps = int(mori.shmem.shmem_num_qp_per_pe())
+        if realized_qps < self.num_qps:  # more QPs than requested are tolerated, fewer are not
+            raise RuntimeError(
+                f"MoRI realized {realized_qps} QPs per PE; {self.num_qps} required"
+            )
+
+        self._cap = self.buffer_cap(args)
+        config_kwargs = {
+            "data_type": torch.bfloat16,
+            "rank": rank,
+            "world_size": world_size,
+            "hidden_dim": args.hidden,
+            "scale_dim": 0,
+            "scale_type_size": 1,
+            "max_token_type_size": (  # bf16 element size for InterNodeV1, fp32 element size otherwise
+                torch.tensor([], dtype=torch.bfloat16).element_size()
+                if self._inter_node
+                else torch.tensor([], dtype=torch.float32).element_size()
+            ),
+            "max_num_inp_token_per_rank": max(512, self._cap),
+            "num_experts_per_rank": self.experts_per_rank,
+            "num_experts_per_token": args.topk,
+            "use_external_inp_buf": self._external_input,
+            "quant_type": "none",
+        }
+        if self._kernel_type is not None:
+            config_kwargs["kernel_type"] = self._kernel_type
+        if self._async_ll:
+            config_kwargs["max_total_recv_tokens"] = 0
+        if self._async_ll or self._inter_node:
+            config_kwargs["block_num"] = self.block_num
+            config_kwargs["warp_num_per_block"] = self.dispatch_warps
+        if self._inter_node:
+            config_kwargs.update({
+                "gpu_per_node": gpus_per_node,
+                "rdma_block_num": self.rdma_block_num,
+                "num_qp_per_pe": self.num_qps,
+            })
+        self.config = mori.ops.EpDispatchCombineConfig(**config_kwargs)
+        expected_config = {"use_external_inp_buf": self._external_input}  # values read back from the built config below
+        if self._async_ll or self._inter_node:
+            expected_config.update({
+                "block_num": self.block_num,
+                "warp_num_per_block": self.dispatch_warps,
+            })
+        if self._inter_node:
+            expected_config.update({  # literal pins, checked independently of the requested variables
+                "gpu_per_node": 8,
+                "rdma_block_num": 64,
+                "num_qp_per_pe": 1,
+            })
+        if any(getattr(self.config, key, None) != value for key, value in expected_config.items()):
+            raise RuntimeError("MoRI requested launch/topology configuration was not realized")
+        # The newer pinned MoRI revision can otherwise replace explicit values
+        # with token-dependent tuning rules from the image.
+        os.environ["MORI_EP_LAUNCH_CONFIG_MODE"] = "MANUAL"  # set before the op is constructed on the next line
+        self.op = mori.ops.EpDispatchCombineOp(self.config)
+        if getattr(self.op, "launch_config_mode", None) != "MANUAL":
+            raise RuntimeError("MoRI explicit launch configuration was not applied")
+
+        expected_mori_commit = os.environ.get("MORI_COMMIT")  # optional pin; compared only when set and non-empty
+        mori_commit = _mori_source_commit()
+        if expected_mori_commit and mori_commit != expected_mori_commit:
+            raise RuntimeError("MoRI image source revision differs from canonical provenance")
+        self.backend_provenance = {
+            "mori_commit": mori_commit,
+            "api": (
+                "mori.ops.EpDispatchCombineOp/external-input"
+                if self._external_input
+                else "mori.ops.EpDispatchCombineOp/registered-input"
+            ),
+            "mode": "normal",
+            "dispatch_dtype": "bf16",
+            "combine_dtype": "bf16",
+            "kernel_type": self._kernel_type_label,
+            "enable_sdma": os.environ.get("MORI_ENABLE_SDMA"),
+            "heap_size": os.environ.get("MORI_SHMEM_HEAP_SIZE"),
+            "max_num_inp_token_per_rank": max(512, self._cap),
+            "max_total_recv_tokens": config_kwargs.get("max_total_recv_tokens"),  # None unless AsyncLL set it to 0
+            "gpus_per_node": gpus_per_node,
+            "rdma_block_num": self.rdma_block_num,
+            "use_external_inp_buf": self._external_input,
+            "num_qps": self.num_qps,
+            "resource_mode": "tuned",
+            "block_num": self.block_num,
+            "block_num_target": self._block_target,
+            "block_num_floored": self._block_floored,
+            "dispatch_warps": self.dispatch_warps,
+            "combine_warps": self.combine_warps,
+            "device_cus": device_cus,
+            "sm_fraction": None if self._async_ll else self.block_num / device_cus,
+            "tuned_source": self._tuned_source,
+        }
+
+    def buffer_cap(self, args):  # fixed cap; __init__ feeds it into max_num_inp_token_per_rank
+        return 512
+
+    def make_problem(self, T, idx, weights, x):  # bundle one routing slice with int32 indices and fp32 weights
+        indices = idx.to(torch.int32)
+        gate_weights = weights.to(torch.float32)
+        return types.SimpleNamespace(
+            T=T,
+            x=x,
+            topk_idx=indices,
+            topk_weights=gate_weights,
+            indices=indices,  # same tensors again under the names dispatch()/combine() read
+            weights=gate_weights,
+            scales=torch.empty((T, 0), dtype=torch.uint8, device=self.device),  # zero-width placeholder; the config sets scale_dim=0
+        )
+
+    def dispatch(self, p):  # run the dispatch op and wrap its outputs in a handle
+        dispatch_output, dispatch_weights, _scales, dispatch_indices, recv_num = (
+            self.op.dispatch(
+                p.x,
+                p.weights,
+                p.scales,
+                p.indices,
+                block_num=self.block_num,
+                rdma_block_num=self.rdma_block_num,
+                warp_per_block=self.dispatch_warps,
+            )
+        )
+        if self._async_ll:  # AsyncLL needs a separate receive call after the send
+            self.op.dispatch_recv(warp_per_block=self.dispatch_warps)
+        return types.SimpleNamespace(
+            dispatch_output=dispatch_output,
+            dispatch_weights=dispatch_weights,
+            dispatch_indices=dispatch_indices,
+            recv_num=recv_num[0],  # first element only; recv_tokens() converts it to int
+            combine_input=dispatch_output.to(torch.bfloat16),
+        )
+
+    def stage(self, p, h):  # prepare h.combine_input for combine(); a no-op copy-wise for external-input kernels
+        rows = getattr(p, "recv_tokens", None)  # NOTE(review): presumably set on the problem by the harness after validation; confirm
+        if not isinstance(rows, int) or rows < 0 or rows > h.combine_input.size(0):
+            raise RuntimeError("MoRI receive count was not validated before staging")
+        if self._external_input:
+            return None
+        buffer = self.op.get_registered_combine_input_buffer(
+            torch.bfloat16, hidden_dim=h.combine_input.size(1)
+        )
+        buffer[:rows, :].copy_(h.combine_input[:rows, :])  # copy only the validated received rows
+        h.combine_input = buffer
+
+    def combine(self, p, h):  # run the combine op and return the first p.T rows
+        combine_indices = p.indices if self._async_ll else h.dispatch_indices  # AsyncLL combines with the problem's own indices
+        combined, _weights = self.op.combine(
+            h.combine_input,
+            None,
+            combine_indices,
+            block_num=self.block_num,
+            rdma_block_num=self.rdma_block_num,
+            warp_per_block=self.combine_warps,
+        )
+        if self._async_ll:  # AsyncLL needs a separate receive call after the send
+            self.op.combine_recv(warp_per_block=self.combine_warps)
+        return combined[:p.T]
+
+    def inspect_dispatch(self, p, h):  # expose received payload and metadata, masked to this rank's experts
+        count = self.recv_tokens(h)
+        if h.dispatch_weights is None:
+            raise RuntimeError("MoRI dispatch did not expose gate weights")
+        if count < 0 or any(
+            tensor.ndim == 0 or count > tensor.size(0)
+            for tensor in (h.dispatch_output, h.dispatch_indices, h.dispatch_weights)
+        ):
+            raise RuntimeError("MoRI receive count exceeds dispatch metadata")
+        raw_expert_ids = h.dispatch_indices[:count].to(torch.int64)
+        expert_ids, weights, local_expert_ids = _project_local_metadata(
+            torch,
+            raw_expert_ids,
+            h.dispatch_weights[:count].to(torch.float32),
+            self.rank,
+            self.experts_per_rank,
+        )
+        return types.SimpleNamespace(
+            payload=h.dispatch_output[:count],
+            expert_ids=expert_ids,
+            weights=weights,
+            local_expert_counts=torch.bincount(
+                local_expert_ids, minlength=self.experts_per_rank
+            ),
+            ordering_contract="mori-global-topk-masked-v1",
+        )
+
+    def combine_transformed(self, p, h, transformed):  # combine caller-supplied data in place of the dispatched payload
+        h.combine_input = transformed.to(torch.bfloat16)
+        self.stage(p, h)
+        return self.combine(p, h)
+
+    def recv_tokens(self, h):  # received-token count as a Python int
+        return int(h.recv_num.item())
+
+    def finalize(self, rc):  # never returns: terminates the process with os._exit
+        try:
+            dist.barrier()
+        except Exception:  # best-effort barrier; a failure must not block the exit below
+            pass
+        sys.stdout.flush()
+        sys.stderr.flush()
+        os._exit(rc if 0 <= rc <= 255 else 1)  # exits without interpreter cleanup; out-of-range rc becomes 1
diff --git a/experimental/CollectiveX/tests/ep_nccl.py b/experimental/CollectiveX/tests/ep_nccl.py
new file mode 100644
index 0000000000..cab9cb8238
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_nccl.py
@@ -0,0 +1,186 @@
+"""CollectiveX NCCL all-to-all expert-parallel reference backend.
+
+The canonical "token-shuffle" EP built on torch.distributed's NCCL ``all_to_all_single``. Like the
+DeepEP-family APIs, dispatch sends one hidden-state copy to each distinct destination rank, even when
+multiple selected experts live on that rank. Combine reverses the shuffle and sums those rank copies.
+
+Why this exists alongside DeepEP/UCCL/MoRI: it is the portable collective reference baseline for the
+same rank-deduplicated payload and routing metadata. It keeps the library comparison anchored to the
+platform collective stack without claiming the custom fused kernels use the same transport algorithm.
+
+Scope: BF16, normal mode, layout-and-dispatch-v1. The timed dispatch includes layout, count exchange,
+payload, rank-masked expert indices, gate weights, and source-token metadata; combine returns only
+the activation payload. RCCL exposes the same API. The v1 AMD matrix uses this backend at EP8 and EP16.
+"""
+
+import os
+import re
+import types
+
+import torch
+import torch.distributed as dist
+import contracts
+
+
+def _runtime_collective(args, torch_module) -> tuple[str, str]:
+    """Return the ``(kind, version)`` of the collective library in ``args.runtime_fingerprint``.
+
+    Raises RuntimeError unless it is the build's library (rccl on HIP) with an ``X.Y.Z`` version."""
+    kind = "rccl" if torch_module.version.hip else "nccl"
+    fingerprint = getattr(args, "runtime_fingerprint", None)
+    record = fingerprint.get("collective_library") if isinstance(fingerprint, dict) else None
+    version = record.get("version") if isinstance(record, dict) else None
+    well_formed = isinstance(version, str) and re.fullmatch(r"[0-9]+\.[0-9]+\.[0-9]+", version)
+    if not well_formed or record.get("kind") != kind:
+        raise RuntimeError("loaded collective runtime identity is unavailable")
+    return kind, version
+
+
+class NCCLBackend:
+    """Reference EP backend on ``torch.distributed.all_to_all_single`` (NCCL, or RCCL on HIP)."""
+    name = "nccl-ep"
+    combine_needs_redispatch = False  # dispatch saves the permutation + splits
+    combine_weight_semantics = "unweighted-rank-sum"
+
+    def __init__(self, args, rank, world_size, local_rank, device):
+        """Validate the expert split and scale-out pinning, then record backend provenance.
+
+        Raises ValueError for an uneven expert split; RuntimeError for an unpinned or unknown runtime.
+        """
+        self.args = args
+        self.rank = rank
+        self.world_size = world_size
+        self.device = device
+        self.experts = args.experts
+        if args.experts % world_size:
+            raise ValueError(f"experts({args.experts}) must be divisible by world_size({world_size})")
+        self.experts_per_rank = args.experts // world_size
+        self.tolerance = 5e-2  # bf16 round-trip
+        _library, _version = _runtime_collective(args, torch)
+        if args.scale_out_transport:
+            hcas = os.environ.get("NCCL_IB_HCA", "")
+            if os.environ.get("NCCL_NET") != "IB" or not re.fullmatch(
+                r"=[A-Za-z][A-Za-z0-9_.-]{0,31}(?::[1-9][0-9]*)?"
+                r"(?:,[A-Za-z][A-Za-z0-9_.-]{0,31}(?::[1-9][0-9]*)?)*",
+                hcas,
+            ):
+                raise RuntimeError("scale-out collective transport is not pinned to RDMA")
+        self.kernel_generation = contracts.collective_kernel_generation(_library)
+        self.backend_provenance = {
+            "backend": f"{_library}-all2all",
+            "backend_lineage": _library,
+            "collective_library": _library,
+            "nccl_version": _version,
+            "transport": f"{_library}-all_to_all_single",
+            "resource_mode": "tuned",
+            "num_sms": None,
+            "device_sms": torch.cuda.get_device_properties(device).multi_processor_count,
+            "tuned_source": "nccl-collective",
+            "reference_semantics": "rank-deduplicated-payload-plus-routing-metadata-v2",
+            "routing_metadata": "expert-index-gate-weight-source-token",
+        }
+
+    def buffer_cap(self, args):
+        return None  # no fixed pre-allocated buffer; all-to-all sizes itself per step
+
+    def make_problem(self, T, idx, weights, x):
+        """Wrap one routing-trace slice: idx[T,topk] int64, weights[T,topk] f32, x[T,hidden] bf16."""
+        return types.SimpleNamespace(T=T, x=x, topk_idx=idx.to(torch.int64),
+                                     topk_weights=weights.to(torch.float32), layout=None)
+
+    def dispatch(self, p):
+        """Send one copy of each token to every distinct destination rank, with routing metadata."""
+        ws = self.world_size
+        x, idx = p.x, p.topk_idx  # x: [T, H] bf16, idx: [T, topk]
+        T, H, topk = int(x.shape[0]), int(x.shape[1]), int(idx.shape[1])
+        dev = x.device
+        # DeepEP dispatches one token per destination rank, not one copy per expert. Build the same
+        # rank-deduplicated routing map so NCCL traffic and combine semantics are comparable.
+        destinations = (idx // self.experts_per_rank).clamp_(0, ws - 1)
+        present = torch.zeros((T, ws), dtype=torch.bool, device=dev)
+        present.scatter_(1, destinations, True)
+        flat_token, flat_dest = present.nonzero(as_tuple=True)
+        # Group rank copies by destination (stable -> deterministic, invertible permutation).
+        order = torch.argsort(flat_dest, stable=True)
+        ordered_token = flat_token.index_select(0, order)
+        ordered_dest = flat_dest.index_select(0, order)
+        send_counts = torch.bincount(flat_dest, minlength=ws)  # [ws]
+        send_x = x.index_select(0, ordered_token).contiguous()
+        send_topk_idx = idx.index_select(0, ordered_token).contiguous()
+        expert_start = ordered_dest.unsqueeze(1) * self.experts_per_rank
+        local_mask = ((send_topk_idx >= expert_start)
+                      & (send_topk_idx < expert_start + self.experts_per_rank))
+        send_topk_idx = torch.where(local_mask, send_topk_idx - expert_start,
+                                    torch.full_like(send_topk_idx, -1))
+        send_topk_weights = p.topk_weights.index_select(0, ordered_token).contiguous()
+        send_topk_weights.masked_fill_(~local_mask, 0)
+        send_src_metadata = (ordered_token.to(torch.int64) | (self.rank << 32)).contiguous()
+        # Exchange per-rank counts so every rank can size its receive buffer.
+        recv_counts = torch.empty_like(send_counts)
+        dist.all_to_all_single(recv_counts, send_counts)
+        sc = send_counts.tolist()
+        rc = recv_counts.tolist()
+        total_recv = int(sum(rc))
+        recv_x = torch.empty((total_recv, H), dtype=x.dtype, device=dev)
+        recv_topk_idx = torch.empty((total_recv, topk), dtype=idx.dtype, device=dev)
+        recv_topk_weights = torch.empty((total_recv, topk), dtype=p.topk_weights.dtype, device=dev)
+        recv_src_metadata = torch.empty((total_recv,), dtype=torch.int64, device=dev)
+        # Dispatch the uneven per-rank splits over the configured collective transport.
+        dist.all_to_all_single(recv_x, send_x, rc, sc)
+        dist.all_to_all_single(recv_topk_idx, send_topk_idx, rc, sc)
+        dist.all_to_all_single(recv_topk_weights, send_topk_weights, rc, sc)
+        dist.all_to_all_single(recv_src_metadata, send_src_metadata, rc, sc)
+        return types.SimpleNamespace(
+            recv_x=recv_x, combine_input=None, order=order, flat_token=flat_token,
+            recv_topk_idx=recv_topk_idx, recv_topk_weights=recv_topk_weights,
+            recv_src_rank=recv_src_metadata >> 32, recv_src_token=recv_src_metadata & ((1 << 32) - 1),
+            send_counts=sc, recv_counts=rc, T=T, H=H, topk=topk, total_recv=total_recv)
+
+    def stage(self, p, h):
+        """No expert compute: the expert "output" is the received tokens as-is (round-trip identity)."""
+        h.combine_input = h.recv_x
+        return None
+
+    def combine(self, p, h):
+        """Reverse the all-to-all (split lists swapped) and sum the returned copies per source token."""
+        send_back = torch.empty((int(h.order.shape[0]), h.H), dtype=h.combine_input.dtype,
+                                device=h.combine_input.device)
+        dist.all_to_all_single(send_back, h.combine_input.contiguous(), h.send_counts, h.recv_counts)
+        # send_back is in send (sorted) order; invert the argsort to token-copy order.
+        copies = torch.empty_like(send_back)
+        copies[h.order] = send_back
+        # Sum one copy per destination rank under this reference's explicit unweighted contract.
+        out = torch.zeros((h.T, h.H), dtype=torch.float32, device=send_back.device)
+        out.index_add_(0, h.flat_token, copies.float())
+        return out.to(p.x.dtype)
+
+    def inspect_dispatch(self, p, h):
+        """Expose the received payload with global expert IDs, masked weights and per-expert counts."""
+        valid = h.recv_topk_idx >= 0
+        global_ids = h.recv_topk_idx + self.rank * self.experts_per_rank
+        expert_ids = torch.where(valid, global_ids, h.recv_topk_idx)
+        return types.SimpleNamespace(
+            payload=h.recv_x,
+            expert_ids=expert_ids,
+            weights=h.recv_topk_weights.masked_fill(~valid, 0),
+            local_expert_counts=torch.bincount(h.recv_topk_idx[valid], minlength=self.experts_per_rank),
+            ordering_contract="source-rank-major-stable-v1",
+        )
+
+    def combine_transformed(self, p, h, transformed):
+        """Combine ``transformed`` (cast to the received dtype) in place of the received payload."""
+        h.combine_input = transformed.to(h.recv_x.dtype)
+        return self.combine(p, h)
+
+    def recv_tokens(self, h):
+        """Return the number of token copies this rank received in ``dispatch``."""
+        return int(h.total_recv)
+
+    def finalize(self, rc):
+        """Attempt a barrier and process-group teardown, then return ``rc`` unchanged."""
+        try:
+            dist.barrier()
+            dist.destroy_process_group()
+        except Exception:  # best-effort teardown: a failure here must not replace the run's exit code
+            pass
+        return rc
diff --git a/experimental/CollectiveX/tests/ep_uccl.py b/experimental/CollectiveX/tests/ep_uccl.py
new file mode 100644
index 0000000000..7d02bb2991
--- /dev/null
+++ b/experimental/CollectiveX/tests/ep_uccl.py
@@ -0,0 +1,405 @@
+#!/usr/bin/env python3
+"""CollectiveX UCCL adapter for the v1 BF16 normal-mode workload."""
+from __future__ import annotations
+
+import importlib.metadata as metadata
+import json
+import os
+from pathlib import Path
+from pathlib import PurePosixPath
+import sys
+import types
+
+import torch
+import torch.distributed as dist
+import contracts
+
+try:
+ import uccl
+ import uccl_deepep
+ from uccl_deepep import Buffer # type: ignore
+except Exception as exc: # pragma: no cover - requires the benchmark image
+ print(f"ERROR: uccl.ep import failed: {exc!r}", file=sys.stderr)
+ raise
+
+
+ def _uccl_version() -> str:
+     """Return the installed UCCL version string.
+
+     Distribution metadata is consulted first; if that lookup raises, the imported
+     module's ``__version__`` is used, or ``"unknown"`` when it has none.
+     """
+     try:
+         installed = metadata.version("uccl")
+     except Exception:
+         installed = getattr(uccl, "__version__", "unknown")
+     return installed
+
+
+ def _uccl_dependency_versions() -> dict[str, str]:
+     """Return the installed versions of the contract-pinned UCCL dependencies.
+
+     Raises:
+         RuntimeError: if the installed versions differ from
+             ``contracts.UCCL_DEPENDENCY_VERSIONS``.
+     """
+     pinned = contracts.UCCL_DEPENDENCY_VERSIONS
+     installed = {}
+     for package in pinned:
+         installed[package] = metadata.version(package)
+     if installed == pinned:
+         return installed
+     raise RuntimeError(
+         "UCCL runtime dependency versions differ from the v1 contract"
+     )
+
+
+ def _is_uccl_runtime_payload(name: str) -> bool:
+     """Tell whether a distribution file path belongs to the UCCL runtime payload.
+
+     A path qualifies when its first component is ``uccl`` or ``uccl.libs`` and it is
+     neither under a ``__pycache__`` directory nor a ``.pyc`` file.
+     """
+     candidate = PurePosixPath(name)
+     components = candidate.parts
+     if not components:
+         return False
+     if components[0] not in {"uccl", "uccl.libs"}:
+         return False
+     if "__pycache__" in components:
+         return False
+     return candidate.suffix != ".pyc"
+
+
+ def _python_dependency_evidence(package: str, version: str) -> dict[str, str]:
+     """Build content-manifest evidence for one installed Python distribution.
+
+     Only files whose first recorded path component equals ``package`` are included;
+     ``__pycache__`` entries, ``.pyc`` files and paths that are not regular files are
+     skipped.
+     """
+     distribution = metadata.distribution(package)
+     runtime_files = []
+     for entry in distribution.files or ():
+         recorded = entry.as_posix()
+         logical = PurePosixPath(recorded)
+         on_disk = Path(distribution.locate_file(entry))
+         if not logical.parts or logical.parts[0] != package:
+             continue
+         if "__pycache__" in logical.parts or logical.suffix == ".pyc":
+             continue
+         if not on_disk.is_file():
+             continue
+         runtime_files.append((recorded, on_disk))
+     return contracts.content_manifest_evidence(
+         role=f"{package}-distribution",
+         name=f"{package}-{version}",
+         files=runtime_files,
+     )
+
+
+ def _loaded_libcudart_evidence(
+ version: str, maps_path: Path = Path("/proc/self/maps")
+ ) -> dict[str, str]:
+ # Purpose: check that the libcudart mapped into this process is a file shipped by
+ # the installed nvidia-cuda-runtime-cu12 distribution and return content-manifest
+ # evidence for it. `version` is only used to build the evidence name; `maps_path`
+ # is the memory-map listing to scan and defaults to /proc/self/maps.
+ # Raises RuntimeError when the distribution ships no libcudart file, the maps file
+ # cannot be read, a mapped libcudart is deleted/missing or is not one of the
+ # distribution's files, or the number of distinct mapped files is not exactly one.
+ distribution = metadata.distribution("nvidia-cuda-runtime-cu12")
+ # Resolved on-disk paths of every existing libcudart.so* file the distribution records.
+ candidates = {
+ Path(distribution.locate_file(entry)).resolve()
+ for entry in distribution.files or ()
+ if PurePosixPath(entry.as_posix()).name.startswith("libcudart.so")
+ and Path(distribution.locate_file(entry)).is_file()
+ }
+ # File names are taken from the resolved paths and used as a cheap pre-filter below.
+ candidate_names = {path.name for path in candidates}
+ if not candidates or not candidate_names:
+ raise RuntimeError("pinned CUDA runtime distribution has no libcudart payload")
+
+ # Distinct resolved library files seen in the maps listing (a set, so repeated
+ # mappings of the same file count once).
+ loaded: set[Path] = set()
+ try:
+ mappings = maps_path.read_text().splitlines()
+ except OSError as exc:
+ raise RuntimeError("cannot inspect mapped UCCL runtime libraries") from exc
+ for mapping in mappings:
+ # Split into at most six fields so the sixth keeps the whole pathname, spaces
+ # included; lines with fewer fields carry no pathname and are skipped.
+ columns = mapping.split(maxsplit=5)
+ if len(columns) != 6:
+ continue
+ raw_path = columns[5]
+ # Strip the " (deleted)" marker so the name can still be matched, but remember it.
+ deleted = raw_path.endswith(" (deleted)")
+ if deleted:
+ raw_path = raw_path.removesuffix(" (deleted)")
+ mapped = Path(raw_path)
+ if mapped.name not in candidate_names:
+ continue
+ # A matching name whose file is gone cannot be verified by content: fail.
+ if deleted or not mapped.is_file():
+ raise RuntimeError(
+ "mapped libcudart is unavailable for content verification"
+ )
+ resolved = mapped.resolve()
+ # Same file name but a different file than the distribution's: fail.
+ if resolved not in candidates:
+ raise RuntimeError(
+ "mapped libcudart is not owned by the pinned CUDA runtime package"
+ )
+ loaded.add(resolved)
+ # Exactly one distinct library file must have been seen.
+ if len(loaded) != 1:
+ raise RuntimeError(
+ "expected exactly one mapped libcudart from the pinned CUDA runtime"
+ )
+ # The evidence entry uses the fixed logical name "libcudart.so" for the mapped file.
+ return contracts.content_manifest_evidence(
+ role="cuda-runtime",
+ name=f"nvidia-cuda-runtime-cu12-{version}",
+ files=[("libcudart.so", loaded.pop())],
+ )
+
+
+ def _uccl_build_evidence(
+     version: str, dependency_versions: dict[str, str]
+ ) -> list[dict[str, str]]:
+     """Collect content-manifest evidence for UCCL, its wrapper and pinned dependencies.
+
+     The returned list is ordered: UCCL distribution payload, ``uccl_deepep`` wrapper
+     ``*.py`` sources, ``intervaltree``, ``sortedcontainers``, then the mapped CUDA
+     runtime library.
+     """
+     distribution = metadata.distribution("uccl")
+     distribution_files = []
+     for entry in distribution.files or ():
+         recorded = entry.as_posix()
+         if not _is_uccl_runtime_payload(recorded):
+             continue
+         located = distribution.locate_file(entry)
+         if Path(located).is_file():
+             distribution_files.append((recorded, located))
+
+     # Wrapper sources are keyed by their path relative to the wrapper package root.
+     wrapper_root = Path(uccl_deepep.__file__).resolve().parent
+     wrapper_files = []
+     for source in wrapper_root.rglob("*.py"):
+         if source.is_file():
+             wrapper_files.append((source.relative_to(wrapper_root).as_posix(), source))
+
+     evidence = [
+         contracts.content_manifest_evidence(
+             role="uccl-distribution",
+             name=f"uccl-{version}",
+             files=distribution_files,
+         ),
+         contracts.content_manifest_evidence(
+             role="uccl-wrapper",
+             name="uccl-deepep-wrapper",
+             files=wrapper_files,
+         ),
+     ]
+     for package in ("intervaltree", "sortedcontainers"):
+         evidence.append(_python_dependency_evidence(package, dependency_versions[package]))
+     evidence.append(
+         _loaded_libcudart_evidence(dependency_versions["nvidia-cuda-runtime-cu12"])
+     )
+     return evidence
+
+
+ def _require_cross_rank_equal(value, label: str) -> None:
+     """Raise RuntimeError unless every rank contributes the same ``value``.
+
+     Values are gathered from all ranks and compared by their canonical JSON text, so
+     ``value`` must be JSON-serializable.
+     """
+     world = dist.get_world_size()
+     gathered = [None for _ in range(world)]
+     dist.all_gather_object(gathered, value)
+     # Sorted keys and compact separators make the text independent of dict ordering.
+     distinct = set()
+     for item in gathered:
+         distinct.add(json.dumps(item, sort_keys=True, separators=(",", ":")))
+     if len(distinct) == 1:
+         return
+     raise RuntimeError(f"UCCL {label} differs across ranks")
+
+
+ def _normal_buffer_sizes(hidden: int, world_size: int) -> tuple[int, int]:
+     """Size the normal-mode NVL and RDMA buffers from the Buffer config hints.
+
+     Each size is the larger of the dispatch-config and combine-config hints for one
+     bfloat16 hidden vector.
+
+     Raises:
+         RuntimeError: if the NVL size is not positive or the RDMA size is negative.
+     """
+     element_bytes = torch.tensor([], dtype=torch.bfloat16).element_size()
+     hidden_bytes = hidden * element_bytes
+     dispatch_config = Buffer.get_dispatch_config(world_size)
+     combine_config = Buffer.get_combine_config(world_size)
+     nvl_hints = [
+         int(dispatch_config.get_nvl_buffer_size_hint(hidden_bytes, world_size)),
+         int(combine_config.get_nvl_buffer_size_hint(hidden_bytes, world_size)),
+     ]
+     rdma_hints = [
+         int(dispatch_config.get_rdma_buffer_size_hint(hidden_bytes, world_size)),
+         int(combine_config.get_rdma_buffer_size_hint(hidden_bytes, world_size)),
+     ]
+     num_nvl_bytes = max(nvl_hints)
+     num_rdma_bytes = max(rdma_hints)
+     if num_nvl_bytes > 0 and num_rdma_bytes >= 0:
+         return num_nvl_bytes, num_rdma_bytes
+     raise RuntimeError("UCCL returned invalid normal-mode buffer size hints")
+
+
+ class UCCLBackend:
+ # CollectiveX adapter that drives dispatch/combine through a UCCL `Buffer`.
+ # The class attributes are the normal-mode defaults; __init__ replaces the four
+ # after `name` on the instance when mode == "low-latency".
+ name = "uccl"
+ combine_needs_redispatch = False
+ combine_weight_semantics = "unweighted-rank-sum"
+ oracle_layout = "token-rank"
+ payload_unit = "token-rank"
+
+ def __init__(self, args, rank, world_size, local_rank, device):
+ # Create the mode-specific Buffer and record provenance in self.backend_provenance.
+ # Reads args.mode (defaults to "normal"), args.phase, args.experts, args.hidden,
+ # args.scale_up_domain and args.num_sms; `local_rank` is accepted but unused here.
+ # Raises ValueError for an unsupported mode or invalid low-latency arguments, and
+ # RuntimeError for buffer-size, num_sms, dependency or cross-rank evidence failures.
+ self.args = args
+ self.rank = rank
+ self.world_size = world_size
+ self.device = device
+ self.mode = getattr(args, "mode", "normal")
+ if self.mode not in {"normal", "low-latency"}:
+ raise ValueError(f"unsupported UCCL mode {self.mode!r}")
+
+ self.group = dist.group.WORLD
+ device_sms = torch.cuda.get_device_properties(device).multi_processor_count
+ if self.mode == "low-latency":
+ if args.phase != "decode":
+ raise ValueError("UCCL low-latency mode only supports the decode ladder")
+ # The expert count must be a multiple of world_size (it is divided evenly below).
+ if args.experts % world_size:
+ raise ValueError("UCCL low-latency experts must divide the EP group")
+ # Instance-level overrides of the normal-mode class defaults.
+ self.combine_needs_redispatch = True
+ self.combine_weight_semantics = "gate-weighted-sum"
+ self.oracle_layout = "expert-packed"
+ self.payload_unit = "token-expert"
+ # Fixed per-rank token capacity; also the value buffer_cap() reports.
+ self.max_tokens_per_rank = 128
+ # Equals the number of experts hosted per rank.
+ num_qps_per_rank = args.experts // world_size
+ num_rdma_bytes = Buffer.get_low_latency_rdma_size_hint(
+ self.max_tokens_per_rank, args.hidden, world_size, args.experts
+ )
+ self.buffer = Buffer(
+ self.group,
+ num_nvl_bytes=0,
+ num_rdma_bytes=num_rdma_bytes,
+ low_latency_mode=True,
+ num_qps_per_rank=num_qps_per_rank,
+ allow_nvlink_for_low_latency_mode=True,
+ )
+ self.buffer.clean_low_latency_buffer(
+ self.max_tokens_per_rank, args.hidden, args.experts
+ )
+ # No SM count is recorded for this mode; the SM fields are left as None.
+ resource_provenance = {
+ "requested_num_sms": None,
+ "num_sms": None,
+ "sm_fraction": None,
+ "tuned_source": "uccl-low-latency-fixed-kernel",
+ "num_max_tokens_per_rank": self.max_tokens_per_rank,
+ "num_nvl_bytes": 0,
+ "num_rdma_bytes": num_rdma_bytes,
+ }
+ else:
+ num_nvl_bytes, num_rdma_bytes = _normal_buffer_sizes(args.hidden, world_size)
+ # A world larger than the scale-up domain needs a non-empty RDMA buffer.
+ if world_size > args.scale_up_domain and num_rdma_bytes == 0:
+ raise RuntimeError("UCCL scale-out configuration returned no RDMA buffer")
+ self.buffer = Buffer(self.group, num_nvl_bytes, num_rdma_bytes)
+ # Buffer.num_sms wins when the attribute exists; args.num_sms is only the fallback.
+ # NOTE(review): when Buffer.num_sms exists, the value set below is the value just
+ # read, so "requested_num_sms" records the library default - confirm this is intended.
+ num_sms = int(getattr(Buffer, "num_sms", args.num_sms))
+ try:
+ Buffer.set_num_sms(num_sms)
+ except Exception as exc: # pragma: no cover - version dependent
+ raise RuntimeError(
+ f"UCCL did not apply requested num_sms={num_sms}: {exc!r}"
+ ) from exc
+ # Read the value back and refuse to continue if it was not applied.
+ applied_num_sms = int(getattr(Buffer, "num_sms", num_sms))
+ if applied_num_sms != num_sms:
+ raise RuntimeError(
+ f"UCCL num_sms mismatch: requested={num_sms} applied={applied_num_sms}"
+ )
+ resource_provenance = {
+ "requested_num_sms": num_sms,
+ "num_sms": applied_num_sms,
+ "sm_fraction": applied_num_sms / device_sms,
+ "tuned_source": "uccl-default-num_sms",
+ "num_nvl_bytes": num_nvl_bytes,
+ "num_rdma_bytes": num_rdma_bytes,
+ }
+ # Collect version and content evidence, then require that every rank computed the same.
+ version = _uccl_version()
+ dependency_versions = _uccl_dependency_versions()
+ loaded_libraries = _uccl_build_evidence(version, dependency_versions)
+ _require_cross_rank_equal(loaded_libraries, "installed content identities")
+ # uccl_commit falls back to "pkg-<version>" when UCCL_COMMIT is unset or empty;
+ # uccl_wrapper_commit is None when UCCL_WRAPPER_COMMIT is unset.
+ self.backend_provenance = {
+ "uccl_version": version,
+ "uccl_commit": os.environ.get("UCCL_COMMIT") or f"pkg-{version}",
+ "uccl_wrapper_commit": os.environ.get("UCCL_WRAPPER_COMMIT"),
+ "backend_lineage": "uccl",
+ "uccl_dependency_versions": dependency_versions,
+ "loaded_libraries": loaded_libraries,
+ "mode": self.mode,
+ "dispatch_dtype": "bf16",
+ "combine_dtype": "bf16",
+ "resource_mode": "tuned",
+ "device_sms": device_sms,
+ **resource_provenance,
+ }
+
+ def buffer_cap(self, args):
+ # Per-rank token capacity in low-latency mode; None in normal mode. `args` is unused.
+ return self.max_tokens_per_rank if self.mode == "low-latency" else None
+
+ def make_problem(self, T, idx, weights, x):
+ # Bundle one problem: routing ids are cast to int64 and gate weights to float32.
+ return types.SimpleNamespace(
+ T=T,
+ x=x,
+ topk_idx=idx.to(torch.int64),
+ topk_weights=weights.to(torch.float32),
+ )
+
+ def dispatch(self, p):
+ # Run the mode-specific dispatch and return a handle namespace for later calls.
+ # Low-latency handles carry recv_x, recv_counts and handle only; normal-mode
+ # handles additionally carry recv_topk_idx and recv_topk_weights.
+ if self.mode == "low-latency":
+ # Only the first three of the five returned values are kept.
+ recv_x, recv_counts, handle, _, _ = self.buffer.low_latency_dispatch(
+ p.x,
+ p.topk_idx,
+ self.max_tokens_per_rank,
+ self.args.experts,
+ use_fp8=False,
+ async_finish=False,
+ return_recv_hook=False,
+ )
+ return types.SimpleNamespace(
+ recv_x=recv_x,
+ recv_counts=recv_counts,
+ handle=handle,
+ )
+ # Normal mode: compute the layout first, then feed it to dispatch. The fifth
+ # layout value is discarded.
+ (
+ num_tokens_per_rank,
+ num_tokens_per_rdma_rank,
+ num_tokens_per_expert,
+ is_token_in_rank,
+ _,
+ ) = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts)
+ recv_x, recv_topk_idx, recv_topk_weights, recv_counts, handle, _ = self.buffer.dispatch(
+ p.x,
+ topk_idx=p.topk_idx,
+ topk_weights=p.topk_weights,
+ num_tokens_per_rank=num_tokens_per_rank,
+ num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
+ is_token_in_rank=is_token_in_rank,
+ num_tokens_per_expert=num_tokens_per_expert,
+ )
+ return types.SimpleNamespace(
+ recv_x=recv_x,
+ recv_topk_idx=recv_topk_idx,
+ recv_topk_weights=recv_topk_weights,
+ recv_counts=recv_counts,
+ handle=handle,
+ )
+
+ def stage(self, p, h):
+ # No expert compute: the received payload is used directly as the combine input.
+ h.combine_input = h.recv_x
+
+ def combine(self, p, h):
+ # Combine h.combine_input back to the source ranks; returns the combined tensor only.
+ if self.mode == "low-latency":
+ # The low-latency combine also takes the routing ids and gate weights.
+ combined_x, _, _ = self.buffer.low_latency_combine(
+ h.combine_input,
+ p.topk_idx,
+ p.topk_weights,
+ h.handle,
+ async_finish=False,
+ return_recv_hook=False,
+ )
+ return combined_x
+ combined_x, _, _ = self.buffer.combine(h.combine_input, h.handle)
+ return combined_x
+
+ def inspect_dispatch(self, p, h):
+ # Token-rank view of a dispatch: payload, global expert ids, weights and counts.
+ # NOTE(review): reads h.recv_topk_idx / h.recv_topk_weights, which dispatch() only
+ # sets in normal mode - presumably never called in low-latency mode; confirm.
+ valid = h.recv_topk_idx >= 0
+ # Non-negative local ids are shifted by this rank's expert block; negative
+ # entries are passed through unchanged and get weight 0 below.
+ expert_ids = torch.where(
+ valid,
+ h.recv_topk_idx + self.rank * (self.args.experts // self.world_size),
+ h.recv_topk_idx,
+ )
+ return types.SimpleNamespace(
+ payload=h.recv_x,
+ expert_ids=expert_ids,
+ weights=h.recv_topk_weights.masked_fill(~valid, 0),
+ local_expert_counts=torch.tensor(h.recv_counts, device=self.device, dtype=torch.int64),
+ ordering_contract="source-rank-major-stable-v1",
+ )
+
+ def inspect_expert_dispatch(self, p, h):
+ # Expert-packed view of a low-latency dispatch; raises RuntimeError in normal mode.
+ if self.mode != "low-latency":
+ raise RuntimeError("expert-packed inspection requires low-latency mode")
+ # The first two elements of the dispatch handle are exposed as source_info and
+ # layout_range.
+ return types.SimpleNamespace(
+ payload=h.recv_x,
+ local_expert_counts=h.recv_counts,
+ source_info=h.handle[0],
+ layout_range=h.handle[1],
+ )
+
+ def combine_transformed(self, p, h, transformed):
+ # Combine caller-supplied values (cast to recv_x's dtype) instead of the staged input.
+ if self.mode == "low-latency":
+ # Scatter the values into a zeroed buffer shaped like recv_x. The two oracle_*
+ # index attributes are not set anywhere in this class, so they must be placed
+ # on the handle before this is called.
+ packed = torch.zeros_like(h.recv_x)
+ packed[h.oracle_local_expert_slots, h.oracle_packed_positions] = transformed.to(
+ h.recv_x.dtype
+ )
+ combined, _, _ = self.buffer.low_latency_combine(
+ packed,
+ p.topk_idx,
+ p.topk_weights,
+ h.handle,
+ async_finish=False,
+ return_recv_hook=False,
+ )
+ return combined
+ combined, _, _ = self.buffer.combine(transformed.to(h.recv_x.dtype), h.handle)
+ return combined
+
+ def recv_tokens(self, h):
+ # Received-token count: the sum of recv_counts in low-latency mode, otherwise the
+ # number of rows in recv_x.
+ if self.mode == "low-latency":
+ return int(h.recv_counts.to(torch.int64).sum().item())
+ return int(h.recv_x.shape[0])
+
+ def finalize(self, rc):
+ # UCCL's proxy teardown can crash after results are written; preserve the real rc.
+ # A failing barrier is ignored; stdout/stderr are flushed by hand because
+ # os._exit() ends the process immediately. An rc outside 0..255 is reported as 1.
+ try:
+ dist.barrier()
+ except Exception:
+ pass
+ sys.stdout.flush()
+ sys.stderr.flush()
+ os._exit(rc if 0 <= rc <= 255 else 1)
diff --git a/experimental/CollectiveX/tests/eplb.py b/experimental/CollectiveX/tests/eplb.py
new file mode 100644
index 0000000000..b1479da9f1
--- /dev/null
+++ b/experimental/CollectiveX/tests/eplb.py
@@ -0,0 +1,199 @@
+#!/usr/bin/env python3
+"""CollectiveX — EPLB (Expert-Parallel Load Balancer), the DeepSeek-style remedy for
+skewed (zipf) expert load.
+
+Under skewed routing, the ranks hosting hot logical experts receive far more token-copies
+than the rest; dispatch/combine latency is gated by that busiest rank (the cross-rank MAX
+the harness measures), so the whole collective stalls on it. EPLB REPLICATES hot experts
+onto extra physical slots and PLACES the slots so every rank carries ~equal load.
+
+This module is backend-agnostic: it is purely a transform of the deterministic routing
+trace. The trick that keeps every adapter unchanged — DeepEP/MoRI both route expert i to
+rank `i // experts_per_rank` (contiguous block placement) — is to number the physical slots
+RANK-MAJOR (rank r owns physical ids [r*spp, (r+1)*spp)), so the standard contiguous mapping
+reproduces EPLB's balanced placement. The harness then runs with `experts = num_physical`
+and the remapped (physical) trace; nothing else changes.
+
+ num_physical = num_logical + redundant (redundant rounded up to a multiple of ep_size)
+ build_plan(): greedy replicate-by-load + equal-cardinality balanced packing onto ep_size ranks
+ remap_idx(): each token's logical targets -> physical replicas, spread by global token id
+
+Pure-Python planner (no torch) so it unit-tests on a login node; remap_idx needs torch.
+"""
+from __future__ import annotations
+
+import hashlib
+import json
+
+
+ def physical_count(num_logical: int, num_redundant: int, ep_size: int) -> int:
+     """Return the physical expert count: logical experts plus padded redundancy.
+
+     A negative ``num_redundant`` counts as zero; the redundant slots are rounded UP
+     to a whole multiple of ``ep_size`` so every rank gets the same number of slots.
+     """
+     requested = max(0, num_redundant)
+     per_rank_extra = (requested + ep_size - 1) // ep_size
+     padded_redundant = per_rank_extra * ep_size
+     return num_logical + padded_redundant
+
+
+ def _contiguous_rank_load(logical_load, ep_size):
+     """Per-rank load WITHOUT EPLB, with logical experts placed in contiguous blocks.
+
+     Each rank owns ``len(logical_load) // ep_size`` consecutive experts; entries
+     beyond the last full block are not counted.
+     """
+     block = len(logical_load) // ep_size
+     totals = []
+     for rank in range(ep_size):
+         start = rank * block
+         totals.append(sum(logical_load[start:start + block]))
+     return totals
+
+
+ def build_plan(logical_load, num_physical: int, ep_size: int) -> dict:
+ """logical_load: list[float] length num_logical (token-copies per logical expert).
+ Returns the replication+placement plan (all pure-Python lists) + before/after balance."""
+ # Returned keys: num_logical, num_physical, ep_size, slots_per_rank, replicas
+ # (replica count per logical expert), max_replicas, phys2log (logical expert of each
+ # physical id), rank_of_phys (rank of each physical id), log2phys (physical ids per
+ # logical expert), rank_load_after / rank_load_before, imbalance_before /
+ # imbalance_after, replicated_experts.
+ num_logical = len(logical_load)
+ # NOTE(review): both counts must be divisible BY ep_size (the messages read the other
+ # way round), and these checks are asserts, so they disappear under `python -O`.
+ assert num_physical >= num_logical, "num_physical must be >= num_logical"
+ assert num_physical % ep_size == 0, "num_physical must divide ep_size"
+ assert num_logical % ep_size == 0, "num_logical must divide ep_size"
+ spp = num_physical // ep_size # physical slots per rank (fixed)
+
+ # 1) Replica allocation — start one slot per logical expert, then hand each redundant
+ # slot to the expert with the highest CURRENT per-replica load (greedy min-max).
+ # The strict `>` means a tie goes to the lowest expert index.
+ replicas = [1] * num_logical
+ for _ in range(num_physical - num_logical):
+ best, best_lps = 0, -1.0
+ for e in range(num_logical):
+ lps = logical_load[e] / replicas[e]
+ if lps > best_lps:
+ best, best_lps = e, lps
+ replicas[best] += 1
+
+ # 2) Slots = (per-replica load, logical expert), one per replica.
+ slots = []
+ for e in range(num_logical):
+ lps = logical_load[e] / replicas[e]
+ slots.extend((lps, e) for _ in range(replicas[e]))
+
+ # 3) Balanced packing into ep_size bins of EQUAL cardinality (spp each), minimizing the
+ # max per-rank load: heaviest slot first -> least-loaded rank that still has capacity.
+ # Tuples sort by load, then expert id, both descending; min() returns the first
+ # (lowest-numbered) rank among equally loaded candidates.
+ slots.sort(reverse=True)
+ rank_slots = [[] for _ in range(ep_size)]
+ rank_load = [0.0] * ep_size
+ for lps, e in slots:
+ r = min((r for r in range(ep_size) if len(rank_slots[r]) < spp),
+ key=lambda r: rank_load[r])
+ rank_slots[r].append(e)
+ rank_load[r] += lps
+
+ # 4) Rank-major physical numbering -> contiguous placement == this balanced placement.
+ phys2log, rank_of_phys = [], []
+ for r in range(ep_size):
+ for e in rank_slots[r]:
+ phys2log.append(e)
+ rank_of_phys.append(r)
+ # Invert phys2log: physical ids of each logical expert, in ascending id order.
+ log2phys = [[] for _ in range(num_logical)]
+ for pid, e in enumerate(phys2log):
+ log2phys[e].append(pid)
+
+ before = _contiguous_rank_load(logical_load, ep_size)
+ # `or 1.0` keeps the mean non-zero when the total load is zero, so the divisions
+ # below cannot fail.
+ total = sum(logical_load) or 1.0
+ mean = total / ep_size
+ return {
+ "num_logical": num_logical, "num_physical": num_physical, "ep_size": ep_size,
+ "slots_per_rank": spp, "replicas": replicas, "max_replicas": max(replicas),
+ "phys2log": phys2log, "rank_of_phys": rank_of_phys, "log2phys": log2phys,
+ "rank_load_after": rank_load, "rank_load_before": before,
+ # imbalance = busiest rank / mean (1.0 = perfect). This is the number EPLB cuts.
+ "imbalance_before": max(before) / mean, "imbalance_after": max(rank_load) / mean,
+ "replicated_experts": sum(1 for r in replicas if r > 1),
+ }
+
+
+ def mapping_hash(plan: dict) -> str:
+     """Return the SHA-256 hex digest of the plan's placement fields.
+
+     Only ``phys2log``, ``rank_of_phys`` and ``replicas`` are hashed, serialized as
+     JSON with sorted keys.
+     """
+     placement = {field: plan[field] for field in ("phys2log", "rank_of_phys", "replicas")}
+     encoded = json.dumps(placement, sort_keys=True).encode()
+     return hashlib.sha256(encoded).hexdigest()
+
+
+ def remap_rows(indices: list[list[int]], plan: dict) -> list[list[int]]:
+     """Pure-Python counterpart of remap_idx, used for contract verification.
+
+     Row ``t`` is token ``t``; each logical expert id in it is replaced by that
+     expert's physical replica number ``t % replica_count``.
+     """
+     table = plan["log2phys"]
+     remapped = []
+     for token, row in enumerate(indices):
+         physical = []
+         for expert in row:
+             choices = table[expert]
+             physical.append(choices[token % len(choices)])
+         remapped.append(physical)
+     return remapped
+
+
+ def remap_idx(idx_logical, plan):
+     """Map logical expert ids to physical replica ids, spread by global token id.
+
+     Args:
+         idx_logical: integer tensor ``[gt, topk]`` of logical expert ids; row ``t``
+             is global token ``t``.
+         plan: plan dict from ``build_plan``; reads ``replicas``, ``max_replicas``
+             and ``log2phys``.
+
+     Returns:
+         int64 tensor ``[gt, topk]`` on ``idx_logical``'s device. Token ``t`` routed
+         to logical expert ``e`` is sent to replica ``t % replicas[e]`` of ``e``, so a
+         hot expert's tokens fan out across its replicas. Replicas of distinct
+         logical experts are disjoint, so a token's top-k physical ids stay distinct.
+     """
+     import torch
+
+     # Build the lookup tables on the input's device so the indexing below never
+     # mixes devices (the tables used to be CPU-only).
+     device = idx_logical.device
+     max_rc = plan["max_replicas"]
+     replica_counts = plan["replicas"]
+     rc = torch.tensor(replica_counts, dtype=torch.int64, device=device)
+     # Padded [num_logical, max_rc] table of physical ids. Rows are padded with the
+     # expert's first replica; padding is never selected because the replica index is
+     # taken modulo rc[e]. Built as one nested list so the tensor is created in a
+     # single call instead of element by element.
+     table_rows = [
+         (list(phys) + [phys[0]] * max_rc)[:max_rc] for phys in plan["log2phys"]
+     ]
+     padded = torch.tensor(table_rows, dtype=torch.int64, device=device).reshape(
+         len(replica_counts), max_rc
+     )
+     gt = idx_logical.shape[0]
+     rows = torch.arange(gt, dtype=torch.int64, device=device).unsqueeze(1)  # [gt,1]
+     e = idx_logical.to(torch.int64)  # [gt,topk]
+     ridx = rows % rc[e]  # [gt,topk] replica index
+     return padded[e, ridx]  # [gt,topk] physical ids
+
+
+# --------------------------------------------------------------------------- self-test
+ if __name__ == "__main__":
+ # Synthetic zipf load (popularity ∝ 1/(e+1)) — the case EPLB targets. No torch needed.
+ # Every gate below is an assert, so a failed gate aborts with AssertionError before
+ # the final PASS line and sys.exit(0) are reached.
+ import sys
+ NUM_LOGICAL, EP, REDUNDANT = 256, 8, 32
+ load = [1.0 / (e + 1) for e in range(NUM_LOGICAL)]
+ nphys = physical_count(NUM_LOGICAL, REDUNDANT, EP)
+ plan = build_plan(load, nphys, EP)
+ print(f"num_logical={NUM_LOGICAL} ep={EP} num_physical={nphys} slots/rank={plan['slots_per_rank']}")
+ print(f"replicated experts={plan['replicated_experts']} max_replicas={plan['max_replicas']} "
+ f"(hottest expert 0 replicas={plan['replicas'][0]})")
+ print(f"per-rank load BEFORE (contiguous): {[round(x,3) for x in plan['rank_load_before']]}")
+ print(f"per-rank load AFTER (EPLB): {[round(x,3) for x in plan['rank_load_after']]}")
+ print(f"imbalance (max/mean) BEFORE={plan['imbalance_before']:.2f}x AFTER={plan['imbalance_after']:.2f}x")
+ # Gates: equal slot cardinality, every logical expert placed, big imbalance cut.
+ assert all(plan["replicas"][e] >= 1 for e in range(NUM_LOGICAL))
+ assert sum(plan["replicas"]) == nphys
+ assert len(plan["phys2log"]) == nphys
+ assert all(len(plan["log2phys"][e]) == plan["replicas"][e] for e in range(NUM_LOGICAL))
+ # rank-major numbering => contiguous block per rank => rank_of_phys is non-decreasing
+ assert plan["rank_of_phys"] == sorted(plan["rank_of_phys"])
+ assert plan["imbalance_after"] < plan["imbalance_before"], "EPLB must reduce imbalance"
+ assert plan["imbalance_after"] < 1.30, f"EPLB should get within ~30% of perfect, got {plan['imbalance_after']:.2f}"
+ # remap (if torch present): distinctness + balanced receive on a sampled zipf trace.
+ try:
+ import torch
+ # Fixed seed so the sampled trace is the same on every run.
+ g = torch.Generator().manual_seed(0)
+ p = torch.tensor(load)
+ # 4096 tokens, each drawing 8 distinct logical experts from the normalized load.
+ p = (p / p.sum()).expand(4096, NUM_LOGICAL)
+ idx_l = torch.multinomial(p, 8, replacement=False, generator=g).to(torch.int64)
+ idx_p = remap_idx(idx_l, plan)
+ assert idx_p.shape == idx_l.shape
+ # top-k physical ids distinct per token
+ assert all(len(set(row.tolist())) == 8 for row in idx_p), "physical top-k must stay distinct"
+ spp = plan["slots_per_rank"]
+ recv_before = [0] * EP
+ recv_after = [0] * EP
+ per_log = NUM_LOGICAL // EP
+ # Count token-copies per rank: logical ids under contiguous placement (before)
+ # versus physical ids under rank-major placement (after).
+ for row_l, row_p in zip(idx_l.tolist(), idx_p.tolist()):
+ for e in row_l:
+ recv_before[e // per_log] += 1
+ for pid in row_p:
+ recv_after[pid // spp] += 1
+ # Imbalance = busiest rank / mean rank, the same measure build_plan reports.
+ ib = max(recv_before) / (sum(recv_before) / EP)
+ ia = max(recv_after) / (sum(recv_after) / EP)
+ print(f"sampled-trace receive imbalance BEFORE={ib:.2f}x AFTER={ia:.2f}x")
+ assert ia < ib and ia < 1.35, "remap must balance per-rank receive load"
+ print("remap self-test: OK")
+ except ImportError:
+ print("(torch absent — skipped remap self-test; planner gates passed)")
+ print("EPLB self-test: PASS")
+ sys.exit(0)
diff --git a/experimental/CollectiveX/tests/make_workloads.py b/experimental/CollectiveX/tests/make_workloads.py
new file mode 100644
index 0000000000..862c3d0375
--- /dev/null
+++ b/experimental/CollectiveX/tests/make_workloads.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+ """Generate canonical serialized workloads. Runs the stdlib counter generator for
+ each (routing, global_tokens) in a ladder and writes .npz + .manifest.json into a
+ directory that benchmark runs then consume via `run_ep.py --workload-dir`. One trace is emitted
+ per global-token count because the global token count is part of the workload identity.
+
+ python3 tests/make_workloads.py --out-dir /path/to/cx_workloads \\
+ --routing uniform --ep 8 --hidden 7168 --topk 8 --experts 256 --seed 67 \\
+ --tokens-ladder "1 2 4 8 16 32 64 128 256 512"
+
+Or by the named v1 workload in configs/workloads.yaml. Explicit dimension flags still override it:
+
+ python3 tests/make_workloads.py --out-dir /path/to/cx_workloads --workload deepseek-v3-v1 --routing uniform --ep 8
+
+--id-only prints the content-bound workload_id per ladder point without torch/numpy:
+
+ python3 tests/make_workloads.py --workload deepseek-v3-v1 --ep 8 --id-only
+
+Generate every routing the suites need by running once per --routing. Idempotent (same id => same
+file). The dir is the cross-hardware artifact: copy it to each cluster so all consume identical bytes.
+"""
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+import workload as wl # noqa: E402
+
+# Repo root holds configs/ (this file is in tests/). Used only for --workload name resolution.
+_REPO = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+
+ def resolve_manifest(name):
+     """Look a workload name up in configs/workloads.yaml.
+
+     Searches the ``synthetic`` section, then ``model_derived``. The expert count is
+     the entry's ``experts`` value, falling back to ``routed_experts``.
+
+     Returns:
+         ``(hidden, topk, experts)`` as ints.
+
+     Raises:
+         SystemExit: if the name is not found (the message lists the known names) or
+             the entry lacks hidden/topk/experts.
+     """
+     import yaml
+
+     path = os.path.join(_REPO, "configs", "workloads.yaml")
+     with open(path, encoding="utf-8") as handle:
+         # An empty YAML document loads as None; treat it as "no workloads defined"
+         # so the lookup ends in the unknown-name SystemExit instead of AttributeError.
+         cfg = yaml.safe_load(handle) or {}
+     known = []
+     for section in ("synthetic", "model_derived"):
+         sec = cfg.get(section) or {}
+         known += list(sec)
+         m = sec.get(name)
+         if m is None:
+             continue
+         experts = m.get("experts", m.get("routed_experts"))
+         if m.get("hidden") is None or m.get("topk") is None or experts is None:
+             raise SystemExit(f"workload '{name}' is missing hidden/topk/experts in {path}")
+         return int(m["hidden"]), int(m["topk"]), int(experts)
+     raise SystemExit(f"unknown --workload '{name}'; known: {sorted(known)}")
+
+
+ def main() -> int:
+     """Parse the CLI, then print workload ids (--id-only) or write workload files.
+
+     Returns 0 on success. Invalid arguments end the process through
+     ``ArgumentParser.error`` (usage message, exit status 2); this now includes a
+     non-integer ``--tokens-ladder`` entry, which used to escape as a raw ValueError.
+     """
+     ap = argparse.ArgumentParser(description="Generate canonical CollectiveX workloads")
+     ap.add_argument("--out-dir", help="required unless --id-only")
+     ap.add_argument("--workload", help="named manifest in configs/workloads.yaml (sets hidden/topk/experts)")
+     ap.add_argument("--routing", default="uniform", choices=["uniform", "zipf"])
+     ap.add_argument("--ep", type=int, required=True, help="ep_size (global_tokens = T * ep)")
+     ap.add_argument("--hidden", type=int, help="override (default 7168, or the --workload's hidden)")
+     ap.add_argument("--topk", type=int, help="override (default 8, or the --workload's topk)")
+     ap.add_argument("--experts", type=int, help="override (default 256, or the --workload's experts)")
+     ap.add_argument("--seed", type=int, default=67)
+     ap.add_argument("--tokens-ladder", default="1 2 4 8 16 32 64 128 256 512")
+     ap.add_argument("--id-only", action="store_true",
+                     help="print content-bound workload_id per point without torch/numpy")
+     a = ap.parse_args()
+
+     # Resolve dims: a named --workload supplies defaults; explicit --hidden/--topk/--experts
+     # override per field. With neither, fall back to the v1 DeepSeek dimensions (7168/8/256).
+     base_h, base_t, base_e = (7168, 8, 256)
+     if a.workload:
+         base_h, base_t, base_e = resolve_manifest(a.workload)
+     hidden = a.hidden if a.hidden is not None else base_h
+     topk = a.topk if a.topk is not None else base_t
+     experts = a.experts if a.experts is not None else base_e
+
+     if not a.id_only and not a.out_dir:
+         ap.error("--out-dir is required unless --id-only")
+
+     # The ladder accepts spaces and/or commas as separators.
+     try:
+         raw_ladder = [int(token) for token in a.tokens_ladder.replace(",", " ").split()]
+     except ValueError:
+         ap.error("--tokens-ladder must contain only integers")
+     if (a.ep <= 0 or min(hidden, topk, experts) <= 0 or topk > experts or experts % a.ep
+             or not raw_ladder or any(token <= 0 for token in raw_ladder)
+             or len(raw_ladder) != len(set(raw_ladder))):
+         ap.error("shape, EP, and token ladder must be positive, divisible, and unique")
+     ladder = sorted(raw_ladder)
+     epr = experts // a.ep
+     label = f"workload={a.workload} " if a.workload else ""
+
+     if a.id_only:
+         # The stdlib counter generator derives the same content-bound ID on every runtime.
+         made = []
+         for T in ladder:
+             gt = T * a.ep
+             wid = wl.compute_workload_id(a.routing, hidden, topk, experts, a.ep, gt, a.seed)
+             made.append((T, gt, wid))
+             print(f" T={T:<5} gt={gt:<6} routing={a.routing} -> {wid}")
+         print(f"{label}id-only: {len(made)} workload_id(s) "
+               f"(hidden={hidden} topk={topk} experts={experts} ep={a.ep} routing={a.routing} seed={a.seed})")
+         return 0
+
+     os.makedirs(a.out_dir, exist_ok=True)
+     made = []
+     for T in ladder:
+         # One trace per ladder point: the global token count is T tokens on each of ep ranks.
+         gt = T * a.ep
+         idx, w, man = wl.build_workload(hidden, topk, experts, a.routing, gt, a.seed, epr)
+         wid = wl.save_workload(a.out_dir, idx, w, man)
+         made.append((T, gt, wid))
+         print(f" T={T:<5} gt={gt:<6} routing={a.routing} -> {wid} "
+               f"(trace sha {man['checksums']['trace'][:12]})")
+     print(f"{label}wrote {len(made)} canonical workloads to {a.out_dir} (routing={a.routing}, ep={a.ep})")
+     return 0
+
+
+ # Script entry point: run main() and exit the process with its return value.
+ if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/routing.py b/experimental/CollectiveX/tests/routing.py
new file mode 100644
index 0000000000..6065a06e43
--- /dev/null
+++ b/experimental/CollectiveX/tests/routing.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python3
+"""CollectiveX — deterministic, platform-independent MoE routing trace.
+
+Fair-comparison fix #1: routing (per-token expert IDs + gate weights) is generated
+ONCE from a fixed seed over the *global* token batch, indexed by global token id, and
+is identical on every SKU for the same (seed, routing, global_tokens, experts, top-k).
+Each rank materializes its slice `[rank*T,(rank+1)*T)`. Activations
+are per-rank (same rank ⇒ same x on any platform), so a given global token id has
+identical activation everywhere without materializing a global activation tensor.
+
+The v1 suite keeps two routing distributions:
+
+ * uniform — top-k distinct experts drawn uniformly per token. The DEFAULT.
+ Expected fan-out for top-k=8, 256 experts, EP8 (32 experts/rank) ≈
+ 8·(1 − C(224,8)/C(256,8)) ≈ 5.3 ranks/token. Load ~ Poisson.
+ * zipf — expert popularity proportional to 1/rank, producing expert/rank load skew.
+
+Always publish the realized fan-out so the workload is never misread again
+(`routing_stats`).
+"""
+from __future__ import annotations
+
+import hashlib
+
+import torch
+
# Label for the activation generator.
# NOTE(review): presumably recorded alongside results so a generator change is
# detectable; the consumer is not visible in this module — confirm.
ACTIVATION_GENERATOR = "collectivex-activation-counter-v3"
# Radix of each source-ID digit stored in the activation prefix. Three digits are
# written and decoded, so the prefix distinguishes SOURCE_ID_BASE**3 token IDs.
SOURCE_ID_BASE = 128
# Leading hidden columns reserved for the prefix: three ID digits plus one checksum.
SOURCE_ID_COLUMNS = 4
+
+
def build_global_routing(
    global_tokens: int, experts: int, topk: int, routing: str, seed: int
):
    """Build the global routing trace as a pair of tensors.

    The rows are produced by ``workload.canonical_routing_rows``; the numeric
    arguments are coerced with ``int()`` first and ``routing`` is passed through
    unchanged.

    Returns:
        ``(indices, weights)`` where ``indices`` is an int64 tensor and
        ``weights`` is a float32 tensor, both built with ``torch.tensor`` and no
        explicit device.
    """
    # Imported lazily, exactly as the module does for its other project imports.
    import workload

    expert_rows, gate_rows = workload.canonical_routing_rows(
        int(global_tokens), int(experts), int(topk), routing, int(seed)
    )
    index_tensor = torch.tensor(expert_rows, dtype=torch.int64)
    weight_tensor = torch.tensor(gate_rows, dtype=torch.float32)
    return index_tensor, weight_tensor
+
+
def rank_slice(idx, weights, rank: int, tokens_per_rank: int):
    """Return one rank's rows of the global routing tensors.

    Rows ``[rank * tokens_per_rank, (rank + 1) * tokens_per_rank)`` are taken from
    both ``idx`` and ``weights`` and returned as contiguous tensors.
    """
    first_row = rank * tokens_per_rank
    window = slice(first_row, first_row + tokens_per_rank)
    local_idx = idx[window].contiguous()
    local_weights = weights[window].contiguous()
    return local_idx, local_weights
+
+
def rank_activations(tokens: int, hidden: int, seed: int, rank: int, device,
                     dtype=torch.bfloat16):
    """Materialize this rank's activations from its global source-token IDs.

    The rank owns the IDs ``rank * tokens .. rank * tokens + tokens - 1``; they are
    created on ``device`` and handed to ``activations_for_source_ids``.
    """
    id_offset = rank * tokens
    local_positions = torch.arange(tokens, device=device, dtype=torch.int64)
    return activations_for_source_ids(local_positions + id_offset, hidden, seed, dtype)
+
+
def activations_for_source_ids(source, hidden: int, seed: int, dtype=torch.bfloat16):
    """Build activations for the given global source-token IDs.

    Each element is an integer counter in ``[-128, 128]`` derived from
    ``(id, column, seed)`` and scaled by ``1/64``. The first four columns are then
    overwritten with three base-``SOURCE_ID_BASE`` digits of the ID and a checksum
    digit, which is what ``decode_source_ids`` reads back.

    Args:
        source: integer tensor of IDs; indexed as ``source[:, None]``, so it is
            used as a 1-D tensor.
        hidden: number of columns; must be at least ``SOURCE_ID_COLUMNS``.
        seed: integer mixed into both the counter values and the checksum.
        dtype: dtype of the returned tensor.

    Raises:
        ValueError: if ``hidden`` is smaller than ``SOURCE_ID_COLUMNS``.
    """
    if hidden < SOURCE_ID_COLUMNS:
        raise ValueError(f"hidden must be at least {SOURCE_ID_COLUMNS}")
    ids = source.to(torch.int64)
    seed_value = int(seed)
    columns = torch.arange(hidden, device=ids.device, dtype=torch.int64)
    counters = ids[:, None] * 131 + columns[None, :] * 17 + seed_value * 19
    output = (counters % 257 - 128).to(dtype).mul_(1 / 64)
    # Prefix layout: low digit, middle digit, high digit, checksum.
    prefix_digits = (
        ids % SOURCE_ID_BASE,
        (ids // SOURCE_ID_BASE) % SOURCE_ID_BASE,
        (ids // (SOURCE_ID_BASE**2)) % SOURCE_ID_BASE,
        (ids * 29 + seed_value * 7) % SOURCE_ID_BASE,
    )
    for position, digit in enumerate(prefix_digits):
        output[:, position] = digit
    return output
+
+
def decode_source_ids(payload, seed: int):
    """Recover and verify the source-token IDs stored in a payload's prefix.

    Args:
        payload: 2-D tensor whose first ``SOURCE_ID_COLUMNS`` columns hold three
            base-``SOURCE_ID_BASE`` digits followed by a checksum digit.
        seed: the seed the checksum was computed with.

    Returns:
        int64 tensor with one decoded ID per payload row.

    Raises:
        ValueError: if the payload is not 2-D or too narrow, if any prefix value
            is not an exact integer, if a digit is outside
            ``[0, SOURCE_ID_BASE)``, or if the checksum column disagrees.
    """
    too_narrow = payload.ndim != 2 or payload.shape[1] < SOURCE_ID_COLUMNS
    if too_narrow:
        raise ValueError("received payload cannot carry the source-token prefix")
    raw = payload[:, :SOURCE_ID_COLUMNS].float()
    digits = raw.round().to(torch.int64)
    # Any fractional (or non-finite) prefix value fails this round-trip comparison.
    if not torch.equal(raw, digits.float()):
        raise ValueError("received source-token prefix is not exact")
    outside = (digits < 0) | (digits >= SOURCE_ID_BASE)
    if bool(outside.any().item()):
        raise ValueError("received source-token prefix is out of range")
    # Horner form of d0 + B*d1 + B**2*d2.
    source = digits[:, 0] + SOURCE_ID_BASE * (
        digits[:, 1] + SOURCE_ID_BASE * digits[:, 2]
    )
    expected_checksum = (source * 29 + int(seed) * 7) % SOURCE_ID_BASE
    if not torch.equal(expected_checksum, digits[:, 3]):
        raise ValueError("received source-token checksum differs")
    return source
+
+
def routing_locality(idx, experts_per_rank: int, ep_size: int, tokens_per_rank: int,
                     gpus_per_node: int, scale_up_domain: int | None = None) -> dict:
    """Locality of rank-deduplicated payload copies under packed placement.

    A "copy" is one (token, destination rank) pair: a token routed to several
    experts on the same rank counts once for that rank.

    Args:
        idx: integer tensor of expert IDs with one row per global token.
            NOTE(review): the destination mask is allocated without a device, so
            ``idx`` is assumed to live on the default (CPU) device — confirm.
        experts_per_rank: experts hosted by each EP rank; the destination rank is
            ``expert // experts_per_rank``, clamped to ``ep_size - 1``.
        ep_size: number of EP ranks.
        tokens_per_rank: tokens owned by each source rank; the source rank is
            ``token // tokens_per_rank`` (a value below 1 is treated as 1),
            clamped to ``ep_size - 1``.
        gpus_per_node: ranks per node, used to bucket ranks into nodes.
        scale_up_domain: ranks per scale-up domain. ``None`` or ``0`` selects
            ``gpus_per_node * ep_size``, which puts every rank in one domain.

    Returns:
        Dict with the placement label, the five locality fractions, the
        ``gpus_per_node`` and ``scale_up_domain`` actually used, and the number
        of copies counted.
    """
    token_count = idx.shape[0]
    dest_rank = (idx // experts_per_rank).clamp(max=ep_size - 1)
    reached = torch.zeros((token_count, ep_size), dtype=torch.bool)
    reached.scatter_(1, dest_rank, True)
    token, dest = reached.nonzero(as_tuple=True)
    src = (token // max(1, tokens_per_rank)).clamp(max=ep_size - 1)
    domain = scale_up_domain or (gpus_per_node * ep_size)  # default: all one domain
    # Packed placement: the physical rank equals the EP rank, so node and domain
    # buckets are computed directly from the logical rank numbers.
    local = dest == src
    same_node = (dest // gpus_per_node) == (src // gpus_per_node)
    same_domain = (dest // domain) == (src // domain)
    return {
        "placement": "packed",
        "local_rank_fraction": float(local.float().mean()),
        "same_node_fraction": float(same_node.float().mean()),
        "same_scaleup_domain_fraction": float(same_domain.float().mean()),
        "cross_node_fraction": float((~same_node).float().mean()),
        "cross_domain_fraction": float((~same_domain).float().mean()),
        "gpus_per_node": gpus_per_node, "scale_up_domain": domain,
        "copies": int(dest.numel()),
    }
+
+
def routing_stats(idx, experts: int, experts_per_rank: int, weights=None) -> dict:
    """Summarize the realized properties of a global routing trace.

    Args:
        idx: global ``[gt, topk]`` integer tensor of expert IDs.
        experts: number of experts; also the minimum length of the per-expert
            load histogram.
        experts_per_rank: experts hosted per rank; the destination rank of an
            assignment is ``expert // experts_per_rank``.
        weights: optional ``[gt, topk]`` gate weights, hashed together with
            ``idx`` when given.

    Returns:
        Dict of fan-out statistics, per-rank assignment and payload loads,
        per-expert load statistics, imbalance summaries, empty-expert/rank counts,
        and SHA-256 hashes of the routing (``weights_hash`` is ``None`` and
        ``routing_hash`` equals ``idx_hash`` when ``weights`` is omitted).
    """

    def _coefficient_of_variation(values):
        # Population std over mean; an all-zero load reports 0.0 instead of dividing.
        mean = float(values.mean())
        if mean > 0:
            return float(values.std(unbiased=False) / mean)
        return 0.0

    rank_count = max(1, experts // max(1, experts_per_rank))
    # [gt, topk] destination rank per assignment, clamped into the valid rank range.
    clamped_rank = (idx // experts_per_rank).clamp(max=rank_count - 1)

    # reached[t, r] is True when token t sends at least one assignment to rank r.
    reached = torch.zeros(idx.shape[0], rank_count, dtype=torch.bool)
    reached.scatter_(1, clamped_rank, True)
    fanout = reached.sum(dim=1)  # [gt] distinct destination ranks per token
    fanout_hist = torch.bincount(fanout, minlength=rank_count + 1)
    fanout_hist = fanout_hist[1:rank_count + 1].tolist()  # counts for fan-out 1..ep

    # Expert assignments (compute load) and rank-deduplicated payload copies
    # (network load) are kept apart: they differ when two experts share a rank.
    expert_load = torch.bincount(idx.reshape(-1), minlength=experts).float()
    assignment_load = torch.bincount(
        clamped_rank.reshape(-1), minlength=rank_count
    ).float()
    payload_load = reached.sum(dim=0).float()

    if float(expert_load.mean()) > 0:
        hotspot_ratio = float(expert_load.max() / expert_load.mean())
    else:
        hotspot_ratio = 0.0

    # Workload identity: hash the int32 indices and, when present, float32 weights.
    idx_bytes = idx.to(torch.int32).cpu().numpy().tobytes()
    idx_hash = hashlib.sha256(idx_bytes).hexdigest()
    if weights is None:
        weights_hash = None
        routing_hash = idx_hash
    else:
        weight_bytes = weights.to(torch.float32).cpu().numpy().tobytes()
        weights_hash = hashlib.sha256(weight_bytes).hexdigest()
        combined = hashlib.sha256(idx_bytes)
        combined.update(weight_bytes)
        routing_hash = combined.hexdigest()

    return {
        "fanout_mean": float(fanout.float().mean()),
        "fanout_min": int(fanout.min()),
        "fanout_max": int(fanout.max()),
        "fanout_hist": fanout_hist,  # index k-1 = #tokens with fan-out k
        "expert_assignments_per_rank": [int(x) for x in assignment_load.tolist()],
        "payload_copies_per_rank": [int(x) for x in payload_load.tolist()],
        "routed_copies": int(fanout.sum()),  # total (token, dest-rank) pairs
        "expert_load_min": int(expert_load.min()),
        "expert_load_max": int(expert_load.max()),
        "expert_load_mean": float(expert_load.mean()),
        "expert_load_cv": _coefficient_of_variation(expert_load),
        "expert_assignment_rank_cv": _coefficient_of_variation(assignment_load),
        "payload_rank_cv": _coefficient_of_variation(payload_load),
        "hotspot_ratio": hotspot_ratio,
        "empty_expert_count": int((expert_load == 0).sum()),
        "empty_rank_count": int((payload_load == 0).sum()),
        "routing_hash": routing_hash,
        "idx_hash": idx_hash,
        "weights_hash": weights_hash,
    }
+
+
+# --------------------------------------------------------------------------- self-test
if __name__ == "__main__":
    import sys
    # Fixed self-test shape: 256 experts, top-8, 32 experts per rank, 4096 global tokens.
    E, TOPK, EPR, GT = 256, 8, 32, 4096
    ui, _ = build_global_routing(GT, E, TOPK, "uniform", 67)
    zi, _ = build_global_routing(GT, E, TOPK, "zipf", 67)
    # Each of the first 16 uniform rows must name TOPK distinct experts.
    assert all(len(set(row.tolist())) == TOPK for row in ui[:16])
    uniform, zipf = routing_stats(ui, E, EPR), routing_stats(zi, E, EPR)
    # The zipf trace must show a worse hotspot than the uniform trace.
    assert uniform["hotspot_ratio"] < zipf["hotspot_ratio"]
    dev = torch.device("cpu")
    # Two identical calls must yield bit-identical, finite activations.
    first = rank_activations(8, 256, 67, 0, dev, dtype=torch.float32)
    second = rank_activations(8, 256, 67, 0, dev, dtype=torch.float32)
    assert torch.equal(first, second) and torch.isfinite(first).all()
    print("routing self-test: PASS")
    sys.exit(0)
diff --git a/experimental/CollectiveX/tests/run_ep.py b/experimental/CollectiveX/tests/run_ep.py
new file mode 100644
index 0000000000..45d7dd60c9
--- /dev/null
+++ b/experimental/CollectiveX/tests/run_ep.py
@@ -0,0 +1,427 @@
+#!/usr/bin/env python3
+"""CollectiveX v1 EP benchmark entrypoint for torchrun or rank environments."""
+
+from __future__ import annotations
+
+import argparse
+import ctypes
+import json
+import os
+import platform
+import re
+import shlex
+import socket
+import subprocess
+import sys
+
+# Make the sibling tests/ modules importable when run as `tests/run_ep.py` under
+# torchrun (it executes the file as __main__, not as a package).
+HERE = os.path.dirname(os.path.abspath(__file__))
+sys.path[:0] = [HERE, os.path.dirname(HERE)]
+
+import ep_harness # noqa: E402 (stdlib-only; safe before torch)
+import identity # noqa: E402
+
+
+def _numeric_version(command: list[str]) -> str | None:
+ try:
+ result = subprocess.run(
+ command, capture_output=True, check=False, text=True, timeout=10
+ )
+ except (OSError, subprocess.TimeoutExpired):
+ return None
+ if result.returncode != 0:
+ return None
+ match = re.search(r"\b[0-9]+(?:\.[0-9]+){1,3}\b", result.stdout)
+ return match.group(0) if match else None
+
+
def _loaded_collective_version() -> str | None:
    """Report the version of the NCCL/RCCL library mapped into this process.

    ``/proc/self/maps`` is scanned for lines mentioning ``libnccl.so`` or
    ``librccl.so`` whose last field is an existing file. The version is queried
    through ``ncclGetVersion`` only when those lines resolve to exactly one real
    path.

    Returns:
        The value formatted by ``ep_harness.format_collective_version``, or
        ``None`` when zero or several libraries are mapped, the query returns a
        non-zero status, the symbol is missing, or an OS error occurs.
    """
    library_names = ("libnccl.so", "librccl.so")
    try:
        resolved = set()
        with open("/proc/self/maps", encoding="utf-8") as handle:
            for line in handle:
                if not any(name in line for name in library_names):
                    continue
                candidate = line.rstrip().split()[-1]
                if os.path.isfile(candidate):
                    resolved.add(os.path.realpath(candidate))
        if len(resolved) != 1:
            return None
        version = ctypes.c_int()
        library = ctypes.CDLL(resolved.pop())
        status = library.ncclGetVersion(ctypes.byref(version))
        if status != 0:
            return None
        return ep_harness.format_collective_version(version.value)
    except (AttributeError, OSError):
        return None
+
+
def _runtime_fingerprint(
    torch, device, *, machine: str, vendor: str, arch: str
) -> dict:
    """Collect runtime facts for one rank without hosts, addresses, UUIDs, or paths.

    Args:
        torch: the imported ``torch`` module (passed in because it is imported
            lazily by the caller).
        device: the CUDA/HIP device whose properties are reported.
        machine: CPU architecture label, stored as given.
        vendor: ``"nvidia"`` selects the CUDA/NCCL/``nvidia-smi`` branch; any
            other value selects the HIP/RCCL/``rocm-smi`` branch.
        arch: accelerator architecture label, stored as given.

    Returns:
        Nested dict of accelerator runtime, collective library, device facts,
        driver version, framework version, machine, Python version, and vendor.
        The driver and collective versions may be ``None`` when undetectable.
    """
    properties = torch.cuda.get_device_properties(device)
    if vendor == "nvidia":
        driver = _numeric_version(
            ["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"]
        )
        accelerator_runtime = {"kind": "cuda", "version": torch.version.cuda}
        collective_kind = "nccl"
    else:
        driver = _numeric_version(["rocm-smi", "--showdriverversion"])
        accelerator_runtime = {"kind": "hip", "version": torch.version.hip}
        collective_kind = "rccl"
    collective_library = {
        "kind": collective_kind,
        "version": _loaded_collective_version(),
    }
    device_facts = {
        "arch": arch,
        "compute_units": int(properties.multi_processor_count),
        "memory_bytes": int(properties.total_memory),
        "product": torch.cuda.get_device_name(device),
        "warp_size": int(properties.warp_size),
    }
    framework = {"kind": "torch", "version": str(torch.__version__)}
    return {
        "accelerator_runtime": accelerator_runtime,
        "collective_library": collective_library,
        "device": device_facts,
        "driver_version": driver,
        "framework": framework,
        "machine": machine,
        "python_version": platform.python_version(),
        "vendor": vendor,
    }
+
+
+def _summarize_realized_placement(
+ records: list[tuple[str, int]],
+ *,
+ expected_nodes: int,
+ expected_gpus_per_node: int,
+ expected_world_size: int,
+) -> dict:
+ """Validate private host/rank records and return only publication-safe aggregates."""
+ if expected_nodes < 1 or expected_gpus_per_node < 1:
+ raise ValueError("requested placement dimensions must be positive")
+ if expected_nodes * expected_gpus_per_node != expected_world_size:
+ raise ValueError("requested nodes x GPUs per node differs from world size")
+ if len(records) != expected_world_size:
+ raise ValueError("realized rank count differs from world size")
+
+ by_host: dict[str, list[int]] = {}
+ for host, local_rank in records:
+ if not isinstance(host, str) or not host or type(local_rank) is not int:
+ raise ValueError("realized placement record has invalid types")
+ by_host.setdefault(host, []).append(local_rank)
+
+ counts = sorted(len(local_ranks) for local_ranks in by_host.values())
+ complete_local_ranks = all(
+ sorted(local_ranks) == list(range(expected_gpus_per_node))
+ for local_ranks in by_host.values()
+ )
+ unique_pairs = len(set(records)) == len(records)
+ if len(by_host) != expected_nodes:
+ raise ValueError(
+ f"realized node count {len(by_host)} differs from requested {expected_nodes}"
+ )
+ if counts != [expected_gpus_per_node] * expected_nodes:
+ raise ValueError("realized ranks per node differ from requested GPUs per node")
+ if not complete_local_ranks or not unique_pairs:
+ raise ValueError("realized local ranks are incomplete or duplicated")
+ return {
+ "gpus_per_node": expected_gpus_per_node,
+ "nodes": expected_nodes,
+ "ranks_per_node": expected_gpus_per_node,
+ "unique_local_ranks": True,
+ "valid": True,
+ }
+
+
+def _common_runtime_fingerprint(records: list[dict]) -> dict:
+ """Return the shared sanitized fingerprint, rejecting heterogeneous ranks."""
+ if not records:
+ raise ValueError("runtime fingerprint evidence is empty")
+ canonical = {
+ json.dumps(record, allow_nan=False, sort_keys=True, separators=(",", ":"))
+ for record in records
+ }
+ if len(canonical) != 1:
+ raise ValueError("runtime fingerprint differs across distributed ranks")
+ return records[0]
+
+
def main() -> int:
    """Parse arguments, validate the runtime against the runner, and run one sweep.

    Returns:
        2 when the argument combination is rejected (low-latency backend/phase,
        case ID format, scheduled seed, sampling contract), 3 when torch cannot
        be imported, 5 when the runner is unknown or the observed identity,
        GPU count, topology, or capability does not match it; otherwise the
        value returned by ``backend.finalize(rc)``.

    Raises:
        ValueError: if ``SLURM_NNODES`` is not an integer, or if the gathered
            placement or runtime-fingerprint records fail validation.
        Exception: any error from backend construction or the sweep is printed
            to stdout with its traceback and then re-raised.
    """
    ap = argparse.ArgumentParser(description="CollectiveX EP dispatch/combine sweep")
    ap.add_argument(
        "--backend",
        required=True,
        choices=[
            "deepep",
            "deepep-v2",
            "deepep-hybrid",
            "mori",
            "uccl",
            "nccl-ep",
        ],
    )
    ep_harness.add_common_args(ap)
    args = ap.parse_args()

    # Cheap argument-contract checks run before torch is imported.
    if args.mode == ep_harness.LOW_LATENCY_MODE:
        if args.backend not in {"deepep", "uccl"}:
            print(
                "ERROR: low-latency mode is supported only by deepep and uccl",
                file=sys.stderr,
            )
            return 2
        if args.phase != "decode":
            print("ERROR: low-latency mode requires --phase decode", file=sys.stderr)
            return 2
    if args.case_id and not identity.is_typed_id(args.case_id, "case"):
        print(f"ERROR: invalid native case ID {args.case_id!r}", file=sys.stderr)
        return 2
    if args.case_id and args.seed != ep_harness.ROUTING_SEED:
        print(
            f"ERROR: scheduled v1 cases require seed={ep_harness.ROUTING_SEED}; got {args.seed}",
            file=sys.stderr,
        )
        return 2

    sampling_error = ep_harness.sampling_contract_error(
        args.iters, args.trials, args.warmup
    )
    if sampling_error:
        print(f"ERROR: {sampling_error}", file=sys.stderr)
        return 2

    try:
        import torch
        import torch.distributed as dist
    except Exception as exc:  # pragma: no cover
        print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr)
        return 3

    # Rank coordinates come from the launcher environment; the defaults describe
    # a single process.
    rank = int(os.environ.get("RANK", "0"))
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    torch.cuda.set_device(local_rank)
    device = torch.device(f"cuda:{local_rank}")
    # Rendezvous defaults are filled in only when the launcher did not set them.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "12355")

    import capability

    sku = capability.PLATFORMS.get(args.runner)
    if sku is None:
        print(f"ERROR: unknown runner identity {args.runner!r}", file=sys.stderr)
        return 5
    # Normalize the CPU architecture name; unknown names pass through unchanged.
    machine = {"x86_64": "amd64", "aarch64": "arm64"}.get(
        platform.machine(), platform.machine()
    )
    props = torch.cuda.get_device_properties(device)
    if torch.version.hip:
        vendor = "amd"
        # Keep only the part of gcnArchName before the first ':'.
        accelerator = str(getattr(props, "gcnArchName", "")).split(":", 1)[0]
    else:
        vendor = "nvidia"
        major, minor = torch.cuda.get_device_capability(device)
        accelerator = f"sm{major}{minor}"
    device_name = torch.cuda.get_device_name(device)
    device_count = torch.cuda.device_count()
    identity_issues = capability.runtime_identity_issues(
        args.runner,
        vendor=vendor,
        arch=accelerator,
        machine=machine,
        device_name=device_name,
        device_count=device_count,
        world_size=world_size,
    )
    if identity_issues:
        print(
            f"ERROR: runtime identity does not match {args.runner}: "
            + "; ".join(identity_issues),
            file=sys.stderr,
        )
        return 5
    # A falsy --gpus-per-node falls back to the number of visible devices.
    observed_gpus_per_node = args.gpus_per_node or device_count
    if observed_gpus_per_node != sku["gpus_per_node"]:
        print(
            f"ERROR: {args.runner} requires {sku['gpus_per_node']} GPUs per node",
            file=sys.stderr,
        )
        return 5
    if world_size % observed_gpus_per_node:
        print("ERROR: distributed world is not divisible by GPUs per node", file=sys.stderr)
        return 5
    observed_nodes = world_size // observed_gpus_per_node
    topology = capability.topology_for(args.runner, world_size)
    observed_topology = {
        "nodes": observed_nodes,
        "gpus_per_node": observed_gpus_per_node,
        "scale_up_domain": args.scale_up_domain or observed_gpus_per_node,
        "scope": args.scope,
        "scale_up_transport": args.scale_up_transport,
        "scale_out_transport": args.scale_out_transport or None,
        "transport": args.transport,
        "topology_class": args.topology_class,
    }
    # Every observed field must equal the registry topology for this runner and EP size.
    if topology is None or any(
        observed_topology[field] != topology[field] for field in observed_topology
    ):
        print(
            f"ERROR: runtime topology does not match {args.runner} EP{world_size}",
            file=sys.stderr,
        )
        return 5
    schedulable, reason = capability.resolve(
        args.runner,
        args.backend,
        ep=world_size,
        nodes=observed_nodes,
        routing=args.routing,
        eplb=args.eplb,
        mode=args.mode,
    )
    if not schedulable:
        print(f"ERROR: scheduled case is unsupported: {reason}", file=sys.stderr)
        return 5
    args.runtime_device_product = device_name
    args.runtime_device_count = device_count
    args.allocation_execution_id = os.environ.get("COLLECTIVEX_EXECUTION_ID")

    # EPLB bumps the expert count to PHYSICAL (logical + redundant) BEFORE backend construction
    # so the backend sizes its buffers for the replicated set; ep_harness builds the LOGICAL
    # routing trace and remaps it to the balanced physical placement (a pure routing transform,
    # tests/eplb.py — no adapter change). Deterministic, so every rank agrees on the count.
    if getattr(args, "eplb", False):
        import eplb

        args.num_logical_experts = args.experts
        args.experts = eplb.physical_count(
            args.experts, ep_harness.EPLB_REDUNDANT_EXPERTS, world_size
        )

    # Reproduction provenance (recorded in the artifact). Rack launchers provide ranks directly
    # through srun, while single-node launchers use torchrun; do not claim torchrun for both.
    if os.environ.get("TORCHELASTIC_RUN_ID"):
        args.distributed_launcher = "torchrun"
        prefix = f"torchrun --nproc_per_node={world_size}"
    else:
        args.distributed_launcher = "rank-environment"
        prefix = f"RANK={rank} WORLD_SIZE={world_size} LOCAL_RANK={local_rank} python3"
    args.reproduction_command = f"{prefix} tests/run_ep.py {shlex.join(sys.argv[1:])}"
    args.image = os.environ.get("COLLECTIVEX_IMAGE", "")
    args.image_digest = os.environ.get("COLLECTIVEX_IMAGE_DIGEST", "")
    # Only the exact string "1" counts as a verified digest.
    args.image_digest_verified = (
        os.environ.get("COLLECTIVEX_IMAGE_DIGEST_VERIFIED") == "1"
    )
    # Container architecture and local squash hash for Enroot/Pyxis.
    args.image_arch = machine
    args.squash_sha256 = os.environ.get("COLLECTIVEX_SQUASH_SHA256")
    # GitHub provenance: repo, run ID, attempt, ref, source SHA, job,
    # artifact. A result is only publication-'official' when these are present (validity gate).
    _run = {
        "run_id": os.environ.get("GITHUB_RUN_ID"),
        "run_attempt": os.environ.get("GITHUB_RUN_ATTEMPT"),
        "ref": os.environ.get("GITHUB_REF_NAME") or os.environ.get("GITHUB_REF"),
        "source_sha": os.environ.get("COLLECTIVEX_SOURCE_SHA")
        or os.environ.get("GITHUB_SHA"),
        "repo": os.environ.get("GITHUB_REPOSITORY"),
        "job": os.environ.get("GITHUB_JOB"),
        "artifact": os.environ.get("COLLECTIVEX_ARTIFACT_NAME"),
    }
    # Stored only when at least one provenance field is set; otherwise None.
    args.git_run = _run if any(_run.values()) else None

    # Import the backend class only after torch initializes. The selected mode is an
    # explicit case dimension; adapters do not infer it from the token ladder.
    if args.backend == "mori":
        from ep_mori import MoRIBackend as Backend
    elif args.backend == "nccl-ep":
        from ep_nccl import NCCLBackend as Backend
    elif args.backend == "uccl":
        from ep_uccl import UCCLBackend as Backend
    elif args.backend == "deepep-hybrid":
        from ep_deepep_hybrid import DeepEPHybridBackend as Backend
    elif args.backend == "deepep-v2":
        from ep_deepep_v2 import DeepEPV2Backend as Backend
    else:
        from ep_deepep import DeepEPBackend as Backend

    # MoRI uses the gloo+NCCL group shape from its reference; other adapters use NCCL/RCCL.
    if not dist.is_initialized():
        if args.backend == "mori":
            dist.init_process_group(
                backend="cpu:gloo,cuda:nccl",
                rank=rank,
                world_size=world_size,
                device_id=device,
            )
        elif args.backend == "deepep-v2":
            # PR #605 reuses PyTorch's NCCL communicator through ``_comm_ptr``. Supplying
            # device_id eagerly forms it before ElasticBuffer construction.
            dist.init_process_group("nccl", device_id=device)
        else:
            dist.init_process_group("nccl")

    args.runtime_fingerprint = _runtime_fingerprint(
        torch, device, machine=machine, vendor=vendor, arch=accelerator
    )

    gpus_per_node = args.gpus_per_node or sku["gpus_per_node"]
    # SLURM_NNODES, when set, overrides the node count derived from the world size.
    try:
        expected_nodes = int(
            os.environ.get("SLURM_NNODES", str(world_size // gpus_per_node))
        )
    except ValueError as exc:
        raise ValueError("SLURM_NNODES must be a positive integer") from exc
    # Gather (hostname, local rank, fingerprint) from every rank. Only the
    # validated summary and the shared fingerprint are stored back on args.
    realized_records: list[tuple[str, int, dict] | None] = [None] * world_size
    dist.all_gather_object(
        realized_records,
        (socket.gethostname(), local_rank, args.runtime_fingerprint),
    )
    complete_records = [record for record in realized_records if record is not None]
    args.realized_placement = _summarize_realized_placement(
        [(record[0], record[1]) for record in complete_records],
        expected_nodes=expected_nodes,
        expected_gpus_per_node=gpus_per_node,
        expected_world_size=world_size,
    )
    args.runtime_fingerprint = _common_runtime_fingerprint(
        [record[2] for record in complete_records]
    )

    # Construct + run inside a try so a backend exception (esp. a new adapter on GPU) prints its
    # FULL traceback to STDOUT — torchrun captures per-rank stdout but only summarizes stderr, so an
    # uncaught exception is otherwise invisible in CI. Print on every rank (prefixed) then re-raise.
    try:
        backend = Backend(args, rank, world_size, local_rank, device)
        if rank == 0:
            print(
                f"[run_ep] backend={args.backend} phase={args.phase} mode={args.mode} "
                f"world={world_size} ep_size={world_size} hidden={args.hidden} "
                f"topk={args.topk} experts={args.experts} dtype=bf16 "
                f"routing={args.routing} seed={args.seed}"
            )
        rc = ep_harness.run_sweep(args, backend, torch, dist, device, rank, world_size)
    except Exception:
        import traceback

        print(
            f"[run_ep][rank{rank}] backend={args.backend} FAILED:\n"
            + traceback.format_exc(),
            flush=True,
        )
        raise
    # finalize() handles backend-specific teardown: DeepEP returns rc cleanly;
    # MoRI hard-exits past its post-shmem_finalize teardown assertion.
    return backend.finalize(rc)
+
+
# Script entry point: main()'s return code (2, 3, 5, or the backend's finalize
# result) becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
diff --git a/experimental/CollectiveX/tests/test_deepep_v2_contract.py b/experimental/CollectiveX/tests/test_deepep_v2_contract.py
new file mode 100644
index 0000000000..082cd491a6
--- /dev/null
+++ b/experimental/CollectiveX/tests/test_deepep_v2_contract.py
@@ -0,0 +1,2151 @@
+#!/usr/bin/env python3
+"""CPU-only structural and registry tests for the pinned DeepEP V2 path."""
+from __future__ import annotations
+
+import ast
+import argparse
+import copy
+import ctypes
+import hashlib
+import json
+import os
+from pathlib import Path
+from pathlib import PurePosixPath
+import shutil
+import stat
+import subprocess
+import sys
+import tempfile
+import types
+import unittest
+
+HERE = Path(__file__).resolve().parent
+ROOT = HERE.parent
+sys.path.insert(0, str(ROOT))
+
+import capability # noqa: E402
+import contracts # noqa: E402
+import ep_harness # noqa: E402
+import identity # noqa: E402
+import run_ep # noqa: E402
+
+
# Pinned DeepEP V2 identifiers. The contract tests compare these against the
# capability registry entry and the DEEPEP_V2_* constants in the adapter source.
COMMIT = "fa8a9b16898204afd347c663b89e65ef87dc6ce6"
TREE = "29809e75c5874e6609dac4804e7b651d5226959f"
FMT_COMMIT = "a4c7e17133ee9cb6a2f45545f6e974dd3c393efa"
+
+
def deepep_v2_jit_provenance() -> list[dict[str, str]]:
    """Build one synthetic JIT provenance record per DeepEP V2 kernel name.

    Kernel names are taken from ``contracts.DEEPEP_V2_JIT_KERNELS`` in sorted
    order. Each record's cache key and three hash fields are derived from the
    kernel's position so records differ from one another.
    """
    records = []
    for index, name in enumerate(sorted(contracts.DEEPEP_V2_JIT_KERNELS)):
        record = {
            "cache_key": f"kernel.{name}.{index:032x}",
            "cubin_sha256": f"{index + 1:x}" * 64,
            "sass_sha256": f"{index + 2:x}" * 64,
            "source_sha256": f"{index + 3:x}" * 64,
        }
        records.append(record)
    return records
+
+
def hybrid_realized_config() -> dict[str, object]:
    """Build a synthetic realized-config fixture for the hybrid backend.

    Every field in ``contracts.HYBRID_REALIZED_CONFIG_FIELDS`` starts at ``1``,
    every field in ``contracts.HYBRID_REALIZED_BOOL_FIELDS`` is then set to
    ``True``, and ``token_data_type`` is set to ``"UINT16"``.
    """
    config: dict[str, object] = dict.fromkeys(contracts.HYBRID_REALIZED_CONFIG_FIELDS, 1)
    config.update(dict.fromkeys(contracts.HYBRID_REALIZED_BOOL_FIELDS, True))
    config["token_data_type"] = "UINT16"
    return config
+
+
def hybrid_jit_provenance(ranks: int = 2) -> tuple[list[str], list[dict[str, object]]]:
    """Build synthetic per-rank JIT artifact records for three hybrid kernels.

    Args:
        ranks: number of rank entries generated under each kernel key.

    Returns:
        ``(keys, artifacts)``: the three kernel keys, and one artifact record per
        key whose ``rank_artifacts`` list holds a byte count, rank number, and
        digest string for each rank.
    """
    keys = ["combine-key", "dispatch-key", "preprocess-key"]
    artifacts = []
    for index, key in enumerate(keys):
        size = 10 + index
        digest = f"{index + 1:x}" * 64
        per_rank = []
        for rank in range(ranks):
            per_rank.append({"bytes": size, "rank": rank, "sha256": digest})
        artifacts.append({"kernel_key": key, "rank_artifacts": per_rank})
    return keys, artifacts
+
+
def load_uccl_function(name: str, namespace: dict[str, object]):
    """Compile one top-level function from ``ep_uccl.py`` without importing the module.

    The file next to this test module is parsed, the top-level ``def`` called
    ``name`` is compiled on its own, and it is executed inside ``namespace``, which
    must supply every global the function needs.

    Returns:
        The function object stored in ``namespace[name]``.

    Raises:
        StopIteration: if no top-level function named ``name`` exists.
    """
    source_path = HERE / "ep_uccl.py"
    top_level = ast.parse(source_path.read_text()).body
    matches = (
        node
        for node in top_level
        if isinstance(node, ast.FunctionDef) and node.name == name
    )
    definition = next(matches)
    isolated = ast.Module(body=[definition], type_ignores=[])
    exec(compile(isolated, str(source_path), "exec"), namespace)
    return namespace[name]
+
+
def operator_config(root: Path) -> dict[str, object]:
    """Build an operator-config fixture whose storage paths all point at ``root``.

    Returns:
        ``{"schema_version": 1, "runners": {...}}`` with one entry per runner.
        Entries differ in whether they carry an ``account``, network settings,
        ``storage_roots``, or ``enroot_cache_path``; each entry is a separate dict.
    """
    location = str(root)

    def network() -> dict[str, str]:
        # Fresh dict per call so runner entries never share state.
        return {"socket_ifname": "eth0", "rdma_devices": "mlx5_0:1"}

    def staged(*, account: bool) -> dict[str, object]:
        # Key order matters for callers that serialize: partition, account, dirs.
        entry: dict[str, object] = {"partition": "test"}
        if account:
            entry["account"] = "test"
        entry["squash_dir"] = location
        entry["stage_dir"] = location
        return entry

    runners = {
        "h100-dgxc": {**staged(account=True), **network()},
        "h200-dgxc": {**staged(account=False), **network()},
        "b200-dgxc": {**staged(account=True), **network()},
        "b300": {**staged(account=True), **network()},
        "gb200": {"partition": "test", "account": "test", "storage_roots": [location]},
        "gb300": {**staged(account=True), "enroot_cache_path": location},
        "mi325x": {**staged(account=False), **network()},
        "mi355x": {**staged(account=False), **network()},
    }
    return {"schema_version": 1, "runners": runners}
+
+
+class DeepEPV2ContractTests(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls) -> None:
+ cls.path = HERE / "ep_deepep_v2.py"
+ cls.tree = ast.parse(cls.path.read_text(), str(cls.path))
+
+ def test_capability_is_explicit_for_every_sku(self) -> None:
+ backend = capability.BACKENDS["deepep-v2"]
+ self.assertEqual(
+ (backend["implementation"], backend["commit"], backend["torch"], backend["nccl"]),
+ ("deep_ep.ElasticBuffer", COMMIT, "2.10.0+cu130", "2.30.4"),
+ )
+ self.assertEqual(backend["source"], "deepseek-ai/DeepEP#605+#630")
+ self.assertEqual(backend["communication_backend"], "nccl-device-lsa")
+ self.assertEqual(set(backend["sku_capabilities"]), set(capability.PLATFORMS))
+ for sku, platform in capability.PLATFORMS.items():
+ ok, _ = capability.resolve(sku, "deepep-v2")
+ self.assertEqual(ok, platform["vendor"] == "nvidia" and sku != "h100-dgxc")
+ self.assertEqual(
+ set(backend["sku_capabilities"][sku]), {"basis", "schedulable"}
+ )
+ self.assertEqual(
+ backend["sku_capabilities"]["h100-dgxc"],
+ {
+ "schedulable": False,
+ "basis": "current-runner-nccl-device-api-symmetric-memory-unavailable",
+ },
+ )
+
+ def test_adapter_ast_pins_elastic_api_and_weight_semantics(self) -> None:
+ imports = {
+ alias.name
+ for node in ast.walk(self.tree)
+ if isinstance(node, ast.ImportFrom) and node.module == "deep_ep"
+ for alias in node.names
+ }
+ self.assertEqual(imports, {"ElasticBuffer"})
+ constants = {
+ node.targets[0].id: ast.literal_eval(node.value)
+ for node in self.tree.body
+ if isinstance(node, ast.Assign)
+ and len(node.targets) == 1
+ and isinstance(node.targets[0], ast.Name)
+ and isinstance(node.value, ast.Constant)
+ }
+ self.assertEqual(constants["DEEPEP_V2_COMMIT"], COMMIT)
+ self.assertEqual(constants["DEEPEP_V2_TREE"], TREE)
+ self.assertEqual(constants["DEEPEP_V2_FMT_COMMIT"], FMT_COMMIT)
+ self.assertEqual(constants["DEEPEP_V2_PR"], 605)
+ self.assertEqual(constants["DEEPEP_V2_FIX_PR"], 630)
+ self.assertEqual(
+ constants["DEEPEP_V2_JIT_RANDOM_SEED"],
+ "collectivex-deepep-v2-fa8a9b1",
+ )
+ self.assertEqual(constants["NCCL_VERSION"], "2.30.4")
+ self.assertEqual(constants["NVSHMEM_VERSION"], "3.3.9")
+ backend = next(
+ node for node in self.tree.body
+ if isinstance(node, ast.ClassDef) and node.name == "DeepEPV2Backend"
+ )
+ assignments = {
+ node.targets[0].id: ast.literal_eval(node.value)
+ for node in backend.body
+ if isinstance(node, ast.Assign)
+ and isinstance(node.targets[0], ast.Name)
+ and isinstance(node.value, ast.Constant)
+ }
+ self.assertEqual(assignments["combine_weight_semantics"], "unweighted-rank-sum")
+ methods = {node.name for node in backend.body if isinstance(node, ast.FunctionDef)}
+ self.assertTrue({
+ "dispatch", "inspect_dispatch", "combine_transformed", "capture_deferred_provenance",
+ "finalize",
+ } <= methods)
+ self.assertNotIn("expected", methods)
+ constructor = next(
+ node for node in ast.walk(backend)
+ if isinstance(node, ast.Call)
+ and isinstance(node.func, ast.Name)
+ and node.func.id == "ElasticBuffer"
+ )
+ deterministic = next(
+ keyword for keyword in constructor.keywords if keyword.arg == "deterministic"
+ )
+ self.assertIs(ast.literal_eval(deterministic.value), False)
+ self.assertIn("deterministic", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"])
+ self.assertIn("num_experts", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"])
+ self.assertIn("tuning_num_experts", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"])
+ self.assertIn("jit_random_seed", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"])
+ self.assertIn("gin_enabled", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"])
+ self.assertIn("communication_backend", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"])
+ self.assertIn("deepep_pr", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"])
+ self.assertIn("deepep_fix_pr", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"])
+ source = self.path.read_text()
+ self.assertIn('getattr(args, "num_logical_experts", args.experts)', source)
+ self.assertIn('"use_expanded_layout": False', source)
+ self.assertIn("allow_hybrid_mode = _configure_gin_mode(args, world_size)", source)
+ self.assertIn("get_theoretical_num_sms(tuning_num_experts, args.topk)", source)
+
+ jit_function = next(
+ node for node in self.tree.body
+ if isinstance(node, ast.FunctionDef) and node.name == "_jit_cache_key"
+ )
+ namespace = {"hashlib": __import__("hashlib"), "json": json}
+ exec(compile(ast.Module(body=[jit_function], type_ignores=[]), str(self.path), "exec"), namespace)
+ key = namespace["_jit_cache_key"]
+ baseline = types.SimpleNamespace(
+ runner="h100-dgxc", hidden=7168, topk=8, experts=256,
+ routing="uniform", eplb=False, case_id="uniform",
+ )
+ zipf = types.SimpleNamespace(**{**vars(baseline), "routing": "zipf", "case_id": "zipf"})
+ eplb = types.SimpleNamespace(
+ **{**vars(zipf), "experts": 288, "num_logical_experts": 256, "eplb": True}
+ )
+ realized = {
+ "num_sms": 24,
+ "num_qps": 9,
+ "allocated_qps": 17,
+ "logical_scaleout_ranks": 1,
+ "logical_scaleup_ranks": 8,
+ "physical_rdma_ranks": 2,
+ "physical_nvlink_ranks": 4,
+ "is_scaleup_nvlink": False,
+ "device_arch_major": 9,
+ "device_arch_minor": 0,
+ "device_sms": 132,
+ "device_smem_bytes": 232448,
+ "gpu_timeout_cycles": 198000000000,
+ }
+ direct = key(baseline, 8, 128, False, realized)
+ self.assertTrue(direct.startswith("jitcfg-v3-"))
+ self.assertEqual(direct, key(zipf, 8, 128, False, realized))
+ self.assertNotEqual(direct, key(zipf, 8, 128, True, realized))
+ self.assertNotEqual(direct, key(eplb, 8, 128, False, realized))
+ for field, value in realized.items():
+ changed = not value if type(value) is bool else value + 1
+ self.assertNotEqual(
+ direct,
+ key(baseline, 8, 128, False, {**realized, field: changed}),
+ field,
+ )
+ init = next(
+ node for node in backend.body
+ if isinstance(node, ast.FunctionDef) and node.name == "__init__"
+ )
+ buffer_call = next(
+ node for node in ast.walk(init)
+ if isinstance(node, ast.Call)
+ and isinstance(node.func, ast.Name)
+ and node.func.id == "ElasticBuffer"
+ )
+ jit_config_check = next(
+ node for node in ast.walk(init)
+ if isinstance(node, ast.Call)
+ and isinstance(node.func, ast.Name)
+ and node.func.id == "_require_cross_rank_equal"
+ and ast.literal_eval(node.args[1]) == "JIT configuration"
+ )
+ cache_assignment = next(
+ node for node in ast.walk(init)
+ if isinstance(node, ast.Assign)
+ and isinstance(node.targets[0], ast.Subscript)
+ and ast.unparse(node.targets[0].value) == "os.environ"
+ and ast.literal_eval(node.targets[0].slice) == "EP_JIT_CACHE_DIR"
+ )
+ self.assertLess(buffer_call.lineno, jit_config_check.lineno)
+ self.assertLess(jit_config_check.lineno, cache_assignment.lineno)
+ capture = next(
+ node for node in backend.body
+ if isinstance(node, ast.FunctionDef)
+ and node.name == "capture_deferred_provenance"
+ )
+ calls = [node for node in ast.walk(capture) if isinstance(node, ast.Call)]
+ barrier = next(
+ node for node in calls
+ if isinstance(node.func, ast.Attribute) and node.func.attr == "barrier"
+ )
+ self.assertEqual(
+ {keyword.arg: ast.literal_eval(keyword.value) for keyword in barrier.keywords},
+ {"use_comm_stream": True, "with_cpu_sync": True},
+ )
+ scan = next(
+ node for node in calls
+ if isinstance(node.func, ast.Name) and node.func.id == "_jit_artifact_evidence"
+ )
+ self.assertLess(barrier.lineno, scan.lineno)
+ realized_check = next(
+ node for node in ast.walk(backend)
+ if isinstance(node, ast.Call)
+ and isinstance(node.func, ast.Name)
+ and node.func.id == "_require_cross_rank_equal"
+ and len(node.args) > 1
+ and isinstance(node.args[1], ast.Constant)
+ and node.args[1].value == "realized tuning/topology"
+ )
+ self.assertIsInstance(realized_check, ast.Call)
+ self.assertEqual(
+ (ROOT / "tests" / "ep_harness.py").read_text().count(
+ "capture_deferred_provenance()"
+ ),
+ 2,
+ )
+ schema = json.loads((ROOT / "schemas" / "raw-case-v1.schema.json").read_text())
+ provenance = schema["properties"]["implementation"]["properties"]["provenance"]
+ self.assertEqual(provenance["properties"]["deterministic"], {"type": "boolean"})
+ self.assertEqual(
+ provenance["properties"]["num_experts"],
+ {"minimum": 1, "type": "integer"},
+ )
+ self.assertEqual(
+ provenance["properties"]["tuning_num_experts"],
+ {"minimum": 1, "type": "integer"},
+ )
+ self.assertEqual(
+ provenance["properties"]["jit_cubins"]["items"],
+ {"$ref": "#/$defs/deepep_v2_jit_cubin"},
+ )
+ self.assertEqual(
+ (
+ provenance["properties"]["jit_cubins"]["minItems"],
+ provenance["properties"]["jit_cubins"]["maxItems"],
+ ),
+ (5, 5),
+ )
+ self.assertEqual(
+ provenance["properties"]["jit_random_seed"],
+ {"const": "collectivex-deepep-v2-fa8a9b1"},
+ )
+ self.assertEqual(provenance["properties"]["allow_hybrid_mode"], {"type": "boolean"})
+ self.assertEqual(provenance["properties"]["gin_enabled"], {"type": "boolean"})
+ self.assertEqual(provenance["properties"]["deepep_pr"], {"const": 605})
+ self.assertEqual(provenance["properties"]["deepep_fix_pr"], {"const": 630})
+ self.assertEqual(
+ provenance["properties"]["communication_backend"],
+ {"enum": ["nccl-device-lsa", "nccl-gin"]},
+ )
+ self.assertEqual(
+ provenance["properties"]["num_rdma_bytes"],
+ {"minimum": 0, "type": "integer"},
+ )
+ self.assertEqual(
+ provenance["properties"]["num_qps_per_rank"],
+ {"minimum": 1, "type": "integer"},
+ )
+ for field, value in (
+ ("num_experts", "288"),
+ ("tuning_num_experts", "not-an-integer"),
+ ("tuning_num_experts", 0),
+ ):
+ with self.subTest(provenance_field=field, value=value):
+ self.assertIn(
+ field,
+ contracts.backend_provenance_issues(
+ "deepep-v2", {field: value}
+ ),
+ )
+
+ def test_v2_gin_mode_uses_the_scale_up_domain_and_safe_fallbacks(self) -> None:  # Runs _configure_gin_mode and _lsa_topology_is_valid in isolation; checks return values and the EP_DISABLE_GIN side effect.
+ functions = {  # Top-level function definitions of self.tree, keyed by name.
+ node.name: node for node in self.tree.body if isinstance(node, ast.FunctionDef)
+ }
+ namespace = {"os": os}  # `os` is the only global handed to the extracted functions.
+ exec(  # Defines just these two functions in `namespace`; no other part of the module is executed.
+ compile(
+ ast.Module(
+ body=[
+ functions["_configure_gin_mode"],
+ functions["_lsa_topology_is_valid"],
+ ],
+ type_ignores=[],
+ ),
+ str(self.path),
+ "exec",
+ ),
+ namespace,
+ )
+ configure = namespace["_configure_gin_mode"]
+ topology_is_valid = namespace["_lsa_topology_is_valid"]
+ original = os.environ.get("EP_DISABLE_GIN")  # Saved so the `finally` clause can put the environment back.
+ try:
+ args = types.SimpleNamespace(scale_up_domain=72, gpus_per_node=4)  # Case 1: scale_up_domain=72 with second argument 8.
+ self.assertFalse(configure(args, 8))
+ self.assertEqual(os.environ.get("EP_DISABLE_GIN"), "1")  # A False result is expected to leave EP_DISABLE_GIN="1".
+
+ os.environ["EP_DISABLE_GIN"] = "stale"  # Case 2: seed a leftover value that a True result must remove.
+ args = types.SimpleNamespace(scale_up_domain=8, gpus_per_node=4)
+ self.assertTrue(configure(args, 16))
+ self.assertNotIn("EP_DISABLE_GIN", os.environ)
+
+ args = types.SimpleNamespace(gpus_per_node=4)  # Case 3: no scale_up_domain attribute, only gpus_per_node.
+ self.assertTrue(configure(args, 8))
+ self.assertNotIn("EP_DISABLE_GIN", os.environ)
+
+ self.assertFalse(configure(types.SimpleNamespace(), 8))  # Case 4: neither attribute is present.
+ self.assertEqual(os.environ.get("EP_DISABLE_GIN"), "1")
+
+ topology = {  # Mutated in place below to build the later cases.
+ "physical_rdma_ranks": 1,
+ "physical_nvlink_ranks": 8,
+ "logical_scaleout_ranks": 1,
+ "logical_scaleup_ranks": 8,
+ "is_scaleup_nvlink": True,
+ }
+ self.assertTrue(topology_is_valid(False, 8, 8, topology))  # NOTE(review): meaning of the positional arguments is defined by _lsa_topology_is_valid, not visible here.
+ topology["physical_rdma_ranks"] = 2
+ topology["logical_scaleout_ranks"] = 2
+ self.assertTrue(topology_is_valid(True, 16, 8, topology))  # Two rdma/scale-out ranks accepted with (True, 16, 8).
+ topology["physical_nvlink_ranks"] = 4
+ self.assertFalse(topology_is_valid(False, 8, 8, topology))  # Rejected after physical_nvlink_ranks drops to 4 (rdma/scale-out still 2).
+ finally:
+ if original is None:  # Variable was unset before the test: make sure it is unset again.
+ os.environ.pop("EP_DISABLE_GIN", None)
+ else:
+ os.environ["EP_DISABLE_GIN"] = original
+
+ def test_ep_adapters_declare_unweighted_rank_sum(self) -> None:  # Statically checks, via ast, the combine contract declared by each EP adapter class.
+ adapters = {  # Adapter file name under HERE -> backend class expected in that file.
+ "ep_deepep.py": "DeepEPBackend",
+ "ep_deepep_v2.py": "DeepEPV2Backend",
+ "ep_deepep_hybrid.py": "DeepEPHybridBackend",
+ "ep_mori.py": "MoRIBackend",
+ "ep_nccl.py": "NCCLBackend",
+ "ep_uccl.py": "UCCLBackend",
+ }
+ for filename, class_name in adapters.items():
+ with self.subTest(adapter=filename):
+ tree = ast.parse((HERE / filename).read_text())  # Parsed only; the adapter module is never imported.
+ backend = next(  # First top-level class with the expected name; StopIteration if there is none.
+ node for node in tree.body
+ if isinstance(node, ast.ClassDef) and node.name == class_name
+ )
+ assignment = next(  # Class-body assignment whose first target is the name `combine_weight_semantics`.
+ node for node in backend.body
+ if isinstance(node, ast.Assign)
+ and isinstance(node.targets[0], ast.Name)
+ and node.targets[0].id == "combine_weight_semantics"
+ )
+ self.assertEqual(ast.literal_eval(assignment.value), "unweighted-rank-sum")
+ combine_methods = [  # Sync or async methods named `combine` or `combine_transformed`.
+ item for item in backend.body
+ if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef))
+ and item.name in {"combine", "combine_transformed"}
+ ]
+ self.assertEqual(len(combine_methods), 2)  # Both methods must be defined directly in the class body.
+ for method in combine_methods:
+ source = ast.unparse(method)  # Regenerated source text; the fragment below is written with single quotes to match it.
+ if filename in {"ep_deepep.py", "ep_uccl.py"}:  # These two adapters must reference the low-latency mode check.
+ self.assertIn("self.mode == 'low-latency'", source)
+ else:
+ self.assertNotIn("topk_weights", source)
+ self.assertNotIn("combine_topk_weights", source)
+
+ def test_low_latency_mode_parser_and_profile_are_explicit(self) -> None:
+ parser = argparse.ArgumentParser()
+ ep_harness.add_common_args(parser)
+ required = [
+ "--runner", "test", "--topology-class", "test",
+ "--scope", "scale-up", "--scale-up-transport", "nvlink",
+ "--out", "test.json",
+ ]
+ self.assertEqual(parser.parse_args(required).mode, "normal")
+ self.assertEqual(
+ parser.parse_args([*required, "--mode", "low-latency"]).mode,
+ "low-latency",
+ )
+ profile = identity.case_profile("low-latency")
+ self.assertEqual(profile["contract"], "expert-packed-weighted-combine-v1")
+ self.assertEqual(
+ profile["component_order_contract"],
+ "roundtrip-dispatch-gate-weighted-combine-v1",
+ )
+ self.assertEqual(
+ profile["correctness_scope"],
+ "expert-assignment-and-weighted-combine",
+ )
+ self.assertEqual(profile["payload_unit"], "token-expert")
+
+ def test_expert_packed_slot_map_reconstructs_exact_sources(self) -> None:
+ pack = lambda begin, count: (begin << 32) | count
+ slots = ep_harness.expert_packed_slot_map(
+ [2, 1],
+ [[1, 0, 0, 0], [1, 0, 0, 0]],
+ [[pack(0, 1), pack(1, 1)], [pack(0, 0), pack(0, 1)]],
+ tokens_per_rank=2,
+ experts_per_rank=2,
+ world_size=2,
+ )
+ self.assertEqual(slots, [(0, 0, 1), (0, 1, 2), (1, 0, 3)])
+
+ invalid = (
+ ([1], [[0]], [[pack(1, 1), pack(0, 0)]]),
+ ([1], [[2]], [[pack(0, 1), pack(1, 0)]]),
+ ([2], [[1, 1]], [[pack(0, 2), pack(2, 0)]]),
+ )
+ for counts, source, layout in invalid:
+ with self.subTest(counts=counts, source=source, layout=layout):
+ with self.assertRaises(ValueError):
+ ep_harness.expert_packed_slot_map(
+ counts,
+ source,
+ layout,
+ tokens_per_rank=2,
+ experts_per_rank=1,
+ world_size=2,
+ )
+
+ def test_deepep_and_uccl_expose_genuine_low_latency_calls(self) -> None:  # Text-level check that ep_deepep.py and ep_uccl.py contain the low-latency call fragments verbatim.
+ required_fragments = (  # Each entry must appear as an exact substring of the adapter source.
+ "Buffer.get_low_latency_rdma_size_hint(",
+ "low_latency_mode=True",
+ "num_qps_per_rank=num_qps_per_rank",
+ "self.buffer.clean_low_latency_buffer(",
+ "self.buffer.low_latency_dispatch(",
+ "use_fp8=False",
+ "self.buffer.low_latency_combine(",
+ "p.topk_weights",
+ 'self.combine_weight_semantics = "gate-weighted-sum"',
+ "self.combine_needs_redispatch = True",
+ "def inspect_expert_dispatch(",
+ )
+ for filename in ("ep_deepep.py", "ep_uccl.py"):
+ source = (HERE / filename).read_text()  # Raw file text; matching is plain substring search, so formatting changes in the adapter can break it.
+ with self.subTest(adapter=filename):
+ for fragment in required_fragments:
+ self.assertIn(fragment, source)
+ self.assertIn("self.max_tokens_per_rank = 128", source)
+ self.assertIn("async_finish=False", source)
+ self.assertIn("return_recv_hook=False", source)
+
+ run_ep_source = (HERE / "run_ep.py").read_text()  # run_ep.py must contain both guard expressions below verbatim.
+ self.assertIn('args.backend not in {"deepep", "uccl"}', run_ep_source)
+ self.assertIn('args.phase != "decode"', run_ep_source)
+
+ def test_deepep_v2_jit_evidence_is_strict_and_stable(self) -> None:  # Checks the JIT cubin validator on malformed inputs, then that capture_deferred_provenance rejects changed artifacts.
+ valid = deepep_v2_jit_provenance()
+ self.assertTrue(contracts._deepep_v2_jit_cubins_are_valid(valid))
+ for invalid in (  # Every variant below must be rejected by the validator.
+ [],  # empty list
+ [{**valid[0], "path": "/private/kernel.cubin"}],  # single record carrying an extra "path" key
+ [{**item, "cache_key": "dispatch"} for item in valid],  # every cache_key replaced by "dispatch"
+ [{**item, "cubin_sha256": "invalid"} for item in valid],  # every cubin digest replaced by "invalid"
+ valid[:-1],  # last record dropped
+ [*valid, valid[0]],  # first record repeated at the end
+ [  # one extra record: the first record with the last 32 cache_key characters replaced by "f"
+ *valid,
+ {
+ **valid[0],
+ "cache_key": valid[0]["cache_key"][:-32] + "f" * 32,
+ },
+ ],
+ ):
+ with self.subTest(invalid=invalid):
+ self.assertFalse(contracts._deepep_v2_jit_cubins_are_valid(invalid))
+
+ backend = next(  # DeepEPV2Backend class node from the already-parsed module.
+ node for node in self.tree.body
+ if isinstance(node, ast.ClassDef) and node.name == "DeepEPV2Backend"
+ )
+ capture = next(  # The method is extracted so it can be run as a plain function below.
+ node for node in backend.body
+ if isinstance(node, ast.FunctionDef)
+ and node.name == "capture_deferred_provenance"
+ )
+ artifacts = copy.deepcopy(valid)  # Mutable stand-in for on-disk evidence; edited later to simulate a change.
+
+ class FakeBuffer:  # Stand-in for the backend buffer: barrier() only checks its two flags.
+ @staticmethod
+ def barrier(*, use_comm_stream: bool, with_cpu_sync: bool) -> None:
+ self.assertTrue(use_comm_stream)  # `self` is the enclosing TestCase, captured by closure.
+ self.assertTrue(with_cpu_sync)
+
+ namespace = {  # Globals visible to the extracted method; all collaborators are stubbed.
+ "torch": types.SimpleNamespace(
+ cuda=types.SimpleNamespace(synchronize=lambda: None)
+ ),
+ "_jit_artifact_evidence": lambda: copy.deepcopy(artifacts),  # Reads `artifacts` on every call, so later edits are observed.
+ "_require_cross_rank_equal": lambda _value, _label: None,  # Cross-rank comparison is a no-op here.
+ }
+ exec(
+ compile(ast.Module(body=[capture], type_ignores=[]), str(self.path), "exec"),
+ namespace,
+ )
+ state = types.SimpleNamespace(  # Passed as `self` to the extracted function.
+ buffer=FakeBuffer(),
+ _deferred_jit_snapshot=None,
+ backend_provenance={"jit_cubins": []},
+ )
+ namespace["capture_deferred_provenance"](state)  # First capture.
+ namespace["capture_deferred_provenance"](state)  # Second capture with unchanged artifacts must not raise.
+ artifacts[0]["cubin_sha256"] = "f" * 64  # Alter one digest after the earlier captures.
+ with self.assertRaisesRegex(RuntimeError, "changed after measurement"):
+ namespace["capture_deferred_provenance"](state)
+
+ def test_deepep_v2_jit_files_are_complete_regular_and_content_bound(self) -> None:  # Runs _jit_artifact_evidence against a synthetic cache directory and checks three rejection cases.
+ functions = [  # Only _sha256 and _jit_artifact_evidence are extracted from the module.
+ node for node in self.tree.body
+ if isinstance(node, ast.FunctionDef)
+ and node.name in {"_sha256", "_jit_artifact_evidence"}
+ ]
+ namespace = {  # Globals supplied to the two extracted functions.
+ "hashlib": hashlib,
+ "os": os,
+ "Path": Path,
+ "re": __import__("re"),
+ "DEEPEP_V2_JIT_KERNELS": contracts.DEEPEP_V2_JIT_KERNELS,
+ }
+ exec(compile(ast.Module(body=functions, type_ignores=[]), str(self.path), "exec"), namespace)
+ with tempfile.TemporaryDirectory() as temporary:
+ cache = Path(temporary) / "cache"  # Kernel directories live in a "cache" subdirectory of EP_JIT_CACHE_DIR.
+ cache.mkdir()
+ for index, name in enumerate(sorted(contracts.DEEPEP_V2_JIT_KERNELS)):
+ kernel = cache / f"kernel.{name}.{index:032x}"  # Directory name: "kernel.<name>.<32 hex digits>".
+ kernel.mkdir()
+ for suffix in ("cu", "cubin", "sass"):  # Three files per kernel, each with distinct content.
+ (kernel / f"kernel.{suffix}").write_bytes(f"{name}-{suffix}".encode())
+ old_cache = os.environ.get("EP_JIT_CACHE_DIR")  # Saved so `finally` can restore the environment.
+ os.environ["EP_JIT_CACHE_DIR"] = temporary
+ try:
+ evidence = namespace["_jit_artifact_evidence"]()
+ self.assertEqual(len(evidence), len(contracts.DEEPEP_V2_JIT_KERNELS))  # One record per expected kernel.
+ self.assertEqual(  # Records expose exactly these four keys, so no path field is present.
+ set(evidence[0]),
+ {"cache_key", "cubin_sha256", "sass_sha256", "source_sha256"},
+ )
+ first = cache / evidence[0]["cache_key"]  # cache_key is used as the directory name under `cache`.
+ duplicate = cache / (evidence[0]["cache_key"][:-32] + "f" * 32)  # Same prefix as `first`, different 32-character suffix.
+ duplicate.mkdir()
+ for suffix in ("cu", "cubin", "sass"):
+ (duplicate / f"kernel.{suffix}").write_bytes(b"duplicate")
+ with self.assertRaisesRegex(RuntimeError, "kernel set"):  # Rejection 1: an additional kernel directory.
+ namespace["_jit_artifact_evidence"]()
+ shutil.rmtree(duplicate)
+ (first / "kernel.sass").unlink()
+ with self.assertRaisesRegex(RuntimeError, "incomplete"):  # Rejection 2: a kernel directory missing kernel.sass.
+ namespace["_jit_artifact_evidence"]()
+ (first / "kernel.sass").symlink_to(first / "kernel.cubin")
+ with self.assertRaisesRegex(RuntimeError, "regular file"):  # Rejection 3: kernel.sass is a symlink.
+ namespace["_jit_artifact_evidence"]()
+ finally:
+ if old_cache is None:  # Variable was unset before the test: unset it again.
+ os.environ.pop("EP_JIT_CACHE_DIR", None)
+ else:
+ os.environ["EP_JIT_CACHE_DIR"] = old_cache
+
+ def test_runtime_and_shared_version_formatter_are_valid(self) -> None:  # Checks the container script parses, version formatting, NCCL version lookup, library evidence and runtime fingerprint agreement.
+ subprocess.run(  # `bash -n` parses the script without executing it; check=True fails the test on a non-zero exit.
+ ["bash", "-n", str(ROOT / "runtime" / "run_in_container.sh")],
+ check=True,
+ )
+ self.assertEqual(ep_harness.format_collective_version(23004), "2.30.4")  # Integer form.
+ self.assertEqual(ep_harness.format_collective_version((2, 30, 4)), "2.30.4")  # Tuple form.
+ source = self.path.read_text()  # Used at the end for a substring check.
+ version_function = next(
+ node for node in self.tree.body
+ if isinstance(node, ast.FunctionDef) and node.name == "_loaded_nccl_version"
+ )
+
+ class FakeNccl:  # Stand-in for the loaded NCCL library object.
+ @staticmethod
+ def ncclGetVersion(pointer) -> int:
+ pointer._obj.value = 23004  # Writes through the ctypes.byref() wrapper into the caller's c_int.
+ return 0
+
+ namespace = {  # Globals for the extracted function; ctypes.CDLL is replaced so no real library is loaded.
+ "ctypes": types.SimpleNamespace(
+ CDLL=lambda _path: FakeNccl(), byref=ctypes.byref, c_int=ctypes.c_int,
+ ),
+ "ep_harness": ep_harness,
+ "os": os,
+ "_loaded_library_paths": lambda: {"/safe/libnccl.so.2"},
+ }
+ exec(
+ compile(ast.Module(body=[version_function], type_ignores=[]), str(self.path), "exec"),
+ namespace,
+ )
+ self.assertEqual(namespace["_loaded_nccl_version"](), "2.30.4")
+ for paths in (set(), {"/safe/libnccl.so.2", "/other/libnccl.so.2"}):  # Zero or two candidate paths must both be rejected.
+ namespace["_loaded_library_paths"] = lambda paths=paths: paths  # Default argument binds the current `paths` value.
+ with self.assertRaisesRegex(RuntimeError, "exactly one"):
+ namespace["_loaded_nccl_version"]()
+ evidence_function = next(
+ node for node in self.tree.body
+ if isinstance(node, ast.FunctionDef) and node.name == "_loaded_library_evidence"
+ )
+ paths = {  # Fake loaded-library paths: extension module, NCCL and NVSHMEM.
+ "/safe/_C.cpython-310-x86_64-linux-gnu.so",
+ "/safe/libnccl.so.2",
+ "/safe/libnvshmem_host.so.3",
+ }
+ namespace.update(  # Hashing is stubbed, so the fake paths are never opened.
+ _loaded_library_paths=lambda: paths,
+ _sha256=lambda _path: "a" * 64,
+ )
+ exec(
+ compile(ast.Module(body=[evidence_function], type_ignores=[]), str(self.path), "exec"),
+ namespace,
+ )
+ evidence = namespace["_loaded_library_evidence"]()
+ self.assertIn(  # The extension is reported under the name "deep_ep._C" with no directory component.
+ {"name": "deep_ep._C", "role": "deepep-extension", "sha256": "a" * 64},
+ evidence,
+ )
+ self.assertTrue(
+ contracts._content_evidence_is_valid(
+ evidence, {"deepep-extension", "nccl", "nvshmem"}
+ )
+ )
+ self.assertNotIn("torch.cuda.nccl.version()", source)  # The module source must not contain this call text.
+ fingerprint = {"runtime": "cuda", "version": "13.0"}
+ self.assertIs(  # For equal fingerprints the first list element itself is returned (identity, not a copy).
+ run_ep._common_runtime_fingerprint([fingerprint, dict(fingerprint)]),
+ fingerprint,
+ )
+ with self.assertRaises(ValueError):  # Differing fingerprints are rejected.
+ run_ep._common_runtime_fingerprint([fingerprint, {"runtime": "cuda", "version": "12.8"}])
+
+ def test_conditioning_contract_is_exact_for_each_phase(self) -> None:  # validate_conditioning_contract must accept only the exact per-phase ladder and roundtrip count.
+ expected = {  # Phase name -> the only accepted ladder for that phase.
+ "decode": [1, 2, 4, 8, 16, 32, 64, 128],
+ "prefill": [1, 2, 4, 8, 16, 32, 64, 128, 256, 512],
+ }
+ for phase, ladder in expected.items():
+ valid = {
+ "contract": "fixed-phase-ramp-8-roundtrips-v1",
+ "ladder": ladder,
+ "roundtrips_per_shape": 8,
+ }
+ self.assertIs(contracts.validate_conditioning_contract(valid, phase), valid)  # On success the same object is returned.
+ for mutate in (  # Each mutation is applied to a deep copy, so `valid` and `expected` stay intact.
+ lambda item: item["ladder"].reverse(),  # wrong order
+ lambda item: item["ladder"].pop(),  # last rung missing
+ lambda item: item.update(ladder=[1.0, *item["ladder"][1:]]),  # first rung is the float 1.0 instead of the int 1
+ lambda item: item.update(roundtrips_per_shape=7),  # wrong count
+ lambda item: item.update(roundtrips_per_shape=8.0),  # float 8.0 instead of the int 8
+ ):
+ changed = copy.deepcopy(valid)
+ mutate(changed)
+ with self.assertRaises(contracts.ContractError):
+ contracts.validate_conditioning_contract(changed, phase)
+ other = "prefill" if phase == "decode" else "decode"  # A contract valid for one phase must be rejected for the other.
+ with self.assertRaises(contracts.ContractError):
+ contracts.validate_conditioning_contract(valid, other)
+
+ def test_content_manifest_evidence_is_stable_and_content_sensitive(self) -> None:  # content_manifest_evidence must ignore input order, omit filesystem paths and react to content changes.
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary)
+ first, second = root / "first", root / "second"
+ first.write_bytes(b"first")
+ second.write_bytes(b"second")
+ files = [("pkg/first", first), ("pkg/second", second)]  # (manifest name, file on disk) pairs.
+ evidence = contracts.content_manifest_evidence(
+ role="test-content", name="test-build", files=files,
+ )
+ self.assertNotIn(temporary, json.dumps(evidence))  # The temporary directory path must not appear in the evidence.
+ self.assertEqual(  # Same files in reverse order (passed as an iterator) give identical evidence.
+ evidence,
+ contracts.content_manifest_evidence(
+ role="test-content", name="test-build", files=reversed(files),
+ ),
+ )
+ self.assertRegex(evidence["sha256"], r"^[0-9a-f]{64}$")  # 64 lowercase hex characters.
+ second.write_bytes(b"changed")  # Rewrite one file; the evidence must now differ.
+ self.assertNotEqual(
+ evidence,
+ contracts.content_manifest_evidence(
+ role="test-content", name="test-build", files=files,
+ ),
+ )
+ for invalid in (  # Each of these file lists must raise ContractError.
+ [("../first", first)],  # manifest name starting with ".."
+ [("same", first), ("same", second)],  # duplicate manifest name
+ [("missing", root / "missing")],  # file that does not exist
+ ):
+ with self.assertRaises(contracts.ContractError):
+ contracts.content_manifest_evidence(
+ role="test-content", name="test-build", files=invalid,
+ )
+
+ def test_hybrid_realized_config_and_jit_evidence_are_path_free(self) -> None:  # Exercises _hybrid_realized_config and _hybrid_jit_evidence extracted from ep_deepep_hybrid.py.
+ path = HERE / "ep_deepep_hybrid.py"
+ tree = ast.parse(path.read_text(), str(path))
+ selected = [  # The HYBRID_CONFIG_FIELDS assignment plus the three named helper functions.
+ node for node in tree.body
+ if (
+ isinstance(node, ast.Assign)
+ and any(
+ isinstance(target, ast.Name) and target.id == "HYBRID_CONFIG_FIELDS"
+ for target in node.targets
+ )
+ )
+ or isinstance(node, ast.FunctionDef)
+ and node.name in {
+ "_hybrid_realized_config", "_sha256_with_size", "_hybrid_jit_evidence",
+ }
+ ]
+ namespace = {"Path": Path, "hashlib": hashlib, "re": __import__("re")}  # Globals supplied to the extracted code.
+ exec(compile(ast.Module(body=selected, type_ignores=[]), str(path), "exec"), namespace)
+ fields = namespace["HYBRID_CONFIG_FIELDS"]
+ self.assertEqual(set(fields), contracts.HYBRID_REALIZED_CONFIG_FIELDS)  # Adapter field list and contract field set must agree.
+
+ class TokenType:  # Fake enum-like value: str() gives `label`; `.name` exists only when a name is passed.
+ def __init__(self, label: str, name: str | None = None) -> None:
+ self.label = label
+ if name is not None:
+ self.name = name
+
+ def __str__(self) -> str:
+ return self.label
+
+ values = {field: 1 for field in fields}  # Every field defaults to 1 ...
+ values.update({field: True for field in contracts.HYBRID_REALIZED_BOOL_FIELDS})  # ... except the boolean fields, set to True.
+ for raw, expected in (("uint16_t", "UINT16"), ("uint8_t", "UINT8")):  # String form without `.name` maps to the upper-case token.
+ values["token_data_type"] = TokenType(raw)
+ config = types.SimpleNamespace(**values)
+ realized = namespace["_hybrid_realized_config"](config)
+ self.assertEqual(realized["token_data_type"], expected)
+ self.assertEqual(set(realized), contracts.HYBRID_REALIZED_CONFIG_FIELDS)
+ values["token_data_type"] = TokenType("opaque-enum", "UINT16")  # With a `.name` attribute the result is "UINT16" despite the unrelated string form.
+ self.assertEqual(
+ namespace["_hybrid_realized_config"](types.SimpleNamespace(**values))[
+ "token_data_type"
+ ],
+ "UINT16",
+ )
+ values["token_data_type"] = TokenType("UINT16")  # String form "UINT16" without `.name` is rejected.
+ with self.assertRaisesRegex(RuntimeError, "token_data_type is invalid"):
+ namespace["_hybrid_realized_config"](types.SimpleNamespace(**values))
+ values["token_data_type"] = TokenType("uint16_t")
+ config = types.SimpleNamespace(**values)
+ delattr(config, "hidden_dim")  # A config lacking a field must be rejected.
+ with self.assertRaisesRegex(RuntimeError, "omits hidden_dim"):
+ namespace["_hybrid_realized_config"](config)
+
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary)
+ for key, payload in (  # Three "<key>.so" files, created in non-sorted order.
+ ("preprocess-key", b"pre"),
+ ("combine-key", b"combine"),
+ ("dispatch-key", b"dispatch"),
+ ):
+ (root / f"{key}.so").write_bytes(payload)
+ evidence = namespace["_hybrid_jit_evidence"](root)
+ self.assertEqual(  # Records come back ordered by kernel key.
+ [item["kernel_key"] for item in evidence],
+ ["combine-key", "dispatch-key", "preprocess-key"],
+ )
+ self.assertNotIn(temporary, json.dumps(evidence))  # The directory path must not appear in the evidence.
+ (root / "dispatch-key.so").write_bytes(b"changed")  # Content change must alter the evidence.
+ self.assertNotEqual(evidence, namespace["_hybrid_jit_evidence"](root))
+ (root / "extra-key.so").write_bytes(b"extra")  # A fourth .so file is rejected.
+ with self.assertRaisesRegex(RuntimeError, "expected 3"):
+ namespace["_hybrid_jit_evidence"](root)
+ (root / "extra-key.so").unlink()
+ (root / "bad key.so").write_bytes(b"bad")  # A file name containing a space is rejected as a kernel key.
+ with self.assertRaisesRegex(RuntimeError, "kernel key"):
+ namespace["_hybrid_jit_evidence"](root)
+ (root / "bad key.so").unlink()
+ (root / "combine-key.so").unlink()
+ (root / "combine-key.so").symlink_to(root / "dispatch-key.so")  # A symlinked .so is rejected.
+ with self.assertRaisesRegex(RuntimeError, "regular file"):
+ namespace["_hybrid_jit_evidence"](root)
+ empty = root / "empty"
+ empty.mkdir()
+ with self.assertRaisesRegex(RuntimeError, "expected 3"):  # An empty directory is rejected with the same count error.
+ namespace["_hybrid_jit_evidence"](empty)
+
+ def test_hybrid_uses_communication_domains_not_physical_hosts(self) -> None:
+ path = HERE / "ep_deepep_hybrid.py"
+ function = next(
+ node for node in ast.parse(path.read_text(), str(path)).body
+ if isinstance(node, ast.FunctionDef) and node.name == "_hybrid_topology"
+ )
+ namespace: dict[str, object] = {}
+ exec(compile(ast.Module(body=[function], type_ignores=[]), str(path), "exec"), namespace)
+ resolve = namespace["_hybrid_topology"]
+ cases = (
+ (8, 8, 8, "scale-up", "nvlink", "", 8, 1, 1),
+ (16, 8, 8, "scale-out", "nvlink", "rdma", 8, 2, 2),
+ (8, 4, 72, "scale-up", "mnnvl", "", 8, 1, 2),
+ (16, 4, 72, "scale-up", "mnnvl", "", 16, 1, 4),
+ )
+ for world, gpn, domain, scope, up, out, ranks, domains, hosts in cases:
+ with self.subTest(world=world, gpus_per_node=gpn, transport=up):
+ topology = resolve(types.SimpleNamespace(
+ gpus_per_node=gpn,
+ scale_up_domain=domain,
+ scope=scope,
+ scale_up_transport=up,
+ scale_out_transport=out,
+ transport=up if not out else f"{up}-{out}",
+ ), world)
+ self.assertEqual(
+ (topology["domain_ranks"], topology["communication_domains"],
+ topology["physical_nodes"]),
+ (ranks, domains, hosts),
+ )
+ with self.assertRaisesRegex(RuntimeError, "outside the fixed v1 matrix"):
+ resolve(types.SimpleNamespace(
+ gpus_per_node=8, scale_up_domain=8, scope="scale-up",
+ scale_up_transport="nvlink", scale_out_transport="", transport="nvlink",
+ ), 16)
+
+ def test_mori_ep16_pins_upstream_internode_v1_resources(self) -> None:
+ source = (HERE / "ep_mori.py").read_text()
+ for fragment in (
+ 'kernel_enum.InterNodeV1',
+ 'self.block_num = self._block_target = 96',
+ 'self.rdma_block_num = 64',
+ 'self.dispatch_warps = self.combine_warps = 8',
+ 'self.num_qps = 1',
+ '"gpu_per_node": gpus_per_node',
+ '"rdma_block_num": self.rdma_block_num',
+ '"num_qp_per_pe": self.num_qps',
+ '"use_external_inp_buf": self._external_input',
+ 'os.environ["MORI_EP_LAUNCH_CONFIG_MODE"] = "MANUAL"',
+ 'rdma_block_num=self.rdma_block_num',
+ ):
+ self.assertIn(fragment, source)
+ self.assertGreaterEqual(source.count("rdma_block_num=self.rdma_block_num"), 2)
+
+ def test_hybrid_deferred_provenance_wraps_before_conditioning_and_recaptures(self) -> None:  # Checks constructor ordering, the calls made by capture_deferred_provenance, its change detection, and where ep_harness.py invokes it.
+ path = HERE / "ep_deepep_hybrid.py"
+ source = path.read_text()
+ tree = ast.parse(source, str(path))
+ backend = next(
+ node for node in tree.body
+ if isinstance(node, ast.ClassDef) and node.name == "DeepEPHybridBackend"
+ )
+ methods = {node.name for node in backend.body if isinstance(node, ast.FunctionDef)}
+ self.assertIn("capture_deferred_provenance", methods)
+ constructor = next(node for node in backend.body if isinstance(node, ast.FunctionDef) and node.name == "__init__")
+ buffer_call = next(  # First call to the bare name HybridEPBuffer found while walking __init__.
+ node for node in ast.walk(constructor)
+ if isinstance(node, ast.Call) and isinstance(node.func, ast.Name)
+ and node.func.id == "HybridEPBuffer"
+ )
+ wrapper_install = next(  # Assignment to an attribute named update_template_config inside __init__.
+ node for node in ast.walk(constructor)
+ if isinstance(node, ast.Assign)
+ and any(
+ isinstance(target, ast.Attribute)
+ and target.attr == "update_template_config"
+ for target in node.targets
+ )
+ )
+ cache_line = source[:source.index('os.environ["HYBRID_EP_CACHE_DIR"]')].count("\n") + 1  # 1-based line of the first textual occurrence in the file.
+ self.assertLess(cache_line, buffer_call.lineno)  # The cache-dir text must appear on an earlier line than the buffer construction.
+ self.assertLess(buffer_call.lineno, wrapper_install.lineno)  # The wrapper assignment comes after the buffer construction.
+
+ capture = next(
+ node for node in backend.body
+ if isinstance(node, ast.FunctionDef) and node.name == "capture_deferred_provenance"
+ )
+ called = {  # Names of everything called in the method: bare names and attribute names alike.
+ node.func.id if isinstance(node.func, ast.Name) else node.func.attr
+ for node in ast.walk(capture) if isinstance(node, ast.Call)
+ and isinstance(node.func, (ast.Name, ast.Attribute))
+ }
+ self.assertTrue({"_hybrid_jit_evidence", "_require_cross_rank_equal", "all_gather_object"} <= called)
+ self.assertIn("changed after measurement", ast.get_source_segment(source, capture))
+
+ artifacts = [[  # One-element outer list: the stub below reads artifacts[0] on each call, so in-place edits are observed.
+ {"bytes": 1, "kernel_key": key, "sha256": digit * 64}
+ for key, digit in (("a", "1"), ("b", "2"), ("c", "3"))
+ ]]
+
+ class FakeCuda:  # Stand-in for torch.cuda; synchronize() does nothing.
+ @staticmethod
+ def synchronize() -> None:
+ return None
+
+ class FakeDist:  # Stand-in for the distributed module with a fixed world size of 2.
+ @staticmethod
+ def barrier() -> None:
+ return None
+
+ @staticmethod
+ def get_world_size() -> int:
+ return 2
+
+ @staticmethod
+ def all_gather_object(output, value) -> None:
+ output[:] = [copy.deepcopy(value), copy.deepcopy(value)]  # Both simulated ranks report the same value.
+
+ namespace = {  # Globals for the extracted method; every collaborator is stubbed.
+ "torch": types.SimpleNamespace(cuda=FakeCuda),
+ "dist": FakeDist,
+ "_hybrid_jit_evidence": lambda _root: copy.deepcopy(artifacts[0]),
+ "_require_cross_rank_equal": lambda _value, _label: None,
+ }
+ exec(compile(ast.Module(body=[capture], type_ignores=[]), str(path), "exec"), namespace)
+ state = types.SimpleNamespace(  # Passed as `self` to the extracted function.
+ _deferred_jit_diagnostics=None,
+ _deferred_semantic_snapshot=None,
+ _jit_root=Path("private-cache"),
+ _realized_config=hybrid_realized_config(),
+ backend_provenance={},
+ )
+ namespace["capture_deferred_provenance"](state)  # First capture.
+ artifacts[0][0]["kernel_key"] = "changed"  # A different kernel key on re-capture must be reported as a kernel-set change.
+ with self.assertRaisesRegex(RuntimeError, "kernel set changed"):
+ namespace["capture_deferred_provenance"](state)
+ artifacts[0][0]["kernel_key"] = "a"  # Restore the key, then change only the digest.
+ artifacts[0][0]["sha256"] = "f" * 64
+ with self.assertRaisesRegex(RuntimeError, "artifacts changed"):
+ namespace["capture_deferred_provenance"](state)
+
+ harness = (HERE / "ep_harness.py").read_text()
+ captures = [  # Character offsets of every "capture_deferred_provenance()" occurrence in ep_harness.py.
+ index for index in range(len(harness))
+ if harness.startswith("capture_deferred_provenance()", index)
+ ]
+ self.assertEqual(len(captures), 2)  # Exactly two occurrences.
+ self.assertLess(harness.index("for wt in conditioning_ladder:"), captures[0])  # First occurrence follows the first conditioning-loop header in the text ...
+ self.assertLess(captures[0], harness.index("oracle = _run_expert_oracle("))  # ... and precedes the first oracle call.
+ self.assertLess(harness.index("trace_sig = hashlib.sha256"), captures[1])  # Second occurrence follows the first trace-signature text.
+
+ def test_hybrid_diagnostic_hashes_do_not_split_series_identity(self) -> None:
+ keys, artifacts = hybrid_jit_provenance()
+ provenance = {
+ "deepep_tree": "b" * 40,
+ "jit_kernel_keys": keys,
+ "jit_shared_objects": artifacts,
+ "loaded_libraries": [{
+ "name": "hybrid_ep_cpp", "role": "deepep-hybrid-extension",
+ "sha256": "a" * 64,
+ }],
+ "realized_config": hybrid_realized_config(),
+ }
+ baseline = ep_harness._series_provenance(provenance)
+ changed = copy.deepcopy(provenance)
+ changed["jit_shared_objects"][0]["rank_artifacts"][0]["sha256"] = "f" * 64
+ self.assertEqual(ep_harness._series_provenance(changed), baseline)
+ changed = copy.deepcopy(provenance)
+ changed["loaded_libraries"][0]["sha256"] = "f" * 64
+ self.assertEqual(ep_harness._series_provenance(changed), baseline)
+ changed = copy.deepcopy(provenance)
+ changed["jit_kernel_keys"][0] = "changed-key"
+ self.assertNotEqual(ep_harness._series_provenance(changed), baseline)
+ changed = copy.deepcopy(provenance)
+ changed["realized_config"]["num_of_blocks_dispatch_api"] += 1
+ self.assertNotEqual(ep_harness._series_provenance(changed), baseline)
+ changed = copy.deepcopy(provenance)
+ changed["deepep_tree"] = "c" * 40
+ self.assertNotEqual(ep_harness._series_provenance(changed), baseline)
+
+ def test_v2_series_identity_uses_source_and_sass_not_container_metadata(self) -> None:
+ provenance = {
+ "deepep_tree": "a" * 40,
+ "loaded_libraries": [
+ {"name": "deep_ep._C.so", "role": "deepep-extension", "sha256": "1" * 64},
+ {"name": "libnccl.so.2", "role": "nccl", "sha256": "2" * 64},
+ ],
+ "jit_cubins": deepep_v2_jit_provenance(),
+ "jit_random_seed": "collectivex-deepep-v2-fa8a9b1",
+ }
+ baseline = contracts.series_provenance(provenance)
+ changed = copy.deepcopy(provenance)
+ changed["loaded_libraries"][0]["sha256"] = "f" * 64
+ changed["jit_cubins"][0]["cubin_sha256"] = "e" * 64
+ self.assertEqual(contracts.series_provenance(changed), baseline)
+ for mutate in (
+ lambda item: item["loaded_libraries"][1].update(sha256="f" * 64),
+ lambda item: item["jit_cubins"][0].update(source_sha256="f" * 64),
+ lambda item: item["jit_cubins"][0].update(sass_sha256="f" * 64),
+ lambda item: item.update(deepep_tree="f" * 40),
+ ):
+ changed = copy.deepcopy(provenance)
+ mutate(changed)
+ self.assertNotEqual(contracts.series_provenance(changed), baseline)
+
+ def test_mnnvl_resolution_has_no_ambiguous_signature_fallback(self) -> None:
+ self.assertEqual(
+ contracts.resolve_deepep_mnnvl(
+ requested=False, signature_parameters=(), deepep_commit=None,
+ ),
+ ({}, "not-requested"),
+ )
+ self.assertEqual(
+ contracts.resolve_deepep_mnnvl(
+ requested=True, signature_parameters=("allow_mnnvl",),
+ deepep_commit="a" * 40,
+ ),
+ ({"allow_mnnvl": True}, "explicit-allow-mnnvl"),
+ )
+ with self.assertRaises(contracts.ContractError):
+ contracts.resolve_deepep_mnnvl(
+ requested=True, signature_parameters=(),
+ deepep_commit="814e508537c6ffc775d59f6f1b9ba43f3a65968c",
+ )
+
+ def test_backend_provenance_requires_lineage_and_content_hashes(self) -> None:  # valid fixtures yield no issues; each mutation must be reported by field name
+ def record(role: str, name: str, digit: str) -> dict[str, str]:  # local helper for loaded-library records
+ return {"role": role, "name": name, "sha256": digit * 64}  # sha256 is one character repeated 64 times
+
+ hybrid_keys, hybrid_artifacts = hybrid_jit_provenance()  # module-level helper, not shown in this section
+ v2 = {  # DeepEP V2 fixture: pinned contract fields first, then the values set below
+ **contracts.DEEPEP_V2_V1_PROVENANCE,
+ "api_signature_sha256": "c" * 64,
+ "loaded_libraries": [
+ record("deepep-extension", "deep_ep._C", "1"),
+ record("nccl", "libnccl.so.2", "2"),
+ record("nvshmem", "libnvshmem_host.so.3", "3"),
+ ],
+ "jit_cubins": deepep_v2_jit_provenance(),
+ "jit_random_seed": "collectivex-deepep-v2-fa8a9b1",
+ "deterministic": False,
+ "num_experts": 256,
+ "tuning_num_experts": 256,
+ "allow_hybrid_mode": False,
+ "gin_enabled": False,
+ "communication_backend": "nccl-device-lsa",
+ }
+ deepep = {  # DeepEP fixture; note it has no loaded_libraries entry
+ "deepep_version": "1.1.0", "deepep_commit": "a" * 40,
+ "backend_lineage": "deepep-v1", "allow_mnnvl": False,
+ "mnnvl_comm": "not-requested", "mode": "normal",
+ "num_nvl_bytes": 1024, "num_rdma_bytes": 0,
+ }
+ hybrid = {  # DeepEP hybrid fixture
+ "deepep_commit": "a" * 40, "deepep_tree": "b" * 40,
+ "branch": "hybrid-ep", "backend_lineage": "deepep-hybrid",
+ "loaded_libraries": [
+ record("deepep-extension", "deep_ep_cpp", "1"),
+ record("deepep-hybrid-extension", "hybrid_ep_cpp", "2"),
+ ],
+ "jit_kernel_keys": hybrid_keys,
+ "jit_shared_objects": hybrid_artifacts,
+ "realized_config": hybrid_realized_config(),
+ }
+ uccl = {  # UCCL fixture with five library records
+ "uccl_version": "0.1.1", "uccl_commit": "pkg-0.1.1",
+ "uccl_wrapper_commit": "c" * 40, "backend_lineage": "uccl",
+ "uccl_dependency_versions": dict(contracts.UCCL_DEPENDENCY_VERSIONS),  # copy, so later mutation cannot touch the contract constant
+ "loaded_libraries": [
+ record("uccl-distribution", "uccl-0.1.1", "3"),
+ record("uccl-wrapper", "uccl-deepep-wrapper", "4"),
+ record("intervaltree-distribution", "intervaltree-3.1.0", "5"),
+ record("sortedcontainers-distribution", "sortedcontainers-2.4.0", "6"),
+ record("cuda-runtime", "nvidia-cuda-runtime-cu12-12.9.79", "7"),
+ ],
+ "mode": "normal", "num_nvl_bytes": 1024, "num_rdma_bytes": 0,
+ }
+ reference = {  # fixture passed below under the "nccl-ep" backend name
+ "nccl_version": "2.30.4", "collective_library": "nccl",
+ "backend_lineage": "nccl",
+ }
+ for backend, provenance in (  # each fixture must validate cleanly, then fail after one mutation
+ ("deepep", deepep), ("deepep-v2", v2), ("deepep-hybrid", hybrid),
+ ("uccl", uccl), ("nccl-ep", reference),
+ ):
+ self.assertEqual(contracts.backend_provenance_issues(backend, provenance), [])
+ changed = copy.deepcopy(provenance)
+ if "loaded_libraries" in changed:  # fixtures with libraries: corrupt the first library hash
+ changed["loaded_libraries"][0]["sha256"] = "invalid"
+ expected = "loaded_libraries"
+ else:  # fixtures without libraries: break the lineage label instead
+ changed["backend_lineage"] = "wrong"
+ expected = "backend_lineage"
+ self.assertIn(expected, contracts.backend_provenance_issues(backend, changed))
+
+ changed = copy.deepcopy(uccl)  # UCCL: a dependency version other than the fixture's must be flagged
+ changed["uccl_dependency_versions"]["intervaltree"] = "3.2.0"
+ self.assertIn(
+ "uccl_dependency_versions",
+ contracts.backend_provenance_issues("uccl", changed),
+ )
+ changed = copy.deepcopy(uccl)  # UCCL: dropping the sortedcontainers library record must be flagged
+ changed["loaded_libraries"] = [
+ item
+ for item in changed["loaded_libraries"]
+ if item["role"] != "sortedcontainers-distribution"
+ ]
+ self.assertIn(
+ "loaded_libraries", contracts.backend_provenance_issues("uccl", changed)
+ )
+
+ for field, mutate in (  # hybrid: each mutation must surface under its own field name
+ ("realized_config", lambda item: item["realized_config"].pop("hidden_dim")),
+ ("jit_kernel_keys", lambda item: item["jit_kernel_keys"].reverse()),
+ (
+ "jit_shared_objects",
+ lambda item: item["jit_shared_objects"][0]["rank_artifacts"][0].update(
+ sha256="invalid"
+ ),
+ ),
+ ):
+ with self.subTest(hybrid_field=field):
+ changed = copy.deepcopy(hybrid)
+ mutate(changed)
+ self.assertIn(
+ field,
+ contracts.backend_provenance_issues("deepep-hybrid", changed),
+ )
+
+ for field, value in (  # V2: a replacement cubin list and a different seed are both rejected
+ ("jit_cubins", [{"cache_key": "invalid", "cubin_sha256": "4" * 64}]),
+ ("jit_random_seed", "different-seed"),
+ ):
+ with self.subTest(v2_field=field):
+ changed = copy.deepcopy(v2)
+ changed[field] = value
+ self.assertIn(
+ field,
+ contracts.backend_provenance_issues("deepep-v2", changed),
+ )
+
+ changed = copy.deepcopy(v2)  # V2: enabling gin_enabled alone is flagged
+ changed["gin_enabled"] = True
+ self.assertIn("gin_enabled", contracts.backend_provenance_issues("deepep-v2", changed))
+ changed = copy.deepcopy(v2)  # V2: changing communication_backend alone is flagged
+ changed["communication_backend"] = "nccl-gin"
+ self.assertIn(
+ "communication_backend", contracts.backend_provenance_issues("deepep-v2", changed)
+ )
+ changed = copy.deepcopy(v2)  # V2: switching all three fields together is accepted
+ changed.update(
+ allow_hybrid_mode=True,
+ gin_enabled=True,
+ communication_backend="nccl-gin",
+ )
+ self.assertEqual(
+ contracts.backend_provenance_issues("deepep-v2", changed),
+ [],
+ )
+ changed["allow_hybrid_mode"] = False  # reverting only this field flags all three, in the order asserted below
+ self.assertEqual(
+ contracts.backend_provenance_issues("deepep-v2", changed),
+ ["allow_hybrid_mode", "communication_backend", "gin_enabled"],
+ )
+ for field, expected in contracts.DEEPEP_V2_V1_PROVENANCE.items():  # every pinned V2 field must be enforced
+ with self.subTest(v2_pin_field=field):
+ changed = copy.deepcopy(v2)
+ changed[field] = not expected if type(expected) is bool else "wrong"  # flip booleans, replace anything else with "wrong"
+ self.assertIn(
+ field,
+ contracts.backend_provenance_issues("deepep-v2", changed),
+ )
+
+ schema = json.loads((ROOT / "schemas" / "raw-case-v1.schema.json").read_text())  # schema checks for the hybrid provenance fields
+ provenance_schema = schema["properties"]["implementation"]["properties"]["provenance"]
+ self.assertEqual(
+ provenance_schema["properties"]["realized_config"],
+ {"$ref": "#/$defs/hybrid_realized_config"},
+ )
+ self.assertFalse(schema["$defs"]["hybrid_realized_config"]["additionalProperties"])  # no extra keys allowed
+ self.assertEqual(provenance_schema["properties"]["jit_kernel_keys"]["minItems"], 3)
+ self.assertEqual(provenance_schema["properties"]["jit_shared_objects"]["minItems"], 3)
+
+ self.assertEqual(contracts.collective_kernel_generation("nccl"), "nccl")  # known libraries are returned unchanged
+ self.assertEqual(contracts.collective_kernel_generation("rccl"), "rccl")
+ with self.assertRaises(contracts.ContractError):  # unknown library names are rejected
+ contracts.collective_kernel_generation("unknown")
+
+ def test_transport_resource_provenance_is_exact(self) -> None:  # covers hybrid_communication_domains, project_resource_profile and MoRI provenance
+ self.assertEqual(contracts.hybrid_communication_domains(8, 8), (8, 1))  # NOTE(review): argument meaning is defined in contracts, not visible here - confirm
+ self.assertEqual(contracts.hybrid_communication_domains(16, 8), (8, 2))
+ self.assertEqual(contracts.hybrid_communication_domains(8, 72), (8, 1))
+ self.assertEqual(contracts.hybrid_communication_domains(16, 72), (16, 1))
+
+ profile = contracts.project_resource_profile({
+ "num_nvl_bytes": 1024, "num_rdma_bytes": 2048,
+ "num_qps_per_rank": 32, "heap_size": "6G",
+ })
+ self.assertEqual(profile["persistent_bytes"], 3072)  # equals num_nvl_bytes + num_rdma_bytes of this input
+ self.assertEqual(profile["qps_per_rank"], 32)  # matches the input's num_qps_per_rank
+ self.assertEqual(
+ contracts.project_resource_profile({
+ "num_nvl_bytes": 0, "num_rdma_bytes": 0, "heap_size": "6G",
+ })["persistent_bytes"],
+ 0,  # explicit zero byte counts give 0, not the heap size
+ )
+ self.assertEqual(
+ contracts.project_resource_profile({"heap_size": "6G"})[
+ "persistent_bytes"
+ ],
+ "6G",  # with no byte counts given, the result equals the heap_size string
+ )
+
+ mori = {  # MoRI fixture that must validate with no issues
+ "mori_commit": "a" * 40, "kernel_type": "InterNodeV1",
+ "block_num": 96, "rdma_block_num": 64,
+ "dispatch_warps": 8, "combine_warps": 8, "num_qps": 1,
+ "use_external_inp_buf": True, "gpus_per_node": 8,
+ }
+ self.assertEqual(contracts.backend_provenance_issues("mori", mori), [])
+ for field in (  # each listed field must be reported once it is set to a falsy value
+ "block_num", "rdma_block_num", "dispatch_warps", "combine_warps",
+ "num_qps", "use_external_inp_buf", "gpus_per_node",
+ ):
+ changed = copy.deepcopy(mori)
+ changed[field] = False if field == "use_external_inp_buf" else 0  # False for the boolean field, 0 for the numeric ones
+ with self.subTest(mori_field=field):
+ self.assertIn(
+ field, contracts.backend_provenance_issues("mori", changed)
+ )
+
+ def test_routing_control_binds_binary_but_allows_treatment_configuration(self) -> None:  # checks which provenance changes move routing_implementation_control_sha256
+ hybrid_keys, hybrid_artifacts = hybrid_jit_provenance()  # module-level helper, not shown in this section
+ implementation = {  # baseline implementation record
+ "kernel_generation": "hybrid",
+ "name": "deepep-hybrid",
+ "provenance": {
+ "deepep_tree": "a" * 40,
+ "loaded_libraries": [{
+ "role": "deepep-extension", "name": "deep_ep_cpp", "sha256": "1" * 64,
+ }],
+ "local_experts": 32,
+ "num_experts": 256,
+ "num_sms": 24,
+ "jit_cache_key": "case-one",
+ "jit_cubins": [{"cache_key": "one", "cubin_sha256": "2" * 64}],
+ "jit_kernel_keys": hybrid_keys,
+ "jit_shared_objects": hybrid_artifacts,
+ "realized_config": hybrid_realized_config(),
+ },
+ "resource_profile": {"configured_units": 24},
+ }
+ baseline = contracts.routing_implementation_control_sha256(implementation)
+ treatment = copy.deepcopy(implementation)
+ treatment["provenance"].update({  # change only the fields listed here; the digest must stay equal
+ "local_experts": 36,
+ "num_experts": 288,
+ "jit_cache_key": "case-two",
+ "jit_cubins": [{"cache_key": "two", "cubin_sha256": "3" * 64}],
+ "jit_kernel_keys": ["changed-a", "changed-b", "changed-c"],
+ "jit_shared_objects": hybrid_jit_provenance(3)[1],
+ "realized_config": {
+ **hybrid_realized_config(),
+ "num_of_experts_per_rank": 36,
+ },
+ })
+ self.assertEqual(
+ contracts.routing_implementation_control_sha256(treatment), baseline
+ )
+ changed = copy.deepcopy(implementation)
+ changed["provenance"]["loaded_libraries"][0]["sha256"] = "4" * 64  # a different library hash also leaves the digest unchanged
+ self.assertEqual(
+ contracts.routing_implementation_control_sha256(changed), baseline
+ )
+ changed = copy.deepcopy(implementation)
+ changed["provenance"]["deepep_tree"] = "b" * 40  # a different deepep_tree must change the digest
+ self.assertNotEqual(
+ contracts.routing_implementation_control_sha256(changed), baseline
+ )
+ changed = copy.deepcopy(implementation)
+ changed["provenance"]["num_sms"] = 20  # a different num_sms must change the digest
+ self.assertNotEqual(
+ contracts.routing_implementation_control_sha256(changed), baseline
+ )
+
+ def test_runtime_pins_uccl_wheel_and_hybrid_source_tree(self) -> None:  # text-level checks: scripts must contain (or omit) these exact snippets
+ runtime = (ROOT / "runtime" / "run_in_container.sh").read_text()
+ common = (ROOT / "runtime" / "common.sh").read_text()
+ self.assertIn("cd /ix/experimental/CollectiveX", runtime)
+ for launcher_name in ("launch_single-slurm.sh", "launch_gb-nv.sh"):  # both launchers must carry the same mount and cache wiring
+ launcher = (ROOT / "launchers" / launcher_name).read_text()
+ self.assertIn("$MOUNT_SRC:/ix", launcher)
+ self.assertIn("cx_prepare_backend_cache", launcher)
+ self.assertNotIn('$(cx_prepare_backend_cache', launcher)  # must not be called through command substitution
+ self.assertIn("$CX_PREPARED_BACKEND_CACHE:/cx-cache", launcher)
+ self.assertIn("CX_BACKEND_CACHE_ROOT=/cx-cache", launcher)
+ self.assertIn("CX_BACKEND_SOURCE_ROOT=/ix/experimental/CollectiveX/.cx_sources", launcher)
+ self.assertIn('|| [ "$CX_BENCH" = deepep-hybrid ]', launcher)
+ self.assertIn("cx_prepare_backend_source", launcher)
+ cache_block = launcher[launcher.index('if [ "$CX_BENCH" = deepep-v2 ]'):]  # launcher text from the deepep-v2 condition onward
+ self.assertLess(  # text order: the backend-setup stage appears before cache preparation
+ cache_block.index("cx_set_failure_stage backend-setup"),
+ cache_block.index("cx_prepare_backend_cache"),
+ )
+ self.assertLess(  # text order: source preparation appears before the scheduler-allocation stage
+ cache_block.index("cx_prepare_backend_source"),
+ cache_block.index("cx_set_failure_stage scheduler-allocation"),
+ )
+ self.assertIn("--frandom-seed=$seed", runtime)
+ self.assertIn("DEEPEP_V2_JIT_RANDOM_SEED", runtime)
+ persisted = runtime[runtime.index("cx_persist_backend_env()") :]  # script text from cx_persist_backend_env() to the end
+ self.assertIn("CUDA_HOME CPATH NVCC_PREPEND_FLAGS", persisted)
+ self.assertIn(
+ "390c1320918972206546e44d79b132988f2818ec07e23afcd0595f7183916cec",
+ runtime,
+ )
+ self.assertIn("--require-hashes", runtime)
+ self.assertIn("d77aeab7f1bb52b615666fe178d26ced41fae08e", common)
+ self.assertIn("HEAD^{tree}", runtime)
+ self.assertIn("$PWD/.cx_backend/deepep-hybrid-", runtime)
+ self.assertIn("cx_materialize_backend_source deepep-hybrid", runtime)
+ self.assertIn("cx_materialize_backend_source deepep-v2", runtime)
+ self.assertIn("cx_deepep_hybrid_marker_content_sha256", runtime)
+ self.assertIn("cx_deepep_hybrid_cache_is_valid", runtime)
+ self.assertIn("cx_extension_pair_sha256", runtime)
+ self.assertIn(".collectivex-complete.tmp.", runtime)
+ self.assertNotIn("cx_fetch_revision", runtime)  # cx_fetch_revision may appear in common.sh only
+ self.assertIn("cx_fetch_revision", common)
+ self.assertIn("third-party/fmt", common)
+ hybrid = runtime[  # script text between cx_build_deepep_hybrid() and the UCCL EP marker
+ runtime.index("cx_build_deepep_hybrid()"):
+ runtime.index("# UCCL EP")
+ ]
+ configure = runtime[  # script text of cx_configure_deepep_hybrid_build(), up to the next named function
+ runtime.index("cx_configure_deepep_hybrid_build()"):
+ runtime.index("cx_deepep_hybrid_marker_content_sha256()")
+ ]
+ self.assertIn("cx_prepare_cuda_cccl", hybrid)
+ self.assertIn("unset NVSHMEM_DIR", hybrid)
+ self.assertIn(
+ "unset HYBRID_EP_MULTINODE USE_NIXL RDMA_CORE_HOME", configure
+ )
+ self.assertIn("cx_configure_deepep_hybrid_build || return 1", hybrid)
+ self.assertIn('[ "$(uname -m)" = x86_64 ]', configure)
+ self.assertIn('[ -n "${GLOO_SOCKET_IFNAME:-}" ]', configure)
+ self.assertIn('[ -d "/sys/class/infiniband/$rdma_name" ]', configure)
+ self.assertIn("command -v make", configure)
+ self.assertIn("/usr/include/infiniband/verbs.h", configure)
+ self.assertIn("export HYBRID_EP_MULTINODE=1 USE_NIXL=0", configure)
+ self.assertNotIn("cx_prepare_deepep_toolchain", hybrid)  # the hybrid build section must not reference the DeepEP toolchain helper
+ toolchain = runtime[  # script text between cx_prepare_deepep_toolchain() and cx_probe_deepep()
+ runtime.index("cx_prepare_deepep_toolchain()"):
+ runtime.index("cx_probe_deepep()")
+ ]
+ self.assertIn('overlay="$root/nvshmem-overlay"', toolchain)
+ self.assertIn("flock 8 || exit 1", toolchain)
+ self.assertIn('mv "$temporary" "$overlay" || exit 1', toolchain)
+ self.assertNotIn("/tmp/collectivex-nvshmem", toolchain)
+ jit = runtime[  # script text between the JIT reproducibility function and cx_probe_deepep_v2()
+ runtime.index("cx_enable_deepep_v2_jit_reproducibility()"):
+ runtime.index("cx_probe_deepep_v2()")
+ ]
+ self.assertIn('cccl="${CX_CUDA_CCCL:-}"', jit)
+ self.assertNotIn("/usr/local/cuda*", jit)
+ self.assertIn("deepep-v2-cache-v2|$cpu|sm${arch/./}", runtime)
+ self.assertNotIn("deepep-v2-cache-v1|", runtime)  # the older v1 cache-key prefix must be absent
+ self.assertIn('base="${CX_BACKEND_CACHE_ROOT:-}"', runtime)
+ self.assertNotIn("${CX_BACKEND_CACHE_ROOT:-$PWD/.cx_backend}", runtime)  # no fallback default to $PWD/.cx_backend
+ self.assertIn(
+ "recipe=aot-persistent-nvshmem-active-cuda-maxjobs16-v2", runtime
+ )
+ self.assertNotIn("recipe=aot-source-date-epoch-arch-maxjobs16-v1", runtime)
+ self.assertNotIn("recipe=$source_sha", runtime)
+ self.assertIn("pip=26.1.2|setuptools=82.0.1|wheel=0.47.0|ninja=1.13.0", runtime)
+ self.assertIn("manual-unverified", runtime)
+ self.assertIn("cx_deepep_v2_content_sha256", runtime)
+ self.assertIn("DeepEP V2 cache validation failed", runtime)
+ probe = runtime[  # script text between cx_probe_deepep_v2() and cx_deepep_v2_content_sha256()
+ runtime.index("cx_probe_deepep_v2()"):
+ runtime.index("cx_deepep_v2_content_sha256()")
+ ]
+ self.assertNotIn("torch.cuda.nccl.version", probe)
+ self.assertIn("ncclGetVersion", probe)
+ self.assertIn("runtime_version.value == 23004", probe)
+ self.assertIn("cx_nvidia_package_root nvidia-nccl-cu13 nccl", runtime)
+ self.assertIn("cx_nvidia_package_root nvidia-nvshmem-cu12 nvshmem", runtime)
+ self.assertNotIn("import os,nvidia.nccl", runtime)
+ self.assertNotIn("import os,nvidia.nvshmem", runtime)
+ self.assertIn(
+ 'export EP_JIT_CACHE_DIR="$stage_root/.cx_backend/deepep-v2-jit"', runtime
+ )
+ self.assertIn('stage_root="${CX_BACKEND_SOURCE_ROOT%/.cx_sources}"', runtime)
+ self.assertNotIn('export EP_JIT_CACHE_DIR="$root/jit"', runtime)
+ self.assertIn('EP_NVSHMEM_ROOT_DIR="$NVSHMEM_DIR"', runtime)
+ reference = (HERE / "ep_nccl.py").read_text()  # ep_nccl.py must assign kernel_generation through the contracts helper
+ self.assertIn("self.kernel_generation = contracts.collective_kernel_generation", reference)
+
+ def test_deepep_v2_cache_recovers_from_an_unpublished_partial_build(self) -> None:  # runs cx_build_deepep_v2 under stubs on a cache dir that has no marker
+ runtime = ROOT / "runtime" / "run_in_container.sh"
+ with tempfile.TemporaryDirectory() as temporary:
+ cache_key = "a" * 64
+ content_hash = "b" * 64
+ root = Path(temporary) / f"deepep-v2-{cache_key}"  # directory name embeds the cache key
+ root.mkdir(mode=0o700)
+ marker = root / ".collectivex-complete"  # not created here; the build is expected to write it
+ stale = root / "stale-partial-build"
+ stale.write_text("partial\n")  # leftover file that stands in for an interrupted build
+ command = r'''
+ set -euo pipefail
+ eval "$(sed -n '/^cx_build_deepep_v2()/,/^}/p' "$1")"
+ cache_root="$2"; expected_revision="$3"; expected_tree="$4"; expected_fmt="$5"
+ expected_content="$6"
+ cx_log() { :; }
+ cx_verify_backend_cache_mount() { return 0; }
+ cx_cuda_arch() { printf '9.0'; }
+ cx_deepep_v2_root() { printf '%s' "$cache_root"; }
+ cx_activate_deepep_v2() { export DEEPEP_V2_COMMIT="$expected_revision"; }
+ cx_prepare_deepep_toolchain() { export NVSHMEM_DIR=/tmp/cx-test-nvshmem; }
+ cx_probe_deepep_v2() { return 0; }
+ cx_deepep_v2_content_sha256() { printf '%s' "$expected_content"; }
+ cx_deepep_v2_cache_is_valid() {
+ test -f "$2" && test "$(wc -l < "$2" | tr -d ' ')" = 5
+ }
+ cx_enable_deepep_v2_jit_reproducibility() { return 0; }
+ cx_materialize_backend_source() { mkdir -p "$2/third-party/fmt"; }
+ flock() { return 0; }
+ python3() {
+ if [ "${1:-}" = -m ] && [ "${2:-}" = venv ]; then
+ mkdir -p "$3/bin"
+ printf '#!/bin/sh\nexit 0\n' > "$3/bin/python"
+ chmod 700 "$3/bin/python"
+ fi
+ return 0
+ }
+ git() {
+ case " $* " in
+ *' third-party/fmt rev-parse HEAD '*) printf '%s\n' "$expected_fmt" ;;
+ *' rev-parse HEAD^{tree} '*) printf '%s\n' "$expected_tree" ;;
+ *' show -s --format=%ct HEAD '*) printf '1\n' ;;
+ *) return 0 ;;
+ esac
+ }
+ cx_git_in_tree() { shift; git "$@"; }
+ cx_build_deepep_v2
+ '''
+ subprocess.run(  # bash evals the function extracted by sed, with the stubs from the script above
+ [
+ "bash", "-c", command, "_", str(runtime), str(root),
+ COMMIT, TREE, FMT_COMMIT, content_hash,
+ ],
+ check=True,  # a non-zero exit fails the test
+ )
+ self.assertFalse(stale.exists())  # the leftover file must be gone
+ self.assertEqual(  # marker holds five newline-terminated fields in this order
+ marker.read_text(),
+ f"{COMMIT}\n{TREE}\n{FMT_COMMIT}\n{cache_key}\n{content_hash}\n",
+ )
+ self.assertEqual(list(root.glob(".collectivex-complete.tmp.*")), [])  # no temporary marker files remain
+
+ def test_deepep_v2_published_cache_is_never_deleted_after_probe_failure(self) -> None:  # cache stub says valid, probe stub fails: build must fail and leave files alone
+ runtime = ROOT / "runtime" / "run_in_container.sh"
+ with tempfile.TemporaryDirectory() as temporary:
+ cache_key = "a" * 64
+ root = Path(temporary) / f"deepep-v2-{cache_key}"
+ root.mkdir(mode=0o700)
+ marker = root / ".collectivex-complete"
+ marker.write_text("published\n")  # marker already present before the build runs
+ sentinel = root / "active-reader"
+ sentinel.write_text("active\n")  # extra file whose survival shows the cache was not reset
+ command = r'''
+ set -euo pipefail
+ eval "$(sed -n '/^cx_build_deepep_v2()/,/^}/p' "$1")"
+ cache_root="$2"
+ cx_log() { :; }
+ cx_verify_backend_cache_mount() { return 0; }
+ cx_cuda_arch() { printf '9.0'; }
+ cx_deepep_v2_root() { printf '%s' "$cache_root"; }
+ cx_deepep_v2_cache_is_valid() { return 0; }
+ cx_activate_deepep_v2() { return 0; }
+ cx_prepare_deepep_toolchain() { return 0; }
+ cx_enable_deepep_v2_jit_reproducibility() { return 0; }
+ cx_probe_deepep_v2() { return 1; }
+ ! cx_build_deepep_v2
+ '''
+ subprocess.run(  # the script negates the build call, so exit 0 here means the build reported failure
+ ["bash", "-c", command, "_", str(runtime), str(root)],
+ check=True,
+ )
+ self.assertEqual(sentinel.read_text(), "active\n")  # both files must be byte-for-byte unchanged
+ self.assertEqual(marker.read_text(), "published\n")
+
+ def test_deepep_v2_corrupt_published_cache_fails_without_reset(self) -> None:  # marker exists but the validity stub fails: build must fail and leave files alone
+ runtime = ROOT / "runtime" / "run_in_container.sh"
+ with tempfile.TemporaryDirectory() as temporary:
+ cache_key = "a" * 64
+ root = Path(temporary) / f"deepep-v2-{cache_key}"
+ root.mkdir(mode=0o700)
+ marker = root / ".collectivex-complete"
+ marker.write_text("corrupt\n")  # marker present, but its content is not a valid record
+ sentinel = root / "active-reader"
+ sentinel.write_text("active\n")  # extra file whose survival shows the cache was not reset
+ command = r'''
+ set -euo pipefail
+ eval "$(sed -n '/^cx_build_deepep_v2()/,/^}/p' "$1")"
+ cache_root="$2"
+ cx_log() { :; }
+ cx_verify_backend_cache_mount() { return 0; }
+ cx_cuda_arch() { printf '9.0'; }
+ cx_deepep_v2_root() { printf '%s' "$cache_root"; }
+ cx_deepep_v2_cache_is_valid() { return 1; }
+ flock() { return 0; }
+ ! cx_build_deepep_v2
+ '''
+ subprocess.run(  # the script negates the build call, so exit 0 here means the build reported failure
+ ["bash", "-c", command, "_", str(runtime), str(root)],
+ check=True,
+ )
+ self.assertEqual(sentinel.read_text(), "active\n")  # both files must be byte-for-byte unchanged
+ self.assertEqual(marker.read_text(), "corrupt\n")
+
+ def test_deepep_v2_marker_requires_private_owned_cache_objects(self) -> None:  # marker helper prints the content hash at mode 0600 and fails at 0644
+ runtime = ROOT / "runtime" / "run_in_container.sh"
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary) / "cache"
+ root.mkdir(mode=0o700)
+ (root / "source").mkdir(mode=0o700)
+ (root / "venv").mkdir(mode=0o700)
+ marker = root / ".collectivex-complete"
+ cache_key = "a" * 64
+ content_hash = "b" * 64
+ marker.write_text(  # five newline-terminated fields; the content hash is last
+ f"{COMMIT}\n{TREE}\n{FMT_COMMIT}\n{cache_key}\n{content_hash}\n"
+ )
+ root.chmod(0o2700)  # setgid bit plus owner-only permissions
+ marker.chmod(0o600)
+ command = r'''
+ set -euo pipefail
+ eval "$(sed -n '/^cx_deepep_v2_marker_content_sha256()/,/^}/p' "$1")"
+ cx_deepep_v2_marker_content_sha256 "$2" "$3" "$4" "$5" "$6" "$7"
+ '''
+ args = [  # positional args after "_": script, root, marker, commit, tree, fmt commit, cache key
+ "bash", "-c", command, "_", str(runtime), str(root), str(marker),
+ COMMIT, TREE, FMT_COMMIT, cache_key,
+ ]
+ valid = subprocess.run(args, text=True, capture_output=True, check=True)
+ self.assertEqual(valid.stdout, content_hash)  # stdout is exactly the marker's last field, with no newline
+ marker.chmod(0o644)  # a group/world-readable marker must be rejected
+ self.assertNotEqual(subprocess.run(args).returncode, 0)
+
+ def test_deepep_hybrid_marker_requires_a_private_regular_file(self) -> None:  # hybrid marker helper prints the content hash at mode 0600 and fails at 0644
+ runtime = ROOT / "runtime" / "run_in_container.sh"
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary) / "cache"
+ root.mkdir(mode=0o700)
+ marker = root / ".collectivex-complete"
+ content_hash = "b" * 64
+ marker.write_text(f"{COMMIT}\n{TREE}\n{content_hash}\n")  # three newline-terminated fields; the content hash is last
+ root.chmod(0o2700)  # setgid bit plus owner-only permissions
+ marker.chmod(0o600)
+ command = r'''
+ set -euo pipefail
+ eval "$(sed -n '/^cx_deepep_hybrid_marker_content_sha256()/,/^}/p' "$1")"
+ cx_deepep_hybrid_marker_content_sha256 "$2" "$3" "$4" "$5"
+ '''
+ args = [  # positional args after "_": script, root, marker, commit, tree
+ "bash", "-c", command, "_", str(runtime), str(root), str(marker),
+ COMMIT, TREE,
+ ]
+ valid = subprocess.run(args, text=True, capture_output=True, check=True)
+ self.assertEqual(valid.stdout, content_hash)  # stdout is exactly the marker's last field, with no newline
+ marker_contract = runtime.read_text()
+ marker_contract = marker_contract[  # script text between the marker helper and cx_deepep_hybrid_cache_is_valid()
+ marker_contract.index("cx_deepep_hybrid_marker_content_sha256()"):
+ marker_contract.index("cx_deepep_hybrid_cache_is_valid()")
+ ]
+ self.assertIn("marker_item.st_uid != root_item.st_uid", marker_contract)  # ownership is compared against the cache root
+ self.assertNotIn("st_uid != os.getuid()", marker_contract)  # and not against the calling user's uid
+ marker.chmod(0o644)  # a group/world-readable marker must be rejected
+ self.assertNotEqual(subprocess.run(args).returncode, 0)
+
+ def test_deepep_v2_installed_content_digest_binds_every_distribution_file(self) -> None:  # digest must change with file content and fail on a symlinked file
+ runtime = ROOT / "runtime" / "run_in_container.sh"
+ with tempfile.TemporaryDirectory() as temporary:
+ site = Path(temporary) / "venv" / "lib" / "python3.11" / "site-packages"  # fake site-packages tree
+ package = site / "deep_ep"
+ info = site / "deep_ep-2.0.0.dist-info"
+ package.mkdir(parents=True)
+ info.mkdir()
+ (package / "__init__.py").write_text("__version__ = '2.0.0'\n")
+ extension = package / "_C.so"
+ extension.write_bytes(b"extension-one")
+ (info / "METADATA").write_text(
+ "Metadata-Version: 2.1\nName: deep_ep\nVersion: 2.0.0\n"
+ )
+ (info / "RECORD").write_text(  # lists all four files with empty hash and size columns
+ "deep_ep/__init__.py,,\n"
+ "deep_ep/_C.so,,\n"
+ "deep_ep-2.0.0.dist-info/METADATA,,\n"
+ "deep_ep-2.0.0.dist-info/RECORD,,\n"
+ )
+ command = r'''
+ set -euo pipefail
+ eval "$(sed -n '/^cx_deepep_v2_content_sha256()/,/^}/p' "$1")"
+ cx_deepep_v2_content_sha256
+ '''
+ env = {  # point the shell function at the fake site-packages and venv
+ **os.environ,
+ "PYTHONPATH": str(site),
+ "VIRTUAL_ENV": str(Path(temporary) / "venv"),
+ }
+ first = subprocess.run(
+ ["bash", "-c", command, "_", str(runtime)],
+ text=True, capture_output=True, check=True, env=env,
+ ).stdout
+ extension.write_bytes(b"extension-two")  # change only the extension bytes between the two runs
+ second = subprocess.run(
+ ["bash", "-c", command, "_", str(runtime)],
+ text=True, capture_output=True, check=True, env=env,
+ ).stdout
+ self.assertRegex(first, r"^[0-9a-f]{64}$")  # 64 lowercase hex characters
+ self.assertRegex(second, r"^[0-9a-f]{64}$")
+ self.assertNotEqual(first, second)  # the content change must change the digest
+ extension.unlink()
+ outside = Path(temporary) / "outside.so"
+ outside.write_bytes(b"outside")
+ extension.symlink_to(outside)  # replace the extension with a symlink to a file outside the package
+ self.assertNotEqual(  # the function must now exit non-zero
+ subprocess.run(
+ ["bash", "-c", command, "_", str(runtime)], env=env,
+ ).returncode,
+ 0,
+ )
+
+ def test_uccl_content_identity_excludes_install_generated_files(self) -> None:  # _is_uccl_runtime_payload keeps binaries, drops bytecode caches and dist-info
+ keep = load_uccl_function(  # helper not shown here; second argument presumably supplies the function's globals - confirm
+ "_is_uccl_runtime_payload", {"PurePosixPath": PurePosixPath}
+ )
+ self.assertTrue(keep("uccl/ep.abi3.so"))
+ self.assertTrue(keep("uccl.libs/libnuma.so"))
+ self.assertFalse(keep("uccl/__pycache__/collective.cpython-312.pyc"))  # bytecode cache is not payload
+ self.assertFalse(keep("uccl-0.1.1.dist-info/RECORD"))  # dist-info metadata is not payload
+
+ def test_uccl_dependency_versions_are_exact(self) -> None:  # installed versions must equal the contract, and the schema must pin the same values
+ installed = dict(contracts.UCCL_DEPENDENCY_VERSIONS)  # mutable copy read by the metadata stub below
+ dependency_versions = load_uccl_function(
+ "_uccl_dependency_versions",
+ {
+ "contracts": contracts,
+ "metadata": types.SimpleNamespace(  # stub whose version() looks packages up in the dict above
+ version=lambda package: installed[package]
+ ),
+ },
+ )
+ self.assertEqual(dependency_versions(), contracts.UCCL_DEPENDENCY_VERSIONS)
+ installed["intervaltree"] = "3.2.0"  # drift one version; the next call must raise
+ with self.assertRaisesRegex(RuntimeError, "differ from the v1 contract"):
+ dependency_versions()
+
+ schema = json.loads((ROOT / "schemas" / "raw-case-v1.schema.json").read_text())
+ dependency_schema = schema["properties"]["implementation"]["properties"][
+ "provenance"
+ ]["properties"]["uccl_dependency_versions"]
+ self.assertFalse(dependency_schema["additionalProperties"])  # no extra packages allowed
+ self.assertEqual(  # each package's schema const must equal the contract's version
+ {
+ package: definition["const"]
+ for package, definition in dependency_schema["properties"].items()
+ },
+ contracts.UCCL_DEPENDENCY_VERSIONS,
+ )
+
+ def test_uccl_support_dependency_content_is_path_free(self) -> None:  # evidence must match a manifest of the source file only and contain no temp path
+ with tempfile.TemporaryDirectory() as directory:
+ root = Path(directory)
+ source_entry = PurePosixPath("intervaltree/__init__.py")
+ cache_entry = PurePosixPath("intervaltree/__pycache__/__init__.pyc")
+ metadata_entry = PurePosixPath("intervaltree-3.1.0.dist-info/RECORD")
+ for entry in (source_entry, cache_entry, metadata_entry):
+ path = root / entry
+ path.parent.mkdir(parents=True, exist_ok=True)
+ path.write_bytes(entry.as_posix().encode())  # each file's content is its own relative path
+ distribution = types.SimpleNamespace(  # stand-in exposing the files and locate_file used here
+ files=[source_entry, cache_entry, metadata_entry],
+ locate_file=lambda item: root / item,
+ )
+ evidence_for = load_uccl_function(
+ "_python_dependency_evidence",
+ {
+ "Path": Path,
+ "PurePosixPath": PurePosixPath,
+ "contracts": contracts,
+ "metadata": types.SimpleNamespace(
+ distribution=lambda package: distribution
+ ),
+ },
+ )
+ evidence = evidence_for("intervaltree", "3.1.0")
+ self.assertEqual(
+ evidence,
+ contracts.content_manifest_evidence(
+ role="intervaltree-distribution",
+ name="intervaltree-3.1.0",
+ files=[(source_entry.as_posix(), root / source_entry)],  # only the source file; cache and dist-info are left out
+ ),
+ )
+ self.assertNotIn(str(root), json.dumps(evidence))  # the temp directory path must not appear in the evidence
+
+ def test_uccl_hashes_the_mapped_pinned_libcudart_without_exposing_paths(  # library is taken from a maps file; no temp path may leak
+ self,
+ ) -> None:
+ with tempfile.TemporaryDirectory() as directory:
+ root = Path(directory)
+ entry = PurePosixPath("nvidia/cuda_runtime/lib/libcudart.so.12")
+ library = root / entry
+ library.parent.mkdir(parents=True)
+ library.write_bytes(b"pinned CUDA 12 runtime")
+ distribution = types.SimpleNamespace(  # stand-in distribution that lists only the library file
+ files=[entry],
+ locate_file=lambda item: root / item,
+ )
+ evidence_for = load_uccl_function(
+ "_loaded_libcudart_evidence",
+ {
+ "Path": Path,
+ "PurePosixPath": PurePosixPath,
+ "contracts": contracts,
+ "metadata": types.SimpleNamespace(
+ distribution=lambda package: distribution
+ ),
+ },
+ )
+ maps = root / "maps"
+ maps.write_text(f"7f00-7f10 r-xp 00000000 00:00 0 {library}\n")  # one fake mapping line ending in the library path
+ evidence = evidence_for("12.9.79", maps)
+ self.assertEqual(
+ evidence,
+ contracts.content_manifest_evidence(
+ role="cuda-runtime",
+ name="nvidia-cuda-runtime-cu12-12.9.79",
+ files=[("libcudart.so", library)],  # recorded under a fixed name, not its path
+ ),
+ )
+ self.assertNotIn(str(root), json.dumps(evidence))  # the temp directory path must not appear in the evidence
+
+ unowned = root / "unowned" / library.name  # same bytes at a path the stub distribution does not list
+ unowned.parent.mkdir()
+ unowned.write_bytes(library.read_bytes())
+ maps.write_text(f"7f00-7f10 r-xp 00000000 00:00 0 {unowned}\n")
+ with self.assertRaisesRegex(RuntimeError, "not owned") as raised:
+ evidence_for("12.9.79", maps)
+ self.assertNotIn(str(root), str(raised.exception))  # the error text must not leak the temp path either
+
+ def test_private_runtime_logs_are_not_public_artifacts(self) -> None:  # log file is 0600 in a 0700 directory outside ROOT
+ path = subprocess.check_output(  # cx_private_log_path prints the log path on stdout
+ [
+ "bash", "-c", 'source "$1"; cx_private_log_path test', "_",
+ str(ROOT / "runtime" / "common.sh"),
+ ],
+ text=True,
+ env={**os.environ, "COLLECTIVEX_EXECUTION_ID": "contract-test"},
+ ).strip()
+ try:
+ log = Path(path)
+ self.assertEqual(stat.S_IMODE(log.stat().st_mode), 0o600)  # log file: owner read/write only
+ self.assertEqual(stat.S_IMODE(log.parent.stat().st_mode), 0o700)  # parent directory: owner only
+ self.assertFalse(log.is_relative_to(ROOT))  # log must not live under ROOT
+ finally:
+ shutil.rmtree(Path(path).parent, ignore_errors=True)  # remove the log's parent directory even if an assertion failed
+
+ def test_private_runtime_logs_reject_traversal_and_symlinks(self) -> None:  # cx_private_log_path must fail on ".." inputs and on symlinked targets
+ common = str(ROOT / "runtime" / "common.sh")
+ for variable, value in (  # override one input at a time with ".."
+ ("COLLECTIVEX_EXECUTION_ID", ".."),
+ ("CX_TEST_LABEL", ".."),
+ ):
+ environment = {
+ **os.environ,
+ "COLLECTIVEX_EXECUTION_ID": "contract-adversarial",
+ "CX_TEST_LABEL": "test",
+ variable: value,  # last key wins, replacing the safe default above
+ }
+ result = subprocess.run(
+ ["bash", "-c", 'source "$1"; cx_private_log_path "$CX_TEST_LABEL"', "_", common],
+ text=True,
+ capture_output=True,
+ env=environment,
+ )
+ self.assertNotEqual(result.returncode, 0)
+ self.assertNotIn(value, result.stderr)  # the rejected value must not be echoed to stderr
+
+ private_root = Path(f"/tmp/inferencex-collectivex-{os.getuid()}")  # NOTE(review): presumably the root cx_private_log_path uses - confirm in common.sh
+ private_root.mkdir(mode=0o700, exist_ok=True)
+ self.assertFalse(private_root.is_symlink())
+ os.chmod(private_root, 0o700)
+ with tempfile.TemporaryDirectory() as temporary:
+ target = Path(temporary)
+ tag = f"contract-symlink-{os.getpid()}"  # pid in the name keeps it unique per test process
+ link = private_root / tag
+ link.symlink_to(target, target_is_directory=True)  # case 1: the execution-id directory is a symlink
+ try:
+ result = subprocess.run(
+ ["bash", "-c", 'source "$1"; cx_private_log_path test', "_", common],
+ text=True,
+ capture_output=True,
+ env={**os.environ, "COLLECTIVEX_EXECUTION_ID": tag},
+ )
+ self.assertNotEqual(result.returncode, 0)
+ self.assertEqual(list(target.iterdir()), [])  # nothing was written through the symlink
+ finally:
+ link.unlink(missing_ok=True)
+
+ tag = f"contract-log-symlink-{os.getpid()}"
+ directory = private_root / tag
+ directory.mkdir(mode=0o700)
+ target_file = target / "target"
+ target_file.write_text("unchanged")
+ log_link = directory / "test.log"
+ log_link.symlink_to(target_file)  # case 2: the log path itself is a symlink to a file
+ try:
+ result = subprocess.run(
+ ["bash", "-c", 'source "$1"; cx_private_log_path test', "_", common],
+ text=True,
+ capture_output=True,
+ env={**os.environ, "COLLECTIVEX_EXECUTION_ID": tag},
+ )
+ self.assertNotEqual(result.returncode, 0)
+ self.assertEqual(target_file.read_text(), "unchanged")  # the symlink target was not modified
+ finally:
+ log_link.unlink(missing_ok=True)
+ directory.rmdir()
+
+ def test_operator_config_failure_is_value_free(self) -> None:  # a failing config yields a generic error that never echoes the config's own output
+ with tempfile.TemporaryDirectory() as temporary:
+ config = Path(temporary) / "operator.env"
+ config.write_text("printf 'private-config-token\\n' >&2\nfalse\n")  # shell text that prints a token to stderr, then fails
+ config.chmod(0o600)
+ result = subprocess.run(
+ ["bash", "-c",
+ 'export COLLECTIVEX_EXECUTION_ID="operator-failure-$$"; '
+ "trap 'cx_cleanup_private_logs 0' EXIT; source \"$1\"; "
+ "cx_load_operator_config", "_",
+ str(ROOT / "runtime" / "common.sh")],
+ text=True,
+ capture_output=True,
+ env={
+ **os.environ,
+ "CX_RUNNER": "h100-dgxc",
+ "COLLECTIVEX_OPERATOR_CONFIG": str(config),
+ },
+ )
+ self.assertNotEqual(result.returncode, 0)
+ self.assertIn("runner-local configuration failed", result.stderr)  # generic failure message is present
+ self.assertNotIn("private-config-token", result.stderr)  # the token from the config must not reach stderr
+
+ def test_ephemeral_operator_config_is_removed_after_source(self) -> None:  # with the EPHEMERAL flag set, the config file is deleted once loaded
+ with tempfile.TemporaryDirectory() as temporary:
+ config = Path(temporary) / "operator.env"
+ decoy = Path(temporary) / "decoy"
+ decoy.write_text("keep")  # sibling file that must survive the cleanup
+ config.write_text(json.dumps(operator_config(Path(temporary) / "storage")))  # operator_config: module-level helper, not shown in this section
+ config.chmod(0o600)
+ result = subprocess.run(
+ [
+ "bash", "-c",
+ 'export COLLECTIVEX_EXECUTION_ID="operator-ephemeral-$$"; '
+ "trap 'cx_cleanup_private_logs 0' EXIT; "
+ 'config="$COLLECTIVEX_OPERATOR_CONFIG"; source "$1"; '  # remember the path before loading
+ 'cx_load_operator_config; test ! -e "$config"; '  # the file must be gone right after the load
+ 'test "$CX_PARTITION" = test; '  # and a value from the config must have been exported
+ 'test -z "${COLLECTIVEX_OPERATOR_CONFIG+x}"',  # and the path variable itself must be unset
+ "_", str(ROOT / "runtime" / "common.sh"),
+ ],
+ text=True,
+ capture_output=True,
+ env={
+ **os.environ,
+ "CX_RUNNER": "h100-dgxc",
+ "COLLECTIVEX_OPERATOR_CONFIG": str(config),
+ "COLLECTIVEX_OPERATOR_CONFIG_EPHEMERAL": "1",
+ },
+ )
+ self.assertEqual(result.returncode, 0, result.stderr)  # stderr is shown as the failure message
+ self.assertFalse(config.exists())
+ self.assertEqual(decoy.read_text(), "keep")  # only the config file was removed
+
+ def test_operator_config_is_strict_per_runner_json(self) -> None:  # drives cx_load_operator_config from runtime/common.sh in bash subprocesses
+ command = (
+ 'source "$1"; export COLLECTIVEX_EXECUTION_ID="operator-config-$$"; '
+ "trap 'cx_cleanup_private_logs 0' EXIT; cx_load_operator_config; "
+ 'test "$CX_PARTITION" = test; '
+ 'test -z "${COLLECTIVEX_OPERATOR_CONFIG_CONTENT+x}"; '  # this check and the next require the variable to be unset after loading
+ 'test -z "${ENROOT_CACHE_PATH+x}"'
+ )
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary)
+ document = operator_config(root / "storage")
+ config = root / "operator.json"
+ config.write_text(json.dumps(document))
+ config.chmod(0o600)
+ for runner in capability.PLATFORMS:  # run the load once per capability.PLATFORMS entry
+ with self.subTest(runner=runner):
+ result = subprocess.run(
+ ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh")],
+ text=True,
+ capture_output=True,
+ env={
+ **os.environ,
+ "CX_RUNNER": runner,
+ "ENROOT_CACHE_PATH": "/private/stale-enroot-cache",  # stale value; the command expects it to be unset afterwards
+ "COLLECTIVEX_OPERATOR_CONFIG": str(config),
+ },
+ )
+ self.assertEqual(result.returncode, 0, result.stderr)
+
+ lock_dir = root / "amd-locks"
+ document["runners"]["mi355x"]["lock_dir"] = str(lock_dir)  # give mi355x an explicit lock_dir and rewrite the config
+ config.write_text(json.dumps(document))
+ config.chmod(0o600)
+ canonical = subprocess.run(
+ [
+ "bash",
+ "-c",
+ 'source "$1"; export COLLECTIVEX_EXECUTION_ID="canonical-lock-$$"; '
+ "trap 'cx_cleanup_private_logs 0' EXIT; cx_load_operator_config; "
+ 'cx_lock_canonical_gha_env mi355x; test "$CX_LOCK_DIR" = "$2"',  # $2 is the lock_dir path passed below
+ "_",
+ str(ROOT / "runtime" / "common.sh"),
+ str(lock_dir),
+ ],
+ text=True,
+ capture_output=True,
+ env={
+ **os.environ,
+ "CX_RUNNER": "mi355x",
+ "CX_SHARD_FILE": ".shards/test.json",
+ "CX_SHARD_SKU": "mi355x",
+ "CX_NODES": "1",
+ "CX_GPUS_PER_NODE": "8",
+ "COLLECTIVEX_CANONICAL_GHA": "1",
+ "COLLECTIVEX_OPERATOR_CONFIG": str(config),
+ "COLLECTIVEX_SOURCE_SHA": "a" * 40,
+ "GITHUB_ACTIONS": "true",
+ "GITHUB_RUN_ATTEMPT": "1",
+ "GITHUB_RUN_ID": "1",
+ "GITHUB_WORKSPACE": str(root.resolve()),
+ },
+ )
+ self.assertEqual(canonical.returncode, 0, canonical.stderr)
+
+ selected_only = {  # document that contains only the h100-dgxc runner entry
+ "schema_version": 1,
+ "runners": {"h100-dgxc": document["runners"]["h100-dgxc"]},
+ }
+ result = subprocess.run(
+ [
+ "bash", "-c", command + '; test "$CX_STAGE_DIR" = "$2"', "_",
+ str(ROOT / "runtime" / "common.sh"), str(root / "storage"),
+ ],
+ text=True,
+ capture_output=True,
+ env={
+ **os.environ,
+ "CX_RUNNER": "h100-dgxc",
+ "CX_STAGE_DIR": "/private/stale-stage",  # stale value; the command expects CX_STAGE_DIR to equal root/storage
+ "ENROOT_CACHE_PATH": "/private/stale-enroot-cache",
+ "COLLECTIVEX_OPERATOR_CONFIG_LOADED": "1",
+ "COLLECTIVEX_OPERATOR_CONFIG_CONTENT": json.dumps(selected_only),  # config JSON handed over through the environment
+ "COLLECTIVEX_OPERATOR_CONFIG_REQUIRED": "1",
+ },
+ )
+ self.assertEqual(result.returncode, 0, result.stderr)
+
+ rejected = json.loads(json.dumps(document))  # JSON round-trip used as a deep copy
+ rejected["runners"]["h100-dgxc"]["shell"] = "private-command"
+ boolean_version = {**document, "schema_version": True}
+ missing_socket = json.loads(json.dumps(document))
+ del missing_socket["runners"]["h100-dgxc"]["socket_ifname"]
+ missing_rdma = json.loads(json.dumps(document))
+ del missing_rdma["runners"]["mi355x"]["rdma_devices"]
+ missing_amd_stage = json.loads(json.dumps(document))
+ del missing_amd_stage["runners"]["mi325x"]["stage_dir"]
+ missing_nvidia_stage = json.loads(json.dumps(document))
+ del missing_nvidia_stage["runners"]["h100-dgxc"]["stage_dir"]
+ for invalid in (rejected, boolean_version, missing_nvidia_stage):  # each document must make the h100-dgxc load fail
+ config.write_text(json.dumps(invalid))
+ config.chmod(0o600)
+ result = subprocess.run(
+ ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh")],
+ text=True,
+ capture_output=True,
+ env={
+ **os.environ,
+ "CX_RUNNER": "h100-dgxc",
+ "COLLECTIVEX_OPERATOR_CONFIG": str(config),
+ },
+ )
+ self.assertNotEqual(result.returncode, 0)
+ self.assertNotIn("private-command", result.stderr)  # the rejected "shell" value must not show up in stderr
+
+ for valid, runner in (  # these documents must still load for h100-dgxc with the "1 nvlink" profile
+ (missing_socket, "h100-dgxc"),
+ (missing_rdma, "h100-dgxc"),
+ (missing_amd_stage, "h100-dgxc"),
+ ):
+ config.write_text(json.dumps(valid))
+ config.chmod(0o600)
+ result = subprocess.run(
+ [
+ "bash", "-c", command + "; cx_apply_network_profile 1 nvlink",
+ "_", str(ROOT / "runtime" / "common.sh"),
+ ],
+ text=True,
+ capture_output=True,
+ env={
+ **os.environ,
+ "CX_RUNNER": runner,
+ "COLLECTIVEX_OPERATOR_CONFIG": str(config),
+ },
+ )
+ self.assertEqual(result.returncode, 0, result.stderr)
+
+ config.write_text(json.dumps(missing_socket))
+ config.chmod(0o600)
+ scaleout = subprocess.run(
+ [
+ "bash", "-c", command + "; cx_apply_network_profile 2 nvlink-rdma",
+ "_", str(ROOT / "runtime" / "common.sh"),
+ ],
+ text=True,
+ capture_output=True,
+ env={
+ **os.environ,
+ "CX_RUNNER": "h100-dgxc",
+ "COLLECTIVEX_OPERATOR_CONFIG": str(config),
+ },
+ )
+ self.assertNotEqual(scaleout.returncode, 0)  # without socket_ifname the "2 nvlink-rdma" profile must fail
+
+ config.write_text(json.dumps(missing_amd_stage))
+ config.chmod(0o600)
+ selected_missing_stage = subprocess.run(
+ ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh")],
+ text=True,
+ capture_output=True,
+ env={
+ **os.environ,
+ "CX_RUNNER": "mi325x",
+ "COLLECTIVEX_OPERATOR_CONFIG": str(config),
+ },
+ )
+ self.assertNotEqual(selected_missing_stage.returncode, 0)  # mi325x selected while its own stage_dir is missing
+
+ config.write_text(json.dumps(document))
+ config.chmod(0o644)  # 0o644 instead of the 0o600 used everywhere above
+ result = subprocess.run(
+ ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh")],
+ text=True,
+ capture_output=True,
+ env={
+ **os.environ,
+ "CX_RUNNER": "h100-dgxc",
+ "COLLECTIVEX_OPERATOR_CONFIG": str(config),
+ },
+ )
+ self.assertNotEqual(result.returncode, 0)  # the otherwise valid document must be refused with that file mode
+
+
+if __name__ == "__main__":  # only when this file is executed directly
+ unittest.main()  # hand control to the unittest command-line runner
diff --git a/experimental/CollectiveX/tests/test_publisher.py b/experimental/CollectiveX/tests/test_publisher.py
new file mode 100644
index 0000000000..5624fbcf68
--- /dev/null
+++ b/experimental/CollectiveX/tests/test_publisher.py
@@ -0,0 +1,2418 @@
+#!/usr/bin/env python3
+"""Focused end-to-end tests for the isolated CollectiveX publisher."""
+from __future__ import annotations
+
+import copy
+import hashlib
+import itertools
+import json
+import os
+from pathlib import Path
+import subprocess
+import sys
+import tempfile
+import types
+import unittest
+from unittest import mock
+import zipfile
+
+HERE = Path(__file__).resolve().parent  # directory that contains this test module
+ROOT = HERE.parent  # one level above the tests directory
+sys.path[:0] = [str(ROOT), str(HERE)]  # prepend both directories ahead of the project imports that follow
+
+import contracts # noqa: E402
+import identity # noqa: E402
+import publisher # noqa: E402
+import summarize # noqa: E402
+import sweep_matrix # noqa: E402
+
+
+RUN = {  # default run identity used by the fixture helpers in this module
+ "repository": "SemiAnalysisAI/InferenceX",
+ "run_id": "12345",
+ "run_attempt": 1,
+ "source_sha": "a" * 40,  # 40-character placeholder SHA
+}
+
+
+def _unsupported_delivery(  # write a one-case matrix.json and unsupported terminal documents under root
+ root: Path, ordinals: tuple[int, ...] = (1,), run: dict = RUN,
+) -> tuple[Path, Path]:  # returns (matrix_path, artifact directory)
+ matrix = sweep_matrix.resolve_matrix(backends="all")
+ wrapper = next(item for item in matrix["requested_cases"] if item["disposition"] == "unsupported")  # first unsupported requested case
+ matrix = {  # replace the resolved matrix with one that requests only that case
+ "format": "collectivex.matrix.v1",
+ "schema_version": 1,
+ "requested_cases": [wrapper],
+ "include": [],
+ }
+ case = {key: value for key, value in wrapper["case"].items() if key != "case_id"}  # case fields without case_id
+ artifact_name = f"cxunsupported-{run['run_id']}-{run['run_attempt']}"
+ git_run = {
+ "artifact": artifact_name,
+ "job": "setup",
+ "ref": "collectivex",
+ "repo": run["repository"],
+ "run_attempt": str(run["run_attempt"]),
+ "run_id": run["run_id"],
+ "source_sha": run["source_sha"],
+ }
+ allocation = {
+ "artifact": artifact_name,
+ "execution_id": f"{run['run_id']}_{run['run_attempt']}_unsupported",
+ "job": "setup",
+ "repo": run["repository"],
+ "run_attempt": str(run["run_attempt"]),
+ "run_id": run["run_id"],
+ "runner": "capability-resolver",
+ "source_sha": run["source_sha"],
+ }
+ matrix_path = root / "matrix.json"
+ artifact = root / artifact_name
+ artifact.mkdir()
+ matrix_path.write_text(json.dumps(matrix))
+ control_sha256 = hashlib.sha256(matrix_path.read_bytes()).hexdigest()  # digest of the matrix file exactly as written
+ for ordinal in ordinals:  # one terminal document per requested attempt ordinal
+ terminal = contracts.make_terminal_document(
+ allocation_factors=allocation, attempt_ordinal=ordinal, case=case,
+ case_factors={"case": case, "profile": identity.V1_CASE_PROFILE,
+ "sku": wrapper["sku"]},
+ control_sha256=control_sha256, failure_mode="capability",
+ generated_at="2026-07-04T00:00:00Z", git_run=git_run,
+ reason=wrapper["reason"], return_code=5, source="matrix-capability-resolver",
+ status="unsupported", expected_case_id=wrapper["case"]["case_id"],
+ )
+ (artifact / f"unsupported-{ordinal}.json").write_text(json.dumps(terminal))
+ return matrix_path, artifact
+
+
+def _args(
+    store: Path, matrix: Path, artifact: Path, run: dict = RUN
+) -> types.SimpleNamespace:
+    """Build the argument namespace for one delivery from paths and a run identity."""
+    # Paths are passed on as strings; the artifact is wrapped in a one-element list.
+    namespace = types.SimpleNamespace(
+        store_root=str(store), matrix=str(matrix), artifact=[str(artifact)]
+    )
+    # Run identity fields are copied through unchanged.
+    for key in ("repository", "run_id", "run_attempt", "source_sha"):
+        setattr(namespace, key, run[key])
+    return namespace
+
+
+def _ids(seed: str) -> tuple[str, str, str, str, str, str]:
+    """Derive (case, allocation, attempt, series, point, evidence) ids from one seed."""
+    case_id = identity.digest("case", {"seed": seed})
+    allocation_id = identity.allocation_id({"seed": seed})
+    attempt_id = identity.attempt_id(allocation=allocation_id, case=case_id, ordinal=1)
+    series_id = identity.series_id({"seed": seed})
+    point_id = identity.point_id(series=series_id, tokens_per_rank=8)
+    links = {"point": point_id, "allocation": allocation_id, "attempt": attempt_id}
+    evidence_id = identity.evidence_id(**links, sample_sha256="b" * 64)
+    return case_id, allocation_id, attempt_id, series_id, point_id, evidence_id
+
+
+def _component(scale: float = 1.0) -> dict:
+    """Return a measured-component fixture whose latencies are multiplied by ``scale``."""
+    payload_bytes = 100_000
+    baseline_us = (("p50", 10.0), ("p90", 12.0), ("p95", 14.0), ("p99", 20.0))
+    latency_us = {label: base * scale for label, base in baseline_us}
+    # Rate per percentile: bytes / (latency_us * 1000.0), keyed like latency_us.
+    rate = {label: payload_bytes / (us * 1000.0) for label, us in latency_us.items()}
+    component = {"origin": "measured"}
+    component["latency_us"] = latency_us
+    component["logical_bytes"] = payload_bytes
+    component["logical_payload_rate_gbps_at_latency_percentile"] = rate
+    component["sample_count"] = 512
+    return component
+
+
+def _hybrid_provenance(ep_size: int = 1) -> dict:
+    """Return a DeepEP hybrid provenance fixture with JIT artifacts for ``ep_size`` ranks."""
+    kernel_keys = ["combine-key", "dispatch-key", "preprocess-key"]
+    # Every realized field starts at 1; the boolean fields are then set to True.
+    realized = dict.fromkeys(contracts.HYBRID_REALIZED_CONFIG_FIELDS, 1)
+    realized.update(dict.fromkeys(contracts.HYBRID_REALIZED_BOOL_FIELDS, True))
+    realized["num_of_experts_per_rank"] = 1
+    realized["num_of_nodes"] = 1
+    realized["num_of_ranks_per_node"] = ep_size
+    realized["token_data_type"] = "UINT16"
+
+    def rank_artifacts(fill: str) -> list[dict]:
+        # One artifact per rank; all ranks of a kernel share the same digest.
+        return [{"bytes": 1, "rank": rank, "sha256": fill * 64} for rank in range(ep_size)]
+
+    shared_objects = [
+        {"kernel_key": key, "rank_artifacts": rank_artifacts(f"{index + 1:x}")}
+        for index, key in enumerate(kernel_keys)
+    ]
+    libraries = [
+        {"name": "deep_ep_cpp", "role": "deepep-extension", "sha256": "4" * 64},
+        {"name": "hybrid_ep_cpp", "role": "deepep-hybrid-extension", "sha256": "5" * 64},
+    ]
+    return {
+        "backend_lineage": "deepep-hybrid", "branch": "hybrid-ep",
+        "deepep_commit": "a" * 40, "deepep_tree": "b" * 40,
+        "device_sms": 1,
+        "jit_kernel_keys": kernel_keys,
+        "jit_shared_objects": shared_objects,
+        "loaded_libraries": libraries,
+        "realized_config": realized,
+        "resource_mode": "tuned",
+        "tuned_source": "deepep-hybrid-configurer-autotune-v1",
+    }
+
+
+def _native_fixture(backend: str = "nccl-ep") -> tuple[dict, dict]:  # build a matching (raw case document, samples document) pair
+ def digest(value: object) -> str:  # sha256 hex digest of the canonical JSON bytes of value
+ return hashlib.sha256(contracts.canonical_json_bytes(value)).hexdigest()
+
+ scheduled = {
+ "backend": backend, "canonical": True, "eplb": False, "ep": 1,
+ "experts": 1, "gpus_per_node": 1, "hidden": 1, "ladder": "1", "nodes": 1,
+ "mode": "normal", "phase": "decode", "required_publication": "official",
+ "routing": "uniform", "samples_per_point": 512,
+ "scale_out_transport": None, "scale_up_domain": 1,
+ "scale_up_transport": "nvlink", "scope": "scale-up", "suite": "ep-core-v1",
+ "timing": "8:64:32", "topk": 1,
+ "topology_class": "fixture", "transport": "nvlink",
+ "warmup_semantics": "full-roundtrip-before-each-component-trial-point-v1",
+ "workload": "deepseek-v3-v1",
+ }
+ case_factors = {
+ "case": scheduled, "profile": identity.V1_NORMAL_CASE_PROFILE, "sku": "fixture"
+ }
+ case_id = identity.digest("case", case_factors)
+ git_run = {
+ "artifact": "cxshard-fixture-999-1", "job": "sweep", "ref": "collectivex",
+ "repo": RUN["repository"], "run_attempt": "1", "run_id": "999",
+ "source_sha": RUN["source_sha"],
+ }
+ allocation_factors = {
+ "artifact": git_run["artifact"], "execution_id": "999_1_fixture",
+ "job": git_run["job"], "repo": git_run["repo"], "run_attempt": "1",
+ "run_id": "999", "runner": "fixture", "source_sha": git_run["source_sha"],
+ }
+ allocation_id = identity.allocation_id(allocation_factors)
+ attempt_id = identity.attempt_id(allocation=allocation_id, case=case_id, ordinal=1)
+ member_id, member_checksums, routing_hash, routing_rows, routing_weights = (  # five values unpacked from the expected canonical trace
+ contracts._expected_canonical_trace(
+ "uniform", hidden=1, topk=1, logical_experts=1, physical_experts=1,
+ ep_size=1, tokens_per_rank=1, seed=67, eplb_enabled=False,
+ reference_tokens_per_rank=2048,
+ )
+ )
+ workload_id = identity.workload_id({
+ "members": [{"checksums": member_checksums, "workload_id": member_id}]
+ })
+ runtime = {
+ "accelerator_runtime": {"kind": "cuda", "version": "13.0"},
+ "collective_library": {"kind": "nccl", "version": "2.30.4"},
+ "device": {
+ "arch": "sm100", "compute_units": 1, "memory_bytes": 1,
+ "product": "Fixture GPU", "warp_size": 32,
+ },
+ "driver_version": "1", "framework": {"kind": "torch", "version": "2.10.0"},
+ "machine": "fixture", "python_version": "3.12", "vendor": "nvidia",
+ }
+ implementation_provenance = (  # inline provenance for nccl-ep; any other backend uses _hybrid_provenance()
+ {
+ "backend": "nccl-ep", "backend_lineage": "nccl",
+ "collective_library": "nccl", "nccl_version": "2.30.4",
+ "reference_semantics": "fixture-v1",
+ }
+ if backend == "nccl-ep"
+ else _hybrid_provenance()
+ )
+ kernel_generation = "nccl" if backend == "nccl-ep" else "hybrid"
+ implementation = {
+ "kernel_generation": kernel_generation,
+ "name": backend,
+ "provenance": implementation_provenance,
+ "resource_profile": contracts.project_resource_profile(implementation_provenance),
+ }
+ public_config = contracts.public_series_config(
+ kernel_generation=implementation["kernel_generation"],
+ provenance=implementation_provenance,
+ resource_profile=implementation["resource_profile"],
+ resource_mode="tuned",
+ device_product=runtime["device"]["product"],
+ )
+ series_factors = {
+ "backend": backend, "case_id": case_id,
+ "image_digest": "sha256:" + "d" * 64,
+ "implementation_contract_sha256": digest({  # digest of the implementation with its provenance passed through series_provenance
+ **implementation,
+ "provenance": contracts.series_provenance(implementation_provenance),
+ }),
+ "public_config_sha256": contracts.public_series_config_sha256(public_config),
+ "routing_control_sha256": contracts.routing_implementation_control_sha256(
+ implementation
+ ),
+ "runtime_fingerprint_sha256": digest(runtime),
+ "source_sha": RUN["source_sha"], "squash_sha256": "e" * 64,
+ "workload_id": workload_id,
+ }
+ series_id = identity.series_id(series_factors)
+ point_id = identity.point_id(series=series_id, tokens_per_rank=1)
+ sample_components = {
+ name: {
+ "availability": "measured", "sample_count": 512,
+ "trials": [[latency] * 8 for _ in range(64)],  # 64 trials of 8 identical values = 512 samples
+ }
+ for name, latency in (("combine", 20.0), ("dispatch", 10.0), ("roundtrip", 40.0))
+ }
+ sample_sha = digest({"components": sample_components, "tokens_per_rank": 1})
+ evidence_id = identity.evidence_id(
+ point=point_id, allocation=allocation_id, attempt=attempt_id,
+ sample_sha256=sample_sha,
+ )
+ samples = {
+ "allocation_id": allocation_id, "attempt_id": attempt_id, "case_id": case_id,
+ "format": contracts.SAMPLES_FORMAT,
+ "points": [{
+ "components": sample_components, "evidence_id": evidence_id,
+ "point_id": point_id, "sample_sha256": sample_sha, "tokens_per_rank": 1,
+ }],
+ "sampling": {
+ "iterations_per_trial": 8, "reduction": "cross-rank-max-per-iteration",
+ "trials": 64,
+ },
+ "schema_version": 1, "series_id": series_id,
+ }
+ sample_bytes = contracts.canonical_json_bytes(samples)  # length and digest of these bytes go into raw["sample_artifact"]
+ oracle = {
+ "atol": 0.02,
+ "checks": {name: True for name in (
+ "combine_values", "counts", "metadata", "multiplicity", "payload",
+ "source_set", "weights",
+ )},
+ "combine_weight_semantics": "unweighted-rank-sum",
+ "contract": "expert-specific-transform-v1", "dispatch_sha256": "1" * 64,
+ "max_absolute_error": 0.0, "max_elementwise_relative_error": 0.0,
+ "max_relative_error": 0.0, "max_weight_error": 0.0,
+ "order_sha256": "2" * 64, "ordering_contract": "fixture-order-v1",
+ "passed": True, "receive_count": 1, "rtol": 0.05,
+ }
+ def pct(value: float) -> dict[str, float]:  # the same value under each of p50, p90, p95 and p99
+ return {name: value for name in ("p50", "p90", "p95", "p99")}
+
+ def measured(value: float) -> dict:  # measured component entry with flat percentiles and 512 samples
+ return {
+ "availability": "measured", "origin": "measured",
+ "percentiles_us": pct(value), "sample_count": 512,
+ }
+ row = {
+ "anomalies": [],
+ "components": {
+ "combine": measured(20.0), "dispatch": measured(10.0),
+ "isolated_sum": {
+ "availability": "derived", "origin": "derived-percentile-sum",
+ "percentiles_us": pct(30.0), "sample_count": 0,  # 30.0 equals dispatch 10.0 plus combine 20.0
+ },
+ "roundtrip": measured(40.0),
+ },
+ "correctness": {
+ "contract": "expert-specific-transform-v1", "max_relative_error": 0.0,
+ "passed": True,
+ "rank_evidence": [{
+ "input_unchanged": True, "order_stable": True,
+ "post_timing": copy.deepcopy(oracle), "pre_timing": copy.deepcopy(oracle),  # independent copies of the same oracle
+ "rank": 0,
+ }],
+ "scope": "dispatch-metadata-and-transformed-combine",
+ },
+ "evidence_id": evidence_id, "global_tokens": 1,
+ "logical_bytes": {"combine": 2, "dispatch": 2, "roundtrip": 4},
+ "point_id": point_id,
+ "receive": {"max": 1, "mean": 1.0, "min": 1, "total": 1},
+ "routing": contracts._expected_routing_summary(
+ routing_rows,
+ routing_weights,
+ physical_experts=1,
+ ep_size=1,
+ tokens_per_rank=1,
+ gpus_per_node=1,
+ scale_up_domain=1,
+ ),
+ "sample_histograms": {
+ name: contracts._expected_histogram([value] * 512)
+ for name, value in (("combine", 20.0), ("dispatch", 10.0), ("roundtrip", 40.0))
+ },
+ "sample_sha256": sample_sha,
+ "token_rate_at_latency_percentile": pct(25_000.0), "tokens_per_rank": 1,
+ }
+ raw = {
+ "case": {
+ "attempt_ordinal": 1, "backend": backend,
+ "eplb": {
+ "enabled": False, "imbalance_after": None, "imbalance_before": None,
+ "mapping_hash": None, "max_replicas": None, "num_logical_experts": 1,
+ "num_physical_experts": 1, "num_redundant": 0, "planner": None,
+ "reference_tokens_per_rank": None, "replicated_experts": 0,
+ },
+ "ep_size": 1, "mode": "normal", "phase": "decode",
+ "required_publication": "official", "resource_mode": "tuned", "runner": "fixture",
+ "shape": {
+ "activation_profile": "canonical-counter-source-v3", "dispatch_dtype": "bf16",
+ "eplb": False, "experts": 1, "experts_per_rank": 1, "hidden": 1,
+ "kernel_gen": kernel_generation, "num_logical_experts": 1,
+ "quant": {
+ "combine_accum_dtype": "fp32", "combine_input_dtype": "bf16",
+ "combine_output_dtype": "bf16", "combine_quant_mode": "none",
+ "scale_layout": None,
+ },
+ "routing": "uniform", "topk": 1,
+ },
+ "suite": "ep-core-v1", "workload_name": "deepseek-v3-v1",
+ },
+ "format": contracts.RAW_FORMAT, "generated_at": "2026-07-04T00:00:00Z",
+ "identity": {
+ "allocation_factors": allocation_factors, "allocation_id": allocation_id,
+ "attempt_id": attempt_id, "attempt_ordinal": 1, "case_factors": case_factors,
+ "case_id": case_id, "series_factors": series_factors, "series_id": series_id,
+ },
+ "implementation": implementation,
+ "measurement": {
+ "component_order_contract": "roundtrip-dispatch-activation-only-combine-v2",
+ "conditioning": {
+ "contract": "fixed-phase-ramp-8-roundtrips-v1",
+ "ladder": [1, 2, 4, 8, 16, 32, 64, 128],
+ "roundtrips_per_shape": 8,
+ },
+ "contract": "layout-and-dispatch-v1",
+ "rows": [row],
+ "sampling": {
+ "contract": "fixed-512-v1", "iterations_per_trial": 8,
+ "percentile_method": "nearest-rank",
+ "reduction": "cross-rank-max-per-iteration", "samples_per_component": 512,
+ "trials": 64, "warmup_iterations": 32,
+ "warmup_semantics": "full-roundtrip-before-each-component-trial-point-v1",
+ },
+ "source_allocation": "even",
+ },
+ "outcome": {
+ "publication_status": "diagnostic", "reasons": [], "status": "success",
+ "validity": {
+ "anomaly_free": True, "execution_status": "complete",
+ "measurement_conformance": "conformant", "provenance_complete": True,
+ "resource_conformance": implementation["resource_profile"]["conformance_class"],
+ "sampling_conformance": "conformant",
+ "semantic_correctness": "pass",
+ "workload_identity": "consistent-across-ranks",
+ "workload_source": "canonical-serialized",
+ },
+ },
+ "provenance": {
+ "command": "run_ep", "distributed_launcher": "torchrun", "git_run": git_run,
+ "image": {
+ "arch": "amd64", "digest": "sha256:" + "d" * 64,
+ "digest_verified": True, "reference": "fixture:1", "squash_sha256": "e" * 64,
+ },
+ "redaction": "sanitized-v1",
+ },
+ "record_type": "case-attempt",
+ "runtime_fingerprint": runtime,
+ "sample_artifact": {
+ "bytes": len(sample_bytes), "format": contracts.SAMPLES_FORMAT,
+ "path": "samples.json", "sha256": hashlib.sha256(sample_bytes).hexdigest(),
+ },
+ "schema_version": 1,
+ "topology": {
+ "device_count": 1, "device_product": "Fixture GPU", "gpus_per_node": 1,
+ "nodes": 1, "placement": "packed",
+ "realized_placement": {
+ "gpus_per_node": 1, "nodes": 1, "ranks_per_node": 1,
+ "unique_local_ranks": True, "valid": True,
+ },
+ "scale_out_transport": None, "scale_up_domain": 1,
+ "scale_up_transport": "nvlink", "scope": "scale-up",
+ "topology_class": "fixture", "transport": "nvlink",
+ "world_size": 1,
+ },
+ "workload": {
+ "activation_generator": "collectivex-activation-counter-v3",
+ "activation_identity": hashlib.sha256(
+ b"counter|seed=67|hidden=1|gen=collectivex-activation-counter-v3"
+ ).hexdigest(),
+ "activation_profile": "canonical-counter-source-v3", "cross_rank_consistent": True,
+ "manifest_checksums": {member_id: member_checksums}, "members": [member_id],
+ "routing_generator": "collectivex-routing-counter-v3", "source": "canonical-serialized",
+ "trace_hashes": [routing_hash],
+ "trace_signature": hashlib.sha256(routing_hash.encode()).hexdigest(),
+ "workload_id": workload_id,
+ },
+ }
+ return raw, samples  # raw["sample_artifact"] carries the size and digest of the serialized samples
+
+
+def _series(seed: str, backend: str, *, decision_grade: bool = False) -> tuple[dict, dict]:  # build one public series item plus its internal per-run metrics
+ case, allocation, attempt, series_id, point_id, evidence = _ids(seed)  # provisional ids; all but allocation are recomputed further down
+ allocations = [identity.allocation_id({"seed": seed, "run": run}) for run in range(3)]  # three allocation ids, used when decision_grade is set
+ eligibility = publisher._eligibility_record(
+ allocations if decision_grade else [allocation],
+ complete=decision_grade,
+ correct=True,
+ measured=True,
+ stable_ordering=True,
+ p50_ratio=1.01 if decision_grade else None,
+ p99_ratio=1.02 if decision_grade else None,
+ )
+ component = _component(1.0 if backend == "deepep" else 1.2)  # deepep keeps scale 1.0, every other backend gets 1.2
+ item = {
+ "series_id": series_id,
+ "label": f"H100 / {backend}",
+ "status": "decision-grade" if decision_grade else "diagnostic",
+ "case_ids": [case],
+ "allocation_ids": allocations if decision_grade else [allocation],
+ "model": "deepseek-v3-v1",
+ "suite": "ep-core-v1",
+ "mode": "normal",
+ "phase": "decode",
+ "publication_tier": "official",
+ "backend": {
+ "id": backend, "label": publisher.BACKEND_LABELS[backend],
+ "role": "reference" if backend == "nccl-ep" else "library",
+ "generation": "nccl" if backend == "nccl-ep" else None,
+ "version": "1.0"},
+ "build": {
+ "implementation_contract_sha256": hashlib.sha256(backend.encode()).hexdigest(),
+ "public_config_sha256": "0" * 64,  # placeholder, overwritten once the item is complete
+ "routing_control_sha256": hashlib.sha256(backend.encode()).hexdigest(),
+ "runtime_fingerprint_sha256": "3" * 64,
+ "image_digest": "sha256:" + "1" * 64,
+ "source_sha": "a" * 40,
+ "squash_sha256": "2" * 64,
+ },
+ "system": {
+ "sku": "h100-dgxc", "label": "NVIDIA H100", "vendor": "nvidia",
+ "topology_class": "h100-nvlink-island", "transport": "nvlink",
+ "scale_up_transport": "nvlink", "scale_out_transport": None,
+ "scope": "scale-up", "nodes": 1, "gpus_per_node": 8,
+ "scale_up_domain": 8,
+ "world_size": 8, "ep_size": 8, "placement": "packed",
+ },
+ "workload": {
+ "workload_id": identity.workload_id({"shape": "shared"}),
+ "hidden": 7168, "top_k": 8, "experts": 256,
+ "routing": "uniform", "eplb": False,
+ "dispatch_dtype": "bf16", "combine_dtype": "bf16",
+ "activation_profile": "canonical-counter-source-v3",
+ },
+ "eplb": {
+ "enabled": False, "planner": None, "mapping_sha256": None,
+ "logical_experts": 256, "physical_experts": 256,
+ "redundant_experts": 0, "reference_tokens_per_rank": None,
+ "replicated_experts": 0, "max_replicas": None,
+ "imbalance_before": None, "imbalance_after": None,
+ },
+ "resource": {"mode": "tuned", "profile": "profile-1", "comm_units_kind": "sm", "configured_units": 24},
+ "measurement": {
+ "contract": "layout-and-dispatch-v1",
+ "component_order_contract": "roundtrip-dispatch-activation-only-combine-v2",
+ "combine_semantics": "activation-only", "payload_unit": "token-rank",
+ "sampling_contract": "fixed-512-v1",
+ "iters": 8, "trials": 64, "warmups": 32, "samples_per_component": 512,
+ "headline_component": "roundtrip", "headline_percentile": "p99",
+ },
+ "points": [{
+ "point_id": point_id, "tokens_per_rank": 8, "global_tokens": 64,
+ "correct": True,
+ "routing": {
+ "fanout_mean": 4.0, "recv_tokens_max": 64,
+ "expert_load_cv": 0.5, "payload_rank_cv": 0.25,
+ "hotspot_ratio": 2.0, "empty_expert_count": 0,
+ "empty_rank_count": 0, "routed_copies": 256,
+ },
+ "components": {"dispatch": None, "combine": None,
+ "roundtrip": component, "isolated_sum": None},  # only the roundtrip component is populated
+ "roundtrip_token_rate_at_latency_percentile": {
+ name: 64 / (latency * 1e-6)  # 64 matches global_tokens; latency is converted from microseconds
+ for name, latency in component["latency_us"].items()
+ },
+ "evidence_ids": [evidence],
+ }],
+ "eligibility": eligibility,
+ }
+ item["build"]["public_config_sha256"] = contracts.public_series_config_sha256(  # replace the placeholder digest
+ publisher._public_series_config(item)
+ )
+ case = identity.digest("case", publisher._public_case_factors(item))  # case id recomputed from the finished item
+ item["case_ids"] = [case]
+ build = item["build"]
+ series_id = identity.series_id({
+ "backend": item["backend"]["id"],
+ "case_id": case,
+ "image_digest": build["image_digest"],
+ "implementation_contract_sha256": build["implementation_contract_sha256"],
+ "public_config_sha256": build["public_config_sha256"],
+ "routing_control_sha256": build["routing_control_sha256"],
+ "runtime_fingerprint_sha256": build["runtime_fingerprint_sha256"],
+ "source_sha": build["source_sha"],
+ "squash_sha256": build["squash_sha256"],
+ "workload_id": item["workload"]["workload_id"],
+ })
+ item["series_id"] = series_id
+ point_id = identity.point_id(series=series_id, tokens_per_rank=8)
+ item["points"][0]["point_id"] = point_id
+ attempt = identity.attempt_id(allocation=allocation, case=case, ordinal=1)
+ evidence = identity.evidence_id(
+ point=point_id, allocation=allocation, attempt=attempt,
+ sample_sha256=hashlib.sha256(seed.encode()).hexdigest(),
+ )
+ item["points"][0]["evidence_ids"] = [evidence]
+ runs = {  # keyed by run "0".."2", then by tokens_per_rank 8
+ str(run): {8: {
+ "latency_us": {
+ statistic: component["latency_us"][statistic] * (1 + run / 100)  # run r multiplies latency by (1 + r/100)
+ for statistic in ("p50", "p99")
+ },
+ "logical_payload_rate_gbps_at_latency_percentile": {
+ statistic: component["logical_payload_rate_gbps_at_latency_percentile"][statistic] / (1 + run / 100)
+ for statistic in ("p50", "p99")
+ },
+ }}
+ for run in range(3)
+ }
+ internal = {"run_metrics": runs}
+ return item, internal  # (public series item, internal run metrics)
+
+
+def _dataset() -> dict:  # minimal diagnostic public dataset wrapping a single deepep series
+ item, _ = _series("one", "deepep")  # the internal run metrics are not needed here
+ case = item["case_ids"][0]
+ allocation = item["allocation_ids"][0]
+ attempt = identity.attempt_id(allocation=allocation, case=case, ordinal=1)
+ evidence = item["points"][0]["evidence_ids"][0]
+ return {
+ "format": "collectivex.public.v1", "schema_version": 1,
+ "generated_at": "2026-07-04T00:00:00Z", "source_bundle_ids": ["c" * 64],
+ "promotion": {
+ "status": "diagnostic", "reason": None, "matrix_id": "d" * 64,
+ "allocation_ids": [allocation], "required_allocations": 3,  # one allocation present, three required
+ "requested_cases": 1, "terminal_cases": 1,
+ "policy": "collectivex-decision-grade-v1",
+ },
+ "coverage": [{
+ "case_id": case, "label": "case", "required": True, "sku": "h100-dgxc",
+ "backend": "deepep", "mode": "normal", "phase": "decode",
+ "topology": publisher._coverage_topology(item["system"]),
+ "disposition": "runnable",
+ "selected_attempt_id": attempt,
+ "outcome": "success", "failure_mode": None, "reason": None,
+ "attempt_ids": [attempt],
+ }],
+ "attempts": [{
+ "attempt_id": attempt,
+ "evidence": [{"evidence_id": evidence,
+ "point_id": item["points"][0]["point_id"]}],
+ "case_id": case,
+ "allocation_id": allocation, "run_id": "1", "run_attempt": 1,
+ "attempt_index": 1,
+ "selected": True, "outcome": "success", "failure_mode": None, "reason": None,
+ "series_id": item["series_id"],
+ "completed_at": "2026-07-04T00:00:00Z",
+ }],
+ "series": [item], "cohorts": [], "rankings": [], "recommendations": [],  # no decision products in this dataset
+ "sensitivities": [],
+ }
+
+
+def _promoted_dataset() -> dict:  # promoted public dataset: seven decision-grade series plus one unsupported case
+ specifications = (  # each row is (seed, backend, peer_sku, reference)
+ ("library-fast", "deepep", None, False),
+ ("library-slow", "uccl", None, False),
+ ("chip-peer", "deepep", "h200-dgxc", False),
+ ("system-one", "nccl-ep", None, True),
+ ("system-two", "nccl-ep", "h200-dgxc", True),
+ ("routing-zipf", "deepep", None, False),
+ ("routing-zipf-eplb", "deepep", None, False),
+ )
+ series = []
+ internals = {}
+ attempts = []
+ coverage = []
+ for seed, backend, peer_sku, reference in specifications:
+ item, internal = _series(seed, backend, decision_grade=True)
+ if peer_sku:  # move the series onto another SKU taken from capability.PLATFORMS
+ platform = publisher.capability.PLATFORMS[peer_sku]
+ item["system"].update({
+ "sku": peer_sku,
+ "label": f"NVIDIA {platform['product'].upper()}",
+ "topology_class": platform["topology_class"],
+ "transport": platform["transport"],
+ })
+ if reference:
+ item["backend"]["role"] = "reference"
+ if seed.startswith("routing-zipf"):  # both routing seeds switch suite, tier and routing
+ item["suite"] = "ep-routing-v1"
+ item["publication_tier"] = "comparable-experimental"
+ item["workload"]["routing"] = "zipf"
+ if seed == "routing-zipf-eplb":  # this seed additionally enables EPLB with an expected plan
+ item["workload"]["eplb"] = True
+ plan = contracts._expected_eplb_plan(
+ "zipf", 8, 256, 288, item["system"]["ep_size"], 67, 2048
+ )
+ item["eplb"] = {
+ "enabled": True, "planner": "greedy-rank-major-v1",
+ "mapping_sha256": contracts.eplb_contract.mapping_hash(plan),
+ "logical_experts": 256, "physical_experts": 288,
+ "redundant_experts": 32, "reference_tokens_per_rank": 2048,
+ "replicated_experts": plan["replicated_experts"],
+ "max_replicas": plan["max_replicas"],
+ "imbalance_before": plan["imbalance_before"],
+ "imbalance_after": plan["imbalance_after"],
+ }
+ item["build"]["implementation_contract_sha256"] = "8" * 64
+ case_id = identity.digest("case", publisher._public_case_factors(item))  # ids are recomputed after the edits above
+ item["case_ids"] = [case_id]
+ build = item["build"]
+ build["public_config_sha256"] = contracts.public_series_config_sha256(
+ publisher._public_series_config(item)
+ )
+ item["series_id"] = identity.series_id({
+ "backend": item["backend"]["id"],
+ "case_id": case_id,
+ "image_digest": build["image_digest"],
+ "implementation_contract_sha256": build["implementation_contract_sha256"],
+ "public_config_sha256": build["public_config_sha256"],
+ "routing_control_sha256": build["routing_control_sha256"],
+ "runtime_fingerprint_sha256": build["runtime_fingerprint_sha256"],
+ "source_sha": build["source_sha"],
+ "squash_sha256": build["squash_sha256"],
+ "workload_id": item["workload"]["workload_id"],
+ })
+ point = item["points"][0]
+ point["point_id"] = identity.point_id(
+ series=item["series_id"], tokens_per_rank=point["tokens_per_rank"]
+ )
+ case_attempts = []
+ evidence_ids = []
+ for run_id, allocation_id in enumerate(item["allocation_ids"], 1):  # one attempt and one evidence id per allocation, run ids start at 1
+ attempt_id = identity.attempt_id(
+ allocation=allocation_id, case=case_id, ordinal=1
+ )
+ evidence_id = identity.evidence_id(
+ point=point["point_id"], allocation=allocation_id,
+ attempt=attempt_id,
+ sample_sha256=hashlib.sha256(f"{seed}-{run_id}".encode()).hexdigest(),
+ )
+ attempts.append({
+ "attempt_id": attempt_id,
+ "evidence": [{"evidence_id": evidence_id, "point_id": point["point_id"]}],
+ "case_id": case_id, "allocation_id": allocation_id,
+ "run_id": str(run_id), "run_attempt": 1,
+ "attempt_index": 1, "selected": True,
+ "outcome": "success", "failure_mode": None, "reason": None,
+ "series_id": item["series_id"],
+ "completed_at": "2026-07-04T00:00:00Z",
+ })
+ case_attempts.append(attempt_id)
+ evidence_ids.append(evidence_id)
+ point["evidence_ids"] = evidence_ids
+ coverage.append({
+ "case_id": case_id, "label": seed, "required": True,
+ "sku": item["system"]["sku"], "backend": backend,
+ "mode": item["mode"], "phase": item["phase"], "disposition": "runnable",
+ "topology": publisher._coverage_topology(item["system"]),
+ "selected_attempt_id": case_attempts[-1], "outcome": "success",  # the last attempt is recorded as the selected one
+ "failure_mode": None, "reason": None, "attempt_ids": case_attempts,
+ })
+ series.append(item)
+ internals[item["series_id"]] = internal
+
+ unsupported_case = (  # hard-coded case id for the planned unsupported entry
+ "cxcase-v1-f5d0b1c21df4eea0f5b74da25a0b85da7388a0dd22c8d81564c4bfb4e4465a34"
+ )
+ unsupported_attempts = []
+ for run_id in range(1, 4):  # three unsupported attempts, run ids 1 to 3
+ allocation_id = identity.allocation_id(
+ {"seed": "planned-unsupported", "run": run_id}
+ )
+ attempt_id = identity.attempt_id(
+ allocation=allocation_id, case=unsupported_case, ordinal=1
+ )
+ attempts.append({
+ "attempt_id": attempt_id, "evidence": [], "case_id": unsupported_case,
+ "allocation_id": allocation_id, "run_id": str(run_id),
+ "run_attempt": 1,
+ "attempt_index": 1, "selected": True, "outcome": "unsupported",
+ "failure_mode": "capability", "reason": "backend-platform-unsupported",
+ "series_id": None, "completed_at": "2026-07-04T00:00:00Z",
+ })
+ unsupported_attempts.append(attempt_id)
+ coverage.append({
+ "case_id": unsupported_case, "label": "planned unsupported", "required": True,
+ "sku": "mi355x", "backend": "deepep", "mode": "normal", "phase": "decode",
+ "topology": {
+ "ep_size": 8, "nodes": 1, "gpus_per_node": 8, "scale_up_domain": 8,
+ "scope": "scale-up", "scale_up_transport": "xgmi",
+ "scale_out_transport": None, "transport": "xgmi",
+ "topology_class": "mi355x-xgmi",
+ },
+ "disposition": "unsupported", "selected_attempt_id": unsupported_attempts[-1],
+ "outcome": "unsupported", "failure_mode": "capability",
+ "reason": "backend-platform-unsupported", "attempt_ids": unsupported_attempts,
+ })
+ cohorts, rankings, recommendations, sensitivities = publisher.build_decisions(  # decision products derived from the series and their run metrics
+ series, internals
+ )
+ return {
+ "format": "collectivex.public.v1", "schema_version": 1,
+ "generated_at": "2026-07-04T00:00:00Z",
+ "source_bundle_ids": ["a" * 64, "b" * 64, "c" * 64],
+ "promotion": {
+ "status": "promoted", "reason": None,
+ "matrix_id": publisher.CANONICAL_FULL_V1_MATRIX_SHA256,
+ "allocation_ids": sorted({item["allocation_id"] for item in attempts}),  # unique allocation ids across all attempts
+ "required_allocations": 3, "requested_cases": len(coverage),
+ "terminal_cases": len(coverage), "policy": "collectivex-decision-grade-v1",
+ },
+ "coverage": sorted(coverage, key=lambda item: item["case_id"]),  # coverage, attempts and series are each sorted by their id
+ "attempts": sorted(attempts, key=lambda item: item["attempt_id"]),
+ "series": sorted(series, key=lambda item: item["series_id"]),
+ "cohorts": cohorts, "rankings": rankings,
+ "recommendations": recommendations, "sensitivities": sensitivities,
+ }
+
+
+def _cohort_counts(dataset: dict) -> dict[str, int]:  # tally dataset["cohorts"] entries per kind
+ return {
+ kind: sum(item["kind"] == kind for item in dataset["cohorts"])  # booleans sum as 0/1
+ for kind in ("library", "system", "routing")  # only these three kinds are counted
+ }
+
+
+class PublisherTest(unittest.TestCase):
+ def test_terminal_allocation_and_source_status_are_bound(self) -> None:  # terminal document must stay bound to control digest, allocation factors and source
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ matrix, artifact = _unsupported_delivery(root)
+ path = next(artifact.glob("*.json"))  # first JSON file of the delivered artifact
+ terminal = contracts.strict_load(path)
+ self.assertIs(contracts.validate_terminal_document(terminal), terminal)  # validator returns the same object
+ self.assertEqual(
+ contracts.validate_delivery(
+ [str(path)], str(matrix), disposition="unsupported"
+ ),
+ 1,
+ )
+
+ for control_sha256 in (None, "0" * 64):  # missing digest and an all-zero digest
+ broken = copy.deepcopy(terminal)
+ broken["provenance"]["control_sha256"] = control_sha256
+ path.write_text(json.dumps(broken))
+ with self.assertRaisesRegex(contracts.ContractError, "exact control document"):
+ contracts.validate_delivery(
+ [str(path)], str(matrix), disposition="unsupported"
+ )
+ path.write_text(json.dumps(terminal))  # put the untampered document back on disk
+
+ for field in (
+ "artifact", "job", "repo", "run_attempt", "run_id", "source_sha", "runner"
+ ):
+ broken = copy.deepcopy(terminal)
+ broken["identity"]["allocation_factors"][field] = f"forged-{field}"  # forge one factor at a time
+ allocation_id = identity.allocation_id(  # re-derive ids from the forged factors
+ broken["identity"]["allocation_factors"]
+ )
+ broken["identity"]["allocation_id"] = allocation_id
+ broken["identity"]["attempt_id"] = identity.attempt_id(
+ allocation=allocation_id,
+ case=broken["identity"]["case_id"],
+ ordinal=broken["identity"]["attempt_ordinal"],
+ )
+ with self.assertRaisesRegex(
+ contracts.ContractError, "allocation factors differ"
+ ):
+ contracts.validate_terminal_document(broken)
+
+ broken = copy.deepcopy(terminal)
+ broken["outcome"]["status"] = "failed"  # status changed while the source is left as is
+ with self.assertRaisesRegex(contracts.ContractError, "source and outcome"):
+ contracts.validate_terminal_document(broken)
+ broken = copy.deepcopy(terminal)
+ broken["provenance"]["source"] = "runtime-emitter"  # source changed while the outcome is left as is
+ with self.assertRaisesRegex(contracts.ContractError, "source and outcome"):
+ contracts.validate_terminal_document(broken)
+
+ for path_parts, replacement in (  # (two-level key path, replacement value)
+ (("provenance", "source"), "unregistered-producer"),
+ (("outcome", "failure_mode"), "unsupported-capability"),
+ (("outcome", "reason"), "unregistered-capability"),
+ ):
+ with self.subTest(path=path_parts):
+ broken = copy.deepcopy(terminal)
+ broken[path_parts[0]][path_parts[1]] = replacement
+ with self.assertRaises(publisher.PublisherError):  # JSON schema check must reject
+ publisher._schema("terminal-outcome-v1.schema.json", broken)
+ with self.assertRaises(contracts.ContractError):  # contract validator must reject too
+ contracts.validate_terminal_document(broken)
+
+ runtime_allocation = copy.deepcopy(
+ terminal["identity"]["allocation_factors"]
+ )
+ runtime_allocation["runner"] = terminal["identity"]["case_factors"]["sku"]  # runner factor set to the case SKU
+ runtime = contracts.make_terminal_document(  # failed terminal attributed to "runtime-emitter"
+ allocation_factors=runtime_allocation,
+ attempt_ordinal=1,
+ case=terminal["case"],
+ case_factors=terminal["identity"]["case_factors"],
+ control_sha256=terminal["provenance"]["control_sha256"],
+ failure_mode="setup",
+ generated_at=terminal["generated_at"],
+ git_run=terminal["provenance"]["git_run"],
+ reason="launcher-setup-failed",
+ return_code=1,
+ source="runtime-emitter",
+ status="failed",
+ expected_case_id=terminal["identity"]["case_id"],
+ )
+ publisher._schema("terminal-outcome-v1.schema.json", runtime)  # called bare: any exception here fails the test
+ broken = copy.deepcopy(runtime)
+ broken["outcome"]["reason"] = "backend-setup-failed"  # a different reason string than the one built above
+ with self.assertRaises(publisher.PublisherError):
+ publisher._schema("terminal-outcome-v1.schema.json", broken)
+ with self.assertRaises(contracts.ContractError):
+ contracts.validate_terminal_document(broken)
+
+ def test_post_emit_demotion_uses_closed_failure_taxonomy(self) -> None:  # demote_raw_attempt output per return code
+ raw, _ = _native_fixture()
+ expected = {  # return code -> expected outcome["failure_mode"]
+ 5: "runtime-identity",
+ 6: "execution",
+ 124: "timeout",
+ 137: "timeout",
+ 134: "execution",
+ 9: "execution",
+ }
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary)
+ for return_code, failure_mode in expected.items():
+ with self.subTest(return_code=return_code):
+ path = root / f"attempt-{return_code}.json"
+ path.write_text(json.dumps(raw))  # the raw fixture is the attempt file to demote
+ terminal = contracts.demote_raw_attempt(path, return_code)
+ self.assertEqual(
+ terminal["outcome"],  # whole outcome object is compared, not just failure_mode
+ {
+ "failure_mode": failure_mode,
+ "reason": "post-emit-distributed-command-failed",
+ "return_code": return_code,
+ "status": "failed",
+ },
+ )
+ self.assertEqual(terminal["provenance"]["source"], "post-emit-command")
+ publisher._schema("terminal-outcome-v1.schema.json", terminal)  # called bare: must not raise
+
+ broken = copy.deepcopy(terminal)
+ broken["outcome"]["reason"] = "distributed-command-failed"  # reason without the "post-emit-" prefix
+ with self.assertRaises(publisher.PublisherError):
+ publisher._schema("terminal-outcome-v1.schema.json", broken)
+ with self.assertRaises(contracts.ContractError):
+ contracts.validate_terminal_document(broken)
+
+ def test_artifact_safety_accepts_current_v1_fixtures(self) -> None:  # passes when the safety check does not raise
+ raw, samples = _native_fixture()
+ publisher.artifact_safety.assert_publication_safe([  # one call over all fixture documents
+ sweep_matrix.resolve_matrix(backends="all"),
+ raw,
+ samples,
+ _dataset(),
+ _promoted_dataset(),
+ ])
+
+ def test_native_raw_and_sample_schema_match_semantic_validator(self) -> None:  # schema and contract validator agree on the native fixture
+ raw, samples = _native_fixture()
+ publisher._schema("samples-v1.schema.json", samples)  # called bare: must not raise
+ publisher._schema("raw-case-v1.schema.json", raw)
+ self.assertIs(contracts.validate_raw_document(raw, samples), raw)  # validator returns the same object
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ (root / "samples.json").write_bytes(contracts.canonical_json_bytes(samples))
+ (root / "raw.json").write_bytes(contracts.canonical_json_bytes(raw))
+ self.assertEqual(contracts.load_raw_attempt(root / "raw.json"), raw)  # round-trips through disk
+ for target in ("raw", "samples"):
+ broken_raw, broken_samples = copy.deepcopy((raw, samples))
+ broken = broken_raw if target == "raw" else broken_samples
+ broken["unexpected"] = True  # extra top-level key on one of the two documents
+ with self.assertRaises(publisher.PublisherError):
+ publisher._schema(
+ "raw-case-v1.schema.json" if target == "raw" else "samples-v1.schema.json",
+ broken,
+ )
+ with self.assertRaises(contracts.ContractError):
+ contracts.validate_raw_document(broken_raw, broken_samples)
+ tampered = copy.deepcopy(raw)
+ tampered["measurement"]["rows"][0]["token_rate_at_latency_percentile"]["p50"] *= 2  # doubled p50 token rate
+ with self.assertRaisesRegex(contracts.ContractError, "token_rate_at_latency_percentile"):
+ contracts.validate_raw_document(tampered, samples)
+ tampered = copy.deepcopy(raw)
+ tampered["case"]["shape"]["hidden"] = 2  # overwrite the case shape's hidden value
+ with self.assertRaises(contracts.ContractError):
+ contracts.validate_raw_document(tampered, samples)
+ tampered = copy.deepcopy(raw)
+ configured = tampered["implementation"]["resource_profile"]["configured_units"]
+ tampered["implementation"]["resource_profile"]["configured_units"] = (
+ 1 if configured is None else configured + 1  # always differs from the recorded value
+ )
+ with self.assertRaisesRegex(contracts.ContractError, "resource profile"):
+ contracts.validate_raw_document(tampered, samples)
+ tampered = copy.deepcopy(raw)
+ oracle = tampered["measurement"]["rows"][0]["correctness"]["rank_evidence"][0]
+ oracle["pre_timing"]["checks"]["combine_values"] = False  # flip one pre-timing check on the first rank entry
+ with self.assertRaisesRegex(contracts.ContractError, "passed differs"):
+ contracts.validate_raw_document(tampered, samples)
+
+ def test_hybrid_raw_binds_realized_config_and_every_rank_artifact(self) -> None:  # provenance edits on the hybrid fixture must be rejected
+ raw, samples = _native_fixture("deepep-hybrid")
+ publisher._schema("raw-case-v1.schema.json", raw)  # called bare: must not raise
+ self.assertIs(contracts.validate_raw_document(raw, samples), raw)
+
+ mutations = {  # subTest name -> in-place edit of implementation["provenance"]
+ "hidden_dim": lambda provenance: provenance["realized_config"].update(
+ hidden_dim=2
+ ),
+ "experts_per_rank": lambda provenance: provenance["realized_config"].update(
+ num_of_experts_per_rank=2
+ ),
+ "ranks_per_node": lambda provenance: provenance["realized_config"].update(
+ num_of_ranks_per_node=2
+ ),
+ "num_nodes": lambda provenance: provenance["realized_config"].update(
+ num_of_nodes=2
+ ),
+ "token_data_type": lambda provenance: provenance["realized_config"].update(
+ token_data_type="UINT8"
+ ),
+ "rank_coverage": lambda provenance: [  # list is built only for its append side effect
+ artifact["rank_artifacts"].append({
+ "bytes": 1, "rank": 1, "sha256": "9" * 64,
+ })
+ for artifact in provenance["jit_shared_objects"]
+ ],
+ }
+ for name, mutate in mutations.items():
+ with self.subTest(name=name):
+ changed = copy.deepcopy(raw)  # each mutation starts from a fresh copy
+ mutate(changed["implementation"]["provenance"])
+ with self.assertRaisesRegex(
+ contracts.ContractError,
+ "DeepEP Hybrid realized config/JIT evidence differs",  # same message expected for every mutation
+ ):
+ contracts.validate_raw_document(changed, samples)
+
+ def test_native_contract_recomputes_routing_receive_histograms_and_anomalies(self) -> None:  # edited derived fields in row 0 must be rejected
+ raw, samples = _native_fixture()
+
+ tampered = copy.deepcopy(raw)
+ changed = tampered["measurement"]["rows"][0]
+ changed["routing"]["routed_copies"] *= 2  # doubled together with the logical byte counts below
+ for name in ("combine", "dispatch", "roundtrip"):
+ changed["logical_bytes"][name] *= 2
+ with self.assertRaisesRegex(contracts.ContractError, "routing.routed_copies"):
+ contracts.validate_raw_document(tampered, samples)
+
+ tampered = copy.deepcopy(raw)
+ changed = tampered["measurement"]["rows"][0]
+ changed["routing"]["payload_copies_per_rank"] = [2]
+ changed["receive"] = {"max": 2, "mean": 2.0, "min": 2, "total": 2}  # receive stats rewritten to agree with [2]
+ with self.assertRaisesRegex(contracts.ContractError, "payload_copies_per_rank"):
+ contracts.validate_raw_document(tampered, samples)
+
+ tampered = copy.deepcopy(raw)
+ tampered["measurement"]["rows"][0]["sample_histograms"]["roundtrip"][
+ "counts"
+ ] = [511]  # overwrite the roundtrip histogram counts
+ with self.assertRaisesRegex(contracts.ContractError, "sample_histograms"):
+ contracts.validate_raw_document(tampered, samples)
+
+ tampered = copy.deepcopy(raw)
+ tampered["measurement"]["rows"][0]["anomalies"] = [{  # hand-written anomaly record
+ "type": "roundtrip_gt_isolated_sum",
+ "T": 1,
+ "roundtrip_p99": 40.0,
+ "isolated_sum_p99": 30.0,
+ "ratio": 1.33,
+ "threshold": 3.0,
+ }]
+ tampered["outcome"]["validity"]["anomaly_free"] = False
+ with self.assertRaisesRegex(contracts.ContractError, "anomalies"):
+ contracts.validate_raw_document(tampered, samples)
+
+ anomalous_raw, anomalous_samples = copy.deepcopy((raw, samples))  # from here: build a raw/samples pair that validates with anomalies
+ sample_point = anomalous_samples["points"][0]
+ sample_point["components"]["roundtrip"]["trials"] = [
+ [100.0] * 8 for _ in range(64)  # 64 trials of 8 values, all 100.0
+ ]
+ sample_core = {
+ "components": sample_point["components"],
+ "tokens_per_rank": sample_point["tokens_per_rank"],
+ }
+ sample_sha = hashlib.sha256(  # digest over the edited components and tokens_per_rank
+ contracts.canonical_json_bytes(sample_core)
+ ).hexdigest()
+ point_id = sample_point["point_id"]
+ evidence_id = identity.evidence_id(  # evidence id re-derived from the new sample digest
+ point=point_id,
+ allocation=anomalous_raw["identity"]["allocation_id"],
+ attempt=anomalous_raw["identity"]["attempt_id"],
+ sample_sha256=sample_sha,
+ )
+ sample_point.update({"sample_sha256": sample_sha, "evidence_id": evidence_id})
+ changed = anomalous_raw["measurement"]["rows"][0]
+ changed["sample_sha256"] = sample_sha
+ changed["evidence_id"] = evidence_id
+ changed["components"]["roundtrip"]["percentiles_us"] = {
+ name: 100.0 for name in ("p50", "p90", "p95", "p99")
+ }
+ changed["token_rate_at_latency_percentile"] = {
+ name: 10_000.0 for name in ("p50", "p90", "p95", "p99")
+ }
+ changed["sample_histograms"]["roundtrip"] = contracts._expected_histogram(
+ [100.0] * 512  # 512 values = 64 trials x 8 values
+ )
+ changed["anomalies"] = contracts._expected_anomalies(1, changed["components"])
+ anomalous_raw["outcome"]["validity"]["anomaly_free"] = False
+ sample_bytes = contracts.canonical_json_bytes(anomalous_samples)
+ anomalous_raw["sample_artifact"].update({  # rebind artifact size and digest to the edited samples
+ "bytes": len(sample_bytes),
+ "sha256": hashlib.sha256(sample_bytes).hexdigest(),
+ })
+ self.assertIs(
+ contracts.validate_raw_document(anomalous_raw, anomalous_samples),
+ anomalous_raw,
+ )
+ changed["anomalies"] = []  # now drop the anomalies and claim anomaly_free
+ anomalous_raw["outcome"]["validity"]["anomaly_free"] = True
+ with self.assertRaisesRegex(contracts.ContractError, "anomalies"):
+ contracts.validate_raw_document(anomalous_raw, anomalous_samples)
+
+ def test_native_contract_rejects_every_schema_only_nested_mutation(self) -> None:  # nested missing/extra/invalid fields must fail schema and contract alike
+ raw, samples = _native_fixture()
+ self.assertIs(contracts.validate_raw_document(raw, samples), raw)
+
+ def locate(document: object, path: tuple[object, ...]) -> object:  # walk keys/indexes in order and return the nested value
+ value = document
+ for part in path:
+ value = value[part] # type: ignore[index]
+ return value
+
+ def reject_raw(document: dict) -> None:  # both the JSON schema and the contract validator must raise
+ with self.assertRaises(publisher.PublisherError):
+ publisher._schema("raw-case-v1.schema.json", document)
+ with self.assertRaises(contracts.ContractError):
+ contracts.validate_raw_document(document, samples)
+
+ required_fields = (  # (path to parent object, key inside it)
+ (("measurement", "rows", 0, "receive"), "total"),
+ (("measurement", "rows", 0, "routing"), "fanout_mean"),
+ (("measurement", "rows", 0, "routing", "source_token_stats"), "ranks"),
+ (("measurement", "rows", 0, "sample_histograms"), "roundtrip"),
+ (("measurement", "rows", 0, "sample_histograms", "roundtrip"), "n"),
+ (("runtime_fingerprint", "accelerator_runtime"), "kind"),
+ (("runtime_fingerprint", "collective_library"), "kind"),
+ (("runtime_fingerprint", "framework"), "kind"),
+ )
+ for path, required in required_fields:
+ with self.subTest(path=path, mutation="missing"):
+ broken = copy.deepcopy(raw)
+ del locate(broken, path)[required] # type: ignore[index]
+ reject_raw(broken)
+ with self.subTest(path=path, mutation="extra"):
+ broken = copy.deepcopy(raw)
+ locate(broken, path)["unexpected"] = None # type: ignore[index]
+ reject_raw(broken)
+
+ invalid_values = (  # (full path to a leaf, replacement value)
+ (("measurement", "rows", 0, "receive", "mean"), "one"),
+ (("measurement", "rows", 0, "routing", "fanout_mean"), "one"),
+ (("measurement", "rows", 0, "sample_histograms", "roundtrip", "bins"), 0),
+ (("provenance", "image", "arch"), "AMD64"),
+ (("runtime_fingerprint", "accelerator_runtime", "kind"), "rocm"),
+ )
+ for path, invalid in invalid_values:
+ with self.subTest(path=path, mutation="value"):
+ broken = copy.deepcopy(raw)
+ parent = locate(broken, path[:-1])  # container holding the leaf
+ parent[path[-1]] = invalid # type: ignore[index]
+ reject_raw(broken)
+
+ def reject_samples(document: dict) -> None:  # same double rejection for the samples document
+ with self.assertRaises(publisher.PublisherError):
+ publisher._schema("samples-v1.schema.json", document)
+ with self.assertRaises(contracts.ContractError):
+ contracts.validate_samples_document(document)
+
+ for path, required in (  # samples are only tested with the key removed
+ (("points", 0), "evidence_id"),
+ (("points", 0, "components"), "roundtrip"),
+ (("points", 0, "components", "roundtrip"), "trials"),
+ (("sampling",), "reduction"),
+ ):
+ with self.subTest(path=path, artifact="samples"):
+ broken = copy.deepcopy(samples)
+ del locate(broken, path)[required] # type: ignore[index]
+ reject_samples(broken)
+
+ def test_terminal_contract_and_schema_reject_the_same_shape_gaps(self) -> None:  # invalid field values must fail schema and contract alike
+ with tempfile.TemporaryDirectory() as temporary:
+ _, artifact = _unsupported_delivery(Path(temporary).resolve())
+ terminal = contracts.strict_load(next(artifact.glob("*.json")))
+ publisher._schema("terminal-outcome-v1.schema.json", terminal)  # called bare: must not raise
+ self.assertIs(contracts.validate_terminal_document(terminal), terminal)
+
+ def reject(document: dict) -> None:  # both validators must raise for the same document
+ with self.assertRaises(publisher.PublisherError):
+ publisher._schema("terminal-outcome-v1.schema.json", document)
+ with self.assertRaises(contracts.ContractError):
+ contracts.validate_terminal_document(document)
+
+ for path, invalid in (  # (key path, invalid replacement)
+ (("outcome", "failure_mode"), "Not Safe"),
+ (("outcome", "reason"), "x" * 241),  # 241-character string
+ (("provenance", "source"), "Not Safe"),
+ (("provenance", "git_run", "ref"), ""),  # empty string
+ ):
+ with self.subTest(path=path):
+ broken = copy.deepcopy(terminal)
+ parent = broken
+ for part in path[:-1]:  # descend to the object holding the last key
+ parent = parent[part]
+ parent[path[-1]] = invalid
+ reject(broken)
+
+ def test_invalid_retry_is_quarantined_before_valid_retry_upload(self) -> None:  # bad attempt a01 is quarantined, valid attempt a02 still validates
+ raw, samples = _native_fixture()
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ sample_bytes = contracts.canonical_json_bytes(samples)
+ bad = copy.deepcopy(raw)
+ bad["sample_artifact"].update({
+ "path": "a01.samples.json", "bytes": len(sample_bytes),
+ "sha256": hashlib.sha256(sample_bytes).hexdigest(),
+ })
+ bad["measurement"]["rows"][0]["token_rate_at_latency_percentile"]["p50"] *= 2  # the one tampered field in a01
+ (root / "a01.samples.json").write_bytes(sample_bytes)
+ (root / "a01.json").write_bytes(contracts.canonical_json_bytes(bad))
+ self.assertTrue(contracts.quarantine_invalid_attempt(root / "a01.json"))  # truthy return expected for the invalid attempt
+ valid = copy.deepcopy(raw)
+ valid["sample_artifact"].update({
+ "path": "a02.samples.json", "bytes": len(sample_bytes),
+ "sha256": hashlib.sha256(sample_bytes).hexdigest(),
+ })
+ (root / "a02.samples.json").write_bytes(sample_bytes)
+ (root / "a02.json").write_bytes(contracts.canonical_json_bytes(valid))
+ paths = sorted(str(path) for path in root.glob("*.json"))  # globbed after the quarantine call above
+ self.assertEqual(contracts.validate_attempt_paths(paths), 1)
+ self.assertTrue((root / "a01.json.quarantine").is_file())  # both a01 files end up with a .quarantine suffix
+ self.assertTrue((root / "a01.samples.json.quarantine").is_file())
+
+ def test_ingest_archives_first_and_publishes_latest_attempt(self) -> None:  # ingest result, store layout and permission bits
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ matrix, artifact = _unsupported_delivery(root)
+ self.assertEqual(len(summarize.load_results(str(artifact), None, None)), 1)  # the delivery holds one result
+ result = publisher.ingest_command(_args(root / "store", matrix, artifact))
+ store = publisher.Store(root / "store")
+ pointer = store.verify_channel("latest-attempt")
+ self.assertEqual(result["status"], "accepted")
+ self.assertEqual(pointer["dataset"]["sha256"], result["dataset_sha256"])  # channel pointer names the ingested dataset
+ self.assertTrue((store.incoming / result["incoming_id"] / "COMPLETE").is_file())
+ self.assertTrue((store.bundles / result["bundle_id"] / "COMPLETE").is_file())
+ self.assertFalse((store.channels / "dev-latest.json").exists())  # no dev-latest channel file is written
+ self.assertEqual(os.stat(store.private).st_mode & 0o777, 0o700)  # private root: owner only
+ self.assertEqual(os.stat(store.public).st_mode & 0o777, 0o755)
+ self.assertEqual(os.stat(store.bundles / result["bundle_id"]).st_mode & 0o777, 0o500)  # bundle dir: no write bit
+ dataset_dir = store.datasets / result["dataset_sha256"]
+ self.assertEqual(os.stat(dataset_dir).st_mode & 0o777, 0o555)  # dataset dir and file: no write bit
+ self.assertEqual(os.stat(dataset_dir / "dataset.json").st_mode & 0o777, 0o444)
+
+ def test_repeated_ingest_is_content_idempotent(self) -> None:  # ingesting the same args twice changes nothing
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ matrix, artifact = _unsupported_delivery(root)
+ args = _args(root / "store", matrix, artifact)
+ first = publisher.ingest_command(args)
+ store = publisher.Store(root / "store")
+ pointer_before = (store.channels / "latest-attempt.json").read_bytes()  # snapshot before the second ingest
+ second = publisher.ingest_command(args)
+ self.assertEqual(second, first)
+ self.assertEqual(
+ (store.channels / "latest-attempt.json").read_bytes(), pointer_before  # pointer bytes unchanged
+ )
+ self.assertEqual(len(list(store.incoming.iterdir())), 1)  # still exactly one entry in each store area
+ self.assertEqual(len(list(store.bundles.iterdir())), 1)
+ self.assertEqual(len(list(store.datasets.iterdir())), 1)
+ bundle = publisher.strict_load(
+ store.bundles / first["bundle_id"] / "bundle.json"
+ )
+ terminal = publisher.strict_load(next(artifact.glob("*.json")))
+ self.assertEqual(bundle["created_at"], terminal["generated_at"])  # bundle timestamp equals the terminal document's
+
+ def test_dataset_is_invariant_to_bundle_argument_order(self) -> None:  # build_dataset output must not depend on bundle id order
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ store_root = root / "store"
+ bundle_ids = []
+ for run_id in (9, 11, 10):  # run ids ingested out of numeric order
+ run = {**RUN, "run_id": str(run_id)}
+ delivery = root / f"run-{run_id}"
+ delivery.mkdir()
+ matrix, artifact = _unsupported_delivery(delivery, run=run)
+ result = publisher.ingest_command(
+ _args(store_root, matrix, artifact, run=run)
+ )
+ bundle_ids.append(result["bundle_id"])
+ datasets = [
+ publisher.build_dataset(
+ publisher.Store(store_root), order, promote=False,
+ )
+ for order in itertools.permutations(bundle_ids)  # every ordering of the three bundle ids
+ ]
+ self.assertTrue(all(dataset == datasets[0] for dataset in datasets[1:]))
+ self.assertEqual(datasets[0]["generated_at"], "2026-07-04T00:00:00Z")
+ selected = datasets[0]["coverage"][0]["selected_attempt_id"]
+ selected_attempt = next(  # look up the attempt record named by coverage
+ item for item in datasets[0]["attempts"]
+ if item["attempt_id"] == selected
+ )
+ self.assertEqual(selected_attempt["run_id"], "11")  # the selected attempt comes from run 11
+
+ def test_diagnostic_dataset_orders_reruns_by_run_attempt(self) -> None:  # with two run attempts, the second is selected
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ store_root = root / "store"
+ bundle_ids = []
+ for run_attempt in (1, 2):  # only run_attempt differs from RUN
+ run = {**RUN, "run_attempt": run_attempt}
+ delivery = root / f"attempt-{run_attempt}"
+ delivery.mkdir()
+ matrix, artifact = _unsupported_delivery(delivery, run=run)
+ result = publisher.ingest_command(
+ _args(store_root, matrix, artifact, run=run)
+ )
+ bundle_ids.append(result["bundle_id"])
+ dataset = publisher.build_dataset(
+ publisher.Store(store_root), bundle_ids, promote=False
+ )
+ selected_id = dataset["coverage"][0]["selected_attempt_id"]
+ selected = next(  # look up the attempt record named by coverage
+ item for item in dataset["attempts"]
+ if item["attempt_id"] == selected_id
+ )
+ self.assertEqual(selected["run_attempt"], 2)
+
+ def test_promotion_requires_every_runnable_case_to_succeed_in_every_bundle(self) -> None:  # runs on hand-built stub bundles, no store involved
+ cases = {  # case id -> stub case carrying only its disposition
+ "runnable": {"_disposition": "runnable"},
+ "planned-unsupported": {"_disposition": "unsupported"},
+ }
+ bundles = []
+ for _ in range(3):  # three identical stub bundles
+ runnable = {
+ "identity": {"case_id": "runnable"},
+ "outcome": {"status": "success"},
+ }
+ unsupported = {
+ "identity": {"case_id": "planned-unsupported"},
+ "outcome": {"status": "unsupported"},
+ }
+ bundles.append({
+ "selected": {"runnable": runnable, "planned-unsupported": unsupported},
+ "documents": {"runnable": runnable, "planned-unsupported": unsupported},
+ })
+ publisher._require_runnable_promotion_success(bundles, cases)  # baseline: must not raise
+
+ for status in ("failed", "invalid", "unsupported", "diagnostic"):
+ with self.subTest(status=status):
+ broken = copy.deepcopy(bundles)
+ broken[1]["selected"]["runnable"]["outcome"]["status"] = status  # only the second bundle is altered
+ with self.assertRaisesRegex(
+ publisher.PublisherError, "every runnable matrix case"
+ ):
+ publisher._require_runnable_promotion_success(broken, cases)
+
+ broken = copy.deepcopy(bundles)
+ broken[1]["documents"]["retry"] = {  # extra failed document for the runnable case, not selected
+ "identity": {"case_id": "runnable"},
+ "outcome": {"status": "failed"},
+ }
+ with self.assertRaisesRegex(publisher.PublisherError, "rejects runnable cases"):
+ publisher._require_runnable_promotion_success(broken, cases)
+
+ def test_promoted_public_dataset_rejects_failed_retry_history(self) -> None:  # a failed ordinal-1 attempt behind a successful ordinal-2 must be rejected
+ dataset = _promoted_dataset()
+ successful = next(  # first successful attempt in the fixture
+ item for item in dataset["attempts"]
+ if item["outcome"] == "success"
+ )
+ failed = copy.deepcopy(successful)
+ old_attempt_id = successful["attempt_id"]
+ successful["attempt_index"] = 2  # the success becomes ordinal 2 with a re-derived id
+ successful["attempt_id"] = identity.attempt_id(
+ allocation=successful["allocation_id"], case=successful["case_id"], ordinal=2
+ )
+ failed.update({  # the copy keeps the old ordinal-1 id but is turned into a failure
+ "attempt_id": old_attempt_id,
+ "attempt_index": 1,
+ "outcome": "failed",
+ "failure_mode": "execution",
+ "reason": "execution-failed",
+ "series_id": None,
+ "selected": False,
+ "evidence": [],
+ })
+ dataset["attempts"].append(failed)
+ dataset["attempts"].sort(key=lambda item: item["attempt_id"])
+ coverage = next(
+ item for item in dataset["coverage"]
+ if item["case_id"] == failed["case_id"]
+ )
+ coverage["attempt_ids"] = [  # swap the old id for the new ordinal-2 id
+ successful["attempt_id"] if value == old_attempt_id else value
+ for value in coverage["attempt_ids"]
+ ]
+ coverage["attempt_ids"].append(failed["attempt_id"])
+ coverage["attempt_ids"].sort()
+ if coverage["selected_attempt_id"] == old_attempt_id:
+ coverage["selected_attempt_id"] = successful["attempt_id"]
+
+ fixture_catalog = publisher._case_disposition_catalog_sha256(dataset["coverage"])  # NOTE(review): presumably patched in so the catalog check does not fire first - confirm
+ with mock.patch.object(
+ publisher, "CANONICAL_FULL_V1_CASE_CATALOG_SHA256", fixture_catalog
+ ), self.assertRaisesRegex(publisher.PublisherError, "rejects runnable cases"):
+ publisher.validate_public_dataset(dataset)
+
+ def test_unselected_success_does_not_reference_an_unpublished_series(self) -> None:  # series_id is exposed only on the selected attempt
+ raw, _ = _native_fixture()
+ retained = publisher._public_attempt(raw, selected=False)  # same raw document, only the selected flag differs
+ selected = publisher._public_attempt(raw, selected=True)
+ self.assertEqual(retained["outcome"], "success")
+ self.assertIsNone(retained["series_id"])
+ self.assertEqual(selected["series_id"], raw["identity"]["series_id"])
+
+ def test_public_dataset_selects_latest_derived_retry(self) -> None:  # two rejections: stale selection, then a non-derived retry id
+ dataset = _dataset()
+ first = dataset["attempts"][0]
+ second = copy.deepcopy(first)
+ second.update({  # ordinal-2 retry of the same allocation and case, left unselected
+ "attempt_id": identity.attempt_id(
+ allocation=first["allocation_id"], case=first["case_id"], ordinal=2
+ ),
+ "attempt_index": 2,
+ "selected": False,
+ "series_id": None,
+ "evidence": [],
+ })
+ dataset["attempts"].append(second)
+ dataset["attempts"].sort(key=lambda item: item["attempt_id"])
+ dataset["coverage"][0]["attempt_ids"].append(second["attempt_id"])
+ dataset["coverage"][0]["attempt_ids"].sort()
+ with self.assertRaisesRegex(publisher.PublisherError, "select the latest retry"):
+ publisher.validate_public_dataset(dataset)
+
+ second["attempt_id"] = identity.digest("attempt", {"not": "derived"})  # id no longer built by identity.attempt_id
+ dataset["attempts"].sort(key=lambda item: item["attempt_id"])
+ dataset["coverage"][0]["attempt_ids"] = [  # rebuilt from the re-sorted attempts
+ item["attempt_id"] for item in dataset["attempts"]
+ ]
+ with self.assertRaisesRegex(publisher.PublisherError, "retry identity differs"):
+ publisher.validate_public_dataset(dataset)
+
+ def test_promotion_requires_an_eligible_cohort_for_every_comparison_kind(self) -> None:  # dropping the decision-grade cohort of any required kind must raise
+ stable_fast, stable_fast_internal = _series(
+ "stable-fast", "deepep", decision_grade=True
+ )
+ stable_slow, stable_slow_internal = _series(
+ "stable-slow", "uccl", decision_grade=True
+ )
+ unstable_fast, unstable_fast_internal = _series(
+ "unstable-fast", "deepep", decision_grade=True
+ )
+ unstable_slow, unstable_slow_internal = _series(
+ "unstable-slow", "uccl", decision_grade=True
+ )
+ unstable_fast["phase"] = unstable_slow["phase"] = "prefill"  # the unstable pair is moved to the prefill phase
+ unstable_fast["series_id"] = identity.series_id({"test": "unstable-fast"})
+ unstable_slow["series_id"] = identity.series_id({"test": "unstable-slow"})
+ for statistic in ("p50", "p99"):  # run "1", key 8: half the latency and twice the payload rate of unstable_fast
+ unstable_slow_internal["run_metrics"]["1"][8]["latency_us"][statistic] = (
+ unstable_fast_internal["run_metrics"]["1"][8]["latency_us"][statistic]
+ / 2
+ )
+ unstable_slow_internal["run_metrics"]["1"][8]["logical_payload_rate_gbps_at_latency_percentile"][statistic] = (
+ unstable_fast_internal["run_metrics"]["1"][8]["logical_payload_rate_gbps_at_latency_percentile"][statistic]
+ * 2
+ )
+ series = [stable_fast, stable_slow, unstable_fast, unstable_slow]
+ internals = {  # series_id -> internal metrics for that series
+ stable_fast["series_id"]: stable_fast_internal,
+ stable_slow["series_id"]: stable_slow_internal,
+ unstable_fast["series_id"]: unstable_fast_internal,
+ unstable_slow["series_id"]: unstable_slow_internal,
+ }
+ cohorts, _, _, _ = publisher.build_decisions(series, internals)  # only the cohorts are used here
+ eligible = [item for item in cohorts if item["eligibility"]["decision_grade"]]
+ ineligible = [item for item in cohorts if not item["eligibility"]["decision_grade"]]
+ self.assertEqual({item["kind"] for item in eligible}, {"library"})  # every decision-grade cohort is a library cohort
+ self.assertTrue(ineligible)  # at least one cohort is not decision grade
+ anchor_series = [  # stub series: uniform, zipf, and zipf with EPLB
+ {
+ "series_id": name,
+ "workload": {"routing": routing, "eplb": eplb},
+ "build": {"implementation_contract_sha256": "1" * 64},
+ }
+ for name, routing, eplb in (
+ ("uniform", "uniform", False),
+ ("zipf", "zipf", False),
+ ("zipf-eplb", "zipf", True),
+ )
+ ]
+ required = eligible + [  # add one stub decision-grade cohort per non-library required kind
+ {
+ "kind": kind,
+ "eligibility": {"decision_grade": True},
+ **({"series_ids": [item["series_id"] for item in anchor_series]}
+ if kind == "routing" else {}),
+ }
+ for kind in publisher.REQUIRED_COHORT_KINDS
+ if kind != "library"
+ ]
+ with mock.patch.object(
+ publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", {}  # required counts replaced with an empty mapping
+ ), mock.patch.object(
+ publisher, "_expected_chip_cohort_count", return_value=1
+ ):
+ publisher._require_promotion_cohorts(
+ required + ineligible, anchor_series
+ )
+ for kind in publisher.REQUIRED_COHORT_KINDS:
+ with self.subTest(missing_kind=kind), self.assertRaisesRegex(
+ publisher.PublisherError, rf"cohort kinds:.*{kind}"
+ ):
+ publisher._require_promotion_cohorts([
+ item for item in required + ineligible
+ if item["kind"] != kind or not item["eligibility"]["decision_grade"]  # drops only decision-grade cohorts of this kind
+ ], anchor_series)
+
+ def test_promotion_requires_exact_counts_and_routing_anchors(self) -> None:  # three rejections: routing anchors, off-EPLB contract, cohort counts
+ dataset = _promoted_dataset()
+ counts = _cohort_counts(dataset)  # per-kind counts taken from the fixture itself
+ with mock.patch.object(
+ publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", counts
+ ):
+ publisher._require_promotion_cohorts(
+ dataset["cohorts"], dataset["series"]
+ )
+ routing = next(  # first routing cohort
+ item for item in dataset["cohorts"] if item["kind"] == "routing"
+ )
+ eplb = next(  # its series that has EPLB enabled
+ item for item in dataset["series"]
+ if item["series_id"] in routing["series_ids"]
+ and item["workload"]["eplb"]
+ )
+ eplb["workload"]["eplb"] = False  # the routing cohort loses its EPLB series
+ with self.assertRaisesRegex(publisher.PublisherError, "exact uniform"):
+ publisher._require_promotion_cohorts(
+ dataset["cohorts"], dataset["series"]
+ )
+
+ dataset = _promoted_dataset()  # fresh fixture for the second scenario
+ routing = next(item for item in dataset["cohorts"] if item["kind"] == "routing")
+ zipf = next(  # the zipf series with EPLB off in that cohort
+ item for item in dataset["series"]
+ if item["series_id"] in routing["series_ids"]
+ and item["workload"]["routing"] == "zipf"
+ and not item["workload"]["eplb"]
+ )
+ zipf["build"]["implementation_contract_sha256"] = "f" * 64  # overwrite its implementation contract digest
+ with mock.patch.object(
+ publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", counts
+ ), self.assertRaisesRegex(publisher.PublisherError, "identical off-EPLB"):
+ publisher._require_promotion_cohorts(dataset["cohorts"], dataset["series"])
+
+ wrong_counts = {**counts, "library": counts["library"] + 1}  # one more library cohort than the fixture has
+ with mock.patch.object(
+ publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", wrong_counts
+ ), self.assertRaisesRegex(publisher.PublisherError, "exactly"):
+ publisher._require_promotion_cohorts(
+ dataset["cohorts"], dataset["series"]
+ )
+
+ def test_promotion_requires_every_derived_chip_cohort_to_be_stable(self) -> None:  # a missing or non-decision-grade chip cohort must raise
+ dataset = _promoted_dataset()
+ chip = next(item for item in dataset["cohorts"] if item["kind"] == "chip")  # first chip cohort
+ self.assertEqual(
+ publisher._expected_chip_cohort_count(dataset["series"]),
+ sum(item["kind"] == "chip" for item in dataset["cohorts"]),  # number of chip cohorts in the fixture
+ )
+ with mock.patch.object(
+ publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", _cohort_counts(dataset)
+ ):
+ missing = [item for item in dataset["cohorts"] if item is not chip]  # every cohort except that one
+ with self.assertRaisesRegex(publisher.PublisherError, "derived chip cohorts"):
+ publisher._require_promotion_cohorts(missing, dataset["series"])
+
+ chip["eligibility"]["decision_grade"] = False  # cohort kept but marked not decision grade
+ with self.assertRaisesRegex(publisher.PublisherError, "derived chip cohorts"):
+ publisher._require_promotion_cohorts(
+ dataset["cohorts"], dataset["series"]
+ )
+
+ def test_promotion_rejects_more_than_three_bundles(self) -> None:  # both build_dataset and validate_public_dataset must reject extra bundles
+ bundles = {
+ str(run_id): {
+ "id": str(run_id), "cases": [],
+ "manifest": {
+ "matrix": {"sha256": publisher.CANONICAL_FULL_V1_MATRIX_SHA256},
+ "run": {"run_id": str(run_id), "run_attempt": 1},
+ },
+ }
+ for run_id in range(1, 5)  # four bundles keyed "1" through "4"
+ }
+ with mock.patch.object(
+ publisher, "load_bundle", side_effect=lambda _, bundle_id: bundles[bundle_id]  # serve bundles from the dict above instead of a store
+ ), self.assertRaisesRegex(publisher.PublisherError, "three independent"):
+ publisher.build_dataset(object(), list(bundles), promote=True)
+
+ dataset = _promoted_dataset()
+ dataset["source_bundle_ids"].append("d" * 64)  # add one extra source bundle ID to the promoted fixture
+ counts = _cohort_counts(dataset)
+ with mock.patch.object(
+ publisher,
+ "CANONICAL_FULL_V1_CASE_CATALOG_SHA256",
+ publisher._case_disposition_catalog_sha256(dataset["coverage"]),  # catalog digest computed from the fixture's coverage
+ ), mock.patch.object(
+ publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", counts
+ ), self.assertRaisesRegex(publisher.PublisherError, "complete coverage"):
+ publisher.validate_public_dataset(dataset)
+
+ def test_standalone_promotion_binds_matrix_and_requested_dispositions(self) -> None:  # validation ties a promoted dataset to catalog, matrix and dispositions
+ dataset = _promoted_dataset()
+ fixture_catalog = publisher._case_disposition_catalog_sha256(dataset["coverage"])
+ with self.assertRaisesRegex(  # with the constant unpatched, the fixture is expected to fail the catalog check
+ publisher.PublisherError, "canonical case/disposition catalog"
+ ):
+ publisher.validate_public_dataset(dataset)
+ with mock.patch.object(
+ publisher, "CANONICAL_FULL_V1_CASE_CATALOG_SHA256", fixture_catalog
+ ), mock.patch.object(
+ publisher,
+ "REQUIRED_PROMOTION_COHORT_COUNTS",
+ _cohort_counts(dataset),
+ ):
+ publisher.validate_public_dataset(dataset)  # must not raise once both constants match the fixture
+
+ diagnostic = copy.deepcopy(dataset)
+ item = diagnostic["series"][0]
+ item["status"] = "diagnostic"  # downgrade the first series to a diagnostic with an unstable p50
+ item["eligibility"].update({
+ "decision_grade": False,
+ "stable_p50": False,
+ "p50_max_min_ratio": 1.20,
+ "reasons": ["unstable-p50"],
+ })
+ with mock.patch.object(
+ publisher, "CANONICAL_FULL_V1_CASE_CATALOG_SHA256", fixture_catalog
+ ), mock.patch.object(
+ publisher,
+ "REQUIRED_PROMOTION_COHORT_COUNTS",
+ _cohort_counts(dataset),
+ ), self.assertRaisesRegex(
+ publisher.PublisherError, "unstable or incomplete required series"
+ ):
+ publisher.validate_public_dataset(diagnostic)
+
+ broken = copy.deepcopy(dataset)
+ broken["promotion"]["matrix_id"] = "d" * 64  # replace the promotion matrix ID with a placeholder digest
+ with self.assertRaisesRegex(publisher.PublisherError, "canonical full-v1 matrix"):
+ publisher.validate_public_dataset(broken)
+
+ for original, replacement in (("runnable", "unsupported"),
+ ("unsupported", "runnable")):
+ with self.subTest(original=original):
+ broken = copy.deepcopy(dataset)
+ item = next(
+ coverage for coverage in broken["coverage"]
+ if coverage["disposition"] == original
+ )
+ item["disposition"] = replacement  # flip the disposition of one coverage entry
+ with mock.patch.object(
+ publisher,
+ "CANONICAL_FULL_V1_CASE_CATALOG_SHA256",
+ publisher._case_disposition_catalog_sha256(broken["coverage"]),  # catalog digest recomputed from the altered coverage
+ ), self.assertRaisesRegex(
+ publisher.PublisherError, "requested dispositions"
+ ):
+ publisher.validate_public_dataset(broken)
+
+ def test_workflow_matrix_and_catalog_digests_do_not_drift(self) -> None:  # regenerate the full matrix and compare it with the publisher constants
+ with tempfile.TemporaryDirectory() as temporary:
+ matrix_path = Path(temporary) / "matrix_full.json"
+ result = subprocess.run(  # run sweep_matrix.py with the current interpreter
+ [
+ sys.executable, str(ROOT / "sweep_matrix.py"),
+ "--suites", "all", "--max-cases", "128",
+ "--backends", "all", "--out", str(matrix_path),
+ ],
+ text=True,
+ capture_output=True,
+ )
+ self.assertEqual(result.returncode, 0, result.stderr)  # stderr becomes the failure message
+ self.assertEqual(
+ hashlib.sha256(matrix_path.read_bytes()).hexdigest(),  # digest of the exact bytes written by the generator
+ publisher.CANONICAL_FULL_V1_MATRIX_SHA256,
+ )
+ matrix = contracts.strict_load(matrix_path)
+ coverage = [
+ {
+ "case_id": item["case"]["case_id"],
+ "disposition": item["disposition"],
+ }
+ for item in matrix["requested_cases"]
+ ]
+ self.assertEqual(
+ publisher._case_disposition_catalog_sha256(coverage),
+ publisher.CANONICAL_FULL_V1_CASE_CATALOG_SHA256,
+ )
+ self.assertEqual(
+ (
+ len(matrix["include"]), len(coverage),
+ sum(item["disposition"] == "runnable" for item in coverage),
+ sum(item["disposition"] == "unsupported" for item in coverage),
+ sum(
+ len(item["case"]["ladder"].split())  # number of whitespace-separated ladder entries
+ for item in matrix["requested_cases"]
+ ),
+ sum(
+ len(item["case"]["ladder"].split())
+ for item in matrix["requested_cases"]
+ if item["disposition"] == "runnable"
+ ),
+ sum(
+ len(item["case"]["ladder"].split())
+ for item in matrix["requested_cases"]
+ if item["disposition"] == "unsupported"
+ ),
+ ),
+ (58, 608, 364, 244, 1600, 940, 660),  # include, cases, runnable, unsupported, then ladder entries: all, runnable, unsupported
+ )
+ library: dict[tuple, set[str]] = {}  # (sku, shape, route) -> backends other than nccl-ep
+ system: dict[tuple, set[str]] = {}  # (shape, route) -> SKUs that run nccl-ep
+ routing: dict[tuple, list[tuple[str, bool]]] = {}  # (sku, backend, shape) -> (routing, eplb) routes
+ for requested in matrix["requested_cases"]:
+ if requested["disposition"] != "runnable":
+ continue
+ case = requested["case"]
+ shape = tuple(
+ case[field]
+ for field in (
+ "workload", "mode", "hidden", "topk", "experts", "ep", "phase"
+ )
+ )
+ route = (case["routing"], case["eplb"])
+ if case["backend"] != "nccl-ep":
+ library.setdefault((requested["sku"], shape, route), set()).add(
+ case["backend"]
+ )
+ else:
+ system.setdefault((shape, route), set()).add(requested["sku"])
+ routing.setdefault(
+ (requested["sku"], case["backend"], shape), []
+ ).append(route)
+ anchors = {("uniform", False), ("zipf", False), ("zipf", True)}  # the three routes a counted routing group must hold
+ self.assertEqual(
+ {
+ "library": sum(len(variants) >= 2 for variants in library.values()),  # groups with at least two backends
+ "system": sum(len(variants) >= 2 for variants in system.values()),  # groups with at least two SKUs
+ "routing": sum(
+ len(variants) == 3 and set(variants) == anchors
+ for variants in routing.values()
+ ),
+ },
+ publisher.REQUIRED_PROMOTION_COHORT_COUNTS,
+ )
+
+ def test_build_promotion_requires_canonical_full_matrix(self) -> None:  # three bundles with a non-canonical matrix digest must not promote
+ bundles = {
+ str(run_id): {
+ "id": str(run_id), "cases": [],
+ "manifest": {
+ "matrix": {"sha256": "d" * 64},  # placeholder digest; presumably differs from the canonical constant
+ "run": {"run_id": str(run_id), "run_attempt": 1},
+ },
+ }
+ for run_id in range(1, 4)  # three bundles keyed "1" through "3"
+ }
+ with mock.patch.object(
+ publisher, "load_bundle", side_effect=lambda _, bundle_id: bundles[bundle_id]  # serve bundles from the dict above instead of a store
+ ), self.assertRaisesRegex(publisher.PublisherError, "canonical full-v1 matrix"):
+ publisher.build_dataset(object(), list(bundles), promote=True)
+
+ def test_rejection_updates_latest_but_never_dev_latest(self) -> None:  # a rejected ingest moves latest-attempt only
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ matrix, artifact = _unsupported_delivery(root)
+ store = publisher.Store(root / "store")
+ sentinel = b"existing-promoted-pointer\n"
+ (store.channels / "dev-latest.json").write_bytes(sentinel)  # pre-existing pointer bytes that must survive the rejection
+ (artifact / "unknown.json").write_text('{"format":"unknown"}')  # extra member; the ingest below is expected to fail
+ with self.assertRaises(publisher.PublisherError):
+ publisher.ingest_command(_args(store.root, matrix, artifact))
+ self.assertEqual((store.channels / "dev-latest.json").read_bytes(), sentinel)
+ pointer = store.verify_channel("latest-attempt")
+ dataset = publisher.strict_load(store.public / pointer["dataset"]["path"])
+ self.assertEqual(dataset["promotion"]["status"], "quarantined")
+ self.assertTrue(any(store.quarantine.iterdir()))  # the quarantine directory must not be empty
+
+ def test_repeated_rejection_is_content_idempotent(self) -> None:  # re-ingesting the same rejected delivery must change nothing
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ matrix, artifact = _unsupported_delivery(root)
+ store = publisher.Store(root / "store")
+ (artifact / "unknown.json").write_text('{"format":"unknown"}')
+ with self.assertRaises(publisher.PublisherError):
+ publisher.ingest_command(_args(store.root, matrix, artifact))
+ pointer = (store.channels / "latest-attempt.json").read_bytes()  # pointer bytes after the first rejection
+ counts = tuple(  # entry counts after the first rejection
+ len(list(path.iterdir()))
+ for path in (store.incoming, store.quarantine, store.datasets)
+ )
+ with self.assertRaises(publisher.PublisherError):
+ publisher.ingest_command(_args(store.root, matrix, artifact))  # same inputs a second time
+ self.assertEqual((store.channels / "latest-attempt.json").read_bytes(), pointer)
+ self.assertEqual(
+ tuple(
+ len(list(path.iterdir()))
+ for path in (store.incoming, store.quarantine, store.datasets)
+ ),
+ counts,
+ )
+
+ def test_distinct_rejections_advance_latest_attempt(self) -> None:  # different rejected payloads must produce different pointers
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ matrix, artifact = _unsupported_delivery(root)
+ store = publisher.Store(root / "store")
+ unknown = artifact / "unknown.json"
+ unknown.write_text('{"format":"unknown-one"}')
+ with self.assertRaises(publisher.PublisherError):
+ publisher.ingest_command(_args(store.root, matrix, artifact))
+ first = (store.channels / "latest-attempt.json").read_bytes()
+ unknown.write_text('{"format":"unknown-two"}')  # change the payload before the second ingest
+ with self.assertRaises(publisher.PublisherError):
+ publisher.ingest_command(_args(store.root, matrix, artifact))
+ second = (store.channels / "latest-attempt.json").read_bytes()
+ self.assertNotEqual(second, first)
+ self.assertEqual(len(list(store.datasets.iterdir())), 2)  # one dataset entry per distinct rejection
+
+ def test_zip_traversal_is_rejected(self) -> None:  # extract_archive must refuse a member path containing ".."
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ archive = root / "bad.zip"
+ with zipfile.ZipFile(archive, "w") as handle:
+ handle.writestr("../escape.json", "{}")  # member name that points above the extraction directory
+ with self.assertRaisesRegex(publisher.PublisherError, "escapes"):
+ publisher.extract_archive(archive, root / "out")
+
+ def test_store_and_directory_archive_reject_symlinks(self) -> None:  # symlinked store parents and symlinked artifact members are refused
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ real = root / "real"
+ real.mkdir()
+ alias = root / "alias"
+ alias.symlink_to(real, target_is_directory=True)  # alias is a symlink to the real directory
+ with self.assertRaisesRegex(publisher.PublisherError, "symlinked parent"):
+ publisher.Store(alias / "store")
+ self.assertFalse((real / "store").exists())  # nothing may have been created through the symlink
+ artifact = root / f"cxunsupported-{RUN['run_id']}-{RUN['run_attempt']}"
+ artifact.mkdir()
+ target = root / "target.json"
+ target.write_text("{}")
+ (artifact / "linked.json").symlink_to(target)  # artifact member that is a symlink to a file outside the artifact
+ with self.assertRaisesRegex(publisher.PublisherError, "symlink"):
+ publisher._archive_download_directory(artifact, root / "artifact.zip")
+
+ def test_offline_caller_metadata_is_validated_before_store_creation(self) -> None:  # bad caller metadata must fail without creating the store
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ matrix, artifact = _unsupported_delivery(root)
+ store_root = root / "store"
+ args = _args(store_root, matrix, artifact)
+ args.run_id = "0"  # value expected to fail the run-id check
+ with self.assertRaisesRegex(publisher.PublisherError, "run-id"):
+ publisher.ingest_command(args)
+ self.assertFalse(store_root.exists())  # the store directory must not exist after the failure
+
+ promote = types.SimpleNamespace(
+ store_root=str(store_root), bundle=["not-a-digest"]
+ )
+ with self.assertRaisesRegex(publisher.PublisherError, "bundle IDs"):
+ publisher.promote_command(promote)
+ self.assertFalse(store_root.exists())
+ with self.assertRaisesRegex(publisher.PublisherError, "absolute path"):
+ publisher._store_from_args(types.SimpleNamespace(store_root="relative-store"))  # a relative store root is refused
+
+ def test_store_rejects_group_or_world_writable_root(self) -> None:  # Store must refuse a root directory with mode 0o772
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve() / "unsafe-store"
+ root.mkdir()
+ root.chmod(0o772)  # sets the group-write and other-write bits
+ with self.assertRaisesRegex(publisher.PublisherError, "group/world writable"):
+ publisher.Store(root)
+
+ def test_retry_ordinals_must_be_contiguous_from_one(self) -> None:  # ingest must reject a delivery built with ordinals (1, 3)
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ matrix, artifact = _unsupported_delivery(root, (1, 3))  # presumably retry ordinals with 2 skipped; confirm against _unsupported_delivery
+ with self.assertRaisesRegex(publisher.PublisherError, "contiguous ordinals"):
+ publisher.ingest_command(_args(root / "store", matrix, artifact))
+
+ def test_delivery_rejects_extra_archive_and_non_native_member(self) -> None:  # three rejection cases, each in its own temporary directory
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ matrix, artifact = _unsupported_delivery(root)
+ extra = root / f"cxshard-extra-{RUN['run_id']}-{RUN['run_attempt']}"
+ extra.mkdir()
+ (extra / "extra.json").write_text("{}")
+ args = _args(root / "store-extra", matrix, artifact)
+ args.artifact.append(str(extra))  # case 1: an additional artifact directory in the delivery
+ with self.assertRaisesRegex(publisher.PublisherError, "archive set"):
+ publisher.ingest_command(args)
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ matrix, artifact = _unsupported_delivery(root)
+ (artifact / "notes.txt").write_text("not native evidence")  # case 2: a stray text file inside the artifact
+ with self.assertRaisesRegex(publisher.PublisherError, "unconsumed"):
+ publisher.ingest_command(_args(root / "store-member", matrix, artifact))
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ matrix, artifact = _unsupported_delivery(root)
+ path = next(artifact.glob("*.json"))
+ terminal = json.loads(path.read_text())
+ terminal["outcome"]["reason"] = next(  # case 3: swap in a different reason from CAPABILITY_FAILURE_REASONS
+ reason for reason in contracts.CAPABILITY_FAILURE_REASONS
+ if reason != terminal["outcome"]["reason"]
+ )
+ path.write_text(json.dumps(terminal))
+ with self.assertRaisesRegex(publisher.PublisherError, "reason differs"):
+ publisher.ingest_command(_args(root / "store-reason", matrix, artifact))
+
+ def test_rates_invert_latency_and_global_tokens_use_ep_size(self) -> None:  # the valid fixture passes; each single mutation below must be rejected
+ dataset = _dataset()
+ publisher.validate_public_dataset(dataset)  # baseline: the untouched fixture must validate
+ rates = dataset["series"][0]["points"][0]["components"]["roundtrip"]["logical_payload_rate_gbps_at_latency_percentile"]
+ self.assertGreater(rates["p50"], rates["p99"])  # the rate at p50 latency must exceed the rate at p99 latency
+ broken = copy.deepcopy(dataset)
+ broken["series"][0]["points"][0]["global_tokens"] = 128  # value expected to trip the "EP size" check
+ with self.assertRaisesRegex(publisher.PublisherError, "EP size"):
+ publisher.validate_public_dataset(broken)
+ broken = copy.deepcopy(dataset)
+ broken["series"][0]["points"][0]["roundtrip_token_rate_at_latency_percentile"]["p99"] *= 2  # double the stored p99 token rate
+ with self.assertRaisesRegex(publisher.PublisherError, "token throughput"):
+ publisher.validate_public_dataset(broken)
+ broken = copy.deepcopy(dataset)
+ broken["attempts"][0]["evidence"][0]["point_id"] = identity.point_id(  # evidence now names a point ID built for tokens_per_rank=16
+ series=broken["series"][0]["series_id"], tokens_per_rank=16
+ )
+ with self.assertRaisesRegex(publisher.PublisherError, "point evidence"):
+ publisher.validate_public_dataset(broken)
+ broken = copy.deepcopy(dataset)
+ broken["attempts"][0]["series_id"] = None  # the first attempt no longer names a series
+ with self.assertRaisesRegex(publisher.PublisherError, "present exactly for selected success"):
+ publisher.validate_public_dataset(broken)
+ broken = copy.deepcopy(dataset)
+ component = broken["series"][0]["points"][0]["components"]["roundtrip"]
+ component["logical_bytes"] = None  # clear both the logical byte count and its rate
+ component["logical_payload_rate_gbps_at_latency_percentile"] = None
+ with self.assertRaisesRegex(publisher.PublisherError, "logical bandwidth is missing"):
+ publisher.validate_public_dataset(broken)
+
+ for mutate in (  # each lambda changes one model or workload field on the first series
+ lambda item: item.update({"model": "different-model"}),
+ lambda item: item["workload"].update({"hidden": 4096}),
+ lambda item: item["workload"].update({"top_k": 4}),
+ lambda item: item["workload"].update({"experts": 128}),
+ ):
+ broken = copy.deepcopy(dataset)
+ mutate(broken["series"][0])
+ with self.assertRaisesRegex(publisher.PublisherError, "frozen v1"):
+ publisher.validate_public_dataset(broken)
+
+ broken = copy.deepcopy(dataset)
+ broken["series"][0]["eplb"]["mapping_sha256"] = "f" * 64  # set a mapping digest on the first series' EPLB record
+ with self.assertRaisesRegex(publisher.PublisherError, "claims a plan"):
+ publisher.validate_public_dataset(broken)
+
+ broken = copy.deepcopy(dataset)
+ broken["series"][0]["backend"].update({  # relabel the first series as the nccl-ep reference backend with generation "rccl"
+ "id": "nccl-ep", "label": publisher.BACKEND_LABELS["nccl-ep"],
+ "role": "reference", "generation": "rccl",
+ })
+ broken["coverage"][0]["backend"] = "nccl-ep"
+ with self.assertRaisesRegex(publisher.PublisherError, "configuration"):
+ publisher.validate_public_dataset(broken)
+
+ def test_public_coverage_binds_exact_topology_and_case_identity(self) -> None:  # coverage entries must match their topology and case identity
+ dataset = _promoted_dataset()
+ dataset["promotion"]["status"] = "diagnostic"  # validate the promoted fixture under diagnostic status
+ self.assertEqual(
+ {item["disposition"] for item in dataset["coverage"]},
+ {"runnable", "unsupported"},
+ )
+ for item in dataset["coverage"]:
+ self.assertEqual(
+ tuple(item["topology"]), publisher.COVERAGE_TOPOLOGY_FIELDS  # key order of the topology mapping must match the constant
+ )
+ publisher.validate_public_dataset(dataset)
+
+ broken = copy.deepcopy(dataset)
+ unsupported = next(
+ item for item in broken["coverage"]
+ if item["disposition"] == "unsupported"
+ )
+ unsupported["topology"]["nodes"] = 2  # node count expected to disagree with the capability registry
+ with self.assertRaisesRegex(publisher.PublisherError, "capability registry"):
+ publisher.validate_public_dataset(broken)
+
+ broken = copy.deepcopy(dataset)
+ unsupported = next(
+ item for item in broken["coverage"]
+ if item["disposition"] == "unsupported"
+ )
+ unsupported["sku"] = "mi325x"  # move the entry to another SKU and give it that SKU's topology
+ topology = publisher.capability.topology_for("mi325x", 8)
+ self.assertIsNotNone(topology)
+ unsupported["topology"] = publisher._coverage_topology({
+ "ep_size": 8, **topology,
+ })
+ with self.assertRaisesRegex(publisher.PublisherError, "case identity"):
+ publisher.validate_public_dataset(broken)
+
+ def test_cohort_contract_and_labels_name_mode_explicitly(self) -> None:  # every cohort must list "mode" as controlled and show it in its label
+ dataset = _promoted_dataset()
+ dataset["promotion"]["status"] = "diagnostic"
+ publisher.validate_public_dataset(dataset)
+ for cohort in dataset["cohorts"]:
+ self.assertIn("mode", cohort["controlled_factors"])
+ self.assertIn("/ normal /", cohort["label"])
+
+ broken = copy.deepcopy(dataset)
+ cohort = broken["cohorts"][0]
+ cohort["controlled_factors"].remove("mode")
+ cohort["cohort_id"] = publisher._derived_id("cxcohort-v1-", {  # rebuild the ID from the altered factors before validating
+ "kind": cohort["kind"], "series_ids": cohort["series_ids"],
+ "controlled_factors": cohort["controlled_factors"],
+ "varying_factors": cohort["varying_factors"],
+ })
+ with self.assertRaisesRegex(publisher.PublisherError, "cohort factors"):
+ publisher.validate_public_dataset(broken)
+
+ def test_routing_and_eplb_facts_must_match_across_repeats(self) -> None:  # repeat values must be identical; EPLB descriptors are validated field by field
+ raw, _ = _native_fixture()
+ descriptor = publisher._eplb_descriptor(raw)
+ facts = publisher._routing_facts(raw["measurement"]["rows"][0])
+ self.assertEqual(  # two equal descriptors collapse to that descriptor
+ publisher._exact_repeat_value([descriptor, copy.deepcopy(descriptor)], "EPLB"),
+ descriptor,
+ )
+ self.assertEqual(
+ publisher._exact_repeat_value([facts, copy.deepcopy(facts)], "routing"),
+ facts,
+ )
+ changed = copy.deepcopy(facts)
+ changed["hotspot_ratio"] += 0.1  # make the second repeat differ
+ with self.assertRaisesRegex(publisher.PublisherError, "routing differs"):
+ publisher._exact_repeat_value([facts, changed], "routing")
+
+ dataset = _promoted_dataset()
+ dataset["promotion"]["status"] = "diagnostic"
+ eplb = next(item for item in dataset["series"] if item["eplb"]["enabled"])
+ eplb["points"][0]["routing"]["empty_expert_count"] = 280  # this value must still validate
+ publisher.validate_public_dataset(dataset)
+ eplb["points"][0]["routing"]["empty_expert_count"] = 288  # this value must be rejected
+ with self.assertRaisesRegex(publisher.PublisherError, "routing/load facts"):
+ publisher.validate_public_dataset(dataset)
+
+ for field, value in (  # each pair overwrites one EPLB descriptor field; some fields appear twice with different values
+ ("mapping_sha256", "0" * 64),
+ ("redundant_experts", 31),
+ ("replicated_experts", 1),
+ ("max_replicas", 2),
+ ("replicated_experts", 257),
+ ("max_replicas", 999),
+ ("imbalance_after", 0.4),
+ ("planner", "different-planner"),
+ ("reference_tokens_per_rank", 1024),
+ ):
+ broken = _promoted_dataset()
+ broken["promotion"]["status"] = "diagnostic"
+ descriptor = next(
+ item["eplb"] for item in broken["series"] if item["eplb"]["enabled"]
+ )
+ descriptor[field] = value
+ with self.subTest(eplb_field=field), self.assertRaisesRegex(
+ publisher.PublisherError, "EPLB descriptor"
+ ):
+ publisher.validate_public_dataset(broken)
+
+ def test_publisher_owns_stable_rankings_and_recommendations(self) -> None:  # build_decisions ranks library backends and keeps the reference out of them
+ fast, fast_internal = _series("fast", "deepep", decision_grade=True)
+ slow, slow_internal = _series("slow", "uccl", decision_grade=True)
+ reference, reference_internal = _series("reference", "nccl-ep", decision_grade=True)
+ reference_peer, reference_peer_internal = _series(
+ "reference-peer", "nccl-ep", decision_grade=True
+ )
+ reference["backend"]["role"] = "reference"
+ reference_peer["backend"]["role"] = "reference"
+ reference_peer["system"].update({"sku": "h200-dgxc", "label": "NVIDIA H200"})  # put the second reference series on a different SKU
+ cohorts, rankings, recommendations, _ = publisher.build_decisions(
+ [fast, slow, reference, reference_peer], {
+ fast["series_id"]: fast_internal,
+ slow["series_id"]: slow_internal,
+ reference["series_id"]: reference_internal,
+ reference_peer["series_id"]: reference_peer_internal,
+ }
+ )
+ library = next(item for item in cohorts if item["kind"] == "library")
+ ranking = next(item for item in rankings if item["cohort_id"] == library["cohort_id"]  # the library cohort's p99 latency ranking
+ and item["metric"]["measure"] == "latency_us"
+ and item["metric"]["statistic"] == "p99")
+ self.assertTrue(library["eligibility"]["decision_grade"])
+ self.assertEqual(ranking["entries"][0]["series_id"], fast["series_id"])  # the "fast" series must be ranked first
+ self.assertTrue(any(item["series_id"] == fast["series_id"] for item in recommendations))
+ self.assertFalse(any(  # the reference series must not appear in any library ranking
+ entry["series_id"] == reference["series_id"]
+ for item in rankings if item["cohort_id"] == library["cohort_id"]
+ for entry in item["entries"]
+ ))
+ self.assertTrue(any(  # the reference series must belong to a system cohort
+ item["kind"] == "system" and reference["series_id"] in item["series_ids"]
+ for item in cohorts
+ ))
+
+ def test_routing_evidence_is_experimental_and_not_a_configuration_recommendation(self) -> None:  # routing cohorts are ranked but never recommended
+ dataset = _promoted_dataset()
+ routing = next(item for item in dataset["cohorts"] if item["kind"] == "routing")
+ members = [
+ item for item in dataset["series"]
+ if item["series_id"] in routing["series_ids"]
+ ]
+ self.assertEqual(  # members cover exactly the three (routing, eplb) combinations
+ {(item["workload"]["routing"], item["workload"]["eplb"]) for item in members},
+ {("uniform", False), ("zipf", False), ("zipf", True)},
+ )
+ self.assertIn("implementation-static-build", routing["controlled_factors"])
+ self.assertIn("resource", routing["controlled_factors"])
+ self.assertEqual(
+ routing["varying_factors"],
+ ["workload.routing", "workload.eplb", "implementation-config"],
+ )
+ self.assertEqual(  # all members share one routing-control digest
+ len({item["build"]["routing_control_sha256"] for item in members}),
+ 1,
+ )
+ self.assertGreater(  # members have more than one implementation contract digest
+ len({item["build"]["implementation_contract_sha256"] for item in members}),
+ 1,
+ )
+ self.assertEqual(len({json.dumps(item["resource"], sort_keys=True) for item in members}), 1)  # identical resource records
+ self.assertEqual(routing["publication_tier"], "comparable-experimental")
+ self.assertTrue(any(  # the routing cohort has at least one ranking
+ item["cohort_id"] == routing["cohort_id"] for item in dataset["rankings"]
+ ))
+ self.assertFalse(any(  # the routing cohort has no recommendation
+ item["cohort_id"] == routing["cohort_id"] for item in dataset["recommendations"]
+ ))
+ self.assertTrue(all(
+ item["publication_tier"] == "official"
+ for item in dataset["recommendations"]
+ ))
+ self.assertFalse(any(  # no recommendation points at any comparable-experimental cohort
+ dataset_cohort["publication_tier"] == "comparable-experimental"
+ and item["cohort_id"] == dataset_cohort["cohort_id"]
+ for item in dataset["recommendations"]
+ for dataset_cohort in dataset["cohorts"]
+ ))
+ self.assertTrue(all(
+ item["publication_tier"] == "comparable-experimental"
+ for item in dataset["sensitivities"]
+ if item["cohort_id"] == routing["cohort_id"]
+ ))
+
+ def test_routing_implementation_mismatch_blocks_all_decisions(self) -> None:  # a contract mismatch leaves build_decisions with no decision outputs
+ dataset = _promoted_dataset()
+ published = next(item for item in dataset["cohorts"] if item["kind"] == "routing")
+ members = [
+ item for item in dataset["series"]
+ if item["series_id"] in published["series_ids"]
+ ]
+ zipf = next(
+ item for item in members
+ if item["workload"]["routing"] == "zipf" and not item["workload"]["eplb"]
+ )
+ zipf["build"]["implementation_contract_sha256"] = "f" * 64  # change the contract digest of the zipf, EPLB-off member
+ internals = {}  # series_id -> per-run metrics rebuilt from each member's first published point
+ for member in members:
+ point = member["points"][0]
+ roundtrip = point["components"]["roundtrip"]
+ metrics = {
+ "latency_us": {
+ name: roundtrip["latency_us"][name] for name in ("p50", "p99")
+ },
+ "logical_payload_rate_gbps_at_latency_percentile": {
+ name: roundtrip[
+ "logical_payload_rate_gbps_at_latency_percentile"
+ ][name]
+ for name in ("p50", "p99")
+ },
+ }
+ internals[member["series_id"]] = {
+ "run_metrics": {
+ str(run): {point["tokens_per_rank"]: metrics}  # the same metrics for each of the three runs
+ for run in range(3)
+ }
+ }
+ cohorts, rankings, recommendations, sensitivities = publisher.build_decisions(
+ members, internals
+ )
+ routing = next(item for item in cohorts if item["kind"] == "routing")
+ self.assertFalse(routing["eligibility"]["decision_grade"])
+ self.assertIn(
+ "implementation-config-mismatch", routing["eligibility"]["reasons"]
+ )
+ self.assertEqual((rankings, recommendations, sensitivities), ([], [], []))  # all three decision lists must be empty
+
+ def test_promoted_series_fields_are_bound_to_case_and_series_identities(self) -> None:  # changing a series field without its identity must fail validation
+ dataset = _promoted_dataset()
+ changed = copy.deepcopy(dataset)
+ series = next(
+ item for item in changed["series"]
+ if item["system"]["sku"] == "h100-dgxc"
+ )
+ series["system"].update({  # relabel an h100 series as h200
+ "sku": "h200-dgxc", "label": "NVIDIA H200",
+ "topology_class": "h200-nvlink-island",
+ })
+ for case_id in series["case_ids"]:  # update the matching coverage entries as well
+ coverage = next(
+ item for item in changed["coverage"] if item["case_id"] == case_id
+ )
+ coverage["sku"] = "h200-dgxc"
+ coverage["topology"] = publisher._coverage_topology(series["system"])
+ with self.assertRaisesRegex(publisher.PublisherError, "configuration|case identity"):
+ publisher.validate_public_dataset(changed)
+
+ for field, value in (  # each pair overwrites one build field on the first series
+ ("source_sha", "b" * 40),
+ ("image_digest", "sha256:" + "4" * 64),
+ ("squash_sha256", "5" * 64),
+ ("runtime_fingerprint_sha256", "6" * 64),
+ ("implementation_contract_sha256", "7" * 64),
+ ("public_config_sha256", "9" * 64),
+ ("routing_control_sha256", "8" * 64),
+ ):
+ changed = copy.deepcopy(dataset)
+ changed["series"][0]["build"][field] = value
+ with self.subTest(build_field=field), self.assertRaisesRegex(
+ publisher.PublisherError, "commit"
+ ):
+ publisher.validate_public_dataset(changed)
+ changed = copy.deepcopy(dataset)
+ changed["series"][0]["workload"]["workload_id"] = identity.workload_id(  # workload ID derived from unrelated factors
+ {"changed": True}
+ )
+ with self.assertRaisesRegex(publisher.PublisherError, "committed factors"):
+ publisher.validate_public_dataset(changed)
+
+ for mutate, message in (  # (mutation, expected error pattern) pairs applied to the first series
+ (lambda item: item["backend"].update({
+ "generation": "fabricated", "version": "fabricated-999",
+ }), "configuration"),
+ (lambda item: item["resource"].update({
+ "profile": "profile-fabricated", "configured_units": 99,
+ }), "configuration"),
+ (lambda item: item["system"].update({"label": "Fabricated H100"}), "commit"),
+ ):
+ changed = copy.deepcopy(dataset)
+ mutate(changed["series"][0])
+ with self.assertRaisesRegex(publisher.PublisherError, message):
+ publisher.validate_public_dataset(changed)
+
+ diagnostic = _dataset()  # the same binding must hold for the non-promoted fixture
+ diagnostic["series"][0]["build"]["source_sha"] = "b" * 40
+ with self.assertRaisesRegex(publisher.PublisherError, "committed factors"):
+ publisher.validate_public_dataset(diagnostic)
+
+ def test_all_decision_metrics_require_stable_repeat_ordering(self) -> None:  # an ordering flip in one run must remove decision grade
+ fast, fast_internal = _series("ordering-fast", "deepep", decision_grade=True)
+ slow, slow_internal = _series("ordering-slow", "uccl", decision_grade=True)
+ internals = {
+ fast["series_id"]: fast_internal,
+ slow["series_id"]: slow_internal,
+ }
+
+ cohorts, rankings, recommendations, _ = publisher.build_decisions(
+ [fast, slow], internals
+ )
+ library = next(item for item in cohorts if item["kind"] == "library")
+ self.assertTrue(library["eligibility"]["decision_grade"])
+ self.assertEqual(  # baseline: four rankings for the library cohort
+ len([item for item in rankings if item["cohort_id"] == library["cohort_id"]]),
+ 4,
+ )
+ self.assertEqual(  # baseline: four recommendations for the library cohort
+ len([
+ item for item in recommendations
+ if item["cohort_id"] == library["cohort_id"]
+ ]),
+ 4,
+ )
+
+ for statistic in ("p50", "p99"):  # in run "1" only, give the slow series twice the fast series' payload rate
+ slow_internal["run_metrics"]["1"][8]["logical_payload_rate_gbps_at_latency_percentile"][statistic] = (
+ fast_internal["run_metrics"]["1"][8]["logical_payload_rate_gbps_at_latency_percentile"][statistic] * 2
+ )
+ cohorts, rankings, recommendations, _ = publisher.build_decisions(
+ [fast, slow], internals
+ )
+ library = next(item for item in cohorts if item["kind"] == "library")
+ self.assertFalse(library["eligibility"]["decision_grade"])
+ self.assertIn("unstable-ordering", library["eligibility"]["reasons"])
+ self.assertFalse(any(  # no rankings remain for the library cohort
+ item["cohort_id"] == library["cohort_id"] for item in rankings
+ ))
+ self.assertFalse(any(  # no recommendations remain for the library cohort
+ item["cohort_id"] == library["cohort_id"] for item in recommendations
+ ))
+
+ def test_extra_eligibility_reason_blocks_decision_grade(self) -> None:  # one extra reason is enough to deny decision grade
+ allocations = [identity.allocation_id({"run": run}) for run in range(3)]  # three allocation IDs, one per run index
+ eligibility = publisher._eligibility_record(
+ allocations, complete=True, correct=True, measured=True,
+ stable_ordering=True, p50_ratio=1.01, p99_ratio=1.02,
+ extra_reasons=["incomplete-provenance"],  # every other input passed here is favourable
+ )
+ self.assertFalse(eligibility["decision_grade"])
+ self.assertEqual(eligibility["reasons"], ["incomplete-provenance"])
+ self.assertIs(publisher._eligibility(eligibility, "fixture"), eligibility)  # a consistent record is returned as the same object
+ broken = {**eligibility, "decision_grade": True}  # claims decision grade while still carrying a reason
+ with self.assertRaisesRegex(publisher.PublisherError, "promotion gates"):
+ publisher._eligibility(broken, "fixture")
+
+ def test_schema_is_strict_and_channel_target_must_be_complete(self) -> None:  # strict schema, size limit, then channel pointer and file-mode checks
+ dataset = _dataset()
+ dataset["unexpected"] = True  # an unknown top-level key must be rejected
+ with self.assertRaises(publisher.PublisherError):
+ publisher.validate_public_dataset(dataset)
+ with mock.patch.object(publisher, "MAX_PUBLIC_DATASET_BYTES", 1), self.assertRaisesRegex(  # a 1-byte limit forces the size check to fail
+ publisher.PublisherError, "serving size limit"
+ ):
+ publisher.validate_public_dataset(_dataset())
+ with tempfile.TemporaryDirectory() as temporary:
+ store = publisher.Store(Path(temporary).resolve())
+ dataset = _dataset()
+ digest, size = store.install_dataset(dataset)
+ store.update_channel("latest-attempt", digest, size, dataset["generated_at"])
+ self.assertEqual(store.verify_channel("latest-attempt")["dataset"]["sha256"], digest)
+ channel_path = store.channels / "latest-attempt.json"
+ pointer = publisher.strict_load(channel_path)
+ pointer["generated_at"] = "2099-01-01T00:00:00Z"  # rewrite the pointer file with a different timestamp
+ channel_path.write_bytes(contracts.canonical_json_bytes(pointer))
+ with self.assertRaisesRegex(publisher.PublisherError, "metadata differs"):
+ store.verify_channel("latest-attempt")
+ store.update_channel("latest-attempt", digest, size, dataset["generated_at"])  # rewrite the pointer with the original metadata
+ with self.assertRaisesRegex(publisher.PublisherError, "metadata differs"):
+ store.update_channel(
+ "latest-attempt", digest, size + 1, dataset["generated_at"]  # wrong size for the same digest
+ )
+ with self.assertRaisesRegex(publisher.PublisherError, "metadata differs"):
+ store.update_channel(
+ "latest-attempt", digest, size, "2026-07-05T00:00:00Z"  # a different timestamp string for the same digest
+ )
+ os.chmod(channel_path, 0o666)  # a pointer file with mode 0o666 must be rejected
+ with self.assertRaisesRegex(publisher.PublisherError, "regular 644"):
+ store.verify_channel("latest-attempt")
+ os.chmod(channel_path, 0o644)
+ dataset_dir = store.datasets / digest
+ os.chmod(dataset_dir, 0o755)  # a dataset directory with mode 0o755 must be rejected
+ with self.assertRaisesRegex(publisher.PublisherError, "mode differs"):
+ store.verify_channel("latest-attempt")
+ os.chmod(dataset_dir, 0o555)
+ os.chmod(dataset_dir / "dataset.json", 0o644)  # a dataset file with mode 0o644 must be rejected
+ with self.assertRaisesRegex(publisher.PublisherError, "mode differs"):
+ store.verify_channel("latest-attempt")
+ os.chmod(dataset_dir / "dataset.json", 0o444)
+ os.chmod(dataset_dir, 0o755)  # make the directory writable so COMPLETE can be removed
+ (dataset_dir / "COMPLETE").unlink()
+ os.chmod(dataset_dir, 0o555)
+ with self.assertRaisesRegex(publisher.PublisherError, "incomplete"):
+ store.verify_channel("latest-attempt")
+
+ def test_store_modes_do_not_depend_on_process_umask(self) -> None:  # store paths must have fixed modes even under umask 0o077
+ with tempfile.TemporaryDirectory() as temporary:
+ previous = os.umask(0o077)  # set a restrictive umask and remember the old one
+ try:
+ store = publisher.Store(Path(temporary).resolve())
+ dataset = _dataset()
+ digest, size = store.install_dataset(dataset)
+ store.update_channel(
+ "latest-attempt", digest, size, dataset["generated_at"]
+ )
+ with store.locked():  # enter and leave the lock; its file mode is checked below
+ pass
+ finally:
+ os.umask(previous)  # always restore the process umask
+ self.assertEqual(
+ store.root.stat().st_mode & 0o777,  # permission bits only
+ 0o750,
+ )
+ self.assertEqual(
+ (store.channels / "latest-attempt.json").stat().st_mode & 0o777,
+ 0o644,
+ )
+ self.assertEqual(
+ (store.datasets / digest / "dataset.json").stat().st_mode & 0o777,
+ 0o444,
+ )
+ self.assertEqual(
+ (store.locks / "publisher.lock").stat().st_mode & 0o777,
+ 0o600,
+ )
+
+ def test_verify_requires_bootstrap_but_dev_latest_is_optional(self) -> None:  # verify needs latest-attempt; dev-latest is checked only when present or requested
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ args = types.SimpleNamespace(
+ store_root=str(root / "store"), channel=None, bundle=[]
+ )
+ with self.assertRaises(publisher.PublisherError):  # verify must fail before any channel has been written
+ publisher.verify_command(args)
+ store = publisher.Store(args.store_root)
+ dataset = publisher._quarantine_dataset(
+ "awaiting-v1-runs", "2026-07-04T00:00:00Z"
+ )
+ digest, size = store.install_dataset(dataset)
+ store.update_channel(
+ "latest-attempt", digest, size, "2026-07-04T00:00:00Z"
+ )
+ result = publisher.verify_command(args)
+ self.assertEqual(set(result["channels"]), {"latest-attempt"})  # only latest-attempt is reported when dev-latest is absent
+ explicit = types.SimpleNamespace(
+ store_root=args.store_root, channel=["dev-latest"], bundle=[]
+ )
+ with self.assertRaises(publisher.PublisherError):  # explicitly requesting the missing dev-latest channel must fail
+ publisher.verify_command(explicit)
+ dev_pointer = copy.deepcopy(store.verify_channel("latest-attempt"))
+ dev_pointer["channel"] = "dev-latest"  # point dev-latest at the quarantine dataset installed above
+ (store.channels / "dev-latest.json").write_bytes(
+ contracts.canonical_json_bytes(dev_pointer)
+ )
+ with self.assertRaisesRegex(publisher.PublisherError, "non-promoted"):
+ publisher.verify_command(args)
+
+
+if __name__ == "__main__":  # run the tests when this file is executed directly
+ unittest.main()
diff --git a/experimental/CollectiveX/tests/test_sampling_contract.py b/experimental/CollectiveX/tests/test_sampling_contract.py
new file mode 100644
index 0000000000..50de2b1ef5
--- /dev/null
+++ b/experimental/CollectiveX/tests/test_sampling_contract.py
@@ -0,0 +1,3213 @@
+#!/usr/bin/env python3
+"""CPU-only behavioral tests for the CollectiveX v1 execution contract."""
+from __future__ import annotations
+
+import argparse
+import ast
+import copy
+import hashlib
+import io
+import json
+import os
+from pathlib import Path
+import re
+import stat
+import subprocess
+import sys
+import tarfile
+import tempfile
+import types
+import unittest
+from unittest import mock
+
+import numpy as np
+
+# HERE is the directory containing this test file and ROOT is its parent.
+# Both are prepended to sys.path so the project modules imported below
+# (hence the E402 suppressions) resolve from the source tree.
+HERE = Path(__file__).resolve().parent
+ROOT = HERE.parent
+sys.path[:0] = [str(ROOT), str(HERE)]
+
+import artifact_safety # noqa: E402
+import capability # noqa: E402
+import contracts # noqa: E402
+import eplb # noqa: E402
+import ep_harness # noqa: E402
+import identity # noqa: E402
+import run_ep # noqa: E402
+import source_archive # noqa: E402
+import summarize # noqa: E402
+import sweep_matrix # noqa: E402
+import workload # noqa: E402
+
+
+class SamplingContractTest(unittest.TestCase):
+ def test_identity_and_fixed_sampling_profile(self) -> None:
+ """Pin the identity test vector, the fixed-512 sampling constants, the v1
+ case-profile fields, and the harness CLI defaults."""
+ identity.verify_test_vector()
+ self.assertTrue(identity.is_typed_id(identity.IDENTITY_TEST_VECTOR["series_id"], "series"))
+ self.assertEqual(ep_harness.SAMPLING_CONTRACT, "fixed-512-v1")
+ # 8 timed iterations x 64 trials = 512 samples per point, with 32 warmup iterations.
+ self.assertEqual(
+ (
+ ep_harness.TIMED_ITERS_PER_TRIAL,
+ ep_harness.TRIALS_PER_POINT,
+ ep_harness.TIMED_SAMPLES_PER_POINT,
+ ep_harness.WARMUP_ITERS_PER_TRIAL,
+ ),
+ (8, 64, 512, 32),
+ )
+ self.assertEqual(identity.V1_CASE_PROFILE["activation_profile"], "canonical-counter-source-v3")
+ self.assertEqual(
+ identity.V1_CASE_PROFILE["activation_generator"],
+ "collectivex-activation-counter-v3",
+ )
+ self.assertEqual(identity.V1_CASE_PROFILE["sampling_contract"], "fixed-512-v1")
+ self.assertEqual(identity.V1_CASE_PROFILE["percentile_method"], "nearest-rank")
+ self.assertEqual(
+ identity.V1_CASE_PROFILE["rank_reduction"],
+ "cross-rank-max-per-iteration",
+ )
+ self.assertEqual(
+ identity.V1_CASE_PROFILE["oracle_contract"],
+ "expert-specific-transform-v1",
+ )
+ self.assertEqual(
+ set(identity.V1_CASE_PROFILES), {"normal", "low-latency"}
+ )
+ self.assertEqual(
+ identity.V1_LOW_LATENCY_CASE_PROFILE["payload_unit"], "token-expert"
+ )
+ # The normal and low-latency profiles must hash to different case digests.
+ self.assertNotEqual(
+ identity.digest("case", identity.V1_NORMAL_CASE_PROFILE),
+ identity.digest("case", identity.V1_LOW_LATENCY_CASE_PROFILE),
+ )
+ # Parse only the arguments supplied here; iters/trials/warmup are left to
+ # their defaults, which must equal the fixed (8, 64, 32) profile.
+ parser = argparse.ArgumentParser()
+ ep_harness.add_common_args(parser)
+ args = parser.parse_args(
+ [
+ "--runner", "test", "--topology-class", "test",
+ "--scope", "scale-up", "--scale-up-transport", "nvlink",
+ "--out", "result.json",
+ ]
+ )
+ self.assertEqual((args.iters, args.trials, args.warmup), (8, 64, 32))
+ # sampling_contract_error must return None for (8, 64, 32) and for no other profile tried.
+ for profile in ((8, 64, 32), (128, 4, 32), (8, 1, 4), (0, 64, 32)):
+ with self.subTest(profile=profile):
+ self.assertEqual(
+ ep_harness.sampling_contract_error(*profile) is None,
+ profile == (8, 64, 32),
+ )
+
+ def test_nearest_rank_percentiles_use_all_512_samples(self) -> None:
+ samples = list(range(1, 513))
+ self.assertEqual(ep_harness.percentile(samples, 50), 256)
+ self.assertEqual(ep_harness.percentile(samples, 99), 507)
+
+ def test_terminal_summary_uses_bound_sku_and_route(self) -> None:
+ terminal = {
+ "format": contracts.TERMINAL_FORMAT,
+ "case": {
+ "backend": "deepep", "phase": "prefill", "ep": 8,
+ "suite": "ep-routing-v1", "routing": "zipf", "eplb": True,
+ "required_publication": "comparable-experimental",
+ },
+ "identity": {"case_factors": {"sku": "h100-dgxc"}},
+ }
+ self.assertEqual(
+ summarize._identity(terminal),
+ (
+ "h100-dgxc", "ep-routing-v1", "zipf", "prefill", True,
+ "comparable-experimental", 8,
+ ),
+ )
+
+ def test_matrix_cases_and_shards_are_identity_bound(self) -> None:
+ """Validate the fully resolved sweep matrix: frozen totals, per-(sku, ep)
+ topologies, shard assignment, case-id derivation, and rejection of
+ tampered matrix and shard documents."""
+ matrix = sweep_matrix.validate_matrix_document(
+ sweep_matrix.resolve_matrix(backends="all")
+ )
+ # Index requested cases by case id and flatten the ids assigned to shards.
+ requested = {item["case"]["case_id"]: item for item in matrix["requested_cases"]}
+ assigned = [case_id for shard in matrix["include"] for case_id in shard["case_ids"]]
+ runnable = {
+ case_id for case_id, item in requested.items()
+ if item["disposition"] == "runnable"
+ }
+ runnable_cases = [
+ item for item in matrix["requested_cases"]
+ if item["disposition"] == "runnable"
+ ]
+ unsupported_cases = [
+ item for item in matrix["requested_cases"]
+ if item["disposition"] == "unsupported"
+ ]
+ # Pinned totals, in order: shards, requested cases, runnable cases, unsupported
+ # cases, then ladder points over all / runnable / unsupported cases.
+ self.assertEqual(
+ (
+ len(matrix["include"]),
+ len(matrix["requested_cases"]),
+ len(runnable_cases),
+ len(unsupported_cases),
+ sum(
+ len(item["case"]["ladder"].split())
+ for item in matrix["requested_cases"]
+ ),
+ sum(len(item["case"]["ladder"].split()) for item in runnable_cases),
+ sum(len(item["case"]["ladder"].split()) for item in unsupported_cases),
+ ),
+ (58, 608, 364, 244, 1600, 940, 660),
+ )
+ # Expected topology tuple for every (sku, ep) pair; the tuples are compared
+ # below against values read in sweep_matrix.TOPOLOGY_FIELDS order.
+ expected_topologies = {}
+ for sku, product in (
+ ("h100-dgxc", "h100"), ("h200-dgxc", "h200"),
+ ("b200-dgxc", "b200"), ("b300", "b300"),
+ ):
+ expected_topologies[sku, 8] = (
+ 1, 8, 8, "scale-up", "nvlink", None, "nvlink",
+ f"{product}-nvlink-island",
+ )
+ expected_topologies[sku, 16] = (
+ 2, 8, 8, "scale-out", "nvlink", "rdma", "nvlink-rdma",
+ f"{product}-nvlink-rdma",
+ )
+ for sku in ("gb200", "gb300"):
+ topology_class = f"{sku}-nvl72-mnnvl"
+ expected_topologies[sku, 8] = (
+ 2, 4, 72, "scale-up", "mnnvl", None, "mnnvl", topology_class,
+ )
+ expected_topologies[sku, 16] = (
+ 4, 4, 72, "scale-up", "mnnvl", None, "mnnvl", topology_class,
+ )
+ for sku in ("mi325x", "mi355x"):
+ expected_topologies[sku, 8] = (
+ 1, 8, 8, "scale-up", "xgmi", None, "xgmi", f"{sku}-xgmi",
+ )
+ expected_topologies[sku, 16] = (
+ 2, 8, 8, "scale-out", "xgmi", "rdma", "xgmi-rdma",
+ f"{sku}-xgmi-rdma",
+ )
+ # Collect the distinct topology tuples actually observed per (sku, ep).
+ topology_fields = sweep_matrix.TOPOLOGY_FIELDS
+ observed_topologies: dict[tuple[str, int], set[tuple[object, ...]]] = {}
+ for item in matrix["requested_cases"]:
+ case = item["case"]
+ observed_topologies.setdefault((item["sku"], case["ep"]), set()).add(
+ tuple(case[field] for field in topology_fields)
+ )
+ self.assertEqual(
+ {key: next(iter(values)) for key, values in observed_topologies.items()},
+ expected_topologies,
+ )
+ # Every (sku, ep) pair must map to exactly one topology.
+ self.assertTrue(all(len(values) == 1 for values in observed_topologies.values()))
+ # The capability platform registry must describe the same topologies.
+ self.assertEqual(
+ {
+ (sku, ep): tuple(topology[field] for field in topology_fields)
+ for sku, platform in capability.PLATFORMS.items()
+ for ep, topology in platform["topologies"].items()
+ },
+ expected_topologies,
+ )
+ # Shard sizes: every shard has n of 6 or 7, and exactly 16 shards have n == 7.
+ self.assertEqual(
+ {shard["n"] for shard in matrix["include"]}, {6, 7}
+ )
+ self.assertEqual(
+ sum(shard["n"] == 7 for shard in matrix["include"]), 16
+ )
+ # Low-latency cases: 32 in total, all sharing one suite, phase, routing and ladder.
+ ll_cases = [
+ item for item in matrix["requested_cases"]
+ if item["case"]["mode"] == "low-latency"
+ ]
+ self.assertEqual(len(ll_cases), 32)
+ self.assertTrue(all(
+ item["case"]["suite"] == "ep-low-latency-v1"
+ and item["case"]["backend"] in {"deepep", "uccl"}
+ and item["case"]["phase"] == "decode"
+ and item["case"]["routing"] == "uniform"
+ and not item["case"]["eplb"]
+ and item["case"]["ladder"] == "1 2 4 8 16 32 64 128"
+ for item in ll_cases
+ ))
+ # Each shard's topology fields must match the expectation for the EP degree
+ # of its first case.
+ for shard in matrix["include"]:
+ ep = next(
+ requested[case_id]["case"]["ep"] for case_id in shard["case_ids"]
+ )
+ self.assertEqual(
+ tuple(shard[field] for field in topology_fields),
+ expected_topologies[shard["sku"], ep],
+ )
+ # The routing suite uses a single ladder point per phase.
+ routing_points = {
+ phase: {
+ int(point)
+ for item in matrix["requested_cases"]
+ if item["case"]["suite"] == "ep-routing-v1"
+ and item["case"]["phase"] == phase
+ for point in item["case"]["ladder"].split()
+ }
+ for phase in ("decode", "prefill")
+ }
+ self.assertEqual(routing_points, {"decode": {128}, "prefill": {512}})
+ # The first len(skus) shards must list each SKU once, in sorted order.
+ skus = sorted({shard["sku"] for shard in matrix["include"]})
+ self.assertEqual(
+ [shard["sku"] for shard in matrix["include"][:len(skus)]],
+ skus,
+ )
+ # Assigned case ids are exactly the runnable ones, with no duplicates.
+ self.assertEqual(set(assigned), runnable)
+ self.assertEqual(len(assigned), len(set(assigned)))
+ self.assertEqual({item["case"]["ep"] for item in matrix["requested_cases"]}, {8, 16})
+ self.assertFalse(capability.resolve("gb200", "deepep", ep=8, nodes=1)[0])
+ excluded = {
+ "uccl": {"b200-dgxc", "b300"},
+ }
+ # NOTE(review): this loop rebinds `skus`, shadowing the sorted list built
+ # above; harmless because that list is not read again in this method.
+ for backend, skus in excluded.items():
+ for sku in skus:
+ with self.subTest(backend=backend, sku=sku):
+ self.assertFalse(capability.resolve(sku, backend)[0])
+ # Every case id must be re-derivable from the SKU, the profile selected for
+ # the case, and the case's remaining fields.
+ for case_id, item in requested.items():
+ case = {key: value for key, value in item["case"].items() if key != "case_id"}
+ self.assertEqual(
+ case_id,
+ identity.case_id(
+ sku=item["sku"], profile=identity.profile_for_case(case), case=case
+ ),
+ )
+ self.assertEqual(case["timing"], "8:64:32")
+ self.assertEqual(case["samples_per_point"], 512)
+
+ # A boolean schema_version must be rejected.
+ bad_matrix = copy.deepcopy(matrix)
+ bad_matrix["schema_version"] = True
+ with self.assertRaises(sweep_matrix.MatrixError):
+ sweep_matrix.validate_matrix_document(bad_matrix)
+
+ # Tamper with a runnable case's `hidden` field and re-derive a consistent
+ # case id everywhere; validation must still fail with a "frozen v1" error.
+ bad_catalog = copy.deepcopy(matrix)
+ wrapper = next(
+ item for item in bad_catalog["requested_cases"]
+ if item["disposition"] == "runnable"
+ )
+ old_id = wrapper["case"]["case_id"]
+ wrapper["case"]["hidden"] = 1
+ factors = {key: value for key, value in wrapper["case"].items() if key != "case_id"}
+ new_id = identity.case_id(
+ sku=wrapper["sku"], profile=identity.V1_CASE_PROFILE, case=factors
+ )
+ wrapper["case"]["case_id"] = new_id
+ for shard in bad_catalog["include"]:
+ shard["case_ids"] = [new_id if value == old_id else value for value in shard["case_ids"]]
+ with self.assertRaisesRegex(sweep_matrix.MatrixError, "frozen v1"):
+ sweep_matrix.validate_matrix_document(bad_catalog)
+
+ # Same tampering pattern on the `transport` field; the expected error
+ # mentions the "platform registry".
+ bad_topology = copy.deepcopy(matrix)
+ wrapper = next(
+ item for item in bad_topology["requested_cases"]
+ if item["disposition"] == "runnable"
+ )
+ old_id = wrapper["case"]["case_id"]
+ wrapper["case"]["transport"] = "incorrect-transport"
+ factors = {key: value for key, value in wrapper["case"].items() if key != "case_id"}
+ new_id = identity.case_id(
+ sku=wrapper["sku"], profile=identity.V1_CASE_PROFILE, case=factors
+ )
+ wrapper["case"]["case_id"] = new_id
+ for shard in bad_topology["include"]:
+ shard["case_ids"] = [new_id if value == old_id else value for value in shard["case_ids"]]
+ with self.assertRaisesRegex(sweep_matrix.MatrixError, "platform registry"):
+ sweep_matrix.validate_matrix_document(bad_topology)
+
+ # Build a shard control document from the first shard, but with a boolean
+ # schema_version; shard validation must reject it.
+ shard_meta = matrix["include"][0]
+ requested_cases = {item["case"]["case_id"]: item["case"] for item in matrix["requested_cases"]}
+ shard = {
+ "schema_version": True,
+ "id": shard_meta["id"],
+ "sku": shard_meta["sku"],
+ "backend": shard_meta["backend"],
+ "nodes": shard_meta["nodes"],
+ "n": shard_meta["n"],
+ "cases": [requested_cases[value] for value in shard_meta["case_ids"]],
+ }
+ with self.assertRaises(sweep_matrix.MatrixError):
+ sweep_matrix.validate_shard_control(
+ shard, sku=shard_meta["sku"], backend=shard_meta["backend"],
+ nodes=shard_meta["nodes"],
+ )
+
+ def test_matrix_yaml_and_config_validation_are_strict(self) -> None:
+ """Check that config validation exits on each listed malformed document and
+ that the YAML loader rejects duplicate keys."""
+ suites = sweep_matrix._load("suites.yaml")
+ workloads = sweep_matrix._load("workloads.yaml")
+ # Every suite must declare exactly the EP degrees (8, 16).
+ self.assertEqual(
+ {tuple(suite["ep_degrees"]) for suite in suites["suites"].values()},
+ {(8, 16)},
+ )
+ # Each entry is (label, mutation); the mutation receives deep copies of the
+ # suites and workloads documents and corrupts one of them in place.
+ invalid = (
+ ("unknown top", lambda s, _w: s.update({"typo": True})),
+ (
+ "unknown suite field",
+ lambda s, _w: s["suites"]["ep-core-v1"].update({"modes": ["normal"]}),
+ ),
+ (
+ "unknown workload field",
+ lambda _s, w: w["model_derived"]["deepseek-v3-v1"].update({"unused": 1}),
+ ),
+ (
+ "string phases",
+ lambda s, _w: s["suites"]["ep-core-v1"].update({"phases": "decode"}),
+ ),
+ (
+ "unknown routing",
+ lambda s, _w: s["suites"]["ep-core-v1"].update({"routings": ["random"]}),
+ ),
+ (
+ "integer EPLB",
+ lambda s, _w: s["suites"]["ep-routing-v1"].update({"eplb": [0, 1]}),
+ ),
+ (
+ "duplicate platform",
+ lambda s, _w: s["suites"]["ep-core-v1"]["platforms"].append("h100-dgxc"),
+ ),
+ (
+ "missing EP degrees",
+ lambda s, _w: s["suites"]["ep-core-v1"].pop("ep_degrees"),
+ ),
+ (
+ "non-v1 EP degrees",
+ lambda s, _w: s["suites"]["ep-core-v1"].update({"ep_degrees": [8]}),
+ ),
+ ("missing top field", lambda s, _w: s.pop("schema_version")),
+ (
+ "string dimension",
+ lambda _s, w: w["model_derived"]["deepseek-v3-v1"].update({"hidden": "7168"}),
+ ),
+ (
+ "unreachable phase ladder",
+ lambda s, _w: s["suites"]["ep-routing-v1"].update({"phases": ["prefill"]}),
+ ),
+ )
+ # Each mutated pair must make validate_config_documents raise SystemExit.
+ for label, mutate in invalid:
+ with self.subTest(label=label), self.assertRaises(SystemExit):
+ bad_suites, bad_workloads = copy.deepcopy(suites), copy.deepcopy(workloads)
+ mutate(bad_suites, bad_workloads)
+ sweep_matrix.validate_config_documents(bad_suites, bad_workloads)
+
+ # Point sweep_matrix.HERE at a temporary tree whose configs/duplicate.yaml
+ # repeats the key `same`; loading it must exit with a duplicate-key message.
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary)
+ (root / "configs").mkdir()
+ (root / "configs" / "duplicate.yaml").write_text(
+ "schema_version: 1\nsuites:\n same: 1\n same: 2\n"
+ )
+ with mock.patch.object(sweep_matrix, "HERE", root), self.assertRaisesRegex(
+ SystemExit, "duplicate YAML key"
+ ):
+ sweep_matrix._load("duplicate.yaml")
+
+ def test_semantically_duplicate_suite_points_are_rejected(self) -> None:
+ matrix = sweep_matrix.resolve_matrix()
+ with mock.patch.object(
+ sweep_matrix, "_semantic_points", return_value=["duplicate"]
+ ), self.assertRaisesRegex(
+ sweep_matrix.MatrixError, "duplicates a semantic token point"
+ ):
+ sweep_matrix.validate_matrix_document(matrix)
+
+ def test_only_three_shared_launchers_are_registered(self) -> None:
+ """Check that exactly three launcher scripts exist, that the platform
+ registry references only those, and that each script calls its setup
+ helpers in the required textual order."""
+ expected = {
+ "launch_single-slurm.sh",
+ "launch_gb-nv.sh",
+ "launch_mi-amds.sh",
+ }
+ self.assertEqual({path.name for path in (ROOT / "launchers").glob("launch_*.sh")}, expected)
+ self.assertEqual(
+ {platform["launcher"] for platform in capability.PLATFORMS.values()},
+ {"single-slurm", "gb-nv", "mi-amds"},
+ )
+ # The ordering checks below compare the first textual occurrence of each
+ # marker (str.index) in the launcher source; they do not execute the script.
+ for platform in capability.PLATFORMS.values():
+ launcher = ROOT / "launchers" / f"launch_{platform['launcher']}.sh"
+ self.assertTrue(launcher.is_file())
+ source = launcher.read_text()
+ self.assertNotIn("RUNNER_NAME", source)
+ self.assertIn("cx_preflight_allocation", source)
+ # Operator config must be loaded before the canonical environment is locked.
+ lock_environment = 'cx_lock_canonical_gha_env "$RUNNER"'
+ self.assertIn(lock_environment, source)
+ self.assertLess(
+ source.index("cx_load_operator_config"),
+ source.index(lock_environment),
+ )
+ # Shard control is validated before the stage path is computed, before the
+ # repository is staged, and before cx_require_vars appears.
+ validate = 'cx_validate_shard_control "$CX_DIR"'
+ stage = 'MOUNT_SRC="$(cx_stage_path '
+ self.assertIn(validate, source)
+ self.assertLess(source.index(validate), source.index(stage))
+ self.assertLess(source.index(stage), source.index('cx_stage_repo "$REPO_ROOT"'))
+ self.assertLess(source.index(validate), source.index("cx_require_vars"))
+ # For these two launchers, network validation sits between allocation
+ # and the allocation preflight.
+ if platform["launcher"] in {"single-slurm", "mi-amds"}:
+ network = "cx_validate_network_profile_on_job"
+ self.assertIn(network, source)
+ self.assertLess(source.index("cx_salloc_jobid"), source.index(network))
+ self.assertLess(source.index(network), source.index("cx_preflight_allocation"))
+ # single-slurm additionally validates the network before the local squash import.
+ if platform["launcher"] == "single-slurm":
+ self.assertLess(
+ source.index(network),
+ source.index("CX_ENROOT_LOCAL_IMPORT=1 cx_ensure_squash"),
+ )
+
+ # Neither the shared runtime nor the sweep workflow may reference RUNNER_NAME,
+ # and no flashinfer backend or adapter file may be present.
+ common = (ROOT / "runtime" / "common.sh").read_text()
+ workflow = (ROOT.parent.parent / ".github" / "workflows" / "collectivex-sweep.yml").read_text()
+ self.assertNotIn("RUNNER_NAME", common)
+ self.assertNotIn("RUNNER_NAME:", workflow)
+ self.assertNotIn("flashinfer", capability.BACKENDS)
+ self.assertFalse((HERE / "ep_flashinfer.py").exists())
+
+ def test_scaleout_network_profile_is_explicit_and_allowlisted(self) -> None:
+ """Drive cx_apply_network_profile from a bash script and check the variables
+ it sets or clears.
+
+ The script, in order: expects failure for nvlink-rdma when the socket
+ interface or RDMA devices are unset; expects the nvlink and mnnvl profiles
+ to unset stale NCCL/NVSHMEM variables; then expects nvlink-rdma to export
+ the listed socket, HCA, GID-index and service-level variables.
+ """
+ command = r'''
+ set -euo pipefail
+ source "$1"
+ ! (unset CX_SOCKET_IFNAME CX_RDMA_DEVICES; cx_apply_network_profile 2 nvlink-rdma)
+ ! (export CX_SOCKET_IFNAME=eth0; unset CX_RDMA_DEVICES; cx_apply_network_profile 2 nvlink-rdma)
+ export CX_SOCKET_IFNAME=ib0 CX_RDMA_DEVICES=mlx5_0:1,mlx5_1:1
+ export NCCL_NET=Socket NCCL_IB_HCA=stale NVSHMEM_HCA_LIST=stale
+ cx_apply_network_profile 1 nvlink
+ test -z "${NCCL_NET+x}${NCCL_IB_HCA+x}${NVSHMEM_HCA_LIST+x}"
+ cx_apply_network_profile 4 mnnvl
+ test -z "${NCCL_NET+x}${NCCL_IB_HCA+x}${NVSHMEM_HCA_LIST+x}"
+ export CX_IB_GID_INDEX=3 CX_RDMA_SERVICE_LEVEL=2
+ cx_apply_network_profile 2 nvlink-rdma
+ test "$NCCL_SOCKET_IFNAME:$GLOO_SOCKET_IFNAME:$UCCL_SOCKET_IFNAME" = ib0:ib0:ib0
+ test "$NCCL_NET:$NCCL_IB_HCA" = 'IB:=mlx5_0:1,mlx5_1:1'
+ test "$NVSHMEM_HCA_LIST" = mlx5_0:1,mlx5_1:1
+ test "$MORI_RDMA_DEVICES:$EP_NIC_NAME" = mlx5_0,mlx5_1:mlx5_0
+ test "$NCCL_IB_GID_INDEX:$NCCL_IB_SL" = 3:2
+ test "$NVSHMEM_IB_ENABLE_IBGDA:$NVSHMEM_IBGDA_NIC_HANDLER" = 1:gpu
+ '''
+ # common.sh is passed as $1; check=True turns any failed shell step into a test error.
+ subprocess.run(
+ ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh")],
+ check=True,
+ env={**os.environ, "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null"},
+ )
+
+ def test_network_profile_validation_is_private_and_all_node(self) -> None:
+ """Run cx_validate_network_profile_on_job against a fake srun and check the
+ srun arguments, the script sent on stdin, and that the configured interface
+ and device names never reach stdout or stderr."""
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ binary = root / "srun"
+ arguments = root / "arguments"
+ script = root / "script"
+ # Fake srun: record argv to CAPTURE_ARGS, stdin to CAPTURE_SCRIPT, then
+ # exit with SRUN_RC (default 0).
+ binary.write_text(
+ "#!/usr/bin/env bash\n"
+ "printf '%s\\n' \"$@\" > \"$CAPTURE_ARGS\"\n"
+ "cat > \"$CAPTURE_SCRIPT\"\n"
+ "exit \"${SRUN_RC:-0}\"\n"
+ )
+ binary.chmod(0o700)
+ command = (
+ 'source "$1"; export COLLECTIVEX_EXECUTION_ID="network-test-$$"; '
+ "trap 'cx_cleanup_private_logs 0' EXIT; "
+ 'cx_validate_network_profile_on_job 42 2 nvlink-rdma'
+ )
+ # The temporary directory is first on PATH so the fake srun is the one found.
+ environment = {
+ **os.environ,
+ "PATH": f"{root}:{os.environ['PATH']}",
+ "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null",
+ "CAPTURE_ARGS": str(arguments),
+ "CAPTURE_SCRIPT": str(script),
+ "CX_SOCKET_IFNAME": "privateif0",
+ "CX_RDMA_DEVICES": "privatehca0:1",
+ "CX_IB_GID_INDEX": "3",
+ }
+ result = subprocess.run(
+ ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh")],
+ text=True,
+ capture_output=True,
+ env=environment,
+ )
+ self.assertEqual(result.returncode, 0, result.stderr)
+ # Success path: srun is asked for both nodes, with stdin broadcast to all tasks.
+ invoked = arguments.read_text()
+ self.assertIn("--nodes=2", invoked)
+ self.assertIn("--ntasks=2", invoked)
+ self.assertIn("--input=all", invoked)
+ self.assertIn("CX_SOCKET_IFNAME,CX_RDMA_DEVICES,CX_IB_GID_INDEX", invoked)
+ self.assertIn('/sys/class/infiniband/$device/ports', script.read_text())
+ self.assertNotIn("privateif0", result.stdout + result.stderr)
+ self.assertNotIn("privatehca0", result.stdout + result.stderr)
+
+ # Failure path: a non-zero srun exit must fail the command, still without
+ # printing the private names.
+ failed = subprocess.run(
+ ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh")],
+ text=True,
+ capture_output=True,
+ env={**environment, "SRUN_RC": "9"},
+ )
+ self.assertNotEqual(failed.returncode, 0)
+ self.assertNotIn("privateif0", failed.stdout + failed.stderr)
+ self.assertNotIn("privatehca0", failed.stdout + failed.stderr)
+
+ # Single-node nvlink profile: the capture file is removed first and must
+ # not reappear, i.e. srun must not be invoked at all.
+ arguments.unlink()
+ subprocess.run(
+ [
+ "bash", "-c",
+ 'source "$1"; cx_validate_network_profile_on_job 42 1 nvlink',
+ "_", str(ROOT / "runtime" / "common.sh"),
+ ],
+ check=True,
+ env=environment,
+ )
+ self.assertFalse(arguments.exists())
+
+ def test_allocation_preflight_proves_shared_write_visibility(self) -> None:
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ mount = root / "mount"
+ runtime = mount / "experimental" / "CollectiveX" / "runtime"
+ runtime.mkdir(parents=True)
+ (runtime / "run_in_container.sh").write_text("#!/bin/sh\n")
+ squash = root / "image.sqsh"
+ squash.write_bytes(b"squash")
+ binary = root / "bin"
+ binary.mkdir()
+ (binary / "unsquashfs").write_text("#!/bin/sh\nexit 0\n")
+ (binary / "unsquashfs").chmod(0o700)
+ (binary / "srun").write_text(
+ "#!/usr/bin/env bash\n"
+ "set -euo pipefail\n"
+ "case \" $* \" in *' --input=all '*) ;; *) exit 97 ;; esac\n"
+ "worker=\"$FAKE_ROOT/worker.sh\"\n"
+ "cat > \"$worker\"\n"
+ "args=(\"$@\")\n"
+ "start=0\n"
+ "for ((i=0; i<${#args[@]}; i++)); do\n"
+ " [ \"${args[$i]}\" != -- ] || start=$((i + 1))\n"
+ "done\n"
+ "[ \"$start\" -gt 0 ]\n"
+ "worker_args=(\"${args[@]:$start}\")\n"
+ "probe=\"${worker_args[4]}\"\n"
+ "case \"${FAKE_MODE:-success}\" in\n"
+ " missing-source) rm -f -- \"$probe/source\" ;;\n"
+ " readonly) chmod 500 \"$probe\" ;;\n"
+ "esac\n"
+ "for ((node=0; node None:
+ runtime = (ROOT / "runtime" / "run_in_container.sh").read_text()
+ probe = runtime[runtime.index("cx_probe_deepep()"):
+ runtime.index("cx_activate_deepep_v2()")]
+ self.assertIn('expected_version="1.2.1"', probe)
+ self.assertIn('expected_version="1.1.0+814e508"', probe)
+ self.assertNotIn("pip install", probe)
+ self.assertNotIn("cx_fetch_revision", probe)
+ self.assertIn("Path(deep_ep.__file__).resolve() in recorded_files", probe)
+ self.assertIn("Path(buffer_module.__file__).resolve() in recorded_files", probe)
+
+ harness = (HERE / "ep_harness.py").read_text()
+ pass_one = harness[harness.index("# ---- Pass 1"):
+ harness.index("# ---- Pass 2")]
+ self.assertLess(
+ pass_one.index("input_snapshots[T] ="),
+ pass_one.index("oracle = _run_expert_oracle"),
+ )
+ self.assertIn("pre_input_unchanged", pass_one)
+ self.assertIn("hh = prep()\n torch.cuda.synchronize()", harness)
+
+ def test_squash_imports_are_reproducible_and_use_a_fresh_cache_key(self) -> None:
+ common = (ROOT / "runtime" / "common.sh").read_text()
+ amd = (ROOT / "launchers" / "launch_mi-amds.sh").read_text()
+ self.assertIn('CX_SQUASH_FORMAT_VERSION="repro-v1"', common)
+ self.assertIn("SOURCE_DATE_EPOCH=\"$CX_SQUASH_SOURCE_DATE_EPOCH\"", common)
+ self.assertIn("${COLLECTIVEX_IMAGE_DIGEST#sha256:}", common)
+ self.assertIn("cx_ensure_squash_on_job", amd)
+ self.assertIn('"${CX_LOCK_DIR:-}"', amd)
+ self.assertNotIn('"${CX_LOCK_DIR:-/tmp}"', amd)
+ self.assertIn('[ -n "$lock_dir" ] || lock_dir="$squash_dir/.locks"', common)
+ self.assertGreaterEqual(common.count("--chdir=/tmp"), 2)
+ self.assertGreaterEqual(amd.count("--chdir=/tmp"), 2)
+ self.assertIn('ENROOT_CACHE_PATH="$compute_home/enroot-cache"', common)
+ self.assertIn('ENROOT_RUNTIME_PATH="$compute_home/enroot-run"', common)
+ self.assertEqual(common.count('cx_reverify_registry_image "$image"'), 2)
+ result = subprocess.run(
+ [
+ "bash",
+ "-c",
+ f'source "{ROOT / "runtime" / "common.sh"}"; '
+ 'COLLECTIVEX_IMAGE_DIGEST="sha256:$(printf b%.0s {1..64})"; '
+ 'CX_IMAGE_PLATFORM=linux/amd64; cx_squash_path /cache repo/image:tag; '
+ 'printf "\\n"; CX_IMAGE_PLATFORM=linux/arm64; '
+ 'cx_squash_path /cache repo/image:tag',
+ ],
+ text=True,
+ capture_output=True,
+ )
+ self.assertEqual(result.returncode, 0, result.stderr)
+ digest = "b" * 64
+ self.assertEqual(
+ result.stdout.splitlines(),
+ [
+ f"/cache/repro-v1_{digest}_repo_image_tag.sqsh",
+ f"/cache/repro-v1_linux_arm64_{digest}_repo_image_tag.sqsh",
+ ],
+ )
+
+ def test_launchers_preserve_platform_specific_runtime_requirements(self) -> None:
+ """Textually check that each launcher and the shared runtime still contain
+ their platform-specific flags, and that cleanup steps appear in the
+ required order."""
+ single = (ROOT / "launchers" / "launch_single-slurm.sh").read_text()
+ gb = (ROOT / "launchers" / "launch_gb-nv.sh").read_text()
+ amd = (ROOT / "launchers" / "launch_mi-amds.sh").read_text()
+ common = (ROOT / "runtime" / "common.sh").read_text()
+ self.assertIn("ALLOC_EXTRA=(--mem=0)", single)
+ self.assertIn("ALLOC_EXTRA=(-N 1 --mem=0)", single)
+ self.assertIn("SRUN_EXTRA=(--mpi=none --container-remap-root)", single)
+ self.assertIn("CX_ENROOT_LOCAL_IMPORT=1", single)
+ self.assertIn('PRODUCT="${CX_SHARD_SKU:-${CX_GB_PRODUCT:-', gb)
+ self.assertIn("cx_ensure_squash_on_job", gb)
+ self.assertIn("--mem=0 --cpus-per-task=35", gb)
+ self.assertIn("--container-writable", gb)
+ self.assertIn("--container-remap-root", gb)
+ # Slice common.sh from the workload_args assignment up to the next
+ # workload_log assignment; that slice must not mention --workload.
+ workload_stage = common[
+ common.index("workload_args=("):
+ common.index("workload_log=", common.index("workload_args=("))
+ ]
+ self.assertNotIn("--workload", workload_stage)
+ self.assertIn("mi325x) CPUS_PER_TASK=256", amd)
+ self.assertIn("/dev/kfd:/dev/kfd,/dev/dri:/dev/dri", amd)
+ self.assertIn("--container-writable --container-remap-root", amd)
+ self.assertIn(
+ "CX_DISTRIBUTED_CONTAINER_ARGS=(--container-writable --container-remap-root)",
+ amd,
+ )
+ # Slices are delimited by the first occurrence of each function name in common.sh.
+ collect = common[common.index("cx_collect_results()"):
+ common.index("cx_cleanup_stage()")]
+ cleanup = common[common.index("cx_launcher_cleanup()"):
+ common.index("cx_install_launcher_fail_safe()")]
+ # Result collection must not clean the stage, and launcher cleanup must
+ # mention job cancellation before stage cleanup.
+ self.assertNotIn("cx_cleanup_stage", collect)
+ self.assertLess(cleanup.index("cx_cancel_job"), cleanup.index("cx_cleanup_stage"))
+ runtime = (ROOT / "runtime" / "run_in_container.sh").read_text()
+ self.assertIn('distribution.read_text("direct_url.json")', runtime)
+ # NOTE(review): the two 64-hex strings look like pinned SHA-256 digests;
+ # what they identify is not visible here - confirm against run_in_container.sh.
+ self.assertIn("6548e9c504a12b2471af4b7f4d9546321210a57a456b5dc55bd4a8dad0f932ac", runtime)
+ self.assertIn("2671cff7baf8c2c214ff4bac721af875d513130670bec57601998bd1aae82882", runtime)
+
+ def test_deferred_backend_provenance_resolves_before_measurement(self) -> None:
+ harness = (ROOT / "tests" / "ep_harness.py").read_text()
+ conditioning = harness.index("for wt in conditioning_ladder")
+ provenance = harness.index("# Setup may materialize deferred provenance")
+ measurement = harness.index("# ---- Pass 1: build each deterministic problem")
+ self.assertLess(conditioning, provenance)
+ self.assertLess(provenance, measurement)
+
+ def test_backend_specific_routing_contracts_are_explicit(self) -> None:
+ """Textually pin specific routing-related statements in the DeepEP hybrid
+ adapter, the MoRI adapter, and the shared harness."""
+ # DeepEP hybrid adapter source.
+ hybrid = (ROOT / "tests" / "ep_deepep_hybrid.py").read_text()
+ self.assertIn("self.domain_rank = int(self.buffer.local_rank)", hybrid)
+ self.assertIn(
+ "probability_columns = self.domain_rank * self.local_experts + local_expert_ids",
+ hybrid,
+ )
+ self.assertIn("h.recv_probs[:count][rows, probability_columns]", hybrid)
+
+ # MoRI adapter source: required statements, plus two that must be absent.
+ mori = (ROOT / "tests" / "ep_mori.py").read_text()
+ self.assertIn("topk_idx=indices", mori)
+ self.assertIn("indices=indices", mori)
+ self.assertIn(
+ "combine_indices = p.indices if self._async_ll else h.dispatch_indices",
+ mori,
+ )
+ self.assertIn("h.combine_input,\n None,\n combine_indices", mori)
+ self.assertIn('"use_external_inp_buf": self._external_input', mori)
+ self.assertIn("self.block_num = self._block_target = 64", mori)
+ self.assertIn('config_kwargs["block_num"] = self.block_num', mori)
+ self.assertIn(
+ 'config_kwargs["warp_num_per_block"] = self.dispatch_warps', mori
+ )
+ self.assertIn("count > tensor.size(0)", mori)
+ self.assertIn("return combined[:p.T]", mori)
+ self.assertNotIn("return combined\n", mori)
+ self.assertIn(
+ "raw_expert_ids < local_start + experts_per_rank",
+ mori,
+ )
+ self.assertNotIn("MoRI returned a non-local expert", mori)
+ # Shared harness source.
+ harness = (ROOT / "tests" / "ep_harness.py").read_text()
+ self.assertIn("problem.recv_tokens = backend.recv_tokens(handle)", harness)
+
+ def test_mori_masks_global_topk_metadata_to_the_local_rank(self) -> None:
+ """Run two helper functions extracted from ep_mori.py in isolation, then
+ check the resource profile projected for an AsyncLL MoRI configuration."""
+ path = HERE / "ep_mori.py"
+ tree = ast.parse(path.read_text(), str(path))
+ helper = next(
+ node
+ for node in tree.body
+ if isinstance(node, ast.FunctionDef) and node.name == "_project_local_metadata"
+ )
+ # Compile and execute only that function definition, taken from the parsed
+ # source, rather than importing the ep_mori module.
+ namespace: dict[str, object] = {}
+ exec(compile(ast.Module(body=[helper], type_ignores=[]), str(path), "exec"), namespace)
+ raw_ids = np.array([[0, 32, 63, -1], [64, 95, 7, 96]], dtype=np.int64)
+ raw_weights = np.arange(8, dtype=np.float32).reshape(2, 4)
+ # NumPy functions stand in for the torch functions the helper is given.
+ torch_module = types.SimpleNamespace(
+ where=np.where,
+ full_like=np.full_like,
+ zeros_like=np.zeros_like,
+ )
+ ids, weights, local_ids = namespace["_project_local_metadata"](
+ torch_module, raw_ids, raw_weights, 1, 32
+ )
+ # With the arguments (1, 32), only ids 32 and 63 are expected to survive;
+ # every other id becomes -1 and its weight becomes 0.
+ np.testing.assert_array_equal(
+ ids,
+ np.array([[-1, 32, 63, -1], [-1, -1, -1, -1]], dtype=np.int64),
+ )
+ np.testing.assert_array_equal(
+ weights,
+ np.array([[0, 1, 2, 0], [0, 0, 0, 0]], dtype=np.float32),
+ )
+ # The returned local ids must count one hit at index 0, one at 31, two in total.
+ counts = np.bincount(local_ids, minlength=32)
+ self.assertEqual((counts[0], counts[31], int(counts.sum())), (1, 1, 2))
+ commit_helper = next(
+ node for node in tree.body
+ if isinstance(node, ast.FunctionDef) and node.name == "_mori_source_commit"
+ )
+ # Build a fake checkout: <root>/python/mori/__init__.py plus <root>/.git/HEAD.
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary)
+ module = root / "python" / "mori" / "__init__.py"
+ module.parent.mkdir(parents=True)
+ module.touch()
+ git = root / ".git"
+ git.mkdir()
+ (git / "HEAD").write_text("a" * 40 + "\n")
+ # The helper runs with only Path, re and a stand-in `mori` module in scope.
+ commit_namespace = {
+ "Path": Path,
+ "re": re,
+ "mori": types.SimpleNamespace(__file__=str(module)),
+ }
+ exec(
+ compile(ast.Module(body=[commit_helper], type_ignores=[]), str(path), "exec"),
+ commit_namespace,
+ )
+ # A HEAD holding a 40-character hash is returned as the commit ...
+ self.assertEqual(commit_namespace["_mori_source_commit"](), "a" * 40)
+ # ... while a symbolic ref must raise an error mentioning "detached commit".
+ (git / "HEAD").write_text("ref: refs/heads/main\n")
+ with self.assertRaisesRegex(RuntimeError, "detached commit"):
+ commit_namespace["_mori_source_commit"]()
+
+ # For this AsyncLL configuration the projected profile must leave both
+ # comm_units_kind and configured_units as None.
+ profile = contracts.project_resource_profile(
+ {
+ "block_num": 64,
+ "device_cus": 304,
+ "kernel_type": "AsyncLL",
+ "tuned_source": "upstream-asyncll-64x8-external-input",
+ }
+ )
+ self.assertIsNone(profile["comm_units_kind"])
+ self.assertIsNone(profile["configured_units"])
+
+ def test_squash_identity_rehashes_instead_of_trusting_a_sidecar(self) -> None:
+ """Check that cx_export_squash_identity reports the SHA-256 of the image
+ bytes even when a newer .sha256 sidecar holds a different digest."""
+ with tempfile.TemporaryDirectory() as temporary:
+ image = Path(temporary) / "image.sqsh"
+ image.write_bytes(b"current squash bytes")
+ # Plant a sidecar with a wrong digest and make it 10 seconds newer than the image.
+ sidecar = Path(f"{image}.sha256")
+ sidecar.write_text("a" * 64)
+ os.utime(sidecar, (image.stat().st_mtime + 10, image.stat().st_mtime + 10))
+ # $1 is common.sh and $2 is the image; the script prints the exported digest.
+ result = subprocess.run(
+ [
+ "bash", "-c",
+ 'source "$1"; COLLECTIVEX_EXECUTION_ID="squash-hash-$$"; '
+ 'cx_export_squash_identity "$2"; cx_cleanup_private_logs 0; '
+ 'printf "%s" "$COLLECTIVEX_SQUASH_SHA256"',
+ "_", str(ROOT / "runtime" / "common.sh"), str(image),
+ ],
+ text=True,
+ capture_output=True,
+ )
+ self.assertEqual(result.returncode, 0, result.stderr)
+ self.assertEqual(result.stdout, hashlib.sha256(image.read_bytes()).hexdigest())
+
+ def _run_salloc_scenario(
+ self, salloc_body: str, squeue_body: str, *, cleanup: bool
+ ) -> dict[str, object]:
+ prefix = f"inferencex-collectivex-{os.getpid()}-1-"
+ with tempfile.TemporaryDirectory(prefix=prefix, dir="/tmp") as temporary:
+ root = Path(temporary)
+ command_dir = root / "bin"
+ repo = root / "repo"
+ command_dir.mkdir()
+ repo.mkdir()
+ paths = {
+ name: root / name
+ for name in ("arguments", "squeue-calls", "sleep-calls", "scancel-calls")
+ }
+ scripts = {
+ "salloc": (
+ "printf '%s\\n' \"$@\" > \"$CX_TEST_SALLOC_ARGUMENTS\"\n"
+ + salloc_body
+ ),
+ "squeue": (
+ "printf '%s\\n' \"$*\" >> \"$CX_TEST_SQUEUE_CALLS\"\n"
+ + squeue_body
+ ),
+ "sleep": "printf '%s\\n' \"$1\" >> \"$CX_TEST_SLEEP_CALLS\"\n",
+ "scancel": (
+ "printf '%s\\n' \"$*\" >> \"$CX_TEST_SCANCEL_CALLS\"\n"
+ ),
+ }
+ for name, body in scripts.items():
+ path = command_dir / name
+ path.write_text(f"#!/usr/bin/env bash\n{body}\n")
+ path.chmod(0o700)
+ execution_id = f"scheduler-{root.name}"
+ expected_name = "cx-" + hashlib.sha256(
+ execution_id.encode()
+ ).hexdigest()[:24]
+ command = r'''
+ source "$1"
+ JOB_ID=""
+ set +e
+ cx_salloc_jobid --partition=compute
+ run_rc=$?
+ set -e
+ printf '%s:%s:%s\n' \
+ "$run_rc" "$JOB_ID" "$CX_ALLOCATION_UNCERTAIN"
+ cx_cleanup_private_logs 0
+ if [ "$3" = cleanup ]; then
+ export CX_JOB_ROOT="$2" REPO_ROOT="$2/repo" MOUNT_SRC="$2/repo"
+ export COLLECTIVEX_CANONICAL_GHA=1
+ cx_write_cleanup_guard() {
+ rm -f -- "$CX_JOB_ROOT/cleanup-safe" "$CX_JOB_ROOT/cleanup-unsafe"
+ : > "$CX_JOB_ROOT/cleanup-$1"
+ }
+ unset CX_BENCH
+ cx_launcher_cleanup "$run_rc"
+ fi
+ exit "$run_rc"
+ '''
+ result = subprocess.run(
+ [
+ "bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh"),
+ str(root), "cleanup" if cleanup else "no-cleanup",
+ ],
+ text=True,
+ capture_output=True,
+ env={
+ **os.environ,
+ "PATH": f"{command_dir}:{os.environ['PATH']}",
+ "COLLECTIVEX_EXECUTION_ID": execution_id,
+ "CX_TEST_SALLOC_ARGUMENTS": str(paths["arguments"]),
+ "CX_TEST_SQUEUE_CALLS": str(paths["squeue-calls"]),
+ "CX_TEST_SLEEP_CALLS": str(paths["sleep-calls"]),
+ "CX_TEST_SCANCEL_CALLS": str(paths["scancel-calls"]),
+ },
+ )
+ return {
+ "result": result,
+ "job_name": expected_name,
+ "arguments": paths["arguments"].read_text().splitlines(),
+ "squeue_calls": (
+ paths["squeue-calls"].read_text().splitlines()
+ if paths["squeue-calls"].exists() else []
+ ),
+ "sleep_calls": (
+ paths["sleep-calls"].read_text().splitlines()
+ if paths["sleep-calls"].exists() else []
+ ),
+ "scancel_calls": (
+ paths["scancel-calls"].read_text().splitlines()
+ if paths["scancel-calls"].exists() else []
+ ),
+ "cleanup_safe": (root / "cleanup-safe").is_file(),
+ "cleanup_unsafe": (root / "cleanup-unsafe").is_file(),
+ }
+
+ def test_salloc_job_id_parser_uses_the_portable_grant_message(self) -> None:
+ # Checks that a job ID is recovered from a "Granted job allocation" line
+ # written to stderr. The two script bodies are presumably the salloc and
+ # squeue stubs (the helper's signature is outside this view -- TODO confirm).
+ scenario = self._run_salloc_scenario(
+ "printf 'salloc: Granted job allocation 4242\\n' >&2",
+ "exit 2",
+ cleanup=False,
+ )
+ result = scenario["result"]
+ self.assertIsInstance(result, subprocess.CompletedProcess)
+ self.assertEqual(result.returncode, 0, result.stderr)
+ # The helper's shell snippet prints "<run_rc>:<JOB_ID>:<CX_ALLOCATION_UNCERTAIN>".
+ self.assertEqual(
+ result.stdout, "0:4242:0\n"
+ )
+ # The recorded arguments file must hold exactly these three lines, in order.
+ self.assertEqual(
+ scenario["arguments"],
+ [
+ "--partition=compute",
+ f"--job-name={scenario['job_name']}",
+ "--no-shell",
+ ],
+ )
+ # No squeue call may have been recorded on this path.
+ self.assertEqual(scenario["squeue_calls"], [])
+
+ def test_salloc_verified_rejection_is_cleanup_safe(self) -> None:
+ # First stub exits 1 and the second exits 0 with no output; the cleanup
+ # branch of the helper script is enabled.
+ scenario = self._run_salloc_scenario("exit 1", "exit 0", cleanup=True)
+ result = scenario["result"]
+ # The helper script exits with run_rc and prints an empty job ID plus an
+ # uncertainty flag of 0.
+ self.assertEqual(result.returncode, 1)
+ self.assertEqual(result.stdout, "1::0\n")
+ # Exactly three squeue calls are expected, each filtered by the derived
+ # job name and by the current user as reported by `id -un`.
+ self.assertEqual(len(scenario["squeue_calls"]), 3)
+ scheduler_user = subprocess.check_output(["id", "-un"], text=True).strip()
+ self.assertTrue(all(
+ f"--name={scenario['job_name']}" in call
+ and f"--user={scheduler_user}" in call
+ for call in scenario["squeue_calls"]
+ ))
+ # Two sleep calls are expected, with arguments "1" then "2".
+ self.assertEqual(scenario["sleep_calls"], ["1", "2"])
+ # Only the cleanup-safe marker file may exist under the scenario root.
+ self.assertTrue(scenario["cleanup_safe"])
+ self.assertFalse(scenario["cleanup_unsafe"])
+
+ def test_salloc_recovers_and_cancels_one_matching_allocation(self) -> None:
+ # The second stub prints job 5151 for any call containing " --name=",
+ # exits 0 for a call containing " -j 5151 ", and exits 2 for anything else.
+ scenario = self._run_salloc_scenario(
+ "exit 1",
+ r'''
+ case " $* " in
+ *" --name="*) printf '5151\n' ;;
+ *" -j 5151 "*) exit 0 ;;
+ *) exit 2 ;;
+ esac
+ ''',
+ cleanup=True,
+ )
+ result = scenario["result"]
+ # The failing return code is kept, but the printed JOB_ID is now 5151 and
+ # the uncertainty flag is 0.
+ self.assertEqual(result.returncode, 1)
+ self.assertEqual(result.stdout, "1:5151:0\n")
+ # Exactly one scancel call, for the recovered job, must be recorded.
+ self.assertEqual(scenario["scancel_calls"], ["5151"])
+ self.assertTrue(scenario["cleanup_safe"])
+ self.assertFalse(scenario["cleanup_unsafe"])
+
+ def test_salloc_ambiguous_lookup_remains_cleanup_unsafe(self) -> None:
+ # The second stub prints two job IDs (5151 and 5152), so no single
+ # allocation can be selected.
+ scenario = self._run_salloc_scenario(
+ "exit 1", "printf '5151\\n5152\\n'", cleanup=True
+ )
+ result = scenario["result"]
+ self.assertEqual(result.returncode, 1)
+ # Expected output: empty JOB_ID and an uncertainty flag of 1.
+ self.assertEqual(result.stdout, "1::1\n")
+ # Nothing may be cancelled, and only the cleanup-unsafe marker may exist.
+ self.assertEqual(scenario["scancel_calls"], [])
+ self.assertFalse(scenario["cleanup_safe"])
+ self.assertTrue(scenario["cleanup_unsafe"])
+
+ def test_salloc_query_failure_and_interruption_remain_cleanup_unsafe(self) -> None:
+ # Case 1: the second stub exits 2. Exactly one squeue call is expected
+ # and only the cleanup-unsafe marker may be written.
+ query_failure = self._run_salloc_scenario("exit 1", "exit 2", cleanup=True)
+ self.assertEqual(query_failure["result"].returncode, 1)
+ self.assertEqual(len(query_failure["squeue_calls"]), 1)
+ self.assertFalse(query_failure["cleanup_safe"])
+ self.assertTrue(query_failure["cleanup_unsafe"])
+
+ # Case 2: the first stub exits 130. The scenario is still expected to
+ # return 1, record no squeue call, and write only the cleanup-unsafe marker.
+ interrupted = self._run_salloc_scenario("exit 130", "exit 0", cleanup=True)
+ self.assertEqual(interrupted["result"].returncode, 1)
+ self.assertEqual(interrupted["squeue_calls"], [])
+ self.assertFalse(interrupted["cleanup_safe"])
+ self.assertTrue(interrupted["cleanup_unsafe"])
+
+ def test_allocation_cleanup_fails_closed_when_scheduler_queries_fail(self) -> None:
+ # Part 1 (behavioural): run cx_cancel_job from runtime/common.sh against
+ # stub scheduler commands where scancel and sleep exit 0 but squeue exits 2.
+ with tempfile.TemporaryDirectory() as temporary:
+ directory = Path(temporary)
+ for name, body in {
+ "scancel": "exit 0",
+ "squeue": "exit 2",
+ "sleep": "exit 0",
+ }.items():
+ command = directory / name
+ command.write_text(f"#!/usr/bin/env bash\n{body}\n")
+ command.chmod(0o700)
+ # The stub directory is placed first on PATH so the stubs are found
+ # before any real scheduler commands.
+ result = subprocess.run(
+ [
+ "bash", "-c",
+ 'source "$1"; cx_cancel_job 4242',
+ "_", str(ROOT / "runtime" / "common.sh"),
+ ],
+ text=True,
+ capture_output=True,
+ env={**os.environ, "PATH": f"{directory}:{os.environ['PATH']}"},
+ )
+ # The cancel must fail and report "did not terminate" on stderr.
+ self.assertNotEqual(result.returncode, 0)
+ self.assertIn("did not terminate", result.stderr)
+
+ # Part 2 (static): substring and ordering checks on the text of the sweep
+ # workflow file; the workflow itself is not executed.
+ workflow = (ROOT.parent.parent / ".github" / "workflows" / "collectivex-sweep.yml").read_text()
+ self.assertIn("cleanup-unsafe", workflow)
+ self.assertIn("cleanup-safe", workflow)
+ self.assertIn("Confirm allocation cleanup", workflow)
+ self.assertIn("Prepare pinned backend source archive", workflow)
+ self.assertIn("Install pinned backend source seed", workflow)
+ self.assertIn("CX_BACKEND_SOURCE_SEED_ROOT", workflow)
+ self.assertIn("steps.gen.outputs.source_backends", workflow)
+ self.assertIn('python3 "$destination/source_archive.py"', workflow)
+ # Everything from the artifact-safety step onward must mention a
+ # successful allocation_cleanup outcome.
+ artifact_validation = workflow[workflow.index("- name: Validate shard artifact safety"):]
+ self.assertIn("steps.allocation_cleanup.outcome == 'success'", artifact_validation)
+ # From the " sweep:" marker onward, the text must not mention the
+ # GitHub/runner workspace variables and must set CX_SOURCE_ROOT under /tmp.
+ sweep_workflow = workflow[workflow.index(" sweep:"):]
+ self.assertNotIn("GITHUB_WORKSPACE", sweep_workflow)
+ self.assertNotIn("RUNNER_WORKSPACE", sweep_workflow)
+ self.assertIn('CX_SOURCE_ROOT: /tmp/inferencex-collectivex-', sweep_workflow)
+ # source_step is the sweep text before the first download-artifact action.
+ source_step = sweep_workflow[:sweep_workflow.index("- uses: actions/download-artifact")]
+ self.assertNotIn("unsafe_guards=", source_step)
+ self.assertIn("cutoff = time.time() - 86400", source_step)
+ self.assertIn("stat.S_IMODE(metadata.st_mode) != 0o700", source_step)
+ self.assertIn('for marker_name in ("cleanup-safe", "cleanup-unsafe")', source_step)
+ self.assertIn("stat.S_IMODE(marker.st_mode) == 0o600", source_step)
+ self.assertIn("shutil.rmtree(entry.path)", source_step)
+ # 'rev-parse HEAD' must appear textually before "echo 'prepared=true'".
+ self.assertLess(
+ source_step.index('rev-parse HEAD'),
+ source_step.index("echo 'prepared=true'"),
+ )
+ # The staging and upload steps must carry ids, and the stage outcome must
+ # be referenced somewhere from the staging step onward.
+ upload = workflow[workflow.index("- name: Stage shard artifact"):]
+ self.assertIn("id: stage_artifact", upload)
+ self.assertIn("id: upload_artifact", upload)
+ self.assertIn("steps.stage_artifact.outcome == 'success'", upload)
+ # The workspace-cleanup section must reference every listed step outcome.
+ cleanup = workflow[workflow.index("- name: Cleanup isolated workspace"):]
+ for step in (
+ "sweep_shard", "allocation_cleanup", "artifact_safety",
+ "delivery_contracts", "stage_artifact", "upload_artifact",
+ ):
+ self.assertIn(f"steps.{step}.outcome", cleanup)
+ # The cleanup-safe test must appear textually before the rm -rf of
+ # $CX_JOB_ROOT.
+ self.assertLess(
+ cleanup.index('cleanup-safe" ]'),
+ cleanup.index('rm -rf -- "$CX_JOB_ROOT"'),
+ )
+
+ def test_v1_publication_requires_explicit_release_markers(self) -> None:
+ # Static check: both workflow files are read as text and searched for
+ # required substrings; neither workflow is executed.
+ workflows = ROOT.parent.parent / ".github" / "workflows"
+ sweep = (workflows / "collectivex-sweep.yml").read_text()
+ publish = (workflows / "collectivex-publish.yml").read_text()
+
+ # Sweep workflow: release_tag input text, the v1 gate strings, and the
+ # per-run release artifact name.
+ self.assertIn("release_tag:", sweep)
+ self.assertIn("default: unversioned", sweep)
+ self.assertIn("options: [unversioned, v1]", sweep)
+ self.assertIn("inputs.release_tag == 'v1'", sweep)
+ self.assertIn("collectivex.release-tag.v1", sweep)
+ self.assertIn("V1 release tag requires the locked full matrix", sweep)
+ self.assertIn("EXPECTED_MATRIX_SHA256", sweep)
+ self.assertIn("cxrelease-v1-${{ github.run_id }}-${{ github.run_attempt }}", sweep)
+
+ # Publish workflow: rejection messages, source-SHA checkout, publication
+ # artifact name and retention, and no `workflow_run:` text anywhere.
+ self.assertIn("run_ids must contain exactly three IDs", publish)
+ self.assertIn("source runs do not share one source SHA", publish)
+ self.assertIn("cxrelease-v1-$run_id-$attempt/release.json", publish)
+ self.assertIn("run $run_id is not tagged for V1 publication", publish)
+ self.assertIn("collectivex.release-tag.v1", publish)
+ self.assertIn("ref: ${{ steps.runs.outputs.source_sha }}", publish)
+ self.assertIn("cxpublication-v1-${{ github.run_id }}-${{ github.run_attempt }}", publish)
+ self.assertIn("retention-days: 90", publish)
+ self.assertNotIn("workflow_run:", publish)
+
+ def test_source_archive_preserves_only_contained_leaf_symlinks(self) -> None:
+ # Exercises source_archive.extract_source_archive with one valid archive
+ # and a series of archives or destinations that must be rejected.
+ # `selected` is the root passed to the extractor; `other` is a second root
+ # present in the archive.
+ selected = "deepep-hybrid-pinned"
+ other = "deepep-v2-pinned"
+
+ # Helper: directory tar member with mode 0o755.
+ def directory(name: str) -> tarfile.TarInfo:
+ member = tarfile.TarInfo(name)
+ member.type = tarfile.DIRTYPE
+ member.mode = 0o755
+ return member
+
+ # Helper: regular-file tar member plus a stream holding its payload.
+ def regular(
+ name: str, payload: bytes, mode: int = 0o644
+ ) -> tuple[tarfile.TarInfo, io.BytesIO]:
+ member = tarfile.TarInfo(name)
+ member.size = len(payload)
+ member.mode = mode
+ return member, io.BytesIO(payload)
+
+ # Helper: symlink tar member pointing at `target`.
+ def symbolic(name: str, target: str) -> tarfile.TarInfo:
+ member = tarfile.TarInfo(name)
+ member.type = tarfile.SYMTYPE
+ member.linkname = target
+ member.mode = 0o777
+ return member
+
+ # Helper: write the baseline archive (directories, a LICENSE file, a
+ # sentinel under the other root, a file with mode 0o010, and one relative
+ # symlink), append any `extras`, and chmod the archive to 0o600.
+ # Extras are added without a payload stream.
+ def write_archive(path: Path, extras: list[tarfile.TarInfo] | None = None) -> None:
+ root = f".cx_sources/{selected}"
+ with tarfile.open(path, "w") as archive:
+ for name in (
+ ".cx_sources", root, f"{root}/third-party",
+ f"{root}/third-party/nccl", f"{root}/third-party/nccl/pkg",
+ f"{root}/third-party/nccl/pkg/debian",
+ f".cx_sources/{other}",
+ ):
+ archive.addfile(directory(name))
+ member, stream = regular(
+ f"{root}/third-party/nccl/LICENSE.txt", b"license\n"
+ )
+ archive.addfile(member, stream)
+ member, stream = regular(f".cx_sources/{other}/sentinel", b"other\n")
+ archive.addfile(member, stream)
+ member, stream = regular(f"{root}/group-executable", b"exec\n", 0o010)
+ archive.addfile(member, stream)
+ archive.addfile(symbolic(
+ f"{root}/third-party/nccl/pkg/debian/copyright",
+ "../../LICENSE.txt",
+ ))
+ for member in extras or []:
+ archive.addfile(member)
+ path.chmod(0o600)
+
+ # Valid archive: the symlink is kept with its original relative target
+ # and resolves to the LICENSE content; the other root is not extracted.
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ archive = root / "source.tar"
+ destination = root / "destination"
+ destination.mkdir(mode=0o700)
+ write_archive(archive)
+ source_archive.extract_source_archive(archive, destination, selected)
+ link = (
+ destination / ".cx_sources" / selected / "third-party" / "nccl"
+ / "pkg" / "debian" / "copyright"
+ )
+ self.assertTrue(link.is_symlink())
+ self.assertEqual(os.readlink(link), "../../LICENSE.txt")
+ self.assertEqual(link.read_text(), "license\n")
+ self.assertFalse((destination / ".cx_sources" / other).exists())
+ extracted = destination / ".cx_sources" / selected
+ # Expected extracted modes: 0o700 for the file archived as 0o010,
+ # 0o600 for the file archived as 0o644.
+ self.assertEqual(
+ stat.S_IMODE((extracted / "group-executable").stat().st_mode), 0o700
+ )
+ self.assertEqual(
+ stat.S_IMODE(
+ (extracted / "third-party" / "nccl" / "LICENSE.txt").stat().st_mode
+ ),
+ 0o600,
+ )
+
+ # Rejected member sets: each entry is appended to the baseline archive
+ # and must make extraction raise SourceArchiveError.
+ invalid: dict[str, list[tarfile.TarInfo]] = {
+ "absolute member": [directory("/outside")],
+ "traversal member": [directory(".cx_sources/../outside")],
+ "duplicate member": [directory(f".cx_sources/{selected}")],
+ "absolute link": [symbolic(f".cx_sources/{selected}/absolute", "/tmp/x")],
+ "escaping link": [symbolic(f".cx_sources/{selected}/escape", "../x")],
+ "cross-root link": [
+ symbolic(f".cx_sources/{selected}/cross", f"../{other}/sentinel")
+ ],
+ "missing target": [symbolic(f".cx_sources/{selected}/missing", "none")],
+ }
+ # Member types other than directory, regular file and symlink.
+ hardlink = tarfile.TarInfo(f".cx_sources/{selected}/hard")
+ hardlink.type = tarfile.LNKTYPE
+ hardlink.linkname = f".cx_sources/{selected}/third-party/nccl/LICENSE.txt"
+ invalid["hardlink"] = [hardlink]
+ fifo = tarfile.TarInfo(f".cx_sources/{selected}/fifo")
+ fifo.type = tarfile.FIFOTYPE
+ invalid["fifo"] = [fifo]
+ character = tarfile.TarInfo(f".cx_sources/{selected}/character")
+ character.type = tarfile.CHRTYPE
+ invalid["character device"] = [character]
+ block = tarfile.TarInfo(f".cx_sources/{selected}/block")
+ block.type = tarfile.BLKTYPE
+ invalid["block device"] = [block]
+ unknown = tarfile.TarInfo(f".cx_sources/{selected}/unknown")
+ unknown.type = b"Z"
+ invalid["unknown type"] = [unknown]
+ # A symlink under the unselected root must also cause rejection.
+ invalid["unsafe unselected root"] = [
+ symbolic(f".cx_sources/{other}/escape", f"../{selected}/group-executable")
+ ]
+ # A symlink whose target is itself a symlink.
+ chain_target = symbolic(
+ f".cx_sources/{selected}/chain-target", "third-party/nccl/LICENSE.txt"
+ )
+ invalid["symlink chain"] = [
+ chain_target, symbolic(f".cx_sources/{selected}/chain", "chain-target")
+ ]
+ # A member whose parent path component is a symlink.
+ linked_child = tarfile.TarInfo(f".cx_sources/{selected}/linked-file/child")
+ invalid["symlink parent"] = [
+ symbolic(
+ f".cx_sources/{selected}/linked-file",
+ "third-party/nccl/LICENSE.txt",
+ ),
+ linked_child,
+ ]
+ # Every rejected archive must also leave no .cx_sources directory behind.
+ for label, extras in invalid.items():
+ with self.subTest(label=label), tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ archive = root / "source.tar"
+ destination = root / "destination"
+ destination.mkdir(mode=0o700)
+ write_archive(archive, extras)
+ with self.assertRaises(source_archive.SourceArchiveError):
+ source_archive.extract_source_archive(archive, destination, selected)
+ self.assertFalse((destination / ".cx_sources").exists())
+
+ # Pre-existing .cx_sources in the destination: extraction must raise and
+ # the existing marker file must keep its content.
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ archive = root / "source.tar"
+ destination = root / "destination"
+ destination.mkdir(mode=0o700)
+ existing = destination / ".cx_sources"
+ existing.mkdir(mode=0o700)
+ marker = existing / "marker"
+ marker.write_text("existing\n")
+ write_archive(archive)
+ with self.assertRaises(source_archive.SourceArchiveError):
+ source_archive.extract_source_archive(archive, destination, selected)
+ self.assertEqual(marker.read_text(), "existing\n")
+
+ # Destination given as a symlink to a real directory (either OSError or
+ # SourceArchiveError is accepted), then a destination with mode 0o755.
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ archive = root / "source.tar"
+ write_archive(archive)
+ real_destination = root / "real-destination"
+ real_destination.mkdir(mode=0o700)
+ linked_destination = root / "linked-destination"
+ linked_destination.symlink_to(real_destination, target_is_directory=True)
+ with self.assertRaises((OSError, source_archive.SourceArchiveError)):
+ source_archive.extract_source_archive(archive, linked_destination, selected)
+ self.assertFalse((real_destination / ".cx_sources").exists())
+
+ unsafe_destination = root / "unsafe-destination"
+ unsafe_destination.mkdir(mode=0o700)
+ unsafe_destination.chmod(0o755)
+ with self.assertRaises(source_archive.SourceArchiveError):
+ source_archive.extract_source_archive(archive, unsafe_destination, selected)
+ self.assertFalse((unsafe_destination / ".cx_sources").exists())
+
+ # Size and count limits: patching each module-level limit down to 1 must
+ # make the baseline archive fail.
+ for limit, value in (
+ ("MAX_ARCHIVE_MEMBERS", 1),
+ ("MAX_MEMBER_BYTES", 1),
+ ("MAX_EXPANDED_BYTES", 1),
+ ("MAX_ARCHIVE_BYTES", 1),
+ ("MAX_ARCHIVE_HEADERS", 1),
+ ):
+ with self.subTest(limit=limit), tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ archive = root / "source.tar"
+ destination = root / "destination"
+ destination.mkdir(mode=0o700)
+ write_archive(archive)
+ with mock.patch.object(source_archive, limit, value):
+ with self.assertRaises(source_archive.SourceArchiveError):
+ source_archive.extract_source_archive(archive, destination, selected)
+ self.assertFalse((destination / ".cx_sources").exists())
+
+ # GNU long-name extension members are appended to the archive together
+ # with a regular "placeholder" member; each tightened extension limit
+ # must then reject the archive.
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ archive = root / "source.tar"
+ destination = root / "destination"
+ destination.mkdir(mode=0o700)
+ write_archive(archive)
+ long_name = f".cx_sources/{selected}/long-name-result\0".encode()
+ with tarfile.open(archive, "a") as handle:
+ for _ in range(3):
+ extension = tarfile.TarInfo("././@LongLink")
+ extension.type = tarfile.GNUTYPE_LONGNAME
+ extension.size = len(long_name)
+ handle.addfile(extension, io.BytesIO(long_name))
+ member, stream = regular("placeholder", b"payload\n")
+ handle.addfile(member, stream)
+ archive.chmod(0o600)
+ for limit, value in (
+ ("MAX_EXTENSION_CHAIN", 1),
+ ("MAX_EXTENSION_MEMBER_BYTES", 1),
+ ("MAX_EXTENSION_BYTES", len(long_name) * 2),
+ ):
+ with self.subTest(limit=limit), mock.patch.object(
+ source_archive, limit, value
+ ):
+ with self.assertRaises(source_archive.SourceArchiveError):
+ source_archive.extract_source_archive(
+ archive, destination, selected
+ )
+ self.assertFalse((destination / ".cx_sources").exists())
+
+ # A PAX member carrying GNU.sparse.* headers must be rejected.
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ archive = root / "source.tar"
+ destination = root / "destination"
+ destination.mkdir(mode=0o700)
+ write_archive(archive)
+ with tarfile.open(archive, "a", format=tarfile.PAX_FORMAT) as handle:
+ member, stream = regular(
+ f".cx_sources/{selected}/sparse-v1", b"1\n0\n1\n"
+ )
+ member.pax_headers = {
+ "GNU.sparse.major": "1",
+ "GNU.sparse.minor": "0",
+ "GNU.sparse.realsize": "1",
+ }
+ handle.addfile(member, stream)
+ archive.chmod(0o600)
+ with self.assertRaises(source_archive.SourceArchiveError):
+ source_archive.extract_source_archive(archive, destination, selected)
+ self.assertFalse((destination / ".cx_sources").exists())
+
+ # TarFile.next is patched so every regular-file member reports a
+ # non-empty `sparse` attribute; extraction must be rejected.
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ archive = root / "source.tar"
+ destination = root / "destination"
+ destination.mkdir(mode=0o700)
+ write_archive(archive)
+ original_next = tarfile.TarFile.next
+
+ def sparse_next(handle: tarfile.TarFile) -> tarfile.TarInfo | None:
+ member = original_next(handle)
+ if member is not None and member.isfile():
+ member.sparse = [(0, 1)]
+ return member
+
+ with mock.patch.object(tarfile.TarFile, "next", sparse_next):
+ with self.assertRaises(source_archive.SourceArchiveError):
+ source_archive.extract_source_archive(archive, destination, selected)
+ self.assertFalse((destination / ".cx_sources").exists())
+
+ def test_runtime_identity_and_realized_placement_are_behavioral(self) -> None:
+ # The same observed device facts (gfx942, "AMD Instinct MI325X", 8 devices)
+ # must yield no issues for SKU "mi325x" and at least one for "mi355x".
+ self.assertFalse(capability.runtime_identity_issues(
+ "mi325x", vendor="amd", arch="gfx942", machine="amd64",
+ device_name="AMD Instinct MI325X", device_count=8, world_size=8,
+ ))
+ self.assertTrue(capability.runtime_identity_issues(
+ "mi355x", vendor="amd", arch="gfx942", machine="amd64",
+ device_name="AMD Instinct MI325X", device_count=8, world_size=8,
+ ))
+ # Eight (label, rank) records: values 0..3 under each of two labels.
+ records = [("private-a", rank) for rank in range(4)] + [
+ ("private-b", rank) for rank in range(4)
+ ]
+ self.assertEqual(
+ run_ep._summarize_realized_placement(
+ records, expected_nodes=2, expected_gpus_per_node=4, expected_world_size=8
+ ),
+ {
+ "gpus_per_node": 4,
+ "nodes": 2,
+ "ranks_per_node": 4,
+ "unique_local_ranks": True,
+ "valid": True,
+ },
+ )
+ # Replacing the last record with ("private-b", 2) repeats an existing
+ # pair and must raise ValueError.
+ with self.assertRaises(ValueError):
+ run_ep._summarize_realized_placement(
+ records[:-1] + [("private-b", 2)],
+ expected_nodes=2,
+ expected_gpus_per_node=4,
+ expected_world_size=8,
+ )
+
+ def test_collective_version_and_rccl_fingerprint_are_normalized(self) -> None:
+ # Integer and tuple version inputs must both format as dotted strings.
+ self.assertEqual(ep_harness.format_collective_version(23004), "2.30.4")
+ self.assertEqual(ep_harness.format_collective_version(21805), "2.18.5")
+ self.assertEqual(ep_harness.format_collective_version((2, 21, 5)), "2.21.5")
+
+ # Stand-in namespace passed to _runtime_fingerprint as its first argument
+ # (presumably in place of the torch module -- TODO confirm). It reports a
+ # HIP build with no CUDA version.
+ properties = types.SimpleNamespace(
+ multi_processor_count=304, total_memory=1024, warp_size=64
+ )
+ fake = types.SimpleNamespace(
+ __version__="2.9.0",
+ version=types.SimpleNamespace(cuda=None, hip="7.2"),
+ cuda=types.SimpleNamespace(
+ get_device_properties=lambda _device: properties,
+ get_device_name=lambda _device: "AMD Instinct MI325X",
+ nccl=types.SimpleNamespace(version=lambda: 21805),
+ ),
+ )
+ # With the loaded-library probe patched to "2.18.5", the fingerprint must
+ # report kind "rccl" and runtime "hip".
+ with mock.patch.object(
+ run_ep, "_loaded_collective_version", return_value="2.18.5"
+ ):
+ fingerprint = run_ep._runtime_fingerprint(
+ fake, "device", machine="amd64", vendor="amd", arch="gfx942"
+ )
+ self.assertEqual(fingerprint["collective_library"], {"kind": "rccl", "version": "2.18.5"})
+ self.assertEqual(fingerprint["accelerator_runtime"], {"kind": "hip", "version": "7.2"})
+
+ # Fake library returned by the patched ctypes.CDLL: ncclGetVersion writes
+ # 23004 through the supplied pointer and returns 0.
+ class FakeCollective:
+ @staticmethod
+ def ncclGetVersion(pointer) -> int:
+ pointer._obj.value = 23004
+ return 0
+
+ # `open` is patched to return this one-line text that names a libnccl
+ # path; the probe must then report "2.30.4".
+ maps = "0-1 r-xp 0 00:00 0 /runtime/libnccl.so.2\n"
+ with (
+ mock.patch("builtins.open", return_value=io.StringIO(maps)),
+ mock.patch.object(run_ep.os.path, "isfile", return_value=True),
+ mock.patch.object(
+ run_ep.os.path, "realpath", return_value="/runtime/libnccl.so.2"
+ ),
+ mock.patch.object(run_ep.ctypes, "CDLL", return_value=FakeCollective()),
+ ):
+ self.assertEqual(run_ep._loaded_collective_version(), "2.30.4")
+
+ # Extract only the _runtime_collective function from ep_nccl.py via the
+ # AST and exec it in a namespace holding just `re`, so the rest of that
+ # module is never executed.
+ path = HERE / "ep_nccl.py"
+ tree = ast.parse(path.read_text(), str(path))
+ helper = next(
+ node for node in tree.body
+ if isinstance(node, ast.FunctionDef) and node.name == "_runtime_collective"
+ )
+ namespace = {"re": re}
+ exec(compile(ast.Module(body=[helper], type_ignores=[]), str(path), "exec"), namespace)
+ args = types.SimpleNamespace(
+ runtime_fingerprint={
+ "collective_library": {"kind": "nccl", "version": "2.30.4"}
+ }
+ )
+ cuda = types.SimpleNamespace(version=types.SimpleNamespace(hip=None))
+ self.assertEqual(namespace["_runtime_collective"](args, cuda), ("nccl", "2.30.4"))
+ # A fingerprint with version None must raise RuntimeError.
+ args.runtime_fingerprint["collective_library"]["version"] = None
+ with self.assertRaisesRegex(RuntimeError, "runtime identity is unavailable"):
+ namespace["_runtime_collective"](args, cuda)
+ # The ep_nccl.py source text must not contain this string.
+ self.assertNotIn("torch.cuda.nccl.version", path.read_text())
+
+ def test_workloads_bind_generator_activation_and_trace(self) -> None:
+ # The workload ID must be a typed "workload" ID, stable across repeated
+ # calls, and must change when the last positional argument (67 -> 68) or
+ # the trace_checksum changes.
+ args = ("uniform", 7168, 8, 256, 8, 64, 67)
+ first = workload.compute_workload_id(*args)
+ self.assertTrue(identity.is_typed_id(first, "workload"))
+ self.assertEqual(first, workload.compute_workload_id(*args))
+ self.assertNotEqual(first, workload.compute_workload_id(*args[:-1], 68))
+ self.assertNotEqual(
+ first,
+ workload.compute_workload_id(*args, trace_checksum="a" * 64),
+ )
+ # The manifest from build_workload must match the ID and checksums
+ # returned by canonical_member for these arguments.
+ _, _, manifest = workload.build_workload(8, 2, 4, "uniform", 4, 67, 2)
+ member, checksums, _, _ = workload.canonical_member(
+ "uniform", 8, 2, 4, 2, 2, 67
+ )
+ self.assertEqual(member, manifest["workload_id"])
+ self.assertEqual(checksums, manifest["checksums"])
+
+ def test_canonical_members_are_bound_to_each_scheduled_row(self) -> None:
+ # Builds one valid workload proof, then checks that each mutation of the
+ # proof or its rows is rejected by contracts._validate_canonical_workload.
+ case = {
+ "routing": "uniform", "hidden": 8, "topk": 2, "experts": 4, "ep": 2,
+ }
+ eplb_record = {
+ "enabled": False, "mapping_hash": None, "num_physical_experts": 4,
+ }
+
+ # Helper: the first three values of the expected canonical trace, with
+ # the token count and hidden size overridable.
+ def expected(
+ *, tokens: int = 1, hidden: int = 8
+ ) -> tuple[str, dict[str, str], str]:
+ member, checksums, row_hash, _, _ = contracts._expected_canonical_trace(
+ "uniform", hidden, 2, 4, 4, 2, tokens, 67, False, 2048
+ )
+ return member, checksums, row_hash
+
+ # Baseline: a single-member proof and one row that must validate cleanly.
+ member, checksums, row_hash = expected()
+ rows = [{"tokens_per_rank": 1, "routing": {"hash": row_hash}}]
+ proof = {
+ "manifest_checksums": {member: checksums},
+ "members": [member],
+ "workload_id": identity.workload_id({
+ "members": [{"checksums": checksums, "workload_id": member}]
+ }),
+ }
+ contracts._validate_canonical_workload(proof, case, rows, eplb_record)
+
+ # Helper: replace the proof's only member and recompute workload_id so
+ # the proof stays internally consistent.
+ def replace_member(document: dict, replacement: tuple[str, dict[str, str], str]) -> None:
+ replacement_id, replacement_checksums, _ = replacement
+ document["members"] = [replacement_id]
+ document["manifest_checksums"] = {replacement_id: replacement_checksums}
+ document["workload_id"] = identity.workload_id({
+ "members": [{
+ "checksums": replacement_checksums,
+ "workload_id": replacement_id,
+ }]
+ })
+
+ # Each mutation receives (proof copy, rows copy) and edits one of them.
+ mutations = {
+ "wrong member token": lambda document, mutated_rows: replace_member(
+ document, expected(tokens=2)
+ ),
+ "wrong member dimensions": lambda document, mutated_rows: replace_member(
+ document, expected(hidden=16)
+ ),
+ "wrong member checksum": lambda document, mutated_rows: replace_member(
+ document,
+ (
+ member,
+ {**checksums, "topk_idx": "0" * 64},
+ row_hash,
+ ),
+ ),
+ "row hash unrelated to member": lambda document, mutated_rows: mutated_rows[0][
+ "routing"
+ ].update({"hash": "f" * 64}),
+ }
+ # NOTE(review): if the deepcopy/mutate lines sit inside this assertRaises
+ # block (indentation is not recoverable in this view), a ContractError
+ # raised while building the mutation would also satisfy the assertion.
+ for label, mutate in mutations.items():
+ with self.subTest(label=label), self.assertRaises(contracts.ContractError):
+ bad_proof, bad_rows = copy.deepcopy(proof), copy.deepcopy(rows)
+ mutate(bad_proof, bad_rows)
+ contracts._validate_canonical_workload(
+ bad_proof, case, bad_rows, eplb_record
+ )
+
+ def test_eplb_row_hash_is_bound_to_the_frozen_remap(self) -> None:
+ # Builds an EPLB-enabled record whose mapping hash comes from the
+ # expected plan for this zipf case.
+ case = {"routing": "zipf", "hidden": 8, "topk": 2, "experts": 4, "ep": 2}
+ physical = eplb.physical_count(4, 32, 2)
+ plan = contracts._expected_eplb_plan("zipf", 2, 4, physical, 2, 67, 2048)
+ eplb_record = {
+ "enabled": True,
+ "mapping_hash": eplb.mapping_hash(plan),
+ "num_physical_experts": physical,
+ }
+ member, checksums, row_hash, _, _ = contracts._expected_canonical_trace(
+ "zipf", 8, 2, 4, physical, 2, 1, 67, True, 2048
+ )
+ # The row hash is expected to differ from the member's trace checksum
+ # in this EPLB-enabled case.
+ self.assertNotEqual(row_hash, checksums["trace"])
+ workload_proof = {
+ "manifest_checksums": {member: checksums},
+ "members": [member],
+ "workload_id": identity.workload_id({
+ "members": [{"checksums": checksums, "workload_id": member}]
+ }),
+ }
+ rows = [{"tokens_per_rank": 1, "routing": {"hash": row_hash}}]
+ # The consistent proof validates; swapping in an all-zero mapping hash
+ # must raise a ContractError mentioning "EPLB mapping".
+ contracts._validate_canonical_workload(workload_proof, case, rows, eplb_record)
+ with self.assertRaisesRegex(contracts.ContractError, "EPLB mapping"):
+ contracts._validate_canonical_workload(
+ workload_proof, case, rows, {**eplb_record, "mapping_hash": "0" * 64}
+ )
+
+ def test_oracle_pass_cannot_ignore_combined_value_failure(self) -> None:
+ # Baseline oracle record: every check True, zero errors, passed=True.
+ oracle = {
+ "atol": ep_harness.ORACLE_ATOL,
+ "checks": {
+ "combine_values": True,
+ "counts": True,
+ "metadata": True,
+ "multiplicity": True,
+ "payload": True,
+ "source_set": True,
+ "weights": True,
+ },
+ "combine_weight_semantics": "unweighted-rank-sum",
+ "contract": ep_harness.ORACLE_CONTRACT,
+ "dispatch_sha256": "a" * 64,
+ "max_absolute_error": 0.0,
+ "max_elementwise_relative_error": 0.0,
+ "max_relative_error": 0.0,
+ "max_weight_error": 0.0,
+ "order_sha256": "b" * 64,
+ "ordering_contract": "stable-v1",
+ "passed": True,
+ "receive_count": 1,
+ "rtol": ep_harness.ORACLE_RTOL,
+ }
+ # The baseline must validate without raising.
+ contracts._validate_oracle(oracle, "oracle")
+ # A different combine_weight_semantics value must be rejected with a
+ # message matching "differs from v1".
+ weighted = copy.deepcopy(oracle)
+ weighted["combine_weight_semantics"] = "native-gate-weighted"
+ with self.assertRaisesRegex(contracts.ContractError, "differs from v1"):
+ contracts._validate_oracle(weighted, "oracle")
+ # passed=True combined with a False combine_values check must be rejected.
+ tampered = copy.deepcopy(oracle)
+ tampered["checks"]["combine_values"] = False
+ with self.assertRaises(contracts.ContractError):
+ contracts._validate_oracle(tampered, "oracle")
+
+ def test_oracle_stability_canonicalizes_native_receive_order(self) -> None:
+ # Static check on ep_harness.py: inspect the text between the
+ # "canonical_order = torch.argsort" line and the following
+ # "problem.recv_tokens = receive_count" line.
+ source = (HERE / "ep_harness.py").read_text()
+ begin = source.index("canonical_order = torch.argsort")
+ canonical = source[begin:source.index("problem.recv_tokens = receive_count", begin)]
+ self.assertIn("canonical_sources", canonical)
+ self.assertIn("canonical_ids", canonical)
+ self.assertIn("canonical_weights", canonical)
+ self.assertNotIn("_tensor_sha256(source_ids", canonical)
+ # Static check on ep_mori.py: both kernel-generation expressions must be
+ # present in the source text.
+ mori = (HERE / "ep_mori.py").read_text()
+ self.assertIn('"inter-node-v1" if self._inter_node', mori)
+ self.assertIn('else "async-ll" if self._async_ll', mori)
+ # Behavioural check: for a backend named "mori", kernel_generation must
+ # return the backend's own kernel_generation attribute.
+ backend = types.SimpleNamespace(name="mori", kernel_generation="async-ll")
+ self.assertEqual(ep_harness.kernel_generation(backend), "async-ll")
+ backend.kernel_generation = "inter-node-v1"
+ self.assertEqual(ep_harness.kernel_generation(backend), "inter-node-v1")
+
+ def test_terminal_fail_safe_fills_only_missing_shard_cases(self) -> None:
+ # Runs cx_emit_setup_failures from runtime/common.sh against a two-case
+ # shard where one case already has a terminal document and the other has
+ # only a partial raw result with a broken samples file.
+ matrix = sweep_matrix.resolve_matrix(backends="all", max_cases=128)
+ # Pick the first shard whose "n" is at least 2.
+ shard = next(item for item in matrix["include"] if item["n"] >= 2)
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary)
+ matrix_path = root / "matrix.json"
+ control_path = root / "control.json"
+ out_dir = root / "results"
+ matrix_path.write_text(json.dumps(matrix))
+ control = sweep_matrix.extract_shard(
+ matrix_path, shard["id"], control_path,
+ sku=shard["sku"], backend=shard["backend"], nodes=shard["nodes"],
+ )
+ # Trim the control document to its first two cases and rewrite it.
+ control["cases"] = control["cases"][:2]
+ control["n"] = 2
+ control_path.write_text(json.dumps(control))
+ first = {key: value for key, value in control["cases"][0].items() if key != "case_id"}
+ git_run = {
+ "artifact": "artifact", "job": "job", "ref": "collectivex",
+ "repo": "SemiAnalysisAI/InferenceX", "run_attempt": "1",
+ "run_id": "123", "source_sha": "a" * 40,
+ }
+ allocation = {
+ "artifact": "artifact", "execution_id": "execution", "job": "job",
+ "repo": "SemiAnalysisAI/InferenceX", "run_attempt": "1", "run_id": "123",
+ "runner": shard["sku"], "source_sha": "a" * 40,
+ }
+ out_dir.mkdir()
+ # Case 0: a complete failed terminal document already on disk.
+ existing = contracts.make_terminal_document(
+ allocation_factors=allocation, attempt_ordinal=1, case=first,
+ case_factors={"case": first, "profile": identity.V1_CASE_PROFILE, "sku": shard["sku"]},
+ control_sha256=hashlib.sha256(control_path.read_bytes()).hexdigest(),
+ failure_mode="setup", generated_at="2026-07-04T00:00:00Z", git_run=git_run,
+ reason="launcher-setup-failed", return_code=7, source="runtime-emitter",
+ status="failed",
+ expected_case_id=control["cases"][0]["case_id"],
+ )
+ (out_dir / "existing.json").write_text(json.dumps(existing))
+ # Case 1: a partial raw document whose samples file is not valid JSON.
+ (out_dir / "partial.json").write_text(json.dumps({
+ "format": contracts.RAW_FORMAT,
+ "identity": {"case_id": control["cases"][1]["case_id"]},
+ "sample_artifact": {"path": "partial.samples.json"},
+ }))
+ (out_dir / "partial.samples.json").write_text("{broken")
+ # The environment values mirror the git_run and allocation
+ # dictionaries built above.
+ environment = {
+ **os.environ,
+ "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null",
+ "CX_SHARD_FILE": str(control_path),
+ "CX_SHARD_SKU": shard["sku"],
+ "CX_RUNNER": shard["sku"],
+ "CX_BENCH": shard["backend"],
+ "CX_NODES": str(shard["nodes"]),
+ "COLLECTIVEX_EXECUTION_ID": "execution",
+ "COLLECTIVEX_ARTIFACT_NAME": "artifact",
+ "GITHUB_JOB": "job", "GITHUB_REF_NAME": "collectivex",
+ "GITHUB_REPOSITORY": "SemiAnalysisAI/InferenceX",
+ "GITHUB_RUN_ATTEMPT": "1", "GITHUB_RUN_ID": "123",
+ "GITHUB_SHA": "a" * 40,
+ }
+ subprocess.run(
+ [
+ "bash", "-c",
+ 'source "$1"; cx_emit_setup_failures "$2" "$3" "$4" 7',
+ "_", str(ROOT / "runtime" / "common.sh"), str(ROOT),
+ str(out_dir), shard["backend"],
+ ],
+ check=True,
+ env=environment,
+ )
+ # Exactly two *.json documents must remain; together they must
+ # validate as attempts and as a delivery for the control file, and a
+ # delivery of only one of them must be rejected.
+ attempts = [contracts.strict_load(path) for path in out_dir.glob("*.json")]
+ self.assertEqual(len(attempts), 2)
+ self.assertEqual(
+ contracts.validate_attempt_paths([str(path) for path in out_dir.glob("*.json")]),
+ 2,
+ )
+ delivery = [str(path) for path in out_dir.glob("*.json")]
+ self.assertEqual(contracts.validate_delivery(delivery, str(control_path)), 2)
+ with self.assertRaises(contracts.ContractError):
+ contracts.validate_delivery(delivery[:1], str(control_path))
+ self.assertEqual(
+ {attempt["identity"]["case_id"] for attempt in attempts},
+ {case["case_id"] for case in control["cases"]},
+ )
+ # Both partial files must now exist with a ".quarantine" suffix.
+ self.assertTrue((out_dir / "partial.json.quarantine").is_file())
+ self.assertTrue((out_dir / "partial.samples.json.quarantine").is_file())
+
+ # Second scenario: with the fail-safe installed and the operator
+ # config marked required, cx_load_operator_config must exit non-zero
+ # and terminal documents for both control cases must appear under
+ # the given REPO_ROOT.
+ preallocation = root / "preallocation"
+ preallocation_results = preallocation / "experimental" / "CollectiveX" / "results"
+ preallocation_results.mkdir(parents=True)
+ failed = subprocess.run(
+ [
+ "bash", "-c",
+ 'source "$1"; REPO_ROOT="$2"; export REPO_ROOT; '
+ 'cx_install_launcher_fail_safe; cx_load_operator_config',
+ "_", str(ROOT / "runtime" / "common.sh"), str(preallocation),
+ ],
+ env={**environment, "COLLECTIVEX_OPERATOR_CONFIG_REQUIRED": "1"},
+ )
+ self.assertNotEqual(failed.returncode, 0)
+ preallocation_attempts = [
+ contracts.validate_terminal_document(contracts.strict_load(path))
+ for path in preallocation_results.glob("*.json")
+ ]
+ self.assertEqual(
+ {attempt["identity"]["case_id"] for attempt in preallocation_attempts},
+ {case["case_id"] for case in control["cases"]},
+ )
+
+ def test_runtime_identity_mismatch_is_failed_not_unsupported(self) -> None:
+ # Exercises contracts.make_terminal_from_environment for the first runnable
+ # matrix case and for a manual environment, checking the outcome recorded
+ # for each return code / failure mode and that tampered documents are
+ # rejected by contracts.validate_terminal_document.
+ wrapper = next(
+ item for item in sweep_matrix.resolve_matrix()["requested_cases"]
+ if item["disposition"] == "runnable"
+ )
+ case = wrapper["case"]
+ # CX_* variables mirror the selected case; the GITHUB_* and COLLECTIVEX_*
+ # entries are fixed placeholder run metadata.
+ environment = {
+ "CX_RUNNER": wrapper["sku"], "CX_CASE_ID": case["case_id"],
+ "CX_SUITE": case["suite"], "CX_WORKLOAD_NAME": case["workload"],
+ "CX_REQUIRED_PUBLICATION": case["required_publication"],
+ "CX_ROUTING": case["routing"], "CX_EPLB": "1" if case["eplb"] else "",
+ "CX_EP": str(case["ep"]), "CX_NGPUS": str(case["ep"]),
+ "CX_HIDDEN": str(case["hidden"]), "CX_TOPK": str(case["topk"]),
+ "CX_EXPERTS": str(case["experts"]), "CX_NODES": str(case["nodes"]),
+ "CX_GPUS_PER_NODE": str(case["gpus_per_node"]),
+ "CX_SCALE_UP_DOMAIN": str(case["scale_up_domain"]),
+ "CX_MODE": case["mode"], "CX_SCOPE": case["scope"],
+ "CX_TOPO": case["topology_class"], "CX_TRANSPORT": case["transport"],
+ "CX_SCALE_UP_TRANSPORT": case["scale_up_transport"],
+ "CX_SCALE_OUT_TRANSPORT": case["scale_out_transport"] or "",
+ "CX_TOKENS_LADDER": case["ladder"], "CX_CANONICAL": "1",
+ "CX_ITERS": "8", "CX_TRIALS": "64", "CX_WARMUP": "32",
+ "CX_SAMPLES_PER_POINT": "512", "GITHUB_RUN_ID": "123",
+ "GITHUB_RUN_ATTEMPT": "1", "GITHUB_REF_NAME": "collectivex",
+ "GITHUB_SHA": "a" * 40, "GITHUB_REPOSITORY": "SemiAnalysisAI/InferenceX",
+ "GITHUB_JOB": "sweep", "COLLECTIVEX_ARTIFACT_NAME": "artifact",
+ "COLLECTIVEX_EXECUTION_ID": "execution",
+ }
+ # return_code=5 without an explicit failure_mode is expected to be recorded
+ # as a failed runtime-identity mismatch.
+ with mock.patch.dict(os.environ, environment, clear=False):
+ terminal = contracts.make_terminal_from_environment(
+ backend=case["backend"], phase=case["phase"], return_code=5
+ )
+ self.assertEqual(terminal["identity"]["case_id"], case["case_id"])
+ self.assertEqual(
+ terminal["outcome"],
+ {
+ "failure_mode": "runtime-identity",
+ "reason": "runtime-identity-mismatch",
+ "return_code": 5,
+ "status": "failed",
+ },
+ )
+ # Every registered failure mode must produce its registered reason.
+ for mode, reason in contracts.RUNTIME_FAILURE_REASONS.items():
+ with self.subTest(mode=mode), mock.patch.dict(os.environ, environment, clear=False):
+ staged = contracts.make_terminal_from_environment(
+ backend=case["backend"], phase=case["phase"], return_code=1,
+ failure_mode=mode,
+ )
+ self.assertEqual(staged["outcome"]["reason"], reason)
+ # Swap in a reason that differs from the registered one; validation
+ # of the altered copy is expected to raise ContractError.
+ mismatched = copy.deepcopy(staged)
+ mismatched["outcome"]["reason"] = "distributed-command-failed"
+ if reason == "distributed-command-failed":
+ mismatched["outcome"]["reason"] = "backend-setup-failed"
+ with self.assertRaisesRegex(
+ contracts.ContractError, "source and outcome are not registered"
+ ):
+ contracts.validate_terminal_document(mismatched)
+ # An unregistered failure mode must be rejected.
+ with mock.patch.dict(os.environ, environment, clear=False):
+ with self.assertRaisesRegex(
+ contracts.ContractError, "runtime failure mode is not registered"
+ ) as raised:
+ contracts.make_terminal_from_environment(
+ backend=case["backend"], phase=case["phase"], return_code=1,
+ failure_mode="raw-private-error",
+ )
+ # The rejected mode string must not be echoed in the error text.
+ self.assertNotIn("raw-private-error", str(raised.exception))
+ # return_code=6 without a failure_mode is expected to be a generic
+ # execution failure.
+ with mock.patch.dict(os.environ, environment, clear=False):
+ generic = contracts.make_terminal_from_environment(
+ backend=case["backend"], phase=case["phase"], return_code=6,
+ )
+ self.assertEqual(
+ generic["outcome"],
+ {
+ "failure_mode": "execution",
+ "reason": "distributed-command-failed",
+ "return_code": 6,
+ "status": "failed",
+ },
+ )
+ manual_environment = {
+ "CX_RUNNER": "manual-runner",
+ "COLLECTIVEX_EXECUTION_ID": "manual-execution",
+ }
+ # clear=True: only the two manual variables are present, so no GitHub run
+ # metadata and no case description is available.
+ with mock.patch.dict(os.environ, manual_environment, clear=True):
+ manual = contracts.make_terminal_from_environment(
+ backend="nccl-ep", phase="decode", return_code=6,
+ )
+ self.assertIsNone(manual["provenance"]["git_run"])
+ self.assertEqual(
+ {
+ field: manual["case"][field]
+ for field in ("suite", "workload", "canonical", "required_publication")
+ },
+ {
+ "suite": "manual", "workload": "manual", "canonical": False,
+ "required_publication": "diagnostic",
+ },
+ )
+ self.assertEqual(
+ manual["identity"]["allocation_factors"],
+ {
+ "artifact": None, "execution_id": "manual-execution", "job": None,
+ "repo": None, "run_attempt": None, "run_id": None,
+ "runner": "manual-runner", "source_sha": None,
+ },
+ )
+ # Forge the artifact factor, then recompute allocation_id and attempt_id
+ # from the forged factors so the derived ids agree with them; validation
+ # is still expected to fail with "allocation factors differ".
+ broken = copy.deepcopy(manual)
+ broken["identity"]["allocation_factors"]["artifact"] = "forged-artifact"
+ allocation_id = identity.allocation_id(
+ broken["identity"]["allocation_factors"]
+ )
+ broken["identity"]["allocation_id"] = allocation_id
+ broken["identity"]["attempt_id"] = identity.attempt_id(
+ allocation=allocation_id,
+ case=broken["identity"]["case_id"],
+ ordinal=broken["identity"]["attempt_ordinal"],
+ )
+ with self.assertRaisesRegex(
+ contracts.ContractError, "allocation factors differ"
+ ):
+ contracts.validate_terminal_document(broken)
+
+ def test_launchers_use_private_logs_and_allowlisted_failure_stages(self) -> None:
+ # Static text checks only: reads the launcher scripts, runtime/common.sh and
+ # runtime/run_in_container.sh and asserts on substrings; nothing is executed.
+ # Stages each launcher is expected to announce via `cx_set_failure_stage`.
+ expected = {
+ "launch_single-slurm.sh": {
+ "setup", "registry-verification", "container-import", "container-hash",
+ "repository-stage", "scheduler-allocation", "container-launch",
+ "artifact-collection",
+ },
+ "launch_gb-nv.sh": {
+ "setup", "registry-verification", "container-import", "container-hash",
+ "repository-stage", "scheduler-allocation", "container-launch", "backend-setup",
+ "execution", "artifact-collection",
+ },
+ "launch_mi-amds.sh": {
+ "setup", "repository-stage", "registry-verification", "scheduler-allocation",
+ "container-import", "container-hash", "container-launch", "artifact-collection",
+ },
+ }
+ common = (ROOT / "runtime" / "common.sh").read_text()
+ for name, stages in expected.items():
+ launcher = (ROOT / "launchers" / name).read_text()
+ # For launch_gb-nv.sh the stage calls are searched in the launcher plus
+ # common.sh; for the other launchers in the launcher text alone.
+ stage_source = launcher + common if name == "launch_gb-nv.sh" else launcher
+ self.assertNotIn("--export=ALL", launcher)
+ if name == "launch_gb-nv.sh":
+ self.assertIn("cx_run_distributed_shard", launcher)
+ else:
+ self.assertIn("cx_container_exports", launcher)
+ self.assertIn("collect_rc=0", launcher)
+ for stage in stages:
+ with self.subTest(launcher=name, stage=stage):
+ self.assertIn(f"cx_set_failure_stage {stage}", stage_source)
+ amd = (ROOT / "launchers" / "launch_mi-amds.sh").read_text()
+ self.assertIn("cx_ensure_squash_on_job", amd)
+ self.assertIn("cx_fail_stage container-hash", amd)
+ self.assertNotIn('cat "$import_log"', amd)
+ self.assertIn('bash -s -- "$sq" "$lock" "$image"', common)
+ self.assertIn("> \"$log\" 2>&1 <<'BASH'", common)
+ self.assertIn("cx_fail_stage container-import", common)
+ runtime = (ROOT / "runtime" / "run_in_container.sh").read_text()
+ # Slice the text of cx_container_exports() (up to the first "\n}" after its
+ # header) and collect the comma-separated names found in its
+ # printf '%s' '...' payloads.
+ export_start = common.index("\ncx_container_exports() {")
+ exports = common[export_start:common.index("\n}", export_start)]
+ export_names = {
+ name
+ for payload in re.findall(r"printf '%s' '([^']*)'", exports)
+ for name in payload.split(",") if name
+ }
+ # None of these names may appear in the export list.
+ for private_name in (
+ "COLLECTIVEX_OPERATOR_CONFIG", "GITHUB_TOKEN", "GITHUB_WORKSPACE", "HOME",
+ "CX_PARTITION", "CX_ACCOUNT", "CX_SQUASH_DIR", "CX_STAGE_DIR",
+ ):
+ self.assertNotIn(private_name, export_names)
+ self.assertIn("CX_BACKEND_CACHE_ROOT", export_names)
+ self.assertIn("CX_BACKEND_CACHE_SENTINEL_SHA256", export_names)
+ self.assertNotIn("CX_PREPARED_BACKEND_CACHE", export_names)
+ self.assertIn("MORI_COMMIT", export_names)
+ self.assertIn("cx_write_runtime_stage backend-setup", runtime)
+ self.assertIn("cx_write_runtime_stage execution", runtime)
+ # Everything from the first occurrence of "cx_run_distributed_shard()" to the
+ # end of common.sh (not just that function's body).
+ distributed = common[common.index("cx_run_distributed_shard()") :]
+ self.assertIn("cx_private_log_path shard-summary", distributed)
+ self.assertIn("cx_fail_stage execution", distributed)
+ self.assertIn('cx_fail_stage execution "$runtime_log"', distributed)
+
+ def test_case_failure_diagnostic_precedes_normal_srun_footer(self) -> None:
+ # Runs `cx_fail_stage execution <log>` from runtime/common.sh against a log
+ # holding a failed-run warning (with a CX_RUN_TIMEOUT note), a SHARD summary
+ # and an srun error line; expects exit status 1 and
+ # "diagnostic=benchmark-case-failure" on stderr.
+ with tempfile.TemporaryDirectory() as temporary:
+ log = Path(temporary) / "runtime.log"
+ log.write_text(
+ "WARN: deepep decode run failed rc=1 (CX_RUN_TIMEOUT=900s)\n"
+ "SHARD done: 6/6 case(s) failed\n"
+ "srun: error: task exited 1\n"
+ )
+ # $1 is common.sh (sourced), $2 is the log path handed to cx_fail_stage.
+ result = subprocess.run(
+ [
+ "bash", "-c",
+ 'source "$1"; cx_fail_stage execution "$2"',
+ "_", str(ROOT / "runtime" / "common.sh"), str(log),
+ ],
+ text=True,
+ capture_output=True,
+ )
+ self.assertEqual(result.returncode, 1)
+ self.assertIn("diagnostic=benchmark-case-failure", result.stderr)
+
+ def test_non_timeout_failure_warning_is_classified_as_case_failure(self) -> None:
+ # A failed-run warning without any timeout note, followed by an srun line,
+ # must make `cx_fail_stage execution <log>` exit 1 and report
+ # benchmark-case-failure rather than network-or-timeout.
+ with tempfile.TemporaryDirectory() as temporary:
+ log = Path(temporary) / "runtime.log"
+ log.write_text("WARN: deepep decode run failed rc=1\nsrun: task exited 1\n")
+ # $1 is common.sh (sourced), $2 is the log path handed to cx_fail_stage.
+ result = subprocess.run(
+ [
+ "bash", "-c",
+ 'source "$1"; cx_fail_stage execution "$2"',
+ "_", str(ROOT / "runtime" / "common.sh"), str(log),
+ ],
+ text=True,
+ capture_output=True,
+ )
+ self.assertEqual(result.returncode, 1)
+ self.assertNotIn("diagnostic=network-or-timeout", result.stderr)
+ self.assertIn("diagnostic=benchmark-case-failure", result.stderr)
+
+ def test_private_runtime_failure_signatures_override_case_footer(self) -> None:
+ # For each log signature below, `cx_fail_stage execution <log>` run with
+ # CX_BENCH=deepep-v2 must report the mapped diagnostic even though the log
+ # also ends with the "case(s) failed" footer.
+ # Maps a log line to the diagnostic label expected on stderr.
+ signatures = {
+ "DeepEP V2 no-GIN run is outside one realized LSA domain":
+ "accelerator-topology",
+ "NCCL exception (/src/nccl.cu:111): 3": "accelerator-topology",
+ "NCCL exception (/src/nccl.cu:112): 3": "accelerator-topology",
+ "CUDA error: call requires newer driver": "accelerator-driver",
+ "NCCL failure in ncclCommWindowRegister": "nccl-device-api",
+ "Communicator does not support symmetric memory": "nccl-device-api",
+ "NCCL exception (/src/nccl.cu:106): 5": "nccl-device-api",
+ "NCCL exception (/src/nccl.cu:127): 5": "nccl-device-api",
+ "NCCL exception (/src/nccl.cu:128): 5": "nccl-device-api",
+ "NCCL exception (/src/nccl.cu:129): 5": "nccl-device-api",
+ "NCCL exception (/src/nccl.cu:135): 5": "nccl-device-api",
+ "NVCC compilation failed": "jit-toolchain",
+ "CUDA out of memory": "accelerator-memory",
+ "torch rendezvous timed out": "network-or-timeout",
+ }
+ with tempfile.TemporaryDirectory() as temporary:
+ log = Path(temporary) / "runtime.log"
+ for signature, diagnostic in signatures.items():
+ # The same log file is rewritten for every signature.
+ log.write_text(f"{signature}\nSHARD done: 6/6 case(s) failed\n")
+ result = subprocess.run(
+ [
+ "bash", "-c",
+ 'source "$1"; cx_fail_stage execution "$2"',
+ "_", str(ROOT / "runtime" / "common.sh"), str(log),
+ ],
+ text=True,
+ capture_output=True,
+ env={**os.environ, "CX_BENCH": "deepep-v2"},
+ )
+ self.assertEqual(result.returncode, 1)
+ self.assertIn(f"diagnostic={diagnostic}", result.stderr)
+
+ # With CX_BENCH=deepep the nccl.cu:106 signature is expected to fall back
+ # to benchmark-case-failure. Only stderr is asserted for this run; the
+ # return code is not checked.
+ log.write_text(
+ "NCCL exception (/src/nccl.cu:106): 5\n"
+ "SHARD done: 6/6 case(s) failed\n"
+ )
+ result = subprocess.run(
+ [
+ "bash", "-c", 'source "$1"; cx_fail_stage execution "$2"',
+ "_", str(ROOT / "runtime" / "common.sh"), str(log),
+ ],
+ text=True, capture_output=True,
+ env={**os.environ, "CX_BENCH": "deepep"},
+ )
+ self.assertIn("diagnostic=benchmark-case-failure", result.stderr)
+
+ def test_runtime_stage_marker_distinguishes_launch_from_execution(self) -> None:
+ # Shell scenario: after cx_prepare_runtime_marker on a temporary mount, a
+ # stage written with cx_write_runtime_stage from inside
+ # <mount>/experimental/CollectiveX must become CX_FAILSAFE_MODE once
+ # cx_adopt_runtime_stage runs -- first backend-setup, then execution.
+ with tempfile.TemporaryDirectory() as temporary:
+ mount = Path(temporary)
+ root = mount / "experimental" / "CollectiveX"
+ root.mkdir(parents=True)
+ # $1 is common.sh, $2 is the mount directory. `set -e` makes any failing
+ # `test` abort the script, which check=True turns into a test failure.
+ command = r'''
+ set -euo pipefail
+ source "$1"
+ export COLLECTIVEX_EXECUTION_ID=test_1_shard CX_TS=test
+ cx_set_failure_stage container-launch
+ cx_prepare_runtime_marker "$2"
+ (cd "$2/experimental/CollectiveX"; cx_write_runtime_stage backend-setup)
+ cx_adopt_runtime_stage "$2"
+ test "$CX_FAILSAFE_MODE" = backend-setup
+ (cd "$2/experimental/CollectiveX"; cx_write_runtime_stage execution)
+ cx_adopt_runtime_stage "$2"
+ test "$CX_FAILSAFE_MODE" = execution
+ '''
+ subprocess.run(
+ ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh"),
+ str(mount)],
+ check=True,
+ )
+
+ def test_canonical_gha_environment_is_locked_but_manual_overrides_survive(self) -> None:
+ common = ROOT / "runtime" / "common.sh"
+ command = r'''
+ set -euo pipefail
+ source "$1"
+ export COLLECTIVEX_CANONICAL_GHA=1 GITHUB_ACTIONS=true
+ export GITHUB_RUN_ID=123 GITHUB_RUN_ATTEMPT=1
+ export COLLECTIVEX_SOURCE_SHA=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+ export CX_SHARD_FILE=.shards/test.json CX_SHARD_SKU=mi325x
+ export CX_NODES=1 CX_GPUS_PER_NODE=8
+ export CX_IMAGE=untrusted CX_IMAGE_DIGEST=untrusted CX_NGPUS=99
+ export CX_NCCL_HOME=/untrusted CX_LOCK_DIR=/tmp CX_SQUASH_DIR=/shared/containers
+ export CX_STAGE_DIR=/private/stale-stage
+ export CX_MORI_KERNEL_TYPE=intranode MORI_ENABLE_SDMA=0
+ export NCCL_MNNVL_ENABLE=1 MC_FORCE_MNNVL=1 CX_DRYRUN=1
+ export CX_BACKEND_CACHE_ROOT=/untrusted CX_BACKEND_CACHE_SENTINEL_SHA256=bad
+ export CX_PREPARED_BACKEND_CACHE=/untrusted CX_BACKEND_SOURCE_ROOT=/untrusted
+ ! (cx_lock_canonical_gha_env mi325x)
+ export COLLECTIVEX_OPERATOR_CONFIG_LOADED=$$
+ export CX_STAGE_DIR="$GITHUB_WORKSPACE"
+ unset CX_LOCK_DIR
+ cx_lock_canonical_gha_env mi325x
+ test "$CX_IMAGE" = "$CX_IMAGE_AMD_MORI_MI325"
+ test "$CX_IMAGE_DIGEST" = "$CX_IMAGE_AMD_MORI_MI325_DIGEST"
+ test "$CX_NGPUS:$CX_SEED:$CX_RUN_TIMEOUT" = 8:67:1800
+ test "$CX_MORI_KERNEL_TYPE:$MORI_DISABLE_AUTO_XGMI:$MORI_ENABLE_SDMA" = asyncll:0:1
+ test "$MORI_COMMIT" = "$CX_MORI_COMMIT_MI325"
+ test "$MORI_APP_LOG_LEVEL:$MORI_SHMEM_LOG_LEVEL:$MORI_IO_LOG_LEVEL" = info:info:info
+ test "$CX_STAGE_DIR" = "$GITHUB_WORKSPACE"
+ test -z "${CX_NCCL_HOME+x}${CX_LOCK_DIR+x}${NCCL_MNNVL_ENABLE+x}${MC_FORCE_MNNVL+x}"
+ test -z "${CX_BACKEND_CACHE_ROOT+x}${CX_BACKEND_CACHE_SENTINEL_SHA256+x}"
+ test -z "${CX_PREPARED_BACKEND_CACHE+x}${CX_BACKEND_SOURCE_ROOT+x}"
+ test -z "${CX_DRYRUN+x}"
+
+ export CX_STAGE_DIR=/shared/gb-stage
+ export CX_SHARD_SKU=gb300 CX_NODES=2 CX_GPUS_PER_NODE=4
+ export CX_IMAGE=untrusted CX_NGPUS=1 CX_MORI_KERNEL_TYPE=untrusted
+ export MORI_ENABLE_SDMA=0 CX_NCCL_HOME=/untrusted CX_MASTER_PORT=1
+ cx_lock_canonical_gha_env gb300
+ test "$CX_IMAGE" = "$CX_IMAGE_MULTIARCH"
+ test "$CX_IMAGE_DIGEST" = "$CX_IMAGE_MULTIARCH_DIGEST"
+ test "$CX_NGPUS:$CX_SEED:$CX_RUN_TIMEOUT" = 8:67:900
+ test "$CX_NCCL_HOME:$CX_MASTER_PORT" = /usr:29551
+ test "$CX_STAGE_DIR" = /shared/gb-stage
+ test -z "${CX_MORI_KERNEL_TYPE+x}${MORI_ENABLE_SDMA+x}"
+
+ export COLLECTIVEX_OPERATOR_CONFIG_LOADED=$$
+ export CX_SHARD_SKU=mi355x CX_NODES=1 CX_GPUS_PER_NODE=8
+ export CX_LOCK_DIR=/validated/amd-locks CX_STAGE_DIR=/validated/amd-stage
+ cx_lock_canonical_gha_env mi355x
+ test "$CX_LOCK_DIR" = /validated/amd-locks
+ test "$CX_STAGE_DIR" = /validated/amd-stage
+ test "$MORI_COMMIT" = "$CX_MORI_COMMIT_MI355"
+
+ unset COLLECTIVEX_CANONICAL_GHA
+ unset COLLECTIVEX_OPERATOR_CONFIG_LOADED
+ CX_IMAGE=manual CX_IMAGE_DIGEST=manual CX_NGPUS=3
+ CX_MORI_KERNEL_TYPE=manual
+ cx_lock_canonical_gha_env mi355x
+ test "$CX_IMAGE:$CX_IMAGE_DIGEST:$CX_NGPUS:$CX_MORI_KERNEL_TYPE" = manual:manual:3:manual
+ '''
+ with tempfile.TemporaryDirectory(dir=Path.home()) as workspace:
+ Path(workspace).chmod(0o720)
+ subprocess.run(
+ ["bash", "-c", command, "_", str(common)],
+ check=True,
+ env={
+ **os.environ,
+ "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null",
+ "GITHUB_WORKSPACE": workspace,
+ },
+ )
+ self.assertEqual(list(Path(workspace).iterdir()), [])
+
+ def test_canonical_amd_stage_uses_config_not_world_writable_workspace(self) -> None:
+ # With GITHUB_WORKSPACE set to a directory of mode 0o702 (writable by
+ # "other"), cx_lock_canonical_gha_env mi325x must succeed, leave CX_STAGE_DIR
+ # at the configured /shared/amd-stage, and not print the workspace path on
+ # stderr.
+ common = ROOT / "runtime" / "common.sh"
+ # No `set -e` here: the result is judged from the exit status of the final
+ # printf plus the captured stdout/stderr.
+ command = r'''
+ source "$1"
+ export COLLECTIVEX_CANONICAL_GHA=1 GITHUB_ACTIONS=true
+ export GITHUB_RUN_ID=123 GITHUB_RUN_ATTEMPT=1
+ export COLLECTIVEX_SOURCE_SHA=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+ export CX_SHARD_FILE=.shards/test.json CX_SHARD_SKU=mi325x
+ export CX_NODES=1 CX_GPUS_PER_NODE=8 CX_SQUASH_DIR=/shared/containers
+ export COLLECTIVEX_OPERATOR_CONFIG_LOADED=$$ CX_STAGE_DIR=/shared/amd-stage
+ cx_lock_canonical_gha_env mi325x
+ printf '%s' "$CX_STAGE_DIR"
+ '''
+ with tempfile.TemporaryDirectory(dir=Path.home()) as workspace:
+ Path(workspace).chmod(0o702)
+ result = subprocess.run(
+ ["bash", "-c", command, "_", str(common)],
+ text=True,
+ capture_output=True,
+ env={
+ **os.environ,
+ "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null",
+ "GITHUB_WORKSPACE": workspace,
+ },
+ )
+ self.assertEqual(result.returncode, 0, result.stderr)
+ self.assertEqual(result.stdout, "/shared/amd-stage")
+ self.assertNotIn(workspace, result.stderr)
+
+ def test_canonical_amd_stage_uses_config_not_symlinked_workspace(self) -> None:
+ # With GITHUB_WORKSPACE pointing at a symlink to a real directory,
+ # cx_lock_canonical_gha_env mi325x must succeed, leave CX_STAGE_DIR at the
+ # configured /shared/amd-stage, and not print the temporary root on stderr.
+ common = ROOT / "runtime" / "common.sh"
+ # No `set -e` here: the result is judged from the exit status of the final
+ # printf plus the captured stdout/stderr.
+ command = r'''
+ source "$1"
+ export COLLECTIVEX_CANONICAL_GHA=1 GITHUB_ACTIONS=true
+ export GITHUB_RUN_ID=123 GITHUB_RUN_ATTEMPT=1
+ export COLLECTIVEX_SOURCE_SHA=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+ export CX_SHARD_FILE=.shards/test.json CX_SHARD_SKU=mi325x
+ export CX_NODES=1 CX_GPUS_PER_NODE=8 CX_SQUASH_DIR=/shared/containers
+ export COLLECTIVEX_OPERATOR_CONFIG_LOADED=$$ CX_STAGE_DIR=/shared/amd-stage
+ cx_lock_canonical_gha_env mi325x
+ printf '%s' "$CX_STAGE_DIR"
+ '''
+ with tempfile.TemporaryDirectory(dir=Path.home()) as temporary:
+ root = Path(temporary)
+ real = root / "real"
+ real.mkdir()
+ # `workspace` is a symlink to `real`; the symlink path is what the script
+ # receives as GITHUB_WORKSPACE.
+ link = root / "workspace"
+ link.symlink_to(real, target_is_directory=True)
+ result = subprocess.run(
+ ["bash", "-c", command, "_", str(common)],
+ text=True,
+ capture_output=True,
+ env={
+ **os.environ,
+ "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null",
+ "GITHUB_WORKSPACE": str(link),
+ },
+ )
+ self.assertEqual(result.returncode, 0, result.stderr)
+ self.assertEqual(result.stdout, "/shared/amd-stage")
+ self.assertNotIn(str(root), result.stderr)
+
+ def test_image_selection_and_registry_verification_are_fail_closed(self) -> None:
+ common = ROOT / "runtime" / "common.sh"
+ command = r'''
+ source "$1"
+ test "$(cx_default_image mi325x)" = "$CX_IMAGE_AMD_MORI_MI325"
+ test "$(cx_default_image mi355x)" = "$CX_IMAGE_AMD_MORI"
+ pinned="sha256:$(printf 'a%.0s' {1..64})"
+ curl() {
+ case "$*" in
+ *auth.docker.io*) printf '{"token":"test"}' ;;
+ *) printf 'Docker-Content-Digest: %s\r\n' "$pinned" ;;
+ esac
+ }
+ test "$(cx_resolve_registry_digest ubuntu:latest)" = "$pinned"
+ test "$(cx_resolve_registry_digest docker.io/library/ubuntu:latest)" = "$pinned"
+ ! (cx_resolve_registry_digest "ubuntu@$pinned")
+ ! (cx_resolve_registry_digest ghcr.io/example/image:tag)
+ ! (cx_resolve_registry_digest 'ubuntu@sha256:bad')
+ curl() {
+ case "$*" in *auth.docker.io*) printf '{"token":"test"}';; esac
+ }
+ ! (cx_resolve_registry_digest ubuntu:latest)
+ cx_resolve_registry_digest() { printf '%s' "$CX_IMAGE_MULTIARCH_DIGEST"; }
+ cx_verify_registry_image "$CX_IMAGE_MULTIARCH"
+ test "$COLLECTIVEX_IMAGE_DIGEST_VERIFIED" = 1
+ test "$COLLECTIVEX_IMAGE_DIGEST" = "$CX_IMAGE_MULTIARCH_DIGEST"
+ cx_reverify_registry_image "$CX_IMAGE_MULTIARCH"
+ cx_resolve_registry_digest() { printf 'sha256:%064d' 0; }
+ ! (cx_reverify_registry_image "$CX_IMAGE_MULTIARCH")
+ ! (cx_verify_registry_image "$CX_IMAGE_MULTIARCH")
+ '''
+ subprocess.run(
+ ["bash", "-c", command, "_", str(common)],
+ check=True,
+ env={**os.environ, "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null"},
+ )
+
+ def test_canonical_gha_requires_compute_visible_staging(self) -> None:
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ repo = root / "repo"
+ squash = root / "squash"
+ source = repo / "experimental" / "CollectiveX"
+ source.mkdir(parents=True)
+ squash.mkdir()
+ (source / "public.py").write_text("public\n")
+ (source / "private-infra.md").write_text("private\n")
+ command = r'''
+ set -euo pipefail
+ source "$1"
+ unset CX_SHARD_FILE CX_STAGE_DIR
+ ! (COLLECTIVEX_CANONICAL_GHA=1; cx_stage_path "$2" "")
+ staged="$(COLLECTIVEX_CANONICAL_GHA=0; cx_stage_path "$2" "")"
+ cx_stage_repo "$2" "$staged"
+ test "$staged" != "$2"
+ test -f "$staged/experimental/CollectiveX/public.py"
+ test ! -e "$staged/experimental/CollectiveX/private-infra.md"
+ cx_cleanup_stage "$staged" "$2"
+ test ! -e "$staged"
+ '''
+ subprocess.run(
+ ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh"),
+ str(repo)],
+ check=True,
+ env={
+ **os.environ,
+ "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null",
+ "CX_SQUASH_DIR": str(squash),
+ },
+ )
+ self.assertEqual(list(squash.iterdir()), [])
+
+ def test_manual_stage_does_not_write_to_checkout_parent(self) -> None:
+ # With CX_STAGE_DIR unset and the checkout's parent directory made
+ # read-only (0o555), the staged copy must be created under CX_SQUASH_DIR as
+ # ".collectivex-stage-*" and nothing may be added to the parent.
+ with tempfile.TemporaryDirectory() as temporary:
+ parent = Path(temporary).resolve() / "readonly-parent"
+ repo = parent / "repo"
+ squash = parent / "squash"
+ source = repo / "experimental" / "CollectiveX"
+ source.mkdir(parents=True)
+ squash.mkdir(mode=0o700)
+ (source / "public.py").write_text("public\n")
+ # Remember the mode so the finally block can restore it; the temporary
+ # directory could not be cleaned up otherwise.
+ original_mode = parent.stat().st_mode & 0o777
+ parent.chmod(0o555)
+ try:
+ # $1 common.sh, $2 repo, $3 squash directory, $4 read-only parent.
+ command = r'''
+ set -euo pipefail
+ source "$1"
+ unset CX_STAGE_DIR
+ staged="$(cx_stage_path "$2" "")"
+ cx_stage_repo "$2" "$staged"
+ case "$staged" in "$3"/.collectivex-stage-*) ;; *) exit 1 ;; esac
+ test -f "$staged/experimental/CollectiveX/public.py"
+ test ! -e "$4/.collectivex-stage"
+ cx_cleanup_stage "$staged" "$2"
+ test ! -e "$staged"
+ '''
+ subprocess.run(
+ [
+ "bash", "-c", command, "_",
+ str(ROOT / "runtime" / "common.sh"), str(repo),
+ str(squash), str(parent),
+ ],
+ check=True,
+ env={
+ **os.environ,
+ "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null",
+ "CX_SQUASH_DIR": str(squash),
+ },
+ )
+ finally:
+ parent.chmod(original_mode)
+ # Only the two directories created above may exist in the parent, and the
+ # squash directory must have been left empty by the cleanup.
+ self.assertEqual(
+ sorted(path.name for path in parent.iterdir()),
+ ["repo", "squash"],
+ )
+ self.assertEqual(list(squash.iterdir()), [])
+
+ def test_stage_refuses_to_reuse_an_execution_child(self) -> None:
+ # Pre-creates <stage>/job_collision containing a sentinel file; cx_stage_repo
+ # targeting that directory must fail and leave the sentinel untouched.
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ repo = root / "repo"
+ source = repo / "experimental" / "CollectiveX"
+ source.mkdir(parents=True)
+ (source / "public.py").write_text("public\n")
+ base = root / "stage"
+ child = base / "job_collision"
+ child.mkdir(parents=True, mode=0o700)
+ sentinel = child / "keep"
+ sentinel.write_text("keep")
+ # The negated subshell is the script's last command, so its status is the
+ # script's exit status checked by check=True.
+ command = r'''
+ source "$1"
+ ! (cx_stage_repo "$2" "$3/job_collision")
+ '''
+ subprocess.run(
+ [
+ "bash", "-c", command, "_",
+ str(ROOT / "runtime" / "common.sh"), str(repo), str(base),
+ ],
+ check=True,
+ env={
+ **os.environ,
+ "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null",
+ "COLLECTIVEX_CANONICAL_GHA": "1",
+ "COLLECTIVEX_EXECUTION_ID": "collision",
+ "CX_STAGE_DIR": str(base),
+ },
+ )
+ self.assertEqual(sentinel.read_text(), "keep")
+
+ def test_stage_removes_its_execution_child_when_rsync_fails(self) -> None:
+ # Replaces rsync with a stub that records the call and returns 1;
+ # cx_stage_repo must then fail and leave the stage base directory empty.
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ repo = root / "repo"
+ source = repo / "experimental" / "CollectiveX"
+ source.mkdir(parents=True)
+ (source / "public.py").write_text("public\n")
+ base = root / "stage"
+ base.mkdir(mode=0o700)
+ # Created by the rsync stub (path passed through RSYNC_CALLED).
+ sentinel = root / "rsync-called"
+ # The negated cx_stage_repo call is the script's last command, so its
+ # status is what check=True observes.
+ command = r'''
+ source "$1"
+ rsync() { : > "$RSYNC_CALLED"; return 1; }
+ staged="$(cx_stage_path "$2" "$3")"
+ ! cx_stage_repo "$2" "$staged"
+ '''
+ subprocess.run(
+ [
+ "bash", "-c", command, "_",
+ str(ROOT / "runtime" / "common.sh"), str(repo), str(base),
+ ],
+ check=True,
+ env={
+ **os.environ,
+ "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null",
+ "COLLECTIVEX_CANONICAL_GHA": "1",
+ "CX_STAGE_DIR": str(base),
+ "RSYNC_CALLED": str(sentinel),
+ },
+ )
+ # The stub must actually have been reached, and nothing may remain staged.
+ self.assertTrue(sentinel.is_file())
+ self.assertEqual(list(base.iterdir()), [])
+
+ def test_interrupted_stage_is_cleanup_capable_before_copy(self) -> None:
+ # After cx_install_launcher_fail_safe, the rsync stub sends SIGTERM to the
+ # script's own shell ($$) during cx_stage_repo. The script must exit
+ # non-zero, <stage>/job_interrupted must not exist afterwards, and an
+ # unrelated file in the stage base must be left intact.
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ repo = root / "repo"
+ source = repo / "experimental" / "CollectiveX"
+ source.mkdir(parents=True)
+ (source / "public.py").write_text("public\n")
+ base = root / "stage"
+ base.mkdir(mode=0o700)
+ sibling = base / "keep"
+ sibling.write_text("keep\n")
+ # $1 common.sh, $2 repo, $3 stage base.
+ command = r'''
+ set -euo pipefail
+ source "$1"
+ export REPO_ROOT="$2" CX_BENCH=nccl-ep
+ MOUNT_SRC="$(cx_stage_path "$REPO_ROOT" "$3")"
+ cx_install_launcher_fail_safe
+ rsync() { kill -TERM $$; return 143; }
+ cx_stage_repo "$REPO_ROOT" "$MOUNT_SRC"
+ '''
+ result = subprocess.run(
+ [
+ "bash", "-c", command, "_",
+ str(ROOT / "runtime" / "common.sh"), str(repo), str(base),
+ ],
+ text=True,
+ capture_output=True,
+ env={
+ **os.environ,
+ "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null",
+ "COLLECTIVEX_CANONICAL_GHA": "1",
+ "COLLECTIVEX_EXECUTION_ID": "interrupted",
+ "CX_STAGE_DIR": str(base),
+ },
+ )
+ self.assertNotEqual(result.returncode, 0)
+ self.assertFalse((base / "job_interrupted").exists())
+ self.assertEqual(sibling.read_text(), "keep\n")
+
+ def test_stage_base_and_early_cleanup_are_isolated(self) -> None:
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary).resolve()
+ repo = root / "repo"
+ source = repo / "experimental" / "CollectiveX"
+ source.mkdir(parents=True)
+ (source / "public.py").write_text("public\n")
+ nested = repo / "stage"
+ nested.mkdir(mode=0o700)
+ group_writable = root / "group-stage"
+ group_writable.mkdir(mode=0o770)
+ group_writable.chmod(0o770)
+ setgid = root / "setgid-stage"
+ setgid.mkdir(mode=0o750)
+ setgid.chmod(0o2750)
+ command = r'''
+ set -euo pipefail
+ source "$1"
+ ! (CX_STAGE_DIR="$3"; cx_stage_path "$2" "$3")
+ ! (CX_STAGE_DIR="$4"; cx_stage_path "$2" "$4")
+ export CX_STAGE_DIR="$5" COLLECTIVEX_EXECUTION_ID="setgid-$$"
+ trap 'cx_cleanup_private_logs 0' EXIT
+ staged="$(cx_stage_path "$2" "$CX_STAGE_DIR")"
+ cx_stage_repo "$2" "$staged"
+ chmod 2700 "$staged"
+ cx_cleanup_stage "$staged" "$2"
+ test ! -e "$staged"
+ '''
+ subprocess.run(
+ [
+ "bash", "-c", command, "_",
+ str(ROOT / "runtime" / "common.sh"), str(repo), str(nested),
+ str(group_writable), str(setgid),
+ ],
+ check=True,
+ env={**os.environ, "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null"},
+ )
+
+ early = r'''
+ set -euo pipefail
+ source "$1"
+ export REPO_ROOT="$2" CX_STAGE_DIR="$3" CX_BENCH=nccl-ep
+ export COLLECTIVEX_EXECUTION_ID="pre-marker-$$"
+ MOUNT_SRC="$(cx_stage_path "$REPO_ROOT" "$CX_STAGE_DIR")"
+ cx_install_launcher_fail_safe
+ mkdir -m 700 "$MOUNT_SRC"
+ exit 17
+ '''
+ result = subprocess.run(
+ [
+ "bash", "-c", early, "_",
+ str(ROOT / "runtime" / "common.sh"), str(repo), str(setgid),
+ ],
+ text=True,
+ capture_output=True,
+ env={**os.environ, "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null"},
+ )
+ self.assertEqual(result.returncode, 17, result.stderr)
+ self.assertEqual(list(setgid.iterdir()), [])
+
+ def test_backend_cache_reuses_v3_and_falls_back_once_without_repair(self) -> None:
+ # Walks cx_prepare_backend_cache / cx_verify_backend_cache_mount from
+ # runtime/common.sh through several situations, in order:
+ # concurrent and repeated preparation, a symlinked parent path, a replaced
+ # cache directory, the v3 path being a symlink (fallback to v4), both v3 and
+ # v4 being symlinks (failure), and finally an fsync failure inside the
+ # Python program embedded in cx_prepare_backend_cache.
+ common = ROOT / "runtime" / "common.sh"
+ with tempfile.TemporaryDirectory() as temporary:
+ parent = Path(temporary) / "stage"
+ parent.mkdir(mode=0o700)
+ concurrent = Path(temporary) / "concurrent"
+ concurrent.mkdir(mode=0o700)
+ # $1 common.sh, $2 cache parent, $3 directory collecting worker output.
+ # Three background workers must report identical digest/path pairs;
+ # repeated calls must return the same ".collectivex-backend-cache-v3-<uid>"
+ # path and digest; mount verification must pass with that digest and fail
+ # with an all-zero digest (last command, so it decides the exit status).
+ command = r'''
+ set -euo pipefail
+ source "$1"
+ for worker in 1 2 3; do
+ (
+ cx_prepare_backend_cache "$2"
+ printf '%s %s\n' "$CX_BACKEND_CACHE_SENTINEL_SHA256" \
+ "$CX_PREPARED_BACKEND_CACHE" > "$3/$worker"
+ ) &
+ done
+ wait
+ cmp "$3/1" "$3/2"
+ cmp "$3/1" "$3/3"
+ cx_prepare_backend_cache "$2"
+ first="$CX_PREPARED_BACKEND_CACHE"
+ first_digest="$CX_BACKEND_CACHE_SENTINEL_SHA256"
+ chmod 2700 "$first"
+ cx_prepare_backend_cache "$2"
+ second="$CX_PREPARED_BACKEND_CACHE"
+ test "$first" = "$second"
+ test "$first_digest" = "$CX_BACKEND_CACHE_SENTINEL_SHA256"
+ test "$first" = "$(cd "$2" && pwd -P)/.collectivex-backend-cache-v3-$(id -u)"
+ export CX_BACKEND_CACHE_ROOT="$first"
+ cx_verify_backend_cache_mount
+ export CX_BACKEND_CACHE_SENTINEL_SHA256="$(printf '0%.0s' {1..64})"
+ ! cx_verify_backend_cache_mount
+ '''
+ subprocess.run(
+ [
+ "bash", "-c", command, "_", str(common), str(parent),
+ str(concurrent),
+ ],
+ check=True,
+ )
+ # The script set mode 2700 on the cache; the permission bits are expected
+ # to read 0o700 here, and no temporary sentinel files may be left over.
+ cache = parent / f".collectivex-backend-cache-v3-{os.getuid()}"
+ self.assertTrue(cache.is_dir())
+ self.assertEqual(cache.stat().st_mode & 0o777, 0o700)
+ self.assertEqual(
+ list(cache.glob(".collectivex-mount-sentinel-v1.tmp.*")), []
+ )
+ # Preparing through a symlink to the parent must report the resolved
+ # cache path and a 64-hex-digit digest.
+ alias = Path(temporary) / "stage-alias"
+ alias.symlink_to(parent, target_is_directory=True)
+ canonical = subprocess.run(
+ [
+ "bash", "-c",
+ 'source "$1"; cx_prepare_backend_cache "$2"; '
+ 'printf "%s\\n%s\\n" "$CX_PREPARED_BACKEND_CACHE" '
+ '"$CX_BACKEND_CACHE_SENTINEL_SHA256"',
+ "_", str(common), str(alias),
+ ],
+ text=True,
+ capture_output=True,
+ check=True,
+ )
+ cache_path, digest = canonical.stdout.splitlines()
+ self.assertEqual(cache_path, str(cache.resolve()))
+ self.assertRegex(digest, r"^[0-9a-f]{64}$")
+ # Move the real cache aside and put a directory with a different
+ # sentinel in its place; verification against the old digest must fail.
+ saved = parent / "saved-cache"
+ cache.rename(saved)
+ cache.mkdir(mode=0o700)
+ replacement = cache / ".collectivex-mount-sentinel-v1"
+ replacement.write_bytes(b"replacement".ljust(32, b"!"))
+ replacement.chmod(0o600)
+ replaced = subprocess.run(
+ [
+ "bash", "-c",
+ 'source "$1"; export CX_BACKEND_CACHE_ROOT="$2" '
+ 'CX_BACKEND_CACHE_SENTINEL_SHA256="$3"; '
+ 'cx_verify_backend_cache_mount',
+ "_", str(common), str(cache), digest,
+ ]
+ )
+ self.assertNotEqual(replaced.returncode, 0)
+ # Restore the original cache, then delete it and make the v3 path a
+ # symlink; preparation is expected to fall back to a v4 directory and
+ # leave the symlink alone.
+ replacement.unlink()
+ cache.rmdir()
+ saved.rename(cache)
+ (cache / ".collectivex-mount-sentinel-v1").unlink()
+ cache.rmdir()
+ target = Path(temporary) / "target"
+ target.mkdir(mode=0o700)
+ cache.symlink_to(target, target_is_directory=True)
+ fallback = subprocess.run(
+ [
+ "bash", "-c",
+ 'source "$1"; cx_prepare_backend_cache "$2"; '
+ 'printf "%s\\n" "$CX_PREPARED_BACKEND_CACHE"',
+ "_", str(common), str(parent),
+ ],
+ text=True,
+ capture_output=True,
+ check=True,
+ )
+ v4 = parent / f".collectivex-backend-cache-v4-{os.getuid()}"
+ self.assertEqual(fallback.stdout.strip(), str(v4.resolve()))
+ self.assertTrue(cache.is_symlink())
+ self.assertTrue(v4.is_dir())
+ # Make the v4 path a symlink as well: preparation must now fail, must not
+ # print the parent path on stderr, and must leave both symlinks in place.
+ (v4 / ".collectivex-mount-sentinel-v1").unlink()
+ v4.rmdir()
+ v4.symlink_to(target, target_is_directory=True)
+ result = subprocess.run(
+ [
+ "bash", "-c", 'source "$1"; cx_prepare_backend_cache "$2"',
+ "_", str(common), str(parent),
+ ],
+ text=True,
+ capture_output=True,
+ )
+ self.assertNotEqual(result.returncode, 0)
+ self.assertNotIn(str(parent), result.stderr)
+ self.assertTrue(cache.is_symlink())
+ self.assertTrue(v4.is_symlink())
+
+ # Extract the Python program embedded in cx_prepare_backend_cache (the text
+ # between "<<'PY'" and the closing "PY" line) and run it in-process with an
+ # `os` module whose fsync raises OSError.
+ source = common.read_text().split("cx_prepare_backend_cache() {", 1)[1]
+ program = source.split("<<'PY'\n", 1)[1].split("\nPY\n", 1)[0]
+ with tempfile.TemporaryDirectory() as temporary:
+ parent = Path(temporary) / "stage"
+ parent.mkdir(mode=0o700)
+ fake_os = types.ModuleType("os")
+ fake_os.__dict__.update(os.__dict__)
+ fake_os.fsync = mock.Mock(side_effect=OSError("forced fsync failure"))
+ with (
+ mock.patch.dict(sys.modules, {"os": fake_os}),
+ mock.patch.object(sys, "argv", ["-", str(parent)]),
+ mock.patch.object(sys, "stdout", io.StringIO()),
+ self.assertRaises(SystemExit) as failure,
+ ):
+ exec(compile(program, "", "exec"), {})
+ # The program must exit with status 1 and leave no temporary sentinel.
+ self.assertEqual(failure.exception.code, 1)
+ self.assertEqual(
+ list(parent.rglob(".collectivex-mount-sentinel-v1.tmp.*")), []
+ )
+
+ def test_nvidia_namespace_package_roots_come_from_distribution_files(self) -> None:
+ # Builds a fake site directory holding nvidia/nccl files plus a
+ # nvidia_nccl_cu13 dist-info (METADATA and RECORD), puts it on PYTHONPATH,
+ # and checks cx_nvidia_package_root from runtime/run_in_container.sh.
+ runtime = ROOT / "runtime" / "run_in_container.sh"
+ with tempfile.TemporaryDirectory() as temporary:
+ site = Path(temporary) / "site"
+ package = site / "nvidia" / "nccl"
+ (package / "include").mkdir(parents=True)
+ (package / "lib").mkdir()
+ (package / "include" / "nccl.h").write_text("header\n")
+ (package / "lib" / "libnccl.so.2").write_text("library\n")
+ info = site / "nvidia_nccl_cu13-2.30.4.dist-info"
+ info.mkdir()
+ (info / "METADATA").write_text(
+ "Metadata-Version: 2.1\nName: nvidia-nccl-cu13\nVersion: 2.30.4\n"
+ )
+ (info / "RECORD").write_text(
+ "nvidia/nccl/include/nccl.h,,\n"
+ "nvidia/nccl/lib/libnccl.so.2,,\n"
+ "nvidia_nccl_cu13-2.30.4.dist-info/METADATA,,\n"
+ "nvidia_nccl_cu13-2.30.4.dist-info/RECORD,,\n"
+ )
+ # Only the cx_nvidia_package_root function is extracted (with sed) and
+ # evaluated; the rest of run_in_container.sh is not sourced. $2 is the
+ # resolved site directory. The "nccl" lookup must return <site>/nvidia/nccl
+ # and the "nvshmem" lookup must fail (last command, so it is enforced).
+ command = r'''
+ set -euo pipefail
+ eval "$(sed -n '/^cx_nvidia_package_root()/,/^}/p' "$1")"
+ root="$(cx_nvidia_package_root nvidia-nccl-cu13 nccl)"
+ test "$root" = "$2/nvidia/nccl"
+ ! cx_nvidia_package_root nvidia-nccl-cu13 nvshmem
+ '''
+ subprocess.run(
+ ["bash", "-c", command, "_", str(runtime), str(site.resolve())],
+ check=True,
+ env={**os.environ, "PYTHONPATH": str(site)},
+ )
+
+ def test_cuda_cccl_exports_the_resolved_jit_toolchain_root(self) -> None:
+ # Builds a fake CUDA toolkit tree (bin/nvcc, include, lib64 and
+ # targets/x86_64-linux/include/cccl), exposes it on PATH through a "cuda"
+ # symlink, and checks that cx_prepare_cuda_cccl sets CUDA_HOME,
+ # CX_CUDA_CCCL, CPATH and NVCC_PREPEND_FLAGS from the resolved toolkit path.
+ runtime = ROOT / "runtime" / "run_in_container.sh"
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary)
+ toolkit = root / "cuda-13.0"
+ (toolkit / "bin").mkdir(parents=True)
+ (toolkit / "include").mkdir()
+ (toolkit / "lib64").mkdir()
+ cccl = toolkit / "targets" / "x86_64-linux" / "include" / "cccl"
+ cccl.mkdir(parents=True)
+ # Stub nvcc: an executable shell script that exits 0.
+ nvcc = toolkit / "bin" / "nvcc"
+ nvcc.write_text("#!/bin/sh\nexit 0\n")
+ nvcc.chmod(0o755)
+ alias = root / "cuda"
+ alias.symlink_to(toolkit, target_is_directory=True)
+ # Only cx_prepare_cuda_cccl is extracted (with sed) and evaluated. $2 is
+ # the resolved toolkit path, while PATH contains the symlinked alias.
+ command = r'''
+ set -euo pipefail
+ eval "$(sed -n '/^cx_prepare_cuda_cccl()/,/^}/p' "$1")"
+ cx_prepare_cuda_cccl
+ test "$CUDA_HOME" = "$2"
+ test "$CX_CUDA_CCCL" = "$2/targets/x86_64-linux/include/cccl"
+ test "$CPATH" = "$2/targets/x86_64-linux/include/cccl:"
+ test "$NVCC_PREPEND_FLAGS" = "-I$2/targets/x86_64-linux/include/cccl "
+ '''
+ # CPATH and NVCC_PREPEND_FLAGS start out empty, which is why the expected
+ # values end with ":" and " " respectively.
+ subprocess.run(
+ ["bash", "-c", command, "_", str(runtime), str(toolkit.resolve())],
+ check=True,
+ env={
+ **os.environ,
+ "PATH": f"{alias / 'bin'}:{os.environ['PATH']}",
+ "CPATH": "",
+ "NVCC_PREPEND_FLAGS": "",
+ },
+ )
+
+ def test_deepep_v2_toolchain_rejects_overlay_lock_failure(self) -> None:
+ # Extracts cx_prepare_deepep_toolchain from runtime/run_in_container.sh,
+ # stubs its helpers, and makes `flock` return 1; the function must then
+ # fail.
+ runtime = ROOT / "runtime" / "run_in_container.sh"
+ with tempfile.TemporaryDirectory() as temporary:
+ # $2 (the temporary directory) is what the stubbed cx_deepep_v2_root
+ # prints. The negated call is the last command, so its status decides the
+ # script's exit status.
+ command = r'''
+ set -euo pipefail
+ eval "$(sed -n '/^cx_prepare_deepep_toolchain()/,/^}/p' "$1")"
+ cache_root="$2"
+ cx_nvidia_package_root() { printf '%s' /unused; }
+ cx_deepep_v2_root() { printf '%s' "$cache_root"; }
+ cx_log() { :; }
+ flock() { return 1; }
+ ! cx_prepare_deepep_toolchain
+ '''
+ subprocess.run(
+ ["bash", "-c", command, "_", str(runtime), temporary],
+ check=True,
+ )
+
+ def test_pinned_source_fetch_retries_transient_failures(self) -> None:
+ # Extracts cx_git, cx_git_in_tree and cx_fetch_revision from
+ # runtime/common.sh and runs cx_fetch_revision against a stubbed `git` whose
+ # `fetch` fails on the first two attempts and succeeds on the third; exactly
+ # three fetch attempts are expected.
+ common = ROOT / "runtime" / "common.sh"
+ with tempfile.TemporaryDirectory() as temporary:
+ # $1 common.sh, $2 the pinned revision, $3 the checkout directory.
+ # The git stub also fails any call that passes `safe.directory=*`, and
+ # any call that uses -C without `safe.directory=<resolved $3>`.
+ # `sleep` is stubbed out so retries do not wait.
+ # NOTE(review): the stub prints "$revision", which this script never
+ # assigns; presumably it is a variable of cx_fetch_revision visible
+ # through bash dynamic scoping -- confirm against common.sh.
+ command = r'''
+ set -euo pipefail
+ eval "$(sed -n '/^cx_git()/,/^}/p' "$1")"
+ eval "$(sed -n '/^cx_git_in_tree()/,/^}/p' "$1")"
+ eval "$(sed -n '/^cx_fetch_revision()/,/^}/p' "$1")"
+ attempts=0
+ expected_directory="$(cd -P -- "$3" && pwd -P)"
+ sleep() { :; }
+ git() {
+ local argument has_directory=0 has_trust=0
+ if [ "$1" = '-c' ] && [ "$3" = init ]; then
+ mkdir -p "${@: -1}"
+ return 0
+ fi
+ for argument in "$@"; do
+ [ "$argument" != '-C' ] || has_directory=1
+ [ "$argument" != "safe.directory=$expected_directory" ] || has_trust=1
+ [ "$argument" != 'safe.directory=*' ] || return 1
+ done
+ [ "$has_directory" = 0 ] || [ "$has_trust" = 1 ] || return 1
+ case " $* " in
+ *' fetch '*)
+ attempts=$((attempts + 1))
+ [ "$attempts" = 3 ]
+ ;;
+ *' rev-parse HEAD '*) printf '%s\n' "$revision" ;;
+ *) return 0 ;;
+ esac
+ }
+ cx_fetch_revision https://example.invalid/repo "$2" "$3"
+ test "$attempts" = 3
+ '''
+ revision = "a" * 40
+ subprocess.run(
+ ["bash", "-c", command, "_", str(common), revision, temporary],
+ check=True,
+ )
+
+    def test_git_tree_trust_is_exact_and_command_scoped(self) -> None:
+        # cx_git_in_tree must invoke git with a per-command safe.directory naming the
+        # resolved repository, and must refuse a relative path, a symlink alias and a
+        # directory literally named "*". The stubbed git records its argument vector,
+        # which is then compared against the single accepted invocation.
+        common = ROOT / "runtime" / "common.sh"
+        with tempfile.TemporaryDirectory() as temporary:
+            root = Path(temporary)
+            repository = root / "repo"
+            repository.mkdir()
+            alias = root / "alias"
+            alias.symlink_to(repository, target_is_directory=True)
+            wildcard = root / "*"
+            wildcard.mkdir()
+            arguments = root / "arguments"
+            # bash exempts a command negated with "!" from errexit, so a bare
+            # "! cmd" in the middle of the script would never fail the test.
+            # "|| exit 1" turns each expected rejection into an enforced assertion.
+            # Script arguments: $1 = common.sh, $2 = repository, $3 = symlink alias,
+            # $4 = file receiving git's arguments, $5 = directory named "*".
+            command = r'''
+            set -euo pipefail
+            eval "$(sed -n '/^cx_git()/,/^}/p' "$1")"
+            eval "$(sed -n '/^cx_git_in_tree()/,/^}/p' "$1")"
+            arguments="$4"
+            git() { printf '%s\n' "$@" > "$arguments"; }
+            cx_git_in_tree "$2" status --porcelain
+            ! cx_git_in_tree relative status || exit 1
+            ! cx_git_in_tree "$3" status || exit 1
+            ! cx_git_in_tree "$5" status || exit 1
+            '''
+            subprocess.run(
+                [
+                    "bash",
+                    "-c",
+                    command,
+                    "_",
+                    str(common),
+                    str(repository),
+                    str(alias),
+                    str(arguments),
+                    str(wildcard),
+                ],
+                check=True,
+            )
+            self.assertEqual(
+                arguments.read_text().splitlines(),
+                [
+                    "-c",
+                    "credential.helper=",
+                    "-c",
+                    f"safe.directory={repository.resolve()}",
+                    "-C",
+                    str(repository.resolve()),
+                    "status",
+                    "--porcelain",
+                ],
+            )
+            self.assertNotIn("safe.directory=*", arguments.read_text())
+
+ def test_runtime_materializes_the_verified_host_source_without_network(self) -> None:
+ # Sources common.sh, then overrides its helpers so that the seed directory is the
+ # backend source, validity means "contains a file named pinned", and any call to
+ # cx_fetch_revision leaves a marker file and fails. cp is wrapped to require -R.
+ # The script then checks that the destination holds the pinned file, that the copy
+ # was recursive, that the destination mode is 0700, and that no fetch was attempted.
+ common = ROOT / "runtime" / "common.sh"
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary)
+ seed = root / "seed"
+ seed.mkdir()
+ (seed / "pinned").write_text("source\n")
+ destination = root / "build"
+ fetched = root / "network-fetch"
+ # Script arguments: $1 = common.sh, $2 = temp root, $3 = seed,
+ # $4 = destination, $5 = marker written if a network fetch is attempted.
+ command = r'''
+ set -euo pipefail
+ source "$1"
+ export CX_BACKEND_SOURCE_ROOT="$2/source"
+ SEED="$3" FETCHED="$5"
+ copy_mode=
+ cx_backend_source_path() { printf '%s' "$SEED"; }
+ cx_backend_source_is_valid() { test -f "$2/pinned"; }
+ cx_fetch_revision() { : > "$FETCHED"; return 1; }
+ cp() {
+ test "$1" = -R
+ copy_mode=recursive
+ command cp "$@"
+ }
+ cx_materialize_backend_source deepep-hybrid "$4"
+ test -f "$4/pinned"
+ test "$copy_mode" = recursive
+ python3 - "$4" <<'PY'
+import os
+import stat
+import sys
+assert stat.S_IMODE(os.stat(sys.argv[1]).st_mode) == 0o700
+PY
+ test ! -e "$FETCHED"
+ '''
+ subprocess.run(
+ [
+ "bash", "-c", command, "_", str(common), str(root),
+ str(seed), str(destination), str(fetched),
+ ],
+ check=True,
+ )
+
+    def test_backend_source_validation_rejects_status_errors_and_ignored_files(self) -> None:
+        # cx_backend_source_is_valid is exercised against a stubbed git in three modes:
+        # "status-error" makes `status --porcelain` fail, "ignored" makes
+        # `ls-files --others --ignored` report a file, and "clean" does neither.
+        # Only the clean mode may be accepted.
+        common = ROOT / "runtime" / "common.sh"
+        with tempfile.TemporaryDirectory() as temporary:
+            # bash exempts a command negated with "!" from errexit, so a bare
+            # "! cmd" followed by further commands would never fail the test.
+            # "|| exit 1" turns each expected rejection into an enforced assertion.
+            # Script arguments: $1 = common.sh, $2 = temporary source directory.
+            command = r'''
+            set -euo pipefail
+            source "$1"
+            cx_backend_source_pin() { printf '%s|%s|' revision tree; }
+            git() {
+                case " $* " in
+                    *' rev-parse HEAD '*) printf '%s\n' revision ;;
+                    *' rev-parse HEAD^{tree} '*) printf '%s\n' tree ;;
+                    *' status --porcelain '*) [ "$mode" != status-error ] ;;
+                    *' ls-files --others --ignored '*)
+                        [ "$mode" != ignored ] || printf '%s\n' ignored.bin
+                        ;;
+                    *) return 1 ;;
+                esac
+            }
+            mode=status-error
+            ! cx_backend_source_is_valid backend "$2" || exit 1
+            mode=ignored
+            ! cx_backend_source_is_valid backend "$2" || exit 1
+            mode=clean
+            cx_backend_source_is_valid backend "$2"
+            '''
+            subprocess.run(
+                ["bash", "-c", command, "_", str(common), temporary],
+                check=True,
+            )
+
+    def test_backend_source_root_normalizes_inherited_special_mode(self) -> None:
+        # stat and chmod are shell stubs, so ownership and mode are fully scripted:
+        #   1. mode 2700 on the source root is accepted without any chmod call;
+        #   2. mode 2750 triggers chmod 700, whose first (stubbed) attempt fails, so
+        #      the preparation is rejected after exactly one chmod call;
+        #   3. the next attempt's chmod succeeds and the observed mode becomes 700;
+        #   4. a source root owned by a different uid than the mount is rejected.
+        common = ROOT / "runtime" / "common.sh"
+        with tempfile.TemporaryDirectory() as temporary:
+            root = Path(temporary)
+            source_root = root / "experimental" / "CollectiveX" / ".cx_sources"
+            source = source_root / "backend-revision"
+            source.mkdir(parents=True)
+            # bash exempts a command negated with "!" from errexit, so a bare
+            # "! cmd" followed by further commands would never fail the test.
+            # "|| exit 1" turns each expected rejection into an enforced assertion.
+            # Script arguments: $1 = common.sh, $2 = mount root, $3 = backend source.
+            command = r'''
+            set -euo pipefail
+            source "$1"
+            export COLLECTIVEX_EXECUTION_ID="source-mode-$$"
+            trap 'cx_cleanup_private_logs 0' EXIT
+            expected_mount="$2"
+            expected_source="$3"
+            expected_root="${expected_source%/*}"
+            observed_mode=2700
+            mock_stage_owner=4200
+            mock_root_owner=4200
+            chmod_calls=0
+            chmod() {
+                test "$1" = 700 && test "$2" = "$expected_root"
+                chmod_calls=$((chmod_calls + 1))
+                [ "$chmod_calls" = 2 ] || return 1
+                observed_mode=700
+            }
+            stat() {
+                case "$2" in
+                    %u)
+                        case "$3" in
+                            "$expected_mount") printf '%s\n' "$mock_stage_owner" ;;
+                            "$expected_root") printf '%s\n' "$mock_root_owner" ;;
+                            *) return 1 ;;
+                        esac
+                        ;;
+                    %a)
+                        case "$3" in
+                            "$expected_mount") printf '2700\n' ;;
+                            "$expected_root") printf '%s\n' "$observed_mode" ;;
+                            *) return 1 ;;
+                        esac
+                        ;;
+                    *) return 1 ;;
+                esac
+            }
+            cx_backend_source_path() { printf '%s' "$expected_source"; }
+            cx_backend_source_is_valid() {
+                test "$1" = backend && test "$2" = "$expected_source"
+            }
+            cx_prepare_backend_source "$2" backend
+            test "$observed_mode" = 2700
+            test "$chmod_calls" = 0
+            observed_mode=2750
+            ! _cx_prepare_backend_source "$2" backend || exit 1
+            test "$chmod_calls" = 1
+            _cx_prepare_backend_source "$2" backend
+            test "$observed_mode" = 700
+            mock_root_owner=4300
+            ! _cx_prepare_backend_source "$2" backend || exit 1
+            '''
+            subprocess.run(
+                ["bash", "-c", command, "_", str(common), str(root), str(source)],
+                check=True,
+            )
+
+    def test_canonical_backend_sources_use_verified_seed_without_network(self) -> None:
+        # With COLLECTIVEX_CANONICAL_GHA=1 and a seed root configured, both backends
+        # must be prepared from the seed directories without calling cx_fetch_revision
+        # (which would create the $NETWORK marker). Once the prepared sources are
+        # removed and the seed root is unset, preparation must fail, still without
+        # touching the network stub.
+        common = ROOT / "runtime" / "common.sh"
+        with tempfile.TemporaryDirectory() as temporary:
+            root = Path(temporary)
+            mount = root / "mount"
+            source_root = mount / "experimental" / "CollectiveX" / ".cx_sources"
+            seed_root = root / "seed"
+            seeds = [
+                seed_root / f"{backend}-revision"
+                for backend in ("backend-one", "backend-two")
+            ]
+            mount.mkdir(mode=0o700)
+            source_root.parent.mkdir(parents=True, mode=0o700)
+            for seed in seeds:
+                seed.mkdir(parents=True, mode=0o700)
+                (seed / "pinned").write_text("source\n")
+            network = root / "network"
+            # bash exempts a command negated with "!" from errexit, so a bare
+            # "! cmd" followed by further commands would never fail the test.
+            # "|| exit 1" turns the expected rejection into an enforced assertion.
+            # Script arguments: $1 = common.sh, $2 = mount, $3 = source root,
+            # $4 = seed root, $5 = marker written if a network fetch is attempted.
+            command = r'''
+            set -euo pipefail
+            source "$1"
+            export COLLECTIVEX_CANONICAL_GHA=1
+            export CX_BACKEND_SOURCE_SEED_ROOT="$4"
+            export COLLECTIVEX_EXECUTION_ID="source-seed-$$"
+            trap 'cx_cleanup_private_logs 0' EXIT
+            NETWORK="$5"
+            stat() {
+                case "$2" in
+                    %u) printf '4200\n' ;;
+                    %a) printf '700\n' ;;
+                    *) return 1 ;;
+                esac
+            }
+            cx_backend_source_path() { printf '%s/%s-revision' "$1" "$2"; }
+            cx_backend_source_is_valid() { test -f "$2/pinned"; }
+            cx_fetch_revision() { : > "$NETWORK"; return 1; }
+            cx_prepare_backend_source "$2" backend-one
+            cx_prepare_backend_source "$2" backend-two
+            test -f "$3/backend-one-revision/pinned"
+            test -f "$3/backend-two-revision/pinned"
+            test ! -e "$NETWORK"
+            rm -rf -- "$3/backend-one-revision" "$3/backend-two-revision"
+            unset CX_BACKEND_SOURCE_SEED_ROOT
+            ! _cx_prepare_backend_source "$2" backend-one || exit 1
+            test ! -e "$NETWORK"
+            '''
+            subprocess.run(
+                [
+                    "bash", "-c", command, "_", str(common), str(mount),
+                    str(source_root), str(seed_root), str(network),
+                ],
+                check=True,
+            )
+
+    def test_deepep_hybrid_cache_reuse_revalidates_extensions(self) -> None:
+        # A completion marker records revision, tree and the digest of the two
+        # extension files. The cache must be accepted while the files match that
+        # digest, and rejected once one extension's bytes change or an additional
+        # file matching the deep_ep_cpp*.so pattern appears.
+        common = ROOT / "runtime" / "common.sh"
+        runtime = ROOT / "runtime" / "run_in_container.sh"
+        with tempfile.TemporaryDirectory() as temporary:
+            root = Path(temporary)
+            (root / "deep_ep_cpp.so").write_bytes(b"deep")
+            (root / "hybrid_ep_cpp.so").write_bytes(b"hybrid")
+            # bash exempts a command negated with "!" from errexit, so a bare
+            # "! cmd" followed by further commands would never fail the test.
+            # "|| exit 1" turns each expected rejection into an enforced assertion.
+            # Script arguments: $1 = common.sh, $2 = run_in_container.sh,
+            # $3 = temporary cache directory.
+            command = r'''
+            set -euo pipefail
+            chmod 700 "$3"
+            source "$1"
+            eval "$(sed -n '/^cx_deepep_hybrid_marker_content_sha256()/,/^}/p' "$2")"
+            eval "$(sed -n '/^cx_deepep_hybrid_cache_is_valid()/,/^}/p' "$2")"
+            revision=revision tree=tree
+            cx_git() {
+                case " $* " in
+                    *' rev-parse HEAD '*) printf '%s\n' "$revision" ;;
+                    *' rev-parse HEAD^{tree} '*) printf '%s\n' "$tree" ;;
+                    *' status --porcelain '*|*' ls-files --others '*) return 0 ;;
+                    *) return 1 ;;
+                esac
+            }
+            cx_git_in_tree() { shift; cx_git "$@"; }
+            marker="$3/.collectivex-complete"
+            digest="$(cx_extension_pair_sha256 "$3" 'deep_ep_cpp*.so' 'hybrid_ep_cpp*.so')"
+            (umask 077; printf '%s\n%s\n%s\n' "$revision" "$tree" "$digest" > "$marker")
+            cx_deepep_hybrid_cache_is_valid "$3" "$marker" "$revision" "$tree"
+            printf changed > "$3/hybrid_ep_cpp.so"
+            ! cx_deepep_hybrid_cache_is_valid "$3" "$marker" "$revision" "$tree" || exit 1
+            printf hybrid > "$3/hybrid_ep_cpp.so"
+            cp "$3/deep_ep_cpp.so" "$3/deep_ep_cpp-extra.so"
+            ! cx_deepep_hybrid_cache_is_valid "$3" "$marker" "$revision" "$tree" || exit 1
+            '''
+            subprocess.run(
+                ["bash", "-c", command, "_", str(common), str(runtime), temporary],
+                check=True,
+            )
+
+    def test_rack_backend_environment_is_shared_per_node_and_required(self) -> None:
+        # Three layers of checks:
+        #   * static: launch_gb-nv.sh must contain the expected guard snippets and must
+        #     not reference /tmp/.cx_backend_env;
+        #   * consumer: the launcher's SOURCE_BACKEND_ENV assignment is evaluated with
+        #     the env root redirected into a temp dir and `stat` stubbed, and each
+        #     run_case states the expected exit status (0 = env file sourced,
+        #     66 = refused and the sentinel file not written);
+        #   * producer: cx_persist_backend_env from run_in_container.sh must write a
+        #     mode-600 node file that restores PYTHONPATH and DEEPEP_COMMIT, and must
+        #     fail for a non-numeric SLURM_NODEID.
+        runtime = ROOT / "runtime" / "run_in_container.sh"
+        launcher = (ROOT / "launchers" / "launch_gb-nv.sh").read_text()
+        assignment = next(
+            line for line in launcher.splitlines()
+            if line.startswith("SOURCE_BACKEND_ENV=")
+        )
+        self.assertNotIn("/tmp/.cx_backend_env", launcher)
+        self.assertIn('[ -f "$env_file" ] && [ -r "$env_file" ]', launcher)
+        self.assertIn('[ ! -L "$env_file" ]', launcher)
+        self.assertIn('$(stat -c "%u" "$env_root"):600', launcher)
+        self.assertIn('case "$(stat -c "%a" "$env_root")" in 700|[1-7]700)', launcher)
+        self.assertIn("node-${SLURM_NODEID}.sh", launcher)
+        self.assertIn("HybridEPBuffer", launcher)
+        self.assertIn('. "$env_file" || exit 66', launcher)
+        with tempfile.TemporaryDirectory() as temporary:
+            # This script runs without errexit (the subshell in run_case is expected
+            # to exit non-zero), so without "|| exit 1" only the final run_case would
+            # decide the exit status and every earlier case would go unchecked.
+            # Script arguments: $1 = SOURCE_BACKEND_ENV assignment line, $2 = temp dir.
+            # run_case arguments: root mode, file owner:mode, stat failure flag,
+            # SLURM_NODEID, expected exit status.
+            consumer = r'''
+            eval "$1"
+            env_root="$2/env"
+            SOURCE_BACKEND_ENV="${SOURCE_BACKEND_ENV//\/ix\/experimental\/CollectiveX\/.cx_backend\/env/$env_root}"
+            mkdir -p "$env_root"
+            env_file="$env_root/node-1.sh"
+            printf 'printf sourced > "$CX_SENTINEL"\n' > "$env_file"
+            chmod 600 "$env_file"
+            export CX_SENTINEL="$2/sentinel"
+            stat() {
+                [ "${STAT_FAIL:-0}" = 0 ] || return 1
+                case "$2" in
+                    %a) printf '%s\n' "$ROOT_MODE" ;;
+                    %u) printf '1000\n' ;;
+                    %u:%a) printf '%s\n' "$FILE_OWNER_MODE" ;;
+                    *) return 2 ;;
+                esac
+            }
+            run_case() {
+                rm -f "$CX_SENTINEL"
+                ROOT_MODE="$1" FILE_OWNER_MODE="$2" STAT_FAIL="$3" SLURM_NODEID="$4"
+                ( eval "$SOURCE_BACKEND_ENV" )
+                rc=$?
+                [ "$rc" = "$5" ] || return 1
+                if [ "$5" = 0 ]; then
+                    [ -f "$CX_SENTINEL" ]
+                else
+                    [ ! -e "$CX_SENTINEL" ]
+                fi
+            }
+            run_case 700 1000:600 0 1 0 || exit 1
+            run_case 2700 1000:600 0 1 0 || exit 1
+            run_case 755 1000:600 0 1 66 || exit 1
+            run_case 700 1000:600 1 1 66 || exit 1
+            run_case 700 2000:600 0 1 66 || exit 1
+            mv "$env_file" "$env_file.real"
+            ln -s "$env_file.real" "$env_file"
+            run_case 700 1000:600 0 1 66 || exit 1
+            rm "$env_file"
+            mv "$env_file.real" "$env_file"
+            run_case 700 1000:600 0 invalid 66 || exit 1
+            '''
+            subprocess.run(
+                ["bash", "-c", consumer, "_", assignment, temporary],
+                check=True,
+            )
+            # Script arguments: $1 = run_in_container.sh, $2 = temp dir used as cwd.
+            # The stat call tries the BSD form first and falls back to the GNU form.
+            command = r'''
+            set -euo pipefail
+            cd "$2"
+            eval "$(sed -n '/^cx_persist_backend_env()/,/^}/p' "$1")"
+            export SLURM_NODEID=1 PYTHONPATH=/ix/pinned DEEPEP_COMMIT=abc
+            cx_persist_backend_env
+            env_file="$PWD/.cx_backend/env/node-1.sh"
+            test -f "$env_file"
+            test "$(stat -f %Lp "$env_file" 2>/dev/null || stat -c %a "$env_file")" = 600
+            unset PYTHONPATH DEEPEP_COMMIT
+            . "$env_file"
+            test "$PYTHONPATH" = /ix/pinned
+            test "$DEEPEP_COMMIT" = abc
+            SLURM_NODEID=invalid && ! cx_persist_backend_env
+            '''
+            subprocess.run(
+                ["bash", "-c", command, "_", str(runtime), temporary],
+                check=True,
+            )
+
+ def test_stage_cleanup_failure_fails_job_but_marks_allocation_safe(self) -> None:
+ # Calls cx_launcher_cleanup 0 with cx_cleanup_stage forced to fail. The stubs leave
+ # marker files in CX_JOB_ROOT so the outcome can be inspected from Python: the
+ # script must exit with status 1, the guard must read "safe" (not "unsafe"), and
+ # the private-log cleanup stub must not have been invoked.
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary)
+ (root / "repo").mkdir()
+ (root / "stage").mkdir()
+ # Script arguments: $1 = common.sh, $2 = temp root used as CX_JOB_ROOT.
+ command = r'''
+ source "$1"
+ cx_write_cleanup_guard() {
+ rm -f -- "$CX_JOB_ROOT/cleanup-safe" "$CX_JOB_ROOT/cleanup-unsafe"
+ : > "$CX_JOB_ROOT/cleanup-$1"
+ }
+ cx_cleanup_stage() { return 1; }
+ cx_cleanup_private_logs() { : > "$CX_JOB_ROOT/logs-deleted"; }
+ export CX_JOB_ROOT="$2" REPO_ROOT="$2/repo" MOUNT_SRC="$2/stage"
+ export COLLECTIVEX_CANONICAL_GHA=1 CX_ALLOCATION_UNCERTAIN=0
+ unset CX_BENCH JOB_ID
+ cx_launcher_cleanup 0
+ '''
+ # No check=True here: the non-zero exit status is the expected result and is
+ # asserted explicitly below.
+ result = subprocess.run(
+ ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh"),
+ str(root)],
+ text=True,
+ capture_output=True,
+ env={**os.environ, "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null"},
+ )
+ self.assertEqual(result.returncode, 1, result.stderr)
+ self.assertTrue((root / "cleanup-safe").is_file())
+ self.assertFalse((root / "cleanup-unsafe").exists())
+ self.assertFalse((root / "logs-deleted").exists())
+
+ def test_generated_stage_cleanup_never_removes_configured_base(self) -> None:
+ # Builds <base>/job_execution with a ".collectivex-stage-v1" marker file and a
+ # payload, then calls cx_cleanup_stage twice: on the generated directory and on
+ # the configured base itself (CX_STAGE_DIR). The second call is negated and is the
+ # last command, so it must fail for the script to exit 0. The script has no
+ # errexit; removal of the generated directory is verified from Python instead.
+ with tempfile.TemporaryDirectory() as temporary:
+ root = Path(temporary)
+ base = root / "stage"
+ repo = root / "repo"
+ generated = base / "job_execution"
+ generated.mkdir(parents=True, mode=0o700)
+ repo.mkdir()
+ marker = generated / ".collectivex-stage-v1"
+ marker.write_text("collectivex-stage-v1\nexecution\n")
+ marker.chmod(0o600)
+ (generated / "payload").write_text("temporary")
+ # Script arguments: $1 = common.sh, $2 = generated stage, $3 = repo, $4 = base.
+ subprocess.run(
+ [
+ "bash", "-c",
+ 'source "$1"; cx_cleanup_stage "$2" "$3"; '
+ '! cx_cleanup_stage "$4" "$3"',
+ "_", str(ROOT / "runtime" / "common.sh"), str(generated),
+ str(repo), str(base),
+ ],
+ check=True,
+ env={
+ **os.environ,
+ "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null",
+ "COLLECTIVEX_EXECUTION_ID": "execution",
+ "CX_STAGE_DIR": str(base),
+ },
+ )
+ self.assertFalse(generated.exists())
+ self.assertTrue(base.is_dir())
+ self.assertTrue(repo.is_dir())
+
+    def test_adapters_do_not_retain_dead_expected_methods(self) -> None:
+        # Parse every ep_*.py module next to this test and make sure none of them
+        # defines a function or method (sync or async, at any nesting level) that is
+        # named "expected".
+        definition_kinds = (ast.FunctionDef, ast.AsyncFunctionDef)
+        for adapter in HERE.glob("ep_*.py"):
+            module = ast.parse(adapter.read_text(), str(adapter))
+            defined = set()
+            for node in ast.walk(module):
+                if isinstance(node, definition_kinds):
+                    defined.add(node.name)
+            self.assertNotIn("expected", defined, adapter.name)
+
+ def test_artifact_safety_rejects_sensitive_material(self) -> None:
+ # Table-driven check of artifact_safety.assert_publication_safe. Each entry maps a
+ # case name to (document, offending text): the document must be rejected with
+ # ArtifactSafetyError, and the error message must not repeat the offending text.
+ # The address and token are assembled at runtime rather than written as literals
+ # (presumably so this file itself never contains a matching value).
+ private_address = ".".join(str(octet) for octet in (10, 0, 0, 1))
+ secret = "github_pat_" + "A" * 24
+ sensitive = {
+ "ipv4": ({"note": private_address}, private_address),
+ "ipv6": ({"note": "[2001:db8::1]:29500"}, "2001:db8::1"),
+ "user-at-host": ({"note": "ssh admin@private-host"}, "admin@private-host"),
+ "hostname": ({"note": "host=compute-17"}, "compute-17"),
+ "private-dns": ({"note": "worker-7.cluster.local"}, "worker-7.cluster.local"),
+ "suffixed-host": ({"worker_hostname": "relative"}, "worker_hostname"),
+ "suffixed-address": ({"control_address": "relative"}, "control_address"),
+ "suffixed-path": ({"scheduler_path": "relative"}, "scheduler_path"),
+ "exact-address": ({"address": "relative"}, "address"),
+ "exact-ip": ({"ip": "relative"}, "ip"),
+ "camel-host": ({"workerHost": "relative"}, "workerHost"),
+ "camel-path": ({"schedulerPath": "relative"}, "schedulerPath"),
+ "acronym-gpu-uuid": ({"gpuUUID": "relative"}, "gpuUUID"),
+ "acronym-device-uuid": ({"deviceUUID": "relative"}, "deviceUUID"),
+ "acronym-pci-bus": ({"pciBusID": "relative"}, "pciBusID"),
+ "mac-address": ({"note": "00:11:22:33:44:55"}, "00:11:22:33:44:55"),
+ "ib-guid": ({"note": "00:11:22:33:44:55:66:77"}, "00:11:22:33:44:55:66:77"),
+ "dgx-host": ({"note": "dgx-b300-001"}, "dgx-b300-001"),
+ "cloud-host": ({"note": "ip-10-20-30-40"}, "ip-10-20-30-40"),
+ "credential-field": ({"service_token": "short"}, "service_token"),
+ "prefixed-token": ({"note": secret}, secret),
+ "hf-token": ({"note": "hf_" + "A" * 24}, "hf_" + "A" * 24),
+ "payment-token": ({"note": "sk_live_" + "A" * 24}, "sk_live_" + "A" * 24),
+ "generic-secret": ({"note": "password=not-a-real-secret"}, "not-a-real-secret"),
+ }
+ # One additional case per filesystem root name listed here.
+ for root in ("data", "it-share", "lustre", "raid", "nvme_home", "scratch", "gpfs", "fsx"):
+ value = f"/{root}/collectivex/run"
+ sensitive[f"private-root-{root}"] = ({"note": value}, value)
+ for name, (document, offending) in sensitive.items():
+ with self.subTest(name=name), self.assertRaises(
+ artifact_safety.ArtifactSafetyError
+ ) as caught:
+ artifact_safety.assert_publication_safe([document])
+ # caught.exception is the error captured by assertRaises above.
+ self.assertNotIn(offending, str(caught.exception))
+
+ # Documents that must be accepted: a sanitized record with a relative dataset
+ # path, a colon-separated timing string and a sha256 image digest ...
+ artifact_safety.assert_publication_safe([{
+ "runner": "b300",
+ "redaction": "sanitized-v1",
+ "path": "datasets/" + "a" * 64 + "/dataset.json",
+ "timing": "8:64:32",
+ "image_digest": "sha256:" + "b" * 64,
+ "source": "github.com",
+ }])
+ # ... and ref names that merely resemble a user@host, a hostname, or a token prefix.
+ for ref in ("release@candidate", "worker1-feature", "sk-refactor-long-component-name"):
+ artifact_safety.assert_publication_safe([{"ref": ref}])
+
+    def test_artifact_safety_cli_does_not_echo_sensitive_values(self) -> None:
+        # Runs artifact_safety.py as a script on a JSON file holding a private IPv4
+        # address. The command must fail and name the rule on stderr, and stderr
+        # must not repeat the address itself.
+        octets = (10, 24, 68, 12)
+        private_value = ".".join(map(str, octets))
+        with tempfile.TemporaryDirectory() as temporary:
+            artifact = Path(temporary, "artifact.json")
+            artifact.write_text(json.dumps({"note": private_value}))
+            argv = [sys.executable, str(ROOT / "artifact_safety.py"), str(artifact)]
+            completed = subprocess.run(argv, text=True, capture_output=True)
+            stderr = completed.stderr
+            self.assertNotEqual(completed.returncode, 0)
+            self.assertIn("forbidden ipv4-address value", stderr)
+            self.assertNotIn(private_value, stderr)
+
+    def test_artifact_safety_rejects_linked_and_special_inputs(self) -> None:
+        # load_documents must refuse an input that is a symlink to a regular JSON
+        # file, and an input that is a FIFO.
+        with tempfile.TemporaryDirectory() as temporary:
+            workdir = Path(temporary)
+            regular = workdir / "source.json"
+            regular.write_text("{}")
+            symlink = workdir / "linked.json"
+            symlink.symlink_to(regular)
+            pipe = workdir / "fifo.json"
+            os.mkfifo(pipe)
+            for candidate in (symlink, pipe):
+                with self.subTest(path=candidate.name):
+                    with self.assertRaises(artifact_safety.ArtifactSafetyError):
+                        artifact_safety.load_documents([str(candidate)])
+
+
+# Allow running this test module directly as a script.
+if __name__ == "__main__":
+ unittest.main()
diff --git a/experimental/CollectiveX/tests/workload.py b/experimental/CollectiveX/tests/workload.py
new file mode 100644
index 0000000000..89a6b46052
--- /dev/null
+++ b/experimental/CollectiveX/tests/workload.py
@@ -0,0 +1,358 @@
+#!/usr/bin/env python3
+"""Canonical, byte-stable CollectiveX routing workloads.
+
+A *canonical workload* is a routing trace generated ONCE, serialized to a platform-independent
+file, and referenced by an immutable `workload_id`. Every promoted benchmark point consumes the
+SAME serialized bytes, so "did NVIDIA and AMD run the identical workload?" is answered by a
+checksum match, not by trusting that two machines re-ran the same seeded generator.
+
+Layout on disk (one workload = two files, basename = workload_id):
+    OUT_DIR/WORKLOAD_ID.npz            topk_idx [gt,topk] int32, topk_weights [gt,topk] float32
+    OUT_DIR/WORKLOAD_ID.manifest.json  dims, routing profile, generator version, seed, SHA-256s
+
+Routing and gate weights come from a stdlib integer counter, not a framework RNG. The same
+parameters therefore produce the same int32/float32 bytes across PyTorch and accelerator images.
+"""
+from __future__ import annotations
+
+from array import array
+import bisect
+import hashlib
+import json
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import identity # noqa: E402
+
+# Manifest schema version; verify_workload rejects a manifest carrying any other value.
+WORKLOAD_SCHEMA_VERSION = 1
+# Bump when the counter or byte encoding changes. The workload ID binds parameters and trace bytes.
+GENERATOR_VERSION = "collectivex-routing-counter-v3"
+# Gate-weight encoding tag; verify_workload compares the manifest value against it verbatim.
+GATE_WEIGHT_FORMAT = "counter-u16-normalized-f32"
+# Activation generator tag; part of the workload ID key and of the activation identity.
+ACTIVATION_GENERATOR = "collectivex-activation-counter-v3"
+# Mask used to reduce the counter arithmetic modulo 2**64.
+_MASK64 = (1 << 64) - 1
+
+
+def _sha256(b: bytes) -> str:
+    """Return the lowercase hexadecimal SHA-256 digest of ``b``."""
+    digest = hashlib.sha256()
+    digest.update(b)
+    return digest.hexdigest()
+
+
+def _mix64(value: int) -> int:
+    """Scramble ``value`` into a 64-bit integer.
+
+    One additive step is followed by two xor-shift/multiply rounds and a final
+    xor-shift; every intermediate is reduced modulo 2**64.
+    """
+    mixed = (value + 0x9E3779B97F4A7C15) & _MASK64
+    rounds = ((30, 0xBF58476D1CE4E5B9), (27, 0x94D049BB133111EB))
+    for shift, multiplier in rounds:
+        mixed = ((mixed ^ (mixed >> shift)) * multiplier) & _MASK64
+    return mixed ^ (mixed >> 31)
+
+
+def _counter(seed: int, token: int, slot: int, attempt: int, stream: int) -> int:
+    """Return the mixed 64-bit counter value for one (token, slot, attempt, stream).
+
+    Each coordinate is offset by one, multiplied by its own odd constant, reduced to
+    64 bits and XOR-ed into the masked seed before the result goes through _mix64.
+    """
+    lanes = (
+        (token, 0xD2B74407B1CE6E93),
+        (slot, 0xCA5A826395121157),
+        (attempt, 0x9E3779B185EBCA87),
+        (stream, 0xA24BAED4963EE407),
+    )
+    state = seed & _MASK64
+    for coordinate, multiplier in lanes:
+        state ^= ((coordinate + 1) * multiplier) & _MASK64
+    return _mix64(state)
+
+
+def canonical_routing_rows(
+    global_tokens: int, experts: int, topk: int, routing: str, seed: int
+) -> tuple[list[list[int]], list[list[float]]]:
+    """Generate distinct experts and normalized weights using exact integer counters.
+
+    Returns ``(indices, weights)``: one row per token, each row holding ``topk``
+    distinct expert IDs and ``topk`` gate weights that sum to one.
+
+    Raises ValueError for an unknown routing profile or non-positive/inconsistent
+    dimensions, and RuntimeError if a slot cannot find an unused expert within
+    ``experts * 16 + 1`` counter draws.
+    """
+    if routing not in {"uniform", "zipf"}:
+        raise ValueError(f"unknown routing {routing!r} (uniform|zipf)")
+    if global_tokens <= 0 or experts <= 0 or topk <= 0 or topk > experts:
+        raise ValueError("global_tokens/experts/topk must be positive and topk <= experts")
+
+    # For zipf routing, expert e gets integer weight 2**32 // (e + 1); the running
+    # totals are the bucket edges searched with bisect below.
+    zipf_edges: list[int] | None = None
+    if routing == "zipf":
+        running = 0
+        zipf_edges = []
+        for rank in range(1, experts + 1):
+            running += (1 << 32) // rank
+            zipf_edges.append(running)
+
+    last_attempt = experts * 16
+    index_rows: list[list[int]] = []
+    weight_rows: list[list[float]] = []
+    for token in range(global_tokens):
+        chosen: list[int] = []
+        seen: set[int] = set()
+        for slot in range(topk):
+            # Stream 0 drives expert selection; redraw with the next attempt number
+            # whenever the candidate was already picked for this token.
+            for attempt in range(last_attempt + 1):
+                draw = _counter(seed, token, slot, attempt, 0)
+                if zipf_edges is None:
+                    candidate = draw % experts
+                else:
+                    candidate = bisect.bisect_right(zipf_edges, draw % zipf_edges[-1])
+                if candidate not in seen:
+                    seen.add(candidate)
+                    chosen.append(candidate)
+                    break
+            else:
+                raise RuntimeError("counter routing could not select distinct experts")
+        # Stream 1 drives the gate weights: integers in [1, 65535], normalized per token.
+        units = [1 + _counter(seed, token, slot, 0, 1) % 65535 for slot in range(topk)]
+        total = float(sum(units))
+        index_rows.append(chosen)
+        weight_rows.append([unit / total for unit in units])
+    return index_rows, weight_rows
+
+
+def _canonical_bytes(
+    indices: list[list[int]], weights: list[list[float]]
+) -> tuple[bytes, bytes]:
+    """Flatten the rows row-major into little-endian int32 and float32 byte strings.
+
+    Raises RuntimeError when the platform's ``array`` "i" or "f" item is not 4 bytes.
+    """
+    packed = [
+        array(code, (item for row in rows for item in row))
+        for code, rows in (("i", indices), ("f", weights))
+    ]
+    if any(buffer.itemsize != 4 for buffer in packed):
+        raise RuntimeError("canonical workload requires 32-bit int and float arrays")
+    if sys.byteorder != "little":
+        # array stores native byte order; swap so the bytes are always little-endian.
+        for buffer in packed:
+            buffer.byteswap()
+    idx, gate = packed
+    return idx.tobytes(), gate.tobytes()
+
+
+def trace_checksums(
+    indices: list[list[int]], weights: list[list[float]]
+) -> dict[str, str]:
+    """Return the manifest hashes for exact logical or remapped routing rows.
+
+    The result has three SHA-256 hex digests: ``topk_idx`` and ``topk_weights`` over
+    the respective canonical bytes, and ``trace`` over the index bytes followed by
+    the weight bytes.
+    """
+    idx_blob, weight_blob = _canonical_bytes(indices, weights)
+    digests: dict[str, str] = {}
+    digests["topk_idx"] = _sha256(idx_blob)
+    digests["topk_weights"] = _sha256(weight_blob)
+    digests["trace"] = _sha256(idx_blob + weight_blob)
+    return digests
+
+
+def canonical_member(
+    routing: str,
+    hidden: int,
+    topk: int,
+    experts: int,
+    ep_size: int,
+    tokens_per_rank: int,
+    seed: int,
+) -> tuple[str, dict[str, str], list[list[int]], list[list[float]]]:
+    """Derive one canonical manifest member and retain its rows for proof checks.
+
+    Returns ``(workload_id, checksums, indices, weights)`` for a trace of
+    ``ep_size * tokens_per_rank`` tokens.
+    """
+    total_tokens = ep_size * tokens_per_rank
+    rows = canonical_routing_rows(total_tokens, experts, topk, routing, seed)
+    digests = trace_checksums(*rows)
+    # Pass the digest along so the ID computation does not regenerate the trace.
+    workload = compute_workload_id(
+        routing=routing,
+        hidden=hidden,
+        topk=topk,
+        experts=experts,
+        ep_size=ep_size,
+        global_tokens=total_tokens,
+        seed=seed,
+        trace_checksum=digests["trace"],
+    )
+    index_rows, weight_rows = rows
+    return workload, digests, index_rows, weight_rows
+
+
+def compute_workload_id(routing: str, hidden: int, topk: int, experts: int,
+                        ep_size: int, global_tokens: int, seed: int,
+                        generator: str = GENERATOR_VERSION,
+                        trace_checksum: str | None = None) -> str:
+    """Deterministic ID over parameters and canonical trace bytes.
+
+    When ``trace_checksum`` is omitted the trace is regenerated from the parameters
+    and hashed here. Raises ValueError for any generator other than
+    GENERATOR_VERSION.
+    """
+    if generator != GENERATOR_VERSION:
+        raise ValueError(f"unsupported workload generator {generator!r}")
+    if trace_checksum is None:
+        rows = canonical_routing_rows(global_tokens, experts, topk, routing, seed)
+        # Index bytes followed by weight bytes, as in the manifest's "trace" digest.
+        trace_checksum = _sha256(b"".join(_canonical_bytes(*rows)))
+    key = dict(
+        generator=generator,
+        routing=routing,
+        hidden=hidden,
+        topk=topk,
+        experts=experts,
+        ep_size=ep_size,
+        global_tokens=global_tokens,
+        seed=seed,
+        trace_sha256=trace_checksum,
+        activation_generator=ACTIVATION_GENERATOR,
+        activation_identity=compute_activation_identity(seed, hidden),
+    )
+    return identity.workload_id(key)
+
+
+def compute_activation_identity(seed, hidden, generator=ACTIVATION_GENERATOR) -> str:
+    """Identity of the exact counter-derived activation generator.
+
+    The identity is the SHA-256 hex digest of a "|"-separated description of the
+    seed, hidden size and generator tag.
+    """
+    fields = ("counter", f"seed={seed}", f"hidden={hidden}", f"gen={generator}")
+    return _sha256("|".join(fields).encode())
+
+
+def build_manifest(routing, hidden, topk, experts, global_tokens, seed, experts_per_rank,
+ idx_np, weights_np):
+ """Assemble the manifest dict from the (numpy) trace arrays. Pure numpy/stdlib."""
+ if experts % experts_per_rank:
+ raise ValueError("experts must be divisible by experts_per_rank")
+ idx_bytes = idx_np.astype(" str:
+ import numpy as np
+ os.makedirs(out_dir, exist_ok=True)
+ wid = manifest["workload_id"]
+ np.savez_compressed(os.path.join(out_dir, f"{wid}.npz"),
+ topk_idx=idx_np.astype(np.int32), topk_weights=weights_np.astype(np.float32))
+ with open(os.path.join(out_dir, f"{wid}.manifest.json"), "w") as fh:
+ json.dump(manifest, fh, indent=2, sort_keys=True)
+ return wid
+
+
+def load_workload(npz_path, verify=True):
+    """Load a canonical trace (numpy + stdlib only).
+
+    ``npz_path`` may be a string or an ``os.PathLike``. It may name either the
+    ``.npz`` archive or the basename shared by the archive and its
+    ``.manifest.json`` file.
+
+    Returns ``(idx_np, weights_np, manifest)``.
+
+    Raises ValueError when the manifest's workload ID differs from the file
+    basename, when the archive does not hold exactly ``topk_idx`` and
+    ``topk_weights``, or when ``verify`` is true and verify_workload rejects the
+    loaded data.
+    """
+    import numpy as np
+
+    # os.fspath lets callers pass pathlib.Path objects, which have no str.endswith.
+    base = os.fspath(npz_path)
+    if base.endswith(".npz"):
+        base = base[:-4]
+    # Decode the manifest explicitly as UTF-8 rather than with the locale default.
+    with open(base + ".manifest.json", encoding="utf-8") as fh:
+        manifest = json.load(fh)
+    if manifest.get("workload_id") != os.path.basename(base):
+        raise ValueError(f"workload manifest ID does not match filename for {base}")
+    # allow_pickle=False: the archive must contain plain arrays only.
+    with np.load(base + ".npz", allow_pickle=False) as archive:
+        if set(archive.files) != {"topk_idx", "topk_weights"}:
+            raise ValueError(f"workload archive fields differ for {base}")
+        idx_np = np.ascontiguousarray(archive["topk_idx"])
+        w_np = np.ascontiguousarray(archive["topk_weights"])
+    if verify:
+        ok, reason = verify_workload(manifest, idx_np, w_np)
+        if not ok:
+            raise ValueError(f"workload checksum mismatch for {base}: {reason}")
+    return idx_np, w_np, manifest
+
+
+def verify_workload(manifest, idx_np, weights_np):
+ """Recompute checksums and compare to the manifest. Returns (ok, reason)."""
+ import numpy as np
+ expected_fields = {
+ "schema_version", "workload_id", "generator_version", "gate_weight_format", "dims",
+ "routing_profile", "seed", "checksums", "activation_profile", "activation_generator",
+ "activation_identity",
+ }
+ if not isinstance(manifest, dict) or set(manifest) != expected_fields:
+ return False, "manifest fields differ from the v1 contract"
+ if (manifest["schema_version"] != WORKLOAD_SCHEMA_VERSION
+ or manifest["generator_version"] != GENERATOR_VERSION
+ or manifest["gate_weight_format"] != GATE_WEIGHT_FORMAT
+ or manifest["routing_profile"] not in {"uniform", "zipf"}):
+ return False, "manifest version or generator is unsupported"
+ if (isinstance(manifest["seed"], bool) or not isinstance(manifest["seed"], int)
+ or not identity.is_typed_id(manifest["workload_id"], "workload")):
+ return False, "manifest seed or workload ID is invalid"
+ dims = manifest["dims"]
+ dim_fields = {"hidden", "topk", "experts", "ep_size", "tokens_per_rank",
+ "global_tokens", "experts_per_rank"}
+ if not isinstance(dims, dict) or set(dims) != dim_fields:
+ return False, "manifest dimensions are invalid"
+ if any(isinstance(dims[key], bool) or not isinstance(dims[key], int) or dims[key] <= 0
+ for key in dim_fields):
+ return False, "manifest dimensions must be positive integers"
+ if (dims["experts"] != dims["ep_size"] * dims["experts_per_rank"]
+ or dims["global_tokens"] != dims["ep_size"] * dims["tokens_per_rank"]):
+ return False, "manifest EP dimensions are inconsistent"
+ shape = (dims["global_tokens"], dims["topk"])
+ if (idx_np.dtype != np.int32 or weights_np.dtype != np.float32
+ or idx_np.shape != shape or weights_np.shape != shape
+ or not idx_np.flags.c_contiguous or not weights_np.flags.c_contiguous):
+ return False, "workload array dtype, shape, or layout is invalid"
+ if (np.any(idx_np < 0) or np.any(idx_np >= dims["experts"])
+ or np.any(np.diff(np.sort(idx_np, axis=1), axis=1) == 0)):
+ return False, "expert indices are out of range or repeated"
+ if (not np.isfinite(weights_np).all() or np.any(weights_np < 0)
+ or not np.allclose(weights_np.sum(axis=1), 1.0, rtol=1e-5, atol=1e-6)):
+ return False, "gate weights are invalid"
+ if (manifest["activation_profile"] != "canonical-counter-source-v3"
+ or manifest["activation_generator"] != ACTIVATION_GENERATOR
+ or manifest["activation_identity"]
+ != compute_activation_identity(
+ manifest["seed"], dims["hidden"], manifest["activation_generator"]
+ )):
+ return False, "activation identity is invalid"
+ ib = idx_np.astype(" must fail
+ idx2[0, 0] = (int(idx2[0, 0]) + 1) % 256
+ bad, _ = verify_workload(man2, idx2, w2)
+ assert not bad, "verify must catch tampering"
+ print(f"save/load/verify roundtrip OK (workload_id={wid})")
+ except ImportError:
+ print("(numpy unavailable — skipped serialization roundtrip; id logic passed)")
+ print("workload self-test: PASS")
+ sys.exit(0)