diff --git a/.github/workflows/collectivex-sweep.yml b/.github/workflows/collectivex-sweep.yml index b56bc285a..91bb3c727 100644 --- a/.github/workflows/collectivex-sweep.yml +++ b/.github/workflows/collectivex-sweep.yml @@ -1,154 +1,851 @@ # CollectiveX Sweep — one structured run instead of thousands of dispatches. # -# Shape (mirrors the InferenceX CI tracker): setup -> sweep (a MATRIX job = "a job with other jobs -# in it") -> aggregate (the collector "at the end"). The matrix unit is a SHARD = one allocation that -# sweeps many cases sharing (sku, backend, mode, resource) — generate_matrix's own grouping, chunked -# so no cell exceeds the job budget. Each cell emits a handful of per-case JSONs; the aggregate job -# collects every shard into ONE line-delimited file (results/aggregate/*.ndjson) so there aren't -# thousands of individual result files. Run once per backend (deepep / uccl / flashinfer / -# deepep-hybrid / nccl-ep, + deepep_v2) for full parity. +# Shape: sweep/probe runs setup -> GPU cells; publish-v1 and refresh-v1 stay on disposable +# GitHub-hosted storage and skip GPU jobs. This file is registered on the default +# branch, so its collectivex branch revision can be dispatched with --ref. 
name: CollectiveX Sweep +permissions: + actions: read + contents: read on: workflow_dispatch: inputs: + operation: + description: Operation to execute + type: choice + default: sweep + options: [sweep, probe-precision, publish-v1, refresh-v1] backend: - description: EP library to sweep (deepep matrix is remapped onto the others, capability-filtered) + description: "EP library to sweep — 'all' runs every EP backend in one matrix" type: choice - default: deepep - options: [deepep, uccl, flashinfer, deepep-hybrid, nccl-ep] - deepep_v2: - description: DeepEP V2 from-source kernels (kernel_gen=v2; deepep backend only) - type: boolean - default: false + default: all + options: [all, deepep, deepep-v2, uccl, deepep-hybrid, mori, nccl-ep] suites: description: "'all' or comma-list of suite names" type: string default: all only_sku: - description: Restrict to one SKU (h100-dgxc|h200|b300|b200-dgxc|gb200|gb300|mi355x); blank = all + description: Restrict to one GHA runner pool (h100-dgxc|h200-dgxc|b300|b200-dgxc|gb200|gb300|mi325x|mi355x); blank = all + type: string + default: '' + min_nodes: + description: Keep only shards with at least this node/tray count (2 keeps every EP16 and GB EP8; blank = all) + type: string + default: '' + max_nodes: + description: Keep only shards with at most this node/tray count (1 keeps non-GB EP8; blank = all) type: string default: '' max_cases: - description: Max cases per shard cell (chunk larger shards) + description: Max cases per shard cell before chunking into another GHA job (128 = no chunking for current suites) type: string - default: '14' - + default: '128' + release_tag: + description: Publication gate; unversioned runs are diagnostic and cannot be published + type: choice + default: unversioned + options: [unversioned, v1] + qualification_index: + description: Deterministic execution-order index; V1 qualification uses 1, then 2, then 3 + type: choice + default: '1' + options: ['1', '2', '3'] + publish_run_ids: + description: For 
publish-v1, exactly three successful V1 sweep run IDs + type: string + default: '' + refresh_run_id: + description: For refresh-v1, source publication workflow run ID + type: string + default: '' + refresh_digest: + description: For refresh-v1, exact publication dataset SHA-256 + type: string + default: '' concurrency: - group: cx-sweep-${{ github.ref }}-${{ inputs.backend }}-${{ inputs.deepep_v2 }}-${{ inputs.only_sku }} + group: cx-${{ inputs.operation }}-${{ github.ref }}-${{ inputs.release_tag }}-${{ inputs.backend }}-${{ inputs.only_sku }} cancel-in-progress: false jobs: # ---- setup: resolve the suites into the shard matrix (the "pending jobs" node) ---- setup: + if: ${{ inputs.operation == 'sweep' || inputs.operation == 'probe-precision' }} runs-on: ubuntu-latest outputs: matrix: ${{ steps.gen.outputs.matrix }} n: ${{ steps.gen.outputs.n }} + max_parallel: ${{ steps.gen.outputs.max_parallel }} steps: - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v5.0.0 - with: { clean: true } - - run: pip install --quiet pyyaml + with: { clean: true, persist-credentials: false } + - name: Install matrix dependencies + run: python3 -m pip install --quiet PyYAML==6.0.2 - id: gen working-directory: experimental/CollectiveX + env: + INPUT_BACKEND: ${{ inputs.backend }} + INPUT_OPERATION: ${{ inputs.operation }} + INPUT_SUITES: ${{ inputs.suites }} + INPUT_ONLY_SKU: ${{ inputs.only_sku }} + INPUT_MIN_NODES: ${{ inputs.min_nodes }} + INPUT_MAX_NODES: ${{ inputs.max_nodes }} + INPUT_MAX_CASES: ${{ inputs.max_cases }} + INPUT_QUALIFICATION_INDEX: ${{ inputs.qualification_index }} + CX_QUALIFICATION_INDEX: ${{ inputs.qualification_index }} + COLLECTIVEX_SOURCE_SHA: ${{ github.sha }} + COLLECTIVEX_ARTIFACT_NAME: cxunsupported-${{ github.run_id }}-${{ github.run_attempt }} + COLLECTIVEX_EXECUTION_ID: ${{ github.run_id }}_${{ github.run_attempt }}_unsupported run: | set -euo pipefail - ov=""; [ "${{ inputs.backend }}" != "deepep" ] && ov="--backend ${{ 
inputs.backend }}" - v2=""; [ "${{ inputs.deepep_v2 }}" = "true" ] && v2="--deepep-v2" - os=""; [ -n "${{ inputs.only_sku }}" ] && os="--only-sku ${{ inputs.only_sku }}" - # full matrix (with cases) -> artifact for the cells; slim (no cases) -> the strategy output. - python3 sweep_matrix.py --suites "${{ inputs.suites }}" --max-cases "${{ inputs.max_cases }}" $ov $v2 $os --out matrix_full.json >/dev/null - SLIM=$(python3 -c "import json;m=json.load(open('matrix_full.json'));print(json.dumps({'include':[{k:v for k,v in x.items() if k!='cases'} for x in m['include']]}))") - echo "matrix=$SLIM" >> "$GITHUB_OUTPUT" - echo "n=$(python3 -c "import json;print(len(json.load(open('matrix_full.json'))['include']))")" >> "$GITHUB_OUTPUT" - python3 -c "import json;m=json.load(open('matrix_full.json'));print('shard-cells:',len(m['include']),'cases:',sum(x['n'] for x in m['include']))" + if [ "$INPUT_OPERATION" = probe-precision ] && [ "${{ inputs.release_tag }}" != unversioned ]; then + echo 'precision probes cannot carry a V1 release tag' >&2 + exit 1 + fi + if [ "$INPUT_OPERATION" = sweep ] && [ "${{ inputs.release_tag }}" = v1 ]; then + [[ "$INPUT_QUALIFICATION_INDEX" =~ ^[123]$ ]] || { + echo 'V1 sweeps require qualification_index 1, 2, or 3' >&2 + exit 1 + } + [ "$INPUT_BACKEND" = all ] && [ "$INPUT_SUITES" = all ] \ + && [ -z "$INPUT_ONLY_SKU" ] && [ -z "$INPUT_MIN_NODES" ] \ + && [ -z "$INPUT_MAX_NODES" ] && [ "$INPUT_MAX_CASES" = 128 ] || { + echo 'V1 sweeps require the exact unfiltered full matrix' >&2 + exit 1 + } + python3 - <<'PY' + import capability + + if capability.provisional_precision_targets(): + raise SystemExit("V1 sweeps require every precision capability cell to be resolved") + PY + fi + if [ "$INPUT_OPERATION" = probe-precision ]; then + args=(--workflow-plan --backend "$INPUT_BACKEND" --out matrix_full.json) + [ -n "$INPUT_ONLY_SKU" ] && args+=(--only-sku "$INPUT_ONLY_SKU") + python3 tests/probe_precision.py "${args[@]}" + else + args=(--suites 
"$INPUT_SUITES" --max-cases "$INPUT_MAX_CASES") + case "$INPUT_BACKEND" in + all) args+=(--backends all) ;; + *) args+=(--backend "$INPUT_BACKEND") ;; + esac + [ -n "$INPUT_ONLY_SKU" ] && args+=(--only-sku "$INPUT_ONLY_SKU") + [ -n "$INPUT_MIN_NODES" ] && args+=(--min-nodes "$INPUT_MIN_NODES") + [ -n "$INPUT_MAX_NODES" ] && args+=(--max-nodes "$INPUT_MAX_NODES") + python3 sweep_matrix.py "${args[@]}" --out matrix_full.json >/dev/null + fi + python3 artifact_safety.py matrix_full.json + SLIM=$(python3 -c "import json;m=json.load(open('matrix_full.json'));print(json.dumps({'include':[{k:v for k,v in x.items() if k!='case_ids'} for x in m['include']]}))") + { + echo "matrix=$SLIM" + echo "n=$(python3 -c "import json;print(len(json.load(open('matrix_full.json'))['include']))")" + echo "source_backends=$(python3 -c "import json;m=json.load(open('matrix_full.json'));print(' '.join(sorted({x['backend'] for x in m['include']} & {'deepep-v2','deepep-hybrid'})))")" + echo "max_parallel=$(python3 -c "import json;m=json.load(open('matrix_full.json'));w=max(x['execution_weight'] for x in m['include']);b=64 if '$INPUT_OPERATION' == 'probe-precision' else 4096;print(max(1,min(10,b//w)))")" + } >> "$GITHUB_OUTPUT" + unsupported_n=0 + if [ "$INPUT_OPERATION" = sweep ]; then + unsupported_n=$(python3 -c "import json;m=json.load(open('matrix_full.json'));print(sum(x['disposition']=='unsupported' for x in m['requested_cases']))") + fi + echo "unsupported_n=$unsupported_n" >> "$GITHUB_OUTPUT" + if [ "$unsupported_n" -gt 0 ]; then + python3 sweep_matrix.py --emit-unsupported-from matrix_full.json \ + --out-dir unsupported + fi + python3 -c "import json;m=json.load(open('matrix_full.json'));print('execution-cells:',len(m['include']))" + - name: Prepare pinned backend source archive + if: ${{ steps.gen.outputs.source_backends != '' }} + working-directory: experimental/CollectiveX + env: + SOURCE_BACKENDS: ${{ steps.gen.outputs.source_backends }} + COLLECTIVEX_EXECUTION_ID: ${{ 
github.run_id }}_${{ github.run_attempt }}_sources + run: | + set -euo pipefail + source runtime/common.sh + work="$RUNNER_TEMP/collectivex-backend-sources" + archive="$RUNNER_TEMP/collectivex-backend-sources.tar" + rm -rf -- "$work" "$archive" + umask 077 + mkdir -m 700 "$work" + mkdir -p "$work/experimental/CollectiveX" + read -r -a backends <<< "$SOURCE_BACKENDS" + [ "${#backends[@]}" -gt 0 ] + for backend in "${backends[@]}"; do + cx_prepare_backend_source "$work" "$backend" + done + cx_cleanup_private_logs 0 + tar --sort=name --mtime='@1' --owner=0 --group=0 --numeric-owner \ + -C "$work/experimental/CollectiveX" -cf "$archive" .cx_sources + sha256sum "$archive" + rm -rf -- "$work" + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + if: ${{ steps.gen.outputs.source_backends != '' }} + with: + name: cxbackend-sources-${{ github.run_id }}-${{ github.run_attempt }} + path: ${{ runner.temp }}/collectivex-backend-sources.tar + if-no-files-found: error + retention-days: ${{ inputs.release_tag == 'v1' && 90 || 3 }} - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: - name: cxsweep-matrix-${{ github.run_id }} + name: cxsweep-matrix-${{ github.run_id }}-${{ github.run_attempt }} path: experimental/CollectiveX/matrix_full.json if-no-files-found: error + retention-days: ${{ inputs.release_tag == 'v1' && 90 || 3 }} + - name: Create V1 release marker + if: ${{ inputs.operation == 'sweep' && inputs.release_tag == 'v1' }} + env: + EXPECTED_MATRIX_SHA256: f1ca85f9689922b90edd5767b9ff2a902f6b896f32f68b2ca086dde3fd2157d0 + RUN_ID: ${{ github.run_id }} + RUN_ATTEMPT: ${{ github.run_attempt }} + SOURCE_SHA: ${{ github.sha }} + QUALIFICATION_INDEX: ${{ inputs.qualification_index }} + run: | + set -euo pipefail + destination="$RUNNER_TEMP/collectivex-release" + install -d -m 700 "$destination" + python3 - "$destination/release.json" <<'PY' + import hashlib + import json + import os + import pathlib + import sys 
+ + sys.path.insert(0, str(pathlib.Path("experimental/CollectiveX").resolve())) + import sweep_matrix + + matrix = pathlib.Path("experimental/CollectiveX/matrix_full.json").read_bytes() + matrix_sha256 = hashlib.sha256(matrix).hexdigest() + if matrix_sha256 != os.environ["EXPECTED_MATRIX_SHA256"]: + raise SystemExit("V1 release tag requires the locked full matrix") + qualification_index = int(os.environ["QUALIFICATION_INDEX"]) + execution_plan_sha256 = sweep_matrix.qualification_execution_plan_sha256( + json.loads(matrix), qualification_index + ) + marker = { + "execution_plan_sha256": execution_plan_sha256, + "format": "collectivex.release-tag.v1", + "matrix_sha256": matrix_sha256, + "qualification_index": qualification_index, + "release_tag": "v1", + "run_attempt": os.environ["RUN_ATTEMPT"], + "run_id": os.environ["RUN_ID"], + "source_sha": os.environ["SOURCE_SHA"], + } + pathlib.Path(sys.argv[1]).write_text( + json.dumps(marker, sort_keys=True, separators=(",", ":")) + "\n" + ) + PY + python3 experimental/CollectiveX/artifact_safety.py "$destination/release.json" + - name: Upload V1 release marker + if: ${{ inputs.operation == 'sweep' && inputs.release_tag == 'v1' }} + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: cxrelease-v1-${{ github.run_id }}-${{ github.run_attempt }} + path: ${{ runner.temp }}/collectivex-release/release.json + if-no-files-found: error + retention-days: 90 + - name: Validate unsupported artifact safety + id: unsupported_safety + if: ${{ always() && fromJSON(steps.gen.outputs.unsupported_n) > 0 }} + run: | + python3 experimental/CollectiveX/artifact_safety.py experimental/CollectiveX/unsupported/*.json + - name: Validate unsupported outcomes + id: unsupported_contracts + if: ${{ always() && fromJSON(steps.gen.outputs.unsupported_n) > 0 && steps.unsupported_safety.outcome == 'success' }} + env: + COLLECTIVEX_ARTIFACT_NAME: cxunsupported-${{ github.run_id }}-${{ github.run_attempt }} + 
COLLECTIVEX_EXECUTION_ID: ${{ github.run_id }}_${{ github.run_attempt }}_unsupported + run: | + python3 experimental/CollectiveX/contracts.py validate-delivery \ + --source experimental/CollectiveX/matrix_full.json \ + --disposition unsupported \ + experimental/CollectiveX/unsupported/*.json + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + if: ${{ always() && fromJSON(steps.gen.outputs.unsupported_n) > 0 && steps.unsupported_contracts.outcome == 'success' && steps.unsupported_safety.outcome == 'success' }} + with: + name: cxunsupported-${{ github.run_id }}-${{ github.run_attempt }} + path: experimental/CollectiveX/unsupported/*.json + if-no-files-found: error + retention-days: ${{ inputs.release_tag == 'v1' && 90 || 3 }} # ---- sweep: ONE matrix cell per shard (the parent job with child jobs) ---- sweep: needs: setup - if: ${{ fromJSON(needs.setup.outputs.n) > 0 }} + if: ${{ (inputs.operation == 'sweep' || inputs.operation == 'probe-precision') && fromJSON(needs.setup.outputs.n) > 0 }} strategy: fail-fast: false - max-parallel: 10 # don't saturate the ~20-runner fleet; cells queue as slots free + max-parallel: ${{ fromJSON(needs.setup.outputs.max_parallel) }} matrix: ${{ fromJSON(needs.setup.outputs.matrix) }} - # h200 label spans two clusters; pin to the validated dgxc pool (mirrors collectivex-experimental). 
- runs-on: ${{ matrix.sku == 'h200' && 'h200-dgxc' || matrix.sku }} + runs-on: ${{ matrix.sku }} timeout-minutes: 350 env: CX_BENCH: ${{ matrix.backend }} - CX_DEEPEP_V2: ${{ matrix.deepep_v2 && '1' || '' }} CX_NODES: ${{ matrix.nodes }} - CX_SHARD_FILE: results/.shard_${{ matrix.id }}.json + CX_GPUS_PER_NODE: ${{ matrix.gpus_per_node }} + CX_SCALE_UP_DOMAIN: ${{ matrix.scale_up_domain }} + CX_SHARD_FILE: .shards/${{ matrix.id }}.json + CX_SHARD_SKU: ${{ matrix.sku }} + CX_PRECISION_PROBE: ${{ inputs.operation == 'probe-precision' && '1' || '0' }} + COLLECTIVEX_CANONICAL_GHA: '1' COLLECTIVEX_SOURCE_SHA: ${{ github.sha }} - CX_NODELIST: ${{ matrix.sku == 'mi355x' && 'mia1-p01-g10,mia1-p01-g15' || '' }} - CX_STAGE_DIR: ${{ matrix.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }} + COLLECTIVEX_ARTIFACT_NAME: ${{ inputs.operation == 'probe-precision' && format('cxprobe-{0}-{1}-{2}', matrix.id, github.run_id, github.run_attempt) || format('cxshard-{0}-{1}-{2}', matrix.id, github.run_id, github.run_attempt) }} + # Consolidated shards run one bounded build-group in one Slurm allocation, so + # the launcher's default 45-min --time is too short. 300 min covers a cold + # compute-node image import plus the shard. The allocation releases early + # when the shard finishes, so short shards don't waste it. 
+ CX_TIME: ${{ inputs.operation == 'probe-precision' && '90' || '300' }} + COLLECTIVEX_EXECUTION_ID: ${{ github.run_id }}_${{ github.run_attempt }}_${{ matrix.id }} + CX_QUALIFICATION_INDEX: ${{ inputs.qualification_index }} + CX_JOB_ROOT: /tmp/inferencex-collectivex-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.id }} + CX_SOURCE_ROOT: /tmp/inferencex-collectivex-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.id }}/source + HOME: /tmp/inferencex-collectivex-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.id }}/home steps: - - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v5.0.0 - with: { clean: true } + - name: Prepare isolated source + id: source + env: + COLLECTIVEX_REPOSITORY: ${{ github.repository }} + run: | + set -euo pipefail + python3 - <<'PY' + import os + import re + import shutil + import stat + import time + + pattern = re.compile(r"inferencex-collectivex-[0-9]+-[0-9]+-[A-Za-z0-9._-]+") + cutoff = time.time() - 86400 + for entry in os.scandir("/tmp"): + if not pattern.fullmatch(entry.name): + continue + try: + metadata = entry.stat(follow_symlinks=False) + except FileNotFoundError: + continue + if ( + not stat.S_ISDIR(metadata.st_mode) + or metadata.st_uid != os.getuid() + or stat.S_IMODE(metadata.st_mode) != 0o700 + or metadata.st_mtime >= cutoff + ): + continue + marked = False + for marker_name in ("cleanup-safe", "cleanup-unsafe"): + try: + marker = os.stat( + os.path.join(entry.path, marker_name), follow_symlinks=False + ) + except FileNotFoundError: + continue + marked = ( + stat.S_ISREG(marker.st_mode) + and marker.st_uid == os.getuid() + and stat.S_IMODE(marker.st_mode) == 0o600 + ) + if marked: + break + if marked: + shutil.rmtree(entry.path) + PY + [[ "$CX_JOB_ROOT" =~ ^/tmp/inferencex-collectivex-[0-9]+-[0-9]+-[A-Za-z0-9._-]+$ ]] \ + || { echo "CollectiveX isolated root is invalid" >&2; exit 1; } + [ "$CX_SOURCE_ROOT" = "$CX_JOB_ROOT/source" ] \ + || { echo "CollectiveX source root is 
invalid" >&2; exit 1; } + if [ -e "$CX_JOB_ROOT" ] || [ -L "$CX_JOB_ROOT" ]; then + echo "CollectiveX isolated root already exists" >&2 + exit 1 + fi + umask 077 + mkdir -m 700 -- "$CX_JOB_ROOT" + trap 'rc=$?; [ "$rc" = 0 ] || rm -rf -- "$CX_JOB_ROOT"; exit "$rc"' EXIT + mkdir -m 700 -- "$HOME" "$CX_JOB_ROOT/control" "$CX_JOB_ROOT/artifact" "$CX_SOURCE_ROOT" + : > "$CX_JOB_ROOT/cleanup-safe" + if ! { + GIT_CONFIG_NOSYSTEM=1 GIT_CONFIG_GLOBAL=/dev/null git init -q "$CX_SOURCE_ROOT" + GIT_CONFIG_NOSYSTEM=1 GIT_CONFIG_GLOBAL=/dev/null \ + git -C "$CX_SOURCE_ROOT" remote add origin \ + "https://github.com/${COLLECTIVEX_REPOSITORY}.git" + GIT_CONFIG_NOSYSTEM=1 GIT_CONFIG_GLOBAL=/dev/null \ + git -C "$CX_SOURCE_ROOT" -c credential.helper= -c protocol.version=2 \ + fetch -q --no-tags --depth=1 origin "$COLLECTIVEX_SOURCE_SHA" + GIT_CONFIG_NOSYSTEM=1 GIT_CONFIG_GLOBAL=/dev/null \ + git -C "$CX_SOURCE_ROOT" -c advice.detachedHead=false \ + checkout -q --detach FETCH_HEAD + [ "$(git -C "$CX_SOURCE_ROOT" rev-parse HEAD)" = "$COLLECTIVEX_SOURCE_SHA" ] + } </dev/null >/dev/null 2>&1; then + echo "CollectiveX source preparation failed" >&2 + exit 1 + fi + [ "$(stat -c '%a' "$CX_JOB_ROOT")" = 700 ] \ + || { echo "CollectiveX isolated root has unsafe permissions" >&2; exit 1; } + echo 'prepared=true' >> "$GITHUB_OUTPUT" + trap - EXIT - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: - name: cxsweep-matrix-${{ github.run_id }} - path: experimental/CollectiveX - - name: Extract this shard's cases (stdlib only — no runner deps) - working-directory: experimental/CollectiveX + name: cxsweep-matrix-${{ github.run_id }}-${{ github.run_attempt }} + path: ${{ env.CX_JOB_ROOT }}/control + - name: Download pinned backend source archive + if: ${{ matrix.backend == 'deepep-v2' || matrix.backend == 'deepep-hybrid' }} + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: cxbackend-sources-${{ github.run_id }}-${{ 
github.run_attempt }} + path: ${{ env.CX_JOB_ROOT }}/control + - name: Install pinned backend source seed + if: ${{ matrix.backend == 'deepep-v2' || matrix.backend == 'deepep-hybrid' }} + env: + EXPECTED_BACKEND: ${{ matrix.backend }} run: | set -euo pipefail - python3 -c " - import json - m=json.load(open('matrix_full.json')) - s=[x for x in m['include'] if x['id']=='${{ matrix.id }}'] - assert s, 'shard ${{ matrix.id }} not in matrix' - s=s[0] - json.dump({'id':s['id'],'sku':s['sku'],'backend':s['backend'],'nodes':s['nodes'],'deepep_v2':s['deepep_v2'],'cases':s['cases']}, open('results/.shard_${{ matrix.id }}.json','w')) - print('shard ${{ matrix.id }}:', len(s['cases']), 'cases') - " - - name: Sweep shard ${{ matrix.id }} (${{ matrix.n }} cases, one allocation) + archive="$CX_JOB_ROOT/control/collectivex-backend-sources.tar" + destination="$CX_SOURCE_ROOT/experimental/CollectiveX" + seed_root="$destination/.cx_sources" + [ -f "$archive" ] && [ ! -e "$seed_root" ] && [ ! -L "$seed_root" ] + source "$destination/runtime/common.sh" + source_path="$(cx_backend_source_path "$seed_root" "$EXPECTED_BACKEND")" + source_basename="${source_path#"$seed_root/"}" + [ -n "$source_basename" ] \ + && [ "$source_path" = "$seed_root/$source_basename" ] \ + && [[ "$source_basename" != */* ]] + python3 "$destination/source_archive.py" \ + "$archive" "$destination" "$source_basename" + cx_backend_source_is_valid "$EXPECTED_BACKEND" "$source_path" + printf 'CX_BACKEND_SOURCE_SEED_ROOT=%s\n' "$seed_root" >> "$GITHUB_ENV" + - name: Extract and validate this execution control + run: | + set -euo pipefail + cd "$CX_SOURCE_ROOT/experimental/CollectiveX" 2>/dev/null \ + || { echo "CollectiveX source is unavailable" >&2; exit 1; } + if [ '${{ inputs.operation }}' = probe-precision ]; then + python3 tests/probe_precision.py \ + --extract-from "$CX_JOB_ROOT/control/matrix_full.json" \ + --probe-id '${{ matrix.id }}' \ + --expect-sku '${{ matrix.sku }}' \ + --expect-backend '${{ matrix.backend 
}}' \ + --expect-nodes '${{ matrix.nodes }}' \ + --out '${{ env.CX_SHARD_FILE }}' + else + python3 sweep_matrix.py \ + --extract-from "$CX_JOB_ROOT/control/matrix_full.json" \ + --shard-id '${{ matrix.id }}' \ + --expect-sku '${{ matrix.sku }}' \ + --expect-backend '${{ matrix.backend }}' \ + --expect-nodes '${{ matrix.nodes }}' \ + --out '${{ env.CX_SHARD_FILE }}' >/dev/null + fi + - name: Execute ${{ inputs.operation }} cell ${{ matrix.id }} + id: sweep_shard env: - RUNNER_NAME: ${{ runner.name }} - run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh" + COLLECTIVEX_OPERATOR_CONFIG_CONTENT: ${{ secrets.COLLECTIVEX_OPERATOR_CONFIG_V1 }} + COLLECTIVEX_OPERATOR_CONFIG_REQUIRED: '1' + run: | + set -euo pipefail + umask 077 + : > "$CX_JOB_ROOT/cleanup-unsafe" + rm -f -- "$CX_JOB_ROOT/cleanup-safe" + cd "$CX_SOURCE_ROOT" 2>/dev/null \ + || { echo "CollectiveX source is unavailable" >&2; exit 1; } + bash "experimental/CollectiveX/launchers/launch_${{ matrix.launcher }}.sh" + - name: Confirm allocation cleanup + id: allocation_cleanup + if: ${{ always() && steps.source.outputs.prepared == 'true' }} + run: | + set -euo pipefail + [ -f "$CX_JOB_ROOT/cleanup-safe" ] && [ ! 
-e "$CX_JOB_ROOT/cleanup-unsafe" ] \ + || { echo "CollectiveX allocation cleanup was not confirmed" >&2; exit 1; } + - name: Validate shard artifact safety + id: artifact_safety + if: ${{ always() && steps.allocation_cleanup.outcome == 'success' && (inputs.operation != 'probe-precision' || steps.sweep_shard.outcome == 'success') }} + run: | + cd "$CX_SOURCE_ROOT" 2>/dev/null \ + || { echo "CollectiveX source is unavailable" >&2; exit 1; } + python3 experimental/CollectiveX/artifact_safety.py experimental/CollectiveX/results/*.json + - name: Validate shard delivery completeness + id: delivery_contracts + if: ${{ always() && steps.artifact_safety.outcome == 'success' }} + run: | + cd "$CX_SOURCE_ROOT" 2>/dev/null \ + || { echo "CollectiveX source is unavailable" >&2; exit 1; } + if [ '${{ inputs.operation }}' = probe-precision ]; then + python3 experimental/CollectiveX/tests/probe_precision.py \ + --validate-manifest experimental/CollectiveX/results/*.json + else + python3 experimental/CollectiveX/contracts.py validate-delivery \ + --source "experimental/CollectiveX/${CX_SHARD_FILE}" \ + experimental/CollectiveX/results/*.json + fi - name: Shard summary - if: always() - run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY" || true + if: ${{ inputs.operation == 'sweep' && always() && steps.artifact_safety.outcome == 'success' && steps.delivery_contracts.outcome == 'success' }} + run: | + cd "$CX_SOURCE_ROOT" 2>/dev/null \ + || { echo "CollectiveX source is unavailable" >&2; exit 1; } + python3 experimental/CollectiveX/summarize.py \ + --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY" || true + - name: Stage shard artifact + id: stage_artifact + if: ${{ always() && steps.delivery_contracts.outcome == 'success' && steps.artifact_safety.outcome == 'success' }} + run: | + set -euo pipefail + cd "$CX_SOURCE_ROOT" 2>/dev/null \ + || { echo "CollectiveX source 
is unavailable" >&2; exit 1; } + cp -- experimental/CollectiveX/results/*.json "$CX_JOB_ROOT/artifact/" - name: Upload shard results - if: always() + id: upload_artifact + if: always() && steps.stage_artifact.outcome == 'success' && steps.delivery_contracts.outcome == 'success' && steps.artifact_safety.outcome == 'success' uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: - name: cxshard-${{ matrix.id }}-${{ github.run_id }} - path: experimental/CollectiveX/results/*.json # glob skips the hidden .shard_*.json - if-no-files-found: warn - - # ---- aggregate: collect every shard into ONE ndjson (the "result aggregator at the end") ---- - aggregate: - needs: sweep - if: always() + name: ${{ inputs.operation == 'probe-precision' && format('cxprobe-{0}-{1}-{2}', matrix.id, github.run_id, github.run_attempt) || format('cxshard-{0}-{1}-{2}', matrix.id, github.run_id, github.run_attempt) }} + path: | + ${{ env.CX_JOB_ROOT }}/artifact/*.json + if-no-files-found: error + retention-days: ${{ inputs.release_tag == 'v1' && 90 || 3 }} + - name: Cleanup isolated workspace + if: ${{ always() && steps.source.outputs.prepared == 'true' }} + run: | + set -euo pipefail + [[ "$CX_JOB_ROOT" =~ ^/tmp/inferencex-collectivex-[0-9]+-[0-9]+-[A-Za-z0-9._-]+$ ]] \ + || { echo "CollectiveX cleanup root is invalid" >&2; exit 1; } + [ "$CX_SOURCE_ROOT" = "$CX_JOB_ROOT/source" ] \ + || { echo "CollectiveX cleanup source is invalid" >&2; exit 1; } + [ -f "$CX_JOB_ROOT/cleanup-safe" ] && [ ! 
-e "$CX_JOB_ROOT/cleanup-unsafe" ] \ + || { echo "CollectiveX allocation cleanup was not confirmed; retaining isolated files" >&2; exit 1; } + if [ '${{ steps.sweep_shard.outcome }}' = success ] \ + && [ '${{ steps.allocation_cleanup.outcome }}' = success ] \ + && [ '${{ steps.artifact_safety.outcome }}' = success ] \ + && [ '${{ steps.delivery_contracts.outcome }}' = success ] \ + && [ '${{ steps.stage_artifact.outcome }}' = success ] \ + && [ '${{ steps.upload_artifact.outcome }}' = success ] \ + && [ -f "$CX_SOURCE_ROOT/experimental/CollectiveX/runtime/common.sh" ]; then + # shellcheck source=/dev/null + if source "$CX_SOURCE_ROOT/experimental/CollectiveX/runtime/common.sh" \ + >/dev/null 2>&1; then + cx_cleanup_private_logs 0 + fi + fi + rm -rf -- "$CX_JOB_ROOT" + + probe-summary: + needs: [setup, sweep] + if: ${{ always() && inputs.operation == 'probe-precision' && needs.sweep.result == 'success' }} runs-on: ubuntu-latest + timeout-minutes: 15 steps: - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v5.0.0 - with: { clean: true } + with: { clean: true, persist-credentials: false } - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: - pattern: cxshard-*-${{ github.run_id }} - path: _shards + name: cxsweep-matrix-${{ github.run_id }}-${{ github.run_attempt }} + path: ${{ runner.temp }}/collectivex-probes/control + - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + pattern: cxprobe-*-${{ github.run_id }}-${{ github.run_attempt }} merge-multiple: true - - name: Aggregate shards -> one ndjson - working-directory: experimental/CollectiveX + path: ${{ runner.temp }}/collectivex-probes/results + - name: Validate exact precision probe coverage + run: | + set -euo pipefail + plan="$RUNNER_TEMP/collectivex-probes/control/matrix_full.json" + shopt -s nullglob + manifests=("$RUNNER_TEMP"/collectivex-probes/results/*.json) + [ "${#manifests[@]}" -gt 0 ] || { + echo 'precision 
probe artifacts are empty' >&2 + exit 1 + } + python3 experimental/CollectiveX/artifact_safety.py "$plan" "${manifests[@]}" + python3 experimental/CollectiveX/tests/probe_precision.py \ + --validate-bundle "$plan" --validate-manifest "${manifests[@]}" + - name: Upload validated precision probe bundle + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: cxprecision-probes-${{ github.run_id }}-${{ github.run_attempt }} + path: ${{ runner.temp }}/collectivex-probes + if-no-files-found: error + retention-days: 30 + + publish: + if: ${{ inputs.operation == 'publish-v1' }} + runs-on: ubuntu-latest + timeout-minutes: 30 + env: + GH_TOKEN: ${{ github.token }} + RUN_IDS: ${{ inputs.publish_run_ids }} + steps: + - name: Verify source runs + id: runs + env: + REPOSITORY: ${{ github.repository }} + run: | + set -euo pipefail + IFS=',' read -r -a run_ids <<< "$RUN_IDS" + [ "${#run_ids[@]}" -eq 3 ] || { + echo 'publish_run_ids must contain exactly three IDs' >&2 + exit 1 + } + [ "$(printf '%s\n' "${run_ids[@]}" | sort -u | wc -l)" -eq 3 ] || { + echo 'publish_run_ids must be unique' >&2 + exit 1 + } + + : > "$RUNNER_TEMP/collectivex-runs.tsv" + source_sha='' + for run_id in "${run_ids[@]}"; do + [[ "$run_id" =~ ^[1-9][0-9]*$ ]] || { + echo 'publish_run_ids contains a non-decimal ID' >&2 + exit 1 + } + metadata=$(gh api "repos/$REPOSITORY/actions/runs/$run_id") + name=$(jq -r '.name' <<< "$metadata") + path=$(jq -r '.path' <<< "$metadata") + branch=$(jq -r '.head_branch' <<< "$metadata") + status=$(jq -r '.status' <<< "$metadata") + conclusion=$(jq -r '.conclusion' <<< "$metadata") + sha=$(jq -r '.head_sha' <<< "$metadata") + attempt=$(jq -r '.run_attempt' <<< "$metadata") + [ "$name" = 'CollectiveX Sweep' ] \ + && [ "$path" = '.github/workflows/collectivex-sweep.yml' ] \ + && [ "$branch" = 'collectivex' ] \ + && [ "$status" = 'completed' ] \ + && [ "$conclusion" = 'success' ] \ + && [ "$attempt" = 1 ] \ + && [[ "$sha" =~ 
^[0-9a-f]{40}$ ]] || { + echo "run $run_id is not an eligible first-attempt V1 sweep" >&2 + exit 1 + } + if [ -z "$source_sha" ]; then + source_sha="$sha" + else + [ "$sha" = "$source_sha" ] || { + echo 'source runs do not share one source SHA' >&2 + exit 1 + } + fi + printf '%s\t%s\t%s\n' "$run_id" "$attempt" "$sha" \ + >> "$RUNNER_TEMP/collectivex-runs.tsv" + done + echo "source_sha=$source_sha" >> "$GITHUB_OUTPUT" + + - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v5.0.0 + with: + ref: ${{ steps.runs.outputs.source_sha }} + clean: true + persist-credentials: false + + - name: Install publisher dependencies + run: python3 -m pip install --quiet -r experimental/CollectiveX/requirements.txt + + - name: Build promoted publication + env: + REPOSITORY: ${{ github.repository }} run: | set -euo pipefail - tag="${{ inputs.backend }}${{ inputs.deepep_v2 && '-v2' || '' }}" - python3 aggregate_results.py --in-dir ../../_shards --out "results/aggregate/collectivex_${tag}_${{ github.run_id }}.ndjson" + store="$RUNNER_TEMP/collectivex-publisher" + downloads="$RUNNER_TEMP/collectivex-downloads" + output="$RUNNER_TEMP/collectivex-publication" + umask 027 + mkdir -m 750 "$store" "$downloads" "$output" + : > "$RUNNER_TEMP/collectivex-bundles.txt" + : > "$RUNNER_TEMP/collectivex-qualification-indices.txt" + + while IFS=$'\t' read -r run_id attempt source_sha; do + run_dir="$downloads/$run_id" + mkdir -m 750 "$run_dir" + gh run download "$run_id" --repo "$REPOSITORY" --dir "$run_dir" + matrix="$run_dir/cxsweep-matrix-$run_id-$attempt/matrix_full.json" + marker="$run_dir/cxrelease-v1-$run_id-$attempt/release.json" + [ -f "$matrix" ] || { + echo "run $run_id is missing its exact matrix artifact" >&2 + exit 1 + } + [ -f "$marker" ] || { + echo "run $run_id is not tagged for V1 publication" >&2 + exit 1 + } + matrix_sha=$(sha256sum "$matrix" | cut -d' ' -f1) + qualification_index=$(jq -r '.qualification_index' "$marker") + execution_plan_sha=$( + 
PYTHONPATH=experimental/CollectiveX python3 -c \ + 'import json,sys,sweep_matrix as s; print(s.qualification_execution_plan_sha256(json.load(open(sys.argv[1], encoding="utf-8")), int(sys.argv[2])))' \ + "$matrix" "$qualification_index" + ) + jq -e \ + --arg run_id "$run_id" \ + --arg attempt "$attempt" \ + --arg source_sha "$source_sha" \ + --arg matrix_sha "$matrix_sha" \ + --arg execution_plan_sha "$execution_plan_sha" \ + 'keys == ["execution_plan_sha256","format","matrix_sha256","qualification_index","release_tag","run_attempt","run_id","source_sha"] + and .format == "collectivex.release-tag.v1" + and .release_tag == "v1" + and .run_id == $run_id + and .run_attempt == $attempt + and .source_sha == $source_sha + and (.qualification_index == 1 or .qualification_index == 2 or .qualification_index == 3) + and .matrix_sha256 == $matrix_sha + and .execution_plan_sha256 == $execution_plan_sha' \ + "$marker" >/dev/null || { + echo "run $run_id has an invalid V1 release marker" >&2 + exit 1 + } + jq -r '.qualification_index' "$marker" \ + >> "$RUNNER_TEMP/collectivex-qualification-indices.txt" + + mapfile -t artifacts < <( + find "$run_dir" -mindepth 1 -maxdepth 1 -type d \ + \( -name "cxshard-*-$run_id-$attempt" \ + -o -name "cxunsupported-$run_id-$attempt" \) -print | sort + ) + [ "${#artifacts[@]}" -gt 0 ] || { + echo "run $run_id has no result artifacts" >&2 + exit 1 + } + artifact_args=() + for artifact in "${artifacts[@]}"; do + artifact_args+=(--artifact "$artifact") + done + result=$( + python3 experimental/CollectiveX/publisher.py --store-root "$store" ingest \ + --matrix "$matrix" \ + "${artifact_args[@]}" \ + --repository "$REPOSITORY" \ + --run-id "$run_id" \ + --run-attempt "$attempt" \ + --qualification-index "$qualification_index" \ + --source-sha "$source_sha" + ) + bundle_id=$(jq -er '.bundle_id' <<< "$result") + printf '%s\n' "$bundle_id" >> "$RUNNER_TEMP/collectivex-bundles.txt" + done < "$RUNNER_TEMP/collectivex-runs.tsv" + + [ "$(sort -n 
"$RUNNER_TEMP/collectivex-qualification-indices.txt" | tr '\n' ' ')" = '1 2 3 ' ] || { + echo 'source runs must contain qualification indices 1, 2, and 3 exactly once' >&2 + exit 1 + } + + mapfile -t bundle_ids < "$RUNNER_TEMP/collectivex-bundles.txt" + promote_args=() + for bundle_id in "${bundle_ids[@]}"; do + promote_args+=(--bundle "$bundle_id") + done + result=$( + python3 experimental/CollectiveX/publisher.py --store-root "$store" promote \ + "${promote_args[@]}" + ) + dataset_id=$(jq -er '.dataset_sha256' <<< "$result") + dataset="$store/public/datasets/$dataset_id/dataset.json" + [ -f "$dataset" ] || { + echo 'publisher did not install the promoted dataset' >&2 + exit 1 + } + publication="$output/collectivex_public_v1_$dataset_id.ndjson" + cp -- "$dataset" "$publication" + python3 experimental/CollectiveX/artifact_safety.py "$publication" + python3 experimental/CollectiveX/publisher.py --store-root "$store" verify \ + --channel dev-latest "${promote_args[@]}" + [ "$(stat -c %s "$publication")" -le 33554432 ] || { + echo 'publication exceeds the 32 MiB frontend limit' >&2 + exit 1 + } + sha256sum "$publication" { - echo "## CollectiveX sweep aggregate (${tag})" - echo '```' - wc -l results/aggregate/*.ndjson 2>/dev/null || echo "no ndjson" - echo '```' + echo '## CollectiveX V1 publication' + echo + echo "Dataset: \`$dataset_id\`" + echo + echo 'Source runs:' + sed 's/^/- `/' "$RUNNER_TEMP/collectivex-runs.tsv" | sed 's/$/`/' } >> "$GITHUB_STEP_SUMMARY" - - name: Upload aggregate + + - name: Upload JIT publication artifact uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: - name: cxsweep-aggregate-${{ inputs.backend }}${{ inputs.deepep_v2 && '-v2' || '' }}-${{ github.run_id }} - path: experimental/CollectiveX/results/aggregate/*.ndjson - if-no-files-found: warn + name: cxpublication-v1-${{ github.run_id }}-${{ github.run_attempt }} + path: ${{ runner.temp }}/collectivex-publication/*.ndjson + if-no-files-found: error + 
retention-days: 90 + + refresh: + if: ${{ inputs.operation == 'refresh-v1' }} + runs-on: ubuntu-latest + timeout-minutes: 10 + env: + GH_TOKEN: ${{ github.token }} + SOURCE_RUN_ID: ${{ inputs.refresh_run_id }} + EXPECTED_DIGEST: ${{ inputs.refresh_digest }} + steps: + - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v5.0.0 + with: { clean: true, persist-credentials: false } + + - name: Revalidate promoted publication + env: + REPOSITORY: ${{ github.repository }} + run: | + set -euo pipefail + [[ "$SOURCE_RUN_ID" =~ ^[1-9][0-9]*$ ]] || { + echo 'refresh_run_id must be a decimal workflow run ID' >&2 + exit 1 + } + [[ "$EXPECTED_DIGEST" =~ ^[0-9a-f]{64}$ ]] || { + echo 'refresh_digest must be a SHA-256 digest' >&2 + exit 1 + } + metadata=$(gh api "repos/$REPOSITORY/actions/runs/$SOURCE_RUN_ID") + path=$(jq -r '.path' <<< "$metadata") + branch=$(jq -r '.head_branch' <<< "$metadata") + status=$(jq -r '.status' <<< "$metadata") + conclusion=$(jq -r '.conclusion' <<< "$metadata") + attempt=$(jq -r '.run_attempt' <<< "$metadata") + [ "$path" = '.github/workflows/collectivex-sweep.yml' ] \ + && [ "$branch" = 'collectivex' ] \ + && [ "$status" = completed ] \ + && [ "$conclusion" = success ] \ + && [[ "$attempt" =~ ^[1-9][0-9]*$ ]] || { + echo 'refresh source is not an eligible CollectiveX workflow run' >&2 + exit 1 + } + + source="$RUNNER_TEMP/collectivex-refresh-source" + output="$RUNNER_TEMP/collectivex-publication" + install -d -m 750 "$source" "$output" + gh run download "$SOURCE_RUN_ID" --repo "$REPOSITORY" \ + --name "cxpublication-v1-$SOURCE_RUN_ID-$attempt" --dir "$source" + mapfile -t files < <(find "$source" -mindepth 1 -maxdepth 1 -type f -print) + [ "${#files[@]}" -eq 1 ] || { + echo 'refresh source must contain exactly one root file' >&2 + exit 1 + } + expected_name="collectivex_public_v1_${EXPECTED_DIGEST}.ndjson" + [ "$(basename "${files[0]}")" = "$expected_name" ] || { + echo 'refresh source filename differs from its requested digest' >&2 
+ exit 1 + } + [ "$(sha256sum "${files[0]}" | cut -d' ' -f1)" = "$EXPECTED_DIGEST" ] || { + echo 'refresh source bytes differ from their requested digest' >&2 + exit 1 + } + [ "$(stat -c %s "${files[0]}")" -le 33554432 ] || { + echo 'refresh source exceeds the 32 MiB frontend limit' >&2 + exit 1 + } + python3 - "${files[0]}" <<'PY' + import json + import pathlib + import sys + + path = pathlib.Path(sys.argv[1]) + text = path.read_text(encoding="utf-8") + if not text.endswith("\n") or text.count("\n") != 1: + raise SystemExit("publication must contain exactly one NDJSON record") + document = json.loads(text) + if document.get("format") != "collectivex.public.v1": + raise SystemExit("publication format is invalid") + if document.get("promotion", {}).get("status") != "promoted": + raise SystemExit("publication is not promoted") + PY + python3 experimental/CollectiveX/artifact_safety.py "${files[0]}" + cp -- "${files[0]}" "$output/$expected_name" + + - name: Upload refreshed JIT publication artifact + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: cxpublication-v1-${{ github.run_id }}-${{ github.run_attempt }} + path: ${{ runner.temp }}/collectivex-publication/*.ndjson + if-no-files-found: error + retention-days: 90 diff --git a/AGENTS.md b/AGENTS.md index 6a385a5c1..b49f547e2 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -2,6 +2,8 @@ Guidance for AI agents working with InferenceX. +> **Temporary language override:** Until the user explicitly removes this override, ignore all other repository instructions that require Simplified Chinese or bilingual output. Work in English only: do not create or update `_zh.md` files, Chinese documentation, translations, or Chinese text in commits, pull requests, issues, reviews, or comments. Leave existing Chinese content untouched. + > **Mandatory reading: [`CONTRIBUTING.md`](CONTRIBUTING.md)** — read it before opening or reviewing any PR. 
It covers the full PR review flow, the CODEOWNER sign-off process, the `/reuse-sweep-run` merge path, post-merge responsibilities, and critical cluster rules (e.g. never leaving root-owned files on AMD runners). > **PR and GitHub-issue titles & descriptions must be bilingual — include a Simplified Chinese version in addition to English.** Title format: `<English title> / <中文标题>`. In the PR/issue body, follow the English content with its Chinese translation (e.g. a `## 中文说明` section mirroring the summary; don't translate code blocks, logs, or stack traces — summarize around them). **PR comments must include a Chinese translation too** — conversation comments, review summaries, and inline review comments alike: short comments as a single `<English> / <中文>` line, longer ones with the Chinese translation as a trailing paragraph (`中文:...`). Exception: the CODEOWNER sign-off template stays English-verbatim (the sign-off verifier triggers on its exact phrase); bot-generated comments follow their own workflow templates. This applies to every PR and every issue, matching the bilingual docs rule in Code Conventions. diff --git a/experimental/CollectiveX/.gitignore b/experimental/CollectiveX/.gitignore new file mode 100644 index 000000000..56b307215 --- /dev/null +++ b/experimental/CollectiveX/.gitignore @@ -0,0 +1,15 @@ +__pycache__/ +*.pyc +results/ +unsupported/ +.shards/ +.cx_workloads/ +.cx_backend/ +/matrix_full.json +gpucore.* + +# Local plans and infrastructure inventory. +goal.md +notes.md +configs/platforms.yaml +private-infra.md diff --git a/experimental/CollectiveX/README.md b/experimental/CollectiveX/README.md new file mode 100644 index 000000000..b58a5af61 --- /dev/null +++ b/experimental/CollectiveX/README.md @@ -0,0 +1,168 @@ +# CollectiveX + +
+ +**English** | [中文](./README_zh.md) + +
+ +CollectiveX is an experimental MoE expert-parallel communication benchmark. It measures dispatch, +combine, and paired roundtrip latency across EP libraries and accelerator systems. + +> Publication hold: historical schema 3-5 data is diagnostic. No current dataset is approved for +> rankings, recommendations, or regression baselines. + +> Development status: the sections below document the implemented BF16 pre-V1 baseline, not the +> final V1 qualification contract. Precision profiles, full point-level publication, and branch-only +> publication are under active implementation; case counts and digests are not frozen. + +## Implemented Pre-V1 Execution Profile + +Every scheduled case is BF16 with backend-tuned resources and packed placement. The explicit mode +selects one of two contracts: + +- Normal mode uses `layout-and-dispatch-v1`, rank-deduplicated token payloads, and activation-only + combine. Uniform core coverage and one Zipf sensitivity remain; EPLB is measured only as the Zipf + remedy. +- Low-latency mode uses `expert-packed-weighted-combine-v1`, token-expert payloads, and gate-weighted + combine through genuine DeepEP V1 or UCCL low-latency APIs. It is decode-only and never shares a + ranking cohort with normal mode. Other backends are explicitly unsupported for this suite. + +Both modes use `fixed-512-v1`: 64 trials x 8 timed iterations with 32 synchronized full roundtrip +warmups before each measured component at every trial/point. Roundtrip is measured first; each +iteration takes the cross-rank maximum before nearest-rank p50/p90/p95/p99, and roundtrip p99 is the +headline latency. A stdlib integer counter produces byte-identical routing and gate weights. + +The implemented baseline matrix covers H100, H200, B200, B300, GB200, GB300, MI325X, and MI355X. +It requests +608 cases / 1,600 token points: 364 runnable cases / 940 points, emitted as 58 executable workflow +shards/allocation cells, plus 244 explicit unsupported cases / 660 points. 
`sweep_matrix.py` +materializes every token ladder and rejects missing, stale, malformed, or altered shard controls. +Shards are emitted round-robin by SKU so the bounded GHA matrix uses every runner pool early. + +| Systems | EP8 | EP16 | +|---|---|---| +| H100/H200/B200/B300 | 1x8 NVLink, scale-up | 2x8 NVLink + RDMA, scale-out | +| MI325X/MI355X | 1x8 XGMI, scale-up | 2x8 XGMI + RDMA, scale-out | +| GB200/GB300 | 2x4 MNNVL, scale-up | 4x4 MNNVL, scale-up | + +Physical host count does not determine scope: both GB topologies stay inside one 72-GPU MNNVL +scale-up domain. + +| Backend | Current scope | +|---|---| +| DeepEP V1 | Image-pinned `deep_ep.Buffer`: normal and native low-latency APIs; upstream v1.2.1 on x86 and the image's GB fork on arm64 | +| DeepEP V2 | PR #605 `ElasticBuffer` plus #630: LSA for scale-up and GIN for x86 EP16 scale-out; source/SASS-bound reproducible JIT | +| DeepEP Hybrid | Pinned `HybridEPBuffer`: x86 EP16 multi-domain RDMA/DOCA; GB EP8/EP16 in one MNNVL communication domain | +| UCCL | Pinned 0.1.1 wheel and wrapper with normal and native low-latency APIs on Hopper; Blackwell is explicitly unsupported | +| NCCL/RCCL A2A | Portable rank-deduplicated payload plus expert/routing-metadata reference | +| MoRI | EP8 uses MI325X AsyncLL or MI355X IntraNode; EP16 pins InterNodeV1 over 2x8 XGMI + RDMA | + +FlashInfer is outside v1 because its exercised EP path failed intermittently at runtime. It is not +misreported as a platform capability limitation and can return after a stable pinned path is proven. + +DeepEP V2 means the `ElasticBuffer` implementation introduced by +[DeepEP PR #605](https://github.com/deepseek-ai/DeepEP/pull/605), not a newer legacy `Buffer` build. +The pinned source is the minimal upstream [PR #630](https://github.com/deepseek-ai/DeepEP/pull/630) +follow-up: its parent is the #605 merge tree and its only source change fixes pure scale-up +initialization when GIN is unavailable. 
Scale-up cases request NCCL Device API LSA and fail closed +unless the realized LSA team covers the full EP world. x86 EP16 scale-out cases instead require the +hybrid path with GIN, two logical scale-out domains represented by two physical RDMA ranks, and eight +scale-up ranks per domain; GB EP16 remains MNNVL scale-up and therefore uses LSA. The isolated build +records the API, source, loaded libraries, generated JIT source, executable SASS, and raw CUBIN +diagnostics. The current H100 runner pool is explicitly unsupported for V2 because NCCL 2.30.4 +reports that its EP8 communicator lacks Device API symmetric-memory support; re-enabling that pool +requires an all-rank CUDA P2P/LSA-capable runtime. Other NVIDIA SKUs remain unvalidated until their +GPU outcomes pass the native correctness and publication gates. + +Axes not implemented in this baseline include cached-layout `[cl]`, runtime-visible `[rv]`, FP8, +quantized combine, +extra routing distributions, activation profiles, uneven allocation, placement permutations, model +envelopes, and scaling studies. + +## Workflow And Artifacts + +`.github/workflows/collectivex-sweep.yml` generates a public-SKU matrix, extracts a strict ignored +`.shards/.json` control, executes one allocation per shard, privacy-checks result JSON, and uploads +raw GitHub artifacts. Runs default to `release_tag=unversioned` and are diagnostic-only. A V1 run +must explicitly select `release_tag=v1`; setup then requires the locked full-matrix digest and emits +a run/attempt/source-bound `cxrelease-v1-*` marker. Partial and filtered runs cannot receive it. + +The main-registered `.github/workflows/collectivex-sweep.yml` provides `sweep`, `publish-v1`, and +`refresh-v1` operations, so its branch revision can be dispatched with `--ref collectivex` without +a standalone branch-only workflow. 
Publication accepts exactly three successful first-attempt tagged +sweep run IDs from one source SHA, revalidates their metadata and release markers, and runs +`publisher.py` in a disposable runner-local workspace. Refresh revalidates and reuploads only the +exact content-addressed sanitized dataset. Raw artifacts and the private publisher workspace are +never exposed to the frontend. + +There is no results server, attached store, Vercel storage, GCP, Neon, managed database, or managed +object store. With the existing server-side `GITHUB_TOKEN`, the frontend discovers the latest +successful version-scoped publication workflow, downloads its NDJSON artifact just in time, verifies +the ZIP layout, UTF-8/NDJSON shape, schema, promotion state, and SHA-256, then serves versioned channel +and immutable dataset URLs. The UI keeps an explicit benchmark-version selector; V2 and later +versions must use separate release tags and publication identities. The full validation contract is +in [docs/methodology.md](docs/methodology.md). + +## Runner Configuration + +Runner-local Slurm and storage values use a strict per-SKU JSON document at +`$XDG_CONFIG_HOME/inferencex/collectivex.json` or `COLLECTIVEX_OPERATOR_CONFIG`. The mode-0600, +same-owner, non-symlink file is outside the checkout and never uploaded. Unknown runners, fields, +duplicate keys, endpoint literals, unsafe paths, and non-JSON input fail closed; configuration is +never evaluated as shell. GHA passes encrypted `COLLECTIVEX_OPERATOR_CONFIG_V1` content only to the +launcher, which validates it, exports the selected SKU's allowlisted values, and deletes the +temporary copy before allocation. 
Required JSON fields are: + +| SKU | Variables | +|---|---| +| `h100-dgxc`, `b200-dgxc` | `partition`, `account`, `squash_dir`, `stage_dir` | +| `h200-dgxc` | `partition`, `squash_dir`, `stage_dir` | +| `b300` | `partition`, `account`, `squash_dir`, `stage_dir` | +| `gb200` | `partition`, `account`, ordered `storage_roots` | +| `gb300` | `partition`, `account`, `squash_dir`, `stage_dir`, `enroot_cache_path` | +| `mi325x`, `mi355x` | `partition`, `squash_dir`, `stage_dir` | + +Every selected non-MNNVL EP16 placement additionally requires `socket_ifname` and `rdma_devices` +for its operator-approved fabric; optional +`ib_gid_index` and `rdma_service_level` are also allowlisted. CollectiveX does not heuristically +select a management route or HCA. After allocation, every non-MNNVL scale-out node must prove that +all configured interfaces and active HCA ports exist before backend setup. Scale-up and MNNVL jobs +clear these overrides. Scale-out NCCL/RCCL is pinned to `IB` with exact-match HCA selectors so a +socket fallback fails instead of being mislabeled as RDMA. + +`stage_dir` is a pre-existing, runner-owned, non-symlinked base outside the checkout and workflow +workspace. It is not group- or world-writable and is visible at the same path on the runner and every +allocated node. Jobs create only a marked mode-0700 execution child, prove cross-node read/write +visibility, and remove that exact child after allocation teardown; they never mount the runner +checkout or create a stage beneath image storage on AMD. + +Before import, each Docker Hub tag is resolved with bounded registry requests and must match its +pinned digest; digest-qualified overrides are rejected. Enroot imports use a fixed filesystem epoch +and a versioned, registry-digest-bound cache key. Every mounted squash is freshly hashed. The +verified registry digest and local squash hash are both recorded. 
Image-provided DeepEP is checked +against exact wheel and installed-file fingerprints; source-built backends use pinned commits and +runtime-verified GPU targets. DeepEP V2's mode-0700 cluster-local build cache is keyed by a versioned +build recipe, verified image, architecture, upstream trees, and dependency pins; only its fixed +`/cx-cache` mount reaches the container, and it never enters result artifacts. +Pinned V2 and Hybrid sources are fetched once per workflow. Each job validates the complete archive, +extracts only its exact backend root, permits only contained relative leaf symlinks to archived +regular files, and revalidates the Git tree and submodule pins before staging. +Compute containers receive an explicit environment allowlist. Private host, address, device, NIC, +credential, workspace, and path data stays in encrypted config, ignored operator notes, or bounded +mode-0600 runner logs; it is never uploaded. + +## Local Checks + +```bash +uv run --with-requirements experimental/CollectiveX/requirements.txt \ + python -m unittest discover experimental/CollectiveX/tests -p 'test_*.py' +uv run --with-requirements experimental/CollectiveX/requirements.txt \ + python experimental/CollectiveX/sweep_matrix.py --backends all --out /tmp/cx-matrix.json >/dev/null +uv run --with-requirements experimental/CollectiveX/requirements.txt \ + python experimental/CollectiveX/publisher.py --store-root "$COLLECTIVEX_STORE_ROOT" verify +bash -n experimental/CollectiveX/runtime/*.sh experimental/CollectiveX/launchers/*.sh +``` + +Core paths are `capability.py`, `configs/`, `contracts.py`, `schemas/`, `sweep_matrix.py`, +`publisher.py`, `runtime/`, `launchers/`, and `tests/`. diff --git a/experimental/CollectiveX/README_zh.md b/experimental/CollectiveX/README_zh.md new file mode 100644 index 000000000..fb32369b4 --- /dev/null +++ b/experimental/CollectiveX/README_zh.md @@ -0,0 +1,154 @@ +# CollectiveX + +
+ +[English](./README.md) | **中文** + +
+ +CollectiveX 是实验性的 MoE 专家并行通信基准,用于测量不同 EP 库和加速器系统的 +dispatch、combine 及配对 roundtrip 延迟。 + +> 发布暂停:历史 schema 3-5 数据仅供诊断。目前没有数据集获准用于排名、推荐或回归基线。 + +## v1 执行配置 + +每个调度用例均采用 BF16、后端调优资源和 packed placement。显式指定的 mode 选择以下两个 +契约之一: + +- Normal mode 使用 `layout-and-dispatch-v1`、按 rank 去重的 token payload 和 activation-only + combine。核心覆盖使用 uniform routing,并保留一个 Zipf 敏感性场景;EPLB 只作为 Zipf + 的修正方案测量。 +- Low-latency mode 使用 `expert-packed-weighted-combine-v1`、token-expert payload 和 + gate-weighted combine,并且只调用真正的 DeepEP V1 或 UCCL low-latency API。该模式仅覆盖 + 解码,绝不与 normal mode 共用排名 cohort。其他后端在此 suite 中均显式标为 unsupported。 + +两种模式统一使用 `fixed-512-v1`:64 trials x 8 timed iterations;每个 trial/point 的每个被测 +组件前执行 32 次同步完整 roundtrip warmup。先测 roundtrip;每次 iteration 先取跨 rank 最大值, +再按 nearest-rank 计算 p50/p90/p95/p99,主要延迟指标为 roundtrip p99。stdlib 整数计数器 +生成逐字节一致的 routing 和 gate weights。 + +规范矩阵覆盖 H100、H200、B200、B300、GB200、GB300、MI325X 和 MI355X。矩阵请求 +608 个 cases / 1,600 个 token points:364 个可运行 cases / 940 个 points,并形成 58 个可执行 +workflow shards/allocation cells;另有 244 个显式 unsupported cases / 660 个 points。 +`sweep_matrix.py` 物化每个 token ladder,并拒绝缺失、过期、格式错误或被修改的 shard controls。 +分片按 SKU round-robin 发出,使受限的 GHA matrix 尽早使用所有 runner pools。 + +| 系统 | EP8 | EP16 | +|---|---|---| +| H100/H200/B200/B300 | 1x8 NVLink,scale-up | 2x8 NVLink + RDMA,scale-out | +| MI325X/MI355X | 1x8 XGMI,scale-up | 2x8 XGMI + RDMA,scale-out | +| GB200/GB300 | 2x4 MNNVL,scale-up | 4x4 MNNVL,scale-up | + +物理主机数量不能决定通信范围:两种 GB 拓扑都位于同一个 72-GPU MNNVL scale-up domain 内。 + +| 后端 | 当前范围 | +|---|---| +| DeepEP V1 | 镜像固定的 `deep_ep.Buffer`:提供 normal 和原生 low-latency API;x86 使用 upstream v1.2.1,arm64 使用镜像内 GB fork | +| DeepEP V2 | PR #605 `ElasticBuffer` 加 #630:scale-up 使用 LSA,x86 EP16 scale-out 使用 GIN;JIT 可复现并绑定 source/SASS | +| DeepEP Hybrid | 固定的 `HybridEPBuffer`:x86 EP16 使用 multi-domain RDMA/DOCA;GB EP8/EP16 位于同一个 MNNVL communication domain | +| UCCL | Hopper 上固定的 0.1.1 wheel 和 wrapper,提供 normal 和原生 low-latency API;Blackwell 显式标为 
unsupported | +| NCCL/RCCL A2A | 可移植的 rank-deduplicated payload 加 expert/routing-metadata reference | +| MoRI | EP8 使用 MI325X AsyncLL 或 MI355X IntraNode;EP16 固定使用 2x8 XGMI + RDMA 上的 InterNodeV1 | + +FlashInfer 不在 v1 范围内,因为已测试的 EP path 在运行时存在间歇性失败。该问题不会被误报为 +平台能力限制;在证明有稳定的固定实现后可重新加入。 + +DeepEP V2 指 [DeepEP PR #605](https://github.com/deepseek-ai/DeepEP/pull/605) 引入的 +`ElasticBuffer` 实现,而不是更新的 legacy `Buffer` build。固定 source 使用最小化的 upstream +[PR #630](https://github.com/deepseek-ai/DeepEP/pull/630) 后续修复:其 parent 是 #605 merge +tree,唯一 source 变更是修复 GIN 不可用时的纯 scale-up 初始化。Scale-up cases 请求 NCCL +Device API LSA;若实际建立的 LSA team 未覆盖整个 EP world,则直接失败。x86 EP16 scale-out +cases 必须使用启用 GIN 的 hybrid path,其精确拓扑为两个逻辑 scale-out domains(由两个物理 +RDMA ranks 表示)、每个 domain 八个 scale-up ranks;GB EP16 仍是 MNNVL scale-up,因此继续 +使用 LSA。隔离构建会记录 API、source、loaded libraries、generated JIT source、executable +SASS 与 raw CUBIN diagnostics。当前 H100 runner pool 被明确标记为 V2 unsupported,因为 NCCL +2.30.4 报告其 EP8 communicator 不具备 Device API symmetric-memory 支持;只有该 pool 的 +runtime 支持全 rank CUDA P2P/LSA 后才能重新启用。其他 NVIDIA SKU 在 GPU outcome 通过 native +correctness 和 publication gates 前仍为 unvalidated。 + +v1 已移除的轴包括 cached-layout `[cl]`、runtime-visible `[rv]`、FP8、quantized combine、 +额外 routing distributions、activation profiles、uneven allocation、placement permutations、 +model envelopes 和 scaling studies。 + +## Workflow 与产物 + +`.github/workflows/collectivex-sweep.yml` 生成 public-SKU matrix,提取严格且被忽略的 +`.shards/.json` control,每个 shard 执行一次 allocation,对结果 JSON 做隐私检查并上传 +raw GitHub artifacts。运行默认使用 `release_tag=unversioned`,仅供诊断。V1 运行必须显式选择 +`release_tag=v1`;setup 随后要求固定的完整 matrix digest,并生成绑定 run、attempt 与 source 的 +`cxrelease-v1-*` marker。Partial 或 filtered 运行无法获得该 marker。 + +`.github/workflows/collectivex-sweep.yml` 的 `publish-v1` operation 是显式的 V1 gate。它只接受三个来自同一 source SHA、 +成功、首次 attempt 且带 V1 tag 的 sweep run IDs,重新校验 GitHub metadata 与 release markers,并在 runner 本地 +可丢弃工作区中执行 `publisher.py`。只有完整通过 
promotion、隐私检查和内容寻址的数据集才会以 +`cxpublication-v1-*` 上传;raw 
artifacts 与 publisher private workspace 永不暴露给前端。 + +系统不需要 results server、attached store、Vercel storage、GCP、Neon、managed database 或 +managed object store。前端使用已有的 server-side `GITHUB_TOKEN`,即时发现最新成功且按版本隔离 +的 publication workflow,下载其 NDJSON artifact,校验 ZIP layout、UTF-8/NDJSON 结构、schema、 +promotion 状态与 SHA-256,随后提供带版本的 channel URL 和 immutable dataset URL。UI 保留显式 +benchmark-version selector;V2 及后续版本必须使用独立的 release tag 与 publication identity。 +完整 validation contract 见 [docs/methodology_zh.md](docs/methodology_zh.md)。 + +## Runner 配置 + +Runner 本地 Slurm 和 storage 值使用严格的 per-SKU JSON 文档,路径为 +`$XDG_CONFIG_HOME/inferencex/collectivex.json` 或 `COLLECTIVEX_OPERATOR_CONFIG`。该 mode-0600、 +同 owner、非 symlink 文件位于 checkout 外且永不上传。未知 runners、fields、duplicate keys、 +endpoint literals、unsafe paths 和非 JSON 输入均 fail closed;配置绝不作为 shell 执行。GHA +仅将加密的 `COLLECTIVEX_OPERATOR_CONFIG_V1` 内容传给 launcher;launcher 验证后只导出所选 +SKU 的 allowlisted values,并在 allocation 前删除临时副本。必需 JSON fields 如下: + +| SKU | 变量 | +|---|---| +| `h100-dgxc`, `b200-dgxc` | `partition`, `account`, `squash_dir`, `stage_dir` | +| `h200-dgxc` | `partition`, `squash_dir`, `stage_dir` | +| `b300` | `partition`, `account`, `squash_dir`, `stage_dir` | +| `gb200` | `partition`, `account`, 有序 `storage_roots` | +| `gb300` | `partition`, `account`, `squash_dir`, `stage_dir`, `enroot_cache_path` | +| `mi325x`, `mi355x` | `partition`, `squash_dir`, `stage_dir` | + +每个已选中的非 MNNVL EP16 placement 还必须提供 `socket_ifname` 和 `rdma_devices`,用来指定 +operator 审核过的 fabric;还可配置 allowlisted +`ib_gid_index` 与 `rdma_service_level`。CollectiveX 不会通过启发式规则选择 management route 或 +HCA。Allocation 完成后,每个非 MNNVL scale-out 节点都必须证明所有已配置 interface 与 active +HCA port 存在,之后才允许初始化 backend。Scale-up 和 MNNVL job 会清除这些 overrides。 +Scale-out NCCL/RCCL 固定使用 `IB` 与精确匹配的 HCA selectors;如果无法使用 RDMA,job 会失败, +而不会回退到 socket 后仍被错误标记为 RDMA。 + +`stage_dir` 必须是 checkout 与 workflow workspace 外预创建且由 runner owner 持有的 base, +不能经过 symlink,group 和 world 都不能写入,并且 runner 与所有 allocation 节点必须以相同路径 
+访问。Job 只创建带 marker 的 mode-0700 execution child,验证跨节点读写可见性,并在 +allocation teardown 后只删除该 child;不会挂载 runner checkout,也不会在 AMD image storage +下创建 stage。 + +导入前,每个 Docker Hub tag 都通过有界 registry requests 解析,并且必须匹配固定 digest;拒绝 +digest-qualified overrides。Enroot imports 使用固定 filesystem epoch 和带版本、绑定 registry +digest 的 cache key。每个已挂载 squash 都重新计算 hash,同时记录 verified registry digest 和 +local squash hash。镜像提供的 DeepEP 会按精确 wheel 和 installed-file fingerprints 检查; +source-built backends 使用固定 commits 和 runtime-verified GPU targets。DeepEP V2 的 mode-0700 +cluster-local build cache 由版本化 build recipe、verified image、architecture、upstream +trees 和 dependency pins 共同寻址;container 只看到固定的 `/cx-cache` mount,且该 cache 永不 +进入 result artifacts。 +固定的 V2 与 Hybrid source 在每个 workflow 中只获取一次。每个 job 都会验证完整 archive,仅 +提取自身精确 backend root,只允许指向 archive 内 regular file 的受限相对 leaf symlink,并在 +staging 前重新核对 Git tree 与 submodule pins。 +Compute containers 仅接收显式 environment allowlist。Private host、address、device、NIC、 +credential、workspace 和 path 数据只保留在加密配置、忽略的 operator notes 或有界 mode-0600 +runner logs 中,永不上传。 + +## 本地检查 + +```bash +uv run --with-requirements experimental/CollectiveX/requirements.txt \ + python -m unittest discover experimental/CollectiveX/tests -p 'test_*.py' +uv run --with-requirements experimental/CollectiveX/requirements.txt \ + python experimental/CollectiveX/sweep_matrix.py --backends all --out /tmp/cx-matrix.json >/dev/null +uv run --with-requirements experimental/CollectiveX/requirements.txt \ + python experimental/CollectiveX/publisher.py --store-root "$COLLECTIVEX_STORE_ROOT" verify +bash -n experimental/CollectiveX/runtime/*.sh experimental/CollectiveX/launchers/*.sh +``` + +核心路径为 `capability.py`、`configs/`、`contracts.py`、`schemas/`、`sweep_matrix.py`、 +`publisher.py`、`runtime/`、`launchers/` 和 `tests/`。 diff --git a/experimental/CollectiveX/artifact_safety.py b/experimental/CollectiveX/artifact_safety.py new file mode 100644 index 000000000..83d522fba --- /dev/null +++ 
b/experimental/CollectiveX/artifact_safety.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python3 +"""Fail-closed privacy check for CollectiveX public result documents.""" +from __future__ import annotations + +import argparse +import ipaddress +import json +import os +import re +import stat + + +SENSITIVE_FIELDS = frozenset({ + "environment", "env", "host", "hostname", "uuid", "gpu_uuid", "device_uuid", + "pci_bus_id", "ip_address", "ip_addresses", "master_addr", "ssh", "ssh_target", + "nodelist", "node_list", "nic_guid", "ib_guid", "topology_matrix", "rdma_devices", + "user", "username", "password", "passwd", "secret", "token", "access_token", + "api_token", "auth_token", "api_key", "private_key", "credential", "credentials", + "address", "addresses", "ip", "ips", +}) +SENSITIVE_FIELDS_COMPACT = frozenset(item.replace("_", "") for item in SENSITIVE_FIELDS) +SENSITIVE_FIELD_SUFFIXES = ( + "_host", "_hostname", "_address", "_addresses", "_path", "_paths", "_ip", "_ips", + "_password", "_passwd", "_secret", "_token", "_credential", "_credentials", + "_uuid", "_guid", "_bus_id", +) +SENSITIVE_VALUE_PATTERNS = ( + ("private-path", re.compile( + r"(? 
str: + normalized = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1_\2", str(value).strip()) + normalized = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", normalized) + return normalized.lower().replace("-", "_") + + +def _sensitive_value_rule(value: str, *, contextual: bool = True) -> str | None: + matched = next( + ( + name for name, pattern in SENSITIVE_VALUE_PATTERNS + if (contextual or name not in CONTEXTUAL_VALUE_RULES) and pattern.search(value) + ), + None, + ) + if matched: + return matched + for candidate in IPV6_CANDIDATE.findall(value): + try: + address = candidate.split("%", 1)[0] + if ipaddress.ip_address(address).version == 6: + return "ipv6-address" + except ValueError: + continue + return None + + +def assert_publication_safe(docs: list[dict]) -> None: + """Reject private infrastructure fields and value shapes.""" + def walk(value, doc_index: int, parent_field: str | None = None) -> None: + if isinstance(value, dict): + for key, child in value.items(): + field = _normalized_field(key) + compact = field.replace("_", "") + if ( + field in SENSITIVE_FIELDS + or compact in SENSITIVE_FIELDS_COMPACT + or field.endswith(SENSITIVE_FIELD_SUFFIXES) + ): + raise ArtifactSafetyError( + f"artifact safety: doc[{doc_index}] contains forbidden private field" + ) + key_rule = _sensitive_value_rule(str(key)) + if key_rule: + raise ArtifactSafetyError( + f"artifact safety: doc[{doc_index}] contains forbidden {key_rule} key" + ) + walk(child, doc_index, field) + elif isinstance(value, list): + for child in value: + walk(child, doc_index, parent_field) + elif isinstance(value, str): + rule = _sensitive_value_rule(value, contextual=parent_field != "ref") + if rule: + raise ArtifactSafetyError( + f"artifact safety: doc[{doc_index}] contains forbidden {rule} value" + ) + + for index, doc in enumerate(docs): + if not isinstance(doc, dict): + raise ArtifactSafetyError(f"artifact safety: doc[{index}] is not a JSON object") + walk(doc, index) + + +def load_documents(paths: list[str]) -> 
list[dict]: + docs: list[dict] = [] + for path in paths: + try: + metadata = os.lstat(path) + except OSError as exc: + raise ArtifactSafetyError("artifact safety: result file is unavailable") from exc + if ( + not stat.S_ISREG(metadata.st_mode) + or metadata.st_uid != os.getuid() + or metadata.st_size <= 0 + or metadata.st_size > MAX_INPUT_BYTES + ): + raise ArtifactSafetyError("artifact safety: result file is unavailable") + descriptor = -1 + try: + descriptor = os.open(path, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)) + opened = os.fstat(descriptor) + if ( + not stat.S_ISREG(opened.st_mode) + or (opened.st_dev, opened.st_ino, opened.st_size) + != (metadata.st_dev, metadata.st_ino, metadata.st_size) + ): + raise ArtifactSafetyError("artifact safety: result file changed during open") + with os.fdopen(descriptor, encoding="utf-8") as fh: + descriptor = -1 + if path.endswith(".ndjson"): + for line_number, line in enumerate(fh, 1): + if not line.strip(): + continue + try: + docs.append(json.loads(line)) + except json.JSONDecodeError as exc: + raise ArtifactSafetyError( + f"artifact safety: malformed NDJSON at input line {line_number}" + ) from exc + else: + docs.append(json.load(fh)) + except json.JSONDecodeError as exc: + raise ArtifactSafetyError("artifact safety: malformed JSON input") from exc + except (OSError, UnicodeError) as exc: + raise ArtifactSafetyError("artifact safety: result file is unreadable") from exc + finally: + if descriptor >= 0: + os.close(descriptor) + if not docs: + raise ArtifactSafetyError("artifact safety: no public result documents found") + return docs + + +def main() -> int: + parser = argparse.ArgumentParser(description="Check CollectiveX result artifacts for private data") + parser.add_argument("paths", nargs="+") + args = parser.parse_args() + try: + docs = load_documents(args.paths) + assert_publication_safe(docs) + except ArtifactSafetyError as exc: + parser.error(str(exc)) + print(f"artifact safety: {len(docs)} public document(s) 
passed") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/capability.py b/experimental/CollectiveX/capability.py new file mode 100644 index 000000000..346e2ad14 --- /dev/null +++ b/experimental/CollectiveX/capability.py @@ -0,0 +1,569 @@ +#!/usr/bin/env python3 +"""Public runner and backend capability registry for CollectiveX v1.""" + +from __future__ import annotations + +import re +from typing import Any + +import identity + + +DEEPEP_V2_COMMIT = "fa8a9b16898204afd347c663b89e65ef87dc6ce6" +DEEPEP_V2_SKU_CAPABILITIES = { + "h100-dgxc": { + "schedulable": False, + "basis": "current-runner-nccl-device-api-symmetric-memory-unavailable", + }, + "h200-dgxc": {"schedulable": True, "basis": "upstream-sm90-requirement"}, + "b200-dgxc": {"schedulable": True, "basis": "upstream-sm100-result"}, + "gb200": {"schedulable": True, "basis": "upstream-sm100-result"}, + "b300": {"schedulable": True, "basis": "pinned-pr605-pr630-sm103-maps-sm100f"}, + "gb300": {"schedulable": True, "basis": "pinned-pr605-pr630-sm103-maps-sm100f"}, + "mi325x": {"schedulable": False, "basis": "nvidia-only"}, + "mi355x": {"schedulable": False, "basis": "nvidia-only"}, +} + + +def _topologies( + product: str, *, gpus_per_node: int, scale_up_domain: int, scale_up_transport: str +) -> dict[int, dict[str, Any]]: + scale_up_class = ( + f"{product}-nvl72-mnnvl" + if scale_up_transport == "mnnvl" + else f"{product}-xgmi" + if scale_up_transport == "xgmi" + else f"{product}-{scale_up_transport}-island" + ) + return { + 8: { + "nodes": 8 // gpus_per_node, + "gpus_per_node": gpus_per_node, + "scale_up_domain": scale_up_domain, + "scope": "scale-up", + "scale_up_transport": scale_up_transport, + "scale_out_transport": None, + "transport": scale_up_transport, + "topology_class": scale_up_class, + }, + 16: { + "nodes": 16 // gpus_per_node, + "gpus_per_node": gpus_per_node, + "scale_up_domain": scale_up_domain, + "scope": "scale-up" if scale_up_domain >= 16 
else "scale-out", + "scale_up_transport": scale_up_transport, + "scale_out_transport": None if scale_up_domain >= 16 else "rdma", + "transport": ( + scale_up_transport + if scale_up_domain >= 16 + else f"{scale_up_transport}-rdma" + ), + "topology_class": ( + scale_up_class + if scale_up_domain >= 16 + else f"{product}-{scale_up_transport}-rdma" + ), + }, + } + + +def _platform( + *, vendor: str, arch: str, machine: str, product: str, gpus_per_node: int, + scale_up_domain: int, scale_up_transport: str, launcher: str, +) -> dict[str, Any]: + topologies = _topologies( + product, + gpus_per_node=gpus_per_node, + scale_up_domain=scale_up_domain, + scale_up_transport=scale_up_transport, + ) + ep8 = topologies[8] + return { + "vendor": vendor, + "arch": arch, + "machine": machine, + "product": product, + # EP8 defaults remain while downstream readers migrate to per-EP records. + "transport": ep8["transport"], + "topology_class": ep8["topology_class"], + "gpus_per_node": gpus_per_node, + "scale_up_domain": scale_up_domain, + "ep_degrees": tuple(topologies), + "topologies": topologies, + "launcher": launcher, + } + + +PLATFORMS = { + "h100-dgxc": _platform( + vendor="nvidia", arch="sm90", machine="amd64", product="h100", + gpus_per_node=8, scale_up_domain=8, scale_up_transport="nvlink", + launcher="single-slurm", + ), + "h200-dgxc": _platform( + vendor="nvidia", arch="sm90", machine="amd64", product="h200", + gpus_per_node=8, scale_up_domain=8, scale_up_transport="nvlink", + launcher="single-slurm", + ), + "b200-dgxc": _platform( + vendor="nvidia", arch="sm100", machine="amd64", product="b200", + gpus_per_node=8, scale_up_domain=8, scale_up_transport="nvlink", + launcher="single-slurm", + ), + "b300": _platform( + vendor="nvidia", arch="sm103", machine="amd64", product="b300", + gpus_per_node=8, scale_up_domain=8, scale_up_transport="nvlink", + launcher="single-slurm", + ), + "gb200": _platform( + vendor="nvidia", arch="sm100", machine="arm64", product="gb200", + 
gpus_per_node=4, scale_up_domain=72, scale_up_transport="mnnvl", + launcher="gb-nv", + ), + "gb300": _platform( + vendor="nvidia", arch="sm103", machine="arm64", product="gb300", + gpus_per_node=4, scale_up_domain=72, scale_up_transport="mnnvl", + launcher="gb-nv", + ), + "mi325x": _platform( + vendor="amd", arch="gfx942", machine="amd64", product="mi325x", + gpus_per_node=8, scale_up_domain=8, scale_up_transport="xgmi", + launcher="mi-amds", + ), + "mi355x": _platform( + vendor="amd", arch="gfx950", machine="amd64", product="mi355x", + gpus_per_node=8, scale_up_domain=8, scale_up_transport="xgmi", + launcher="mi-amds", + ), +} + +BACKENDS = { + "deepep": {"vendors": {"nvidia"}}, + "deepep-v2": { + "vendors": {"nvidia"}, + "implementation": "deep_ep.ElasticBuffer", + "source": "deepseek-ai/DeepEP#605+#630", + "commit": DEEPEP_V2_COMMIT, + "communication_backend": "nccl-device-lsa", + "torch": "2.10.0+cu130", + "nccl": "2.30.4", + "sku_capabilities": DEEPEP_V2_SKU_CAPABILITIES, + }, + "uccl": { + "vendors": {"nvidia"}, + "machines": {"amd64"}, + "excluded_skus": {"b200-dgxc", "b300"}, + }, + "deepep-hybrid": {"vendors": {"nvidia"}}, + "mori": {"vendors": {"amd"}}, + "nccl-ep": {"vendors": {"nvidia", "amd"}}, +} +SWEEP_BACKENDS = tuple(BACKENDS) + +PRECISION_DISPOSITIONS = { + "supported", "unsupported", "not-applicable", "provisional", +} +_NVIDIA_SKUS = ( + "h100-dgxc", "h200-dgxc", "b200-dgxc", "b300", "gb200", "gb300", +) +_DEEPEP_V2_PRECISION_SKUS = ( + "h200-dgxc", "b200-dgxc", "b300", "gb200", "gb300", +) +_HOPPER_UCCL_SKUS = ("h100-dgxc", "h200-dgxc") + + +def _precision_rule( + *, + backend: str, + skus: tuple[str, ...], + ep_degrees: tuple[int, ...], + mode: str, + basis: str, + disposition: str = "provisional", +) -> dict[str, Any]: + return { + "backend": backend, + "skus": skus, + "ep_degrees": ep_degrees, + "mode": mode, + "disposition": disposition, + "basis": basis, + } + + +_NORMAL_E4M3FN_PROFILE = "d-fp8-e4m3fn-b128-f32-prequantized.c-bf16" 
+_NORMAL_E4M3FNUZ_PROFILE = "d-fp8-e4m3fnuz-b128-f32-prequantized.c-bf16" +_LL_FP8_PROFILE = "d-fp8-e4m3fn-b128-f32-fused.c-bf16" +_LL_LOGFMT_PROFILE = "d-bf16.c-logfmt10-dynamic64" +_LL_FP8_LOGFMT_PROFILE = ( + "d-fp8-e4m3fn-b128-f32-fused.c-logfmt10-dynamic64" +) +_MORI_E4M3FN_DIRECT_PROFILE = "d-bf16.c-fp8-e4m3fn-direct-cast-noscale" +_MORI_E4M3FN_BOTH_PROFILE = ( + "d-fp8-e4m3fn-b128-f32-prequantized.c-fp8-e4m3fn-direct-cast-noscale" +) +_MORI_E4M3FNUZ_DIRECT_PROFILE = "d-bf16.c-fp8-e4m3fnuz-direct-cast-noscale" +_MORI_E4M3FNUZ_BOTH_PROFILE = ( + "d-fp8-e4m3fnuz-b128-f32-prequantized.c-fp8-e4m3fnuz-direct-cast-noscale" +) + +# These are native-path candidates, not executable claims. A cell must be changed +# from provisional to supported or unsupported after its pinned runtime probe. +PRECISION_CAPABILITIES: dict[str, tuple[dict[str, Any], ...]] = { + _NORMAL_E4M3FN_PROFILE: ( + _precision_rule( + backend="deepep", skus=_NVIDIA_SKUS, ep_degrees=(8, 16), mode="normal", + basis="deepep-v1-normal-prequantized-e4m3fn-block128-f32-scale", + ), + _precision_rule( + backend="deepep-v2", skus=_DEEPEP_V2_PRECISION_SKUS, + ep_degrees=(8, 16), mode="normal", + basis="deepep-v2-normal-prequantized-e4m3fn-block128-f32-scale", + ), + _precision_rule( + backend="deepep-hybrid", skus=_NVIDIA_SKUS, + ep_degrees=(8, 16), mode="normal", + basis="deepep-hybrid-normal-uint8-e4m3fn-block128-f32-scale", + ), + _precision_rule( + backend="uccl", skus=_HOPPER_UCCL_SKUS, ep_degrees=(8, 16), mode="normal", + basis="uccl-deepep-api-normal-prequantized-e4m3fn-block128-f32-scale", + ), + _precision_rule( + backend="mori", skus=("mi355x",), ep_degrees=(8, 16), mode="normal", + basis="mori-gfx950-normal-prequantized-ocp-e4m3fn-block128-f32-scale", + ), + ), + _NORMAL_E4M3FNUZ_PROFILE: ( + _precision_rule( + backend="mori", skus=("mi325x",), ep_degrees=(8, 16), mode="normal", + basis="mori-gfx942-normal-prequantized-e4m3fnuz-block128-f32-scale", + ), + ), + _LL_FP8_PROFILE: ( + 
_precision_rule( + backend="deepep", skus=_NVIDIA_SKUS, ep_degrees=(8, 16), + mode="low-latency", + basis="deepep-v1-low-latency-fused-e4m3fn-block128-f32-scale", + ), + _precision_rule( + backend="uccl", skus=_HOPPER_UCCL_SKUS, ep_degrees=(8, 16), + mode="low-latency", + basis="uccl-deepep-api-low-latency-fused-e4m3fn-block128-f32-scale", + ), + ), + _LL_LOGFMT_PROFILE: ( + _precision_rule( + backend="deepep", skus=_NVIDIA_SKUS, ep_degrees=(8, 16), + mode="low-latency", + basis="deepep-v1-low-latency-logfmt10-dynamic-per64-combine", + ), + _precision_rule( + backend="uccl", skus=_HOPPER_UCCL_SKUS, ep_degrees=(8, 16), + mode="low-latency", + basis="uccl-deepep-api-low-latency-logfmt10-dynamic-per64-combine", + ), + ), + _LL_FP8_LOGFMT_PROFILE: ( + _precision_rule( + backend="deepep", skus=_NVIDIA_SKUS, ep_degrees=(8, 16), + mode="low-latency", + basis="deepep-v1-low-latency-fused-e4m3fn-dispatch-logfmt10-combine", + ), + _precision_rule( + backend="uccl", skus=_HOPPER_UCCL_SKUS, ep_degrees=(8, 16), + mode="low-latency", + basis="uccl-deepep-api-low-latency-fused-e4m3fn-dispatch-logfmt10-combine", + ), + ), + _MORI_E4M3FN_DIRECT_PROFILE: ( + _precision_rule( + backend="mori", skus=("mi355x",), ep_degrees=(8,), mode="normal", + basis="mori-gfx950-ep8-intranode-e4m3fn-direct-cast-combine", + ), + ), + _MORI_E4M3FN_BOTH_PROFILE: ( + _precision_rule( + backend="mori", skus=("mi355x",), ep_degrees=(8,), mode="normal", + basis="mori-gfx950-ep8-intranode-e4m3fn-dispatch-and-direct-cast-combine", + ), + ), + _MORI_E4M3FNUZ_DIRECT_PROFILE: ( + _precision_rule( + backend="mori", skus=("mi325x",), ep_degrees=(8,), mode="normal", + basis="mori-gfx942-ep8-asyncll-e4m3fnuz-direct-cast-combine", + ), + ), + _MORI_E4M3FNUZ_BOTH_PROFILE: ( + _precision_rule( + backend="mori", skus=("mi325x",), ep_degrees=(8,), mode="normal", + basis="mori-gfx942-ep8-asyncll-e4m3fnuz-dispatch-and-direct-cast-combine", + ), + ), +} + + +def runtime_identity_issues( + sku: str, *, vendor: str, arch: 
str, machine: str, device_name: str, + device_count: int, world_size: int, +) -> list[str]: + """Validate public product identity on every rank without private device identifiers.""" + platform = PLATFORMS.get(sku) + if platform is None: + return [f"unknown runner identity {sku!r}"] + issues = [] + for field, observed in (("vendor", vendor), ("arch", arch), ("machine", machine)): + if observed != platform[field]: + issues.append(f"{field}={observed!r}, expected {platform[field]!r}") + products = set(re.findall(r"[a-z]+\d+[a-z]*", device_name.lower())) + if platform["product"] not in products: + issues.append(f"device product {device_name!r} does not identify {platform['product']}") + if device_count != platform["gpus_per_node"]: + issues.append( + f"visible GPUs={device_count}, expected {platform['gpus_per_node']} per node" + ) + if world_size not in platform["ep_degrees"]: + issues.append(f"EP{world_size} is not registered for {sku}") + return issues + + +def topology_for(sku: str, ep: int) -> dict[str, Any] | None: + """Return the exact public topology registered for one SKU/EP cell.""" + platform = PLATFORMS.get(sku) + if platform is None: + return None + return platform["topologies"].get(ep) + + +def _resolve_base( + sku: str, + backend: str, + *, + ep: int | None = None, + nodes: int | None = None, + routing: str = "uniform", + eplb: bool = False, + mode: str = "normal", +) -> tuple[bool, str]: + """Resolve the existing BF16 capability without a precision candidate.""" + platform, implementation = PLATFORMS.get(sku), BACKENDS.get(backend) + if platform is None: + return False, f"unknown GHA runner label {sku!r}" + if implementation is None: + return False, f"unknown backend {backend!r}" + if mode not in {"normal", "low-latency"}: + return False, f"unknown benchmark mode {mode!r}" + if mode == "low-latency" and backend not in {"deepep", "uccl"}: + return False, f"{backend} has no distinct low-latency API" + if ep is None: + if nodes is None: + ep = 
platform["ep_degrees"][0] + else: + matches = [ + degree for degree, topology in platform["topologies"].items() + if topology["nodes"] == nodes + ] + if len(matches) != 1: + return False, f"{sku} does not register a unique {nodes}-node EP degree" + ep = matches[0] + topology = topology_for(sku, ep) + if topology is None or (nodes is not None and nodes != topology["nodes"]): + return False, f"{sku} does not register EP{ep} on {nodes} nodes" + if routing not in {"uniform", "zipf"} or (eplb and routing != "zipf"): + return False, "v1 routing is uniform or zipf, with EPLB only on zipf" + if platform["vendor"] not in implementation["vendors"]: + return False, f"{backend} does not support {platform['vendor']}" + sku_capability = implementation.get("sku_capabilities", {}).get(sku) + if sku_capability is not None and not sku_capability["schedulable"]: + return False, f"{backend} is unsupported on {sku}: {sku_capability['basis']}" + if platform["machine"] not in implementation.get("machines", {platform["machine"]}): + return False, f"{backend} does not support {platform['machine']}" + if sku in implementation.get("excluded_skus", set()): + return False, f"{backend} is unavailable on {sku}" + return True, "ok" + + +def precision_targets( + profile_names: tuple[str, ...] 
| list[str] | None = None, +) -> list[dict[str, Any]]: + """Expand exact native precision candidates into deterministic target cells.""" + names = list(PRECISION_CAPABILITIES) if profile_names is None else list(profile_names) + unknown = sorted(set(names) - set(PRECISION_CAPABILITIES)) + if unknown: + raise ValueError(f"unknown precision capability profiles {unknown}") + targets: list[dict[str, Any]] = [] + seen: set[tuple[str, str, str, int, str]] = set() + for profile_name in names: + for rule in PRECISION_CAPABILITIES[profile_name]: + for sku in rule["skus"]: + for ep in rule["ep_degrees"]: + key = (profile_name, rule["backend"], sku, ep, rule["mode"]) + if key in seen: + raise RuntimeError(f"duplicate precision capability target {key}") + seen.add(key) + targets.append({ + "precision_profile": profile_name, + "backend": rule["backend"], + "sku": sku, + "ep": ep, + "mode": rule["mode"], + "disposition": rule["disposition"], + "basis": rule["basis"], + }) + return targets + + +def provisional_precision_targets( + profile_names: tuple[str, ...] 
| list[str] | None = None, +) -> list[dict[str, Any]]: + """Return probe-gated targets that must be eliminated before scheduling.""" + return [ + target for target in precision_targets(profile_names) + if target["disposition"] == "provisional" + ] + + +def precision_target_declared( + precision_profile: str, + *, + sku: str, + backend: str, + ep: int, + mode: str, +) -> bool: + """Return whether a profile has an exact native candidate for this cell.""" + return any( + target["precision_profile"] == precision_profile + and target["sku"] == sku + and target["backend"] == backend + and target["ep"] == ep + and target["mode"] == mode + for target in precision_targets([precision_profile]) + ) + + +def resolve_disposition( + sku: str, + backend: str, + *, + ep: int | None = None, + nodes: int | None = None, + routing: str = "uniform", + eplb: bool = False, + mode: str = "normal", + precision_profile: str | None = None, +) -> tuple[str, str]: + """Resolve a baseline or exact precision cell to its capability disposition.""" + base_ok, base_detail = _resolve_base( + sku, + backend, + ep=ep, + nodes=nodes, + routing=routing, + eplb=eplb, + mode=mode, + ) + if precision_profile is None or precision_profile == identity.V1_CONTROL_PRECISION_PROFILE: + return ("supported", "ok") if base_ok else ("unsupported", base_detail) + if precision_profile not in identity.V1_PRECISION_PROFILES: + return "unsupported", f"unknown precision profile {precision_profile!r}" + profile = identity.V1_PRECISION_PROFILES[precision_profile] + if mode not in profile["modes"]: + return ( + "not-applicable", + f"precision profile {precision_profile} is not defined for {mode} mode", + ) + if ep is None: + platform = PLATFORMS.get(sku) + if platform is None: + return "unsupported", base_detail + if nodes is None: + ep = platform["ep_degrees"][0] + else: + matches = [ + degree for degree, topology in platform["topologies"].items() + if topology["nodes"] == nodes + ] + if len(matches) != 1: + return 
"unsupported", base_detail + ep = matches[0] + matches = [ + target for target in precision_targets([precision_profile]) + if target["sku"] == sku + and target["backend"] == backend + and target["ep"] == ep + and target["mode"] == mode + ] + if not matches: + return ( + "not-applicable", + f"{precision_profile} has no native {backend} target on {sku} EP{ep}", + ) + if not base_ok: + return "unsupported", base_detail + target = matches[0] + return target["disposition"], target["basis"] + + +def resolve( + sku: str, + backend: str, + *, + ep: int | None = None, + nodes: int | None = None, + routing: str = "uniform", + eplb: bool = False, + mode: str = "normal", + precision_profile: str | None = None, +) -> tuple[bool, str]: + """Return whether one fixed-v1 case can run on a public GHA runner label.""" + disposition, detail = resolve_disposition( + sku, + backend, + ep=ep, + nodes=nodes, + routing=routing, + eplb=eplb, + mode=mode, + precision_profile=precision_profile, + ) + return disposition == "supported", detail + + +def _validate_precision_capabilities() -> None: + expected = set(identity.V1_PRECISION_PROFILES) - { + identity.V1_CONTROL_PRECISION_PROFILE + } + if set(PRECISION_CAPABILITIES) != expected: + raise RuntimeError("precision capability profiles differ from the identity registry") + empty = sorted( + profile for profile, rules in PRECISION_CAPABILITIES.items() if not rules + ) + if empty: + raise RuntimeError(f"precision profiles have no native targets: {empty}") + for target in precision_targets(): + if target["backend"] not in BACKENDS or target["sku"] not in PLATFORMS: + raise RuntimeError(f"unknown precision target: {target}") + if target["ep"] not in PLATFORMS[target["sku"]]["ep_degrees"]: + raise RuntimeError(f"invalid precision target EP degree: {target}") + if target["disposition"] not in PRECISION_DISPOSITIONS - {"not-applicable"}: + raise RuntimeError(f"invalid declared precision disposition: {target}") + if target["mode"] not in 
identity.V1_PRECISION_PROFILES[ + target["precision_profile"] + ]["modes"]: + raise RuntimeError(f"precision target mode differs from its profile: {target}") + topology = topology_for(target["sku"], target["ep"]) + base_ok, base_detail = _resolve_base( + target["sku"], + target["backend"], + ep=target["ep"], + nodes=topology["nodes"] if topology is not None else None, + mode=target["mode"], + ) + if target["disposition"] in {"supported", "provisional"} and not base_ok: + raise RuntimeError( + f"precision target exceeds its backend capability: {target}: {base_detail}" + ) + + +_validate_precision_capabilities() diff --git a/experimental/CollectiveX/configs/suites.yaml b/experimental/CollectiveX/configs/suites.yaml new file mode 100644 index 000000000..2f5214724 --- /dev/null +++ b/experimental/CollectiveX/configs/suites.yaml @@ -0,0 +1,72 @@ +# CollectiveX v1 comparison suites. +schema_version: 1 + +suites: + ep-core-v1: + mode: normal + workloads: [deepseek-v3-v1] + platforms: [h100-dgxc, h200-dgxc, b300, b200-dgxc, gb300, gb200, mi355x, mi325x] + ep_degrees: [8, 16] + routings: [uniform] + phases: [decode, prefill] + token_points_prefill: [256, 512] + required_publication: official + + ep-routing-v1: + mode: normal + workloads: [deepseek-v3-v1] + platforms: [h100-dgxc, h200-dgxc, b300, b200-dgxc, gb300, gb200, mi355x, mi325x] + ep_degrees: [8, 16] + routings: [zipf] + eplb: [false, true] + phases: [decode, prefill] + token_points_decode: [128] + token_points_prefill: [512] + required_publication: comparable-experimental + + ep-low-latency-v1: + mode: low-latency + backends: [deepep, uccl] + workloads: [deepseek-v3-v1] + platforms: [h100-dgxc, h200-dgxc, b300, b200-dgxc, gb300, gb200, mi355x, mi325x] + ep_degrees: [8, 16] + routings: [uniform] + phases: [decode] + token_points_decode: [1, 2, 4, 8, 16, 32, 64, 128] + required_publication: official + + ep-precision-normal-v1: + mode: normal + backends: [deepep, deepep-v2, uccl, deepep-hybrid, mori] + workloads: 
[deepseek-v3-v1] + platforms: [h100-dgxc, h200-dgxc, b300, b200-dgxc, gb300, gb200, mi355x, mi325x] + ep_degrees: [8, 16] + routings: [uniform] + phases: [decode, prefill] + token_points_decode: [128] + token_points_prefill: [512] + precision_profiles: + - d-fp8-e4m3fn-b128-f32-prequantized.c-bf16 + - d-fp8-e4m3fnuz-b128-f32-prequantized.c-bf16 + - d-bf16.c-fp8-e4m3fn-direct-cast-noscale + - d-fp8-e4m3fn-b128-f32-prequantized.c-fp8-e4m3fn-direct-cast-noscale + - d-bf16.c-fp8-e4m3fnuz-direct-cast-noscale + - d-fp8-e4m3fnuz-b128-f32-prequantized.c-fp8-e4m3fnuz-direct-cast-noscale + provisional: true + required_publication: comparable-experimental + + ep-precision-low-latency-v1: + mode: low-latency + backends: [deepep, uccl] + workloads: [deepseek-v3-v1] + platforms: [h100-dgxc, h200-dgxc, b300, b200-dgxc, gb300, gb200, mi355x, mi325x] + ep_degrees: [8, 16] + routings: [uniform] + phases: [decode] + token_points_decode: [128] + precision_profiles: + - d-fp8-e4m3fn-b128-f32-fused.c-bf16 + - d-bf16.c-logfmt10-dynamic64 + - d-fp8-e4m3fn-b128-f32-fused.c-logfmt10-dynamic64 + provisional: true + required_publication: comparable-experimental diff --git a/experimental/CollectiveX/configs/workloads.yaml b/experimental/CollectiveX/configs/workloads.yaml new file mode 100644 index 000000000..b5b68334c --- /dev/null +++ b/experimental/CollectiveX/configs/workloads.yaml @@ -0,0 +1,9 @@ +# CollectiveX v1 canonical workload and phase metadata. 
+schema_version: 1 + +model_derived: + deepseek-v3-v1: + hidden: 7168 + topk: 8 + routed_experts: 256 + verified_against: "deepseek-ai/DeepSeek-V3@e815299b0bcbac849fa540c768ef21845365c9eb/config.json" diff --git a/experimental/CollectiveX/contracts.py b/experimental/CollectiveX/contracts.py new file mode 100644 index 000000000..04357da78 --- /dev/null +++ b/experimental/CollectiveX/contracts.py @@ -0,0 +1,3058 @@ +#!/usr/bin/env python3 +"""Strict native attempt contracts and metric validation for CollectiveX v1.""" +from __future__ import annotations + +import argparse +import datetime as dt +from functools import lru_cache +import hashlib +import json +import math +import os +from pathlib import Path, PurePosixPath +import re +import sys +from typing import Any, Iterable + +import artifact_safety +import capability +import identity + +TESTS = Path(__file__).resolve().parent / "tests" +sys.path.insert(0, str(TESTS)) +import eplb as eplb_contract # noqa: E402 +import workload as workload_contract # noqa: E402 + +RAW_FORMAT = "collectivex.ep.v1" +SAMPLES_FORMAT = "collectivex.samples.v1" +TERMINAL_FORMAT = "collectivex.terminal.v1" +TERMINAL_CASE_FIELDS = { + "backend", "canonical", "eplb", "ep", "experts", "gpus_per_node", "hidden", + "ladder", "mode", "nodes", "phase", "required_publication", "routing", + "samples_per_point", "scale_out_transport", "scale_up_domain", "scale_up_transport", + "scope", "suite", "timing", "topk", "topology_class", "transport", + "warmup_semantics", "workload", +} +ALLOCATION_FACTOR_FIELDS = { + "artifact", "execution_id", "job", "repo", "run_attempt", "run_id", "runner", + "source_sha", "qualification_index", +} +GIT_RUN_FIELDS = { + "artifact", "job", "qualification_index", "ref", "repo", "run_attempt", "run_id", + "source_sha", +} +PRE_EXECUTION_FAILURE_REASONS = { + "setup": "launcher-setup-failed", + "repository-stage": "repository-staging-failed", + "registry-verification": "container-registry-verification-failed", + 
"scheduler-allocation": "scheduler-allocation-failed", + "container-import": "container-image-preparation-failed", + "container-hash": "container-image-identity-failed", + "container-launch": "container-runtime-launch-failed", + "backend-setup": "backend-setup-failed", + "artifact-collection": "artifact-collection-failed", +} +RUNTIME_FAILURE_REASONS = { + **PRE_EXECUTION_FAILURE_REASONS, + "runtime-identity": "runtime-identity-mismatch", + "timeout": "execution-timeout", + "deadlock": "execution-deadlock", + "execution": "distributed-command-failed", +} +POST_EMIT_FAILURE_REASONS = { + mode: "post-emit-distributed-command-failed" + for mode in ("runtime-identity", "timeout", "deadlock", "execution") +} +CAPABILITY_FAILURE_REASONS = frozenset({ + "backend-platform-unsupported", + "backend-token-capacity", +}) +RETURN_CODE_FAILURE_MODES = { + 5: "runtime-identity", + 124: "timeout", + 137: "timeout", +} +PERCENTILES = ("p50", "p90", "p95", "p99") +V1_CONDITIONING_LADDERS = { + "decode": (1, 2, 4, 8, 16, 32, 64, 128), + "prefill": (1, 2, 4, 8, 16, 32, 64, 128, 256, 512), +} +V1_CONDITIONING_ROUNDS_PER_SHAPE = 8 +DEEPEP_V2_JIT_KERNELS = frozenset({ + "barrier", "combine", "combine_reduce_epilogue", "dispatch", + "dispatch_copy_epilogue", +}) +DEEPEP_V2_V1_PROVENANCE = { + "deepep_version": "2.0.0", + "deepep_distribution_version": "2.0.0+fa8a9b1", + "deepep_commit": "fa8a9b16898204afd347c663b89e65ef87dc6ce6", + "deepep_tree": "29809e75c5874e6609dac4804e7b651d5226959f", + "deepep_pr": 605, + "deepep_fix_pr": 630, + "fmt_commit": "a4c7e17133ee9cb6a2f45545f6e974dd3c393efa", + "torch_version": "2.10.0+cu130", + "nccl_package_version": "2.30.4", + "nccl_version": "2.30.4", + "nvshmem_package_version": "3.3.9", +} +UCCL_DEPENDENCY_VERSIONS = { + "intervaltree": "3.1.0", + "nvidia-cuda-runtime-cu12": "12.9.79", + "sortedcontainers": "2.4.0", +} +SCHEMA_DIR = Path(__file__).resolve().parent / "schemas" +_SCHEMA_CACHE: dict[str, dict[str, Any]] = {} 
+REQUIRED_BACKEND_PROVENANCE = { + "deepep": ( + "deepep_version", "deepep_commit", "backend_lineage", "allow_mnnvl", + "mnnvl_comm", "mode", "num_nvl_bytes", "num_rdma_bytes", + ), + "deepep-v2": ( + *DEEPEP_V2_V1_PROVENANCE, "api_signature_sha256", "loaded_libraries", + "jit_cubins", "jit_random_seed", "deterministic", "num_experts", + "tuning_num_experts", "allow_hybrid_mode", "gin_enabled", + "communication_backend", + ), + "deepep-hybrid": ( + "deepep_commit", "deepep_tree", "branch", "backend_lineage", + "loaded_libraries", "realized_config", "jit_kernel_keys", "jit_shared_objects", + ), + "uccl": ( + "uccl_version", "uccl_commit", "uccl_wrapper_commit", "backend_lineage", + "loaded_libraries", "uccl_dependency_versions", "mode", "num_nvl_bytes", + "num_rdma_bytes", + ), + "mori": ("mori_commit",), + "nccl-ep": ("nccl_version", "collective_library", "backend_lineage"), +} +PROVENANCE_KEYS = { + "allocated_qps", "allow_hybrid_mode", "allow_mnnvl", "allow_multiple_reduction", + "api", "api_signature_sha256", "backend", "backend_lineage", "block_num", + "block_num_floored", "block_num_target", "branch", "collective_library", + "combine_dtype", "combine_warps", "communication_backend", "cuda_version", + "deepep_commit", "deepep_distribution_version", "deepep_fix_pr", "deepep_pr", "deepep_tree", + "deepep_version", "deterministic", "device_cus", + "device_sms", "dispatch_dtype", "dispatch_warps", "enable_sdma", "fmt_commit", + "gin_enabled", + "gpus_per_node", "heap_size", + "impl", "jit_cache_key", "jit_cubins", "jit_kernel_keys", "jit_random_seed", + "jit_shared_objects", "kernel_type", + "loaded_libraries", "local_experts", + "logical_scaleout_ranks", + "logical_scaleup_ranks", "mapping_variant", "max_num_inp_token_per_rank", + "max_num_tokens", "max_total_recv_tokens", "mnnvl_comm", "mode", "mori_commit", + "nccl_communicator", "nccl_package_version", "nccl_version", "num_experts", + "nvshmem_package_version", + "num_max_tokens_per_rank", "num_nvl_bytes", 
"num_qps", "num_qps_per_rank", + "num_rdma_bytes", "num_sms", "path", + "physical_nvlink_ranks", "physical_rdma_ranks", "prefer_overlap_with_compute", + "rdma_block_num", + "realized_config", "reference_semantics", "requested_num_sms", "resource_mode", "routing_factor", + "routing_metadata", "sm_fraction", "top_k", + "torch_git_version", "torch_version", "transport", "trtllm", "tuned_source", + "tuning_num_experts", + "uccl_commit", "uccl_dependency_versions", "uccl_version", "uccl_wrapper_commit", + "use_external_inp_buf", + "workspace", +} + + +class ContractError(ValueError): + """A document differs from the native v1 contract.""" + + +def scheduled_case_profile(case: dict[str, Any], path: str = "case") -> dict[str, Any]: + """Resolve an explicit scheduled mode to its immutable measurement profile.""" + try: + return identity.profile_for_case(case) + except identity.IdentityError as exc: + raise ContractError(f"{path}: {exc}") from exc + + +def _scheduled_case(value: Any, path: str) -> dict[str, Any]: + """Validate baseline or explicit-precision scheduled case fields.""" + fields = set(TERMINAL_CASE_FIELDS) + if isinstance(value, dict) and "precision_profile" in value: + fields.add("precision_profile") + return _keys(value, fields, path) + + +def resolve_deepep_mnnvl( + *, requested: bool, signature_parameters: Iterable[str], deepep_commit: str | None +) -> tuple[dict[str, bool], str]: + """Resolve one explicit DeepEP MNNVL API mode without signature fallbacks.""" + if not requested: + return {}, "not-requested" + if "allow_mnnvl" in set(signature_parameters): + return {"allow_mnnvl": True}, "explicit-allow-mnnvl" + raise ContractError( + f"requested DeepEP MNNVL is unsupported by commit {deepep_commit or 'unknown'}" + ) + + +def collective_kernel_generation(collective_library: Any) -> str: + """Return the public NCCL/RCCL implementation lineage.""" + if collective_library not in {"nccl", "rccl"}: + raise ContractError("reference collective library must be nccl 
or rccl") + return collective_library + + +def project_resource_profile(provenance: dict[str, Any]) -> dict[str, Any]: + """Project backend provenance into the canonical cross-backend resource vocabulary.""" + device_units = provenance.get("device_sms") or provenance.get("device_cus") + if provenance.get("num_sms") is not None: + kind, configured = "sm", provenance["num_sms"] + elif ( + provenance.get("block_num") is not None + and provenance.get("kernel_type") != "AsyncLL" + ): + kind, configured = "cu_block", provenance["block_num"] + else: + kind, configured = None, None + achieved = configured / device_units if configured and device_units else None + fixed = "fixed-kernel" in str(provenance.get("tuned_source", "")) + source = str(provenance.get("tuned_source", "")) + num_nvl_bytes = provenance.get("num_nvl_bytes") + num_rdma_bytes = provenance.get("num_rdma_bytes") + persistent_bytes = ( + (num_nvl_bytes or 0) + (num_rdma_bytes or 0) + if num_nvl_bytes is not None or num_rdma_bytes is not None + else provenance.get("heap_size") + ) + return { + "achieved_fraction": round(achieved, 4) if achieved else None, + "comm_units_kind": kind, + "configured_units": configured, + "conformance_class": ( + "not-applicable" if fixed else "backend-default" if "default" in source + else "pinned-upstream" + ), + "device_units": device_units, + "fixed_kernel": fixed, + "nonconforming": False, + "pareto_eligible": False, + "persistent_bytes": persistent_bytes, + "qps_per_rank": provenance.get("num_qps_per_rank"), + "requested_fraction": None, + "resource_class": "fixed-kernel" if fixed else "fixed-profile", + "target_achieved_within_tol": None, + "tolerance": 0.10, + "tuned_source": provenance.get("tuned_source"), + "warps_combine": provenance.get("combine_warps"), + "warps_dispatch": provenance.get("dispatch_warps"), + } + + +def backend_version(provenance: dict[str, Any]) -> str | None: + """Return the canonical public backend version from implementation provenance.""" + for 
# Roles whose loaded-library record is replaced by a source-tree reference.
SOURCE_BUILT_LIBRARY_ROLES = frozenset({"deepep-extension", "deepep-hybrid-extension"})


def series_provenance(provenance: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of ``provenance`` reduced to its series-identity view.

    * ``jit_cache_key``, ``jit_shared_objects``, ``path`` and ``sm_fraction``
      are dropped.
    * In a ``loaded_libraries`` list, dict entries whose role is in
      ``SOURCE_BUILT_LIBRARY_ROLES`` are replaced by ``name``/``role`` plus the
      provenance ``deepep_tree`` as ``source_tree``; other entries are kept.
    * In a ``jit_cubins`` list, dict entries are narrowed to ``cache_key``,
      ``sass_sha256`` and ``source_sha256``; other entries are kept.

    The input mapping and its nested records are not modified.
    """
    dropped = {"jit_cache_key", "jit_shared_objects", "path", "sm_fraction"}
    projected: dict[str, Any] = {}
    for field, payload in provenance.items():
        if field not in dropped:
            projected[field] = payload

    loaded = provenance.get("loaded_libraries")
    if isinstance(loaded, list):
        rewritten = []
        for entry in loaded:
            if isinstance(entry, dict) and entry.get("role") in SOURCE_BUILT_LIBRARY_ROLES:
                entry = {
                    "name": entry.get("name"),
                    "role": entry.get("role"),
                    "source_tree": provenance.get("deepep_tree"),
                }
            rewritten.append(entry)
        projected["loaded_libraries"] = rewritten

    cubins = provenance.get("jit_cubins")
    if isinstance(cubins, list):
        narrowed = []
        for entry in cubins:
            if isinstance(entry, dict):
                entry = {
                    "cache_key": entry.get("cache_key"),
                    "sass_sha256": entry.get("sass_sha256"),
                    "source_sha256": entry.get("source_sha256"),
                }
            narrowed.append(entry)
        projected["jit_cubins"] = narrowed

    return projected
item["name"], item["role"], item["sha256"] + if ( + not isinstance(name, str) + or not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9_.+-]{0,159}", name) + or not isinstance(role, str) + or not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9_.+-]{0,127}", role) + or not isinstance(digest, str) + or not re.fullmatch(r"[0-9a-f]{64}", digest) + or (role, name) in records + ): + return False + records.add((role, name)) + roles.add(role) + return required_roles <= roles + + +def _deepep_v2_jit_cubins_are_valid(value: Any) -> bool: + if not isinstance(value, list) or len(value) != len(DEEPEP_V2_JIT_KERNELS): + return False + cache_keys = [] + kernel_names = set() + for item in value: + if not isinstance(item, dict) or set(item) != { + "cache_key", "cubin_sha256", "sass_sha256", "source_sha256", + }: + return False + cache_key = item["cache_key"] + match = ( + re.fullmatch(r"kernel\.([A-Za-z0-9_+-]+)\.[0-9a-f]{32}", cache_key) + if isinstance(cache_key, str) + else None + ) + if ( + match is None + or any( + not isinstance(item[field], str) + or not re.fullmatch(r"[0-9a-f]{64}", item[field]) + for field in ("cubin_sha256", "sass_sha256", "source_sha256") + ) + ): + return False + cache_keys.append(cache_key) + kernel_names.add(match.group(1)) + return ( + cache_keys == sorted(set(cache_keys)) + and kernel_names == DEEPEP_V2_JIT_KERNELS + ) + + +HYBRID_REALIZED_CONFIG_FIELDS = { + "hidden_dim", "max_num_of_tokens_per_rank", "num_of_experts_per_rank", + "num_of_ranks_per_node", "num_of_nodes", "pad_multiple", + "num_of_tokens_per_chunk_preprocessing_api", + "num_of_threads_per_block_preprocessing_api", "num_of_blocks_preprocessing_api", + "num_of_blocks_permute", "num_of_blocks_unpermute", "token_data_type", + "num_of_stages_dispatch_api", "num_of_stages_permute_block_dispatch_api", + "num_of_in_flight_s2g_dispatch_api", + "num_of_in_flight_s2g_permute_block_dispatch_api", + "num_of_additional_in_flight_s2g_dispatch_api", + "num_of_tokens_per_chunk_dispatch_api", "num_of_blocks_dispatch_api", + 
"forward_dispatch_api", "device_side_sync_dispatch_api", + "num_of_stages_g2s_combine_api", "num_of_stages_s2g_combine_api", + "num_of_tokens_per_chunk_combine_api", "num_of_tokens_per_group_combine_api", + "num_of_blocks_combine_api", "num_of_additional_in_flight_s2g_combine_api", + "backward_combine_api", "device_side_sync_combine_api", +} +HYBRID_REALIZED_BOOL_FIELDS = { + "forward_dispatch_api", "device_side_sync_dispatch_api", "backward_combine_api", + "device_side_sync_combine_api", +} + + +def _hybrid_realized_config_is_valid(value: Any) -> bool: + if not isinstance(value, dict) or set(value) != HYBRID_REALIZED_CONFIG_FIELDS: + return False + for field, field_value in value.items(): + if field in HYBRID_REALIZED_BOOL_FIELDS: + if type(field_value) is not bool: + return False + elif field == "token_data_type": + if field_value not in {"UINT8", "UINT16"}: + return False + elif type(field_value) is not int or field_value < 0: + return False + return all(value[field] > 0 for field in ( + "hidden_dim", "max_num_of_tokens_per_rank", "num_of_experts_per_rank", + "num_of_ranks_per_node", "num_of_nodes", + )) + + +def hybrid_communication_domains(ep_size: int, scale_up_domain: int) -> tuple[int, int]: + """Return active ranks per fabric domain and the number of such domains.""" + if type(ep_size) is not int or type(scale_up_domain) is not int: + raise ContractError("hybrid communication topology must be integral") + if ep_size <= 0 or scale_up_domain <= 0: + raise ContractError("hybrid communication topology must be positive") + domain_ranks = min(ep_size, scale_up_domain) + if ep_size % domain_ranks: + raise ContractError("hybrid EP size does not divide into communication domains") + return domain_ranks, ep_size // domain_ranks + + +def _hybrid_kernel_keys_are_valid(value: Any) -> bool: + return ( + isinstance(value, list) + and len(value) == 3 + and len(set(value)) == 3 + and value == sorted(value) + and all( + isinstance(key, str) + and 
re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9_.+-]{0,511}", key) + for key in value + ) + ) + + +def _hybrid_jit_evidence_is_valid(value: Any, kernel_keys: Any) -> bool: + if not _hybrid_kernel_keys_are_valid(kernel_keys) or not isinstance(value, list): + return False + if len(value) != len(kernel_keys): + return False + rank_sets = [] + for expected_key, item in zip(kernel_keys, value): + if not isinstance(item, dict) or set(item) != {"kernel_key", "rank_artifacts"}: + return False + rank_artifacts = item["rank_artifacts"] + if item["kernel_key"] != expected_key or not isinstance(rank_artifacts, list): + return False + ranks = [] + for artifact in rank_artifacts: + if not isinstance(artifact, dict) or set(artifact) != {"bytes", "rank", "sha256"}: + return False + rank, digest, size = artifact["rank"], artifact["sha256"], artifact["bytes"] + if ( + type(rank) is not int + or rank < 0 + or not isinstance(digest, str) + or not re.fullmatch(r"[0-9a-f]{64}", digest) + or type(size) is not int + or size <= 0 + ): + return False + ranks.append(rank) + if not ranks or ranks != list(range(len(ranks))): + return False + rank_sets.append(ranks) + return all(ranks == rank_sets[0] for ranks in rank_sets) + + +def backend_provenance_issues(backend: str, provenance: dict[str, Any]) -> list[str]: + unknown = [ + field for field, value in provenance.items() + if isinstance(value, str) and value.strip().lower() == "unknown" + ] + unresolved = [ + field for field in REQUIRED_BACKEND_PROVENANCE.get(backend, ()) + if not _resolved_provenance_value(field, provenance.get(field)) + ] + if backend == "deepep": + mode = provenance.get("mnnvl_comm") + allow = provenance.get("allow_mnnvl") + valid_modes = { + "not-requested": False, + "explicit-allow-mnnvl": True, + } + if type(allow) is not bool or valid_modes.get(mode) is not allow: + unresolved.append("mnnvl_comm") + if provenance.get("backend_lineage") != "deepep-v1": + unresolved.append("backend_lineage") + if backend in {"deepep", "uccl"}: + 
mode = provenance.get("mode") + num_nvl_bytes = provenance.get("num_nvl_bytes") + num_rdma_bytes = provenance.get("num_rdma_bytes") + if mode not in {"normal", "low-latency"}: + unresolved.append("mode") + if type(num_nvl_bytes) is not int or num_nvl_bytes < 0: + unresolved.append("num_nvl_bytes") + if type(num_rdma_bytes) is not int or num_rdma_bytes < 0: + unresolved.append("num_rdma_bytes") + if mode == "normal" and (type(num_nvl_bytes) is not int or num_nvl_bytes <= 0): + unresolved.append("num_nvl_bytes") + if mode == "low-latency": + if num_nvl_bytes != 0: + unresolved.append("num_nvl_bytes") + if type(num_rdma_bytes) is not int or num_rdma_bytes <= 0: + unresolved.append("num_rdma_bytes") + if ( + type(provenance.get("num_max_tokens_per_rank")) is not int + or provenance["num_max_tokens_per_rank"] <= 0 + ): + unresolved.append("num_max_tokens_per_rank") + if backend == "deepep" and ( + type(provenance.get("num_qps_per_rank")) is not int + or provenance["num_qps_per_rank"] <= 0 + ): + unresolved.append("num_qps_per_rank") + if backend == "deepep-v2": + for field in ("num_experts", "tuning_num_experts"): + if type(provenance.get(field)) is not int or provenance[field] <= 0: + unresolved.append(field) + if not _deepep_v2_jit_cubins_are_valid(provenance.get("jit_cubins")): + unresolved.append("jit_cubins") + if provenance.get("jit_random_seed") != "collectivex-deepep-v2-fa8a9b1": + unresolved.append("jit_random_seed") + unresolved.extend( + field for field, expected in DEEPEP_V2_V1_PROVENANCE.items() + if provenance.get(field) != expected + ) + policy = ( + provenance.get("allow_hybrid_mode"), + provenance.get("gin_enabled"), + provenance.get("communication_backend"), + ) + if policy not in { + (False, False, "nccl-device-lsa"), + (True, True, "nccl-gin"), + }: + unresolved.extend( + ("allow_hybrid_mode", "gin_enabled", "communication_backend") + ) + content_roles = { + "deepep-v2": {"deepep-extension", "nccl", "nvshmem"}, + "deepep-hybrid": {"deepep-extension", 
"deepep-hybrid-extension"}, + "uccl": { + "uccl-distribution", "uccl-wrapper", "intervaltree-distribution", + "sortedcontainers-distribution", "cuda-runtime", + }, + }.get(backend) + if content_roles is not None and not _content_evidence_is_valid( + provenance.get("loaded_libraries"), content_roles + ): + unresolved.append("loaded_libraries") + if backend in {"deepep-v2", "deepep-hybrid"} and not re.fullmatch( + r"[0-9a-f]{40}", str(provenance.get("deepep_tree", "")) + ): + unresolved.append("deepep_tree") + if backend == "deepep-hybrid" and provenance.get("backend_lineage") != "deepep-hybrid": + unresolved.append("backend_lineage") + if backend == "deepep-hybrid": + if not _hybrid_realized_config_is_valid(provenance.get("realized_config")): + unresolved.append("realized_config") + if not _hybrid_kernel_keys_are_valid(provenance.get("jit_kernel_keys")): + unresolved.append("jit_kernel_keys") + if not _hybrid_jit_evidence_is_valid( + provenance.get("jit_shared_objects"), provenance.get("jit_kernel_keys") + ): + unresolved.append("jit_shared_objects") + if backend == "uccl" and provenance.get("backend_lineage") != "uccl": + unresolved.append("backend_lineage") + if backend == "uccl" and provenance.get("uccl_dependency_versions") != ( + UCCL_DEPENDENCY_VERSIONS + ): + unresolved.append("uccl_dependency_versions") + if backend == "nccl-ep": + collective = provenance.get("collective_library") + if collective not in {"nccl", "rccl"}: + unresolved.append("collective_library") + if provenance.get("backend_lineage") != collective: + unresolved.append("backend_lineage") + if backend == "mori" and provenance.get("kernel_type") == "InterNodeV1": + expected = { + "block_num": 96, + "rdma_block_num": 64, + "dispatch_warps": 8, + "combine_warps": 8, + "num_qps": 1, + "use_external_inp_buf": True, + "gpus_per_node": 8, + } + unresolved.extend( + field for field, value in expected.items() + if provenance.get(field) != value + ) + for field, minimum in ( + ("num_nvl_bytes", 0), 
("num_rdma_bytes", 0), + ("num_qps_per_rank", 1), + ): + if field in provenance and ( + type(provenance[field]) is not int or provenance[field] < minimum + ): + unresolved.append(field) + if "rdma_block_num" in provenance and ( + type(provenance["rdma_block_num"]) is not int + or provenance["rdma_block_num"] < 0 + ): + unresolved.append("rdma_block_num") + if "use_external_inp_buf" in provenance and type( + provenance["use_external_inp_buf"] + ) is not bool: + unresolved.append("use_external_inp_buf") + return sorted(set(unknown + unresolved)) + + +def provenance_complete( + provenance: dict[str, Any], backend: str, git_run: dict[str, Any] | None, + *, allocation_stratum_sha256: Any, image_digest: Any, image_verified: Any, + squash_sha256: Any, +) -> bool: + image = str(image_digest or "") + squash = str(squash_sha256 or "") + allocation_stratum = str(allocation_stratum_sha256 or "") + return ( + not backend_provenance_issues(backend, provenance) + and bool(re.fullmatch(r"[0-9a-f]{64}", allocation_stratum)) + and image_verified is True + and bool(re.fullmatch(r"sha256:[0-9a-f]{64}", image)) + and bool(re.fullmatch(r"[0-9a-f]{64}", squash)) + and isinstance(git_run, dict) + and all(git_run.get(field) for field in GIT_RUN_FIELDS) + ) + + +def strict_load(path: str | os.PathLike[str]) -> Any: + """Load JSON while rejecting duplicate keys and non-finite constants.""" + def pairs(items): + result = {} + for key, value in items: + if key in result: + raise ContractError(f"duplicate JSON key {key!r}") + result[key] = value + return result + + def constant(value): + raise ContractError(f"non-finite JSON number {value}") + + try: + with open(path) as handle: + return json.load(handle, object_pairs_hook=pairs, parse_constant=constant) + except (OSError, json.JSONDecodeError) as exc: + raise ContractError(f"invalid JSON {path}: {exc}") from exc + + +def canonical_json_bytes(value: Any) -> bytes: + """Canonical finite JSON bytes for checksums and immutable artifacts.""" + 
_finite_tree(value) + try: + return json.dumps( + value, allow_nan=False, ensure_ascii=False, sort_keys=True, + separators=(",", ":"), + ).encode("utf-8") + except (TypeError, ValueError) as exc: + raise ContractError(f"value is not canonical JSON: {exc}") from exc + + +def content_manifest_evidence( + *, role: str, name: str, files: Iterable[tuple[str, str | os.PathLike[str]]] +) -> dict[str, str]: + """Hash a labeled file set without exposing any host path in provenance.""" + if not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9_.+-]{0,127}", role): + raise ContractError("content evidence role is invalid") + if not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9_.+-]{0,159}", name): + raise ContractError("content evidence name is invalid") + manifest: list[dict[str, Any]] = [] + labels: set[str] = set() + for label, raw_path in files: + logical = PurePosixPath(label) + if ( + not label + or logical.is_absolute() + or ".." in logical.parts + or label in labels + or any(ord(character) < 0x20 or ord(character) > 0x7E for character in label) + ): + raise ContractError("content evidence label is invalid or duplicated") + path = Path(raw_path) + if not path.is_file(): + raise ContractError("content evidence source is not a file") + digest = hashlib.sha256() + size = 0 + with path.open("rb") as handle: + for chunk in iter(lambda: handle.read(1024 * 1024), b""): + digest.update(chunk) + size += len(chunk) + labels.add(label) + manifest.append({"bytes": size, "label": label, "sha256": digest.hexdigest()}) + if not manifest: + raise ContractError("content evidence cannot be empty") + digest = hashlib.sha256( + canonical_json_bytes(sorted(manifest, key=lambda item: item["label"])) + ).hexdigest() + return {"name": name, "role": role, "sha256": digest} + + +def _obj(value: Any, path: str) -> dict[str, Any]: + if not isinstance(value, dict): + raise ContractError(f"{path} must be an object") + return value + + +def _keys(value: Any, expected: set[str], path: str) -> dict[str, Any]: + obj = 
_obj(value, path) + actual = set(obj) + if actual != expected: + raise ContractError( + f"{path} fields differ: missing={sorted(expected - actual)}, " + f"extra={sorted(actual - expected)}" + ) + return obj + + +def _text(value: Any, path: str, *, nullable: bool = False) -> str | None: + if nullable and value is None: + return None + if not isinstance(value, str) or not value: + raise ContractError(f"{path} must be a non-empty string") + return value + + +def _integer(value: Any, path: str, *, minimum: int = 0) -> int: + if type(value) is not int or value < minimum: + raise ContractError(f"{path} must be an integer >= {minimum}") + return value + + +def validate_conditioning_contract(value: Any, phase: str) -> dict[str, Any]: + """Validate the exact phase-specific v1 conditioning schedule.""" + if phase not in V1_CONDITIONING_LADDERS: + raise ContractError("raw conditioning phase is invalid") + conditioning = _keys( + value, {"contract", "ladder", "roundtrips_per_shape"}, + "raw.measurement.conditioning", + ) + ladder = conditioning["ladder"] + if ( + conditioning["contract"] != identity.V1_CASE_PROFILE["conditioning_contract"] + or type(ladder) is not list + or any(type(point) is not int for point in ladder) + or ladder != list(V1_CONDITIONING_LADDERS[phase]) + or _integer( + conditioning["roundtrips_per_shape"], + "raw.measurement.conditioning.roundtrips_per_shape", + minimum=1, + ) != V1_CONDITIONING_ROUNDS_PER_SHAPE + ): + raise ContractError(f"raw {phase} conditioning contract differs") + return conditioning + + +def _number(value: Any, path: str, *, minimum: float | None = None) -> float: + if isinstance(value, bool) or not isinstance(value, (int, float)) or not math.isfinite(value): + raise ContractError(f"{path} must be finite") + result = float(value) + if minimum is not None and result < minimum: + raise ContractError(f"{path} must be >= {minimum}") + return result + + +def _finite_tree(value: Any, path: str = "$") -> None: + if isinstance(value, float) 
and not math.isfinite(value): + raise ContractError(f"{path} contains a non-finite number") + if isinstance(value, list): + for index, item in enumerate(value): + _finite_tree(item, f"{path}[{index}]") + elif isinstance(value, dict): + for key, item in value.items(): + _finite_tree(item, f"{path}.{key}") + + +def _typed(value: Any, kind: str, path: str) -> str: + if not identity.is_typed_id(value, kind): + raise ContractError(f"{path} is not a {kind} ID") + return value + + +def _sha256_json(value: Any) -> str: + payload = json.dumps( + value, allow_nan=False, ensure_ascii=False, sort_keys=True, separators=(",", ":") + ).encode() + return hashlib.sha256(payload).hexdigest() + + +def _precision_byte_provenance( + axis: dict[str, Any], logical_copies: int, hidden: int +) -> dict[str, Any]: + bits_per_value = { + "bf16": 16, + "fp8-e4m3fn": 8, + "fp8-e4m3fnuz": 8, + "logfmt10": 10, + }.get(axis["communication_format"]) + if bits_per_value is None: + raise ContractError("unknown communication precision format") + scale_size = {None: 0, "f32": 4, "implicit-logfmt10": 0}.get(axis["scale_dtype"]) + if scale_size is None: + raise ContractError("unknown communication scale dtype") + group_size = axis["scale_group_size"] + groups = math.ceil(hidden / group_size) if group_size is not None else 0 + activation = logical_copies * math.ceil(hidden * bits_per_value / 8) + scales = logical_copies * groups * scale_size + return { + "accounting_contract": "activation-data-plus-scales-v1", + "activation_data_bytes": activation, + "scale_bytes": scales, + "total_logical_bytes": activation + scales, + } + + +@lru_cache(maxsize=None) +def _expected_eplb_calibration( + routing: str, + hidden: int, + topk: int, + logical_experts: int, + physical_experts: int, + ep_size: int, + seed: int, + reference_tokens_per_rank: int, +) -> tuple[dict[str, Any], dict[str, Any]]: + member, checksums, indices, _ = workload_contract.canonical_eplb_calibration_member( + routing, + hidden, + topk, + 
logical_experts, + ep_size, + reference_tokens_per_rank, + seed, + ) + load = [0] * logical_experts + for row in indices: + for expert in row: + load[expert] += 1 + plan = eplb_contract.build_plan(load, physical_experts, ep_size) + descriptor = { + "calibration_token_offset": workload_contract.EPLB_CALIBRATION_TOKEN_OFFSET, + "calibration_trace_sha256": checksums["trace"], + "calibration_window": workload_contract.EPLB_CALIBRATION_WINDOW, + "calibration_workload_id": member, + } + return plan, descriptor + + +@lru_cache(maxsize=None) +def _expected_eplb_plan( + routing: str, + topk: int, + logical_experts: int, + physical_experts: int, + ep_size: int, + seed: int, + reference_tokens_per_rank: int, + hidden: int = 7168, +) -> dict[str, Any]: + """Compatibility wrapper returning the disjoint calibration plan.""" + plan, _ = _expected_eplb_calibration( + routing, + hidden, + topk, + logical_experts, + physical_experts, + ep_size, + seed, + reference_tokens_per_rank, + ) + return plan + + +@lru_cache(maxsize=None) +def _expected_canonical_trace( + routing: str, + hidden: int, + topk: int, + logical_experts: int, + physical_experts: int, + ep_size: int, + tokens_per_rank: int, + seed: int, + eplb_enabled: bool, + reference_tokens_per_rank: int, +) -> tuple[str, dict[str, str], str, list[list[int]], list[list[float]]]: + member, checksums, indices, weights = workload_contract.canonical_member( + routing, + hidden, + topk, + logical_experts, + ep_size, + tokens_per_rank, + seed, + ) + if eplb_enabled: + plan = _expected_eplb_plan( + routing, + topk, + logical_experts, + physical_experts, + ep_size, + seed, + reference_tokens_per_rank, + hidden, + ) + indices = eplb_contract.remap_rows(indices, plan) + routing_hash = workload_contract.trace_checksums(indices, weights)["trace"] + return member, checksums, routing_hash, indices, weights + + +def _coefficient_of_variation(values: list[int]) -> float: + mean = sum(values) / len(values) + if mean == 0: + return 0.0 + variance = 
sum((value - mean) ** 2 for value in values) / len(values) + return variance**0.5 / mean + + +def _expected_routing_summary( + indices: list[list[int]], + weights: list[list[float]], + *, + physical_experts: int, + ep_size: int, + tokens_per_rank: int, + gpus_per_node: int, + scale_up_domain: int, +) -> dict[str, Any]: + """Recompute every published routing/load statistic without torch.""" + experts_per_rank = physical_experts // ep_size + expert_load = [0] * physical_experts + assignment_load = [0] * ep_size + payload_load = [0] * ep_size + fanouts: list[int] = [] + local = same_node = same_domain = copies = 0 + for token, row in enumerate(indices): + destinations = {expert // experts_per_rank for expert in row} + source = token // tokens_per_rank + fanouts.append(len(destinations)) + for expert in row: + expert_load[expert] += 1 + assignment_load[expert // experts_per_rank] += 1 + for destination in destinations: + payload_load[destination] += 1 + copies += 1 + local += destination == source + same_node += destination // gpus_per_node == source // gpus_per_node + same_domain += destination // scale_up_domain == source // scale_up_domain + fanout_histogram = [fanouts.count(value) for value in range(1, ep_size + 1)] + expert_mean = sum(expert_load) / len(expert_load) + return { + "empty_expert_count": expert_load.count(0), + "empty_rank_count": payload_load.count(0), + "expert_assignment_rank_cv": _coefficient_of_variation(assignment_load), + "expert_assignments_per_rank": assignment_load, + "expert_load_cv": _coefficient_of_variation(expert_load), + "expert_load_max": max(expert_load), + "expert_load_mean": expert_mean, + "expert_load_min": min(expert_load), + "fanout_histogram": fanout_histogram, + "fanout_max": max(fanouts), + "fanout_mean": sum(fanouts) / len(fanouts), + "fanout_min": min(fanouts), + "hash": workload_contract.trace_checksums(indices, weights)["trace"], + "hotspot_ratio": max(expert_load) / expert_mean if expert_mean else 0.0, + "locality": { + 
"placement": "packed", + "local_rank_fraction": local / copies, + "same_node_fraction": same_node / copies, + "same_scaleup_domain_fraction": same_domain / copies, + "cross_node_fraction": 1 - same_node / copies, + "cross_domain_fraction": 1 - same_domain / copies, + "gpus_per_node": gpus_per_node, + "scale_up_domain": scale_up_domain, + "copies": copies, + }, + "payload_copies_per_rank": payload_load, + "payload_rank_cv": _coefficient_of_variation(payload_load), + "routed_copies": copies, + "source_token_stats": { + "min": tokens_per_rank, + "mean": float(tokens_per_rank), + "max": tokens_per_rank, + "cv": 0.0, + "empty_ranks": 0, + "total": tokens_per_rank * ep_size, + "ranks": ep_size, + }, + } + + +def _expected_histogram(samples: list[float], bins: int = 40) -> dict[str, Any]: + low, high = min(samples), max(samples) + if high <= low: + return {"n": len(samples), "min": low, "max": high, "bins": bins, "counts": [len(samples)]} + counts = [0] * bins + span = high - low + for sample in samples: + index = min(bins - 1, int((sample - low) / span * bins)) + counts[index] += 1 + return { + "n": len(samples), + "min": round(low, 3), + "max": round(high, 3), + "bins": bins, + "counts": counts, + } + + +def _expected_anomalies( + tokens: int, components: dict[str, Any] +) -> list[dict[str, Any]]: + dispatch = components["dispatch"]["percentiles_us"] + stage = components["stage"]["percentiles_us"] + combine = components["combine"]["percentiles_us"] + roundtrip = components["roundtrip"]["percentiles_us"] + isolated = components["isolated_sum"]["percentiles_us"] + anomalies: list[dict[str, Any]] = [] + if isolated is not None and roundtrip["p99"] > 3.0 * isolated["p99"]: + anomalies.append({ + "type": "roundtrip_gt_isolated_sum", + "T": tokens, + "roundtrip_p99": round(roundtrip["p99"], 2), + "isolated_sum_p99": round(isolated["p99"], 2), + "ratio": round(roundtrip["p99"] / isolated["p99"], 2), + "threshold": 3.0, + }) + floor = ( + max(dispatch["p50"], combine["p50"], 
stage["p50"] if stage is not None else 0.0) + if dispatch and combine else None + ) + if floor and roundtrip["p50"] < 0.95 * floor: + anomalies.append({ + "type": "roundtrip_lt_component_floor", + "T": tokens, + "roundtrip_p50": round(roundtrip["p50"], 2), + "component_floor_p50": round(floor, 2), + }) + return anomalies + + +def _validate_canonical_workload( + workload: dict[str, Any], + scheduled_case: dict[str, Any], + rows: list[dict[str, Any]], + eplb: dict[str, Any], +) -> None: + """Bind every canonical member and measured routing hash to its scheduled token row.""" + profile = identity.profile_for_case(scheduled_case) + if eplb["enabled"]: + plan = _expected_eplb_plan( + scheduled_case["routing"], + scheduled_case["topk"], + scheduled_case["experts"], + eplb["num_physical_experts"], + scheduled_case["ep"], + profile["seed"], + profile["eplb_reference_tokens_per_rank"], + scheduled_case["hidden"], + ) + if eplb["mapping_hash"] != eplb_contract.mapping_hash(plan): + raise ContractError("raw EPLB mapping differs from the frozen canonical plan") + + expected: dict[str, dict[str, str]] = {} + for index, row in enumerate(rows): + member, checksums, routing_hash, _, _ = _expected_canonical_trace( + scheduled_case["routing"], + scheduled_case["hidden"], + scheduled_case["topk"], + scheduled_case["experts"], + eplb["num_physical_experts"], + scheduled_case["ep"], + row["tokens_per_rank"], + profile["seed"], + eplb["enabled"], + profile["eplb_reference_tokens_per_rank"], + ) + if row["routing"]["hash"] != routing_hash: + raise ContractError( + f"raw.measurement.rows[{index}].routing.hash differs from its canonical member" + ) + expected[member] = checksums + if ( + len(expected) != len(rows) + or workload["members"] != sorted(expected) + or workload["manifest_checksums"] != expected + ): + raise ContractError("raw canonical member set/checksums differ from scheduled rows") + expected_workload_id = identity.workload_id({ + "members": [ + {"checksums": 
expected[member], "workload_id": member} + for member in sorted(expected) + ] + }) + if workload["workload_id"] != expected_workload_id: + raise ContractError("raw composite workload identity differs from scheduled rows") + + +def _nearest_rank(samples: list[float], q: int) -> float: + ordered = sorted(samples) + return ordered[max(0, min(len(ordered) - 1, math.ceil(q / 100 * len(ordered)) - 1))] + + +def _close(observed: Any, expected: float, path: str, tolerance: float = 1e-6) -> None: + value = _number(observed, path) + if not math.isclose(value, expected, rel_tol=tolerance, abs_tol=tolerance): + raise ContractError(f"{path}={value} differs from recomputed {expected}") + + +def _equivalent( + observed: Any, expected: Any, path: str, *, tolerance: float = 1e-6 +) -> None: + """Compare a recomputed JSON subtree while allowing only float roundoff.""" + if isinstance(expected, dict): + value = _keys(observed, set(expected), path) + for key, child in expected.items(): + _equivalent(value[key], child, f"{path}.{key}", tolerance=tolerance) + return + if isinstance(expected, list): + if not isinstance(observed, list) or len(observed) != len(expected): + raise ContractError(f"{path} differs from recomputed evidence") + for index, child in enumerate(expected): + _equivalent(observed[index], child, f"{path}[{index}]", tolerance=tolerance) + return + if isinstance(expected, float): + _close(observed, expected, path, tolerance) + return + if type(observed) is not type(expected) or observed != expected: + raise ContractError(f"{path} differs from recomputed evidence") + + +def _schema_equal(left: Any, right: Any) -> bool: + """JSON Schema equality: booleans are distinct from numbers.""" + if isinstance(left, bool) or isinstance(right, bool): + return type(left) is type(right) and left == right + if isinstance(left, dict) and isinstance(right, dict): + return set(left) == set(right) and all( + _schema_equal(left[key], right[key]) for key in left + ) + if isinstance(left, list) 
and isinstance(right, list): + return len(left) == len(right) and all( + _schema_equal(a, b) for a, b in zip(left, right, strict=True) + ) + return left == right + + +def _schema_ref(root: dict[str, Any], reference: str) -> dict[str, Any]: + if not reference.startswith("#/"): + raise ContractError("native artifact schema contains a non-local reference") + value: Any = root + for part in reference[2:].split("/"): + part = part.replace("~1", "/").replace("~0", "~") + if not isinstance(value, dict) or part not in value: + raise ContractError("native artifact schema contains a broken reference") + value = value[part] + if not isinstance(value, dict): + raise ContractError("native artifact schema reference is not an object") + return value + + +def _schema_type_matches(value: Any, expected: str) -> bool: + if expected == "null": + return value is None + if expected == "boolean": + return type(value) is bool + if expected == "object": + return isinstance(value, dict) + if expected == "array": + return isinstance(value, list) + if expected == "string": + return isinstance(value, str) + if expected == "number": + return ( + not isinstance(value, bool) + and isinstance(value, (int, float)) + and math.isfinite(value) + ) + if expected == "integer": + return ( + not isinstance(value, bool) + and isinstance(value, (int, float)) + and math.isfinite(value) + and float(value).is_integer() + ) + raise ContractError(f"native artifact schema uses unsupported type {expected!r}") + + +def _validate_schema_value( + value: Any, schema: dict[str, Any], root: dict[str, Any], path: str +) -> None: + """Validate the bounded JSON Schema subset used by native artifact contracts.""" + if "$ref" in schema: + _validate_schema_value(value, _schema_ref(root, schema["$ref"]), root, path) + return + if "oneOf" in schema: + matches = 0 + for candidate in schema["oneOf"]: + try: + _validate_schema_value(value, candidate, root, path) + except ContractError: + continue + matches += 1 + if matches != 1: 
+ raise ContractError(f"{path} must match exactly one native schema alternative") + return + expected_type = schema.get("type") + if expected_type is not None and not _schema_type_matches(value, expected_type): + raise ContractError(f"{path} is not a schema {expected_type}") + if "const" in schema and not _schema_equal(value, schema["const"]): + raise ContractError(f"{path} differs from its schema constant") + if "enum" in schema and not any(_schema_equal(value, item) for item in schema["enum"]): + raise ContractError(f"{path} is outside its schema enum") + + if isinstance(value, dict): + required = set(schema.get("required", ())) + properties = schema.get("properties", {}) + missing = required - set(value) + if missing: + raise ContractError(f"{path} lacks schema fields {sorted(missing)}") + additional = schema.get("additionalProperties", True) + extra = set(value) - set(properties) + if additional is False and extra: + raise ContractError(f"{path} has extra schema fields {sorted(extra)}") + for key, item in value.items(): + if key in properties: + _validate_schema_value(item, properties[key], root, f"{path}.{key}") + elif isinstance(additional, dict): + _validate_schema_value(item, additional, root, f"{path}.{key}") + property_names = schema.get("propertyNames") + if property_names is not None: + for key in value: + _validate_schema_value(key, property_names, root, f"{path}.") + + if isinstance(value, list): + if len(value) < schema.get("minItems", 0): + raise ContractError(f"{path} has too few schema items") + maximum = schema.get("maxItems") + if maximum is not None and len(value) > maximum: + raise ContractError(f"{path} has too many schema items") + if schema.get("uniqueItems") and any( + _schema_equal(item, prior) + for index, item in enumerate(value) + for prior in value[:index] + ): + raise ContractError(f"{path} schema items are not unique") + if "items" in schema: + for index, item in enumerate(value): + _validate_schema_value(item, schema["items"], 
root, f"{path}[{index}]") + + if isinstance(value, str): + if len(value) < schema.get("minLength", 0): + raise ContractError(f"{path} is shorter than its schema minimum") + maximum = schema.get("maxLength") + if maximum is not None and len(value) > maximum: + raise ContractError(f"{path} is longer than its schema maximum") + if "pattern" in schema and re.search(schema["pattern"], value) is None: + raise ContractError(f"{path} does not match its schema pattern") + if schema.get("format") == "date-time": + try: + parsed = dt.datetime.fromisoformat(value.replace("Z", "+00:00")) + except ValueError as exc: + raise ContractError(f"{path} is not a schema date-time") from exc + if parsed.tzinfo is None: + raise ContractError(f"{path} schema date-time lacks a timezone") + + if ( + not isinstance(value, bool) + and isinstance(value, (int, float)) + and math.isfinite(value) + ): + if "minimum" in schema and value < schema["minimum"]: + raise ContractError(f"{path} is below its schema minimum") + if "maximum" in schema and value > schema["maximum"]: + raise ContractError(f"{path} is above its schema maximum") + + +def _validate_native_schema(name: str, value: Any) -> None: + schema = _SCHEMA_CACHE.get(name) + if schema is None: + loaded = strict_load(SCHEMA_DIR / name) + if not isinstance(loaded, dict): + raise ContractError(f"native artifact schema {name} is not an object") + schema = loaded + _SCHEMA_CACHE[name] = schema + _validate_schema_value(value, schema, schema, "$") + + +def validate_samples_document(document: Any) -> dict[str, Any]: + _validate_native_schema("samples-v1.schema.json", document) + doc = _keys( + document, + {"allocation_id", "attempt_id", "case_id", "format", "points", + "qualification_index", "sampling", "schema_version", "series_id"}, + "samples", + ) + if doc["format"] != SAMPLES_FORMAT or doc["schema_version"] != 1: + raise ContractError("samples format/schema differs from v1") + for field, kind in ( + ("allocation_id", "allocation"), 
("attempt_id", "attempt"), + ("case_id", "case"), ("series_id", "series"), + ): + _typed(doc[field], kind, f"samples.{field}") + qualification_index = _integer( + doc["qualification_index"], "samples.qualification_index", minimum=1 + ) + if qualification_index > 3: + raise ContractError("samples.qualification_index must be in 1..3") + sampling = _keys( + doc["sampling"], {"iterations_per_trial", "reduction", "trials"}, "samples.sampling" + ) + if ( + _integer(sampling["iterations_per_trial"], "samples.sampling.iterations_per_trial", minimum=1) != 8 + or _integer(sampling["trials"], "samples.sampling.trials", minimum=1) != 64 + or sampling["reduction"] != identity.V1_CASE_PROFILE["rank_reduction"] + ): + raise ContractError("samples must use the fixed 8x64 cross-rank-max contract") + points = doc["points"] + if not isinstance(points, list) or not points: + raise ContractError("samples.points must be non-empty") + seen = set() + for index, point_value in enumerate(points): + path = f"samples.points[{index}]" + point = _keys( + point_value, + {"components", "evidence_id", "point_id", "sample_sha256", "tokens_per_rank"}, + path, + ) + tokens = _integer(point["tokens_per_rank"], f"{path}.tokens_per_rank", minimum=1) + if tokens in seen: + raise ContractError(f"duplicate sample token point {tokens}") + seen.add(tokens) + _typed(point["point_id"], "point", f"{path}.point_id") + _typed(point["evidence_id"], "evidence", f"{path}.evidence_id") + components = _keys( + point["components"], {"combine", "dispatch", "roundtrip", "stage"}, + f"{path}.components", + ) + for name, component_value in components.items(): + component = _keys( + component_value, {"availability", "sample_count", "trials"}, + f"{path}.components.{name}", + ) + availability = component["availability"] + count = _integer(component["sample_count"], f"{path}.components.{name}.sample_count") + trials = component["trials"] + if availability == "unavailable": + if count != 0 or trials is not None or name == 
"roundtrip": + raise ContractError(f"{path}.components.{name} has invalid unavailability") + continue + if availability != "measured" or not isinstance(trials, list) or len(trials) != 64: + raise ContractError(f"{path}.components.{name} must contain 64 measured trials") + if any(not isinstance(trial, list) or len(trial) != 8 for trial in trials): + raise ContractError(f"{path}.components.{name} trials must each contain 8 samples") + flattened = [ + _number(sample, f"{path}.components.{name}.trials", minimum=0.0) + for trial in trials for sample in trial + ] + if count != 512 or len(flattened) != 512: + raise ContractError(f"{path}.components.{name} must contain 512 samples") + sample_base = {"components": components, "tokens_per_rank": tokens} + if point["sample_sha256"] != _sha256_json(sample_base): + raise ContractError(f"{path}.sample_sha256 differs") + return doc + + +def _validate_component( + component_value: Any, + sample_component: dict[str, Any] | None, + path: str, + *, + derived: bool = False, +) -> None: + component = _keys( + component_value, {"availability", "origin", "percentiles_us", "sample_count"}, path + ) + availability = component["availability"] + if availability == "unavailable": + if component != { + "availability": "unavailable", "origin": None, + "percentiles_us": None, "sample_count": 0, + }: + raise ContractError(f"{path} has invalid unavailable representation") + if sample_component and sample_component["availability"] != "unavailable": + raise ContractError(f"{path} disagrees with samples") + return + expected_availability = "derived" if derived else "measured" + expected_origin = "derived-percentile-sum" if derived else "measured" + if availability != expected_availability or component["origin"] != expected_origin: + raise ContractError(f"{path} has invalid availability/origin") + percentiles = _keys(component["percentiles_us"], set(PERCENTILES), f"{path}.percentiles_us") + if derived: + if component["sample_count"] != 0: + raise 
ContractError(f"{path}.sample_count must be zero for a derived value") + return + if sample_component is None or sample_component["availability"] != "measured": + raise ContractError(f"{path} lacks measured sample evidence") + flattened = [sample for trial in sample_component["trials"] for sample in trial] + if component["sample_count"] != len(flattened): + raise ContractError(f"{path}.sample_count differs from exact samples") + for name, percentile in zip(PERCENTILES, (50, 90, 95, 99), strict=True): + _close(percentiles[name], _nearest_rank(flattened, percentile), f"{path}.{name}") + + +def _validate_oracle( + value: Any, path: str, profile: dict[str, Any] | None = None +) -> dict[str, Any]: + profile = profile or identity.V1_NORMAL_CASE_PROFILE + oracle = _keys( + value, + {"atol", "checks", "combine_weight_semantics", "contract", "dispatch_sha256", + "max_absolute_error", "max_elementwise_relative_error", "max_relative_error", + "max_weight_error", "order_sha256", "ordering_contract", "passed", "receive_count", + "rtol"}, + path, + ) + if oracle["contract"] != profile["oracle_contract"]: + raise ContractError(f"{path}.contract differs") + checks = _keys( + oracle["checks"], + {"combine_values", "counts", "metadata", "multiplicity", "payload", "source_set", + "weights"}, + f"{path}.checks", + ) + if any(type(value) is not bool for value in checks.values()): + raise ContractError(f"{path}.checks must be boolean") + if type(oracle["passed"]) is not bool: + raise ContractError(f"{path}.passed must be boolean") + _integer(oracle["receive_count"], f"{path}.receive_count") + _text(oracle["ordering_contract"], f"{path}.ordering_contract") + expected_weight_semantics = ( + "gate-weighted-sum" + if profile["combine_semantics"] == "gate-weighted" + else "unweighted-rank-sum" + ) + if oracle["combine_weight_semantics"] != expected_weight_semantics: + raise ContractError(f"{path}.combine_weight_semantics differs from v1") + _close(oracle["rtol"], 5e-2, f"{path}.rtol") + 
def _validate_precision_evidence(
    value: Any, profile_id: str, communication_precision: dict[str, Any], path: str
) -> dict[str, Any]:
    """Check the per-direction precision evidence of one correctness record.

    Args:
        value: The precision object; must carry exactly ``combine``,
            ``dispatch``, ``passed`` and ``profile_id``.
        profile_id: The precision profile id the record must name.
        communication_precision: Per-direction precision descriptors; only
            ``[direction]["scale_dtype"]`` is read here, to decide whether the
            scale fields must be booleans or null.
        path: Location of the object, used only in error messages.

    Returns:
        The validated precision object as returned by ``_keys``.

    Raises:
        ContractError: If any field has the wrong type or range, or if a
            ``passed`` flag disagrees with the evidence it summarises.
    """
    precision = _keys(value, {"combine", "dispatch", "passed", "profile_id"}, path)
    if precision["profile_id"] != profile_id or type(precision["passed"]) is not bool:
        raise ContractError(f"{path} profile/outcome differs")

    axis_fields = {
        "dequantized_semantics", "encoded_payload_valid", "max_abs_error",
        "max_rel_error", "passed", "saturation_count", "saturation_rate",
        "scales_finite", "scales_positive",
    }
    always_boolean = ("dequantized_semantics", "encoded_payload_valid", "passed")
    scale_flags = ("scales_finite", "scales_positive")

    for direction in ("dispatch", "combine"):
        axis_path = f"{path}.{direction}"
        axis = _keys(precision[direction], axis_fields, axis_path)

        for field in always_boolean:
            if type(axis[field]) is not bool:
                raise ContractError(f"{axis_path}.{field} must be boolean")

        # Scale flags are booleans when the direction has a scale dtype,
        # and must be null otherwise.
        expects_scales = communication_precision[direction]["scale_dtype"] is not None
        if expects_scales:
            for field in scale_flags:
                if type(axis[field]) is not bool:
                    raise ContractError(f"{axis_path}.{field} must be boolean")
        else:
            for field in scale_flags:
                if axis[field] is not None:
                    raise ContractError(f"{axis_path}.{field} must be null without scales")

        saturation_count = _integer(
            axis["saturation_count"], f"{axis_path}.saturation_count"
        )
        saturation_rate = _number(
            axis["saturation_rate"], f"{axis_path}.saturation_rate", minimum=0.0
        )
        if saturation_rate > 1.0:
            raise ContractError(f"{axis_path}.saturation_rate must be <= 1")
        _number(axis["max_abs_error"], f"{axis_path}.max_abs_error", minimum=0.0)
        _number(axis["max_rel_error"], f"{axis_path}.max_rel_error", minimum=0.0)

        # Recompute the direction outcome from its own evidence fields.
        scales_ok = True
        if expects_scales:
            scales_ok = axis["scales_finite"] and axis["scales_positive"]
        should_pass = bool(
            axis["encoded_payload_valid"]
            and axis["dequantized_semantics"]
            and scales_ok
            and saturation_count >= 0
        )
        if axis["passed"] != should_pass:
            raise ContractError(f"{axis_path}.passed differs from its evidence")

    # The overall flag is the conjunction of the two direction flags.
    overall = precision["dispatch"]["passed"] and precision["combine"]["passed"]
    if precision["passed"] != overall:
        raise ContractError(f"{path}.passed differs from direction evidence")
    return precision
"allocation"), ("attempt_id", "attempt"), + ("case_id", "case"), ("series_id", "series"), + ): + _typed(identifiers[field], kind, f"raw.identity.{field}") + ordinal = _integer(identifiers["attempt_ordinal"], "raw.identity.attempt_ordinal", minimum=1) + allocation_factors = _keys( + identifiers["allocation_factors"], ALLOCATION_FACTOR_FIELDS, + "raw.identity.allocation_factors", + ) + qualification_index = _integer( + allocation_factors["qualification_index"], + "raw.identity.allocation_factors.qualification_index", + minimum=1, + ) + if qualification_index > 3: + raise ContractError("raw qualification index must be in 1..3") + case_factors = _keys( + identifiers["case_factors"], {"case", "profile", "sku"}, + "raw.identity.case_factors", + ) + scheduled_case = _scheduled_case( + case_factors["case"], "raw.identity.case_factors.case" + ) + profile = scheduled_case_profile(scheduled_case, "raw.identity.case_factors.case") + if case_factors["profile"] != profile: + raise ContractError("raw case profile differs from CollectiveX v1") + _text(case_factors["sku"], "raw.identity.case_factors.sku") + series_factors = _keys( + identifiers["series_factors"], + {"backend", "case_id", "image_digest", "implementation_contract_sha256", + "public_config_sha256", "routing_control_sha256", + "runtime_fingerprint_sha256", "source_sha", "squash_sha256", "workload_id"}, + "raw.identity.series_factors", + ) + if identity.allocation_id(identifiers["allocation_factors"]) != identifiers["allocation_id"]: + raise ContractError("allocation identity differs") + if identity.digest("case", identifiers["case_factors"]) != identifiers["case_id"]: + raise ContractError("case identity differs") + if identity.series_id(identifiers["series_factors"]) != identifiers["series_id"]: + raise ContractError("series identity differs") + if identity.attempt_id( + allocation=identifiers["allocation_id"], case=identifiers["case_id"], ordinal=ordinal + ) != identifiers["attempt_id"]: + raise 
ContractError("attempt identity differs") + + samples = validate_samples_document(samples_document) + for field in ("allocation_id", "attempt_id", "case_id", "series_id"): + if samples[field] != identifiers[field]: + raise ContractError(f"samples.{field} differs from raw identity") + if samples["qualification_index"] != qualification_index: + raise ContractError("samples qualification index differs from raw allocation") + sample_by_token = {point["tokens_per_rank"]: point for point in samples["points"]} + + case = _keys( + doc["case"], + {"attempt_ordinal", "backend", "eplb", "ep_size", "mode", "phase", + "required_publication", "resource_mode", "runner", "shape", "suite", "workload_name"}, + "raw.case", + ) + ep_size = _integer(case["ep_size"], "raw.case.ep_size", minimum=1) + if case["attempt_ordinal"] != ordinal: + raise ContractError("case attempt ordinal differs") + for field in ("backend", "mode", "phase", "required_publication", "resource_mode", "runner", + "suite", "workload_name"): + _text(case[field], f"raw.case.{field}") + shape = _keys( + case["shape"], + {"activation_profile", "combine_precision", "dispatch_precision", "eplb", "experts", + "experts_per_rank", "hidden", "kernel_gen", "num_logical_experts", + "precision_profile", "routing", "topk"}, + "raw.case.shape", + ) + hidden = _integer(shape["hidden"], "raw.case.shape.hidden", minimum=1) + topk = _integer(shape["topk"], "raw.case.shape.topk", minimum=1) + physical_experts = _integer( + shape["experts"], "raw.case.shape.experts", minimum=1 + ) + logical_experts = _integer( + shape["num_logical_experts"], + "raw.case.shape.num_logical_experts", + minimum=1, + ) + experts_per_rank = _integer( + shape["experts_per_rank"], "raw.case.shape.experts_per_rank", minimum=1 + ) + precision_profile_id = scheduled_case.get( + "precision_profile", identity.V1_CONTROL_PRECISION_PROFILE + ) + communication_precision = identity.precision_profile(precision_profile_id) + if ( + shape["precision_profile"] != 
precision_profile_id + or shape["dispatch_precision"] != communication_precision["dispatch"] + or shape["combine_precision"] != communication_precision["combine"] + ): + raise ContractError("raw communication precision differs from scheduled case") + eplb = _keys( + case["eplb"], + {"calibration_token_offset", "calibration_trace_sha256", "calibration_window", + "calibration_workload_id", "enabled", "imbalance_after", "imbalance_before", + "mapping_hash", "max_replicas", "num_logical_experts", "num_physical_experts", + "num_redundant", "planner", "reference_tokens_per_rank", "replicated_experts"}, + "raw.case.eplb", + ) + if not isinstance(eplb["enabled"], bool): + raise ContractError("raw.case.eplb.enabled must be boolean") + expected_redundant = ( + profile["eplb_redundant_experts"] if eplb["enabled"] else 0 + ) + expected_physical = eplb_contract.physical_count( + scheduled_case["experts"], expected_redundant, ep_size + ) + if ( + shape["eplb"] != eplb["enabled"] + or logical_experts != scheduled_case["experts"] + or physical_experts != expected_physical + or experts_per_rank * ep_size != physical_experts + or eplb["num_logical_experts"] != logical_experts + or eplb["num_physical_experts"] != physical_experts + or eplb["num_redundant"] != expected_redundant + ): + raise ContractError("raw EPLB/shape dimensions differ from the frozen profile") + if eplb["enabled"]: + expected_plan, calibration_descriptor = _expected_eplb_calibration( + scheduled_case["routing"], + hidden, + topk, + logical_experts, + physical_experts, + ep_size, + profile["seed"], + profile["eplb_reference_tokens_per_rank"], + ) + expected_eplb = { + **calibration_descriptor, + "enabled": True, + "imbalance_after": expected_plan["imbalance_after"], + "imbalance_before": expected_plan["imbalance_before"], + "mapping_hash": eplb_contract.mapping_hash(expected_plan), + "max_replicas": expected_plan["max_replicas"], + "num_logical_experts": logical_experts, + "num_physical_experts": physical_experts, 
+ "num_redundant": expected_redundant, + "planner": profile["eplb_planner"], + "reference_tokens_per_rank": profile[ + "eplb_reference_tokens_per_rank" + ], + "replicated_experts": expected_plan["replicated_experts"], + } + else: + expected_eplb = { + "calibration_token_offset": None, + "calibration_trace_sha256": None, + "calibration_window": None, + "calibration_workload_id": None, + "enabled": False, + "imbalance_after": None, + "imbalance_before": None, + "mapping_hash": None, + "max_replicas": None, + "num_logical_experts": logical_experts, + "num_physical_experts": physical_experts, + "num_redundant": 0, + "planner": None, + "reference_tokens_per_rank": None, + "replicated_experts": 0, + } + _equivalent(eplb, expected_eplb, "raw.case.eplb", tolerance=1e-9) + if case_factors["sku"] != case["runner"]: + raise ContractError("raw case runner differs from case identity") + + workload = _keys( + doc["workload"], + {"activation_generator", "activation_identity", "activation_profile", + "cross_rank_consistent", "manifest_checksums", "members", "routing_generator", "source", + "trace_hashes", "trace_signature", "workload_id"}, + "raw.workload", + ) + if workload["source"] not in {"canonical-serialized", "seeded-runtime"}: + raise ContractError("raw workload source is invalid") + if workload["source"] == "canonical-serialized": + _typed(workload["workload_id"], "workload", "raw.workload.workload_id") + members = workload["members"] + checksums = workload["manifest_checksums"] + if ( + not isinstance(members, list) + or not members + or members != sorted(set(members)) + or not all(identity.is_typed_id(member, "workload") for member in members) + or not isinstance(checksums, dict) + or set(checksums) != set(members) + ): + raise ContractError("raw canonical workload members/checksums are invalid") + for member, values in checksums.items(): + if ( + not isinstance(values, dict) + or set(values) != {"topk_idx", "topk_weights", "trace"} + or any(not 
re.fullmatch(r"[0-9a-f]{64}", str(value)) for value in values.values()) + ): + raise ContractError(f"raw canonical workload checksums differ for {member}") + expected_workload_id = identity.workload_id({ + "members": [ + {"checksums": checksums[member], "workload_id": member} + for member in members + ] + }) + if workload["workload_id"] != expected_workload_id: + raise ContractError("raw composite workload identity differs from its members") + elif any(workload[field] is not None for field in ("members", "manifest_checksums", "workload_id")): + raise ContractError("raw seeded workload cannot claim serialized members") + if workload["cross_rank_consistent"] is not True: + raise ContractError("raw workload is not consistent across ranks") + + measurement = _keys( + doc["measurement"], + {"component_order_contract", "conditioning", "contract", "execution_order_sha256", + "qualification_index", "rows", "sampling", "source_allocation"}, + "raw.measurement", + ) + if measurement["qualification_index"] != qualification_index: + raise ContractError("raw measurement qualification index differs from allocation") + if not isinstance(measurement["execution_order_sha256"], str) or not re.fullmatch( + r"[0-9a-f]{64}", measurement["execution_order_sha256"] + ): + raise ContractError("raw measurement execution order digest is invalid") + validate_conditioning_contract(measurement["conditioning"], case["phase"]) + sampling = _keys( + measurement["sampling"], + {"contract", "iterations_per_trial", "percentile_method", "reduction", + "samples_per_component", "trials", "warmup_iterations", "warmup_semantics"}, + "raw.measurement.sampling", + ) + expected_sampling = { + "contract": profile["sampling_contract"], "iterations_per_trial": 8, + "percentile_method": profile["percentile_method"], + "reduction": profile["rank_reduction"], + "samples_per_component": 512, "trials": 64, "warmup_iterations": 32, + "warmup_semantics": "full-roundtrip-before-each-component-trial-point-v1", + } + if 
sampling != expected_sampling: + raise ContractError("raw sampling contract differs from fixed-512-v1") + if ( + case["mode"] != profile["mode"] + or case["resource_mode"] != profile["resource_mode"] + or measurement["contract"] != profile["contract"] + or measurement["component_order_contract"] != profile["component_order_contract"] + or measurement["source_allocation"] != "even" + or shape["activation_profile"] != profile["activation_profile"] + or workload["activation_generator"] != profile["activation_generator"] + or workload["activation_profile"] != profile["activation_profile"] + or workload["routing_generator"] != profile["routing_generator"] + ): + raise ContractError("raw case differs from the frozen v1 profile") + expected_activation = hashlib.sha256( + ( + f"counter|seed={profile['seed']}|hidden={hidden}|" + f"gen={profile['activation_generator']}" + ).encode() + ).hexdigest() + if workload["activation_identity"] != expected_activation: + raise ContractError("raw activation identity differs from the frozen seed/profile") + rows = measurement["rows"] + if not isinstance(rows, list) or not rows: + raise ContractError("raw.measurement.rows must be non-empty") + seen_points = set() + row_tokens = [] + recomputed_anomalies = 0 + for index, row_value in enumerate(rows): + path = f"raw.measurement.rows[{index}]" + row = _keys( + row_value, + {"anomalies", "byte_provenance", "components", "correctness", "evidence_id", + "global_tokens", "point_id", "receive", "routing", + "sample_histograms", "sample_sha256", "token_rate_at_latency_percentile", + "tokens_per_rank"}, + path, + ) + tokens = _integer(row["tokens_per_rank"], f"{path}.tokens_per_rank", minimum=1) + row_tokens.append(tokens) + if tokens in seen_points or tokens not in sample_by_token: + raise ContractError(f"{path} token point is duplicate or missing samples") + seen_points.add(tokens) + if row["global_tokens"] != tokens * ep_size: + raise ContractError(f"{path}.global_tokens formula differs") + 
sample_point = sample_by_token[tokens] + expected_point = identity.point_id(series=identifiers["series_id"], tokens_per_rank=tokens) + if row["point_id"] != expected_point or sample_point["point_id"] != expected_point: + raise ContractError(f"{path}.point_id differs") + expected_evidence = identity.evidence_id( + point=expected_point, allocation=identifiers["allocation_id"], + attempt=identifiers["attempt_id"], sample_sha256=sample_point["sample_sha256"], + ) + if row["evidence_id"] != expected_evidence or sample_point["evidence_id"] != expected_evidence: + raise ContractError(f"{path}.evidence_id differs") + if row["sample_sha256"] != sample_point["sample_sha256"]: + raise ContractError(f"{path}.sample_sha256 differs") + components = _keys( + row["components"], {"combine", "dispatch", "isolated_sum", "roundtrip", "stage"}, + f"{path}.components", + ) + for name in ("combine", "dispatch", "roundtrip", "stage"): + _validate_component( + components[name], sample_point["components"][name], f"{path}.components.{name}" + ) + _validate_component( + components["isolated_sum"], None, f"{path}.components.isolated_sum", derived=True + ) + expected_stage_availability = ( + "measured" + if communication_precision["dispatch"]["communication_format"] != "bf16" + or (case["backend"] == "mori" and shape["kernel_gen"] == "intranode") + else "unavailable" + ) + if components["stage"]["availability"] != expected_stage_availability: + raise ContractError(f"{path}.components.stage differs from adapter device work") + _, _, _, expected_indices, expected_weights = _expected_canonical_trace( + scheduled_case["routing"], + hidden, + topk, + logical_experts, + physical_experts, + ep_size, + tokens, + profile["seed"], + eplb["enabled"], + profile["eplb_reference_tokens_per_rank"], + ) + expected_routing = _expected_routing_summary( + expected_indices, + expected_weights, + physical_experts=physical_experts, + ep_size=ep_size, + tokens_per_rank=tokens, + 
gpus_per_node=scheduled_case["gpus_per_node"], + scale_up_domain=scheduled_case["scale_up_domain"], + ) + _equivalent( + row["routing"], expected_routing, f"{path}.routing", tolerance=1e-5 + ) + expected_payload_counts = ( + expected_routing["expert_assignments_per_rank"] + if profile["payload_unit"] == "token-expert" + else expected_routing["payload_copies_per_rank"] + ) + throughput = _keys( + row["token_rate_at_latency_percentile"], set(PERCENTILES), + f"{path}.token_rate_at_latency_percentile", + ) + for percentile in PERCENTILES: + latency = components["roundtrip"]["percentiles_us"][percentile] + if latency <= 0: + raise ContractError(f"{path} roundtrip latency must be positive") + _close( + throughput[percentile], row["global_tokens"] / (latency * 1e-6), + f"{path}.token_rate_at_latency_percentile.{percentile}", 1e-9, + ) + correctness = _keys( + row["correctness"], + {"contract", "max_relative_error", "passed", "precision", "rank_evidence", "scope"}, + f"{path}.correctness", + ) + if ( + correctness["contract"] != profile["oracle_contract"] + or correctness["scope"] != profile["correctness_scope"] + or type(correctness["passed"]) is not bool + ): + raise ContractError(f"{path}.correctness contract differs") + precision_evidence = _validate_precision_evidence( + correctness["precision"], precision_profile_id, communication_precision, + f"{path}.correctness.precision", + ) + _number( + correctness["max_relative_error"], + f"{path}.correctness.max_relative_error", + minimum=0.0, + ) + rank_evidence = correctness["rank_evidence"] + if not isinstance(rank_evidence, list) or len(rank_evidence) != ep_size: + raise ContractError(f"{path}.correctness.rank_evidence must cover every rank") + ranks = set() + observed_max_error = 0.0 + evidence_passed = True + for evidence_index, evidence_value in enumerate(rank_evidence): + evidence_path = f"{path}.correctness.rank_evidence[{evidence_index}]" + evidence = _keys( + evidence_value, + {"input_unchanged", "order_stable", 
"post_timing", "pre_timing", "rank"}, + evidence_path, + ) + evidence_rank = _integer(evidence["rank"], f"{evidence_path}.rank") + if evidence_rank >= ep_size: + raise ContractError(f"{evidence_path}.rank is outside the EP group") + ranks.add(evidence_rank) + if type(evidence["input_unchanged"]) is not bool or type(evidence["order_stable"]) is not bool: + raise ContractError(f"{evidence_path} stability fields must be boolean") + pre = _validate_oracle( + evidence["pre_timing"], f"{evidence_path}.pre_timing", profile + ) + post = _validate_oracle( + evidence["post_timing"], f"{evidence_path}.post_timing", profile + ) + if ( + pre["receive_count"] != expected_payload_counts[evidence_rank] + or post["receive_count"] != expected_payload_counts[evidence_rank] + ): + raise ContractError( + f"{evidence_path}.receive_count differs from canonical routing" + ) + expected_stability = all( + pre[field] == post[field] + for field in ("ordering_contract", "order_sha256", "dispatch_sha256") + ) + if evidence["order_stable"] != expected_stability: + raise ContractError(f"{evidence_path}.order_stable differs from the evidence") + errors = [ + oracle["max_relative_error"] + for oracle in (pre, post) + if oracle["max_relative_error"] is not None + ] + observed_max_error = max([observed_max_error, *errors]) + evidence_passed = evidence_passed and all( + (evidence["input_unchanged"], evidence["order_stable"], pre["passed"], post["passed"]) + ) + evidence_passed = evidence_passed and precision_evidence["passed"] + if ranks != set(range(ep_size)) or correctness["passed"] != evidence_passed: + raise ContractError(f"{path}.correctness rank coverage or outcome differs") + _close( + correctness["max_relative_error"], observed_max_error, + f"{path}.correctness.max_relative_error", + ) + if components["dispatch"]["availability"] == "measured": + for percentile in PERCENTILES: + expected = ( + components["dispatch"]["percentiles_us"][percentile] + + ( + 
components["stage"]["percentiles_us"][percentile] + if components["stage"]["availability"] == "measured" + else 0.0 + ) + + components["combine"]["percentiles_us"][percentile] + ) + _close( + components["isolated_sum"]["percentiles_us"][percentile], expected, + f"{path}.components.isolated_sum.{percentile}", + ) + logical_copies = ( + sum(expected_routing["expert_assignments_per_rank"]) + if profile["payload_unit"] == "token-expert" + else expected_routing["routed_copies"] + ) + dispatch_bytes = _precision_byte_provenance( + communication_precision["dispatch"], logical_copies, hidden + ) + combine_bytes = _precision_byte_provenance( + communication_precision["combine"], logical_copies, hidden + ) + stage_bytes = { + "accounting_contract": "activation-data-plus-scales-v1", + "activation_data_bytes": 0, + "scale_bytes": 0, + "total_logical_bytes": 0, + } + roundtrip_bytes = { + "accounting_contract": "activation-data-plus-scales-v1", + **{ + field: dispatch_bytes[field] + combine_bytes[field] + for field in ( + "activation_data_bytes", "scale_bytes", "total_logical_bytes" + ) + }, + } + expected_byte_provenance = { + "combine": combine_bytes, + "dispatch": dispatch_bytes, + "roundtrip": roundtrip_bytes, + "stage": stage_bytes, + } + _equivalent( + row["byte_provenance"], expected_byte_provenance, f"{path}.byte_provenance" + ) + + max_receive = max(expected_payload_counts) + expected_receive = { + "max": max_receive, + "mean": sum(expected_payload_counts) / ep_size, + "min": min(expected_payload_counts), + "total": sum(expected_payload_counts), + } + _equivalent(row["receive"], expected_receive, f"{path}.receive") + expected_histograms = { + name: ( + _expected_histogram([ + sample + for trial in sample_point["components"][name]["trials"] + for sample in trial + ]) + if sample_point["components"][name]["availability"] == "measured" + else None + ) + for name in ("dispatch", "stage", "combine", "roundtrip") + } + _equivalent( + row["sample_histograms"], 
expected_histograms, f"{path}.sample_histograms" + ) + expected_anomalies = _expected_anomalies(tokens, components) + _equivalent(row["anomalies"], expected_anomalies, f"{path}.anomalies") + recomputed_anomalies += len(expected_anomalies) + if seen_points != set(sample_by_token): + raise ContractError("raw rows and sample points differ") + if row_tokens != sorted(row_tokens): + raise ContractError("raw rows must follow the scheduled token ladder") + expected_trace_hashes = sorted(row["routing"]["hash"] for row in rows) + if workload["trace_hashes"] != expected_trace_hashes: + raise ContractError("raw workload trace hashes differ from measured rows") + expected_trace_signature = hashlib.sha256( + "|".join(expected_trace_hashes).encode() + ).hexdigest() + if workload["trace_signature"] != expected_trace_signature: + raise ContractError("raw workload trace signature differs from measured rows") + + implementation = _keys( + doc["implementation"], {"kernel_generation", "name", "provenance", "resource_profile"}, + "raw.implementation", + ) + if ( + implementation["name"] != case["backend"] + or implementation["kernel_generation"] != shape["kernel_gen"] + ): + raise ContractError("raw implementation identity differs from the case") + provenance_fields = _obj(implementation["provenance"], "raw.implementation.provenance") + unknown = set(provenance_fields) - PROVENANCE_KEYS + if unknown: + raise ContractError(f"raw implementation provenance has unknown fields {sorted(unknown)}") + if ( + implementation["name"] == "deepep-v2" + and provenance_fields.get("deterministic") is not False + ): + raise ContractError("DeepEP V2 deterministic mode differs from the v1 kernel contract") + if implementation["name"] == "deepep-v2" and ( + _integer( + provenance_fields.get("tuning_num_experts"), + "raw.implementation.provenance.tuning_num_experts", + minimum=1, + ) != logical_experts + or _integer( + provenance_fields.get("num_experts"), + "raw.implementation.provenance.num_experts", + 
minimum=1, + ) != physical_experts + ): + raise ContractError("DeepEP V2 expert-count provenance differs from the case") + if implementation["name"] == "deepep-hybrid": + realized_config = provenance_fields.get("realized_config") + jit_kernel_keys = provenance_fields.get("jit_kernel_keys") + jit_shared_objects = provenance_fields.get("jit_shared_objects") + domain_ranks, communication_domains = hybrid_communication_domains( + ep_size, scheduled_case["scale_up_domain"] + ) + if ( + not _hybrid_realized_config_is_valid(realized_config) + or not _hybrid_jit_evidence_is_valid(jit_shared_objects, jit_kernel_keys) + or realized_config["hidden_dim"] != shape["hidden"] + or realized_config["num_of_experts_per_rank"] * ep_size != physical_experts + or realized_config["num_of_ranks_per_node"] != domain_ranks + or realized_config["num_of_nodes"] != communication_domains + or realized_config["token_data_type"] != "UINT16" + or any( + len(artifact["rank_artifacts"]) != ep_size + for artifact in jit_shared_objects + ) + ): + raise ContractError("DeepEP Hybrid realized config/JIT evidence differs from the case") + if implementation["name"] == "nccl-ep" and implementation["kernel_generation"] != ( + collective_kernel_generation(provenance_fields.get("collective_library")) + ): + raise ContractError("NCCL/RCCL kernel generation differs from collective lineage") + resource_profile = _obj( + implementation["resource_profile"], "raw.implementation.resource_profile" + ) + expected_resource_profile = project_resource_profile(provenance_fields) + if resource_profile != expected_resource_profile: + raise ContractError("raw resource profile differs from implementation provenance") + topology = _keys( + doc["topology"], + {"device_count", "device_product", "gpus_per_node", "nodes", "placement", + "realized_placement", "scale_out_transport", "scale_up_domain", + "scale_up_transport", "scope", "topology_class", "transport", "world_size"}, + "raw.topology", + ) + for field in ("device_count", 
"gpus_per_node", "nodes", "scale_up_domain", "world_size"): + _integer(topology[field], f"raw.topology.{field}", minimum=1) + for field in ("scale_up_transport", "scope", "topology_class", "transport"): + _text(topology[field], f"raw.topology.{field}") + if topology["scale_out_transport"] is not None: + _text(topology["scale_out_transport"], "raw.topology.scale_out_transport") + realized = _keys( + topology["realized_placement"], + {"gpus_per_node", "nodes", "ranks_per_node", "unique_local_ranks", "valid"}, + "raw.topology.realized_placement", + ) + if realized != { + "gpus_per_node": topology["gpus_per_node"], + "nodes": topology["nodes"], + "ranks_per_node": topology["gpus_per_node"], + "unique_local_ranks": True, + "valid": True, + }: + raise ContractError("raw realized placement differs from requested topology") + if ( + topology["world_size"] != ep_size + or topology["nodes"] * topology["gpus_per_node"] != ep_size + or topology["device_count"] != topology["gpus_per_node"] + or topology["placement"] != profile["placement"] + or ( + topology["scope"] == "scale-up" + and ( + ep_size > topology["scale_up_domain"] + or topology["scale_out_transport"] is not None + ) + ) + or ( + topology["scope"] == "scale-out" + and ( + ep_size <= topology["scale_up_domain"] + or ep_size % topology["scale_up_domain"] != 0 + or topology["scale_out_transport"] is None + ) + ) + or topology["scope"] not in {"scale-up", "scale-out"} + ): + raise ContractError("raw topology dimensions differ from the case") + if implementation["name"] == "deepep-v2": + scale_out = scheduled_case["scope"] == "scale-out" + expected_policy = ( + (True, True, "nccl-gin") + if scale_out + else (False, False, "nccl-device-lsa") + ) + if ( + provenance_fields.get("allow_hybrid_mode"), + provenance_fields.get("gin_enabled"), + provenance_fields.get("communication_backend"), + ) != expected_policy: + raise ContractError("DeepEP V2 communication policy differs from the v1 contract") + lsa_topology = tuple( + 
_integer( + provenance_fields.get(field), + f"raw.implementation.provenance.{field}", + minimum=1, + ) + for field in ( + "physical_rdma_ranks", "physical_nvlink_ranks", + "logical_scaleout_ranks", "logical_scaleup_ranks", + ) + ) + domains = ep_size // scheduled_case["scale_up_domain"] + expected_v2_topology = ( + ( + domains, + scheduled_case["scale_up_domain"], + domains, + scheduled_case["scale_up_domain"], + ) + if scale_out + else (1, ep_size, 1, ep_size) + ) + if lsa_topology != expected_v2_topology: + raise ContractError("DeepEP V2 realized communication domains differ from topology") + runtime = _keys( + doc["runtime_fingerprint"], + {"accelerator_runtime", "collective_library", "device", "driver_version", "framework", + "machine", "python_version", "vendor"}, + "raw.runtime_fingerprint", + ) + for field in ("machine", "python_version", "vendor"): + _text(runtime[field], f"raw.runtime_fingerprint.{field}") + runtime_device = _keys( + runtime["device"], {"arch", "compute_units", "memory_bytes", "product", "warp_size"}, + "raw.runtime_fingerprint.device", + ) + if topology["device_product"] != runtime_device["product"]: + raise ContractError("raw topology and runtime device products differ") + platform = capability.PLATFORMS.get(case["runner"]) + if platform is not None: + identity_issues = capability.runtime_identity_issues( + case["runner"], vendor=runtime["vendor"], arch=runtime_device["arch"], + machine=runtime["machine"], device_name=runtime_device["product"], + device_count=topology["device_count"], world_size=topology["world_size"], + ) + registered_topology = capability.topology_for(case["runner"], ep_size) + if identity_issues or ( + registered_topology is None + or topology["gpus_per_node"] != platform["gpus_per_node"] + or topology["scale_up_domain"] != platform["scale_up_domain"] + or any( + topology[field] != registered_topology[field] + for field in ( + "nodes", "scope", "scale_up_transport", "scale_out_transport", + "topology_class", 
"transport", + ) + ) + ): + raise ContractError( + "raw runtime/topology differs from the scheduled SKU: " + + "; ".join(identity_issues) + ) + raw_provenance = _keys( + doc["provenance"], + {"allocation_stratum_sha256", "command", "distributed_launcher", "git_run", + "image", "redaction"}, + "raw.provenance", + ) + allocation_stratum = raw_provenance["allocation_stratum_sha256"] + if workload["source"] == "canonical-serialized" and not ( + isinstance(allocation_stratum, str) + and re.fullmatch(r"[0-9a-f]{64}", allocation_stratum) + ): + raise ContractError("canonical raw evidence is missing its private allocation stratum") + image = _keys( + raw_provenance["image"], + {"arch", "digest", "digest_verified", "reference", "squash_sha256"}, + "raw.provenance.image", + ) + if ( + image["digest_verified"] is not True + or not isinstance(image["digest"], str) + or not re.fullmatch(r"sha256:[0-9a-f]{64}", image["digest"]) + ): + raise ContractError("raw image digest was not registry-verified") + if raw_provenance["redaction"] != "sanitized-v1": + raise ContractError("raw provenance redaction contract differs") + git_run = raw_provenance["git_run"] + if git_run is not None: + git_run = _keys(git_run, GIT_RUN_FIELDS, "raw.provenance.git_run") + if git_run["qualification_index"] != qualification_index: + raise ContractError("raw git run qualification index differs from allocation") + expected_provenance_complete = provenance_complete( + provenance_fields, + case["backend"], + git_run, + allocation_stratum_sha256=allocation_stratum, + image_digest=image["digest"], + image_verified=image["digest_verified"], + squash_sha256=image["squash_sha256"], + ) + + actual_scheduled_case = { + "backend": case["backend"], + "canonical": workload["source"] == "canonical-serialized", + "eplb": eplb["enabled"], + "ep": ep_size, + "experts": shape["num_logical_experts"], + "gpus_per_node": topology["gpus_per_node"], + "hidden": hidden, + "ladder": " ".join(map(str, row_tokens)), + "mode": 
case["mode"], + "nodes": topology["nodes"], + "phase": case["phase"], + "required_publication": case["required_publication"], + "routing": shape["routing"], + "samples_per_point": sampling["samples_per_component"], + "scale_out_transport": topology["scale_out_transport"], + "scale_up_domain": topology["scale_up_domain"], + "scale_up_transport": topology["scale_up_transport"], + "scope": topology["scope"], + "suite": case["suite"], + "timing": ( + f"{sampling['iterations_per_trial']}:{sampling['trials']}:" + f"{sampling['warmup_iterations']}" + ), + "topk": shape["topk"], + "topology_class": topology["topology_class"], + "transport": topology["transport"], + "warmup_semantics": sampling["warmup_semantics"], + "workload": case["workload_name"], + } + if "precision_profile" in scheduled_case: + actual_scheduled_case["precision_profile"] = shape["precision_profile"] + if scheduled_case != actual_scheduled_case: + mismatches = sorted( + field for field in scheduled_case + if scheduled_case[field] != actual_scheduled_case[field] + ) + raise ContractError(f"raw data differs from scheduled case fields {mismatches}") + + if workload["source"] == "canonical-serialized": + _validate_canonical_workload(workload, scheduled_case, rows, eplb) + + expected_series = { + "backend": case["backend"], + "case_id": identifiers["case_id"], + "image_digest": image["digest"], + "implementation_contract_sha256": _sha256_json({ + "kernel_generation": implementation["kernel_generation"], + "name": implementation["name"], + "provenance": series_provenance(provenance_fields), + "resource_profile": resource_profile, + }), + "public_config_sha256": public_series_config_sha256(public_series_config( + kernel_generation=implementation["kernel_generation"], + provenance=provenance_fields, + resource_profile=resource_profile, + resource_mode=case["resource_mode"], + device_product=topology["device_product"], + )), + "routing_control_sha256": routing_implementation_control_sha256(implementation), + 
"runtime_fingerprint_sha256": _sha256_json(runtime), + "source_sha": git_run["source_sha"] if git_run is not None else None, + "squash_sha256": image["squash_sha256"], + "workload_id": workload["workload_id"] or workload["trace_signature"], + } + if series_factors != expected_series: + raise ContractError("raw series factors differ from measured implementation/runtime") + expected_allocation = { + "artifact": git_run["artifact"] if git_run is not None else None, + "execution_id": allocation_factors["execution_id"], + "job": git_run["job"] if git_run is not None else None, + "qualification_index": qualification_index, + "repo": git_run["repo"] if git_run is not None else None, + "run_attempt": git_run["run_attempt"] if git_run is not None else None, + "run_id": git_run["run_id"] if git_run is not None else None, + "runner": case["runner"], + "source_sha": git_run["source_sha"] if git_run is not None else None, + } + if allocation_factors != expected_allocation: + raise ContractError("raw allocation factors differ from provenance") + artifact = _keys(doc["sample_artifact"], {"bytes", "format", "path", "sha256"}, "raw.sample_artifact") + if artifact["format"] != SAMPLES_FORMAT or Path(artifact["path"]).name != artifact["path"]: + raise ContractError("raw.sample_artifact format/path is invalid") + if not isinstance(artifact["sha256"], str) or len(artifact["sha256"]) != 64: + raise ContractError("raw.sample_artifact.sha256 is invalid") + _integer(artifact["bytes"], "raw.sample_artifact.bytes", minimum=1) + outcome = _keys(doc["outcome"], {"publication_status", "reasons", "status", "validity"}, "raw.outcome") + if outcome["status"] not in {"success", "invalid"} or outcome["publication_status"] not in {"diagnostic", "invalid"}: + raise ContractError("raw outcome status is invalid") + if not isinstance(outcome["reasons"], list) or not all(isinstance(x, str) for x in outcome["reasons"]): + raise ContractError("raw outcome reasons must be strings") + validity = _keys( + 
outcome["validity"], + {"anomaly_free", "execution_status", "measurement_conformance", "provenance_complete", + "resource_conformance", "sampling_conformance", "semantic_correctness", + "workload_identity", "workload_source"}, + "raw.outcome.validity", + ) + correctness_passed = all(row["correctness"]["passed"] for row in rows) + workload_consistent = workload["cross_rank_consistent"] is True + expected_status = "success" if correctness_passed and workload_consistent else "invalid" + expected_publication = "diagnostic" if expected_status == "success" else "invalid" + if ( + outcome["status"] != expected_status + or outcome["publication_status"] != expected_publication + or bool(outcome["reasons"]) == (expected_status == "success") + or validity["execution_status"] != "complete" + or validity["semantic_correctness"] != ("pass" if correctness_passed else "fail") + or validity["workload_identity"] != ( + "consistent-across-ranks" if workload_consistent else "inconsistent" + ) + or validity["workload_source"] != workload["source"] + or validity["measurement_conformance"] != "conformant" + or validity["sampling_conformance"] != "conformant" + or validity["resource_conformance"] != resource_profile["conformance_class"] + or validity["anomaly_free"] != (recomputed_anomalies == 0) + or validity["provenance_complete"] is not expected_provenance_complete + ): + raise ContractError("raw outcome differs from its measurement evidence") + artifact_safety.assert_publication_safe([doc]) + return doc + + +def make_terminal_document( + *, + allocation_factors: dict[str, Any], + attempt_ordinal: int, + case: dict[str, Any], + case_factors: dict[str, Any], + control_sha256: str | None, + failure_mode: str, + generated_at: str, + git_run: dict[str, Any] | None, + reason: str, + return_code: int, + source: str, + status: str, + expected_case_id: str | None = None, +) -> dict[str, Any]: + """Build and self-validate one attributable non-success attempt.""" + case_id = 
identity.digest("case", case_factors) + if expected_case_id is not None and expected_case_id != case_id: + raise ContractError( + f"scheduled case ID differs from terminal factors: {expected_case_id} != {case_id}" + ) + allocation_id = identity.allocation_id(allocation_factors) + attempt_id = identity.attempt_id( + allocation=allocation_id, case=case_id, ordinal=attempt_ordinal + ) + document = { + "format": TERMINAL_FORMAT, + "schema_version": 1, + "record_type": "terminal-outcome", + "generated_at": generated_at, + "identity": { + "allocation_factors": allocation_factors, + "allocation_id": allocation_id, + "attempt_id": attempt_id, + "attempt_ordinal": attempt_ordinal, + "case_factors": case_factors, + "case_id": case_id, + }, + "case": case, + "provenance": { + "git_run": git_run, + "control_sha256": control_sha256, + "redaction": "sanitized-v1", + "source": source, + }, + "outcome": { + "status": status, + "failure_mode": failure_mode, + "reason": reason, + "return_code": return_code, + }, + } + return validate_terminal_document(document) + + +def validate_terminal_document(document: Any) -> dict[str, Any]: + _validate_native_schema("terminal-outcome-v1.schema.json", document) + doc = _keys( + document, + {"case", "format", "generated_at", "identity", "outcome", "provenance", "record_type", + "schema_version"}, + "terminal", + ) + if doc["format"] != TERMINAL_FORMAT or doc["schema_version"] != 1 or doc["record_type"] != "terminal-outcome": + raise ContractError("terminal format/schema/record type differs from v1") + ids = _keys(doc["identity"], { + "allocation_factors", "allocation_id", "attempt_id", "attempt_ordinal", + "case_factors", "case_id", + }, "terminal.identity") + for field, kind in (("allocation_id", "allocation"), ("attempt_id", "attempt"), ("case_id", "case")): + _typed(ids[field], kind, f"terminal.identity.{field}") + ordinal = _integer(ids["attempt_ordinal"], "terminal.identity.attempt_ordinal", minimum=1) + case = _scheduled_case(doc["case"], 
"terminal.case") + factors = _keys(ids["case_factors"], {"case", "profile", "sku"}, "terminal.identity.case_factors") + if factors["case"] != case or factors["profile"] != scheduled_case_profile( + case, "terminal.case" + ): + raise ContractError("terminal case factors differ from the scheduled case/profile") + _text(factors["sku"], "terminal.identity.case_factors.sku") + allocation = _keys( + ids["allocation_factors"], ALLOCATION_FACTOR_FIELDS, + "terminal.identity.allocation_factors", + ) + qualification_index = _integer( + allocation["qualification_index"], + "terminal.identity.allocation_factors.qualification_index", + minimum=1, + ) + if qualification_index > 3: + raise ContractError("terminal qualification index must be in 1..3") + expected_case = identity.digest("case", factors) + expected_allocation = identity.allocation_id(allocation) + expected_attempt = identity.attempt_id( + allocation=expected_allocation, case=expected_case, ordinal=ordinal + ) + if (ids["case_id"], ids["allocation_id"], ids["attempt_id"]) != ( + expected_case, expected_allocation, expected_attempt + ): + raise ContractError("terminal typed identities do not match their factors") + provenance = _keys( + doc["provenance"], {"git_run", "control_sha256", "redaction", "source"}, + "terminal.provenance", + ) + git_run = provenance["git_run"] + if git_run is not None: + git_run = _keys(git_run, GIT_RUN_FIELDS, "terminal.provenance.git_run") + if git_run["qualification_index"] != qualification_index: + raise ContractError( + "terminal git run qualification index differs from allocation" + ) + control = provenance["control_sha256"] + if control is not None and ( + not isinstance(control, str) or len(control) != 64 + or any(char not in "0123456789abcdef" for char in control) + ): + raise ContractError("terminal control_sha256 is invalid") + if provenance["redaction"] != "sanitized-v1": + raise ContractError("terminal redaction contract differs") + source = _text(provenance["source"], 
"terminal.provenance.source") + outcome = _keys( + doc["outcome"], {"failure_mode", "reason", "return_code", "status"}, "terminal.outcome" + ) + if outcome["status"] not in {"failed", "invalid", "unsupported"}: + raise ContractError("terminal outcome status is invalid") + failure_mode = _text(outcome["failure_mode"], "terminal.outcome.failure_mode") + reason = _text(outcome["reason"], "terminal.outcome.reason") + _integer(outcome["return_code"], "terminal.outcome.return_code") + if source == "runtime-emitter": + expected_runner = factors["sku"] + expected_reason = RUNTIME_FAILURE_REASONS.get(failure_mode) + valid_outcome = outcome["status"] == "failed" and reason == expected_reason + elif source == "post-emit-command": + expected_runner = factors["sku"] + expected_reason = POST_EMIT_FAILURE_REASONS.get(failure_mode) + valid_outcome = outcome["status"] == "failed" and reason == expected_reason + elif source == "matrix-capability-resolver": + expected_runner = "capability-resolver" + valid_outcome = ( + outcome["status"] == "unsupported" + and failure_mode == "capability" + and reason in CAPABILITY_FAILURE_REASONS + ) + else: + raise ContractError("terminal provenance source is not registered") + if not valid_outcome: + raise ContractError("terminal source and outcome are not registered") + expected_allocation = { + "artifact": git_run["artifact"] if git_run is not None else None, + "execution_id": allocation["execution_id"], + "job": git_run["job"] if git_run is not None else None, + "qualification_index": qualification_index, + "repo": git_run["repo"] if git_run is not None else None, + "run_attempt": git_run["run_attempt"] if git_run is not None else None, + "run_id": git_run["run_id"] if git_run is not None else None, + "runner": expected_runner, + "source_sha": git_run["source_sha"] if git_run is not None else None, + } + if allocation != expected_allocation: + raise ContractError("terminal allocation factors differ from provenance or source") + 
artifact_safety.assert_publication_safe([doc]) + return doc + + +def load_raw_attempt(path: str | os.PathLike[str]) -> dict[str, Any]: + document = strict_load(path) + artifact = _obj(document, "raw").get("sample_artifact") + artifact = _obj(artifact, "raw.sample_artifact") + sample_path = Path(path).with_name(_text(artifact.get("path"), "raw.sample_artifact.path")) + payload = sample_path.read_bytes() + if len(payload) != artifact.get("bytes") or hashlib.sha256(payload).hexdigest() != artifact.get("sha256"): + raise ContractError("sample artifact bytes or digest differ") + samples = strict_load(sample_path) + return validate_raw_document(document, samples) + + +def load_attempt(path: str | os.PathLike[str]) -> dict[str, Any]: + """Fully validate and return one native raw or terminal attempt.""" + document = strict_load(path) + if isinstance(document, dict) and document.get("format") == RAW_FORMAT: + return load_raw_attempt(path) + if isinstance(document, dict) and document.get("format") == TERMINAL_FORMAT: + return validate_terminal_document(document) + raise ContractError("unknown native attempt format") + + +def quarantine_invalid_attempt(path: str | os.PathLike[str]) -> bool: + """Move an invalid attempt and its basename-safe sample outside JSON upload globs.""" + destination = Path(path) + if not destination.is_file(): + return False + try: + load_attempt(destination) + return False + except (ContractError, OSError, ValueError): + try: + document = json.loads(destination.read_bytes()) + except (OSError, json.JSONDecodeError): + document = {} + artifact = document.get("sample_artifact") if isinstance(document, dict) else None + sample_name = artifact.get("path") if isinstance(artifact, dict) else None + if isinstance(sample_name, str) and Path(sample_name).name == sample_name: + sample_path = destination.with_name(sample_name) + if sample_path.is_file(): + os.replace(sample_path, sample_path.with_name(sample_path.name + ".quarantine")) + os.replace(destination, 
destination.with_name(destination.name + ".quarantine")) + return True + + +def normalize_attempt(document: dict[str, Any]) -> dict[str, Any]: + """Return the publisher-facing projection after native validation.""" + if document.get("format") == RAW_FORMAT: + ids = document["identity"] + return { + "allocation_id": ids["allocation_id"], + "attempt_id": ids["attempt_id"], + "case": document["case"], + "case_id": ids["case_id"], + "generated_at": document["generated_at"], + "outcome": document["outcome"], + "points": document["measurement"]["rows"], + "runtime_fingerprint": document["runtime_fingerprint"], + "series_id": ids["series_id"], + } + if document.get("format") == TERMINAL_FORMAT: + ids = document["identity"] + return { + "allocation_id": ids["allocation_id"], + "attempt_id": ids["attempt_id"], + "case": document["case"], + "case_id": ids["case_id"], + "generated_at": document["generated_at"], + "outcome": document["outcome"], + "points": [], + "runtime_fingerprint": None, + "series_id": None, + } + raise ContractError("unknown attempt format") + + +def _env_integer(name: str, default: int) -> int: + try: + return int(os.environ.get(name, str(default))) + except ValueError: + return default + + +def _env_enabled(name: str) -> bool: + return os.environ.get(name, "").lower() in {"1", "true", "yes"} + + +def _terminal_case_from_environment(backend: str, phase: str) -> dict[str, Any]: + ep = _env_integer("CX_EP", _env_integer("CX_NGPUS", 1)) + gpus_per_node = _env_integer("CX_GPUS_PER_NODE", ep) + ladder = os.environ.get("CX_TOKENS_LADDER", "") or ( + "1 2 4 8 16 32 64 128" + if phase == "decode" + else "128 256 512 1024 2048 4096" + ) + case = { + "suite": os.environ.get("CX_SUITE") or "manual", + "workload": os.environ.get("CX_WORKLOAD_NAME") or "manual", + "required_publication": os.environ.get("CX_REQUIRED_PUBLICATION") or "diagnostic", + "backend": backend, + "mode": os.environ.get("CX_MODE", "normal"), + "routing": os.environ.get("CX_ROUTING", "uniform"), 
+ "phase": phase, + "ep": ep, + "eplb": _env_enabled("CX_EPLB"), + "hidden": _env_integer("CX_HIDDEN", 7168), + "topk": _env_integer("CX_TOPK", 8), + "experts": _env_integer("CX_EXPERTS", 256), + "samples_per_point": _env_integer("CX_SAMPLES_PER_POINT", 512), + "warmup_semantics": os.environ.get( + "CX_WARMUP_SEMANTICS", + "full-roundtrip-before-each-component-trial-point-v1", + ), + "ladder": ladder, + "timing": ( + f'{_env_integer("CX_ITERS", 8)}:{_env_integer("CX_TRIALS", 64)}:' + f'{_env_integer("CX_WARMUP", 32)}' + ), + "canonical": _env_enabled("CX_CANONICAL"), + "nodes": _env_integer("CX_NODES", _env_integer("SLURM_NNODES", 1)), + "gpus_per_node": gpus_per_node, + "scale_up_domain": _env_integer("CX_SCALE_UP_DOMAIN", gpus_per_node), + "scope": os.environ.get("CX_SCOPE", "scale-up"), + "topology_class": os.environ.get("CX_TOPO", "manual"), + "transport": os.environ.get("CX_TRANSPORT", "unknown"), + "scale_up_transport": os.environ.get("CX_SCALE_UP_TRANSPORT", "unknown"), + "scale_out_transport": os.environ.get("CX_SCALE_OUT_TRANSPORT") or None, + } + precision_profile = os.environ.get("CX_PRECISION_PROFILE") or None + if precision_profile is not None: + case["precision_profile"] = precision_profile + return case + + +def _git_run_from_environment() -> dict[str, Any] | None: + def value(name: str) -> str | None: + return os.environ.get(name) or None + + git_run = { + "run_id": value("GITHUB_RUN_ID"), + "run_attempt": value("GITHUB_RUN_ATTEMPT"), + "ref": value("GITHUB_REF_NAME") or value("GITHUB_REF"), + "source_sha": value("COLLECTIVEX_SOURCE_SHA") or value("GITHUB_SHA"), + "repo": value("GITHUB_REPOSITORY"), + "job": value("GITHUB_JOB"), + "artifact": value("COLLECTIVEX_ARTIFACT_NAME"), + } + if not any(item is not None for item in git_run.values()): + return None + git_run["qualification_index"] = _env_integer("CX_QUALIFICATION_INDEX", 1) + return git_run + + +def _allocation_factors_from_environment( + runner: str, git_run: dict[str, Any] | None +) -> 
dict[str, Any]: + return { + "artifact": git_run["artifact"] if git_run is not None else None, + "execution_id": os.environ.get("COLLECTIVEX_EXECUTION_ID") or None, + "job": git_run["job"] if git_run is not None else None, + "qualification_index": _env_integer("CX_QUALIFICATION_INDEX", 1), + "repo": git_run["repo"] if git_run is not None else None, + "run_attempt": git_run["run_attempt"] if git_run is not None else None, + "run_id": git_run["run_id"] if git_run is not None else None, + "runner": runner, + "source_sha": git_run["source_sha"] if git_run is not None else None, + } + + +def make_terminal_from_environment( + *, backend: str, phase: str, return_code: int, failure_mode: str | None = None +) -> dict[str, Any]: + """Build a terminal document from the same exported case coordinates as run_ep.""" + mode = failure_mode or RETURN_CODE_FAILURE_MODES.get(return_code, "execution") + reason = RUNTIME_FAILURE_REASONS.get(mode) + if reason is None: + raise ContractError("runtime failure mode is not registered") + runner = os.environ.get("CX_RUNNER", "") + case = _terminal_case_from_environment(backend, phase) + case_factors = { + "case": case, + "profile": scheduled_case_profile(case, "runtime case"), + "sku": runner, + } + git_run = _git_run_from_environment() + control = os.environ.get("COLLECTIVEX_CONTROL_SHA256") or None + return make_terminal_document( + allocation_factors=_allocation_factors_from_environment(runner, git_run), + attempt_ordinal=_env_integer("CX_ATTEMPT_ID", 1), + case=case, + case_factors=case_factors, + control_sha256=control, + failure_mode=mode, + generated_at=dt.datetime.now(dt.timezone.utc).isoformat(), + git_run=git_run, + reason=reason, + return_code=return_code, + source="runtime-emitter", + status="failed", + expected_case_id=os.environ.get("CX_CASE_ID") or None, + ) + + +def _write_document(path: str | os.PathLike[str], document: dict[str, Any]) -> None: + destination = Path(path) + destination.parent.mkdir(parents=True, exist_ok=True) 
+ temporary = destination.with_name(destination.name + ".tmp") + temporary.write_text(json.dumps(document, indent=2, sort_keys=True) + "\n") + os.replace(temporary, destination) + + +def demote_raw_attempt(path: str | os.PathLike[str], return_code: int) -> dict[str, Any]: + """Replace a rank-zero raw result when the distributed command later fails.""" + destination = Path(path) + raw = strict_load(destination) + if not isinstance(raw, dict) or raw.get("format") != RAW_FORMAT: + raise ContractError("only a native raw attempt can be demoted") + ids = _obj(raw.get("identity"), "raw.identity") + required = { + "allocation_factors", "allocation_id", "attempt_id", "attempt_ordinal", + "case_factors", "case_id", + } + if not required.issubset(ids): + raise ContractError("raw identity lacks terminal factors") + mode = RETURN_CODE_FAILURE_MODES.get(return_code, "execution") + git_run = _obj(raw.get("provenance"), "raw.provenance").get("git_run") + if git_run is not None: + git_run = _keys(git_run, GIT_RUN_FIELDS, "raw.provenance.git_run") + terminal = make_terminal_document( + allocation_factors=ids["allocation_factors"], + attempt_ordinal=ids["attempt_ordinal"], + case=ids["case_factors"]["case"], + case_factors=ids["case_factors"], + control_sha256=os.environ.get("COLLECTIVEX_CONTROL_SHA256") or None, + failure_mode=mode, + generated_at=dt.datetime.now(dt.timezone.utc).isoformat(), + git_run=git_run, + reason=POST_EMIT_FAILURE_REASONS[mode], + return_code=return_code, + source="post-emit-command", + status="failed", + expected_case_id=ids["case_id"], + ) + artifact = raw.get("sample_artifact") or {} + sample_name = artifact.get("path") + if isinstance(sample_name, str) and Path(sample_name).name == sample_name: + destination.with_name(sample_name).unlink(missing_ok=True) + _write_document(destination, terminal) + return terminal + + +def validate_attempt_paths(paths: list[str]) -> int: + """Fully validate a result directory's attempts and paired sample artifacts.""" + if 
not paths or len(paths) != len(set(paths)): + raise ContractError("validate-many requires unique result paths") + sample_paths: set[Path] = set() + referenced_samples: set[Path] = set() + attempt_count = 0 + for raw_path in paths: + path = Path(raw_path).resolve() + document = strict_load(path) + if isinstance(document, dict) and document.get("format") == RAW_FORMAT: + document = load_raw_attempt(path) + referenced_samples.add(path.with_name(document["sample_artifact"]["path"])) + attempt_count += 1 + elif isinstance(document, dict) and document.get("format") == TERMINAL_FORMAT: + validate_terminal_document(document) + attempt_count += 1 + elif isinstance(document, dict) and document.get("format") == SAMPLES_FORMAT: + validate_samples_document(document) + sample_paths.add(path) + else: + raise ContractError(f"unknown result artifact {path.name}") + if sample_paths != referenced_samples: + raise ContractError("sample artifacts are missing, orphaned, or outside the validated set") + if attempt_count == 0: + raise ContractError("result set contains no native attempts") + return attempt_count + + +def validate_delivery( + paths: list[str], source_path: str, *, disposition: str | None = None +) -> int: + """Reconcile a shard or matrix disposition with its complete native attempt set.""" + source_file = Path(source_path).resolve() + source = strict_load(source_file) + if isinstance(source, dict) and source.get("format") == "collectivex.matrix.v1": + if disposition is None: + raise ContractError("matrix delivery validation requires a disposition") + wrappers = [ + item for item in source.get("requested_cases", []) + if isinstance(item, dict) and item.get("disposition") == disposition + ] + expected = { + item["case"]["case_id"]: (item["sku"], item["case"]) + for item in wrappers + } + expected_count = len(wrappers) + require_one_allocation = disposition == "unsupported" + elif isinstance(source, dict) and isinstance(source.get("cases"), list): + expected = { + 
case["case_id"]: (source.get("sku"), case) + for case in source["cases"] + } + expected_count = len(source["cases"]) + require_one_allocation = True + else: + raise ContractError("delivery source is not a matrix or shard control") + if not expected or len(expected) != expected_count: + raise ContractError("delivery source has empty or duplicate case coverage") + + validate_attempt_paths(paths) + attempts = [] + for raw_path in paths: + document = strict_load(raw_path) + if isinstance(document, dict) and document.get("format") in {RAW_FORMAT, TERMINAL_FORMAT}: + attempts.append(load_attempt(raw_path)) + by_case: dict[str, list[dict[str, Any]]] = {} + attempt_ids = set() + allocation_ids = set() + source_sha256 = hashlib.sha256(source_file.read_bytes()).hexdigest() + for document in attempts: + ids = document["identity"] + case_id = ids["case_id"] + if case_id not in expected or ids["attempt_id"] in attempt_ids: + raise ContractError("delivery contains an extra case or duplicate attempt") + attempt_ids.add(ids["attempt_id"]) + allocation_ids.add(ids["allocation_id"]) + sku, scheduled = expected[case_id] + scheduled_case = {key: value for key, value in scheduled.items() if key != "case_id"} + if ids["case_factors"] != { + "case": scheduled_case, + "profile": scheduled_case_profile(scheduled_case, "delivery case"), + "sku": sku, + }: + raise ContractError("delivery attempt differs from its scheduled case") + factors = ids["allocation_factors"] + expected_environment = { + "artifact": os.environ.get("COLLECTIVEX_ARTIFACT_NAME"), + "execution_id": os.environ.get("COLLECTIVEX_EXECUTION_ID"), + "job": os.environ.get("GITHUB_JOB"), + "repo": os.environ.get("GITHUB_REPOSITORY"), + "run_attempt": os.environ.get("GITHUB_RUN_ATTEMPT"), + "run_id": os.environ.get("GITHUB_RUN_ID"), + "source_sha": os.environ.get("COLLECTIVEX_SOURCE_SHA") or os.environ.get("GITHUB_SHA"), + } + expected_runner = ( + "capability-resolver" + if document["format"] == TERMINAL_FORMAT + and 
document["provenance"]["source"] == "matrix-capability-resolver" + else sku + ) + if any( + value is not None and factors[field] != value + for field, value in expected_environment.items() + ) or factors["runner"] != expected_runner: + raise ContractError("delivery allocation factors differ from the workflow") + if document["format"] == TERMINAL_FORMAT: + control = document["provenance"]["control_sha256"] + if control != source_sha256: + raise ContractError("terminal outcome does not reference its exact control document") + by_case.setdefault(case_id, []).append(document) + if set(by_case) != set(expected): + raise ContractError("delivery case coverage is incomplete") + for case_id, documents in by_case.items(): + ordinals = sorted(document["identity"]["attempt_ordinal"] for document in documents) + if ordinals != list(range(1, len(ordinals) + 1)): + raise ContractError(f"delivery attempt ordinals are not contiguous for {case_id}") + if require_one_allocation and len(allocation_ids) != 1: + raise ContractError("one shard must use exactly one allocation identity") + return len(attempts) + + +def main() -> int: + parser = argparse.ArgumentParser(description="CollectiveX native attempt contracts") + subparsers = parser.add_subparsers(dest="command", required=True) + probe = subparsers.add_parser("probe") + probe.add_argument("path") + probe.add_argument("--status", choices=("success", "invalid")) + emit = subparsers.add_parser("emit-terminal") + emit.add_argument("--out", required=True) + emit.add_argument("--backend", required=True) + emit.add_argument("--phase", required=True, choices=("decode", "prefill")) + emit.add_argument("--return-code", required=True, type=int) + emit.add_argument("--failure-mode") + demote = subparsers.add_parser("demote") + demote.add_argument("path") + demote.add_argument("--return-code", required=True, type=int) + validate_many = subparsers.add_parser("validate-many") + validate_many.add_argument("paths", nargs="+") + quarantine = 
subparsers.add_parser("quarantine-invalid") + quarantine.add_argument("path") + delivery = subparsers.add_parser("validate-delivery") + delivery.add_argument("--source", required=True) + delivery.add_argument("--disposition") + delivery.add_argument("paths", nargs="+") + args = parser.parse_args() + try: + if args.command == "probe": + document = load_attempt(args.path) + if args.status is None: + return 0 + if document.get("format") != RAW_FORMAT: + return 1 + outcome = document["outcome"] + validity = outcome.get("validity") + return int( + not ( + isinstance(validity, dict) + and validity.get("execution_status") == "complete" + and outcome.get("status") == args.status + ) + ) + if args.command == "emit-terminal": + document = make_terminal_from_environment( + backend=args.backend, + phase=args.phase, + return_code=args.return_code, + failure_mode=args.failure_mode, + ) + _write_document(args.out, document) + print(f"preserved terminal outcome ({document['outcome']['failure_mode']})") + return 0 + if args.command == "validate-many": + print(f"validated {validate_attempt_paths(args.paths)} native attempts") + return 0 + if args.command == "quarantine-invalid": + quarantine_invalid_attempt(args.path) + return 0 + if args.command == "validate-delivery": + print( + f"validated {validate_delivery(args.paths, args.source, disposition=args.disposition)} " + "delivery attempts" + ) + return 0 + demote_raw_attempt(args.path, args.return_code) + return 0 + except (ContractError, identity.IdentityError, OSError, ValueError) as exc: + print(f"terminal contract error: {exc}", file=sys.stderr) + return 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/docs/methodology.md b/experimental/CollectiveX/docs/methodology.md new file mode 100644 index 000000000..396a705b6 --- /dev/null +++ b/experimental/CollectiveX/docs/methodology.md @@ -0,0 +1,311 @@ +# CollectiveX EP Pre-V1 Baseline + +
+ +**English** | [中文](./methodology_zh.md) + +
+ +This document describes the implemented BF16 baseline. It is not yet the V1 qualification contract. +Before any V1-tagged run, this English document must match the implemented precision, measurement, +publication, and frontend contracts; counts and digests remain unfrozen. Chinese documentation +synchronization is explicitly deferred for the V1 implementation phase. + +## Product Boundary + +CollectiveX is a communication microbenchmark for: + +- comparing EP libraries on one chip/topology; +- comparing EP latency and logical payload bandwidth across systems under the same workload; and +- exposing unsupported, failed, invalid, and unstable evidence without contaminating decisions. + +It does not predict serving throughput without a separate correlation study. + +## Implemented Matrix + +The implemented workload is `deepseek-v3-v1`: hidden 7168, top-k 8, 256 routed experts, BF16 dispatch +and combine, packed placement, and backend-tuned resources. Each case explicitly selects normal +`layout-and-dispatch-v1` or low-latency `expert-packed-weighted-combine-v1` semantics. + +- `ep-core-v1`: uniform routing; decode T=1..128 powers of two; prefill T=256/512. +- `ep-routing-v1`: Zipf with EPLB off/on; decode T=128; prefill T=512. +- `ep-low-latency-v1`: DeepEP V1/UCCL native low-latency APIs; uniform decode T=1..128 powers of + two; the capability contract rejects every other backend instead of fabricating a low-latency path. +- Implemented baseline surface: 608 requested cases / 1,600 token points; 364 runnable cases / 940 + points in + 58 executable workflow shards/allocation cells; 244 unsupported cases / 660 points. + +| Systems | EP8 | EP16 | +|---|---|---| +| H100/H200/B200/B300 | 1x8 NVLink, scale-up | 2x8 NVLink + RDMA, scale-out | +| MI325X/MI355X | 1x8 XGMI, scale-up | 2x8 XGMI + RDMA, scale-out | +| GB200/GB300 | 2x4 MNNVL, scale-up | 4x4 MNNVL, scale-up | + +Physical host count does not define scope. 
Both GB cells remain inside one 72-GPU MNNVL scale-up +domain. + +Unsupported combinations are terminal outcomes, not silently skipped coverage. DeepEP V2 is the +`ElasticBuffer` introduced by PR #605, pinned with upstream PR #630's minimal pure-scale-up fix. +Scale-up cases request NCCL Device API LSA and fail closed unless the realized LSA team covers the +full EP world. x86 EP16 scale-out uses the hybrid path with GIN and requires two logical scale-out +domains represented by two physical RDMA ranks, with eight scale-up ranks per domain. GB EP16 +remains MNNVL scale-up and uses LSA. NVIDIA capabilities declared in source remain unvalidated until +GPU outcomes pass the native oracle and publisher gates. H100 V2 on the current runner pool is a +declared unsupported combination in v1 because NCCL 2.30.4 reports no Device API symmetric-memory +support for its EP8 communicator; that pool can return only after all-rank CUDA P2P/LSA support is +restored. This baseline omits `[cl]`, `[rv]`, quantization, alternate activation/routing profiles, +uneven allocation, placement permutations, model envelopes, and scaling. +FlashInfer is excluded from v1 after repeatable intermittent execution failures; those failures are +not converted into planned-unsupported coverage. +MoRI EP8 uses MI325X AsyncLL or MI355X IntraNode in normal mode. EP16 uses pinned InterNodeV1 over +2x8 XGMI + RDMA with 96 blocks, 64 RDMA blocks, 8 warps, one QP per PE, and external input. MoRI's +AsyncLL transport is not the genuine low-latency suite contract and is never labeled as such. + +## Workload Identity + +One canonical workload is generated over the global token batch and sliced by source rank. Expert +indices and gate weights are serialized. Activations use a versioned integer counter formula whose +BF16 values are exact across runtimes; its full identity is bound into the manifest. The manifest +also binds shape/EP coordinates and oracle version. 
SHA-256 covers canonical bytes and parameters; +library RNG regeneration is not proof of identity. + +Routing traffic distinguishes: + +- token-expert assignments, which determine expert compute load; and +- rank-deduplicated token payload copies, which determine EP activation traffic. + +Adapters may not generate routing or reinterpret one quantity as the other. + +## Measurement + +Normal mode uses `layout-and-dispatch-v1`: dispatch timing includes layout plus communication, and +combine returns activation payload through an unweighted rank-sum path. Low-latency mode uses +`expert-packed-weighted-combine-v1`: native DeepEP V1/UCCL APIs dispatch token-expert assignments and +perform gate-weighted combine. Expert-output staging is outside isolated combine timing and inside +measured paired roundtrip. Each component declares availability, origin, start/end states, stage +scope, and sample count. A paired-only API reports null isolated components. `isolated_sum` is +derived and never used for throughput or recommendations. Mode is series identity, and normal and +low-latency evidence cannot share a ranking cohort. + +Every measured component uses `fixed-512-v1`: + +- 64 trials x 8 timed iterations = 512 observations; +- 32 synchronized full dispatch-stage-combine warmups before each available measured component at + every trial/point; +- roundtrip first, then isolated dispatch and combine, with a fixed per-phase conditioning ladder; and +- per-iteration maximum latency across ranks before nearest-rank p50/p90/p95/p99. + +Measured roundtrip p99 is the headline latency. Retries remain separate attempts; a later success +does not erase earlier failures. Decode and prefill identify the serving regime represented by one +MoE-layer collective; they do not change the timed primitive at an otherwise identical shape. + +The NCCL/RCCL reference is an end-to-end Python adapter, not a bare fabric primitive. 
Its dispatch +boundary includes layout, count exchange, a device-to-host split synchronization, fresh receive +allocation, and four payload/metadata all-to-all calls; activation-only combine adds one all-to-all plus +scatter/reduction. Its p99 therefore measures the complete reference-adapter boundary and can be +host/scheduler-sensitive. It is useful for portable system controls but must not be labeled fabric, +link, bus, or single-collective latency. + +The versioned conditioning and EPLB planner contracts (reference trace, redundant count, and +placement/remap version) are part of scheduled and evidence identity. + +Logical payload bandwidth is: + +`logical_payload_bytes / measured_latency_seconds` + +Normal-mode payload bytes use rank-deduplicated token-rank activations; low-latency bytes use +token-expert assignments. Both add required scale bytes at the named boundary and exclude expert +metadata, padding, and backend buffer capacity. Algorithm bandwidth, bus bandwidth, wire utilization, +and physical-link utilization are not published without a defined primitive model or transport +counters. Logical bandwidth must never be labeled physical bandwidth. Published payload and token +rates are named `rate_at_latency_percentile`: bytes or tokens divided by the matching latency +percentile. They are lower-tail service rates at p99 latency, not p99 percentiles of an inverted rate +distribution. + +## Correctness + +An implementation-independent oracle uses an expert-specific deterministic transform so wrong +expert routing cannot pass an identity roundtrip. For every rank and point it verifies: + +1. destination rank/expert, source token, multiplicity, gate weight, and receive counts; +2. dispatched payload and metadata before timing; +3. combined output before timing; +4. unchanged semantic inputs through all timed samples; and +5. dispatched payload/metadata and combined output again after timing. 
+ +Normal-mode adapters use activation-only, unweighted rank-sum combine. The oracle builds each rank's +gate-weighted expert aggregate before combine, independently derives `sum(gate * expert(token))`, +and checks the dispatch metadata and transformed output. Low-latency adapters separately verify the +expert-packed source/expert assignment, native gate weights, and gate-weighted combined output. Both +contracts check every element with recorded `rtol=0.05` and `atol=0.02`. Any failed rank or point +makes the case ineligible. +Pre/post dispatch evidence is hashed in canonical source-token order. Native receive slots may be +assigned nondeterministically, so physical receive order is not treated as a correctness property. + +## Native Result + +One raw case document uses `format: "collectivex.ep.v1"`, rejects unknown fields, and contains: + +- `case`: stable case ID, suite, required tier, and coordinate; +- `workload`: canonical identity and logical MoE shape; +- `measurement`: sampling, component states, timing, and byte accounting; +- `implementation`: instantiated class/API, pinned source, loaded libraries, and resources; +- `topology`: requested and realized SKU, devices, placement, scale-up domain, and transport; +- `provenance`: source SHA, image/squash hashes, allocation, run, and attempt; +- `rows`: point latency, byte accounting, token rate, correctness, load, fanout, and anomaly evidence; and +- `outcome`: `success`, `failed`, `invalid`, `diagnostic`, or `unsupported`, with reasons. + +Raw result documents and exact samples pass through transient GitHub delivery artifacts before the +publisher archives them in the private bundle; they never enter the public tree. Private environment +details remain in local mode-0600 logs and ignored operator notes; they are never archived or +published. Every expected case has one terminal selected outcome while every attempt remains retained. 
+ +## Identity And Comparisons + +Canonical JSON produces three full SHA-256 IDs: + +- `series_id`: all locked factors except token coordinate and repeat allocation; +- `point_id`: `series_id` plus token coordinate; and +- `evidence_id`: `point_id` plus allocation/run/attempt/sample checksum. + +Locked factors include workload bytes, measurement and sampling contract, resources, realized +topology, implementation/build, loaded libraries, image/squash, runtime, and source SHA. +Deferred code generation is captured before measurement and recaptured afterward. DeepEP V2 uses a +fixed NVCC random seed and binds final cache keys plus generated-source and executable-SASS hashes; +raw CUBIN bytes remain private diagnostics. Hybrid binds its realized auto-tuned config and complete +kernel-key set while retaining rank-local shared-object hashes as private diagnostics. Locally built +extension hashes are diagnostic; their pinned source trees, build recipe, runtime, and dependencies +remain series-bound. +The series identity includes the case ID, which binds the complete scheduled token ladder and the +frozen percentile, rank-reduction, conditioning, warmup, and correctness semantics. + +A controlled comparison declares one contrast: + +- `library`: backend implementation and its tuned resource profile may differ; the realized system, + workload, EP, resource policy, source, and measurement remain matched; +- `chip`: a controlled platform contrast. The full realized system/topology and tuned resource + profile may differ while workload, EP, placement class, resource policy, backend lineage, source, + and measurement remain matched. It is not a silicon-only comparison; +- `system`: all hardware/backend differences stay visible while workload, EP, and measurement match; +- `routing`: routing distribution/EPLB differs while the static implementation build/generator, + system, model shape, resource profile, and measurement remain matched. 
Uniform and Zipf without + EPLB reuse the same generated implementation; EPLB's physical-expert/JIT configuration remains an + explicit treatment difference. + +Any undeclared mismatch rejects the overlay. Chip/system results describe measured systems, not +silicon alone. + +## Evidence Policy + +Capability declarations say what may be attempted; artifacts determine evidence status. Promotion +requires exact expected coverage with no missing, extra, duplicate, malformed, or heterogeneous +case. Public coverage preserves each matrix disposition; promotion requires every runnable case to +succeed and every planned-unsupported case to remain unsupported in every selected run. Only the +pinned canonical full-v1 matrix, with a decision-grade library, chip, system, and routing cohort, +may advance `dev-latest`; partial matrices remain diagnostic. The full-matrix digest intentionally +pins the exact workflow shard grouping as well as the requested cases, so changing `--max-cases` +or the SKU round-robin scheduling order produces diagnostic-only runs even when case coverage is +unchanged. Superseded retries, +planned-unsupported outcomes, and unstable comparison cohorts may render diagnostically but cannot +rank or recommend; every successful required series in a promoted dataset remains decision-grade. +Any failed, invalid, or diagnostic retry of a runnable case blocks promotion even if a later retry +succeeds. Routing cohorts are comparable-experimental sensitivities and never produce configuration +recommendations; official library/platform/system cohorts own actionable recommendations. + +A point becomes decision-grade only after three independent workflow runs and allocation IDs pass +correctness, identity, provenance, tail gates, p50/p99 repeat-stability thresholds, and stable ordering. The +publisher, not the frontend, computes eligibility, controlled cohorts, sensitivity pairs, and +recommendations. 
+ +## Execution Isolation + +Every non-MNNVL scale-out case uses operator-pinned socket and RDMA selectors. The launcher rejects +missing or partial profiles, then probes every allocated node for the configured interface, active +HCA port, and configured GID before backend initialization. It never substitutes a default route, +inherited runner environment, or transport fallback. Scale-up and MNNVL cases clear the profile; +scale-out NCCL/RCCL forces `NCCL_NET=IB` and exact HCA matching. Selector values remain in encrypted +config and mode-0600 private logs. + +Repository staging uses a pre-existing, runner-owned, group/world non-writable shared base outside +the checkout and workflow workspace. The parent process resolves the exact execution child before +copying, claims it with a runner-owned marker, and verifies that all allocated nodes can read and +write the same bytes. Cleanup waits for confirmed allocation teardown and removes only that child, +including a safely identified partial claim. The same-run V2/Hybrid source archive is fully validated +under fixed member and expanded-size bounds, and only the selected pinned root is extracted; a +symlink is accepted only when it is a relative leaf pointing to a regular member inside the same +backend root, followed by exact Git tree/submodule validation. + +## Artifact Validation And JIT Delivery + +There is no self-hosted service, Vercel storage, GCP, Neon, managed database, or managed object +store. 
The publication workflow uses runner-local temporary storage only as a disposable validation +and promotion workspace: + +```text +$COLLECTIVEX_STORE_ROOT/ + private/incoming/ # write-once downloaded GHA attempts + private/bundles/<bundle-id>/ # immutable source archives, native results/samples, matrix, checksums + private/quarantine/ # rejected attempts plus machine-readable reasons + public/datasets/<dataset-id>/ # immutable sanitized frontend datasets + public/channels/ # small atomic pointers: latest-attempt, dev-latest + locks/ +``` + +Private and public trees use separate permissions. JSON manifests and checksums are authoritative; +a rebuildable catalog is only an index. Raw sweep artifacts are transient publisher input; only the +sanitized promoted NDJSON is retained as a frontend publication artifact. + +Container tags are checked against pinned registry digests. Enroot imports use a fixed +`SOURCE_DATE_EPOCH` and versioned cache generation; every mounted squash is freshly hashed into +series identity. Image-provided DeepEP is also checked against exact per-architecture wheel and +installed-file fingerprints, so a stale cache cannot inherit the pinned source identity. +Source-built DeepEP V2 uses a separate mode-0700 cluster-local cache mounted only as `/cx-cache`. +Its content key binds a versioned build recipe, verified image digest, CPU/GPU architecture, +upstream source trees, and pinned build dependencies. The cache is never an artifact or publisher +input; per-execution source/results stages remain isolated and disposable, and marker plus runtime +probes fail closed before reuse. The runner UID is inside the trusted cluster boundary: this cache +guards against stale or accidental mutation, not hostile same-UID jobs. Only an unpublished partial +build may be reset automatically; a published cache that fails integrity or runtime checks is left +intact and rejected so a concurrent allocation cannot lose files it is using. + +Publication is fail-closed: + +1. 
acquire an exclusive filesystem lock and stage on the destination filesystem; +2. archive source bytes before parsing; +3. require the exact matrix-declared artifact set and reject every unconsumed archive member; +4. validate strict schemas, privacy, checksums, identities, timing, and exact matrix outcomes; +5. write checksums and `COMPLETE`, fsync, then atomically rename the private bundle; +6. build and validate the sanitized content-addressed dataset, fsync, then atomically rename it; +7. atomically replace `dev-latest.json` only when every promotion gate passes. + +Rejected attempts may update workspace `latest-attempt` but never `dev-latest`. The workspace is +destroyed with the publication runner and is never attached to the frontend. No artifact is emitted +unless all three selected bundles advance `dev-latest`. + +`publisher.py ingest` accepts the exact matrix plus one `--artifact` directory or ZIP per GitHub +artifact. `promote` accepts explicit immutable bundle IDs. Default `verify` requires +`latest-attempt`; it also verifies `dev-latest` when present, while an explicit +`--channel dev-latest` requires it. The workflow copies only the verified sanitized dataset to a +one-record `collectivex_public_v1_<sha256>.ndjson` artifact. Raw artifacts and private workspace +content are never bundled into the application. + +Sweeps default to `release_tag=unversioned`. The main-registered `collectivex-sweep.yml` owns +`sweep`, `publish-v1`, and `refresh-v1`, so its branch revision remains dispatchable. V1 emits a +marker bound to the run ID, first attempt, qualification index, source SHA, and locked matrix digest. +Publication accepts exactly three unique successful run IDs from one source SHA with qualification +indices 1, 2, and 3, downloads their immutable artifacts, and passes the same provenance assertions +to `publisher.py ingest`. Refresh requires an exact source run and dataset digest and reuploads the +same validated sanitized bytes. 
Partial, filtered, untagged, cross-source, rerun, failed, expired, +or digest-mismatched inputs fail closed. + +Using a server-side GitHub read token, the frontend discovers the latest successful version-scoped +publication run and downloads the publication artifact just in time. It requires exactly one root +NDJSON entry, validates UTF-8, schema, promotion status, and filename/body SHA-256, then exposes a +short-lived versioned channel pointer and immutable versioned dataset URL. The benchmark-version +selector currently exposes V1; later versions require separate release and publication identities. +The frontend never invents missing values, selects retries, or recomputes decision eligibility. + +## Legacy Data + +Numeric schemas 3-5 are outside the v1 publisher and frontend reader. They remain historical +diagnostic evidence and cannot seed `dev-latest` or drive v1 decisions. diff --git a/experimental/CollectiveX/docs/methodology_zh.md b/experimental/CollectiveX/docs/methodology_zh.md new file mode 100644 index 000000000..7f6dcb67a --- /dev/null +++ b/experimental/CollectiveX/docs/methodology_zh.md @@ -0,0 +1,297 @@ +# CollectiveX EP v1 契约 + +
+ +[English](./methodology.md) | **中文** + +
+ +本文档定义新的 CollectiveX 结果。历史运行笔记是 evidence,不是 contract。 + +## 产品边界 + +CollectiveX 是通信 microbenchmark,用于: + +- 在同一 chip/topology 上比较 EP libraries; +- 在相同 workload 下比较不同系统的 EP latency 和 logical payload bandwidth; +- 展示 unsupported、failed、invalid 和 unstable evidence,同时避免污染决策。 + +若没有单独的 correlation study,它不能预测 serving throughput。 + +## 矩阵 + +提升后的 workload 为 `deepseek-v3-v1`:hidden 7168、top-k 8、256 routed experts、BF16 +dispatch 和 combine、packed placement,以及 backend-tuned resources。每个 case 都显式选择 +normal `layout-and-dispatch-v1` 或 low-latency `expert-packed-weighted-combine-v1` 语义。 + +- `ep-core-v1`:uniform routing;decode T=1..128 的 2 次幂;prefill T=256/512。 +- `ep-routing-v1`:Zipf,EPLB off/on;decode T=128;prefill T=512。 +- `ep-low-latency-v1`:使用 DeepEP V1/UCCL 原生 low-latency API;uniform decode T=1..128 的 + 2 次幂;capability contract 会拒绝其他后端,不会伪造 low-latency path。 +- 规范矩阵范围:请求 608 个 cases / 1,600 个 token points;364 个可运行 cases / 940 个 + points,分布在 58 个可执行 workflow shards/allocation cells;244 个 unsupported cases / 660 个 + points。 + +| 系统 | EP8 | EP16 | +|---|---|---| +| H100/H200/B200/B300 | 1x8 NVLink,scale-up | 2x8 NVLink + RDMA,scale-out | +| MI325X/MI355X | 1x8 XGMI,scale-up | 2x8 XGMI + RDMA,scale-out | +| GB200/GB300 | 2x4 MNNVL,scale-up | 4x4 MNNVL,scale-up | + +物理主机数量不能定义通信范围。两个 GB 配置都位于同一个 72-GPU MNNVL scale-up domain 内。 + +Unsupported combinations 是 terminal outcomes,不会被静默跳过。DeepEP V2 指 PR #605 +引入的 `ElasticBuffer`,并固定使用 upstream PR #630 的最小纯 scale-up 修复。V2 的 scale-up +cases 请求 NCCL Device API LSA;若实际建立的 LSA team 未覆盖整个 EP world,则直接失败。x86 +EP16 scale-out 使用启用 GIN 的 hybrid path,并要求两个逻辑 scale-out domains(由两个物理 RDMA +ranks 表示)、每个 domain 八个 scale-up ranks。GB EP16 仍是 MNNVL scale-up,因此使用 LSA。 +Source 中声明的 NVIDIA capabilities 在 GPU outcomes 通过 native oracle 和 publisher gates 前仍为 +unvalidated。当前 runner pool 上的 H100 V2 在 v1 中被声明为 unsupported,因为 NCCL 2.30.4 +报告其 EP8 communicator 不具备 Device API symmetric-memory 支持;只有该 pool 恢复全 rank +CUDA P2P/LSA 支持后才能重新加入。已移除的轴包括 
`[cl]`、`[rv]`、quantization、alternate +activation/routing profiles、uneven allocation、placement +permutations、model envelopes 和 scaling。 +FlashInfer 因可重复出现的间歇性执行失败而排除在 v1 外;这些失败不会转为 planned-unsupported +coverage。 +MoRI EP8 在 normal mode 下使用 MI325X AsyncLL 或 MI355X IntraNode。EP16 固定使用 2x8 XGMI + +RDMA 上的 InterNodeV1,配置为 96 blocks、64 RDMA blocks、8 warps、每个 PE 一个 QP,以及 +external input。MoRI 的 AsyncLL transport 不属于真正的 low-latency suite contract,也绝不会 +以该模式标注。 + +## Workload 身份 + +一个 canonical workload 在 global token batch 上生成,再按 source rank 切分。Expert indices +和 gate weights 会序列化。Activations 使用带版本的整数计数器公式,其 BF16 值在不同 runtime +中精确一致;完整身份绑定到 manifest。Manifest 还绑定 shape/EP coordinates 和 oracle version。 +SHA-256 覆盖 canonical bytes 和 parameters;重新生成 library RNG 不能证明身份一致。 + +Routing traffic 区分: + +- token-expert assignments,决定 expert compute load; +- rank-deduplicated token payload copies,决定 EP activation traffic。 + +Adapters 不得生成 routing,也不得将两种量相互解释。 + +## 测量 + +Normal mode 使用 `layout-and-dispatch-v1`:dispatch timing 包括 layout 和 communication,combine +通过 unweighted rank-sum path 返回 activation payload。Low-latency mode 使用 +`expert-packed-weighted-combine-v1`:DeepEP V1/UCCL 原生 API dispatch token-expert assignments, +并执行 gate-weighted combine。Expert-output staging 不计入 isolated combine timing,但计入被测 +paired roundtrip。每个 component 声明 availability、origin、start/end states、stage scope 和 sample +count。仅有 paired API 时,isolated components 报 null。`isolated_sum` 为派生值,不用于 +throughput 或 recommendations。Mode 属于 series identity;normal 和 low-latency evidence 不能 +共用排名 cohort。 + +每个被测 component 均使用 `fixed-512-v1`: + +- 64 trials x 8 timed iterations = 512 observations; +- 每个 trial/point 的每个可用被测 component 前,执行 32 次同步完整 + dispatch-stage-combine warmups; +- 先测 roundtrip,再测 isolated dispatch 和 combine,并使用固定的 per-phase conditioning ladder; +- 每次 iteration 先取跨 rank 最大 latency,再以 nearest-rank 计算 p50/p90/p95/p99。 + +被测 roundtrip p99 是 headline latency。Retries 保持为独立 attempts;后续成功不会抹除早期失败。 +Decode 和 prefill 
表示一个 MoE-layer collective 所代表的 serving regime;在其他 shape 相同时, +它们不会改变 timed primitive。 + +NCCL/RCCL reference 是 end-to-end Python adapter,而不是 bare fabric primitive。其 dispatch +boundary 包含 layout、count exchange、device-to-host split synchronization、fresh receive +allocation,以及四次 payload/metadata all-to-all;activation-only combine 还包含一次 all-to-all 和 +scatter/reduction。因此其 p99 测量完整 reference-adapter boundary,可能对 host/scheduler 敏感。 +它可作为 portable system control,但不得标记为 fabric、link、bus 或 single-collective latency。 + +带版本的 conditioning 和 EPLB planner contracts(reference trace、redundant count 和 +placement/remap version)属于 scheduled 和 evidence identity。 + +Logical payload bandwidth 为: + +`logical_payload_bytes / measured_latency_seconds` + +Normal-mode payload bytes 使用按 rank 去重的 token-rank activations;low-latency bytes 使用 +token-expert assignments。两种模式都在命名边界上加入必需 scale bytes,并排除 expert metadata、 +padding 和 backend buffer capacity。若没有定义 primitive model 或 transport counters,不发布 +algorithm bandwidth、bus bandwidth、wire utilization 或 physical-link utilization。Logical +bandwidth 绝不能标为 physical bandwidth。已发布 payload 和 token rates 命名为 +`rate_at_latency_percentile`:bytes 或 tokens 除以对应 latency percentile。它们是 p99 latency +下的 lower-tail service rates,不是 inverted rate distribution 的 p99 percentiles。 + +## 正确性 + +与实现无关的 oracle 使用 expert-specific deterministic transform,使错误 expert routing 无法 +通过 identity roundtrip。它对每个 rank 和 point 验证: + +1. destination rank/expert、source token、multiplicity、gate weight 和 receive counts; +2. timing 前的 dispatched payload 和 metadata; +3. timing 前的 combined output; +4. 所有 timed samples 期间 semantic inputs 不变; +5. 
timing 后再次验证 dispatched payload/metadata 和 combined output。 + +Normal-mode adapters 使用 activation-only、unweighted rank-sum combine。Oracle 在 combine 前 +构造每个 rank 的 gate-weighted expert aggregate,独立计算 `sum(gate * expert(token))`,并检查 +dispatch metadata 和 transformed output。Low-latency adapters 单独验证 expert-packed +source/expert assignment、原生 gate weights 和 gate-weighted combined output。两个契约都使用 +已记录的 `rtol=0.05` 和 `atol=0.02` 检查每个 element。任一 rank 或 point 失败都会使 case +不合格。Pre/post dispatch evidence 按 +canonical source-token order 计算 hash。Native receive slots 可能非确定性分配,因此 physical +receive order 不作为 correctness property。 + +## Native 结果 + +单个 raw case document 使用 `format: "collectivex.ep.v1"`,拒绝未知 fields,并包含: + +- `case`:稳定 case ID、suite、required tier 和 coordinate; +- `workload`:canonical identity 和 logical MoE shape; +- `measurement`:sampling、component states、timing 和 byte accounting; +- `implementation`:实例化 class/API、固定 source、loaded libraries 和 resources; +- `topology`:requested 和 realized SKU、devices、placement、scale-up domain 和 transport; +- `provenance`:source SHA、image/squash hashes、allocation、run 和 attempt; +- `rows`:point latency、byte accounting、token rate、correctness、load、fanout 和 anomaly evidence; +- `outcome`:`success`、`failed`、`invalid`、`diagnostic` 或 `unsupported`,以及 reasons。 + +Raw result documents 和 exact samples 会先经过临时 GitHub delivery artifacts,再由 publisher +归档到 private bundle;它们不会进入 public tree。Private environment details 只保留在本地 +mode-0600 logs 和忽略的 operator notes 中;不会归档或发布。每个 expected case 有一个 terminal +selected outcome,同时保留每次 attempt。 + +## 身份与比较 + +Canonical JSON 生成三个完整 SHA-256 IDs: + +- `series_id`:除 token coordinate 和 repeat allocation 外的所有 locked factors; +- `point_id`:`series_id` 加 token coordinate; +- `evidence_id`:`point_id` 加 allocation/run/attempt/sample checksum。 + +Locked factors 包括 workload bytes、measurement 和 sampling contract、resources、realized +topology、implementation/build、loaded libraries、image/squash、runtime 和 source SHA。 +Deferred code 
generation 会在 measurement 前捕获,并在之后再次捕获。DeepEP V2 使用固定的 +NVCC random seed,并绑定最终 cache keys、generated-source hashes 与 executable-SASS hashes; +raw CUBIN bytes 仅保留为 private diagnostics。Hybrid 绑定实际自动调优配置与完整 kernel-key +set,同时将各 rank 的 shared-object hashes 仅保留为 private diagnostics。本地构建的 extension +hashes 属于 diagnostic;其固定 source trees、build recipe、runtime 与 dependencies 仍绑定到 +series。 +Series identity 包含 case ID;case ID 绑定完整 scheduled token ladder,以及固定的 percentile、 +rank-reduction、conditioning、warmup 和 correctness semantics。 + +Controlled comparison 只声明一个 contrast: + +- `library`:backend implementation 及其 tuned resource profile 可以不同;realized system、 + workload、EP、resource policy、source 和 measurement 必须匹配; +- `chip`:受控 platform contrast。完整 realized system/topology 和 tuned resource profile 可以不同, + 但 workload、EP、placement class、resource policy、backend lineage、source 和 measurement 必须 + 匹配。它不是 silicon-only comparison; +- `system`:保留所有 hardware/backend 差异,同时匹配 workload、EP 和 measurement; +- `routing`:routing distribution/EPLB 可以不同,但 static implementation build/generator、system、 + model shape、resource profile 和 measurement 必须匹配。未启用 EPLB 的 Uniform 和 Zipf 复用 + 同一 generated implementation;EPLB 的 physical-expert/JIT configuration 是显式 treatment + difference。 + +任何未声明的 mismatch 都会拒绝 overlay。Chip/system results 描述 measured systems,而非仅描述 +silicon。 + +## Evidence 策略 + +Capability declarations 说明可以尝试什么;artifacts 决定 evidence status。Promotion 要求完整的 +expected coverage,不能有 missing、extra、duplicate、malformed 或 heterogeneous case。Public +coverage 保留每个 matrix disposition;promotion 要求每个 runnable case 在所有 selected runs 中 +成功,且每个 planned-unsupported case 始终为 unsupported。只有固定 canonical full-v1 matrix, +且具有 decision-grade library、chip、system 和 routing cohort,才能推进 `dev-latest`;partial +matrices 仍为 diagnostic。Full-matrix digest 有意绑定精确 workflow shard grouping 和 requested +cases,因此即使 case coverage 不变,修改 `--max-cases` 或 SKU round-robin scheduling order 也只 +会产生 diagnostic-only runs。Superseded 
retries、planned-unsupported outcomes 和 unstable +comparison cohorts 可以用于诊断展示,但不能排名或推荐;promoted dataset 中每个成功的 required +series 都必须保持 decision-grade。Runnable case 的任何 failed、invalid 或 diagnostic retry 都会 +阻止 promotion,即使后续 retry 成功。Routing cohorts 是 comparable-experimental sensitivities, +不会产生 configuration recommendations;official library/platform/system cohorts 才能产生可执行 +recommendations。 + +一个 point 只有在三个独立 workflow runs 和 allocation IDs 均通过 correctness、identity、 +provenance、tail gates、p50/p99 repeat-stability thresholds 和 stable ordering 后才成为 +decision-grade。Eligibility、controlled cohorts、sensitivity pairs 和 recommendations 由 +publisher 而非 frontend 计算。 + +## 执行隔离 + +每个非 MNNVL scale-out case 都使用 operator 固定的 socket 与 RDMA selectors。Launcher 会拒绝 +缺失或不完整的 profile,并在 backend 初始化前逐个 allocation 节点检查已配置 interface、active +HCA port 与指定 GID。它不会改用 default route、继承的 runner environment 或 transport +fallback。Scale-up 和 MNNVL case 会清除该 profile;scale-out NCCL/RCCL 强制设置 +`NCCL_NET=IB` 并精确匹配 HCA。Selector values 只保留在加密配置和 mode-0600 private logs 中。 + +Repository staging 使用 checkout 与 workflow workspace 外预创建的 shared base;该 base 由 +runner owner 持有,group/world 均不可写。父进程在复制前解析精确 execution child,以 +runner-owned marker 声明所有权,并验证所有 allocation 节点读写的是同一份 bytes。Cleanup 会 +等待 allocation teardown 得到确认,并只删除该 child,包括可安全识别的未完成 claim。同一 run +的 V2/Hybrid source archive 会在固定 member 数和解压大小上限内完整验证,并且只提取所选 fixed +root;仅当相对 leaf symlink 指向同一 backend root 内的 regular member 时才允许创建,之后还要 +通过精确 Git tree/submodule 校验。 + +## 产物验证与即时交付 + +不使用 self-hosted service、Vercel storage、GCP、Neon、managed database 或 managed object +store。Publication workflow 仅将 runner 本地临时存储用作可丢弃的 validation 与 promotion +工作区: + +```text +$COLLECTIVEX_STORE_ROOT/ + private/incoming/ # write-once downloaded GHA attempts + private/bundles/<bundle-id>/ # immutable source archives, native results/samples, matrix, checksums + private/quarantine/ # rejected attempts plus machine-readable reasons + public/datasets/<dataset-id>/ # immutable sanitized frontend datasets + 
public/channels/ # small atomic pointers: latest-attempt, dev-latest + locks/ +``` + +Private 和 public trees 使用不同 permissions。JSON manifests 和 checksums 是权威记录;可重建 +catalog 仅为 index。Raw sweep artifacts 只是 publisher 的临时输入;只有清理并完成 promotion +的 NDJSON 会保留为前端 publication artifact。 + +Container tags 会与固定 registry digests 核对。Enroot imports 使用固定 +`SOURCE_DATE_EPOCH` 和 versioned cache generation;每个 mounted squash 都重新计算 hash 并纳入 +series identity。Image-provided DeepEP 也按精确 per-architecture wheel 和 installed-file +fingerprints 检查,因此 stale cache 不能继承固定 source identity。 +Source-built DeepEP V2 使用独立的 mode-0700 cluster-local cache,并且只以 `/cx-cache` 挂载。 +其 content key 绑定版本化 build recipe、verified image digest、CPU/GPU architecture、 +upstream source trees 和固定 build dependencies。该 cache 既不是 artifact,也不是 publisher +input;每次执行的 source/results stage 仍然隔离且可丢弃,并在复用前以 marker 和 runtime probe +fail closed。Runner UID 属于受信任的 cluster boundary:该 cache 用于防止 stale 或意外修改, +不防御恶意的同 UID job。只有从未发布的 partial build 才能自动重置;已发布 cache 一旦未通过 +integrity 或 runtime 检查,将保持原样并被拒绝,避免并发 allocation 正在使用的文件被删除。 + +Publication 采用 fail-closed: + +1. 获取 exclusive filesystem lock,并在 destination filesystem 上 stage; +2. 解析前归档 source bytes; +3. 要求精确 matrix-declared artifact set,并拒绝每个未消费 archive member; +4. 验证 strict schemas、privacy、checksums、identities、timing 和精确 matrix outcomes; +5. 写入 checksums 和 `COMPLETE`,fsync,然后原子 rename private bundle; +6. 构建并验证 sanitized content-addressed dataset,fsync,然后原子 rename; +7. 
仅在全部 promotion gates 通过后原子替换 `dev-latest.json`。 + +Rejected attempts 可以更新工作区中的 `latest-attempt`,但不能更新 `dev-latest`。工作区会随 +publication runner 销毁,且绝不连接到前端。只有三个选定 bundles 全部推进 `dev-latest` 后才会 +生成 artifact。 + +`publisher.py ingest` 接受精确 matrix,并为每个 GitHub artifact 接受一个 `--artifact` directory +或 ZIP。`promote` 接受显式 immutable bundle IDs。默认 `verify` 要求 `latest-attempt`;若存在 +`dev-latest` 也会验证,而显式 `--channel dev-latest` 则要求其存在。Workflow 只会将通过验证并 +清理后的 dataset 复制到单记录 `collectivex_public_v1_.ndjson` artifact。Raw artifacts 和 +private workspace 内容绝不打包进应用。 + +Sweeps 默认使用 `release_tag=unversioned`。选择 `v1` 时必须匹配固定的完整 matrix digest,并 +生成绑定 run ID、attempt、source SHA 与 matrix SHA-256 的 marker。手动 publication workflow +只接受三个唯一、成功、来自同一 source SHA 的 `CollectiveX Sweep` run IDs;它会重新校验 +metadata 与精确 markers,下载 immutable artifacts,并将相同 provenance assertions 传给 +`publisher.py ingest`。Partial、filtered、untagged、跨 source、失败或过期的输入都会 fail closed。 + +前端使用 server-side GitHub read token,即时发现最新成功且按版本隔离的 publication run,并 +下载 publication artifact。它要求 ZIP 根目录只有一个 NDJSON entry,校验 UTF-8、schema、 +promotion 状态及 filename/body SHA-256,随后提供短期缓存的带版本 channel pointer 和 immutable +带版本 dataset URL。Benchmark-version selector 当前只显示 V1;后续版本必须使用独立的 release +与 publication identity。前端不会虚构 missing values、选择 retries,或重新计算 decision +eligibility。 + +## 历史数据 + +Numeric schemas 3-5 不在 v1 publisher 和 frontend reader 范围内。它们仍是 historical +diagnostic evidence,不能作为 `dev-latest` 初始数据或驱动 v1 decisions。 diff --git a/experimental/CollectiveX/identity.py b/experimental/CollectiveX/identity.py new file mode 100644 index 000000000..3b263cc9c --- /dev/null +++ b/experimental/CollectiveX/identity.py @@ -0,0 +1,404 @@ +#!/usr/bin/env python3 +"""Canonical, cross-runtime identities for CollectiveX v1.""" +from __future__ import annotations + +import hashlib +import json +import re +from copy import deepcopy +from typing import Any + +IDENTITY_VERSION = 1 +MAX_SAFE_INTEGER = (1 << 53) - 1 +PREFIXES = { + "case": "cxcase-v1-", + "workload": 
"cxwork-v1-", + "series": "cxseries-v1-", + "point": "cxpoint-v1-", + "evidence": "cxevidence-v1-", + "allocation": "cxallocation-v1-", + "attempt": "cxattempt-v1-", +} +V1_NORMAL_CASE_PROFILE = { + "activation_generator": "collectivex-activation-counter-v4", + "activation_profile": "canonical-counter-source-v4", + "combine_dtype": "bf16", + "combine_quant_mode": "none", + "combine_semantics": "activation-only", + "component_order_contract": "qualification-hash-rotated-components-v1", + "conditioning_contract": "fixed-phase-ramp-8-roundtrips-v1", + "contract": "layout-and-dispatch-v1", + "correctness_scope": "dispatch-metadata-and-transformed-combine", + "dtype": "bf16", + "eplb_planner": "greedy-rank-major-v1", + "eplb_redundant_experts": 32, + "eplb_reference_tokens_per_rank": 2048, + "mode": "normal", + "oracle_contract": "expert-specific-transform-v1", + "oracle_tolerances": "rtol=0.05,atol=0.02", + "payload_unit": "token-rank", + "placement": "packed", + "percentile_method": "nearest-rank", + "rank_reduction": "cross-rank-max-per-iteration", + "resource_mode": "fixed-profile", + "routing_generator": "collectivex-routing-counter-v3", + "sampling_contract": "fixed-512-v1", + "seed": 67, + "source_identity_contract": "bounded-sign-bit-source-v1", +} + +V1_LOW_LATENCY_CASE_PROFILE = { + **V1_NORMAL_CASE_PROFILE, + "component_order_contract": "qualification-hash-rotated-components-v1", + "combine_semantics": "gate-weighted", + "contract": "expert-packed-weighted-combine-v1", + "correctness_scope": "expert-assignment-and-weighted-combine", + "mode": "low-latency", + "oracle_contract": "expert-assignment-transform-v1", + "payload_unit": "token-expert", +} + +# Compatibility alias for normal-mode callers. New scheduling and validation +# must select a profile from the explicit case mode. 
+V1_CASE_PROFILE = V1_NORMAL_CASE_PROFILE +V1_CASE_PROFILES = { + "normal": V1_NORMAL_CASE_PROFILE, + "low-latency": V1_LOW_LATENCY_CASE_PROFILE, +} + +V1_CONTROL_PRECISION_PROFILE = "d-bf16.c-bf16" +V1_NORMAL_PRECISION_PROFILE_IDS = ( + "d-fp8-e4m3fn-b128-f32-prequantized.c-bf16", + "d-fp8-e4m3fnuz-b128-f32-prequantized.c-bf16", + "d-bf16.c-fp8-e4m3fn-direct-cast-noscale", + "d-fp8-e4m3fn-b128-f32-prequantized.c-fp8-e4m3fn-direct-cast-noscale", + "d-bf16.c-fp8-e4m3fnuz-direct-cast-noscale", + "d-fp8-e4m3fnuz-b128-f32-prequantized.c-fp8-e4m3fnuz-direct-cast-noscale", +) +V1_LOW_LATENCY_PRECISION_PROFILE_IDS = ( + "d-fp8-e4m3fn-b128-f32-fused.c-bf16", + "d-bf16.c-logfmt10-dynamic64", + "d-fp8-e4m3fn-b128-f32-fused.c-logfmt10-dynamic64", +) + + +def _communication_axis( + *, + api_input_dtype: str, + api_output_dtype: str, + communication_format: str, + scale_dtype: str | None, + scale_layout: str, + scale_group_size: int | None, + padding_contract: str, + alignment_contract: str, + quantization_origin: str, + conversion_boundary: str, +) -> dict[str, Any]: + return { + "api_input_dtype": api_input_dtype, + "api_output_dtype": api_output_dtype, + "communication_format": communication_format, + "scale_dtype": scale_dtype, + "scale_layout": scale_layout, + "scale_group_size": scale_group_size, + "padding_contract": padding_contract, + "alignment_contract": alignment_contract, + "quantization_origin": quantization_origin, + "conversion_boundary": conversion_boundary, + } + + +_BF16_AXIS = _communication_axis( + api_input_dtype="bf16", + api_output_dtype="bf16", + communication_format="bf16", + scale_dtype=None, + scale_layout="none", + scale_group_size=None, + padding_contract="none", + alignment_contract="native-bf16-vector-alignment", + quantization_origin="none", + conversion_boundary="none", +) +_FP8_E4M3FN_PREQUANTIZED_DISPATCH = _communication_axis( + api_input_dtype="fp8-e4m3fn-with-f32-scale", + api_output_dtype="fp8-e4m3fn-with-f32-scale", + 
communication_format="fp8-e4m3fn", + scale_dtype="f32", + scale_layout="per-token-hidden-block", + scale_group_size=128, + padding_contract="right-zero-pad-hidden-to-128", + alignment_contract="hidden-block-128", + quantization_origin="caller-prequantized", + conversion_boundary="before-dispatch-timing", +) +_FP8_E4M3FNUZ_PREQUANTIZED_DISPATCH = _communication_axis( + api_input_dtype="fp8-e4m3fnuz-with-f32-scale", + api_output_dtype="fp8-e4m3fnuz-with-f32-scale", + communication_format="fp8-e4m3fnuz", + scale_dtype="f32", + scale_layout="per-token-hidden-block", + scale_group_size=128, + padding_contract="right-zero-pad-hidden-to-128", + alignment_contract="hidden-block-128", + quantization_origin="caller-prequantized", + conversion_boundary="before-dispatch-timing", +) +_FP8_E4M3FN_FUSED_DISPATCH = _communication_axis( + api_input_dtype="bf16", + api_output_dtype="fp8-e4m3fn-with-f32-scale", + communication_format="fp8-e4m3fn", + scale_dtype="f32", + scale_layout="per-token-hidden-block", + scale_group_size=128, + padding_contract="right-zero-pad-hidden-to-128", + alignment_contract="hidden-block-128", + quantization_origin="backend-fused", + conversion_boundary="inside-dispatch-timing", +) +_LOGFMT10_DYNAMIC64_COMBINE = _communication_axis( + api_input_dtype="bf16", + api_output_dtype="bf16", + communication_format="logfmt10", + scale_dtype="implicit-logfmt10", + scale_layout="dynamic-per-64-values", + scale_group_size=64, + padding_contract="right-zero-pad-values-to-64", + alignment_contract="value-block-64", + quantization_origin="backend-internal", + conversion_boundary="inside-combine-timing", +) +_FP8_E4M3FN_DIRECT_CAST_COMBINE = _communication_axis( + api_input_dtype="bf16", + api_output_dtype="bf16", + communication_format="fp8-e4m3fn", + scale_dtype=None, + scale_layout="none", + scale_group_size=None, + padding_contract="none", + alignment_contract="native-fp8-vector-alignment", + quantization_origin="backend-internal-direct-cast", + 
conversion_boundary="inside-combine-timing", +) +_FP8_E4M3FNUZ_DIRECT_CAST_COMBINE = _communication_axis( + api_input_dtype="bf16", + api_output_dtype="bf16", + communication_format="fp8-e4m3fnuz", + scale_dtype=None, + scale_layout="none", + scale_group_size=None, + padding_contract="none", + alignment_contract="native-fp8-vector-alignment", + quantization_origin="backend-internal-direct-cast", + conversion_boundary="inside-combine-timing", +) + +V1_PRECISION_PROFILES: dict[str, dict[str, Any]] = { + V1_CONTROL_PRECISION_PROFILE: { + "modes": ["normal", "low-latency"], + "dispatch": _BF16_AXIS, + "combine": _BF16_AXIS, + }, + "d-fp8-e4m3fn-b128-f32-prequantized.c-bf16": { + "modes": ["normal"], + "dispatch": _FP8_E4M3FN_PREQUANTIZED_DISPATCH, + "combine": _BF16_AXIS, + }, + "d-fp8-e4m3fnuz-b128-f32-prequantized.c-bf16": { + "modes": ["normal"], + "dispatch": _FP8_E4M3FNUZ_PREQUANTIZED_DISPATCH, + "combine": _BF16_AXIS, + }, + "d-fp8-e4m3fn-b128-f32-fused.c-bf16": { + "modes": ["low-latency"], + "dispatch": _FP8_E4M3FN_FUSED_DISPATCH, + "combine": _BF16_AXIS, + }, + "d-bf16.c-logfmt10-dynamic64": { + "modes": ["low-latency"], + "dispatch": _BF16_AXIS, + "combine": _LOGFMT10_DYNAMIC64_COMBINE, + }, + "d-fp8-e4m3fn-b128-f32-fused.c-logfmt10-dynamic64": { + "modes": ["low-latency"], + "dispatch": _FP8_E4M3FN_FUSED_DISPATCH, + "combine": _LOGFMT10_DYNAMIC64_COMBINE, + }, + "d-bf16.c-fp8-e4m3fn-direct-cast-noscale": { + "modes": ["normal"], + "dispatch": _BF16_AXIS, + "combine": _FP8_E4M3FN_DIRECT_CAST_COMBINE, + }, + "d-fp8-e4m3fn-b128-f32-prequantized.c-fp8-e4m3fn-direct-cast-noscale": { + "modes": ["normal"], + "dispatch": _FP8_E4M3FN_PREQUANTIZED_DISPATCH, + "combine": _FP8_E4M3FN_DIRECT_CAST_COMBINE, + }, + "d-bf16.c-fp8-e4m3fnuz-direct-cast-noscale": { + "modes": ["normal"], + "dispatch": _BF16_AXIS, + "combine": _FP8_E4M3FNUZ_DIRECT_CAST_COMBINE, + }, + "d-fp8-e4m3fnuz-b128-f32-prequantized.c-fp8-e4m3fnuz-direct-cast-noscale": { + "modes": ["normal"], + 
"dispatch": _FP8_E4M3FNUZ_PREQUANTIZED_DISPATCH, + "combine": _FP8_E4M3FNUZ_DIRECT_CAST_COMBINE, + }, +} + + +def case_profile(mode: str) -> dict[str, Any]: + """Return the immutable measurement profile for one scheduled mode.""" + try: + return V1_CASE_PROFILES[mode] + except KeyError as exc: + raise IdentityError(f"unknown CollectiveX case mode {mode!r}") from exc + + +def precision_profile(name: str) -> dict[str, Any]: + """Return one exact dispatch/combine communication-format profile.""" + try: + profile = V1_PRECISION_PROFILES[name] + except KeyError as exc: + raise IdentityError(f"unknown CollectiveX precision profile {name!r}") from exc + return {"profile_id": name, **deepcopy(profile)} + + +def profile_for_case(case: dict[str, Any]) -> dict[str, Any]: + """Resolve a scheduled case's explicit mode to its identity profile.""" + mode = case.get("mode") + if not isinstance(mode, str): + raise IdentityError("scheduled case mode is missing") + base = case_profile(mode) + precision_name = case.get("precision_profile") + if precision_name is None: + return base + if not isinstance(precision_name, str): + raise IdentityError("scheduled case precision_profile must be a string") + precision = precision_profile(precision_name) + if mode not in precision["modes"]: + raise IdentityError( + f"precision profile {precision_name!r} is not valid in mode {mode!r}" + ) + return {**base, "communication_precision": precision} + + +class IdentityError(ValueError): + """An identity payload cannot be represented consistently across runtimes.""" + + +def _validate(value: Any, path: str = "$") -> None: + if value is None or isinstance(value, bool): + return + if isinstance(value, str): + if any(ord(character) < 0x20 or ord(character) > 0x7E for character in value): + raise IdentityError(f"{path}: string must contain printable ASCII only") + return + if type(value) is int: + if abs(value) > MAX_SAFE_INTEGER: + raise IdentityError(f"{path}: integer exceeds the cross-runtime safe 
range") + return + if isinstance(value, list): + for index, item in enumerate(value): + _validate(item, f"{path}[{index}]") + return + if isinstance(value, dict): + for key, item in value.items(): + if not isinstance(key, str): + raise IdentityError(f"{path}: object key is not a string") + if any(ord(character) < 0x20 or ord(character) > 0x7E for character in key): + raise IdentityError(f"{path}: object key must contain printable ASCII only") + _validate(item, f"{path}.{key}") + return + raise IdentityError(f"{path}: unsupported identity value {type(value).__name__}") + + +def canonical_bytes(value: Any) -> bytes: + """Return compact UTF-8 JSON after enforcing the portable value subset.""" + _validate(value) + return json.dumps( + value, + ensure_ascii=False, + allow_nan=False, + sort_keys=True, + separators=(",", ":"), + ).encode("utf-8") + + +def digest(kind: str, value: Any) -> str: + """Hash a typed v1 identity payload and return its typed identifier.""" + try: + prefix = PREFIXES[kind] + except KeyError as exc: + raise IdentityError(f"unknown identity kind {kind!r}") from exc + body = {"kind": kind, "value": value, "version": IDENTITY_VERSION} + return prefix + hashlib.sha256(canonical_bytes(body)).hexdigest() + + +def is_typed_id(value: Any, kind: str) -> bool: + prefix = PREFIXES.get(kind) + return bool( + isinstance(value, str) + and prefix + and re.fullmatch(re.escape(prefix) + r"[0-9a-f]{64}", value) + ) + + +def case_id(*, sku: str, profile: dict[str, Any], case: dict[str, Any]) -> str: + return digest("case", {"case": case, "profile": profile, "sku": sku}) + + +def workload_id(value: dict[str, Any]) -> str: + return digest("workload", value) + + +def series_id(value: dict[str, Any]) -> str: + return digest("series", value) + + +def point_id(*, series: str, tokens_per_rank: int) -> str: + return digest("point", {"series_id": series, "tokens_per_rank": tokens_per_rank}) + + +def allocation_id(value: dict[str, Any]) -> str: + return digest("allocation", 
value) + + +def attempt_id(*, allocation: str, case: str, ordinal: int) -> str: + return digest( + "attempt", {"allocation_id": allocation, "case_id": case, "ordinal": ordinal} + ) + + +def evidence_id( + *, point: str, allocation: str, attempt: str, sample_sha256: str +) -> str: + return digest( + "evidence", + { + "allocation_id": allocation, + "attempt_id": attempt, + "point_id": point, + "sample_sha256": sample_sha256, + }, + ) + + +IDENTITY_TEST_VECTOR = { + "payload": {"backend": "deepep", "ep": 8, "shape": [7168, 8, 256]}, + "series_id": "cxseries-v1-a79bf758488e3edd50f5531f3af825f371bf42aae7c4097e461fd2a32615af81", +} + + +def verify_test_vector() -> None: + observed = series_id(IDENTITY_TEST_VECTOR["payload"]) + if observed != IDENTITY_TEST_VECTOR["series_id"]: + raise IdentityError( + f"identity implementation differs: {observed} != {IDENTITY_TEST_VECTOR['series_id']}" + ) + + +if __name__ == "__main__": + verify_test_vector() + print(IDENTITY_TEST_VECTOR["series_id"]) diff --git a/experimental/CollectiveX/launchers/launch_gb-nv.sh b/experimental/CollectiveX/launchers/launch_gb-nv.sh new file mode 100644 index 000000000..21aae4c13 --- /dev/null +++ b/experimental/CollectiveX/launchers/launch_gb-nv.sh @@ -0,0 +1,110 @@ +#!/usr/bin/env bash +# CollectiveX shared GB200/GB300 NVL72 (aarch64) launcher. +# shellcheck disable=SC2016,SC2034 +# +# EP8/EP16 use one Slurm task per GPU across two or four trays in the same +# MNNVL scale-up domain. +set -euo pipefail +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CX_DIR="$(cd "$HERE/.." && pwd)"; REPO_ROOT="$(cd "$CX_DIR/../.." 
&& pwd)" +# shellcheck source=../runtime/common.sh +source "$HERE/../runtime/common.sh" + +PRODUCT="${CX_SHARD_SKU:-${CX_GB_PRODUCT:-${CX_PUBLIC_RUNNER:-}}}" +case "$PRODUCT" in + gb200|gb300) ;; + *) cx_die "set CX_SHARD_SKU or CX_PUBLIC_RUNNER to gb200 or gb300" ;; +esac +RUNNER="$PRODUCT" +export CX_RUNNER="$RUNNER" CX_BENCH="${CX_BENCH:-deepep}" +export CX_IMAGE_PLATFORM=linux/arm64 +JOB_ID="" +cx_install_launcher_fail_safe +cx_set_failure_stage setup +cx_load_operator_config +cx_lock_canonical_gha_env "$RUNNER" +NODES="${CX_NODES:-2}"; GPN="${CX_GPUS_PER_NODE:-4}" +SCALE_UP_DOMAIN="${CX_SCALE_UP_DOMAIN:-72}" +EXPECTED_WORLD=$((NODES * GPN)) +NGPUS="${CX_NGPUS:-$EXPECTED_WORLD}" +if [ "$PRODUCT" = gb200 ]; then default_time=30; else default_time=90; fi +TIME_MIN="${CX_TIME:-$default_time}" +[ "$NODES" = 2 ] || [ "$NODES" = 4 ] \ + || cx_die "$PRODUCT v1 supports two or four four-GPU trays" +[ "$GPN" = 4 ] || cx_die "$PRODUCT requires four GPUs per tray" +[ "$SCALE_UP_DOMAIN" = 72 ] || cx_die "$PRODUCT requires the NVL72 scale-up domain" +[ "$NGPUS" = "$EXPECTED_WORLD" ] \ + || cx_die "$PRODUCT world size must equal nodes x GPUs per tray" +cx_apply_timing_profile +IMAGE="${CX_IMAGE:-$(cx_default_image "$PRODUCT")}" +TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)" +export CX_RUNNER="$RUNNER" CX_TS="$TS" CX_TOPO="${PRODUCT}-nvl72-mnnvl" +export CX_SCOPE=scale-up CX_TRANSPORT=mnnvl CX_SCALE_UP_TRANSPORT=mnnvl +export CX_NODES="$NODES" CX_GPUS_PER_NODE="$GPN" CX_SCALE_UP_DOMAIN="$SCALE_UP_DOMAIN" +export CX_NGPUS="$NGPUS" +unset CX_SCALE_OUT_TRANSPORT +case "$CX_BENCH" in + deepep|deepep-v2|deepep-hybrid|nccl-ep) ;; + *) cx_die "unsupported $PRODUCT EP backend: $CX_BENCH" ;; +esac +cx_validate_shard_control "$CX_DIR" +cx_require_vars CX_PARTITION CX_ACCOUNT CX_SQUASH_DIR CX_STAGE_DIR +[ "$PRODUCT" != gb300 ] || cx_require_vars CX_ENROOT_CACHE_PATH +PARTITION="$CX_PARTITION"; ACCOUNT="$CX_ACCOUNT"; SQUASH_DIR="$CX_SQUASH_DIR" +[ -z "${CX_ENROOT_CACHE_PATH:-}" ] || export 
ENROOT_CACHE_PATH="$CX_ENROOT_CACHE_PATH" +export NCCL_CUMEM_ENABLE=1 NCCL_MNNVL_ENABLE=1 MC_FORCE_MNNVL=1 +cx_apply_network_profile "$NODES" "$CX_TRANSPORT" + +cx_log "$PRODUCT nodes=$NODES x ${GPN}gpu world=$NGPUS bench=$CX_BENCH" +[ "${CX_DRYRUN:-0}" = 1 ] && { cx_log "DRYRUN"; exit 0; } +cx_set_failure_stage registry-verification +cx_verify_registry_image "$IMAGE" +cx_set_failure_stage repository-stage +MOUNT_SRC="$(cx_stage_path "$REPO_ROOT" "$CX_STAGE_DIR")" +cx_stage_repo "$REPO_ROOT" "$MOUNT_SRC" +cx_prepare_runtime_marker "$MOUNT_SRC" +CONTAINER_MOUNTS="$MOUNT_SRC:/ix" +if [ "$CX_BENCH" = deepep-v2 ] || [ "$CX_BENCH" = deepep-hybrid ]; then + cx_set_failure_stage backend-setup + cx_prepare_backend_source "$MOUNT_SRC" "$CX_BENCH" \ + || cx_die "cannot stage the pinned backend source" + export CX_BACKEND_SOURCE_ROOT=/ix/experimental/CollectiveX/.cx_sources +fi +if [ "$CX_BENCH" = deepep-v2 ]; then + cx_prepare_backend_cache "$CX_SQUASH_DIR" \ + || cx_die "cannot prepare the isolated backend cache" + CONTAINER_MOUNTS="$CONTAINER_MOUNTS,$CX_PREPARED_BACKEND_CACHE:/cx-cache" + export CX_BACKEND_CACHE_ROOT=/cx-cache +fi + +cx_set_failure_stage scheduler-allocation +command -v salloc >/dev/null || cx_die "salloc not found" +cx_salloc_jobid --partition="$PARTITION" --account="$ACCOUNT" --nodes="$NODES" \ + --gres=gpu:"$GPN" --ntasks-per-node="$GPN" --exclusive --mem=0 --cpus-per-task=35 \ + --time="$TIME_MIN" +[ -n "$JOB_ID" ] || cx_die "no JOB_ID from salloc" +cx_set_failure_stage container-import +SQUASH_FILE="$(cx_ensure_squash_on_job "$JOB_ID" "$SQUASH_DIR" "$IMAGE")" +cx_set_failure_stage container-hash +cx_export_squash_identity "$SQUASH_FILE" +cx_preflight_allocation "$JOB_ID" "$NODES" "$MOUNT_SRC" "$SQUASH_FILE" \ + "${CX_SHARD_FILE:-}" + +# Keep the loader policy here because it is platform/container specific and +# security tests evaluate this literal independently. 
+SOURCE_BACKEND_ENV='case "${SLURM_NODEID:-}" in ""|*[!0-9]*) exit 66;; esac; env_file="/ix/experimental/CollectiveX/.cx_backend/env/node-${SLURM_NODEID}.sh"; env_root="${env_file%/*}"; [ -d "$env_root" ] && [ ! -L "$env_root" ] || exit 66; case "$(stat -c "%a" "$env_root")" in 700|[1-7]700) ;; *) exit 66;; esac; [ -f "$env_file" ] && [ -r "$env_file" ] && [ ! -L "$env_file" ] && [ "$(stat -c "%u:%a" "$env_file")" = "$(stat -c "%u" "$env_root"):600" ] || exit 66; . "$env_file" || exit 66' +BACKEND_PROBE="$SOURCE_BACKEND_ENV"'; case "$CX_BENCH" in deepep) python3 -c "from deep_ep import Buffer";; deepep-v2) python3 -c "import deep_ep; assert hasattr(deep_ep, '\''ElasticBuffer'\'')";; deepep-hybrid) python3 -c "import deep_ep; assert hasattr(deep_ep, '\''HybridEPBuffer'\'')";; nccl-ep) python3 -c "import torch";; esac' +WRAP="${SOURCE_BACKEND_ENV}"$'\n'"$(cx_slurm_rank_wrapper)" +CX_DISTRIBUTED_CONTAINER_ARGS=(--container-writable --container-remap-root) +[ "$CX_BENCH" != deepep ] || export CX_ALLOW_MNNVL=1 +run_rc=0 +cx_set_failure_stage container-launch +cx_run_distributed_shard || run_rc=$? + +cx_adopt_runtime_stage "$MOUNT_SRC" +collect_rc=0 +cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" || collect_rc=$? +[ "$run_rc" != 0 ] || [ "$collect_rc" = 0 ] || cx_set_failure_stage artifact-collection +final_rc="$run_rc" +[ "$final_rc" != 0 ] || final_rc="$collect_rc" +exit "$final_rc" diff --git a/experimental/CollectiveX/launchers/launch_mi-amds.sh b/experimental/CollectiveX/launchers/launch_mi-amds.sh new file mode 100644 index 000000000..f66f820f5 --- /dev/null +++ b/experimental/CollectiveX/launchers/launch_mi-amds.sh @@ -0,0 +1,160 @@ +#!/usr/bin/env bash +# CollectiveX shared MI325X/MI355X AMD Slurm launcher (one or two nodes). +# shellcheck disable=SC2016,SC2034 +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CX_DIR="$(cd "$HERE/.." && pwd)" +REPO_ROOT="$(cd "$CX_DIR/../.." 
&& pwd)" +# shellcheck source=../runtime/common.sh +source "$HERE/../runtime/common.sh" + +RUNNER="${CX_SHARD_SKU:-${CX_PUBLIC_RUNNER:-}}" +case "$RUNNER" in + mi325x) CPUS_PER_TASK=256; DEVICE_MOUNTS=",/dev/kfd:/dev/kfd,/dev/dri:/dev/dri" ;; + mi355x) CPUS_PER_TASK=128; DEVICE_MOUNTS="" ;; + *) cx_die "set CX_SHARD_SKU or CX_PUBLIC_RUNNER to mi325x or mi355x" ;; +esac +export CX_RUNNER="$RUNNER" CX_BENCH="${CX_BENCH:-mori}" +export CX_IMAGE_PLATFORM=linux/amd64 +JOB_ID="" +cx_install_launcher_fail_safe +cx_set_failure_stage setup +cx_load_operator_config +cx_lock_canonical_gha_env "$RUNNER" + +NODES="${CX_NODES:-1}"; GPN="${CX_GPUS_PER_NODE:-8}" +SCALE_UP_DOMAIN="${CX_SCALE_UP_DOMAIN:-8}" +EXPECTED_WORLD=$((NODES * GPN)) +NGPUS="${CX_NGPUS:-$EXPECTED_WORLD}" +TIME_MIN="${CX_TIME:-60}" +EXCLUDE_NODES="${CX_EXCLUDE_NODES:-}" +NODELIST="${CX_NODELIST:-}" +MOUNT_DIR=/ix +TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)" +[ "$NODES" = 1 ] || [ "$NODES" = 2 ] \ + || cx_die "$RUNNER supports one or two nodes" +[ "$GPN" = 8 ] || cx_die "$RUNNER requires eight GPUs per node" +[ "$SCALE_UP_DOMAIN" = 8 ] || cx_die "$RUNNER requires an eight-GPU scale-up domain" +[ "$NGPUS" = "$EXPECTED_WORLD" ] \ + || cx_die "$RUNNER world size must equal nodes x GPUs per node" +case "$CX_BENCH" in + mori|nccl-ep) ;; + *) cx_die "unsupported AMD EP backend: $CX_BENCH" ;; +esac + +if [ "$RUNNER" = mi325x ]; then + export MORI_DISABLE_AUTO_XGMI="${MORI_DISABLE_AUTO_XGMI:-0}" + export MORI_ENABLE_SDMA="${MORI_ENABLE_SDMA:-1}" + export MORI_APP_LOG_LEVEL="${MORI_APP_LOG_LEVEL:-info}" + export MORI_SHMEM_LOG_LEVEL="${MORI_SHMEM_LOG_LEVEL:-info}" + export MORI_IO_LOG_LEVEL="${MORI_IO_LOG_LEVEL:-info}" + [ "$CX_BENCH" != mori ] \ + || export CX_IMAGE="${CX_IMAGE:-$CX_IMAGE_AMD_MORI_MI325}" +fi +if [ "$CX_BENCH" = mori ]; then + if [ "$NODES" -gt 1 ]; then + export CX_MORI_KERNEL_TYPE=internode-v1 + elif [ "$RUNNER" = mi325x ]; then + export CX_MORI_KERNEL_TYPE="${CX_MORI_KERNEL_TYPE:-asyncll}" + else + export 
CX_MORI_KERNEL_TYPE="${CX_MORI_KERNEL_TYPE:-intranode}" + fi +fi +IMAGE="${CX_IMAGE:-$(cx_default_image "$RUNNER")}" +export CX_RUNNER="$RUNNER" CX_NGPUS="$NGPUS" CX_NODES="$NODES" +export CX_GPUS_PER_NODE="$GPN" CX_SCALE_UP_DOMAIN="$SCALE_UP_DOMAIN" CX_TS="$TS" +export CX_SCALE_UP_TRANSPORT=xgmi +if [ "$NODES" -gt 1 ]; then + export CX_SCOPE=scale-out CX_SCALE_OUT_TRANSPORT=rdma + export CX_TRANSPORT=xgmi-rdma CX_TOPO="${RUNNER}-xgmi-rdma" +else + export CX_SCOPE=scale-up CX_TRANSPORT=xgmi CX_TOPO="${RUNNER}-xgmi" + unset CX_SCALE_OUT_TRANSPORT +fi +export CX_RUN_TIMEOUT="${CX_RUN_TIMEOUT:-1800}" +cx_apply_network_profile "$NODES" "$CX_TRANSPORT" +cx_validate_shard_control "$CX_DIR" +cx_require_vars CX_PARTITION CX_SQUASH_DIR CX_STAGE_DIR +PARTITION="$CX_PARTITION"; SQUASH_DIR="$CX_SQUASH_DIR" + +cx_log "runner=$RUNNER nodes=$NODES x ${GPN}gpu world=$NGPUS bench=$CX_BENCH" +cx_set_failure_stage repository-stage +MOUNT_SRC="$(cx_stage_path "$REPO_ROOT" "$CX_STAGE_DIR")" +cx_stage_repo "$REPO_ROOT" "$MOUNT_SRC" +cx_prepare_runtime_marker "$MOUNT_SRC" +[ "${CX_DRYRUN:-0}" != 1 ] || { cx_log "CX_DRYRUN=1 - not allocating"; exit 0; } +cx_set_failure_stage registry-verification +cx_verify_registry_image "$IMAGE" +cx_set_failure_stage scheduler-allocation +command -v salloc >/dev/null || cx_die "salloc not found on this runner" + +allocation=(--partition="$PARTITION" --nodes="$NODES" --gres=gpu:"$GPN" --exclusive + --time="$TIME_MIN") +if [ "$NODES" = 1 ]; then + allocation+=(--cpus-per-task="$CPUS_PER_TASK") +else + allocation+=(--ntasks-per-node="$GPN" --cpus-per-task="$((CPUS_PER_TASK / GPN))") +fi +if [ -n "$NODELIST" ]; then + cx_log "using configured node pin" + allocation+=(--nodelist="$NODELIST") +elif [ -n "$EXCLUDE_NODES" ]; then + allocation+=(--exclude="$EXCLUDE_NODES") +fi +cx_salloc_jobid "${allocation[@]}" +[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc" +cx_set_failure_stage setup +cx_validate_network_profile_on_job "$JOB_ID" 
"$NODES" "$CX_TRANSPORT" + +cx_set_failure_stage container-import +SQUASH_FILE="$(cx_ensure_squash_on_job \ + "$JOB_ID" "$SQUASH_DIR" "$IMAGE" "${CX_LOCK_DIR:-}")" +cx_set_failure_stage container-hash +import_log="$(cx_private_log_path image-hash)" +if ! COLLECTIVEX_SQUASH_SHA256="$( + srun --jobid="$JOB_ID" --nodes=1 --ntasks=1 --chdir=/tmp \ + --export="$(cx_host_exports)" sha256sum "$SQUASH_FILE" \ + 2>>"$import_log" | awk 'NR==1 {print $1}' +)"; then + cx_fail_stage container-hash "$import_log" +fi +[[ "$COLLECTIVEX_SQUASH_SHA256" =~ ^[0-9a-f]{64}$ ]] \ + || cx_fail_stage container-hash "$import_log" +export COLLECTIVEX_SQUASH_SHA256 +cx_preflight_allocation "$JOB_ID" "$NODES" "$MOUNT_SRC" "$SQUASH_FILE" \ + "${CX_SHARD_FILE:-}" +CONTAINER_MOUNTS="$MOUNT_SRC:$MOUNT_DIR$DEVICE_MOUNTS" + +if [ "$NODES" = 1 ]; then + run_rc=0 + cx_set_failure_stage container-launch + runtime_log="$(cx_private_log_path runtime)" + srun --jobid="$JOB_ID" --chdir=/tmp --container-image="$SQUASH_FILE" \ + --container-mounts="$CONTAINER_MOUNTS" --container-writable --container-remap-root \ + --no-container-mount-home --container-workdir="$MOUNT_DIR/experimental/CollectiveX" \ + --no-container-entrypoint --export="$(cx_container_exports)" \ + bash "$MOUNT_DIR/experimental/CollectiveX/runtime/run_in_container.sh" \ + >"$runtime_log" 2>&1 || run_rc=$? +else + SOURCE_BACKEND_ENV='case "${SLURM_NODEID:-}" in ""|*[!0-9]*) exit 66;; esac; env_file="/ix/experimental/CollectiveX/.cx_backend/env/node-${SLURM_NODEID}.sh"; env_root="${env_file%/*}"; [ -d "$env_root" ] && [ ! -L "$env_root" ] || exit 66; case "$(stat -c "%a" "$env_root")" in 700|[1-7]700) ;; *) exit 66;; esac; [ -f "$env_file" ] && [ -r "$env_file" ] && [ ! -L "$env_file" ] && [ "$(stat -c "%u:%a" "$env_file")" = "$(stat -c "%u" "$env_root"):600" ] || exit 66; . 
"$env_file" || exit 66' + BACKEND_PROBE="$SOURCE_BACKEND_ENV"'; case "$CX_BENCH" in mori) python3 -c "import mori";; nccl-ep) python3 -c "import torch";; esac' + WRAP="${SOURCE_BACKEND_ENV}"$'\n'"$(cx_slurm_rank_wrapper)" + CX_DISTRIBUTED_CONTAINER_ARGS=(--container-writable --container-remap-root) + run_rc=0 + cx_set_failure_stage container-launch + cx_run_distributed_shard || run_rc=$? +fi + +cx_adopt_runtime_stage "$MOUNT_SRC" +if [ "$NODES" = 1 ] && [ "$run_rc" != 0 ]; then + cx_fail_stage "$CX_FAILSAFE_MODE" "$runtime_log" || true +fi +collect_rc=0 +cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" || collect_rc=$? +[ "$run_rc" != 0 ] || [ "$collect_rc" = 0 ] || cx_set_failure_stage artifact-collection +final_rc="$run_rc" +[ "$final_rc" != 0 ] || final_rc="$collect_rc" +rm -f "$MOUNT_SRC"/experimental/CollectiveX/gpucore.* 2>/dev/null || true +cx_log "done - result artifacts collected" +exit "$final_rc" diff --git a/experimental/CollectiveX/launchers/launch_single-slurm.sh b/experimental/CollectiveX/launchers/launch_single-slurm.sh new file mode 100644 index 000000000..eade8fb75 --- /dev/null +++ b/experimental/CollectiveX/launchers/launch_single-slurm.sh @@ -0,0 +1,154 @@ +#!/usr/bin/env bash +# CollectiveX shared standard NVIDIA Slurm launcher (one or two nodes). +# shellcheck disable=SC2016,SC2034 +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CX_DIR="$(cd "$HERE/.." && pwd)" +REPO_ROOT="$(cd "$CX_DIR/../.." 
&& pwd)" +# shellcheck source=../runtime/common.sh +source "$HERE/../runtime/common.sh" + +RUNNER="${CX_SHARD_SKU:-${CX_PUBLIC_RUNNER:-}}" +ALLOC_EXTRA=(); SRUN_EXTRA=(); LOCAL_IMPORT=0 +case "$RUNNER" in + h100-dgxc) PRODUCT=h100; TOPO=h100-nvlink-island; DEFAULT_TIME=45; REQUIRE_ACCOUNT=1 ;; + h200-dgxc) + PRODUCT=h200; TOPO=h200-nvlink-island; DEFAULT_TIME=45; REQUIRE_ACCOUNT=0 + SRUN_EXTRA=(--container-remap-root) + ;; + b200-dgxc) + PRODUCT=b200; TOPO=b200-nvlink-island; DEFAULT_TIME=30; REQUIRE_ACCOUNT=1 + ALLOC_EXTRA=(--mem=0) + ;; + b300) + PRODUCT=b300; TOPO=b300-nvlink-island; DEFAULT_TIME=45; REQUIRE_ACCOUNT=1 + # Do not restore ALLOC_EXTRA=(-N 1 --mem=0); it blocks two-node B300 jobs. + ALLOC_EXTRA=(--mem=0) + SRUN_EXTRA=(--mpi=none --container-remap-root) + LOCAL_IMPORT=1 + ;; + *) cx_die "set CX_SHARD_SKU or CX_PUBLIC_RUNNER to a registered NVIDIA SKU" ;; +esac +export CX_RUNNER="$RUNNER" CX_BENCH="${CX_BENCH:-deepep}" +export CX_IMAGE_PLATFORM=linux/amd64 +JOB_ID="" +cx_install_launcher_fail_safe +cx_set_failure_stage setup +cx_load_operator_config +cx_lock_canonical_gha_env "$RUNNER" + +NODES="${CX_NODES:-1}"; GPN="${CX_GPUS_PER_NODE:-8}" +SCALE_UP_DOMAIN="${CX_SCALE_UP_DOMAIN:-8}" +EXPECTED_WORLD=$((NODES * GPN)) +NGPUS="${CX_NGPUS:-$EXPECTED_WORLD}" +TIME_MIN="${CX_TIME:-$DEFAULT_TIME}" +IMAGE="${CX_IMAGE:-$(cx_default_image "$PRODUCT")}" +TS="$(date -u +%Y-%m-%dT%H-%M-%SZ)" +[ "$NODES" = 1 ] || [ "$NODES" = 2 ] \ + || cx_die "$RUNNER supports one or two nodes" +[ "$GPN" = 8 ] || cx_die "$RUNNER requires eight GPUs per node" +[ "$SCALE_UP_DOMAIN" = 8 ] || cx_die "$RUNNER requires an eight-GPU scale-up domain" +[ "$NGPUS" = "$EXPECTED_WORLD" ] \ + || cx_die "$RUNNER world size must equal nodes x GPUs per node" +case "$CX_BENCH" in + deepep|deepep-v2|deepep-hybrid|uccl|nccl-ep) ;; + *) cx_die "unsupported $RUNNER EP backend: $CX_BENCH" ;; +esac + +export CX_RUNNER="$RUNNER" CX_NGPUS="$NGPUS" CX_NODES="$NODES" +export CX_GPUS_PER_NODE="$GPN" 
CX_SCALE_UP_DOMAIN="$SCALE_UP_DOMAIN" +export CX_TS="$TS" CX_SCALE_UP_TRANSPORT=nvlink +if [ "$NODES" -gt 1 ]; then + export CX_SCOPE=scale-out CX_SCALE_OUT_TRANSPORT=rdma + export CX_TRANSPORT=nvlink-rdma CX_TOPO="${PRODUCT}-nvlink-rdma" +else + export CX_SCOPE=scale-up CX_TRANSPORT=nvlink CX_TOPO="$TOPO" + unset CX_SCALE_OUT_TRANSPORT +fi +export CX_NCCL_HOME="${CX_NCCL_HOME:-/usr}" NCCL_CUMEM_ENABLE=1 +cx_apply_network_profile "$NODES" "$CX_TRANSPORT" +cx_validate_shard_control "$CX_DIR" +cx_require_vars CX_PARTITION CX_SQUASH_DIR +[ "$REQUIRE_ACCOUNT" = 0 ] || cx_require_vars CX_ACCOUNT +[ "$RUNNER" != b300 ] || cx_require_vars CX_STAGE_DIR + +cx_log "runner=$RUNNER nodes=$NODES x ${GPN}gpu world=$NGPUS bench=$CX_BENCH" +[ "${CX_DRYRUN:-0}" != 1 ] || { cx_log "CX_DRYRUN=1 - not allocating"; exit 0; } +cx_set_failure_stage registry-verification +cx_verify_registry_image "$IMAGE" +SQUASH_FILE="" +cx_set_failure_stage repository-stage +MOUNT_SRC="$(cx_stage_path "$REPO_ROOT" "${CX_STAGE_DIR:-}")" +cx_stage_repo "$REPO_ROOT" "$MOUNT_SRC" +cx_prepare_runtime_marker "$MOUNT_SRC" +CONTAINER_MOUNTS="$MOUNT_SRC:/ix" +if [ "$CX_BENCH" = deepep-v2 ] || [ "$CX_BENCH" = deepep-hybrid ]; then + cx_set_failure_stage backend-setup + cx_prepare_backend_source "$MOUNT_SRC" "$CX_BENCH" \ + || cx_die "cannot stage the pinned backend source" + export CX_BACKEND_SOURCE_ROOT=/ix/experimental/CollectiveX/.cx_sources +fi +if [ "$CX_BENCH" = deepep-v2 ]; then + cx_prepare_backend_cache "$CX_SQUASH_DIR" \ + || cx_die "cannot prepare the isolated backend cache" + CONTAINER_MOUNTS="$CONTAINER_MOUNTS,$CX_PREPARED_BACKEND_CACHE:/cx-cache" + export CX_BACKEND_CACHE_ROOT=/cx-cache +fi + +cx_set_failure_stage scheduler-allocation +command -v salloc >/dev/null || cx_die "salloc not found on this runner" +allocation=(--partition="$CX_PARTITION" --nodes="$NODES" --gres=gpu:"$GPN" --exclusive + --time="$TIME_MIN" "${ALLOC_EXTRA[@]}") +[ "$NODES" = 1 ] || allocation+=(--ntasks-per-node="$GPN") +[ -z 
"${CX_ACCOUNT:-}" ] || allocation+=(--account="$CX_ACCOUNT") +[ -z "${CX_EXCLUDE_NODES:-}" ] || allocation+=(--exclude="$CX_EXCLUDE_NODES") +cx_salloc_jobid "${allocation[@]}" +[ -n "$JOB_ID" ] || cx_die "could not resolve allocated JOB_ID from salloc" +cx_set_failure_stage setup +cx_validate_network_profile_on_job "$JOB_ID" "$NODES" "$CX_TRANSPORT" +if [ "$LOCAL_IMPORT" = 1 ]; then + cx_set_failure_stage container-import + SQUASH_FILE="$(CX_ENROOT_LOCAL_IMPORT=1 cx_ensure_squash "$CX_SQUASH_DIR" "$IMAGE")" + cx_set_failure_stage container-hash + cx_export_squash_identity "$SQUASH_FILE" +else + cx_set_failure_stage container-import + SQUASH_FILE="$(cx_ensure_squash_on_job "$JOB_ID" "$CX_SQUASH_DIR" "$IMAGE")" + cx_set_failure_stage container-hash + cx_export_squash_identity "$SQUASH_FILE" +fi +cx_preflight_allocation "$JOB_ID" "$NODES" "$MOUNT_SRC" "$SQUASH_FILE" \ + "${CX_SHARD_FILE:-}" + +if [ "$NODES" = 1 ]; then + run_rc=0 + cx_set_failure_stage container-launch + runtime_log="$(cx_private_log_path runtime)" + srun --jobid="$JOB_ID" --container-image="$SQUASH_FILE" \ + --container-mounts="$CONTAINER_MOUNTS" --no-container-mount-home \ + --container-workdir=/ix/experimental/CollectiveX --no-container-entrypoint \ + "${SRUN_EXTRA[@]}" --export="$(cx_container_exports)" \ + bash /ix/experimental/CollectiveX/runtime/run_in_container.sh \ + >"$runtime_log" 2>&1 || run_rc=$? +else + SOURCE_BACKEND_ENV='case "${SLURM_NODEID:-}" in ""|*[!0-9]*) exit 66;; esac; env_file="/ix/experimental/CollectiveX/.cx_backend/env/node-${SLURM_NODEID}.sh"; env_root="${env_file%/*}"; [ -d "$env_root" ] && [ ! -L "$env_root" ] || exit 66; case "$(stat -c "%a" "$env_root")" in 700|[1-7]700) ;; *) exit 66;; esac; [ -f "$env_file" ] && [ -r "$env_file" ] && [ ! -L "$env_file" ] && [ "$(stat -c "%u:%a" "$env_file")" = "$(stat -c "%u" "$env_root"):600" ] || exit 66; . 
"$env_file" || exit 66' + BACKEND_PROBE="$SOURCE_BACKEND_ENV"'; case "$CX_BENCH" in deepep) python3 -c "from deep_ep import Buffer";; deepep-v2) python3 -c "import deep_ep; assert hasattr(deep_ep, '\''ElasticBuffer'\'')";; deepep-hybrid) python3 -c "import deep_ep; assert hasattr(deep_ep, '\''HybridEPBuffer'\'')";; uccl) python3 -c "import torch; from uccl_deepep import Buffer";; nccl-ep) python3 -c "import torch";; esac' + WRAP="${SOURCE_BACKEND_ENV}"$'\n'"$(cx_slurm_rank_wrapper)" + CX_DISTRIBUTED_CONTAINER_ARGS=(--container-writable "${SRUN_EXTRA[@]}") + run_rc=0 + cx_set_failure_stage container-launch + cx_run_distributed_shard || run_rc=$? +fi + +cx_adopt_runtime_stage "$MOUNT_SRC" +if [ "$NODES" = 1 ] && [ "$run_rc" != 0 ]; then + cx_fail_stage "$CX_FAILSAFE_MODE" "$runtime_log" || true +fi +collect_rc=0 +cx_collect_results "$MOUNT_SRC" "$REPO_ROOT" || collect_rc=$? +[ "$run_rc" != 0 ] || [ "$collect_rc" = 0 ] || cx_set_failure_stage artifact-collection +final_rc="$run_rc" +[ "$final_rc" != 0 ] || final_rc="$collect_rc" +cx_log "done - result artifacts collected" +exit "$final_rc" diff --git a/experimental/CollectiveX/publisher.py b/experimental/CollectiveX/publisher.py new file mode 100644 index 000000000..e64f52c12 --- /dev/null +++ b/experimental/CollectiveX/publisher.py @@ -0,0 +1,4260 @@ +#!/usr/bin/env python3 +"""Fail-closed filesystem publisher for CollectiveX EP v1 artifacts.""" +from __future__ import annotations + +import argparse +import contextlib +import datetime as dt +import fcntl +from functools import lru_cache +import hashlib +import json +import math +import os +from pathlib import Path, PurePosixPath +import re +import shutil +import stat +import statistics +import sys +import tempfile +from typing import Any, Iterator, Sequence +import zipfile + +import jsonschema +import numpy as np + +HERE = Path(__file__).resolve().parent +sys.path.insert(0, str(HERE)) + +import artifact_safety # noqa: E402 +import capability # noqa: E402 +import 
contracts # noqa: E402 +import identity # noqa: E402 +import sweep_matrix # noqa: E402 + +FORMAT_BUNDLE = "collectivex.private.bundle.v1" +FORMAT_PUBLIC = "collectivex.public.v1" +FORMAT_CHANNEL = "collectivex.channel.v1" +POLICY = "collectivex-decision-grade-v1" +PUBLISHER_POLICY = "collectivex-publisher-v1" +OUTCOMES = ("success", "unsupported", "failed", "invalid", "diagnostic") +REQUIRED_ALLOCATIONS = 3 +REQUIRED_COHORT_KINDS = ("library", "chip", "system", "routing") +PRECISION_COHORT_KINDS = ( + "dispatch-precision", "combine-precision", "precision-pair", +) +REQUIRED_PROMOTION_COHORT_COUNTS = {"library": 76, "system": 12, "routing": 116} +CANONICAL_FULL_V1_MATRIX_SHA256 = ( + "f1ca85f9689922b90edd5767b9ff2a902f6b896f32f68b2ca086dde3fd2157d0" +) +CANONICAL_FULL_V1_CASE_CATALOG_SHA256 = ( + "8e262178f770b0cdde12b7ec71604afd87251fa55685d4594f29717153ad6bbd" +) +P50_STABILITY_LIMIT = 1.10 +P99_STABILITY_LIMIT = 1.25 +TRIAL_DRIFT_RATIO_LIMIT = 1.10 +TRIAL_OUTLIER_FRACTION_LIMIT = 0.05 +TRIAL_OUTLIER_MAD_MULTIPLIER = 6.0 +BOOTSTRAP_RESAMPLES = 10_000 +BOOTSTRAP_CONFIDENCE = 0.95 +BOOTSTRAP_EQUIVALENCE_BAND = 0.05 +BOOTSTRAP_POLICY = "hierarchical-run-trial-p99-ratio-v1" +BOOTSTRAP_CHUNK_SIZE = 250 +MAX_ARCHIVE_MEMBERS = 20_000 +MAX_ARCHIVE_MEMBER_BYTES = 2 * 1024**3 +MAX_ARCHIVE_TOTAL_BYTES = 16 * 1024**3 +MAX_PUBLIC_DATASET_BYTES = 32 * 1024**2 +HEX64 = re.compile(r"[0-9a-f]{64}") +SAFE_ID = re.compile(r"[a-z0-9][a-z0-9_.-]{0,127}") +REASON = re.compile(r"[a-z0-9][a-z0-9.-]{0,95}") +ARTIFACT_NAME = re.compile( + r"cx(?:unsupported|shard-[a-z0-9][a-z0-9_.-]{0,127})-[1-9][0-9]*-[1-9][0-9]*" +) +COVERAGE_TOPOLOGY_FIELDS = ( + "ep_size", "nodes", "gpus_per_node", "scale_up_domain", "scope", + "scale_up_transport", "scale_out_transport", "transport", "topology_class", +) +CHANNEL_PATH = re.compile(r"datasets/([0-9a-f]{64})/dataset\.json") +SCHEMA_DIR = HERE / "schemas" +_SCHEMAS: dict[str, jsonschema.protocols.Validator] = {} +_BOOTSTRAP_CACHE: dict[tuple[Any, ...], 
_BOOTSTRAP_CACHE: dict[tuple[Any, ...], dict[str, Any]] = {}


class PublisherError(ValueError):
    """Raised when input or stored state breaks the publication contract."""


# Module-level shorthands for the strict loader and canonical serializer.
strict_load = contracts.strict_load
_canonical = contracts.canonical_json_bytes


def _sha_bytes(data: bytes) -> str:
    """Return the hexadecimal SHA-256 digest of ``data``."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()


def _sha_file(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is consumed in 1 MiB blocks, so it is never held in memory whole.
    """
    block_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(block_size)
            if block == b"":
                break
            hasher.update(block)
    return hasher.hexdigest()


def _latest_timestamp(values: Sequence[str]) -> str:
    """Pick the newest timestamp string out of ``values``.

    Candidates are ordered by their instant converted to UTC, with the raw
    text as the tie-breaker; the winning string is returned unmodified, so no
    publisher wall-clock time is ever introduced.

    Raises:
        PublisherError: if ``values`` is empty, an entry is not ISO-8601, or
            an entry carries no timezone.
    """
    if not values:
        raise PublisherError("cannot derive a timestamp without evidence")

    def ordering(text: str) -> tuple[dt.datetime, str]:
        # Every "Z" is rewritten to an explicit "+00:00" offset before parsing.
        try:
            moment = dt.datetime.fromisoformat(text.replace("Z", "+00:00"))
        except ValueError as exc:
            raise PublisherError("evidence timestamp is not ISO-8601") from exc
        if moment.tzinfo is None:
            raise PublisherError("evidence timestamp must include a timezone")
        return moment.astimezone(dt.timezone.utc), text

    return max(values, key=ordering)


def _schema(name: str, value: Any) -> None:
    """Validate ``value`` against the JSON schema file ``name``.

    Validators are built once per schema name and memoized in ``_SCHEMAS``.
    Only the first error (ordered by its absolute path) is reported.

    Raises:
        PublisherError: formatted as ``<name>:<dotted path or $>: <message>``.
    """
    if _SCHEMAS.get(name) is None:
        document = strict_load(SCHEMA_DIR / name)
        jsonschema.Draft202012Validator.check_schema(document)
        _SCHEMAS[name] = jsonschema.Draft202012Validator(
            document, format_checker=jsonschema.FormatChecker()
        )
    validator = _SCHEMAS[name]
    failures = list(validator.iter_errors(value))
    if not failures:
        return
    failures.sort(key=lambda failure: list(failure.absolute_path))
    first = failures[0]
    where = ".".join(str(part) for part in first.absolute_path) or "$"
    raise PublisherError(f"{name}:{where}: {first.message}")


def _exact(obj: Any, fields: set[str], path: str) -> dict[str, Any]:
    """Return ``obj`` if it is a dict whose key set equals ``fields`` exactly.

    Raises:
        PublisherError: if ``obj`` is not a dict, or its keys differ; the
            message lists the missing and the extra keys, each sorted.
    """
    if not isinstance(obj, dict):
        raise PublisherError(f"{path} must be an object")
    present = set(obj)
    if present == fields:
        return obj
    missing = sorted(fields - present)
    extra = sorted(present - fields)
    raise PublisherError(
        f"{path} fields differ: missing={missing}, "
        f"extra={extra}"
    )
def _array(value: Any, path: str, *, nonempty: bool = False) -> list[Any]:
    """Return ``value`` unchanged when it is a list (non-empty if required).

    Raises:
        PublisherError: if ``value`` is not a list, or is empty while
            ``nonempty`` is set.
    """
    if isinstance(value, list) and not (nonempty and not value):
        return value
    qualifier = "a nonempty" if nonempty else "an"
    raise PublisherError(f"{path} must be {qualifier} array")


def _integer(value: Any, path: str, *, minimum: int = 0) -> int:
    """Return ``value`` unchanged when it is an ``int`` that is not below ``minimum``.

    The check is on the exact type, so ``bool`` values are rejected.

    Raises:
        PublisherError: if the type or the lower bound check fails.
    """
    if type(value) is int and not value < minimum:
        return value
    raise PublisherError(f"{path} must be an integer >= {minimum}")


def _unique(values: Sequence[Any], path: str) -> None:
    """Require that no two entries share the same canonical serialization.

    Raises:
        PublisherError: if any two entries serialize identically.
    """
    encoded = [_canonical(entry) for entry in values]
    distinct = set(encoded)
    if len(distinct) == len(encoded):
        return
    raise PublisherError(f"{path} contains duplicates")


def _eligibility(value: dict[str, Any], path: str) -> dict[str, Any]:
    """Check that an eligibility record is consistent with the promotion gates.

    ``decision_grade`` must be true exactly when every gate passes and
    ``reasons`` is empty, and ``reasons`` must be non-empty exactly when the
    record is not decision grade. Returns ``value`` unchanged on success.

    Raises:
        PublisherError: if either consistency rule is violated.
    """
    allocation_ids = value["allocation_ids"]
    ratio_p50 = value["p50_max_min_ratio"]
    ratio_p99 = value["p99_max_min_ratio"]
    checks = [len(allocation_ids) >= REQUIRED_ALLOCATIONS]
    for flag in (
        "complete", "correct", "measured_roundtrip_p99",
        "stable_p50", "stable_p99", "stable_ordering",
    ):
        checks.append(value[flag])
    # A missing (None) ratio fails its stability gate.
    checks.append(ratio_p50 is not None and ratio_p50 <= P50_STABILITY_LIMIT)
    checks.append(ratio_p99 is not None and ratio_p99 <= P99_STABILITY_LIMIT)
    declared = value["decision_grade"]
    earned = all(checks) and not value["reasons"]
    if declared != earned:
        raise PublisherError(f"{path}.decision_grade does not match promotion gates")
    if declared == bool(value["reasons"]):
        raise PublisherError(f"{path}.reasons does not match decision status")
    return value


def validate_channel(doc: Any, *, expected_channel: str | None = None) -> dict[str, Any]:
    """Validate a channel document and return it unchanged.

    The document is checked against ``channel-v1.schema.json``; when
    ``expected_channel`` is given (and truthy) the channel name must match it;
    the dataset path must match ``CHANNEL_PATH`` and embed the same digest as
    the ``sha256`` field.

    Raises:
        PublisherError: on a schema failure, a channel-name mismatch, or a
            dataset path that disagrees with its sha256.
    """
    _schema("channel-v1.schema.json", doc)
    if expected_channel:
        if doc["channel"] != expected_channel:
            raise PublisherError("channel name does not match its file")
    dataset_ref = doc["dataset"]
    location = dataset_ref["path"]
    found = CHANNEL_PATH.fullmatch(location) if isinstance(location, str) else None
    if found is None or found.group(1) != dataset_ref["sha256"]:
        raise PublisherError("channel dataset path and sha256 do not agree")
    return doc
PublisherError("channel dataset path and sha256 do not agree") + return doc + + +def _metric_value(series: dict[str, Any], metric: dict[str, Any]) -> tuple[str, float, str]: + point = next( + (point for point in series["points"] if point["tokens_per_rank"] == metric["tokens_per_rank"]), + None, + ) + if point is None or series["phase"] != metric["phase"]: + raise PublisherError("decision metric references an unavailable point") + component = point["components"]["roundtrip"] + if metric["measure"] == "latency_us": + value = component["latency_us"][metric["statistic"]] + unit = "us" + else: + rates = component[metric["measure"]] + if rates is None: + raise PublisherError("data-rate decision has no byte accounting contract") + value = rates[metric["statistic"]] + unit = "GB/s" + return point["point_id"], value, unit + + +def _validate_metric(metric: dict[str, Any]) -> None: + expected = "min" if metric["measure"] == "latency_us" else "max" + if metric["objective"] != expected: + raise PublisherError(f"{metric['measure']} objective must be {expected}") + + +def _metric_label(measure: str, statistic: str) -> str: + if measure == "latency_us": + return f"{statistic} latency" + label = ( + "activation data rate" + if measure == "activation_data_rate_gbps_at_latency_percentile" + else "total logical data rate" + ) + return f"{label} at {statistic} latency" + + +def _routing_build_control(build: dict[str, Any]) -> dict[str, Any]: + return { + key: build[key] + for key in ( + "routing_control_sha256", "image_digest", "source_sha", "squash_sha256", + ) + } + + +def _routing_implementation_mismatch(members: Sequence[dict[str, Any]]) -> bool: + off_eplb_hashes = { + member["build"]["implementation_contract_sha256"] + for member in members if not member["workload"]["eplb"] + } + return len(off_eplb_hashes) > 1 + + +def _public_case_factors(series: dict[str, Any]) -> dict[str, Any]: + workload = series["workload"] + system = series["system"] + measurement = series["measurement"] 
+ ep_size = system["ep_size"] + case = { + "backend": series["backend"]["id"], + "canonical": True, + "eplb": workload["eplb"], + "ep": ep_size, + "experts": workload["experts"], + "gpus_per_node": system["gpus_per_node"], + "hidden": workload["hidden"], + "ladder": " ".join(str(point["tokens_per_rank"]) for point in series["points"]), + "mode": series["mode"], + "nodes": system["nodes"], + "phase": series["phase"], + "required_publication": series["publication_tier"], + "routing": workload["routing"], + "samples_per_point": measurement["samples_per_component"], + "scale_out_transport": system["scale_out_transport"], + "scale_up_domain": system["scale_up_domain"], + "scale_up_transport": system["scale_up_transport"], + "scope": system["scope"], + "suite": series["suite"], + "timing": ( + f"{measurement['iters']}:{measurement['trials']}:" + f"{measurement['warmups']}" + ), + "topk": workload["top_k"], + "topology_class": system["topology_class"], + "transport": system["transport"], + "warmup_semantics": sweep_matrix.ep_harness.WARMUP_SEMANTICS, + "workload": series["model"], + } + if workload["precision_profile"] != identity.V1_CONTROL_PRECISION_PROFILE: + case["precision_profile"] = workload["precision_profile"] + return { + "case": case, + "profile": identity.profile_for_case(case), + "sku": system["sku"], + } + + +def _coverage_topology(case: dict[str, Any]) -> dict[str, Any]: + """Project exact fabric placement without exposing private runner details.""" + return { + "ep_size": case.get("ep_size", case.get("ep")), + **{field: case[field] for field in COVERAGE_TOPOLOGY_FIELDS if field != "ep_size"}, + } + + +def _coverage_coordinates(case: dict[str, Any]) -> dict[str, Any]: + return { + "sku": case["sku"], "backend": case["backend"], + "mode": case["mode"], "phase": case["phase"], + "topology": _coverage_topology(case), + } + + +@lru_cache(maxsize=1) +def _canonical_coverage_cases() -> dict[str, dict[str, Any]]: + matrix = 
sweep_matrix.resolve_matrix(suites="all", max_cases=128, backends="all") + return { + item["case"]["case_id"]: { + "sku": item["sku"], + **item["case"], + "disposition": item["disposition"], + "reason": item["reason"], + } + for item in matrix["requested_cases"] + } + + +def _public_series_config(series: dict[str, Any]) -> dict[str, Any]: + return { + "backend": { + "generation": series["backend"]["generation"], + "version": series["backend"]["version"], + }, + "resource": series["resource"], + "system": {"label": series["system"]["label"]}, + } + + +def _public_cohort_factors(kind: str, item: dict[str, Any]) -> tuple[Any, Any]: + workload = item["workload"] + build = item["build"] + shape = { + key: workload[key] + for key in ( + "hidden", "top_k", "experts", "precision_profile", "dispatch_precision", + "combine_precision", "activation_profile", + ) + } + common = { + "model": item["model"], "mode": item["mode"], "phase": item["phase"], + "shape": shape, "measurement": item["measurement"], + "ep_size": item["system"]["ep_size"], + } + if kind == "library": + return ( + {**common, "system": item["system"], "workload": workload, + "resource_mode": item["resource"]["mode"], "source": build["source_sha"]}, + item["backend"]["id"], + ) + if kind == "chip": + return ( + {**common, "backend": item["backend"], "workload": workload, + "resource_mode": item["resource"]["mode"], "source": build["source_sha"]}, + item["system"], + ) + if kind == "system": + return {**common, "workload": workload, "source": build["source_sha"]}, [ + item["system"]["sku"], item["backend"]["id"], item["resource"]["profile"] + ] + if kind == "routing": + return ( + {**common, "backend": item["backend"], "system": item["system"], + "resource": item["resource"], "build": _routing_build_control(build)}, + [workload["routing"], workload["eplb"], + build["implementation_contract_sha256"]], + ) + if kind in PRECISION_COHORT_KINDS: + static_shape = { + key: workload[key] + for key in ("hidden", "top_k", 
"experts", "activation_profile") + } + control = { + "backend": item["backend"], + "build": { + key: build[key] + for key in ( + "image_digest", "runtime_fingerprint_sha256", "source_sha", + "squash_sha256", + ) + }, + "measurement": item["measurement"], + "mode": item["mode"], + "model": item["model"], + "phase": item["phase"], + "resource": item["resource"], + "shape": static_shape, + "system": item["system"], + "workload": { + "eplb": workload["eplb"], + "routing": workload["routing"], + }, + } + if kind == "dispatch-precision": + control["combine_precision"] = workload["combine_precision"] + variant = workload["dispatch_precision"] + elif kind == "combine-precision": + control["dispatch_precision"] = workload["dispatch_precision"] + variant = workload["combine_precision"] + else: + control.pop("resource") + variant = { + "combine_precision": workload["combine_precision"], + "dispatch_precision": workload["dispatch_precision"], + "precision_profile": workload["precision_profile"], + "resource": item["resource"], + } + return control, variant + raise PublisherError(f"unknown cohort kind {kind}") + + +def _case_disposition_catalog_sha256(coverage: Sequence[dict[str, Any]]) -> str: + catalog = [ + {"case_id": item["case_id"], "disposition": item["disposition"]} + for item in sorted(coverage, key=lambda item: item["case_id"]) + ] + return _sha_bytes(_canonical(catalog)) + + +def validate_public_dataset(doc: Any) -> dict[str, Any]: + _schema("public-dataset-v1.schema.json", doc) + if len(_canonical(doc)) + 1 > MAX_PUBLIC_DATASET_BYTES: + raise PublisherError("public dataset exceeds the serving size limit") + try: + artifact_safety.assert_publication_safe([doc]) + except artifact_safety.ArtifactSafetyError as exc: + raise PublisherError(str(exc)) from exc + if doc["source_bundle_ids"] != sorted(doc["source_bundle_ids"]): + raise PublisherError("source bundle IDs are not canonical") + for field, key in ( + ("coverage", "case_id"), ("attempts", "attempt_id"), + 
("series", "series_id"), ("cohorts", "cohort_id"), + ("rankings", "ranking_id"), ("recommendations", "recommendation_id"), + ("sensitivities", "sensitivity_id"), + ): + if doc[field] != sorted(doc[field], key=lambda item: item[key]): + raise PublisherError(f"{field} are not in canonical identity order") + promotion = doc["promotion"] + quarantined = promotion["status"] == "quarantined" + if quarantined != (promotion["reason"] is not None) or quarantined != ( + promotion["matrix_id"] is None + ): + raise PublisherError("promotion reason/matrix identity differs from status") + attempts = {item["attempt_id"]: item for item in doc["attempts"]} + if len(attempts) != len(doc["attempts"]): + raise PublisherError("dataset has duplicate attempt IDs") + evidence = [ + value["evidence_id"] for item in doc["attempts"] for value in item["evidence"] + ] + _unique(evidence, "dataset attempt evidence") + series = {item["series_id"]: item for item in doc["series"]} + if len(series) != len(doc["series"]): + raise PublisherError("dataset has duplicate series IDs") + allocation_ids = set(promotion["allocation_ids"]) + case_ids = {item["case_id"] for item in doc["coverage"]} + if len(case_ids) != len(doc["coverage"]): + raise PublisherError("dataset has duplicate case coverage") + coverage_by_case = {item["case_id"]: item for item in doc["coverage"]} + series_case_ids = { + case_id for item in doc["series"] for case_id in item["case_ids"] + } + canonical_cases = _canonical_coverage_cases() + for item in doc["coverage"]: + topology = item["topology"] + registered = capability.topology_for(item["sku"], topology["ep_size"]) + if ( + item["sku"] not in capability.PLATFORMS + or item["backend"] not in capability.BACKENDS + or registered is None + or any( + topology[field] != registered[field] + for field in COVERAGE_TOPOLOGY_FIELDS if field != "ep_size" + ) + ): + raise PublisherError("coverage topology differs from the capability registry") + canonical = 
canonical_cases.get(item["case_id"]) + if canonical is not None: + precision_profile = canonical.get( + "precision_profile", identity.V1_CONTROL_PRECISION_PROFILE + ) + precision = identity.precision_profile(precision_profile) + expected_projection = { + "sku": canonical["sku"], + "suite": canonical["suite"], + "workload": canonical["workload"], + "publication_tier": canonical["required_publication"], + "backend": canonical["backend"], + "mode": canonical["mode"], + "phase": canonical["phase"], + "routing": canonical["routing"], + "eplb": canonical["eplb"], + "precision_profile": precision_profile, + "dispatch_precision": precision["dispatch"], + "combine_precision": precision["combine"], + "topology": _coverage_topology(canonical), + "disposition": canonical["disposition"], + } + if any(item[field] != value for field, value in expected_projection.items()): + raise PublisherError("coverage dimensions differ from its case identity") + expected_tokens = [int(value) for value in canonical["ladder"].split()] + if [point["tokens_per_rank"] for point in item["points"]] != expected_tokens: + raise PublisherError("coverage points differ from the requested token ladder") + if canonical is None and item["case_id"] not in series_case_ids: + raise PublisherError("coverage case identity is outside the v1 catalog") + for point in item["points"]: + if point["global_tokens"] != point["tokens_per_rank"] * topology["ep_size"]: + raise PublisherError("coverage point global token count differs") + if (point["terminal_status"] == "measured") != (point["reason"] is None): + raise PublisherError("coverage point terminal reason differs from status") + for item in doc["attempts"]: + if item["case_id"] not in case_ids or item["allocation_id"] not in allocation_ids: + raise PublisherError("attempt references undeclared coverage or allocation") + if item["series_id"] is not None and item["series_id"] not in series: + raise PublisherError("attempt references unknown series") + if 
(item["outcome"] == "success") != (item["reason"] is None): + raise PublisherError("attempt reason must be null exactly for success") + if item["outcome"] == "success" and item["failure_mode"] is not None: + raise PublisherError("successful attempt cannot have a failure mode") + if (item["outcome"] == "success" and item["selected"]) != ( + item["series_id"] is not None + ): + raise PublisherError("attempt series must be present exactly for selected success") + if {item["allocation_id"] for item in doc["attempts"]} != allocation_ids: + raise PublisherError("promotion allocation catalog differs from attempts") + attempt_groups: dict[tuple[str, str], list[dict[str, Any]]] = {} + for item in doc["attempts"]: + attempt_groups.setdefault((item["case_id"], item["allocation_id"]), []).append(item) + for (case_id, allocation_id), group in attempt_groups.items(): + ordinals = sorted(item["attempt_index"] for item in group) + if ordinals != list(range(1, len(group) + 1)): + raise PublisherError("public retries must retain contiguous attempt indexes") + if any( + item["attempt_id"] != identity.attempt_id( + allocation=allocation_id, case=case_id, ordinal=item["attempt_index"] + ) + for item in group + ): + raise PublisherError("public retry identity differs from its case/allocation/index") + selected = [item for item in group if item["selected"]] + if len(selected) != 1 or selected[0]["attempt_index"] != ordinals[-1]: + raise PublisherError("publisher must select the latest retry per case/allocation") + selected_by_series: dict[str, list[dict[str, Any]]] = {} + for item in doc["attempts"]: + if item["selected"] and item["outcome"] == "success": + selected_by_series.setdefault(item["series_id"], []).append(item) + terminal = 0 + for item in doc["coverage"]: + listed = set(item["attempt_ids"]) + selected = item["selected_attempt_id"] + expected_attempts = { + attempt_id for attempt_id, attempt in attempts.items() + if attempt["case_id"] == item["case_id"] + } + if listed != 
expected_attempts: + raise PublisherError("coverage references attempts from another case") + if selected is not None: + terminal += 1 + if (selected not in listed or not attempts[selected]["selected"] + or any(attempts[selected][field] != item[field] + for field in ("outcome", "failure_mode", "reason"))): + raise PublisherError("coverage selected outcome differs") + selected_candidates = [attempts[value] for value in listed if attempts[value]["selected"]] + latest = max( + selected_candidates, + key=lambda attempt: ( + int(attempt["run_id"]), attempt["run_attempt"], + attempt["attempt_index"], attempt["attempt_id"] + ), + ) + if selected != latest["attempt_id"]: + raise PublisherError("coverage does not select the latest canonical allocation") + expected_status = ( + "measured" if attempts[selected]["outcome"] == "success" + else attempts[selected]["outcome"] + ) + if any(point["terminal_status"] != expected_status for point in item["points"]): + raise PublisherError("coverage point status differs from selected attempt") + if expected_status == "measured": + selected_series = series.get(attempts[selected]["series_id"]) + if selected_series is None: + raise PublisherError("measured coverage points lack a public series") + public_points = { + point["tokens_per_rank"]: point for point in selected_series["points"] + } + if any( + point["series_id"] != selected_series["series_id"] + or point["point_id"] + != public_points.get(point["tokens_per_rank"], {}).get("point_id") + for point in item["points"] + ): + raise PublisherError("coverage point identities differ from series") + measured_cases = sum( + all(point["terminal_status"] == "measured" for point in item["points"]) + for item in doc["coverage"] + ) + unsupported_cases = sum( + all(point["terminal_status"] == "unsupported" for point in item["points"]) + for item in doc["coverage"] + ) + requested_points = sum(len(item["points"]) for item in doc["coverage"]) + measured_points = sum( + point["terminal_status"] == 
"measured" + for item in doc["coverage"] for point in item["points"] + ) + unsupported_points = sum( + point["terminal_status"] == "unsupported" + for item in doc["coverage"] for point in item["points"] + ) + expected_counts = { + "requested_cases": len(doc["coverage"]), + "terminal_cases": terminal, + "measured_cases": measured_cases, + "unsupported_cases": unsupported_cases, + "requested_points": requested_points, + "terminal_points": requested_points, + "measured_points": measured_points, + "unsupported_points": unsupported_points, + } + if any(promotion[field] != value for field, value in expected_counts.items()): + raise PublisherError("promotion coverage counts differ") + selected_evidence: dict[tuple[str, str], set[str]] = {} + for attempt in doc["attempts"]: + if attempt["selected"] and attempt["series_id"] is not None: + for value in attempt["evidence"]: + selected_evidence.setdefault( + (attempt["series_id"], value["point_id"]), set() + ).add(value["evidence_id"]) + for item in doc["series"]: + eligibility = _eligibility(item["eligibility"], f"series {item['series_id']}") + workload = item["workload"] + model, hidden, top_k, experts = sweep_matrix.V1_WORKLOAD + suite_contract = sweep_matrix.V1_SUITE_CONTRACTS.get(item["suite"]) + coordinate = ( + item["mode"], item["phase"], workload["routing"], workload["eplb"] + ) + profile_case = {"mode": item["mode"]} + if workload["precision_profile"] != identity.V1_CONTROL_PRECISION_PROFILE: + profile_case["precision_profile"] = workload["precision_profile"] + profile = identity.profile_for_case(profile_case) + communication_precision = identity.precision_profile(workload["precision_profile"]) + if ( + item["model"] != model + or (workload["hidden"], workload["top_k"], workload["experts"]) + != (hidden, top_k, experts) + or suite_contract is None + or coordinate not in suite_contract["coordinates"] + or ( + suite_contract.get("backends") is not None + and item["backend"]["id"] not in suite_contract["backends"] + ) + 
or item["publication_tier"] != suite_contract["publication"] + or item["measurement"]["contract"] != profile["contract"] + or item["measurement"]["component_order_contract"] + != profile["component_order_contract"] + or item["measurement"]["combine_semantics"] != profile["combine_semantics"] + or item["measurement"]["payload_unit"] != profile["payload_unit"] + or workload["dispatch_precision"] != communication_precision["dispatch"] + or workload["combine_precision"] != communication_precision["combine"] + or item["measurement"]["qualification_indices"] + != sorted(item["measurement"]["qualification_indices"]) + or len(set(item["measurement"]["qualification_indices"])) + != len(item["measurement"]["qualification_indices"]) + ): + raise PublisherError("series differs from the frozen v1 workload/suite profile") + backend_id = item["backend"]["id"] + expected_role = "reference" if backend_id == "nccl-ep" else "library" + if ( + backend_id not in capability.BACKENDS + or item["backend"]["label"] != BACKEND_LABELS[backend_id] + or item["backend"]["role"] != expected_role + or item["backend"]["version"] is None + ): + raise PublisherError("series backend projection differs from v1") + sku = item["system"]["sku"] + platform = capability.PLATFORMS.get(sku) + ep_size = item["system"]["ep_size"] + registered_topology = capability.topology_for(sku, ep_size) + if platform is None or registered_topology is None: + raise PublisherError("series system projection differs from v1") + disposition, _ = capability.resolve_disposition( + sku, backend_id, ep=ep_size, nodes=item["system"]["nodes"], + routing=workload["routing"], eplb=workload["eplb"], + mode=item["mode"], + precision_profile=( + workload["precision_profile"] + if workload["precision_profile"] != identity.V1_CONTROL_PRECISION_PROFILE + else None + ), + ) + if ( + disposition != "supported" + or item["system"]["vendor"] != platform["vendor"] + or any( + item["system"][field] != registered_topology[field] + for field in ( + 
"nodes", "gpus_per_node", "scale_up_domain", "scope", + "scale_up_transport", "scale_out_transport", "transport", + "topology_class", + ) + ) + or item["system"]["world_size"] != ep_size + or platform["product"] not in set( + re.findall(r"[a-z]+\d+[a-z]*", item["system"]["label"].lower()) + ) + ): + raise PublisherError("series system projection differs from v1") + if contracts.public_series_config_sha256(_public_series_config(item)) != item[ + "build" + ]["public_config_sha256"]: + raise PublisherError("public series configuration differs from its commitment") + covered = [coverage_by_case.get(case_id) for case_id in item["case_ids"]] + if not covered or any( + case is None + or { + "sku": case["sku"], "backend": case["backend"], + "mode": case["mode"], "phase": case["phase"], + "topology": case["topology"], + } + != { + "sku": sku, "backend": backend_id, + "mode": item["mode"], "phase": item["phase"], + "topology": _coverage_topology(item["system"]), + } + for case in covered + ): + raise PublisherError("series projection differs from its case coverage") + if ( + item["eplb"]["enabled"] != item["workload"]["eplb"] + or item["eplb"]["logical_experts"] != item["workload"]["experts"] + ): + raise PublisherError("series EPLB descriptor differs from its workload") + eplb = item["eplb"] + expected_physical = eplb["logical_experts"] + eplb["redundant_experts"] + nullable_eplb = ( + "planner", "mapping_sha256", "reference_tokens_per_rank", "max_replicas", + "imbalance_before", "imbalance_after", "calibration_workload_id", + "calibration_trace_sha256", "calibration_window", "calibration_token_offset", + ) + if eplb["enabled"]: + if ( + item["workload"]["routing"] != "zipf" + or any(eplb[field] is None for field in nullable_eplb) + or eplb["planner"] != "greedy-rank-major-v1" + or eplb["reference_tokens_per_rank"] != 2048 + or eplb["redundant_experts"] != 32 + or eplb["redundant_experts"] % ep_size != 0 + or eplb["physical_experts"] != expected_physical + or 
eplb["logical_experts"] % ep_size != 0 + or eplb["physical_experts"] % ep_size != 0 + or not 1 <= eplb["replicated_experts"] <= min( + eplb["logical_experts"], eplb["redundant_experts"] + ) + or not 2 <= eplb["max_replicas"] <= 1 + eplb["redundant_experts"] + or not 1 <= eplb["imbalance_after"] <= eplb["imbalance_before"] <= ep_size + ): + raise PublisherError("enabled EPLB descriptor is incomplete") + expected_plan, calibration = contracts._expected_eplb_calibration( + workload["routing"], workload["hidden"], workload["top_k"], + eplb["logical_experts"], eplb["physical_experts"], ep_size, + identity.V1_CASE_PROFILE["seed"], + identity.V1_CASE_PROFILE["eplb_reference_tokens_per_rank"], + ) + expected_eplb = { + **calibration, + "enabled": True, + "planner": identity.V1_CASE_PROFILE["eplb_planner"], + "mapping_sha256": contracts.eplb_contract.mapping_hash(expected_plan), + "logical_experts": eplb["logical_experts"], + "physical_experts": eplb["physical_experts"], + "redundant_experts": identity.V1_CASE_PROFILE["eplb_redundant_experts"], + "reference_tokens_per_rank": identity.V1_CASE_PROFILE[ + "eplb_reference_tokens_per_rank" + ], + "replicated_experts": expected_plan["replicated_experts"], + "max_replicas": expected_plan["max_replicas"], + "imbalance_before": expected_plan["imbalance_before"], + "imbalance_after": expected_plan["imbalance_after"], + } + if eplb != expected_eplb: + raise PublisherError("enabled EPLB descriptor differs from deterministic plan") + elif ( + any(eplb[field] is not None for field in nullable_eplb) + or eplb["physical_experts"] != expected_physical + or eplb["redundant_experts"] != 0 + or eplb["replicated_experts"] != 0 + ): + raise PublisherError("disabled EPLB descriptor claims a plan") + if item["backend"]["id"] == "nccl-ep": + expected_generation = ( + "nccl" if item["system"]["vendor"] == "nvidia" else "rccl" + ) + if item["backend"]["generation"] != expected_generation: + raise PublisherError("NCCL/RCCL reference generation differs 
from system vendor") + if (item["status"] == "decision-grade") != eligibility["decision_grade"]: + raise PublisherError("series status differs from eligibility") + if ( + set(eligibility["allocation_ids"]) != set(item["allocation_ids"]) + or eligibility["correct"] != all( + point["correctness"]["semantic_pass"] + and point["correctness"]["precision"]["passed"] + for point in item["points"] + ) + ): + raise PublisherError("series eligibility differs from its evidence") + selected_attempts = selected_by_series.get(item["series_id"], []) + if ( + set(item["case_ids"]) != {attempt["case_id"] for attempt in selected_attempts} + or set(item["allocation_ids"]) + != {attempt["allocation_id"] for attempt in selected_attempts} + or item["measurement"]["qualification_indices"] + != sorted({attempt["qualification_index"] for attempt in selected_attempts}) + ): + raise PublisherError("series case/allocation catalog differs from selected attempts") + if item["eligibility"]["decision_grade"] and len( + {attempt["run_id"] for attempt in selected_attempts} + ) < REQUIRED_ALLOCATIONS: + raise PublisherError("decision-grade series lacks independent workflow runs") + tokens = [point["tokens_per_rank"] for point in item["points"]] + if tokens != sorted(set(tokens)): + raise PublisherError("series points are not in unique ascending token order") + if len(item["case_ids"]) != 1: + raise PublisherError("public series must represent exactly one v1 case") + case_id = item["case_ids"][0] + if identity.digest("case", _public_case_factors(item)) != case_id: + raise PublisherError("public series projection differs from its case identity") + build = item["build"] + expected_series_id = identity.series_id({ + "backend": backend_id, + "case_id": case_id, + "image_digest": build["image_digest"], + "implementation_contract_sha256": build[ + "implementation_contract_sha256" + ], + "public_config_sha256": build["public_config_sha256"], + "routing_control_sha256": build["routing_control_sha256"], + 
"runtime_fingerprint_sha256": build["runtime_fingerprint_sha256"], + "source_sha": build["source_sha"], + "squash_sha256": build["squash_sha256"], + "workload_id": workload["workload_id"], + }) + if item["series_id"] != expected_series_id: + raise PublisherError("public series identity differs from its committed factors") + for point in item["points"]: + if point["point_id"] != identity.point_id(series=item["series_id"], tokens_per_rank=point["tokens_per_rank"]): + raise PublisherError("point identity differs") + if point["global_tokens"] != point["tokens_per_rank"] * item["system"]["ep_size"]: + raise PublisherError("global_tokens must use EP size") + routing = point["routing"] + max_fanout = min(item["workload"]["top_k"], item["system"]["ep_size"]) + if ( + routing["routed_copies"] < point["global_tokens"] + or routing["routed_copies"] > point["global_tokens"] * max_fanout + or routing["recv_tokens_max"] > routing["routed_copies"] + or routing["recv_tokens_max"] * item["system"]["ep_size"] + < routing["routed_copies"] + or not math.isclose( + routing["fanout_mean"], + routing["routed_copies"] / point["global_tokens"], + rel_tol=1e-12, + ) + or routing["hotspot_ratio"] < 1 + or routing["empty_expert_count"] >= eplb["physical_experts"] + or routing["empty_rank_count"] >= item["system"]["ep_size"] + ): + raise PublisherError("point routing/load facts are internally inconsistent") + expected_evidence = selected_evidence.get( + (item["series_id"], point["point_id"]), set() + ) + if set(point["evidence_ids"]) != expected_evidence: + raise PublisherError("point evidence differs from selected series attempts") + point_correctness = point["correctness"] + if ( + point_correctness["precision"]["profile_id"] + != workload["precision_profile"] + or ( + point_correctness["semantic_pass"] + and not point_correctness["precision"]["passed"] + ) + or point["stability"]["qualification_indices"] + != item["measurement"]["qualification_indices"] + ): + raise PublisherError("point 
correctness/stability differs from series evidence") + diagnostics = point["trial_diagnostics"] + diagnostic_reasons = set(diagnostics["reasons"]) + component_reasons: set[str] = set() + for name, summary in diagnostics["components"].items(): + if summary is None: + if point["components"][name] is not None: + raise PublisherError("trial diagnostics omit a measured component") + continue + if point["components"][name] is None: + raise PublisherError("trial diagnostics describe an unavailable component") + if summary["drift_flagged"] != ( + summary["first_last_median_ratio"] > TRIAL_DRIFT_RATIO_LIMIT + ) or summary["outlier_flagged"] != ( + summary["robust_outlier_fraction"] > TRIAL_OUTLIER_FRACTION_LIMIT + ): + raise PublisherError("trial diagnostic flags differ from their thresholds") + if summary["drift_flagged"]: + component_reasons.add("trial-drift") + if summary["outlier_flagged"]: + component_reasons.add("trial-outliers") + if ( + diagnostic_reasons != component_reasons + or diagnostics["flagged"] != bool(diagnostic_reasons) + or not diagnostic_reasons.issubset(point["anomalies"]) + ): + raise PublisherError("trial diagnostic summary is inconsistent") + components = point["components"] + if (components["dispatch"] is None) != (components["combine"] is None): + raise PublisherError("dispatch/combine availability differs") + for name, component in components.items(): + if component is None: + continue + expected_origin = "derived" if name == "isolated_sum" else "measured" + expected_samples = None if name == "isolated_sum" else 512 + if component["origin"] != expected_origin or component["sample_count"] != expected_samples: + raise PublisherError(f"{name} origin or sample count differs") + rate_fields = ( + "activation_data_rate_gbps_at_latency_percentile", + "total_logical_data_rate_gbps_at_latency_percentile", + ) + if name == "isolated_sum" and any(component[field] is not None for field in rate_fields): + raise PublisherError("isolated_sum cannot publish a 
derived data rate") + if name != "isolated_sum" and any(component[field] is None for field in rate_fields): + raise PublisherError(f"{name} measured data rates are missing") + latency = component["latency_us"] + if list(latency.values()) != sorted(latency.values()): + raise PublisherError("latency percentiles are not ordered") + byte_provenance = component["byte_provenance"] + if byte_provenance["total_logical_bytes"] != ( + byte_provenance["activation_data_bytes"] + byte_provenance["scale_bytes"] + ): + raise PublisherError("component byte accounting does not reconcile") + for field, byte_field in ( + ("activation_data_rate_gbps_at_latency_percentile", "activation_data_bytes"), + ("total_logical_data_rate_gbps_at_latency_percentile", "total_logical_bytes"), + ): + if component[field] is not None: + for statistic, rate in component[field].items(): + expected = byte_provenance[byte_field] / (latency[statistic] * 1000.0) + if not math.isclose(rate, expected, rel_tol=1e-9, abs_tol=1e-12): + raise PublisherError("component GB/s formula differs") + if components["roundtrip"] is None or components["roundtrip"]["origin"] != "measured": + raise PublisherError("roundtrip must be measured") + for statistic, throughput in point["roundtrip_token_rate_at_latency_percentile"].items(): + expected = point["global_tokens"] / ( + components["roundtrip"]["latency_us"][statistic] * 1e-6 + ) + if not math.isclose(throughput, expected, rel_tol=1e-9): + raise PublisherError("roundtrip token throughput formula differs") + if components["dispatch"] is not None: + derived = components["isolated_sum"] + if derived is None or any(not math.isclose( + derived["latency_us"][statistic], + components["dispatch"]["latency_us"][statistic] + + ( + components["stage"]["latency_us"][statistic] + if components["stage"] is not None else 0.0 + ) + + components["combine"]["latency_us"][statistic], rel_tol=1e-12 + ) for statistic in ("p50", "p90", "p95", "p99")): + raise PublisherError("isolated_sum is not 
the component percentile sum") + elif components["isolated_sum"] is not None: + raise PublisherError("isolated_sum requires measured dispatch/combine components") + if any(point["trial_diagnostics"]["flagged"] for point in item["points"]) != ( + "unresolved-trial-diagnostic" in item["eligibility"]["reasons"] + ): + raise PublisherError("series trial diagnostic eligibility is inconsistent") + cohorts = {item["cohort_id"]: item for item in doc["cohorts"]} + if len(cohorts) != len(doc["cohorts"]): + raise PublisherError("dataset has duplicate cohort IDs") + for item in doc["cohorts"]: + if not set(item["series_ids"]).issubset(series): + raise PublisherError("cohort references unknown series") + members = [series[series_id] for series_id in item["series_ids"]] + expected_tier = ( + "comparable-experimental" + if any(member["publication_tier"] == "comparable-experimental" for member in members) + else "official" + ) + if item["publication_tier"] != expected_tier: + raise PublisherError("cohort publication tier differs from its members") + if f"/ {members[0]['mode']} /" not in item["label"]: + raise PublisherError("cohort label omits its controlled mode") + roles = {member["backend"]["role"] for member in members} + if item["kind"] == "library" and roles != {"library"}: + raise PublisherError("library cohort contains non-library evidence") + if item["kind"] == "system" and roles != {"reference"}: + raise PublisherError("system cohort is not a portable reference comparison") + if item["kind"] in {"chip", "routing", *PRECISION_COHORT_KINDS} and len( + {_canonical(member["backend"]) for member in members} + ) != 1: + raise PublisherError(f"{item['kind']} cohort mixes backend implementations") + public_factors = [_public_cohort_factors(item["kind"], member) for member in members] + if len({_canonical(value[0]) for value in public_factors}) != 1: + raise PublisherError(f"{item['kind']} cohort does not control its public factors") + if len({_canonical(value[1]) for value in 
public_factors}) < 2: + raise PublisherError(f"{item['kind']} cohort does not vary its declared contrast") + if item["kind"] == "routing": + if item["publication_tier"] != "comparable-experimental": + raise PublisherError("routing cohort must be experimental") + has_baseline = sum( + member["workload"]["routing"] == "uniform" + and not member["workload"]["eplb"] + for member in members + ) == 1 + missing_reason = "missing-uniform-baseline" in item["eligibility"]["reasons"] + if has_baseline == missing_reason: + raise PublisherError("routing baseline and eligibility reason disagree") + mismatch = _routing_implementation_mismatch(members) + mismatch_reason = "implementation-config-mismatch" in item["eligibility"]["reasons"] + if mismatch != mismatch_reason: + raise PublisherError("routing implementation control and eligibility disagree") + if item["kind"] in PRECISION_COHORT_KINDS: + if item["publication_tier"] != "comparable-experimental": + raise PublisherError("precision cohorts must be experimental") + if item["kind"] in {"dispatch-precision", "combine-precision"}: + axis = ( + "dispatch" + if item["kind"] == "dispatch-precision" + else "combine" + ) + field = f"{axis}_precision" + bf16 = identity.precision_profile( + identity.V1_CONTROL_PRECISION_PROFILE + )[axis] + has_baseline = sum( + _canonical(member["workload"][field]) == _canonical(bf16) + for member in members + ) == 1 + missing_reason = ( + "missing-bf16-precision-baseline" + in item["eligibility"]["reasons"] + ) + if has_baseline == missing_reason: + raise PublisherError( + "precision baseline and eligibility reason disagree" + ) + expected_id = _derived_id("cxcohort-v1-", { + "kind": item["kind"], "series_ids": item["series_ids"], + "controlled_factors": item["controlled_factors"], + "varying_factors": item["varying_factors"], + }) + if item["cohort_id"] != expected_id: + raise PublisherError("cohort ID differs from its public factors") + expected_factors = { + "library": ( + ["system", "workload", 
"mode", "phase", "measurement", "resource.mode", "source"], + ["backend", "resource"], + ), + "chip": ( + ["backend", "source", "workload", "mode", "phase", "measurement", "resource.mode"], + ["system", "resource"], + ), + "system": ( + ["workload", "mode", "phase", "measurement", "source"], + ["system", "backend", "resource"], + ), + "routing": ( + ["backend", "implementation-static-build", "system", "model-shape", "mode", "phase", "measurement", "resource"], + ["workload.routing", "workload.eplb", "implementation-config"], + ), + "dispatch-precision": ( + [ + "backend", "implementation-static-build", "system", "model-shape", + "mode", "phase", "workload.routing", "workload.eplb", + "measurement", "resource", "combine-precision", + ], + ["dispatch-precision"], + ), + "combine-precision": ( + [ + "backend", "implementation-static-build", "system", "model-shape", + "mode", "phase", "workload.routing", "workload.eplb", + "measurement", "resource", "dispatch-precision", + ], + ["combine-precision"], + ), + "precision-pair": ( + [ + "backend", "implementation-static-build", "system", "model-shape", + "mode", "phase", "workload.routing", "workload.eplb", + "measurement", + ], + [ + "dispatch-precision", "combine-precision", "precision-profile", + "resource", + ], + ), + }[item["kind"]] + member_allocations = { + allocation for series_id in item["series_ids"] + for allocation in series[series_id]["allocation_ids"] + } + if ( + (item["controlled_factors"], item["varying_factors"]) != expected_factors + or set(item["eligibility"]["allocation_ids"]) != member_allocations + ): + raise PublisherError("cohort factors or allocations differ from its members") + _eligibility(item["eligibility"], f"cohort {item['cohort_id']}") + expected_ranking_keys: set[tuple[str, str, str, int]] = set() + for cohort in doc["cohorts"]: + if not cohort["eligibility"]["decision_grade"]: + continue + members = [series[series_id] for series_id in cohort["series_ids"]] + tokens = set.intersection(*( 
+ {point["tokens_per_rank"] for point in member["points"]} + for member in members + )) + expected_ranking_keys.update( + (cohort["cohort_id"], measure, statistic, token) + for token in tokens + for measure in ( + "latency_us", "activation_data_rate_gbps_at_latency_percentile", + "total_logical_data_rate_gbps_at_latency_percentile", + ) + for statistic in ("p50", "p99") + ) + ranking_top: dict[ + tuple[str, str, str, int], dict[str, Any] | None + ] = {} + ranking_ids: set[str] = set() + for ranking in doc["rankings"]: + cohort = cohorts.get(ranking["cohort_id"]) + if ( + cohort is None + or not cohort["eligibility"]["decision_grade"] + or ranking["eligibility"] != cohort["eligibility"] + or ranking["publication_tier"] != cohort["publication_tier"] + ): + raise PublisherError("ranking references an ineligible cohort") + entries = ranking["entries"] + _validate_metric(ranking["metric"]) + if cohort["kind"] == "library" and any( + series[series_id]["backend"]["role"] == "reference" + for series_id in cohort["series_ids"] + ): + raise PublisherError("reference evidence cannot drive a library ranking") + if {entry["series_id"] for entry in entries} != set(cohort["series_ids"]): + raise PublisherError("ranking does not cover its cohort") + for entry in entries: + point_id, value, unit = _metric_value(series[entry["series_id"]], ranking["metric"]) + if entry["point_id"] != point_id or entry["unit"] != unit or not math.isclose(entry["value"], value, rel_tol=1e-12): + raise PublisherError("ranking entry differs from series data") + reverse = ranking["metric"]["objective"] == "max" + expected = sorted(entries, key=lambda entry: (entry["value"], entry["series_id"]), reverse=reverse) + metric = ranking["metric"] + ranks = [entry["rank"] for entry in entries] + if metric["measure"] == "latency_us" and metric["statistic"] == "p99": + tied_first = sum(rank == 1 for rank in ranks) + expected_ranks = [1] * tied_first + list( + range(tied_first + 1, len(entries) + 1) + ) + else: + 
expected_ranks = list(range(1, len(entries) + 1)) + if entries != expected or not ranks or ranks != expected_ranks: + raise PublisherError("ranking order differs") + expected_id = _derived_id("cxranking-v1-", { + "cohort_id": ranking["cohort_id"], "metric": metric, + }) + if ranking["ranking_id"] != expected_id or expected_id in ranking_ids: + raise PublisherError("ranking ID is duplicate or differs") + ranking_ids.add(expected_id) + ranking_top[(ranking["cohort_id"], metric["measure"], metric["statistic"], metric["tokens_per_rank"])] = ( + entries[0] if ranks.count(1) == 1 else None + ) + if set(ranking_top) != expected_ranking_keys: + raise PublisherError("rankings do not cover every eligible cohort metric") + objective = { + "min-p50-latency": ("latency_us", "p50"), "min-p99-latency": ("latency_us", "p99"), + "max-activation-data-rate-at-p50-latency": ( + "activation_data_rate_gbps_at_latency_percentile", "p50" + ), + "max-activation-data-rate-at-p99-latency": ( + "activation_data_rate_gbps_at_latency_percentile", "p99" + ), + "max-total-logical-data-rate-at-p50-latency": ( + "total_logical_data_rate_gbps_at_latency_percentile", "p50" + ), + "max-total-logical-data-rate-at-p99-latency": ( + "total_logical_data_rate_gbps_at_latency_percentile", "p99" + ), + } + recommendation_ids: set[str] = set() + for item in doc["recommendations"]: + if item["objective"] != "min-p99-latency": + raise PublisherError("recommendation is not a unique p99 latency winner") + measure, statistic = objective[item["objective"]] + candidates = [top for key, top in ranking_top.items() + if key[:3] == (item["cohort_id"], measure, statistic) + and top is not None and top["point_id"] == item["point_id"]] + if len(candidates) != 1 or any(item[field] != candidates[0][field] for field in ("series_id", "point_id", "value", "unit")): + raise PublisherError("recommendation is not a ranking winner") + matching_ranking = next( + ranking for ranking in doc["rankings"] + if ranking["cohort_id"] == 
item["cohort_id"] + and ranking["metric"]["measure"] == measure + and ranking["metric"]["statistic"] == statistic + and ranking["entries"][0]["point_id"] == item["point_id"] + ) + expected_id = _derived_id("cxrecommendation-v1-", { + "objective": item["objective"], "ranking_id": matching_ranking["ranking_id"], + }) + cohort = cohorts[item["cohort_id"]] + if (item["recommendation_id"] != expected_id or expected_id in recommendation_ids + or cohort["publication_tier"] != "official" + or item["publication_tier"] != "official" + or item["eligibility"] != cohort["eligibility"]): + raise PublisherError("recommendation ID/eligibility differs") + recommendation_ids.add(expected_id) + expected_recommendations = sum( + cohorts[ranking["cohort_id"]]["publication_tier"] == "official" + and ranking["metric"]["measure"] == "latency_us" + and ranking["metric"]["statistic"] == "p99" + and sum(entry["rank"] == 1 for entry in ranking["entries"]) == 1 + for ranking in doc["rankings"] + ) + if len(doc["recommendations"]) != expected_recommendations: + raise PublisherError("recommendations do not cover every actionable ranking") + sensitivity_ids: set[str] = set() + sensitivity_keys: set[tuple[str, str, str, str, str, int]] = set() + for item in doc["sensitivities"]: + cohort = cohorts.get(item["cohort_id"]) + if ( + cohort is None + or cohort["kind"] not in { + "routing", "dispatch-precision", "combine-precision", + } + or not cohort["eligibility"]["decision_grade"] + or item["publication_tier"] != cohort["publication_tier"] + or item["eligibility"] != cohort["eligibility"] + ): + raise PublisherError("sensitivity references an ineligible contrast cohort") + if ( + item["baseline_series_id"] == item["candidate_series_id"] + or not {item["baseline_series_id"], item["candidate_series_id"]}.issubset(cohort["series_ids"]) + ): + raise PublisherError("sensitivity series differ from its cohort") + _validate_metric(item["metric"]) + baseline_series = series[item["baseline_series_id"]] + if 
cohort["kind"] == "routing": + if ( + baseline_series["workload"]["routing"] != "uniform" + or baseline_series["workload"]["eplb"] + ): + raise PublisherError("sensitivity baseline is not uniform without EPLB") + else: + axis = ( + "dispatch" + if cohort["kind"] == "dispatch-precision" + else "combine" + ) + field = f"{axis}_precision" + bf16 = identity.precision_profile( + identity.V1_CONTROL_PRECISION_PROFILE + )[axis] + if _canonical(baseline_series["workload"][field]) != _canonical(bf16): + raise PublisherError("precision sensitivity baseline is not BF16") + _, baseline, _ = _metric_value(series[item["baseline_series_id"]], item["metric"]) + _, candidate, _ = _metric_value(series[item["candidate_series_id"]], item["metric"]) + if not math.isclose(item["signed_change_ratio"], (candidate - baseline) / baseline, rel_tol=1e-12): + raise PublisherError("sensitivity ratio differs") + expected_id = _derived_id("cxsensitivity-v1-", { + "baseline": item["baseline_series_id"], + "candidate": item["candidate_series_id"], + "cohort": item["cohort_id"], "metric": item["metric"], + }) + if item["sensitivity_id"] != expected_id or expected_id in sensitivity_ids: + raise PublisherError("sensitivity ID is duplicate or differs") + sensitivity_ids.add(expected_id) + sensitivity_keys.add(( + item["cohort_id"], item["baseline_series_id"], item["candidate_series_id"], + item["metric"]["measure"], item["metric"]["statistic"], + item["metric"]["tokens_per_rank"], + )) + expected_sensitivity_keys: set[tuple[str, str, str, str, str, int]] = set() + for cohort in doc["cohorts"]: + if ( + cohort["kind"] not in { + "routing", "dispatch-precision", "combine-precision", + } + or not cohort["eligibility"]["decision_grade"] + ): + continue + members = [series[series_id] for series_id in cohort["series_ids"]] + if cohort["kind"] == "routing": + baseline = next(( + member for member in members + if member["workload"]["routing"] == "uniform" + and not member["workload"]["eplb"] + ), None) + else: 
+ axis = ( + "dispatch" + if cohort["kind"] == "dispatch-precision" + else "combine" + ) + field = f"{axis}_precision" + bf16 = identity.precision_profile( + identity.V1_CONTROL_PRECISION_PROFILE + )[axis] + baseline = next(( + member for member in members + if _canonical(member["workload"][field]) == _canonical(bf16) + ), None) + if baseline is None: + continue + tokens = set.intersection(*( + {point["tokens_per_rank"] for point in member["points"]} + for member in members + )) + expected_sensitivity_keys.update( + (cohort["cohort_id"], baseline["series_id"], candidate["series_id"], + measure, statistic, token) + for candidate in members if candidate is not baseline + for token in tokens + for measure in ( + "latency_us", "activation_data_rate_gbps_at_latency_percentile", + "total_logical_data_rate_gbps_at_latency_percentile", + ) + for statistic in ("p50", "p99") + ) + if sensitivity_keys != expected_sensitivity_keys: + raise PublisherError("sensitivities do not cover every declared contrast metric") + observed_qualification_indices = sorted({ + item["qualification_index"] for item in doc["attempts"] if item["selected"] + }) + if promotion["qualification_indices"] != observed_qualification_indices: + raise PublisherError("promotion qualification index catalog differs from attempts") + if promotion["status"] == "promoted": + run_ids = {item["run_id"] for item in doc["attempts"] if item["selected"]} + repeated_cases = all( + { + attempts[attempt_id]["qualification_index"] + for attempt_id in coverage["attempt_ids"] + if attempts[attempt_id]["selected"] + } == {1, 2, 3} + for coverage in doc["coverage"] + ) + if promotion["matrix_id"] != CANONICAL_FULL_V1_MATRIX_SHA256: + raise PublisherError("promotion requires the canonical full-v1 matrix") + if ( + _case_disposition_catalog_sha256(doc["coverage"]) + != CANONICAL_FULL_V1_CASE_CATALOG_SHA256 + ): + raise PublisherError("promotion requires the canonical case/disposition catalog") + if ( + terminal != 
len(doc["coverage"]) + or promotion["qualification_indices"] != [1, 2, 3] + or promotion["measured_cases"] + promotion["unsupported_cases"] + != promotion["requested_cases"] + or promotion["measured_points"] + promotion["unsupported_points"] + != promotion["requested_points"] + or promotion["terminal_points"] != promotion["requested_points"] + or len(doc["source_bundle_ids"]) != REQUIRED_ALLOCATIONS + or len(run_ids) != REQUIRED_ALLOCATIONS + or not repeated_cases + ): + raise PublisherError("promoted dataset lacks complete coverage") + expected_outcomes = { + item["case_id"]: ( + "success" if item["disposition"] == "runnable" else "unsupported" + ) + for item in doc["coverage"] + } + if any( + item["selected"] + and item["outcome"] != expected_outcomes[item["case_id"]] + for item in doc["attempts"] + ): + raise PublisherError("promoted outcomes differ from requested dispositions") + runnable_cases = { + item["case_id"] for item in doc["coverage"] + if item["disposition"] == "runnable" + } + if any( + item["case_id"] in runnable_cases and item["outcome"] != "success" + for item in doc["attempts"] + ): + raise PublisherError( + "promotion rejects runnable cases with failed, invalid, or diagnostic retries" + ) + _require_promotion_series(doc["series"]) + _require_promotion_cohorts(doc["cohorts"], doc["series"]) + if not doc["rankings"]: + raise PublisherError("promoted dataset lacks eligible rankings") + if promotion["status"] == "quarantined" and any(( + doc["source_bundle_ids"], promotion["allocation_ids"], doc["coverage"], + doc["attempts"], doc["series"], doc["cohorts"], doc["rankings"], + doc["recommendations"], doc["sensitivities"], + )): + raise PublisherError("quarantined dataset exposes unvalidated evidence") + return doc + + +def _file_record(value: Any, path: str) -> dict[str, Any]: + item = _exact(value, {"path", "sha256", "bytes"}, path) + if not isinstance(item["path"], str) or PurePosixPath(item["path"]).is_absolute() or ".." 
in PurePosixPath(item["path"]).parts: + raise PublisherError(f"{path}.path is unsafe") + if not isinstance(item["sha256"], str) or HEX64.fullmatch(item["sha256"]) is None: + raise PublisherError(f"{path}.sha256 is invalid") + _integer(item["bytes"], f"{path}.bytes", minimum=1) + return item + +def validate_bundle_manifest(doc: Any) -> dict[str, Any]: + _schema("private-bundle-v1.schema.json", doc) + attempts = {item["attempt_id"]: item for item in doc["attempts"]} + if len(attempts) != len(doc["attempts"]): + raise PublisherError("bundle has duplicate attempt IDs") + selections = doc["coverage"]["selections"] + if len({item["case_id"] for item in selections}) != len(selections): + raise PublisherError("bundle has duplicate selected cases") + counts = {name: 0 for name in OUTCOMES} + for selection in selections: + attempt = attempts.get(selection["selected_attempt_id"]) + if attempt is None or not attempt["selected"] or attempt["case_id"] != selection["case_id"] or attempt["outcome"] != selection["outcome"]: + raise PublisherError("bundle selection differs from retained attempt") + counts[selection["outcome"]] += 1 + coverage = doc["coverage"] + if coverage["terminal_cases"] != len(selections) or coverage["outcome_counts"] != counts: + raise PublisherError("bundle terminal counts differ") + if coverage["complete"] != (coverage["expected_cases"] == len(selections)): + raise PublisherError("bundle completeness differs from coverage") + fingerprints: dict[str, set[str]] = {} + for attempt in doc["attempts"]: + value = attempt["runtime_fingerprint_sha256"] + if value: + fingerprints.setdefault(attempt["allocation_id"], set()).add(value) + if any(len(values) != 1 for values in fingerprints.values()): + raise PublisherError("bundle runtime is heterogeneous within an allocation") + return doc + + +def _fsync_dir(path: Path) -> None: + descriptor = os.open(path, os.O_RDONLY | getattr(os, "O_DIRECTORY", 0)) + try: + os.fsync(descriptor) + finally: + os.close(descriptor) + + 
+def _write_bytes(path: Path, data: bytes, *, mode: int) -> None: + descriptor = os.open( + path, + os.O_WRONLY | os.O_CREAT | os.O_EXCL | getattr(os, "O_NOFOLLOW", 0), + mode, + ) + try: + os.fchmod(descriptor, mode) + with os.fdopen(descriptor, "wb", closefd=False) as handle: + handle.write(data) + handle.flush() + os.fsync(handle.fileno()) + finally: + os.close(descriptor) + + +def _write_all(descriptor: int, data: bytes) -> None: + view = memoryview(data) + while view: + view = view[os.write(descriptor, view):] + + +def _write_json(path: Path, value: Any, *, mode: int) -> bytes: + data = _canonical(value) + b"\n" + _write_bytes(path, data, mode=mode) + return data + + +def _file_metadata(path: Path, relative_to: Path) -> dict[str, Any]: + return { + "path": path.relative_to(relative_to).as_posix(), + "sha256": _sha_file(path), + "bytes": path.stat().st_size, + } + + +def _tree_files(root: Path) -> list[Path]: + return sorted( + path for path in root.rglob("*") + if path.is_file() and not path.is_symlink() and path.name != "COMPLETE" + ) + + +def _verify_regular_file(path: Path, expected_mode: int) -> None: + _reject_symlinked_path(path.parent) + try: + metadata = os.lstat(path) + except FileNotFoundError as exc: + raise PublisherError(f"required file is missing: {path.name}") from exc + if ( + not stat.S_ISREG(metadata.st_mode) + or metadata.st_uid != os.getuid() + or stat.S_IMODE(metadata.st_mode) != expected_mode + ): + raise PublisherError( + f"file is not an owned regular {expected_mode:o} object: {path.name}" + ) + + +def _verify_frozen_tree(root: Path, *, private: bool) -> None: + _reject_symlinked_path(root) + directory_mode = 0o500 if private else 0o555 + file_mode = 0o400 if private else 0o444 + try: + root_metadata = os.lstat(root) + except OSError as exc: + raise PublisherError(f"cannot inspect immutable object: {root.name}") from exc + if not stat.S_ISDIR(root_metadata.st_mode): + raise PublisherError(f"immutable object is not a real directory: 
{root.name}") + try: + entries = [root, *root.rglob("*")] + except OSError as exc: + raise PublisherError(f"cannot inspect immutable object: {root.name}") from exc + for path in entries: + metadata = os.lstat(path) + if metadata.st_uid != os.getuid(): + raise PublisherError(f"immutable object has the wrong owner: {path.name}") + if stat.S_ISDIR(metadata.st_mode): + expected = directory_mode + elif stat.S_ISREG(metadata.st_mode): + expected = file_mode + else: + raise PublisherError(f"immutable object contains a linked or special entry: {path.name}") + if stat.S_IMODE(metadata.st_mode) != expected: + raise PublisherError( + f"immutable object mode differs for {path.name}: expected {expected:o}" + ) + + +def _freeze_tree(root: Path, *, private: bool) -> None: + files: list[Path] = [] + directories = [root] + for path in root.rglob("*"): + metadata = os.lstat(path) + if stat.S_ISDIR(metadata.st_mode): + directories.append(path) + elif stat.S_ISREG(metadata.st_mode): + files.append(path) + else: + raise PublisherError(f"immutable object contains a linked or special entry: {path.name}") + for path in files: + os.chmod(path, 0o400 if private else 0o444) + for path in sorted(directories, key=lambda item: len(item.parts), reverse=True): + os.chmod(path, 0o500 if private else 0o555) + _fsync_dir(path) + _verify_frozen_tree(root, private=private) + + +def _reject_symlinked_path(path: Path) -> None: + current = Path(path.anchor) + for part in path.parts[1:]: + current /= part + try: + metadata = os.lstat(current) + except FileNotFoundError: + break + if stat.S_ISLNK(metadata.st_mode): + raise PublisherError("COLLECTIVEX_STORE_ROOT must not traverse a symlinked parent") + if not stat.S_ISDIR(metadata.st_mode): + raise PublisherError(f"store path component is not a directory: {current}") + + +class Store: + """Atomic private/public directory operations on one operator filesystem.""" + + def __init__(self, root: str | os.PathLike[str]): + candidate = 
Path(os.path.abspath(os.path.expanduser(root))) + _reject_symlinked_path(candidate) + candidate.mkdir(parents=True, exist_ok=True, mode=0o750) + resolved = candidate.resolve() + if candidate != resolved: + raise PublisherError( + "COLLECTIVEX_STORE_ROOT must not traverse a symlinked parent" + ) + root_metadata = candidate.stat() + if root_metadata.st_uid != os.getuid() or stat.S_IMODE(root_metadata.st_mode) & 0o022: + raise PublisherError( + "COLLECTIVEX_STORE_ROOT must be owned by this user and not group/world writable" + ) + os.chmod(candidate, 0o750) + if stat.S_IMODE(candidate.stat().st_mode) != 0o750: + raise PublisherError("COLLECTIVEX_STORE_ROOT mode must be 750") + self.root = resolved + raw = self.root + self.private = raw / "private" + self.incoming = self.private / "incoming" + self.bundles = self.private / "bundles" + self.quarantine = self.private / "quarantine" + self.public = raw / "public" + self.datasets = self.public / "datasets" + self.channels = self.public / "channels" + self.locks = raw / "locks" + for path, mode in ( + (self.private, 0o700), (self.incoming, 0o700), (self.bundles, 0o700), + (self.quarantine, 0o700), (self.public, 0o755), (self.datasets, 0o755), + (self.channels, 0o755), (self.locks, 0o700), + ): + path.mkdir(parents=True, exist_ok=True, mode=mode) + if path.is_symlink() or not path.is_dir(): + raise PublisherError(f"store path is not a real directory: {path}") + os.chmod(path, mode) + + @contextlib.contextmanager + def locked(self) -> Iterator[None]: + lock_path = self.locks / "publisher.lock" + descriptor = os.open( + lock_path, + os.O_RDWR | os.O_CREAT | getattr(os, "O_NOFOLLOW", 0), + 0o600, + ) + try: + os.fchmod(descriptor, 0o600) + metadata = os.fstat(descriptor) + if ( + not stat.S_ISREG(metadata.st_mode) + or metadata.st_uid != os.getuid() + or stat.S_IMODE(metadata.st_mode) != 0o600 + ): + raise PublisherError("publisher lock is not an owned regular 600 file") + fcntl.flock(descriptor, fcntl.LOCK_EX) + yield + 
finally: + fcntl.flock(descriptor, fcntl.LOCK_UN) + os.close(descriptor) + + @contextlib.contextmanager + def staging(self, parent: Path, *, private: bool) -> Iterator[Path]: + stage = Path(tempfile.mkdtemp(prefix=".staging-", dir=parent)) + os.chmod(stage, 0o700 if private else 0o755) + try: + yield stage + finally: + if stage.exists(): + for path in stage.rglob("*"): + metadata = os.lstat(path) + if stat.S_ISDIR(metadata.st_mode): + os.chmod(path, 0o700) + elif stat.S_ISREG(metadata.st_mode): + os.chmod(path, 0o600) + os.chmod(stage, 0o700) + shutil.rmtree(stage, ignore_errors=True) + + @staticmethod + def complete(stage: Path, value: str, *, private: bool) -> None: + _write_bytes(stage / "COMPLETE", (value + "\n").encode(), mode=0o600 if private else 0o644) + _fsync_dir(stage) + + @staticmethod + def install(stage: Path, destination: Path, *, private: bool) -> None: + if destination.is_symlink(): + raise PublisherError(f"immutable destination is a symlink: {destination.name}") + if destination.exists(): + _verify_frozen_tree(destination, private=private) + marker = destination / "COMPLETE" + if not marker.is_file() or marker.read_text().strip() != destination.name: + raise PublisherError(f"immutable destination is incomplete: {destination.name}") + return + _freeze_tree(stage, private=private) + os.rename(stage, destination) + _fsync_dir(destination.parent) + _verify_frozen_tree(destination, private=private) + + def install_dataset(self, dataset: dict[str, Any]) -> tuple[str, int]: + validate_public_dataset(dataset) + payload = _canonical(dataset) + b"\n" + if len(payload) > MAX_PUBLIC_DATASET_BYTES: + raise PublisherError("public dataset exceeds the serving size limit") + digest = _sha_bytes(payload) + destination = self.datasets / digest + with self.staging(self.datasets, private=False) as stage: + _write_bytes(stage / "dataset.json", payload, mode=0o644) + self.complete(stage, digest, private=False) + self.install(stage, destination, private=False) + stored = 
destination / "dataset.json" + marker = destination / "COMPLETE" + if (not marker.is_file() or marker.read_text().strip() != digest + or _sha_file(stored) != digest or stored.stat().st_size != len(payload)): + raise PublisherError("stored dataset checksum differs after installation") + return digest, len(payload) + + def update_channel(self, channel: str, digest: str, size: int, generated_at: str) -> None: + if size > MAX_PUBLIC_DATASET_BYTES: + raise PublisherError("channel dataset exceeds the serving size limit") + _verify_frozen_tree(self.datasets / digest, private=False) + marker = self.datasets / digest / "COMPLETE" + if not marker.is_file() or marker.read_text().strip() != digest: + raise PublisherError("cannot advance a channel to an incomplete dataset") + dataset_path = self.datasets / digest / "dataset.json" + dataset = validate_public_dataset(strict_load(dataset_path)) + if ( + _sha_file(dataset_path) != digest + or dataset_path.stat().st_size != size + or dataset["generated_at"] != generated_at + ): + raise PublisherError("channel metadata differs from its stored dataset") + if channel == "dev-latest" and dataset["promotion"]["status"] != "promoted": + raise PublisherError("dev-latest may only reference a promoted dataset") + pointer = { + "format": FORMAT_CHANNEL, + "channel": channel, + "dataset": { + "path": f"datasets/{digest}/dataset.json", + "sha256": digest, + "bytes": size, + }, + "generated_at": generated_at, + } + validate_channel(pointer, expected_channel=channel) + destination = self.channels / f"{channel}.json" + temporary = self.channels / f".{channel}.tmp-{os.getpid()}" + try: + data = _canonical(pointer) + b"\n" + _write_bytes(temporary, data, mode=0o644) + os.replace(temporary, destination) + _fsync_dir(self.channels) + finally: + temporary.unlink(missing_ok=True) + + def verify_channel(self, channel: str) -> dict[str, Any]: + channel_path = self.channels / f"{channel}.json" + _verify_regular_file(channel_path, 0o644) + pointer = 
validate_channel(strict_load(channel_path), expected_channel=channel) + target = self.public / pointer["dataset"]["path"] + _verify_frozen_tree(target.parent, private=False) + if target.stat().st_size != pointer["dataset"]["bytes"] or _sha_file(target) != pointer["dataset"]["sha256"]: + raise PublisherError(f"channel {channel} dataset checksum differs") + marker = target.parent / "COMPLETE" + if not marker.is_file() or marker.read_text().strip() != pointer["dataset"]["sha256"]: + raise PublisherError(f"channel {channel} dataset is incomplete") + dataset = validate_public_dataset(strict_load(target)) + if pointer["generated_at"] != dataset["generated_at"]: + raise PublisherError(f"channel {channel} metadata differs from its dataset") + if channel == "dev-latest" and dataset["promotion"]["status"] != "promoted": + raise PublisherError("dev-latest points to a non-promoted dataset") + return pointer + + +def _copy_source(source: Path, destination: Path) -> None: + if source.is_symlink() or not source.is_file() or not stat.S_ISREG(source.stat().st_mode): + raise PublisherError(f"source must be a regular non-symlink file: {source}") + descriptor = os.open(source, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)) + try: + output = os.open(destination, os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o600) + try: + while True: + chunk = os.read(descriptor, 1024 * 1024) + if not chunk: + break + _write_all(output, chunk) + os.fsync(output) + finally: + os.close(output) + finally: + os.close(descriptor) + + +def _archive_download_directory(source: Path, destination: Path) -> None: + if source.is_symlink() or not source.is_dir(): + raise PublisherError(f"artifact directory is invalid: {source}") + files: list[Path] = [] + for path in source.rglob("*"): + if path.is_symlink(): + raise PublisherError("artifact directory contains a symlink") + if path.is_dir(): + continue + if not path.is_file(): + raise PublisherError("artifact directory contains a non-regular entry") + files.append(path) + 
files.sort() + if not files or len(files) > MAX_ARCHIVE_MEMBERS: + raise PublisherError("artifact directory has an invalid file count") + total = 0 + with zipfile.ZipFile(destination, "x", compression=zipfile.ZIP_STORED) as archive: + for path in files: + descriptor = os.open(path, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)) + with os.fdopen(descriptor, "rb") as handle: + metadata = os.fstat(handle.fileno()) + if not stat.S_ISREG(metadata.st_mode): + raise PublisherError("artifact directory member changed type") + size = metadata.st_size + total += size + if size > MAX_ARCHIVE_MEMBER_BYTES or total > MAX_ARCHIVE_TOTAL_BYTES: + raise PublisherError("artifact directory exceeds size limits") + relative = path.relative_to(source).as_posix() + _safe_member(relative) + info = zipfile.ZipInfo(relative, date_time=(1980, 1, 1, 0, 0, 0)) + info.compress_type = zipfile.ZIP_STORED + info.external_attr = (stat.S_IFREG | 0o600) << 16 + with archive.open(info, "w") as output: + written = 0 + for chunk in iter(lambda: handle.read(1024 * 1024), b""): + output.write(chunk) + written += len(chunk) + if written != size: + raise PublisherError("artifact directory member changed size") + descriptor = os.open(destination, os.O_RDONLY) + try: + os.fsync(descriptor) + finally: + os.close(descriptor) + + +def _artifact_name(source: Path) -> str: + name = source.name if source.is_dir() else source.name.removesuffix(".zip") + if ( + not source.is_dir() and source.suffix != ".zip" + or ARTIFACT_NAME.fullmatch(name) is None + ): + raise PublisherError(f"artifact source has an invalid GHA name: {source.name}") + return name + + +def archive_incoming( + store: Store, + matrix: Path, + artifacts: Sequence[Path], + run: dict[str, Any], +) -> tuple[str, Path, list[dict[str, Any]]]: + """Copy exact delivery bytes into immutable incoming before any JSON/ZIP parse.""" + if not artifacts: + raise PublisherError("at least one GitHub artifact archive is required") + with store.staging(store.incoming, 
private=True) as stage: + sources = stage / "sources" + sources.mkdir(mode=0o700) + copied: list[dict[str, Any]] = [] + named_artifacts = sorted( + ((_artifact_name(path), path) for path in artifacts), key=lambda item: item[0] + ) + artifact_names = [name for name, _ in named_artifacts] + if len(artifact_names) != len(set(artifact_names)): + raise PublisherError("artifact delivery contains duplicate GHA names") + inputs = [("matrix.json", matrix, "matrix", None)] + [ + (f"artifact-{index:04d}.zip", path, "artifact", artifact_name) + for index, (artifact_name, path) in enumerate(named_artifacts) + ] + for name, source, kind, artifact_name in inputs: + destination = sources / name + if source.is_dir(): + _archive_download_directory(source, destination) + else: + if source != matrix and source.stat().st_size > MAX_ARCHIVE_TOTAL_BYTES: + raise PublisherError("artifact archive exceeds the size limit") + _copy_source(source, destination) + copied.append({ + **_file_metadata(destination, stage), + "kind": kind, + "artifact_name": artifact_name, + }) + ingest_id = _sha_bytes(_canonical({"run": run, "sources": copied})) + incoming_manifest = { + "format": "collectivex.incoming.v1", + "schema_version": 1, + "ingest_id": ingest_id, + "run": run, + "sources": copied, + } + _write_json(stage / "incoming.json", incoming_manifest, mode=0o600) + store.complete(stage, ingest_id, private=True) + destination = store.incoming / ingest_id + store.install(stage, destination, private=True) + installed = store.incoming / ingest_id + if strict_load(installed / "incoming.json") != incoming_manifest: + raise PublisherError("existing incoming object differs from archived delivery") + for record in copied: + _resolve_bundle_file(installed, record) + return ingest_id, installed, copied + + +def _safe_member(name: str) -> PurePosixPath: + if "\\" in name or "\0" in name: + raise PublisherError("archive member has an unsafe separator") + path = PurePosixPath(name) + if path.is_absolute() or not 
path.parts or any(part in {"", ".", ".."} for part in path.parts): + raise PublisherError("archive member path escapes its artifact") + return path + + +def extract_archive(archive: Path, destination: Path) -> list[Path]: + """Extract a bounded regular-file ZIP without trusting member paths or links.""" + try: + handle = zipfile.ZipFile(archive) + except (OSError, zipfile.BadZipFile) as exc: + raise PublisherError("artifact is not a valid ZIP archive") from exc + extracted: list[Path] = [] + seen: set[str] = set() + total = 0 + with handle: + members = handle.infolist() + if not members or len(members) > MAX_ARCHIVE_MEMBERS: + raise PublisherError("artifact has an invalid member count") + for member in members: + path = _safe_member(member.filename.rstrip("/")) + key = path.as_posix() + if key in seen: + raise PublisherError("artifact contains duplicate member paths") + seen.add(key) + mode = member.external_attr >> 16 + if stat.S_ISLNK(mode) or (mode and not (stat.S_ISREG(mode) or stat.S_ISDIR(mode))): + raise PublisherError("artifact contains a non-regular member") + if member.flag_bits & 0x1: + raise PublisherError("encrypted artifact members are not accepted") + if member.file_size > MAX_ARCHIVE_MEMBER_BYTES: + raise PublisherError("artifact member exceeds the size limit") + total += member.file_size + if total > MAX_ARCHIVE_TOTAL_BYTES: + raise PublisherError("artifact exceeds the expanded size limit") + target = destination.joinpath(*path.parts) + if member.is_dir(): + target.mkdir(parents=True, exist_ok=True, mode=0o700) + continue + target.parent.mkdir(parents=True, exist_ok=True, mode=0o700) + output = os.open(target, os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o600) + try: + with handle.open(member, "r") as source: + written = 0 + while True: + chunk = source.read(1024 * 1024) + if not chunk: + break + _write_all(output, chunk) + written += len(chunk) + if written != member.file_size: + raise PublisherError("artifact member size changed during extraction") + 
os.fsync(output) + finally: + os.close(output) + extracted.append(target) + return extracted + + +def validate_matrix(document: Any) -> list[dict[str, Any]]: + try: + artifact_safety.assert_publication_safe([document]) + matrix = sweep_matrix.validate_matrix_document(document) + except (SystemExit, ValueError, artifact_safety.ArtifactSafetyError) as exc: + raise PublisherError(f"requested matrix is invalid: {exc}") from exc + return [ + { + "sku": item["sku"], + **item["case"], + "_disposition": item["disposition"], + "_reason": item["reason"], + } + for item in matrix["requested_cases"] + ] + + +def _expected_deliveries( + matrix: dict[str, Any], cases: Sequence[dict[str, Any]], run: dict[str, Any] +) -> dict[str, tuple[str, str, str]]: + shard_by_case: dict[str, str] = {} + for shard in matrix["include"]: + for case_id in shard["case_ids"]: + if case_id in shard_by_case: + raise PublisherError("requested case appears in two runnable shards") + shard_by_case[case_id] = shard["id"] + suffix = f"{run['run_id']}-{run['run_attempt']}" + deliveries: dict[str, tuple[str, str, str]] = {} + for case in cases: + case_id = case["case_id"] + if case["_disposition"] == "unsupported": + deliveries[case_id] = ( + f"cxunsupported-{suffix}", "setup", + f"{run['run_id']}_{run['run_attempt']}_unsupported", + ) + continue + shard_id = shard_by_case.get(case_id) + if shard_id is None: + raise PublisherError("runnable case has no matrix shard") + deliveries[case_id] = ( + f"cxshard-{shard_id}-{suffix}", "sweep", + f"{run['run_id']}_{run['run_attempt']}_{shard_id}", + ) + return deliveries + + +def _document_git_run(document: dict[str, Any]) -> dict[str, Any] | None: + provenance = document.get("provenance") + if not isinstance(provenance, dict): + return None + value = provenance.get("git_run", provenance) + return value if isinstance(value, dict) else None + + +def _run_matches(document: dict[str, Any], run: dict[str, Any]) -> bool: + git_run = _document_git_run(document) + if 
git_run is None: + return False + return ( + str(git_run.get("run_id")) == run["run_id"] + and str(git_run.get("run_attempt")) == str(run["run_attempt"]) + and git_run.get("qualification_index") == run["qualification_index"] + and git_run.get("source_sha") == run["source_sha"] + and (git_run.get("repo") or git_run.get("repository")) == run["repository"] + ) + + +def _case_matches(document: dict[str, Any], expected: dict[str, Any]) -> bool: + scheduled = { + key: value for key, value in expected.items() + if key not in {"sku", "case_id"} and not key.startswith("_") + } + return document.get("identity", {}).get("case_factors") == { + "case": scheduled, + "profile": identity.profile_for_case(scheduled), + "sku": expected["sku"], + } + + +def _outcome(document: dict[str, Any]) -> tuple[str, str | None]: + status = document["outcome"]["status"] + if status == "success": + return status, None + native = document["outcome"].get("reason") + reason = native if isinstance(native, str) and REASON.fullmatch(native) else { + "unsupported": "unsupported-capability", "failed": "execution-failed", + "invalid": "validation-failed", "diagnostic": "diagnostic-evidence", + }.get(status) + if reason is None: + raise PublisherError(f"unsupported native outcome {status!r}") + return status, reason + + +def _attempt_record( + document: dict[str, Any], path: Path, root: Path, *, selected: bool +) -> dict[str, Any]: + normalized = contracts.normalize_attempt(document) + runtime = normalized["runtime_fingerprint"] + runtime_sha = _sha_bytes(_canonical(runtime)) if runtime is not None else None + sample_record = None + evidence_ids: list[str] = [] + series_ids: list[str] = [] + if document["format"] == contracts.RAW_FORMAT: + sample_path = path.with_name(document["sample_artifact"]["path"]) + sample_record = _file_metadata(sample_path, root) + evidence_ids = [row["evidence_id"] for row in document["measurement"]["rows"]] + series_ids = [document["identity"]["series_id"]] + declared = 
document["identity"]["series_factors"]["runtime_fingerprint_sha256"] + if runtime_sha != declared: + raise PublisherError("runtime fingerprint checksum differs from series identity") + status, reason = _outcome(document) + return { + "attempt_id": normalized["attempt_id"], + "allocation_id": normalized["allocation_id"], + "case_id": normalized["case_id"], + "outcome": status, + "reason": reason, + "selected": selected, + "document": _file_metadata(path, root), + "samples": sample_record, + "runtime_fingerprint_sha256": runtime_sha, + "series_ids": series_ids, + "evidence_ids": evidence_ids, + } + + +def _validate_delivery_binding( + document: dict[str, Any], path: Path, raw_root: Path, + artifact_by_root: dict[str, str], expected_by_id: dict[str, dict[str, Any]], + expected_deliveries: dict[str, tuple[str, str, str]], run: dict[str, Any], +) -> str: + case_id = document["identity"]["case_id"] + if case_id not in expected_by_id: + raise PublisherError("artifact contains an extra case outcome") + expected = expected_by_id[case_id] + if not _case_matches(document, expected): + raise PublisherError("attempt case coordinates differ from the requested matrix") + unsupported = document["outcome"]["status"] == "unsupported" + if (expected["_disposition"] == "unsupported") != unsupported: + raise PublisherError("terminal outcome differs from requested capability disposition") + if unsupported and document["outcome"]["reason"] != expected["_reason"]: + raise PublisherError("unsupported outcome reason differs from requested matrix") + if not _run_matches(document, run): + raise PublisherError("attempt provenance differs from publisher run metadata") + relative = path.relative_to(raw_root) + if len(relative.parts) < 2: + raise PublisherError("attempt document is outside a delivered artifact") + delivered_name = artifact_by_root.get(relative.parts[0]) + expected_name, expected_job, expected_execution = expected_deliveries[case_id] + git_run = _document_git_run(document) + 
allocation = document["identity"]["allocation_factors"] + if ( + git_run is None + or delivered_name != expected_name + or git_run["artifact"] != delivered_name + or git_run["job"] != expected_job + or allocation["execution_id"] != expected_execution + ): + raise PublisherError("attempt provenance differs from its delivered GHA shard") + return case_id + + +def _parse_extracted(root: Path) -> tuple[list[tuple[Path, dict[str, Any]]], set[Path]]: + attempts: list[tuple[Path, dict[str, Any]]] = [] + consumed_samples: set[Path] = set() + json_paths = sorted(path for path in root.rglob("*.json") if path.is_file()) + for path in json_paths: + if path in consumed_samples: + continue + try: + document = contracts.strict_load(path) + artifact_safety.assert_publication_safe([document]) + format_name = document.get("format") if isinstance(document, dict) else None + if format_name == contracts.SAMPLES_FORMAT: + _schema("samples-v1.schema.json", document) + # It must be claimed by a raw document; orphan checking happens after the scan. 
+ continue + if format_name == contracts.RAW_FORMAT: + _schema("raw-case-v1.schema.json", document) + sample_path = path.with_name(document["sample_artifact"]["path"]) + sample_document = contracts.strict_load(sample_path) + artifact_safety.assert_publication_safe([sample_document]) + _schema("samples-v1.schema.json", sample_document) + validated = contracts.load_raw_attempt(path) + consumed_samples.add(sample_path) + elif format_name == contracts.TERMINAL_FORMAT: + _schema("terminal-outcome-v1.schema.json", document) + validated = contracts.validate_terminal_document(document) + else: + raise PublisherError(f"artifact contains unknown JSON document {path.name}") + except ( + contracts.ContractError, artifact_safety.ArtifactSafetyError, + jsonschema.ValidationError, OSError, + ) as exc: + raise PublisherError(f"native contract rejected {path.name}: {exc}") from exc + attempts.append((path, validated)) + orphan_samples = [ + path for path in json_paths + if isinstance((doc := contracts.strict_load(path)), dict) + and doc.get("format") == contracts.SAMPLES_FORMAT + and path not in consumed_samples + ] + if orphan_samples: + raise PublisherError("artifact contains an orphan samples document") + if not attempts: + raise PublisherError("artifact contains zero native attempt documents") + return attempts, consumed_samples + + +def build_bundle( + store: Store, + incoming_id: str, + incoming_path: Path, + run: dict[str, Any], +) -> tuple[str, dict[str, Any], list[dict[str, Any]]]: + """Validate one exact workflow delivery and install its immutable private bundle.""" + incoming_manifest = strict_load(incoming_path / "incoming.json") + _exact( + incoming_manifest, + {"format", "schema_version", "ingest_id", "run", "sources"}, + "incoming", + ) + artifact_safety.assert_publication_safe([incoming_manifest]) + if ( + incoming_manifest["format"] != "collectivex.incoming.v1" + or incoming_manifest["schema_version"] != 1 + or incoming_manifest["ingest_id"] != incoming_id + or 
incoming_manifest["run"] != run + or _sha_bytes(_canonical({"run": run, "sources": incoming_manifest["sources"]})) + != incoming_id + ): + raise PublisherError("incoming manifest identity differs from archived delivery") + incoming_sources = _array(incoming_manifest["sources"], "incoming.sources", nonempty=True) + for index, record in enumerate(incoming_sources): + _exact( + record, + {"path", "sha256", "bytes", "kind", "artifact_name"}, + f"incoming.sources[{index}]", + ) + _resolve_bundle_file(incoming_path, record) + matrix_records = [record for record in incoming_sources if record["kind"] == "matrix"] + artifact_records = [record for record in incoming_sources if record["kind"] == "artifact"] + if ( + len(matrix_records) != 1 + or matrix_records[0]["artifact_name"] is not None + or not artifact_records + or any(ARTIFACT_NAME.fullmatch(record["artifact_name"] or "") is None + for record in artifact_records) + or len({record["artifact_name"] for record in artifact_records}) != len(artifact_records) + ): + raise PublisherError("incoming source catalog is invalid") + matrix_source = _resolve_bundle_file(incoming_path, matrix_records[0]) + matrix_document = strict_load(matrix_source) + expected_cases = validate_matrix(matrix_document) + expected_by_id = {case["case_id"]: case for case in expected_cases} + expected_deliveries = _expected_deliveries(matrix_document, expected_cases, run) + if {record["artifact_name"] for record in artifact_records} != { + delivery[0] for delivery in expected_deliveries.values() + }: + raise PublisherError("incoming artifact archive set differs from requested matrix shards") + with store.staging(store.bundles, private=True) as stage: + source_copy = stage / "source" + raw_root = stage / "raw" + source_copy.mkdir(mode=0o700) + raw_root.mkdir(mode=0o700) + matrix_path = stage / "matrix.json" + _copy_source(matrix_source, matrix_path) + source_records: list[dict[str, Any]] = [] + artifact_by_root: dict[str, str] = {} + for index, 
source_record in enumerate(artifact_records): + archive = _resolve_bundle_file(incoming_path, source_record) + copied = source_copy / f"artifact-{index:04d}.zip" + _copy_source(archive, copied) + source_records.append({ + **_file_metadata(copied, stage), + "artifact_name": source_record["artifact_name"], + }) + artifact_root = raw_root / f"artifact-{index:04d}" + artifact_root.mkdir(mode=0o700) + artifact_by_root[artifact_root.name] = source_record["artifact_name"] + extract_archive(copied, artifact_root) + parsed, consumed_samples = _parse_extracted(raw_root) + created_at = _latest_timestamp( + [document["generated_at"] for _, document in parsed] + ) + consumed_files = {path for path, _ in parsed} | consumed_samples + extracted_files = { + path for path in raw_root.rglob("*") + if path.is_file() and not path.is_symlink() + } + if consumed_files != extracted_files: + raise PublisherError("artifact contains an unconsumed non-native member") + by_case: dict[str, list[tuple[Path, dict[str, Any]]]] = {} + for path, document in parsed: + case_id = _validate_delivery_binding( + document, path, raw_root, artifact_by_root, expected_by_id, + expected_deliveries, run, + ) + by_case.setdefault(case_id, []).append((path, document)) + missing = set(expected_by_id) - set(by_case) + if missing: + raise PublisherError(f"artifact is missing {len(missing)} requested case outcomes") + attempt_records: list[dict[str, Any]] = [] + selections: list[dict[str, Any]] = [] + selected_documents: list[dict[str, Any]] = [] + runtime_hashes: set[str] = set() + outcome_counts = {name: 0 for name in OUTCOMES} + for case_id in sorted(expected_by_id): + case_attempts = by_case[case_id] + ordinals = [document["identity"]["attempt_ordinal"] for _, document in case_attempts] + allocations_for_case = { + document["identity"]["allocation_id"] for _, document in case_attempts + } + if len(allocations_for_case) != 1 or sorted(ordinals) != list( + range(1, len(ordinals) + 1) + ): + raise PublisherError( + 
"case retries must retain contiguous ordinals in one allocation" + ) + _, selected_document = max( + case_attempts, key=lambda item: item[1]["identity"]["attempt_ordinal"] + ) + selected_id = selected_document["identity"]["attempt_id"] + selected_documents.append(selected_document) + selected_status, _ = _outcome(selected_document) + selections.append({ + "case_id": case_id, + "selected_attempt_id": selected_id, + "outcome": selected_status, + }) + outcome_counts[selected_status] += 1 + for path, document in sorted( + case_attempts, key=lambda item: item[1]["identity"]["attempt_ordinal"] + ): + normalized = contracts.normalize_attempt(document) + if document["format"] == contracts.RAW_FORMAT: + sample_path = path.with_name(document["sample_artifact"]["path"]) + if sample_path not in consumed_samples: + raise PublisherError("validated raw attempt lost its samples document") + record = _attempt_record( + document, path, stage, + selected=normalized["attempt_id"] == selected_id, + ) + if record["runtime_fingerprint_sha256"]: + runtime_hashes.add(record["runtime_fingerprint_sha256"]) + attempt_records.append(record) + # Every extracted byte is covered; the bundle manifest anchors this checksum catalog. 
+ payload_records = [_file_metadata(path, stage) for path in _tree_files(stage)] + checksum_document = { + "format": "collectivex.checksums.v1", + "files": payload_records, + } + checksum_path = stage / "checksums.json" + _write_json(checksum_path, checksum_document, mode=0o600) + bundle = { + "format": FORMAT_BUNDLE, + "schema_version": 1, + "created_at": created_at, + "ingest_id": incoming_id, + "run": run, + "matrix": _file_metadata(matrix_path, stage), + "sources": source_records, + "attempts": attempt_records, + "coverage": { + "expected_cases": len(expected_cases), + "terminal_cases": len(selections), + "complete": len(selections) == len(expected_cases), + "outcome_counts": outcome_counts, + "selections": selections, + }, + "runtime_fingerprints": sorted(runtime_hashes), + "checksums": _file_metadata(checksum_path, stage), + "validation": { + "policy": PUBLISHER_POLICY, + "passed": True, + "checks": [ + "archive-safety", "checksums", "exact-coverage", "identity", + "native-schema", "privacy", "runtime-homogeneity", "terminal-outcomes", + ], + }, + } + validate_bundle_manifest(bundle) + # Runtime homogeneity is scoped to a realized allocation, not across unlike SKUs. 
def _slug(value: Any, fallback: str = "unknown") -> str:
    """Reduce ``value`` to a lowercase slug, or return ``fallback``.

    The value is stringified (falsy values become the empty string) and
    lowercased. Every run of characters outside ``a-z0-9_.-`` is collapsed to
    a single ``-``, and leading/trailing ``-`` and ``.`` are stripped. The
    result is cut to 128 characters and must fully match ``SAFE_ID``.

    Args:
        value: Any object; only its string form is used.
        fallback: Returned when the slug is empty or rejected by ``SAFE_ID``.

    Returns:
        The slug (at most 128 characters) or ``fallback``.
    """
    lowered = str(value or "").lower()
    collapsed = re.sub(r"[^a-z0-9_.-]+", "-", lowered)
    trimmed = collapsed.strip("-.")
    if not trimmed:
        return fallback
    # Stripping happens before truncation, so the candidate is validated as cut.
    candidate = trimmed[:128]
    if SAFE_ID.fullmatch(candidate):
        return candidate
    return fallback
def _trial_diagnostics(
    trial_blocks: dict[str, dict[int, dict[str, Any]]], token: int,
) -> dict[str, Any]:
    """Flag drift and outliers in the private trial blocks for one token point.

    For each of the ``dispatch``, ``stage``, ``combine`` and ``roundtrip``
    components, the trial blocks of every run in ``trial_blocks`` are stacked
    and reduced to one median per trial. Two diagnostics follow:

    * drift: per run, the median of the first eight trial medians is compared
      with the median of the last eight; the largest ratio in either direction
      is reported and flagged when above ``TRIAL_DRIFT_RATIO_LIMIT``.
    * outliers: the fraction of trial medians further from the overall median
      than ``TRIAL_OUTLIER_MAD_MULTIPLIER * 1.4826 * MAD`` is reported and
      flagged when above ``TRIAL_OUTLIER_FRACTION_LIMIT``. With a zero MAD any
      deviation counts as an outlier.

    Args:
        trial_blocks: Mapping of run id -> token -> component name -> trial
            blocks, where ``None`` marks a component without measured trials.
        token: The tokens-per-rank key to inspect in every run.

    Returns:
        A dict with ``flagged``, the sorted ``reasons`` and the per-component
        diagnostics (``None`` for a component no run measured).

    Raises:
        PublisherError: If a run lacks the token point or a component entry,
            if availability differs between runs, or if the stacked trials are
            not finite, positive, and shaped ``(REQUIRED_ALLOCATIONS, 64, 8)``.
    """
    # Resolve every run's point up front so a coverage mismatch raises the
    # module's own error type, as _roundtrip_trial_array does, not a KeyError.
    run_points: list[dict[str, Any]] = []
    for run_id in sorted(trial_blocks):
        point = trial_blocks[run_id].get(token)
        if not isinstance(point, dict):
            raise PublisherError("trial diagnostics are missing a private sample point")
        run_points.append(point)

    components: dict[str, Any] = {}
    reasons: set[str] = set()
    for name in ("dispatch", "stage", "combine", "roundtrip"):
        if any(name not in point for point in run_points):
            raise PublisherError(
                f"{name} trial blocks are missing from a private sample point"
            )
        values = [point[name] for point in run_points]
        if all(value is None for value in values):
            components[name] = None
            continue
        if any(value is None for value in values):
            raise PublisherError(f"{name} trial availability differs across qualification runs")
        array = np.asarray(values, dtype=np.float64)
        if array.shape != (REQUIRED_ALLOCATIONS, 64, 8) or not np.isfinite(array).all():
            raise PublisherError(f"{name} trial diagnostics require three finite 64x8 runs")
        # One median per trial: shape (runs, trials).
        medians = np.median(array, axis=2)
        first = np.median(medians[:, :8], axis=1)
        last = np.median(medians[:, -8:], axis=1)
        if np.any(first <= 0) or np.any(last <= 0):
            raise PublisherError(f"{name} trial diagnostics require positive latency")
        # Symmetric ratio, so speeding up and slowing down are treated alike.
        drift_ratio = float(np.max(np.maximum(first / last, last / first)))
        center = float(np.median(medians))
        mad = float(np.median(np.abs(medians - center)))
        if mad == 0:
            outliers = np.abs(medians - center) > 0
        else:
            outliers = np.abs(medians - center) > (
                TRIAL_OUTLIER_MAD_MULTIPLIER * 1.4826 * mad
            )
        outlier_fraction = float(np.count_nonzero(outliers) / medians.size)
        drift_flagged = drift_ratio > TRIAL_DRIFT_RATIO_LIMIT
        outlier_flagged = outlier_fraction > TRIAL_OUTLIER_FRACTION_LIMIT
        if drift_flagged:
            reasons.add("trial-drift")
        if outlier_flagged:
            reasons.add("trial-outliers")
        components[name] = {
            "drift_flagged": drift_flagged,
            "first_last_median_ratio": drift_ratio,
            "outlier_flagged": outlier_flagged,
            "robust_outlier_fraction": outlier_fraction,
            "trial_count": int(medians.size),
        }
    return {
        "flagged": bool(reasons),
        "reasons": sorted(reasons),
        "components": components,
    }
def _hierarchical_p99_ratio(
    baseline_series_id: str,
    candidate_series_id: str,
    token: int,
    internals: dict[str, dict[str, Any]],
    dataset_binding: str,
) -> dict[str, Any]:
    """Bootstrap candidate/baseline p99 across runs, then trial blocks.

    Each resample draws runs with replacement, then trial blocks with
    replacement inside each drawn run. The same draws are applied to both
    series. It takes the p99 of the pooled samples per drawn run and records
    the ratio of the candidate median p99 to the baseline median p99. The
    sorted ratios give the confidence interval.

    The random stream is seeded from ``_bootstrap_seed``. Results are memoised
    in ``_BOOTSTRAP_CACHE`` under the seed digest and the raw bytes of both
    input arrays.

    Args:
        baseline_series_id: Key of the baseline series in ``internals``.
        candidate_series_id: Key of the candidate series in ``internals``.
        token: Tokens-per-rank point to compare.
        internals: Per-series private data holding the trial blocks.
        dataset_binding: Value folded into the bootstrap seed.

    Returns:
        A fresh dict with the policy parameters, seed digest, point ratio,
        confidence interval (``ci95``), per-run ratios and the
        ``all_runs_agree`` / ``baseline_wins`` / ``tie`` verdicts. The lists
        in it are not shared with the cache.

    Raises:
        PublisherError: If either series fails ``_roundtrip_trial_array``
            validation or the two series do not cover the same runs.
    """

    def _detached(record: dict[str, Any]) -> dict[str, Any]:
        # dict(record) alone would share the mutable lists with the cache entry.
        return {
            **record,
            "ci95": list(record["ci95"]),
            "run_ratios": list(record["run_ratios"]),
        }

    baseline_runs, baseline = _roundtrip_trial_array(
        internals[baseline_series_id], token
    )
    candidate_runs, candidate = _roundtrip_trial_array(
        internals[candidate_series_id], token
    )
    if baseline_runs != candidate_runs:
        raise PublisherError("p99 bootstrap run blocks are not aligned")
    seed_sha256, seed = _bootstrap_seed(
        dataset_binding, baseline_series_id, candidate_series_id, token
    )
    cache_key = (
        seed_sha256,
        _sha_bytes(baseline.tobytes()),
        _sha_bytes(candidate.tobytes()),
    )
    cached = _BOOTSTRAP_CACHE.get(cache_key)
    if cached is not None:
        return _detached(cached)

    baseline_run_p99 = np.asarray(
        [_nearest_rank_p99(run) for run in baseline], dtype=np.float64
    )
    candidate_run_p99 = np.asarray(
        [_nearest_rank_p99(run) for run in candidate], dtype=np.float64
    )
    run_ratios = candidate_run_p99 / baseline_run_p99
    point_ratio = float(np.median(candidate_run_p99) / np.median(baseline_run_p99))

    # Take the resample geometry from the validated array rather than repeating
    # literals, so the number of resampled runs always equals the runs present.
    run_count, trial_count, samples_per_trial = baseline.shape
    pooled = trial_count * samples_per_trial
    p99_index = math.ceil(0.99 * pooled) - 1

    rng = np.random.Generator(np.random.PCG64(seed))
    ratios = np.empty(BOOTSTRAP_RESAMPLES, dtype=np.float64)
    # Chunking bounds the size of the gathered sample arrays.
    for start in range(0, BOOTSTRAP_RESAMPLES, BOOTSTRAP_CHUNK_SIZE):
        size = min(BOOTSTRAP_CHUNK_SIZE, BOOTSTRAP_RESAMPLES - start)
        sampled_runs = rng.integers(0, run_count, size=(size, run_count))
        sampled_blocks = rng.integers(
            0, trial_count, size=(size, run_count, trial_count)
        )
        run_index = sampled_runs[:, :, None]
        # Identical indices for both series keep the comparison paired.
        baseline_sample = baseline[run_index, sampled_blocks].reshape(
            size, run_count, pooled
        )
        candidate_sample = candidate[run_index, sampled_blocks].reshape(
            size, run_count, pooled
        )
        baseline_p99 = np.partition(baseline_sample, p99_index, axis=2)[:, :, p99_index]
        candidate_p99 = np.partition(candidate_sample, p99_index, axis=2)[:, :, p99_index]
        ratios[start:start + size] = (
            np.median(candidate_p99, axis=1) / np.median(baseline_p99, axis=1)
        )
    ratios.sort()
    tail = (1.0 - BOOTSTRAP_CONFIDENCE) / 2.0
    lower_index = max(0, math.ceil(tail * BOOTSTRAP_RESAMPLES) - 1)
    upper_index = min(
        BOOTSTRAP_RESAMPLES - 1,
        math.ceil((1.0 - tail) * BOOTSTRAP_RESAMPLES) - 1,
    )
    ci = [float(ratios[lower_index]), float(ratios[upper_index])]
    threshold = 1.0 + BOOTSTRAP_EQUIVALENCE_BAND
    all_runs_agree = bool(np.all(run_ratios > threshold))
    # A win needs both the interval's lower bound and every run above the band.
    baseline_wins = ci[0] > threshold and all_runs_agree
    result = {
        "policy": BOOTSTRAP_POLICY,
        "resamples": BOOTSTRAP_RESAMPLES,
        "confidence": BOOTSTRAP_CONFIDENCE,
        "equivalence_band": BOOTSTRAP_EQUIVALENCE_BAND,
        "seed_sha256": seed_sha256,
        "point_ratio": point_ratio,
        "ci95": ci,
        "run_ratios": [float(value) for value in run_ratios],
        "all_runs_agree": all_runs_agree,
        "baseline_wins": baseline_wins,
        "tie": not baseline_wins,
    }
    _BOOTSTRAP_CACHE[cache_key] = result
    return _detached(result)
def _eligibility_record(
    allocations: Sequence[str],
    *,
    complete: bool,
    correct: bool,
    measured: bool,
    stable_ordering: bool,
    p50_ratio: float | None,
    p99_ratio: float | None,
    extra_reasons: Sequence[str] = (),
) -> dict[str, Any]:
    """Build the eligibility record that decides whether a series is decision-grade.

    A series is decision-grade only when no reason is recorded. Reasons come
    from ``extra_reasons`` plus one entry per failed gate: too few distinct
    allocations, incomplete coverage, failed correctness, missing measured
    roundtrip p99, p50/p99 max/min ratios that are absent or above their
    stability limits, and unstable ordering.

    Args:
        allocations: Allocation ids; duplicates are collapsed.
        complete: Whether repeat coverage is complete.
        correct: Whether correctness passed.
        measured: Whether a measured roundtrip p99 exists.
        stable_ordering: Whether ordering is stable.
        p50_ratio: Max/min p50 ratio, or ``None`` when not computable.
        p99_ratio: Max/min p99 ratio, or ``None`` when not computable.
        extra_reasons: Reasons collected by the caller.

    Returns:
        The record with the decision, the sorted unique allocation ids, every
        gate value, both ratios and the sorted unique reasons.
    """
    unique_ids = sorted(set(allocations))
    p50_ok = p50_ratio is not None and p50_ratio <= P50_STABILITY_LIMIT
    p99_ok = p99_ratio is not None and p99_ratio <= P99_STABILITY_LIMIT
    gates = (
        (len(unique_ids) >= REQUIRED_ALLOCATIONS, "insufficient-allocations"),
        (complete, "incomplete-repeat-coverage"),
        (correct, "correctness-failed"),
        (measured, "missing-measured-roundtrip-p99"),
        (p50_ok, "unstable-p50"),
        (p99_ok, "unstable-p99"),
        (stable_ordering, "unstable-ordering"),
    )
    failed = {reason for passed, reason in gates if not passed}
    all_reasons = sorted(failed.union(extra_reasons))
    record: dict[str, Any] = {}
    record["decision_grade"] = not all_reasons
    record["allocation_ids"] = unique_ids
    record["complete"] = complete
    record["correct"] = correct
    record["measured_roundtrip_p99"] = measured
    record["stable_p50"] = p50_ok
    record["stable_p99"] = p99_ok
    record["stable_ordering"] = stable_ordering
    record["p50_max_min_ratio"] = p50_ratio
    record["p99_max_min_ratio"] = p99_ratio
    record["reasons"] = all_reasons
    return record
def _exact_repeat_value(values: Sequence[Any], label: str) -> Any:
    """Return the single value shared by every repeat, or fail.

    Values are compared by their ``_canonical`` encoding.

    Args:
        values: One value per repeat allocation.
        label: Human-readable name used in the error message.

    Returns:
        The first element of ``values``.

    Raises:
        PublisherError: If ``values`` is empty or the encodings are not all
            identical.
    """
    if values:
        encodings = {_canonical(item) for item in values}
        if len(encodings) == 1:
            return values[0]
    raise PublisherError(f"{label} differs across repeat allocations")
value["calibration_window"], + "calibration_token_offset": value["calibration_token_offset"], + "planner": value["planner"], + "mapping_sha256": value["mapping_hash"], + "logical_experts": value["num_logical_experts"], + "physical_experts": value["num_physical_experts"], + "redundant_experts": value["num_redundant"], + "reference_tokens_per_rank": value["reference_tokens_per_rank"], + "replicated_experts": value["replicated_experts"], + "max_replicas": value["max_replicas"], + "imbalance_before": value["imbalance_before"], + "imbalance_after": value["imbalance_after"], + } + + +def _routing_facts(row: dict[str, Any]) -> dict[str, Any]: + routing = row["routing"] + return { + "fanout_mean": routing["fanout_mean"], + "recv_tokens_max": row["receive"]["max"], + "expert_load_cv": routing["expert_load_cv"], + "payload_rank_cv": routing["payload_rank_cv"], + "hotspot_ratio": routing["hotspot_ratio"], + "empty_expert_count": routing["empty_expert_count"], + "empty_rank_count": routing["empty_rank_count"], + "routed_copies": routing["routed_copies"], + } + + +def _aggregate_precision_evidence(rows: Sequence[dict[str, Any]]) -> dict[str, Any]: + values = [row["correctness"]["precision"] for row in rows] + profile_ids = {value["profile_id"] for value in values} + if len(profile_ids) != 1: + raise PublisherError("precision evidence profile differs across qualification runs") + result: dict[str, Any] = {"profile_id": profile_ids.pop()} + for direction in ("dispatch", "combine"): + axes = [value[direction] for value in values] + finite = [axis["scales_finite"] for axis in axes] + positive = [axis["scales_positive"] for axis in axes] + result[direction] = { + "encoded_payload_valid": all(axis["encoded_payload_valid"] for axis in axes), + "scales_finite": None if all(value is None for value in finite) else all( + value is True for value in finite + ), + "scales_positive": None if all(value is None for value in positive) else all( + value is True for value in positive + ), + 
"dequantized_semantics": all(axis["dequantized_semantics"] for axis in axes), + "saturation_count": max(axis["saturation_count"] for axis in axes), + "saturation_rate": max(axis["saturation_rate"] for axis in axes), + "max_abs_error": max(axis["max_abs_error"] for axis in axes), + "max_rel_error": max(axis["max_rel_error"] for axis in axes), + "passed": all(axis["passed"] for axis in axes), + } + result["passed"] = result["dispatch"]["passed"] and result["combine"]["passed"] + return result + + +def _series_extra_reasons(documents: Sequence[dict[str, Any]]) -> list[str]: + reasons: set[str] = set() + for document in documents: + validity = document["outcome"]["validity"] + rows = document["measurement"]["rows"] + if validity.get("provenance_complete") is not True: + reasons.add("incomplete-provenance") + if validity.get("workload_source") != "canonical-serialized": + reasons.add("noncanonical-workload") + if validity.get("anomaly_free") is not True or any(row["anomalies"] for row in rows): + reasons.add("unresolved-anomaly") + if validity.get("semantic_correctness") != "pass": + reasons.add("semantic-correctness-failed") + if validity.get("measurement_conformance") != "conformant" or validity.get("sampling_conformance") != "conformant": + reasons.add("measurement-nonconformant") + profile = identity.case_profile(document["case"]["mode"]) + scopes = {row["correctness"].get("scope") for row in rows} + if scopes != {profile["correctness_scope"]}: + reasons.add("expert-oracle-incomplete") + return sorted(reasons) + + +BACKEND_LABELS = { + "deepep": "DeepEP V1", + "deepep-v2": "DeepEP V2", + "deepep-hybrid": "DeepEP Hybrid", + "uccl": "UCCL", + "mori": "MoRI", + "nccl-ep": "NCCL/RCCL reference", +} + + +def _build_series( + series_id: str, + documents: Sequence[dict[str, Any]], + sample_documents: Sequence[dict[str, Any]], + expected_repeats: int, +) -> tuple[dict[str, Any], dict[str, Any]]: + if not documents: + raise PublisherError("cannot aggregate an empty series") 
+ first = documents[0] + if any(document["identity"]["series_id"] != series_id for document in documents): + raise PublisherError("series aggregation mixed identities") + if len(sample_documents) != len(documents): + raise PublisherError("series aggregation lost private sample documents") + allocations = [document["identity"]["allocation_id"] for document in documents] + if len(allocations) != len(set(allocations)): + raise PublisherError("series repeats reuse an allocation identity") + row_maps = [ + {row["tokens_per_rank"]: row for row in document["measurement"]["rows"]} + for document in documents + ] + token_sets = {tuple(sorted(rows)) for rows in row_maps} + if len(token_sets) != 1: + raise PublisherError("series token coverage differs across allocations") + tokens = list(next(iter(token_sets))) + qualification_indices = sorted( + document["measurement"]["qualification_index"] for document in documents + ) + p50_ratios = [ + _ratio([rows[token]["components"]["roundtrip"]["percentiles_us"]["p50"] for rows in row_maps]) + for token in tokens + ] + p99_ratios = [ + _ratio([rows[token]["components"]["roundtrip"]["percentiles_us"]["p99"] for rows in row_maps]) + for token in tokens + ] + p50_ratio = max((value for value in p50_ratios if value is not None), default=None) + p99_ratio = max((value for value in p99_ratios if value is not None), default=None) + correct = all( + row["correctness"]["passed"] + for document in documents for row in document["measurement"]["rows"] + ) + measured = all( + row["components"]["roundtrip"]["availability"] == "measured" + and row["components"]["roundtrip"]["percentiles_us"].get("p99") is not None + for document in documents for row in document["measurement"]["rows"] + ) + extra_reasons = _series_extra_reasons(documents) + case = first["case"] + shape = case["shape"] + topology = first["topology"] + runtime = first["runtime_fingerprint"] + workload_id = first["workload"]["workload_id"] + if not identity.is_typed_id(workload_id, 
"workload"): + raise PublisherError("raw workload is not canonical") + backend_id = case["backend"] + resource_raw = first["implementation"]["resource_profile"] + public_config = contracts.public_series_config( + kernel_generation=first["implementation"]["kernel_generation"], + provenance=first["implementation"]["provenance"], + resource_profile=resource_raw, + resource_mode=case["resource_mode"], + device_product=topology["device_product"], + ) + resource_profile = public_config["resource"]["profile"] + configured_units = public_config["resource"]["configured_units"] + units_kind = public_config["resource"]["comm_units_kind"] + resource_label = ( + f"{configured_units} {str(units_kind).upper()}" + if configured_units is not None and units_kind + else resource_profile + ) + eplb = _exact_repeat_value( + [_eplb_descriptor(document) for document in documents], "EPLB descriptor" + ) + points: list[dict[str, Any]] = [] + run_metrics: dict[str, dict[int, dict[str, float]]] = {} + trial_blocks: dict[str, dict[int, dict[str, Any]]] = {} + for document, sample_document, rows in zip( + documents, sample_documents, row_maps, strict=True + ): + if any( + sample_document[field] != document["identity"][field] + for field in ("allocation_id", "attempt_id", "case_id", "series_id") + ): + raise PublisherError("private samples differ from their selected raw attempt") + if sample_document["qualification_index"] != document["measurement"]["qualification_index"]: + raise PublisherError("private sample qualification index differs from raw attempt") + run_id = str(_git_run(document)["run_id"]) + if run_id in run_metrics: + raise PublisherError("series has two allocations from one workflow run") + trial_blocks[run_id] = _private_trial_components(sample_document) + run_metrics[run_id] = {} + for token in tokens: + latency = rows[token]["components"]["roundtrip"]["percentiles_us"] + byte_provenance = rows[token]["byte_provenance"]["roundtrip"] + run_metrics[run_id][token] = { + 
"latency_us": {statistic: latency[statistic] for statistic in ("p50", "p99")}, + "activation_data_rate_gbps_at_latency_percentile": { + statistic: byte_provenance["activation_data_bytes"] + / (latency[statistic] * 1000.0) + for statistic in ("p50", "p99") + }, + "total_logical_data_rate_gbps_at_latency_percentile": { + statistic: byte_provenance["total_logical_bytes"] + / (latency[statistic] * 1000.0) + for statistic in ("p50", "p99") + }, + } + for token in tokens: + rows = [row_map[token] for row_map in row_maps] + diagnostics = _trial_diagnostics(trial_blocks, token) + if diagnostics["flagged"]: + extra_reasons.append("unresolved-trial-diagnostic") + routing = _exact_repeat_value( + [_routing_facts(row) for row in rows], "routing/load facts" + ) + components = { + name: _aggregate_component(rows, name) + for name in ("dispatch", "stage", "combine", "roundtrip") + } + if components["dispatch"] is None: + components["isolated_sum"] = None + else: + latency = { + statistic: components["dispatch"]["latency_us"][statistic] + + ( + components["stage"]["latency_us"][statistic] + if components["stage"] is not None else 0.0 + ) + + components["combine"]["latency_us"][statistic] + for statistic in ("p50", "p90", "p95", "p99") + } + components["isolated_sum"] = { + "origin": "derived", + "latency_us": latency, + "byte_provenance": components["roundtrip"]["byte_provenance"], + "activation_data_rate_gbps_at_latency_percentile": None, + "total_logical_data_rate_gbps_at_latency_percentile": None, + "sample_count": None, + } + points.append({ + "point_id": rows[0]["point_id"], + "tokens_per_rank": token, + "global_tokens": token * case["ep_size"], + "correctness": { + "semantic_pass": all(row["correctness"]["passed"] for row in rows), + "precision": _aggregate_precision_evidence(rows), + }, + "anomalies": sorted({ + anomaly["type"].replace("_", "-") + for row in rows for anomaly in row["anomalies"] + } | set(diagnostics["reasons"])), + "stability": { + "complete": 
qualification_indices == [1, 2, 3], + "qualification_indices": qualification_indices, + "p50_max_min_ratio": p50_ratios[tokens.index(token)] + if qualification_indices == [1, 2, 3] else None, + "p99_max_min_ratio": p99_ratios[tokens.index(token)] + if qualification_indices == [1, 2, 3] else None, + "stable_p50": bool( + qualification_indices == [1, 2, 3] + and p50_ratios[tokens.index(token)] is not None + and p50_ratios[tokens.index(token)] <= P50_STABILITY_LIMIT + ), + "stable_p99": bool( + qualification_indices == [1, 2, 3] + and p99_ratios[tokens.index(token)] is not None + and p99_ratios[tokens.index(token)] <= P99_STABILITY_LIMIT + ), + }, + "trial_diagnostics": diagnostics, + "routing": routing, + "components": components, + "roundtrip_token_rate_at_latency_percentile": { + statistic: (token * case["ep_size"]) + / (components["roundtrip"]["latency_us"][statistic] * 1e-6) + for statistic in ("p50", "p90", "p95", "p99") + }, + "evidence_ids": [row["evidence_id"] for row in rows], + }) + eligibility = _eligibility_record( + allocations, + complete=len(documents) == expected_repeats, + correct=correct, + measured=measured, + # Ordering is defined only across alternatives in a controlled cohort. 
+ stable_ordering=True, + p50_ratio=p50_ratio, + p99_ratio=p99_ratio, + extra_reasons=sorted(set(extra_reasons)), + ) + series = { + "series_id": series_id, + "label": ( + f"{case['runner'].upper()} / {BACKEND_LABELS.get(backend_id, backend_id)} / " + f"EP{case['ep_size']} / {topology['nodes']} node" + f"{'s' if topology['nodes'] != 1 else ''} / {topology['scope']} / " + f"{case['mode']} / {case['phase']} / {shape['routing']}" + f"{' + EPLB' if case['eplb']['enabled'] else ''} / {resource_label}" + ), + "status": "decision-grade" if eligibility["decision_grade"] else "diagnostic", + "case_ids": sorted({document["identity"]["case_id"] for document in documents}), + "allocation_ids": sorted(allocations), + "model": _slug(case["workload_name"]), + "suite": _slug(case["suite"]), + "mode": case["mode"], + "phase": case["phase"], + "publication_tier": case["required_publication"], + "backend": { + "id": _slug(backend_id), + "label": BACKEND_LABELS.get(backend_id, backend_id), + "role": "reference" if backend_id == "nccl-ep" else "library", + **public_config["backend"], + }, + "build": { + "implementation_contract_sha256": first["identity"]["series_factors"][ + "implementation_contract_sha256" + ], + "public_config_sha256": first["identity"]["series_factors"][ + "public_config_sha256" + ], + "routing_control_sha256": first["identity"]["series_factors"][ + "routing_control_sha256" + ], + "runtime_fingerprint_sha256": first["identity"]["series_factors"][ + "runtime_fingerprint_sha256" + ], + "image_digest": first["identity"]["series_factors"]["image_digest"], + "source_sha": first["identity"]["series_factors"]["source_sha"], + "squash_sha256": first["identity"]["series_factors"]["squash_sha256"], + }, + "system": { + "sku": _slug(case["runner"]), + "label": public_config["system"]["label"], + "vendor": runtime["vendor"], + "topology_class": _slug(topology["topology_class"]), + "transport": _slug(topology["transport"]), + "scale_up_transport": 
def _resolve_bundle_file(root: Path, record: dict[str, Any]) -> Path:
    """Locate a manifest-recorded file under ``root`` and verify it.

    The POSIX-style ``record["path"]`` is joined onto ``root`` part by part.
    The file is accepted only if the joined path stays under ``root``, equals
    its own resolved form, is a regular file and not a symlink, and matches
    the recorded byte count and SHA-256.

    Args:
        root: Directory the record is relative to.
        record: Mapping with ``path``, ``bytes`` and ``sha256``.

    Returns:
        The verified path.

    Raises:
        PublisherError: If the path escapes ``root``, is missing or linked,
            or differs from the recorded size or digest.
    """
    relative_parts = PurePosixPath(record["path"]).parts
    target = root.joinpath(*relative_parts)
    try:
        target.relative_to(root)
    except ValueError as exc:
        raise PublisherError("bundle record escapes its directory") from exc
    is_plain_file = (
        target.resolve() == target
        and not target.is_symlink()
        and target.is_file()
    )
    if not is_plain_file:
        raise PublisherError("bundle record points to a missing or linked file")
    # The size is compared first, so the digest is only computed when it matches.
    size_matches = target.stat().st_size == record["bytes"]
    if not (size_matches and _sha_file(target) == record["sha256"]):
        raise PublisherError("bundle file checksum differs from its manifest")
    return target
"checksums.json"} + } + if {record["path"] for record in records} != expected_paths: + raise PublisherError("bundle checksum catalog does not cover its payload exactly") + artifact_by_root: dict[str, str] = {} + for index, source in enumerate(manifest["sources"]): + _resolve_bundle_file(root, source) + archive_key = f"artifact-{index:04d}" + if source["path"] != f"source/{archive_key}.zip": + raise PublisherError("bundle source catalog order/path differs") + artifact_by_root[archive_key] = source["artifact_name"] + if len(set(artifact_by_root.values())) != len(artifact_by_root): + raise PublisherError("bundle source catalog repeats an artifact name") + matrix_path = _resolve_bundle_file(root, manifest["matrix"]) + matrix_document = strict_load(matrix_path) + cases = validate_matrix(matrix_document) + expected_by_id = {case["case_id"]: case for case in cases} + expected_deliveries = _expected_deliveries( + matrix_document, cases, manifest["run"] + ) + if {item["case_id"] for item in manifest["coverage"]["selections"]} != set(expected_by_id): + raise PublisherError("bundle selected coverage differs from requested matrix") + documents: dict[str, dict[str, Any]] = {} + sample_documents: dict[str, dict[str, Any]] = {} + runtime_fingerprints: set[str] = set() + for attempt in manifest["attempts"]: + document_path = _resolve_bundle_file(root, attempt["document"]) + document = contracts.strict_load(document_path) + artifact_safety.assert_publication_safe([document]) + if document.get("format") == contracts.RAW_FORMAT: + _schema("raw-case-v1.schema.json", document) + sample_path = document_path.with_name(document["sample_artifact"]["path"]) + if attempt["samples"] is None: + raise PublisherError("raw attempt is missing its sample manifest record") + manifest_sample_path = _resolve_bundle_file(root, attempt["samples"]) + if manifest_sample_path != sample_path: + raise PublisherError("sample manifest record points to the wrong raw evidence") + sample_document = 
contracts.strict_load(sample_path) + artifact_safety.assert_publication_safe([sample_document]) + _schema("samples-v1.schema.json", sample_document) + sample_document = contracts.validate_samples_document(sample_document) + document = contracts.load_raw_attempt(document_path) + sample_documents[attempt["attempt_id"]] = sample_document + else: + if attempt["samples"] is not None: + raise PublisherError("terminal attempt unexpectedly names a sample artifact") + _schema("terminal-outcome-v1.schema.json", document) + document = contracts.validate_terminal_document(document) + _validate_delivery_binding( + document, document_path, root / "raw", artifact_by_root, + expected_by_id, expected_deliveries, manifest["run"], + ) + expected_record = _attempt_record( + document, document_path, root, selected=attempt["selected"] + ) + if expected_record != attempt: + raise PublisherError("bundle attempt record differs from native document") + if attempt["runtime_fingerprint_sha256"]: + runtime_fingerprints.add(attempt["runtime_fingerprint_sha256"]) + documents[attempt["attempt_id"]] = document + if sorted(runtime_fingerprints) != manifest["runtime_fingerprints"]: + raise PublisherError("bundle runtime fingerprint catalog differs from attempts") + selected = { + selection["case_id"]: documents[selection["selected_attempt_id"]] + for selection in manifest["coverage"]["selections"] + } + return { + "id": bundle_id, + "root": root, + "manifest": manifest, + "cases": cases, + "documents": documents, + "sample_documents": sample_documents, + "selected": selected, + } + + +def _cohort_control( + kind: str, series: dict[str, Any], internal: dict[str, Any] +) -> tuple[dict[str, Any], list[str], list[str], Any]: + binary_build = series["build"] + source = binary_build["source_sha"] + workload = series["workload"] + shape = { + key: workload[key] + for key in ( + "hidden", "top_k", "experts", "precision_profile", "dispatch_precision", + "combine_precision", "activation_profile", + ) + } + 
common = { + "model": series["model"], "mode": series["mode"], + "phase": series["phase"], "shape": shape, + "measurement": series["measurement"], "ep_size": series["system"]["ep_size"], + } + if kind == "library": + control = {**common, "system": series["system"], "workload": workload, + "resource_mode": series["resource"]["mode"], "source": source} + return control, ["system", "workload", "mode", "phase", "measurement", "resource.mode", "source"], ["backend", "resource"], series["backend"]["id"] + if kind == "chip": + control = {**common, "backend": series["backend"], "source": source, + "workload": workload, "resource_mode": series["resource"]["mode"]} + return control, ["backend", "source", "workload", "mode", "phase", "measurement", "resource.mode"], ["system", "resource"], series["system"] + if kind == "system": + control = {**common, "workload": workload, "source": source} + varying = [series["system"]["sku"], series["backend"]["id"], series["resource"]["profile"]] + return control, ["workload", "mode", "phase", "measurement", "source"], ["system", "backend", "resource"], varying + if kind == "routing": + control = { + **common, + "backend": series["backend"], + "system": series["system"], + "resource": series["resource"], + "build": _routing_build_control(binary_build), + } + varying = [ + workload["routing"], workload["eplb"], + binary_build["implementation_contract_sha256"], + ] + return ( + control, + ["backend", "implementation-static-build", "system", "model-shape", "mode", "phase", "measurement", "resource"], + ["workload.routing", "workload.eplb", "implementation-config"], + varying, + ) + if kind in PRECISION_COHORT_KINDS: + control, variant = _public_cohort_factors(kind, series) + if kind == "dispatch-precision": + controlled = [ + "backend", "implementation-static-build", "system", "model-shape", + "mode", "phase", "workload.routing", "workload.eplb", "measurement", + "resource", "combine-precision", + ] + varying = ["dispatch-precision"] + elif 
kind == "combine-precision": + controlled = [ + "backend", "implementation-static-build", "system", "model-shape", + "mode", "phase", "workload.routing", "workload.eplb", "measurement", + "resource", "dispatch-precision", + ] + varying = ["combine-precision"] + else: + controlled = [ + "backend", "implementation-static-build", "system", "model-shape", + "mode", "phase", "workload.routing", "workload.eplb", "measurement", + ] + varying = [ + "dispatch-precision", "combine-precision", "precision-profile", "resource", + ] + return control, controlled, varying, variant + raise PublisherError(f"unknown cohort kind {kind}") + + +def _cohort_ordering( + members: Sequence[dict[str, Any]], internals: dict[str, dict[str, Any]], tokens: Sequence[int] +) -> tuple[bool, int]: + run_ids = set.intersection(*( + set(internals[member["series_id"]]["run_metrics"]) for member in members + )) + if len(run_ids) < REQUIRED_ALLOCATIONS: + return False, len(run_ids) + orders: list[tuple[str, str, int, str, tuple[str, ...]]] = [] + for run_id in sorted(run_ids): + for token in tokens: + for measure in ( + "latency_us", "activation_data_rate_gbps_at_latency_percentile", + "total_logical_data_rate_gbps_at_latency_percentile", + ): + for statistic in ("p50", "p99"): + ordered = tuple( + member["series_id"] + for member in sorted( + members, + key=lambda item: ( + internals[item["series_id"]]["run_metrics"][run_id][token][measure][statistic], + item["series_id"], + ), + reverse=measure != "latency_us", + ) + ) + orders.append((measure, statistic, token, run_id, ordered)) + for token in tokens: + for measure in ( + "latency_us", "activation_data_rate_gbps_at_latency_percentile", + "total_logical_data_rate_gbps_at_latency_percentile", + ): + for statistic in ("p50", "p99"): + observed = { + entry[4] + for entry in orders + if entry[0] == measure and entry[1] == statistic and entry[2] == token + } + if len(observed) != 1: + return False, len(run_ids) + return True, len(run_ids) + + +def 
_p99_top_tie_ids( + members: Sequence[dict[str, Any]], + internals: dict[str, dict[str, Any]], + token: int, + dataset_binding: str, + cohort_id: str, +) -> set[str]: + metric = { + "operation": "roundtrip", + "statistic": "p99", + "measure": "latency_us", + "objective": "min", + "tokens_per_rank": token, + "phase": members[0]["phase"], + } + ordered = sorted( + members, + key=lambda member: ( + _metric_value(member, metric)[1], member["series_id"] + ), + ) + baseline_id = ordered[0]["series_id"] + comparisons: dict[str, dict[str, Any]] = {} + tie_end = 0 + for index, candidate in enumerate(ordered[1:], 1): + candidate_id = candidate["series_id"] + result = _hierarchical_p99_ratio( + baseline_id, candidate_id, token, internals, dataset_binding + ) + comparisons[candidate_id] = result + if not result["baseline_wins"]: + tie_end = index + tie_ids = {member["series_id"] for member in ordered[:tie_end + 1]} + internals[baseline_id].setdefault("decision_statistics", {})[ + f"{cohort_id}:p99:{token}" + ] = { + "baseline_series_id": baseline_id, + "comparisons": comparisons, + "tie_series_ids": sorted(tie_ids), + } + return tie_ids + + +def build_decisions( + series: Sequence[dict[str, Any]], + internals: dict[str, dict[str, Any]], + *, + dataset_binding: str | None = None, +) -> tuple[list[dict[str, Any]], list[dict[str, Any]], list[dict[str, Any]], list[dict[str, Any]]]: + if dataset_binding is None: + dataset_binding = _sha_bytes(_canonical({ + "series_ids": sorted(item["series_id"] for item in series), + })) + cohorts: list[dict[str, Any]] = [] + for kind in (*REQUIRED_COHORT_KINDS, *PRECISION_COHORT_KINDS): + groups: dict[bytes, list[tuple[dict[str, Any], Any, list[str], list[str]]]] = {} + for item in series: + if kind == "library" and item["backend"]["role"] != "library": + continue + if kind == "system" and item["backend"]["role"] != "reference": + continue + control, controlled, varying, variant = _cohort_control(kind, item, internals[item["series_id"]]) + 
groups.setdefault(_canonical(control), []).append((item, variant, controlled, varying)) + for entries in groups.values(): + variants = {_canonical(entry[1]) for entry in entries} + if len(entries) < 2 or len(variants) < 2: + continue + members = sorted((entry[0] for entry in entries), key=lambda item: item["series_id"]) + token_sets = [set(point["tokens_per_rank"] for point in member["points"]) for member in members] + tokens = sorted(set.intersection(*token_sets)) + same_points = len({tuple(sorted(values)) for values in token_sets}) == 1 + ordering, aligned_runs = _cohort_ordering(members, internals, tokens) if tokens else (False, 0) + allocations = sorted({value for member in members for value in member["allocation_ids"]}) + p50_ratio = max( + (member["eligibility"]["p50_max_min_ratio"] for member in members + if member["eligibility"]["p50_max_min_ratio"] is not None), default=None + ) + p99_ratio = max( + (member["eligibility"]["p99_max_min_ratio"] for member in members + if member["eligibility"]["p99_max_min_ratio"] is not None), default=None + ) + extra = { + reason for member in members for reason in member["eligibility"]["reasons"] + if reason not in {"unstable-ordering"} + } + if aligned_runs < REQUIRED_ALLOCATIONS: + extra.add("incomplete-aligned-repeats") + if tokens and not _bootstrap_inputs_ready(members, internals, tokens): + extra.add("missing-trial-blocks") + if kind == "routing" and sum( + member["workload"]["routing"] == "uniform" + and not member["workload"]["eplb"] + for member in members + ) != 1: + extra.add("missing-uniform-baseline") + if kind == "routing" and { + (member["workload"]["routing"], member["workload"]["eplb"]) + for member in members + } != {("uniform", False), ("zipf", False), ("zipf", True)}: + extra.add("incomplete-routing-anchors") + if kind == "routing" and _routing_implementation_mismatch(members): + extra.add("implementation-config-mismatch") + endpoint_contrast = kind == "routing" or kind in PRECISION_COHORT_KINDS + if 
not tokens or (not endpoint_contrast and not same_points): + extra.add("unmatched-token-coverage") + if kind in {"dispatch-precision", "combine-precision"}: + axis = "dispatch" if kind == "dispatch-precision" else "combine" + field = f"{axis}_precision" + bf16 = identity.precision_profile( + identity.V1_CONTROL_PRECISION_PROFILE + )[axis] + if sum( + _canonical(member["workload"][field]) == _canonical(bf16) + for member in members + ) != 1: + extra.add("missing-bf16-precision-baseline") + eligibility = _eligibility_record( + allocations, + complete=all(member["eligibility"]["complete"] for member in members) + and bool(tokens) and (endpoint_contrast or same_points), + correct=all(member["eligibility"]["correct"] for member in members), + measured=all(member["eligibility"]["measured_roundtrip_p99"] for member in members), + stable_ordering=ordering, + p50_ratio=p50_ratio, + p99_ratio=p99_ratio, + extra_reasons=sorted(extra), + ) + member_ids = [member["series_id"] for member in members] + publication_tier = ( + "comparable-experimental" + if any(member["publication_tier"] == "comparable-experimental" for member in members) + else "official" + ) + controlled, varying = entries[0][2], entries[0][3] + cohort_id = _derived_id("cxcohort-v1-", { + "kind": kind, "series_ids": member_ids, + "controlled_factors": controlled, "varying_factors": varying, + }) + kind_label = { + "chip": "Platform", + "dispatch-precision": "Dispatch precision", + "combine-precision": "Combine precision", + "precision-pair": "Precision profile", + }.get(kind, kind.title()) + first = members[0] + routing_label = first["workload"]["routing"] + ( + "+EPLB" if first["workload"]["eplb"] else "" + ) + context = { + "library": ( + f"{first['system']['sku'].upper()} EP{first['system']['ep_size']} / " + f"{first['mode']} / {first['phase']} / {routing_label}" + ), + "chip": ( + f"{first['backend']['label']} EP{first['system']['ep_size']} / " + f"{first['mode']} / {first['phase']} / {routing_label}" + ), + 
"system": ( + f"Reference EP{first['system']['ep_size']} / {first['mode']} / " + f"{first['phase']} / {routing_label}" + ), + "routing": ( + f"{first['system']['sku'].upper()} / {first['backend']['label']} / " + f"EP{first['system']['ep_size']} / {first['mode']} / {first['phase']}" + ), + "dispatch-precision": ( + f"{first['system']['sku'].upper()} / {first['backend']['label']} / " + f"EP{first['system']['ep_size']} / {first['mode']} / {first['phase']}" + ), + "combine-precision": ( + f"{first['system']['sku'].upper()} / {first['backend']['label']} / " + f"EP{first['system']['ep_size']} / {first['mode']} / {first['phase']}" + ), + "precision-pair": ( + f"{first['system']['sku'].upper()} / {first['backend']['label']} / " + f"EP{first['system']['ep_size']} / {first['mode']} / {first['phase']}" + ), + }[kind] + cohorts.append({ + "cohort_id": cohort_id, + "kind": kind, + "label": f"{context} / {kind_label} contrast ({len(members)} series)", + "description": ( + "Publisher-controlled NCCL/RCCL system comparison" + if kind == "system" + else ( + "Descriptive configured-stack precision comparison; no isolated axis claim" + if kind == "precision-pair" + else f"Publisher-controlled {kind_label.lower()} comparison" + ) + ), + "series_ids": member_ids, + "controlled_factors": controlled, + "varying_factors": varying, + "publication_tier": publication_tier, + "eligibility": eligibility, + }) + cohorts.sort(key=lambda item: item["cohort_id"]) + series_by_id = {item["series_id"]: item for item in series} + rankings: list[dict[str, Any]] = [] + recommendations: list[dict[str, Any]] = [] + sensitivities: list[dict[str, Any]] = [] + for cohort in cohorts: + if not cohort["eligibility"]["decision_grade"]: + continue + members = [series_by_id[series_id] for series_id in cohort["series_ids"]] + tokens = sorted(set.intersection(*( + {point["tokens_per_rank"] for point in member["points"]} for member in members + ))) + for token in tokens: + p99_tie_ids = _p99_top_tie_ids( + members, 
internals, token, dataset_binding, cohort["cohort_id"] + ) + for measure, objective, unit in ( + ("latency_us", "min", "us"), + ("activation_data_rate_gbps_at_latency_percentile", "max", "GB/s"), + ("total_logical_data_rate_gbps_at_latency_percentile", "max", "GB/s"), + ): + for statistic in ("p50", "p99"): + metric = { + "operation": "roundtrip", "statistic": statistic, + "measure": measure, "objective": objective, + "tokens_per_rank": token, "phase": members[0]["phase"], + } + entries = [] + for member in members: + point_id, value, observed_unit = _metric_value(member, metric) + if observed_unit != unit: + raise PublisherError("publisher metric unit differs") + entries.append({ + "rank": 0, "series_id": member["series_id"], "point_id": point_id, + "value": value, "unit": unit, + }) + entries.sort(key=lambda item: (item["value"], item["series_id"]), reverse=objective == "max") + for rank, entry in enumerate(entries, 1): + entry["rank"] = ( + 1 + if measure == "latency_us" + and statistic == "p99" + and entry["series_id"] in p99_tie_ids + else rank + ) + ranking_id = _derived_id("cxranking-v1-", { + "cohort_id": cohort["cohort_id"], "metric": metric, + }) + metric_label = _metric_label(measure, statistic) + rankings.append({ + "ranking_id": ranking_id, "cohort_id": cohort["cohort_id"], + "label": f"{cohort['kind'].title()} {metric_label} T={token}", + "metric": metric, "entries": entries, + "publication_tier": cohort["publication_tier"], + "eligibility": cohort["eligibility"], + }) + if ( + cohort["publication_tier"] != "official" + or measure != "latency_us" + or statistic != "p99" + or sum(entry["rank"] == 1 for entry in entries) != 1 + ): + continue + objective_name = "min-p99-latency" + top = entries[0] + recommendation_id = _derived_id("cxrecommendation-v1-", { + "objective": objective_name, "ranking_id": ranking_id, + }) + recommendations.append({ + "recommendation_id": recommendation_id, + "cohort_id": cohort["cohort_id"], + "label": f"Best {metric_label} 
at T={token}", + "objective": objective_name, + "series_id": top["series_id"], "point_id": top["point_id"], + "value": top["value"], "unit": top["unit"], + "rationale": ( + "Unique p99 winner after deterministic hierarchical bootstrap " + "and all-run agreement" + ), + "publication_tier": cohort["publication_tier"], + "eligibility": cohort["eligibility"], + }) + if cohort["kind"] == "routing": + baseline = next( + (member for member in members + if member["workload"]["routing"] == "uniform" and not member["workload"]["eplb"]), + None, + ) + if baseline: + for candidate in members: + if candidate is baseline: + continue + for token in tokens: + for measure, objective in ( + ("latency_us", "min"), + ("activation_data_rate_gbps_at_latency_percentile", "max"), + ("total_logical_data_rate_gbps_at_latency_percentile", "max"), + ): + for statistic in ("p50", "p99"): + metric = { + "operation": "roundtrip", "statistic": statistic, + "measure": measure, "objective": objective, + "tokens_per_rank": token, "phase": baseline["phase"], + } + _, base_value, _ = _metric_value(baseline, metric) + _, candidate_value, _ = _metric_value(candidate, metric) + sensitivity_id = _derived_id("cxsensitivity-v1-", { + "baseline": baseline["series_id"], "candidate": candidate["series_id"], + "cohort": cohort["cohort_id"], "metric": metric, + }) + sensitivities.append({ + "sensitivity_id": sensitivity_id, + "cohort_id": cohort["cohort_id"], + "label": ( + f"Routing sensitivity: " + f"{_metric_label(measure, statistic)} T={token}" + ), + "baseline_series_id": baseline["series_id"], + "candidate_series_id": candidate["series_id"], + "metric": metric, + "signed_change_ratio": (candidate_value - base_value) / base_value, + "publication_tier": cohort["publication_tier"], + "eligibility": cohort["eligibility"], + }) + if cohort["kind"] in {"dispatch-precision", "combine-precision"}: + axis = ( + "dispatch" + if cohort["kind"] == "dispatch-precision" + else "combine" + ) + field = f"{axis}_precision" 
+ bf16 = identity.precision_profile( + identity.V1_CONTROL_PRECISION_PROFILE + )[axis] + baseline = next( + member for member in members + if _canonical(member["workload"][field]) == _canonical(bf16) + ) + for candidate in members: + if candidate is baseline: + continue + for token in tokens: + for measure, objective in ( + ("latency_us", "min"), + ("activation_data_rate_gbps_at_latency_percentile", "max"), + ("total_logical_data_rate_gbps_at_latency_percentile", "max"), + ): + for statistic in ("p50", "p99"): + metric = { + "operation": "roundtrip", + "statistic": statistic, + "measure": measure, + "objective": objective, + "tokens_per_rank": token, + "phase": baseline["phase"], + } + _, base_value, _ = _metric_value(baseline, metric) + _, candidate_value, _ = _metric_value(candidate, metric) + sensitivity_id = _derived_id("cxsensitivity-v1-", { + "baseline": baseline["series_id"], + "candidate": candidate["series_id"], + "cohort": cohort["cohort_id"], + "metric": metric, + }) + sensitivities.append({ + "sensitivity_id": sensitivity_id, + "cohort_id": cohort["cohort_id"], + "label": ( + f"{axis.title()} precision sensitivity: " + f"{_metric_label(measure, statistic)} T={token}" + ), + "baseline_series_id": baseline["series_id"], + "candidate_series_id": candidate["series_id"], + "metric": metric, + "signed_change_ratio": ( + candidate_value - base_value + ) / base_value, + "publication_tier": cohort["publication_tier"], + "eligibility": cohort["eligibility"], + }) + rankings.sort(key=lambda item: item["ranking_id"]) + recommendations.sort(key=lambda item: item["recommendation_id"]) + sensitivities.sort(key=lambda item: item["sensitivity_id"]) + return cohorts, rankings, recommendations, sensitivities + + +def _require_runnable_promotion_success( + bundles: Sequence[dict[str, Any]], cases: dict[str, dict[str, Any]] +) -> None: + for bundle in bundles: + for case_id, case in cases.items(): + if case["_disposition"] != "runnable": + continue + status, _ = 
_outcome(bundle["selected"][case_id]) + if status != "success": + raise PublisherError( + "promotion requires every runnable matrix case to succeed " + "in every selected bundle" + ) + prior_statuses = { + _outcome(document)[0] + for document in bundle["documents"].values() + if document["identity"]["case_id"] == case_id + } + if prior_statuses != {"success"}: + raise PublisherError( + "promotion rejects runnable cases with failed, invalid, or diagnostic retries" + ) + + +def _expected_chip_cohort_count(series: Sequence[dict[str, Any]]) -> int: + groups: dict[bytes, set[bytes]] = {} + for item in series: + control, variant = _public_cohort_factors("chip", item) + groups.setdefault(_canonical(control), set()).add(_canonical(variant)) + return sum(len(variants) >= 2 for variants in groups.values()) + + +def _require_promotion_cohorts( + cohorts: Sequence[dict[str, Any]], series: Sequence[dict[str, Any]] +) -> None: + eligible_kinds = { + cohort["kind"] + for cohort in cohorts + if cohort["eligibility"]["decision_grade"] + } + required_kinds = list(REQUIRED_COHORT_KINDS) + if any( + item["workload"].get( + "precision_profile", identity.V1_CONTROL_PRECISION_PROFILE + ) + != identity.V1_CONTROL_PRECISION_PROFILE + for item in series + ): + required_kinds.extend(PRECISION_COHORT_KINDS) + missing = [kind for kind in required_kinds if kind not in eligible_kinds] + if missing: + raise PublisherError( + "promotion lacks decision-grade cohort kinds: " + ", ".join(missing) + ) + for kind, expected in REQUIRED_PROMOTION_COHORT_COUNTS.items(): + members = [cohort for cohort in cohorts if cohort["kind"] == kind] + if len(members) != expected or any( + not cohort["eligibility"]["decision_grade"] for cohort in members + ): + raise PublisherError( + f"promotion requires exactly {expected} decision-grade {kind} cohorts" + ) + + chip_cohorts = [cohort for cohort in cohorts if cohort["kind"] == "chip"] + expected_chips = _expected_chip_cohort_count(series) + if len(chip_cohorts) != 
expected_chips or any( + not cohort["eligibility"]["decision_grade"] for cohort in chip_cohorts + ): + raise PublisherError( + f"promotion requires all {expected_chips} derived chip cohorts to be decision-grade" + ) + + by_id = {item["series_id"]: item for item in series} + anchors = {("uniform", False), ("zipf", False), ("zipf", True)} + for cohort in ( + item for item in cohorts + if item["kind"] == "routing" and item["eligibility"]["decision_grade"] + ): + observed = { + (by_id[series_id]["workload"]["routing"], by_id[series_id]["workload"]["eplb"]): + by_id[series_id] + for series_id in cohort["series_ids"] + } + if len(cohort["series_ids"]) != len(anchors) or set(observed) != anchors: + raise PublisherError( + "promotion routing cohorts require exact uniform, zipf, and zipf+EPLB anchors" + ) + if ( + observed[("uniform", False)]["build"]["implementation_contract_sha256"] + != observed[("zipf", False)]["build"]["implementation_contract_sha256"] + ): + raise PublisherError( + "promotion routing cohorts require identical off-EPLB generated implementation" + ) + + +def _require_promotion_series(series: Sequence[dict[str, Any]]) -> None: + if not series or any(item["status"] != "decision-grade" for item in series): + raise PublisherError("promotion has unstable or incomplete required series") + + +def build_dataset( + store: Store, + bundle_ids: Sequence[str], + *, + promote: bool, +) -> dict[str, Any]: + if not bundle_ids or len(bundle_ids) != len(set(bundle_ids)): + raise PublisherError("dataset requires unique explicit bundle IDs") + loaded = [load_bundle(store, bundle_id) for bundle_id in bundle_ids] + loaded.sort(key=lambda bundle: ( + int(bundle["manifest"]["run"]["run_id"]), + bundle["manifest"]["run"]["run_attempt"], + bundle["id"], + )) + matrix_ids = {bundle["manifest"]["matrix"]["sha256"] for bundle in loaded} + case_sets = [{case["case_id"] for case in bundle["cases"]} for bundle in loaded] + if len(matrix_ids) != 1 or len({tuple(sorted(values)) for 
values in case_sets}) != 1: + raise PublisherError("dataset bundles do not share one exact requested matrix") + run_ids = [bundle["manifest"]["run"]["run_id"] for bundle in loaded] + qualification_indices = sorted( + bundle["manifest"]["run"]["qualification_index"] for bundle in loaded + ) + if promote and ( + len(loaded) != REQUIRED_ALLOCATIONS + or len(run_ids) != len(set(run_ids)) + or qualification_indices != [1, 2, 3] + or any(bundle["manifest"]["run"]["run_attempt"] != 1 for bundle in loaded) + ): + raise PublisherError( + "promotion requires qualification indices 1, 2, and 3 from first-attempt runs" + ) + if promote and matrix_ids != {CANONICAL_FULL_V1_MATRIX_SHA256}: + raise PublisherError("promotion requires the canonical full-v1 matrix") + cases = {case["case_id"]: case for case in loaded[0]["cases"]} + if promote: + _require_runnable_promotion_success(loaded, cases) + all_documents = [ + document for bundle in loaded for document in bundle["documents"].values() + ] + selected_ids = { + selection["selected_attempt_id"] + for bundle in loaded for selection in bundle["manifest"]["coverage"]["selections"] + } + public_attempts = [ + _public_attempt( + document, selected=document["identity"]["attempt_id"] in selected_ids + ) + for document in all_documents + ] + _unique([attempt["attempt_id"] for attempt in public_attempts], "dataset attempts") + selected_by_case: dict[str, list[dict[str, Any]]] = { + case_id: [bundle["selected"][case_id] for bundle in loaded] + for case_id in sorted(cases) + } + samples_by_attempt = { + attempt_id: sample_document + for bundle in loaded + for attempt_id, sample_document in bundle["sample_documents"].items() + } + coverage: list[dict[str, Any]] = [] + for case_id, case in sorted(cases.items()): + attempts = sorted( + (attempt for attempt in public_attempts if attempt["case_id"] == case_id), + key=lambda attempt: ( + int(attempt["run_id"]), attempt["run_attempt"], + attempt["attempt_index"], attempt["attempt_id"], + ), + ) + 
selected_document = selected_by_case[case_id][-1] + selected = _public_attempt(selected_document, selected=True) + precision_profile = case.get( + "precision_profile", identity.V1_CONTROL_PRECISION_PROFILE + ) + precision = identity.precision_profile(precision_profile) + selected_raw = ( + selected_document + if selected_document["format"] == contracts.RAW_FORMAT + and selected_document["outcome"]["status"] == "success" + else None + ) + if selected_raw is not None: + backend_generation = selected_raw["implementation"]["kernel_generation"] + projected = contracts.public_series_config( + kernel_generation=backend_generation, + provenance=selected_raw["implementation"]["provenance"], + resource_profile=selected_raw["implementation"]["resource_profile"], + resource_mode=selected_raw["case"]["resource_mode"], + device_product=selected_raw["topology"]["device_product"], + ) + resource = projected["resource"] + rows_by_token = { + row["tokens_per_rank"]: row for row in selected_raw["measurement"]["rows"] + } + series_id = selected_raw["identity"]["series_id"] + else: + backend_generation = None + resource = { + "mode": "fixed-profile", + "profile": None, + "comm_units_kind": None, + "configured_units": None, + } + rows_by_token = {} + series_id = None + point_status = ( + "measured" if selected["outcome"] == "success" else selected["outcome"] + ) + point_reason = ( + None + if point_status == "measured" + else case["_reason"] + if point_status == "unsupported" + else selected["reason"] + ) + token_ladder = [int(value) for value in case["ladder"].split()] + coverage_points = [] + for token in token_ladder: + row = rows_by_token.get(token) + coverage_points.append({ + "point_id": row["point_id"] if row is not None else None, + "series_id": series_id if row is not None else None, + "tokens_per_rank": token, + "global_tokens": token * case["ep"], + "terminal_status": point_status, + "reason": point_reason, + }) + coverage.append({ + "case_id": case_id, + "label": ( + 
f"{case['sku'].upper()} / {case['backend']} / EP{case['ep']} / " + f"{case['mode']} / {case['phase']} / {case['routing']}" + ), + "required": True, + "sku": _slug(case["sku"]), + "suite": _slug(case["suite"]), + "workload": _slug(case["workload"]), + "publication_tier": case["required_publication"], + "backend": _slug(case["backend"]), + "backend_generation": backend_generation, + "mode": case["mode"], + "phase": case["phase"], + "routing": case["routing"], + "eplb": case["eplb"], + "precision_profile": precision_profile, + "dispatch_precision": precision["dispatch"], + "combine_precision": precision["combine"], + "resource": resource, + "topology": _coverage_topology(case), + "points": coverage_points, + "disposition": case["_disposition"], + "selected_attempt_id": selected["attempt_id"], + "outcome": selected["outcome"], + "failure_mode": selected["failure_mode"], + "reason": case["_reason"] if case["_disposition"] == "unsupported" else selected["reason"], + "attempt_ids": [attempt["attempt_id"] for attempt in attempts], + }) + by_series: dict[str, list[dict[str, Any]]] = {} + for case_documents in selected_by_case.values(): + for document in case_documents: + if ( + document["format"] == contracts.RAW_FORMAT + and document["outcome"]["status"] == "success" + ): + by_series.setdefault(document["identity"]["series_id"], []).append(document) + series: list[dict[str, Any]] = [] + internals: dict[str, dict[str, Any]] = {} + for series_id, documents in sorted(by_series.items()): + try: + sample_documents = [ + samples_by_attempt[document["identity"]["attempt_id"]] + for document in documents + ] + except KeyError as exc: + raise PublisherError( + "selected raw evidence is missing its private sample document" + ) from exc + item, internal = _build_series( + series_id, documents, sample_documents, len(loaded) + ) + series.append(item) + internals[series_id] = internal + dataset_binding = _sha_bytes(_canonical({ + "matrix_id": next(iter(matrix_ids)), + 
"source_bundle_ids": sorted(bundle_ids), + })) + cohorts, rankings, recommendations, sensitivities = build_decisions( + series, internals, dataset_binding=dataset_binding + ) + allocation_ids = sorted({attempt["allocation_id"] for attempt in public_attempts}) + qualification_indices = sorted({int(value) for value in qualification_indices}) + measured_cases = sum( + all(point["terminal_status"] == "measured" for point in item["points"]) + for item in coverage + ) + unsupported_cases = sum( + all(point["terminal_status"] == "unsupported" for point in item["points"]) + for item in coverage + ) + requested_points = sum(len(item["points"]) for item in coverage) + measured_points = sum( + point["terminal_status"] == "measured" + for item in coverage for point in item["points"] + ) + unsupported_points = sum( + point["terminal_status"] == "unsupported" + for item in coverage for point in item["points"] + ) + status = "promoted" if promote else "diagnostic" + dataset = { + "format": FORMAT_PUBLIC, + "schema_version": 1, + "generated_at": _latest_timestamp( + [bundle["manifest"]["created_at"] for bundle in loaded] + ), + "source_bundle_ids": sorted(bundle_ids), + "promotion": { + "status": status, + "reason": None, + "matrix_id": next(iter(matrix_ids)), + "allocation_ids": allocation_ids, + "required_allocations": REQUIRED_ALLOCATIONS, + "qualification_indices": qualification_indices, + "requested_cases": len(coverage), + "terminal_cases": len(coverage), + "measured_cases": measured_cases, + "unsupported_cases": unsupported_cases, + "requested_points": requested_points, + "terminal_points": requested_points, + "measured_points": measured_points, + "unsupported_points": unsupported_points, + "policy": POLICY, + }, + "coverage": coverage, + "attempts": sorted(public_attempts, key=lambda attempt: attempt["attempt_id"]), + "series": series, + "cohorts": cohorts, + "rankings": rankings, + "recommendations": recommendations, + "sensitivities": sensitivities, + } + if promote: + 
_require_promotion_series(series) + _require_promotion_cohorts(cohorts, series) + validate_public_dataset(dataset) + return dataset + + +def quarantine_incoming( + store: Store, ingest_id: str, reason: str, generated_at: str +) -> str: + if REASON.fullmatch(reason) is None: + raise PublisherError("quarantine reason must be a machine code") + public_reason = f"{reason}-{ingest_id}" + if REASON.fullmatch(public_reason) is None: + raise PublisherError("quarantine reason and incoming ID exceed the public reason contract") + manifest = { + "format": "collectivex.quarantine.v1", + "schema_version": 1, + "created_at": generated_at, + "incoming_id": ingest_id, + "reason": reason, + } + digest = _sha_bytes(_canonical(manifest)) + with store.staging(store.quarantine, private=True) as stage: + _write_json(stage / "quarantine.json", manifest, mode=0o600) + store.complete(stage, digest, private=True) + store.install(stage, store.quarantine / digest, private=True) + if _sha_bytes(_canonical(strict_load(store.quarantine / digest / "quarantine.json"))) != digest: + raise PublisherError("existing quarantine object differs") + return digest + + +def _store_from_args(args: argparse.Namespace) -> Store: + root = args.store_root or os.environ.get("COLLECTIVEX_STORE_ROOT") + if not root: + raise PublisherError("COLLECTIVEX_STORE_ROOT or --store-root is required") + if not Path(root).is_absolute(): + raise PublisherError("COLLECTIVEX_STORE_ROOT must be an absolute path") + return Store(root) + + +def _run_metadata(args: argparse.Namespace) -> dict[str, Any]: + """Validate offline operator assertions about a completed successful GHA run. + + The publisher deliberately performs no network access. The caller must preflight workflow + identity and conclusion against GitHub before supplying these values; artifact-internal + provenance is then required to match them exactly. 
+ """ + run = { + "repository": args.repository, + "run_id": args.run_id, + "run_attempt": args.run_attempt, + "qualification_index": args.qualification_index, + "source_sha": args.source_sha, + } + # Reuse the authoritative private schema constraints before any filesystem mutation. + if not re.fullmatch(r"[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+", run["repository"] or ""): + raise PublisherError("--repository must be owner/name") + if not re.fullmatch(r"[1-9][0-9]*", run["run_id"] or ""): + raise PublisherError("--run-id must be a positive decimal string") + if type(run["run_attempt"]) is not int or run["run_attempt"] < 1: + raise PublisherError("--run-attempt must be positive") + if type(run["qualification_index"]) is not int or run["qualification_index"] not in range(1, 4): + raise PublisherError("--qualification-index must be 1, 2, or 3") + if not re.fullmatch(r"[0-9a-f]{40}", run["source_sha"] or ""): + raise PublisherError("--source-sha must be a 40-character lowercase Git SHA") + return run + + +def _ingest_inputs( + args: argparse.Namespace, +) -> tuple[dict[str, Any], Path, list[Path]]: + run = _run_metadata(args) + matrix = Path(args.matrix).absolute() + if matrix.is_symlink() or not matrix.is_file(): + raise PublisherError("--matrix must be a regular non-symlink file") + artifacts = [Path(value).absolute() for value in args.artifact] + if not artifacts: + raise PublisherError("at least one --artifact is required") + names = [_artifact_name(path) for path in artifacts] + if len(names) != len(set(names)): + raise PublisherError("--artifact contains duplicate GHA names") + for path in artifacts: + if path.is_symlink() or not (path.is_dir() or path.is_file()): + raise PublisherError("--artifact must be a regular ZIP or real directory") + return run, matrix, artifacts + + +def _bundle_ids(values: Sequence[str], *, promote: bool) -> list[str]: + bundle_ids = list(values) + if ( + not bundle_ids + or len(bundle_ids) != len(set(bundle_ids)) + or 
any(HEX64.fullmatch(value) is None for value in bundle_ids) + ): + raise PublisherError("bundle IDs must be unique SHA-256 digests") + if promote and len(bundle_ids) != REQUIRED_ALLOCATIONS: + raise PublisherError("promotion requires exactly three explicit bundle IDs") + return bundle_ids + + +def ingest_command(args: argparse.Namespace) -> dict[str, Any]: + run, matrix, artifacts = _ingest_inputs(args) + store = _store_from_args(args) + with store.locked(): + ingest_id, incoming, _ = archive_incoming( + store, matrix, artifacts, run + ) + try: + bundle_id, _, _ = build_bundle(store, ingest_id, incoming, run) + return { + "status": "accepted", "incoming_id": ingest_id, + "bundle_id": bundle_id, + } + except ( + PublisherError, contracts.ContractError, artifact_safety.ArtifactSafetyError, + jsonschema.ValidationError, + ) as exc: + # Invalid delivery bytes provide no trusted timestamp. A fixed sentinel keeps + # repeated quarantine of the same immutable incoming object content-idempotent. 
+ generated_at = "1970-01-01T00:00:00Z" + quarantine_id = quarantine_incoming( + store, ingest_id, "artifact-validation-failed", generated_at + ) + raise PublisherError( + f"incoming {ingest_id} quarantined as {quarantine_id}: {exc}" + ) from exc + + +def promote_command(args: argparse.Namespace) -> dict[str, Any]: + bundle_ids = _bundle_ids(args.bundle, promote=True) + store = _store_from_args(args) + with store.locked(): + dataset = build_dataset(store, bundle_ids, promote=True) + digest, size = store.install_dataset(dataset) + store.update_channel("dev-latest", digest, size, dataset["generated_at"]) + store.verify_channel("dev-latest") + return { + "status": "promoted", "bundle_ids": bundle_ids, + "dataset_sha256": digest, "channel": "dev-latest", + } + + +def verify_command(args: argparse.Namespace) -> dict[str, Any]: + bundle_ids = _bundle_ids(args.bundle, promote=False) if args.bundle else [] + channels = args.channel or ["dev-latest"] + if any(channel != "dev-latest" for channel in channels): + raise PublisherError("unknown channel") + store = _store_from_args(args) + with store.locked(): + pointers = {channel: store.verify_channel(channel) for channel in channels} + bundles = [load_bundle(store, bundle_id)["id"] for bundle_id in bundle_ids] + return {"status": "verified", "channels": pointers, "bundle_ids": bundles} + + +def _parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="CollectiveX isolated filesystem publisher") + parser.add_argument("--store-root", help="defaults to COLLECTIVEX_STORE_ROOT") + subparsers = parser.add_subparsers(dest="command", required=True) + ingest = subparsers.add_parser("ingest", help="archive and validate one complete GHA run") + ingest.add_argument("--matrix", required=True) + ingest.add_argument("--artifact", action="append", required=True) + ingest.add_argument("--repository", required=True) + ingest.add_argument("--run-id", required=True) + ingest.add_argument("--run-attempt", required=True, 
type=int) + ingest.add_argument("--qualification-index", required=True, type=int) + ingest.add_argument("--source-sha", required=True) + promote = subparsers.add_parser("promote", help="publish explicit independent bundles") + promote.add_argument("--bundle", action="append", required=True) + verify = subparsers.add_parser("verify", help="verify immutable targets and pointers") + verify.add_argument("--channel", action="append", choices=["dev-latest"]) + verify.add_argument("--bundle", action="append", default=[]) + return parser + + +def main() -> int: + args = _parser().parse_args() + try: + if args.command == "ingest": + result = ingest_command(args) + elif args.command == "promote": + result = promote_command(args) + elif args.command == "verify": + result = verify_command(args) + else: + raise PublisherError(f"unknown command {args.command!r}") + except ( + PublisherError, contracts.ContractError, artifact_safety.ArtifactSafetyError, + jsonschema.ValidationError, OSError, + ) as exc: + print(json.dumps({"status": "error", "error": str(exc)}), file=sys.stderr) + return 2 + print(json.dumps(result, sort_keys=True)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/requirements.txt b/experimental/CollectiveX/requirements.txt new file mode 100644 index 000000000..f68f97d83 --- /dev/null +++ b/experimental/CollectiveX/requirements.txt @@ -0,0 +1,8 @@ +# Host-side matrix generation. GPU libraries are supplied by benchmark images. +PyYAML==6.0.2 + +# Canonical workload serialization. +numpy>=1.26,<3 + +# Host-only strict artifact publisher schemas (never imported by GPU execution). 
+jsonschema==4.25.1 diff --git a/experimental/CollectiveX/runtime/common.sh b/experimental/CollectiveX/runtime/common.sh new file mode 100644 index 000000000..13d8bbf04 --- /dev/null +++ b/experimental/CollectiveX/runtime/common.sh @@ -0,0 +1,2435 @@ +# shellcheck shell=bash +# CollectiveX — shared launcher helpers (sourced, not executed). +# +# Cluster-generic scaffolding only (Slurm/container/build/staging); no +# model-serving. Logging goes to stderr so functions can `echo` a single +# result on stdout. + +_CX_COMMON_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +CX_SQUASH_FORMAT_VERSION="repro-v1" +CX_SQUASH_SOURCE_DATE_EPOCH=1 +CX_DEEPEP_V2_COMMIT="fa8a9b16898204afd347c663b89e65ef87dc6ce6" # pragma: allowlist secret +CX_DEEPEP_V2_TREE="29809e75c5874e6609dac4804e7b651d5226959f" # pragma: allowlist secret +CX_DEEPEP_V2_FMT_COMMIT="a4c7e17133ee9cb6a2f45545f6e974dd3c393efa" # pragma: allowlist secret +CX_DEEPEP_HYBRID_COMMIT="e0a5b1d9848ab3e7b4a67842bf06f067bfac67f8" # pragma: allowlist secret +CX_DEEPEP_HYBRID_TREE="d77aeab7f1bb52b615666fe178d26ced41fae08e" # pragma: allowlist secret +CX_DEEPEP_HYBRID_NCCL_COMMIT="1e0c869c39bb33f1034cb9920bd2a8a8406f04a3" # pragma: allowlist secret +unset COLLECTIVEX_OPERATOR_CONFIG_LOADED COLLECTIVEX_EPHEMERAL_CONFIG_PATH + +cx_log() { printf '[collectivex] %s\n' "$*" >&2; } +cx_die() { printf '[collectivex] FATAL: %s\n' "$*" >&2; exit 1; } + +# Public failure telemetry is a closed vocabulary. Raw scheduler, container, +# host, and filesystem diagnostics stay in the mode-0600 private logs. 
# Record the launcher stage that a later failure should be attributed to.
# The stage must be one of a fixed set of names; anything else is fatal.
# On success the stage is exported as CX_FAILSAFE_MODE.
cx_set_failure_stage() {
  local stage="$1" candidate recognized=0
  for candidate in \
    setup repository-stage registry-verification scheduler-allocation container-import \
    container-hash container-launch backend-setup execution artifact-collection; do
    if [ "$stage" = "$candidate" ]; then
      recognized=1
      break
    fi
  done
  [ "$recognized" = 1 ] || cx_die "invalid launcher failure stage"
  export CX_FAILSAFE_MODE="$stage"
}
then + diagnostic="accelerator-memory" + elif grep -aEqi 'does not match its pinned image contract|requires the exact pinned|version mismatch' "$log_path"; then + diagnostic="backend-version" + elif grep -aEqi 'nvshmem is unavailable|build-tool installation failed' "$log_path"; then + diagnostic="backend-dependency" + elif grep -aEqi 'revision fetch failed|submodule fetch failed|package installation failed|staged source is invalid|source (pin resolution|seed validation|seed copy|checkout creation|publication validation|existing source validation) failed' "$log_path"; then + diagnostic="backend-source" + elif grep -aEqi 'failed to mount|squashfs|enroot|pyxis|mount.*invalid argument|invalid argument.*mount' "$log_path"; then + diagnostic="container-runtime" + elif grep -aEqi 'backend preparation failed|build (failed|is incomplete)|cache (mount identity )?validation failed|import failed' "$log_path"; then + diagnostic="backend-build" + elif grep -aEqi 'command not found|not found on this runner|git lookup failed' "$log_path"; then + diagnostic="missing-runtime" + elif grep -aEqi 'too many requests|rate.?limit' "$log_path"; then + diagnostic="registry-rate-limit" + elif grep -aEqi 'timed out|operation timeout|wait timeout after|watchdog.*timeout|timeout: sending signal|connection reset|could not resolve|TLS|certificate' "$log_path"; then + diagnostic="network-or-timeout" + elif grep -aEqi 'salloc:|srun:.*(unable to create step|step creation|invalid partition|invalid account)|unable to create step|job allocation' "$log_path"; then + diagnostic="scheduler" + elif grep -aEqi 'SHARD done: [0-9]+/[0-9]+ case\(s\) failed|WARN: .* run failed rc=|completed with invalid semantic evidence' "$log_path"; then + diagnostic="benchmark-case-failure" + elif [ -s "$log_path" ]; then + diagnostic="unclassified" + else + diagnostic="empty-log" + fi + fi + cx_log "ERROR: failure-class=$stage diagnostic=$diagnostic" + return 1 +} + +# Runner-local deployment settings are strict JSON kept 
outside the checkout. +# Only the selected runner's allowlisted values are exported; the document is +# never sourced or evaluated as shell. +cx_load_operator_config() { + [ -n "${COLLECTIVEX_OPERATOR_CONFIG_LOADED:-}" ] \ + && [ "$COLLECTIVEX_OPERATOR_CONFIG_LOADED" = "$$" ] && return 0 + local config_path generated=0 parsed_path config_log key value + unset CX_PARTITION CX_ACCOUNT CX_SQUASH_DIR CX_STAGE_DIR CX_ENROOT_CACHE_PATH + unset ENROOT_CACHE_PATH + unset CX_EXCLUDE_NODES CX_NODELIST CX_LOCK_DIR CX_MASTER_PORT + unset CX_SOCKET_IFNAME CX_RDMA_DEVICES CX_IB_GID_INDEX CX_RDMA_SERVICE_LEVEL + unset CX_AUDIT_SALT + unset MASTER_ADDR MASTER_PORT RANK WORLD_SIZE LOCAL_RANK LOCAL_WORLD_SIZE + config_path="${COLLECTIVEX_OPERATOR_CONFIG:-${XDG_CONFIG_HOME:-${HOME}/.config}/inferencex/collectivex.json}" + if [ -n "${COLLECTIVEX_OPERATOR_CONFIG_CONTENT:-}" ]; then + umask 077 + if [[ "${CX_JOB_ROOT:-}" =~ ^/tmp/inferencex-collectivex-[0-9]+-[0-9]+-[A-Za-z0-9._-]+$ ]] \ + && [ -d "$CX_JOB_ROOT" ] && [ ! -L "$CX_JOB_ROOT" ] \ + && [ "$(stat -c '%u:%a' "$CX_JOB_ROOT" 2>/dev/null)" = "$(id -u):700" ]; then + config_path="$CX_JOB_ROOT/operator-config.json" + (set -C; : > "$config_path") 2>/dev/null \ + || cx_die "cannot create ephemeral runner configuration" + else + config_path="$(mktemp /tmp/inferencex-collectivex-config.XXXXXX)" \ + || cx_die "cannot create ephemeral runner configuration" + fi + COLLECTIVEX_EPHEMERAL_CONFIG_PATH="$config_path" + generated=1 + if ! printf '%s' "$COLLECTIVEX_OPERATOR_CONFIG_CONTENT" > "$config_path"; then + unset COLLECTIVEX_OPERATOR_CONFIG_CONTENT + rm -f -- "$config_path" + unset COLLECTIVEX_EPHEMERAL_CONFIG_PATH + cx_die "cannot materialize runner configuration" + fi + elif [ "${COLLECTIVEX_OPERATOR_CONFIG_REQUIRED:-0}" = 1 ]; then + unset COLLECTIVEX_OPERATOR_CONFIG_CONTENT + cx_die "runner configuration is unavailable" + fi + unset COLLECTIVEX_OPERATOR_CONFIG_CONTENT COLLECTIVEX_OPERATOR_CONFIG_REQUIRED + if [ ! 
-e "$config_path" ]; then + [ "${COLLECTIVEX_CANONICAL_GHA:-0}" != 1 ] \ + || cx_die "runner configuration is unavailable" + COLLECTIVEX_OPERATOR_CONFIG_LOADED="$$" + return 0 + fi + umask 077 + parsed_path="$(mktemp /tmp/inferencex-collectivex-parsed.XXXXXX)" || { + [ "$generated" = 0 ] || rm -f -- "$config_path" + cx_die "cannot parse runner configuration" + } + config_log="$(cx_private_log_path operator-config)" + if ! python3 - "$config_path" "${CX_RUNNER:-${CX_SHARD_SKU:-${CX_PUBLIC_RUNNER:-}}}" \ + "${COLLECTIVEX_CANONICAL_GHA:-0}" \ + > "$parsed_path" 2> "$config_log" <<'PY' +import json +import os +import posixpath +import re +import stat +import sys + +RUNNERS = { + "h100-dgxc", "h200-dgxc", "b200-dgxc", "b300", + "gb200", "gb300", "mi325x", "mi355x", +} +FIELDS = { + "partition": "CX_PARTITION", + "account": "CX_ACCOUNT", + "squash_dir": "CX_SQUASH_DIR", + "stage_dir": "CX_STAGE_DIR", + "enroot_cache_path": "CX_ENROOT_CACHE_PATH", + "exclude_nodes": "CX_EXCLUDE_NODES", + "nodelist": "CX_NODELIST", + "lock_dir": "CX_LOCK_DIR", + "socket_ifname": "CX_SOCKET_IFNAME", + "rdma_devices": "CX_RDMA_DEVICES", + "ib_gid_index": "CX_IB_GID_INDEX", + "rdma_service_level": "CX_RDMA_SERVICE_LEVEL", +} +NETWORK_FIELDS = { + "socket_ifname", "rdma_devices", "ib_gid_index", "rdma_service_level", +} +REQUIRED = { + "h100-dgxc": {"partition", "account", "squash_dir", "stage_dir"}, + "h200-dgxc": {"partition", "squash_dir", "stage_dir"}, + "b200-dgxc": {"partition", "account", "squash_dir", "stage_dir"}, + "b300": { + "partition", "account", "squash_dir", "stage_dir", + }, + "gb200": {"partition", "account", "storage_roots"}, + "gb300": {"partition", "account", "squash_dir", "stage_dir", "enroot_cache_path"}, + "mi325x": {"partition", "squash_dir", "stage_dir"}, + "mi355x": {"partition", "squash_dir", "stage_dir"}, +} +ALLOWED = { + "h100-dgxc": REQUIRED["h100-dgxc"] | {"exclude_nodes", "stage_dir"} | NETWORK_FIELDS, + "h200-dgxc": REQUIRED["h200-dgxc"] | {"account", 
"exclude_nodes", "stage_dir"} | NETWORK_FIELDS, + "b200-dgxc": REQUIRED["b200-dgxc"] | {"exclude_nodes", "stage_dir"} | NETWORK_FIELDS, + "b300": REQUIRED["b300"] | {"exclude_nodes"} | NETWORK_FIELDS, + "gb200": REQUIRED["gb200"] | NETWORK_FIELDS, + "gb300": REQUIRED["gb300"] | NETWORK_FIELDS, + "mi325x": REQUIRED["mi325x"] | {"exclude_nodes", "nodelist", "stage_dir", "lock_dir"} | NETWORK_FIELDS, + "mi355x": REQUIRED["mi355x"] | {"exclude_nodes", "nodelist", "stage_dir", "lock_dir"} | NETWORK_FIELDS, +} +TOKEN = re.compile(r"^[A-Za-z0-9_.\[\],-]+$") +PATH = re.compile(r"^/[A-Za-z0-9._/+\-]+$") +IPV4 = re.compile(r"(? 65536 + ): + raise ValueError + flags = os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0) + descriptor = os.open(path, flags) + try: + opened = os.fstat(descriptor) + if (opened.st_dev, opened.st_ino) != (metadata.st_dev, metadata.st_ino): + raise ValueError + payload = b"" + while len(payload) <= 65536: + chunk = os.read(descriptor, 65537 - len(payload)) + if not chunk: + break + payload += chunk + document = json.loads( + payload.decode("utf-8"), + object_pairs_hook=pairs, + parse_constant=lambda _: (_ for _ in ()).throw(ValueError()), + ) + finally: + os.close(descriptor) + if ( + set(document) not in ( + {"schema_version", "runners"}, + {"schema_version", "audit_salt", "runners"}, + ) + or type(document["schema_version"]) is not int + or document["schema_version"] != 1 + ): + raise ValueError + audit_salt = document.get("audit_salt") + if ( + (audit_salt is not None and ( + not isinstance(audit_salt, str) or not AUDIT_SALT.fullmatch(audit_salt) + )) + or (audit_required == "1" and audit_salt is None) + ): + raise ValueError + runners = document["runners"] + if ( + not isinstance(runners, dict) or not runners or set(runners) - RUNNERS + or runner not in runners + ): + raise ValueError + selected = None + for name, config in runners.items(): + if ( + not isinstance(config, dict) + or (name == runner and not REQUIRED[name].issubset(config)) + ): + raise 
ValueError + if set(config) - ALLOWED[name]: + raise ValueError + for field, value in config.items(): + if field == "storage_roots": + if ( + not isinstance(value, list) or not 1 <= len(value) <= 16 + or len(value) != len(set(value)) or not all(valid_path(item) for item in value) + ): + raise ValueError + elif field == "socket_ifname": + if not isinstance(value, str) or not INTERFACES.fullmatch(value): + raise ValueError + elif field == "rdma_devices": + if not isinstance(value, str) or not RDMA_DEVICES.fullmatch(value): + raise ValueError + elif field == "ib_gid_index": + if type(value) is not int or not 0 <= value <= 255: + raise ValueError + elif field == "rdma_service_level": + if type(value) is not int or not 0 <= value <= 15: + raise ValueError + elif field.endswith(("_dir", "_path")): + if not valid_path(value): + raise ValueError + elif ( + not isinstance(value, str) or not value or len(value) > 512 + or not TOKEN.fullmatch(value) or IPV4.search(value) + ): + raise ValueError + if name == runner: + selected = dict(config) + if selected is None: + raise ValueError + roots = selected.pop("storage_roots", None) + if roots is not None: + for root in roots: + squash = posixpath.join(root, "collectivex", "containers") + stage = posixpath.join(root, "collectivex", "stage") + probes = [] + try: + for directory in (squash, stage): + os.makedirs(directory, mode=0o700, exist_ok=True) + probe = posixpath.join(directory, f".write-probe-{os.getpid()}") + fd = os.open(probe, os.O_WRONLY | os.O_CREAT | os.O_EXCL, 0o600) + os.close(fd) + probes.append(probe) + selected.update(squash_dir=squash, stage_dir=stage) + break + except OSError: + pass + finally: + for probe in probes: + try: + os.unlink(probe) + except OSError: + pass + else: + raise ValueError + if audit_salt is not None: + sys.stdout.buffer.write(b"CX_AUDIT_SALT\0" + audit_salt.encode() + b"\0") + for field, value in selected.items(): + key = FIELDS[field] + sys.stdout.buffer.write( + key.encode() + b"\0" + 
str(value).encode() + b"\0" + ) +except (KeyError, OSError, TypeError, UnicodeError, ValueError): + raise SystemExit(1) +PY + then + rm -f -- "$parsed_path" + [ "$generated" = 0 ] || rm -f -- "$config_path" + unset COLLECTIVEX_EPHEMERAL_CONFIG_PATH + unset COLLECTIVEX_OPERATOR_CONFIG COLLECTIVEX_OPERATOR_CONFIG_EPHEMERAL + cx_die "runner-local configuration failed" + fi + while IFS= read -r -d '' key && IFS= read -r -d '' value; do + printf -v "$key" '%s' "$value" + export "${key?}" + done < "$parsed_path" + rm -f -- "$parsed_path" + if [ "$generated" = 1 ] || [ "${COLLECTIVEX_OPERATOR_CONFIG_EPHEMERAL:-0}" = 1 ]; then + rm -f -- "$config_path" || cx_die "cannot remove ephemeral runner configuration" + fi + unset COLLECTIVEX_EPHEMERAL_CONFIG_PATH + unset COLLECTIVEX_OPERATOR_CONFIG COLLECTIVEX_OPERATOR_CONFIG_EPHEMERAL + COLLECTIVEX_OPERATOR_CONFIG_LOADED="$$" +} + +cx_private_log_path() { + local label="$1" tag="${COLLECTIVEX_EXECUTION_ID:-manual_$$}" path + path="$(python3 - "$tag" "$label" <<'PY' 2>/dev/null +import os +import re +import shutil +import stat +import sys +import time + +tag, label = sys.argv[1:] +if not all(re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9._-]*", value) for value in (tag, label)): + raise SystemExit(1) +root = f"/tmp/inferencex-collectivex-{os.getuid()}" +old_umask = os.umask(0o077) +flags = os.O_RDONLY | os.O_DIRECTORY | getattr(os, "O_NOFOLLOW", 0) +try: + try: + os.mkdir(root, 0o700) + except FileExistsError: + pass + root_fd = os.open(root, flags) + try: + metadata = os.fstat(root_fd) + if metadata.st_uid != os.getuid() or stat.S_IMODE(metadata.st_mode) != 0o700: + raise OSError("unsafe root") + cutoff = time.time() - 86400 + for entry in os.scandir(root): + try: + if ( + entry.name != tag and entry.is_dir(follow_symlinks=False) + and entry.stat(follow_symlinks=False).st_mtime < cutoff + ): + shutil.rmtree(entry.path) + except OSError: + pass + try: + os.mkdir(tag, 0o700, dir_fd=root_fd) + except FileExistsError: + pass + directory_fd = 
# Remove this execution's private log directory after a successful run.
#   $1  exit status of the run; any non-zero value keeps the logs in place.
# The directory is /tmp/inferencex-collectivex-<uid>/<tag>, where <tag> is
# COLLECTIVEX_EXECUTION_ID or manual_<pid>. Removal is best effort: output is
# discarded and a failing cleanup never changes the caller's status.
cx_cleanup_private_logs() {
  local rc="$1" tag="${COLLECTIVEX_EXECUTION_ID:-manual_$$}"
  if [ "$rc" != 0 ]; then
    return 0
  fi
  python3 - "$tag" <<'PY' >/dev/null 2>&1 || true
import os
import re
import shutil
import stat
import sys

tag = sys.argv[1]
if re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9._-]*", tag) is None:
    raise SystemExit(1)
uid = os.getuid()
root = f"/tmp/inferencex-collectivex-{uid}"
open_flags = os.O_RDONLY | os.O_DIRECTORY | getattr(os, "O_NOFOLLOW", 0)
# Only act inside a root that is a real directory owned by us with mode 0700.
root_fd = os.open(root, open_flags)
try:
    info = os.fstat(root_fd)
    trusted = info.st_uid == uid and stat.S_IMODE(info.st_mode) == 0o700
finally:
    os.close(root_fd)
if not trusted:
    raise SystemExit(1)
target = os.path.join(root, tag)
if os.path.isdir(target) and not os.path.islink(target):
    shutil.rmtree(target)
PY
}
# Print the allowlist of environment variable names that may cross into the
# container, as one comma-separated string with no trailing newline.
# Only names listed here are meant to be exported; everything else (operator
# config, runner credentials, HOME, workspace paths, unrelated secrets) is
# deliberately absent from the list.
cx_container_exports() {
  local -a names=(
    # Provenance of the run and of the image being executed.
    COLLECTIVEX_SOURCE_SHA COLLECTIVEX_ARTIFACT_NAME COLLECTIVEX_EXECUTION_ID
    COLLECTIVEX_CONTROL_SHA256 COLLECTIVEX_IMAGE COLLECTIVEX_IMAGE_DIGEST
    COLLECTIVEX_IMAGE_DIGEST_VERIFIED COLLECTIVEX_SQUASH_SHA256
    GITHUB_REF_NAME GITHUB_REF GITHUB_REPOSITORY GITHUB_JOB GITHUB_RUN_ID
    GITHUB_RUN_ATTEMPT GITHUB_SHA
    # Shard / case description.
    CX_RUNNER CX_BENCH CX_NODES CX_GPUS_PER_NODE CX_SCALE_UP_DOMAIN CX_SHARD_FILE
    CX_SHARD_SKU CX_PRECISION_PROBE CX_NGPUS CX_TS CX_TOPO CX_SCOPE CX_TRANSPORT
    CX_SCALE_UP_TRANSPORT CX_SCALE_OUT_TRANSPORT CX_MODE CX_PHASE CX_ROUTING CX_EPLB
    CX_CASE_ID CX_SUITE CX_WORKLOAD_NAME CX_REQUIRED_PUBLICATION CX_PRECISION_PROFILE
    CX_QUALIFICATION_INDEX CX_HIDDEN CX_TOPK CX_EXPERTS CX_TOKENS_LADDER CX_CANONICAL
    CX_ITERS CX_TRIALS CX_WARMUP CX_SAMPLES_PER_POINT CX_WARMUP_SEMANTICS CX_SEED
    CX_RUN_TIMEOUT CX_NCCL_HOME CX_ALLOW_MNNVL CX_ATTEMPT_ID CX_RUNTIME_MARKER
    CX_MORI_KERNEL_TYPE CX_WORKLOAD_DIR CX_BACKEND_CACHE_ROOT
    CX_BACKEND_CACHE_SENTINEL_SHA256 CX_BACKEND_SOURCE_ROOT CX_AUDIT_SALT
    CX_SOCKET_IFNAME CX_RDMA_DEVICES CX_IB_GID_INDEX CX_RDMA_SERVICE_LEVEL
    # Distributed rendezvous and rank identity.
    MASTER_ADDR MASTER_PORT RANK WORLD_SIZE LOCAL_RANK LOCAL_WORLD_SIZE
    # Communication-library selectors.
    NCCL_NET NCCL_SOCKET_IFNAME GLOO_SOCKET_IFNAME NCCL_IB_HCA NCCL_IB_GID_INDEX
    NCCL_IB_SL NVSHMEM_HCA_LIST NVSHMEM_IB_GID_INDEX NVSHMEM_IB_SL
    NVSHMEM_IB_ENABLE_IBGDA NVSHMEM_IBGDA_NIC_HANDLER EP_NIC_NAME EP_OVERRIDE_RDMA_SL
    UCCL_SOCKET_IFNAME UCCL_IB_GID_INDEX UCCL_IB_SL MORI_RDMA_DEVICES
    HYBRID_EP_MULTINODE USE_NIXL RDMA_CORE_HOME DEEPEP_HYBRID_BUILD_MODE
    NCCL_CUMEM_ENABLE NCCL_MNNVL_ENABLE MC_FORCE_MNNVL MORI_DISABLE_AUTO_XGMI
    MORI_ENABLE_SDMA MORI_APP_LOG_LEVEL MORI_SHMEM_LOG_LEVEL MORI_IO_LOG_LEVEL
    MORI_COMMIT
  )
  # "${names[*]}" joins on the first character of IFS, which is scoped to this call.
  local IFS=,
  printf '%s' "${names[*]}"
}
# Succeed (status 0) when $1 is an affirmative flag value (1, true, or yes,
# compared case-insensitively) and fail (status 1) for anything else,
# including the empty string.
cx_bool_enabled() {
  local folded
  folded="$(printf '%s' "$1" | tr '[:upper:]' '[:lower:]')"
  if [ "$folded" = 1 ] || [ "$folded" = true ] || [ "$folded" = yes ]; then
    return 0
  fi
  return 1
}
"$value" in + *'|'*|*$'\n'*|*$'\r'*) cx_die "manual case field contains a record delimiter" ;; + esac + done +} + +cx_require_single_node() { + [ "${CX_NODES:-1}" = "1" ] || cx_die "$1 supports one-node EP only" +} + +# Convert private, runner-local network selectors into the public library +# variables needed inside the container. Values are interface/HCA identifiers, +# never addresses; the rendezvous hostname is derived from the allocation. +cx_apply_network_profile() { + local nodes="$1" transport="$2" selector rdma_name rdma_names="" ep_nic="" + local -a selectors + [[ "$nodes" =~ ^[1-9][0-9]*$ ]] || cx_die "invalid network placement" + unset NCCL_NET NCCL_SOCKET_IFNAME GLOO_SOCKET_IFNAME NCCL_IB_HCA + unset NCCL_IB_GID_INDEX NCCL_IB_SL + unset NVSHMEM_HCA_LIST NVSHMEM_IB_GID_INDEX NVSHMEM_IB_SL + unset NVSHMEM_IB_ENABLE_IBGDA NVSHMEM_IBGDA_NIC_HANDLER + unset EP_NIC_NAME EP_OVERRIDE_RDMA_SL + unset UCCL_SOCKET_IFNAME UCCL_IB_GID_INDEX UCCL_IB_SL MORI_RDMA_DEVICES + [ "$nodes" -gt 1 ] && [ "$transport" != mnnvl ] || return 0 + [ -n "${CX_SOCKET_IFNAME:-}" ] && [ -n "${CX_RDMA_DEVICES:-}" ] \ + || cx_die "multi-node execution requires private socket and RDMA selectors" + if [ -n "${CX_SOCKET_IFNAME:-}" ]; then + [[ "$CX_SOCKET_IFNAME" =~ ^[A-Za-z][A-Za-z0-9_.-]{0,31}(,[A-Za-z][A-Za-z0-9_.-]{0,31})*$ ]] \ + || cx_die "invalid private socket interface selector" + export NCCL_SOCKET_IFNAME="$CX_SOCKET_IFNAME" GLOO_SOCKET_IFNAME="$CX_SOCKET_IFNAME" + export UCCL_SOCKET_IFNAME="$CX_SOCKET_IFNAME" + fi + if [ -n "${CX_RDMA_DEVICES:-}" ]; then + [[ "$CX_RDMA_DEVICES" =~ ^[A-Za-z][A-Za-z0-9_.-]{0,31}(:[1-9][0-9]*)?(,[A-Za-z][A-Za-z0-9_.-]{0,31}(:[1-9][0-9]*)?)*$ ]] \ + || cx_die "invalid private RDMA device selector" + IFS=, read -r -a selectors <<< "$CX_RDMA_DEVICES" + for selector in "${selectors[@]}"; do + rdma_name="${selector%%:*}" + rdma_names="${rdma_names}${rdma_names:+,}${rdma_name}" + [ -n "$ep_nic" ] || ep_nic="$rdma_name" + done + export NCCL_NET=IB 
# Check, on every node of an existing Slurm allocation, that the configured
# scale-out fabric is really present: each socket interface exists and is
# up/unknown, and each RDMA device has an ACTIVE port (with a non-zero GID at
# CX_IB_GID_INDEX when that index is configured).
#
#   $1  Slurm job id of the allocation (positive decimal).
#   $2  node count (positive decimal, no leading zero).
#   $3  scale-out transport name; "mnnvl" needs no RDMA probe.
#
# Returns 0 when nothing needs checking (single node, or mnnvl) or the probe
# passes on all nodes; 1 on malformed arguments or missing selectors; otherwise
# the srun status, after reporting the failure through cx_fail_stage.
# Probe output goes to a private log file only.
cx_validate_network_profile_on_job() {
  local job_id="$1" nodes="$2" transport="$3" log rc=0
  # The node count must be validated before the arithmetic test below:
  # `[ x -gt 1 ]` errors on a non-integer, and that error would otherwise be
  # taken as "single node, nothing to verify" and reported as success.
  [[ "$nodes" =~ ^[1-9][0-9]*$ ]] || return 1
  [ "$nodes" -gt 1 ] && [ "$transport" != mnnvl ] || return 0
  [[ "$job_id" =~ ^[1-9][0-9]*$ ]] || return 1
  [ -n "${CX_SOCKET_IFNAME:-}" ] && [ -n "${CX_RDMA_DEVICES:-}" ] \
    || return 1
  log="$(cx_private_log_path network-profile)" || return 1
  # One task per node; only the host basics and the three selectors are exported.
  srun --jobid="$job_id" --nodes="$nodes" --ntasks="$nodes" --ntasks-per-node=1 \
    --chdir=/tmp --input=all \
    --export="$(cx_host_exports),CX_SOCKET_IFNAME,CX_RDMA_DEVICES,CX_IB_GID_INDEX" \
    bash -s > "$log" 2>&1 <<'BASH' || rc=$?
set -euo pipefail
[[ "$CX_SOCKET_IFNAME" =~ ^[A-Za-z][A-Za-z0-9_.-]{0,31}(,[A-Za-z][A-Za-z0-9_.-]{0,31})*$ ]]
[[ "$CX_RDMA_DEVICES" =~ ^[A-Za-z][A-Za-z0-9_.-]{0,31}(:[1-9][0-9]*)?(,[A-Za-z][A-Za-z0-9_.-]{0,31}(:[1-9][0-9]*)?)*$ ]]
if [ -n "${CX_IB_GID_INDEX:-}" ]; then
  [[ "$CX_IB_GID_INDEX" =~ ^[0-9]+$ ]] && [ "$CX_IB_GID_INDEX" -le 255 ]
fi
IFS=, read -r -a interfaces <<< "$CX_SOCKET_IFNAME"
for interface in "${interfaces[@]}"; do
  [ -d "/sys/class/net/$interface" ]
  state="$(cat "/sys/class/net/$interface/operstate")"
  [ "$state" = up ] || [ "$state" = unknown ]
done
check_port() {
  local port_path="$1" state gid
  [ -d "$port_path" ] || return 1
  read -r state _ < "$port_path/state"
  [ "$state" = 4: ] || return 1
  if [ -n "${CX_IB_GID_INDEX:-}" ]; then
    [ -r "$port_path/gids/$CX_IB_GID_INDEX" ] || return 1
    gid="$(tr -d ':0[:space:]' < "$port_path/gids/$CX_IB_GID_INDEX")"
    [ -n "$gid" ] || return 1
  fi
}
IFS=, read -r -a devices <<< "$CX_RDMA_DEVICES"
for selector in "${devices[@]}"; do
  device="${selector%%:*}"
  configured_port=""
  [ "$selector" = "$device" ] || configured_port="${selector#*:}"
  ports="/sys/class/infiniband/$device/ports"
  [ -d "$ports" ]
  if [ -n "$configured_port" ]; then
    check_port "$ports/$configured_port"
  else
    active=0
    for port_path in "$ports"/*; do
      if check_port "$port_path"; then
        active=1
        break
      fi
    done
    [ "$active" = 1 ]
  fi
done
BASH
  if [ "$rc" != 0 ]; then
    # Classification is informational here; the srun status is what we return.
    cx_fail_stage setup "$log" || true
    return "$rc"
  fi
}
^[1-9][0-9]*$ ]] && [ "$master_port" -le 65535 ] \ + || cx_die "invalid distributed rendezvous port" + export MASTER_ADDR="$master_addr" MASTER_PORT="$master_port" +} + +# Printed into `bash -c` for one Slurm task per GPU. Every rank derives its +# identity from Slurm rather than accepting caller-supplied rank values. +cx_slurm_rank_wrapper() { + cat <<'BASH' +case "${SLURM_PROCID:-}:${SLURM_NTASKS:-}:${SLURM_LOCALID:-}:${SLURM_NODEID:-}" in + *[!0-9:]*|:*|*::*|*:) exit 67 ;; +esac +[ "$SLURM_NTASKS" = "$CX_NGPUS" ] || exit 67 +[ "$SLURM_LOCALID" -lt "$CX_GPUS_PER_NODE" ] || exit 67 +export RANK="$SLURM_PROCID" WORLD_SIZE="$SLURM_NTASKS" +export LOCAL_RANK="$SLURM_LOCALID" LOCAL_WORLD_SIZE="$CX_GPUS_PER_NODE" +case "${CX_PRECISION_PROBE:-0}" in + 1) exec python3 tests/probe_precision.py "$@" ;; + 0|'') exec python3 tests/run_ep.py "$@" ;; + *) exit 67 ;; +esac +BASH +} + +# A set shard path is an execution contract, never a hint. Validate it before +# staging/allocation and again in-container so a missing or stale control file +# cannot silently fall back to a manual single-case run. 
# Validate the shard control named by CX_SHARD_FILE and export its SHA-256.
#   $1 - CollectiveX root; a relative shard path is resolved beneath it.
# Returns 0 immediately when CX_SHARD_FILE is unset/empty (manual run).
# Any validation failure goes through cx_die.
cx_validate_shard_control() {
  local cx_root="$1" shard="${CX_SHARD_FILE:-}" path expected_sku control_sha256
  [ -n "$shard" ] || return 0
  expected_sku="${CX_SHARD_SKU:-}"
  [ -n "$expected_sku" ] || cx_die "CX_SHARD_SKU is required with CX_SHARD_FILE"
  [ -n "${CX_BENCH:-}" ] || cx_die "CX_BENCH is required with CX_SHARD_FILE"
  [[ "${CX_NODES:-}" =~ ^[1-9][0-9]*$ ]] \
    || cx_die "positive CX_NODES is required with CX_SHARD_FILE"
  # Accept the path as given first, then relative to the CollectiveX root.
  path="$shard"
  [ -f "$path" ] || path="${cx_root%/}/$shard"
  [ -f "$path" ] || cx_die "shard control does not exist"
  [ -s "$path" ] || cx_die "shard control is empty"
  # The precision probe and the sweep use different validators with the same
  # --validate-control/--expect-* interface; validator output is discarded.
  if [ "${CX_PRECISION_PROBE:-0}" = 1 ]; then
    python3 "${cx_root%/}/tests/probe_precision.py" \
      --validate-control "$path" --expect-sku "$expected_sku" \
      --expect-backend "$CX_BENCH" --expect-nodes "$CX_NODES" >/dev/null 2>&1 \
      || cx_die "invalid precision probe control"
  else
    python3 "${cx_root%/}/sweep_matrix.py" \
      --validate-control "$path" --expect-sku "$expected_sku" \
      --expect-backend "$CX_BENCH" --expect-nodes "$CX_NODES" >/dev/null 2>&1 \
      || cx_die "invalid shard control"
  fi
  control_sha256="$(sha256sum "$path" | awk '{print $1}')"
  [[ "$control_sha256" =~ ^[0-9a-f]{64}$ ]] \
    || cx_die "cannot hash shard control"
  export COLLECTIVEX_CONTROL_SHA256="$control_sha256"
}

# Print the precision-probe control as one '|'-separated record:
#   id|backend|sku|ep|mode|precision_profile
#   $1 - CollectiveX root used to resolve a relative CX_SHARD_FILE.
# Returns 1 unless CX_PRECISION_PROBE=1. Fields containing a record delimiter
# ('|', newline, or carriage return) are rejected, matching the delimiter set
# enforced for manual case fields.
cx_precision_probe_control_fields() {
  local cx_root="$1" shard="${CX_SHARD_FILE:-}" path
  [ "${CX_PRECISION_PROBE:-0}" = 1 ] || return 1
  path="$shard"
  [ -f "$path" ] || path="${cx_root%/}/$shard"
  python3 - "$path" <<'PY'
import json
import pathlib
import sys

path = pathlib.Path(sys.argv[1])
document = json.loads(path.read_text())
target = document["target"]
values = (
    document["id"], target["backend"], target["sku"], target["ep"],
    target["mode"], target["precision_profile"],
)
if any(
    "|" in str(value) or "\n" in str(value) or "\r" in str(value)
    for value in values
):
    raise SystemExit("unsafe precision probe control field")
print("|".join(map(str, values)))
PY
}

# Split CX_TIMING ("iters:trials:warmup", all positive integers) into the
# exported CX_ITERS / CX_TRIALS / CX_WARMUP. No-op when CX_TIMING is empty.
cx_apply_timing_profile() {
  [ -n "${CX_TIMING:-}" ] || return 0
  local iters trials warmup extra
  # `extra` captures any fourth field so that it can be rejected below.
  IFS=: read -r iters trials warmup extra <<< "$CX_TIMING"
  [[ "$iters" =~ ^[1-9][0-9]*$ && "$trials" =~ ^[1-9][0-9]*$ \
    && "$warmup" =~ ^[1-9][0-9]*$ && -z "$extra" ]] \
    || cx_die "CX_TIMING must be positive iters:trials:warmup"
  export CX_ITERS="$iters" CX_TRIALS="$trials" CX_WARMUP="$warmup"
}

# Use an opaque, execution-bound name so a missing grant message can be
# reconciled without exposing runner or shard details in public logs.
# Prints "cx-" plus the first 24 hex digits of SHA-256(execution id).
cx_scheduler_job_name() {
  local execution_id="${COLLECTIVEX_EXECUTION_ID:-manual-$$}" digest
  digest="$(printf '%s' "$execution_id" | sha256sum | awk '{print $1}')" \
    || return 1
  [[ "$digest" =~ ^[0-9a-f]{64}$ ]] || return 1
  printf 'cx-%s' "${digest:0:24}"
}

# Return 0 after recovering one allocation ID, 2 after three successful empty
# observations, and 1 for every ambiguous or failed lookup. Callers inspect the
# state variables rather than the status because all missing-ID paths still fail.
#   $1 - job name produced by cx_scheduler_job_name.
# Sets JOB_ID (status 0) and clears CX_ALLOCATION_UNCERTAIN (status 0 or 2).
cx_reconcile_salloc_jobid() {
  local job_name="$1" scheduler_user queue_output line delay attempt
  local -a ids=()
  scheduler_user="$(id -un 2>/dev/null)" || return 1
  [[ "$scheduler_user" =~ ^[A-Za-z0-9_.-]+$ \
    && "$job_name" =~ ^cx-[0-9a-f]{24}$ ]] || return 1
  for attempt in 1 2 3; do
    ids=()
    if ! queue_output="$(
      squeue -h --user="$scheduler_user" --name="$job_name" -o %A 2>/dev/null
    )"; then
      return 1
    fi
    while IFS= read -r line; do
      [[ "$line" =~ ^[[:space:]]*$ ]] && continue
      if [[ "$line" =~ ^[[:space:]]*([1-9][0-9]*)[[:space:]]*$ ]]; then
        ids+=("${BASH_REMATCH[1]}")
      else
        # Any non-numeric line makes the observation ambiguous.
        return 1
      fi
    done <<< "$queue_output"
    if [ "${#ids[@]}" -eq 1 ]; then
      JOB_ID="${ids[0]}"
      CX_ALLOCATION_UNCERTAIN=0
      return 0
    fi
    # More than one match cannot be attributed to this execution.
    [ "${#ids[@]}" -eq 0 ] || return 1
    if [ "$attempt" -eq 3 ]; then
      CX_ALLOCATION_UNCERTAIN=0
      return 2
    fi
    # Back off 1s, then 2s, between empty observations.
    delay=$((1 << (attempt - 1)))
    sleep "$delay" || return 1
  done
  return 1
}

# Allocate via salloc's stable grant message and assign JOB_ID in this shell.
# Raw scheduler output remains in the bounded private execution log.
#   $@ - salloc arguments; job-name options are rejected because the name is
#        managed here.
# Returns 0 only when salloc succeeded and an allocation ID was parsed. On
# every failure path JOB_ID and CX_ALLOCATION_UNCERTAIN describe what is known.
cx_salloc_jobid() {
  local log job_id job_name argument salloc_rc=0
  log="$(cx_private_log_path scheduler-allocation)" || return 1
  for argument in "$@"; do
    case "$argument" in
      --job-name|--job-name=*|-J|-J*)
        cx_log "ERROR: scheduler job names are managed by CollectiveX"
        return 1
        ;;
    esac
  done
  job_name="$(cx_scheduler_job_name)" || return 1
  # Start from a known-empty ID: a value inherited from the caller must never
  # be mistaken for the allocation made (or not made) by this call, and an
  # unset variable must not trip `set -u` in the checks below.
  JOB_ID=""
  CX_ALLOCATION_UNCERTAIN=1
  # salloc has no portable --parsable option. Parse the stable grant message
  # used by the production launchers, while also accepting a bare ID from
  # site wrappers.
  salloc "$@" --job-name="$job_name" --no-shell > "$log" 2>&1 || salloc_rc=$?
  job_id="$(sed -nE \
    -e 's/^([0-9]+)(;[^[:space:]]+)?$/\1/p' \
    -e 's/.*Granted job allocation ([0-9]+).*/\1/p' \
    "$log" | head -n1)"
  if [ -n "$job_id" ]; then
    [[ "$job_id" =~ ^[0-9]+$ ]] || return 1
    JOB_ID="$job_id"
    CX_ALLOCATION_UNCERTAIN=0
  fi
  if [ "$salloc_rc" != 0 ]; then
    # Status >= 128 with no parsed ID fails without a scheduler lookup.
    if [ "$salloc_rc" -ge 128 ] && [ -z "$JOB_ID" ]; then
      cx_fail_stage scheduler-allocation "$log"
      return 1
    fi
    [ -n "$JOB_ID" ] || cx_reconcile_salloc_jobid "$job_name" || true
    cx_fail_stage scheduler-allocation "$log"
    return 1
  fi
  if [ -z "$JOB_ID" ]; then
    cx_reconcile_salloc_jobid "$job_name" || true
    cx_fail_stage scheduler-allocation "$log"
    return 1
  fi
}

# Cancel an allocation and wait (1+2+4+8+16+32 s at most) for it to leave the
# queue. Returns 0 once squeue reports nothing for the job, 1 otherwise.
cx_cancel_job() {
  local job_id="$1" active delay
  [[ "$job_id" =~ ^[0-9]+$ ]] || return 1
  scancel "$job_id" >/dev/null 2>&1 || true
  for delay in 1 2 4 8 16 32; do
    # A failed squeue is not evidence of termination; wait and ask again.
    if ! active="$(squeue -h -j "$job_id" -o %A 2>/dev/null)"; then
      sleep "$delay"
      continue
    fi
    [ -n "$active" ] || return 0
    sleep "$delay"
  done
  cx_log "ERROR: scheduled allocation did not terminate during cleanup"
  return 1
}

# Record whether CX_JOB_ROOT may be cleaned up by flipping the marker files
# cleanup-safe / cleanup-unsafe inside it.
#   $1 - "safe" or "unsafe"; anything else returns 1.
# Silently returns 0 when CX_JOB_ROOT does not look like a job root owned by
# this user with mode 700.
# NOTE(review): `umask 077` is applied to the calling shell and is not restored.
cx_write_cleanup_guard() {
  local state="$1" root="${CX_JOB_ROOT:-}" safe unsafe
  [[ "$root" =~ ^/tmp/inferencex-collectivex-[0-9]+-[0-9]+-[A-Za-z0-9._-]+$ ]] \
    && [ -d "$root" ] && [ ! -L "$root" ] \
    && [ "$(stat -c '%u:%a' "$root" 2>/dev/null)" = "$(id -u):700" ] || return 0
  safe="$root/cleanup-safe"
  unsafe="$root/cleanup-unsafe"
  umask 077
  case "$state" in
    safe) : > "$safe" && rm -f -- "$unsafe" ;;
    unsafe) rm -f -- "$safe" && : > "$unsafe" ;;
    *) return 1 ;;
  esac
}

# Single multi-arch container for ALL NVIDIA SKUs: tag `v0.5.11-cu130` is an OCI
# image index covering linux/amd64 (B200) + linux/arm64 (GB200); enroot import
# pulls the matching arch. (cu130 = CUDA 13, system nccl.h in /usr/include, torch 2.9.x.)
# Import remains tag-based because Enroot cannot reliably import a digest-qualified
# Docker Hub reference non-interactively. The registry digest is resolved and checked
# immediately before import, then recorded as verified provenance.
CX_IMAGE_MULTIARCH_DIGEST="sha256:061fb71f838e82000a1768c159654d526c2f17ebe751c21e7fc48ca53c8ef975"
# (v0.5.12-cu130 was rejected: its 62 layers overflow enroot's overlay-based
# squash creation on these nodes — "failed to mount overlay ... Invalid argument".
# v0.5.11-cu130 imports cleanly.)
# Runtime setup verifies the image-bundled DeepEP build for the detected GPU target.
CX_IMAGE_MULTIARCH="lmsysorg/sglang:v0.5.11-cu130"

# AMD (ROCm/CDNA): separate single-arch images bundle MoRI.
CX_IMAGE_AMD_MORI="rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2"
CX_IMAGE_AMD_MORI_DIGEST="sha256:24c3b30d64475937abbb6498e3b29528649adcb836dde7a468979f767809b0e8"
CX_MORI_COMMIT_MI355="99bc0a3a6e7a70aacc6372cd9a4275ccfb4de567" # pragma: allowlist secret
CX_IMAGE_AMD_MORI_MI325="rocm/sgl-dev:sglang-0.5.14-rocm720-mi35x-mori-0701"
CX_IMAGE_AMD_MORI_MI325_DIGEST="sha256:ea42375343c2ef8f73b3bdb9e1b7b435556e3ca92aba5e3f74ada29ba217fabc"
CX_MORI_COMMIT_MI325="bf99bdf18fc69887a346913ca01c315c2aa9bd4c" # pragma: allowlist secret

# Print the default image tag for a runner name prefix ($1); cx_die for
# prefixes that have no registered image.
cx_default_image() {
  case "$1" in
    mi325x*) echo "$CX_IMAGE_AMD_MORI_MI325" ;;
    mi355x*) echo "$CX_IMAGE_AMD_MORI" ;;
    b200*|gb200*|b300*|gb300*|h100*|h200*) echo "$CX_IMAGE_MULTIARCH" ;;
    *) cx_die "no default image for runner prefix: $1" ;;
  esac
}

# Resolve the current registry digest for a Docker Hub image tag.
#   $1 - image reference ("repo[:tag]", optionally prefixed by docker.io/ or
#        registry-1.docker.io/); digest-qualified references are rejected.
# Prints "sha256:<64 hex>" without a trailing newline.
cx_resolve_registry_digest() {
  local image="$1" repository reference token digest registry
  if [[ "$image" == *@* ]]; then
    cx_die "digest-qualified image overrides are unsupported; configure a tag and pinned digest"
  fi
  # A first path component containing '.' or ':' (or "localhost") is a
  # registry host rather than a Docker Hub namespace.
  registry="${image%%/*}"
  if [[ "$image" == */* && ( "$registry" == *.* || "$registry" == *:* || "$registry" = localhost ) ]]; then
    case "$registry" in
      docker.io|registry-1.docker.io) image="${image#*/}" ;;
      *) cx_die "only Docker Hub images are supported by the registry verifier" ;;
    esac
  fi
  repository="${image%:*}"
  reference="${image##*:}"
  # No ':' present: the whole string is the repository and the tag defaults.
  [ "$repository" != "$image" ] || { repository="$image"; reference=latest; }
  [ -n "$repository" ] && [ -n "$reference" ] \
    || cx_die "configured image reference is malformed"
  # Single-component names are mapped into the "library/" namespace.
  [[ "$repository" == */* ]] || repository="library/$repository"
  token="$(curl -fsSLG --connect-timeout 10 --max-time 30 --retry 2 \
    --retry-delay 1 --retry-all-errors 'https://auth.docker.io/token' \
    --data-urlencode 'service=registry.docker.io' \
    --data-urlencode "scope=repository:${repository}:pull" \
    | python3 -c 'import json,sys; print(json.load(sys.stdin)["token"])')" \
    || cx_die "cannot authenticate to the image registry"
  # HEAD the manifest and read the digest from the response headers.
  digest="$(curl -fsSI --connect-timeout 10 --max-time 30 --retry 2 \
    --retry-delay 1 --retry-all-errors \
    -H "Authorization: Bearer $token" \
    -H 'Accept: application/vnd.oci.image.index.v1+json, application/vnd.oci.image.manifest.v1+json, application/vnd.docker.distribution.manifest.list.v2+json, application/vnd.docker.distribution.manifest.v2+json' \
    "https://registry-1.docker.io/v2/${repository}/manifests/${reference}" \
    | tr -d '\r' | awk 'tolower($1)=="docker-content-digest:" {print $2; exit}')" \
    || cx_die "cannot resolve the configured image digest"
  [[ "$digest" =~ ^sha256:[0-9a-f]{64}$ ]] \
    || cx_die "registry returned an invalid image digest"
  printf '%s' "$digest"
}

# Check that the image tag ($1) still resolves to its pinned digest
# (CX_IMAGE_DIGEST, or the built-in pin for a default image), then export
# COLLECTIVEX_IMAGE, COLLECTIVEX_IMAGE_DIGEST and
# COLLECTIVEX_IMAGE_DIGEST_VERIFIED=1.
cx_verify_registry_image() {
  local image="$1" expected actual
  expected="${CX_IMAGE_DIGEST:-$(cx_default_image_digest "$image")}"
  [[ "$expected" =~ ^sha256:[0-9a-f]{64}$ ]] \
    || cx_die "a pinned digest is required for the configured image"
  actual="$(cx_resolve_registry_digest "$image")"
  [ "$actual" = "$expected" ] \
    || cx_die "configured image tag no longer matches its pinned digest"
  export COLLECTIVEX_IMAGE="$image" COLLECTIVEX_IMAGE_DIGEST="$actual"
  export COLLECTIVEX_IMAGE_DIGEST_VERIFIED=1
}

# Print the pinned digest for one of the built-in image tags ($1); prints
# nothing for any other image.
cx_default_image_digest() {
  case "$1" in
    "$CX_IMAGE_MULTIARCH") printf '%s' "$CX_IMAGE_MULTIARCH_DIGEST" ;;
    "$CX_IMAGE_AMD_MORI") printf '%s' "$CX_IMAGE_AMD_MORI_DIGEST" ;;
    "$CX_IMAGE_AMD_MORI_MI325") printf '%s' "$CX_IMAGE_AMD_MORI_MI325_DIGEST" ;;
  esac
}

# Canonical workflow runs must not inherit benchmark controls from a persistent
# self-hosted runner service. Manual/SSH diagnostics retain their explicit
# overrides by leaving COLLECTIVEX_CANONICAL_GHA unset.
cx_gha_workspace_stage_root() {
  local workspace="${GITHUB_WORKSPACE:-}"
  python3 - "$workspace" <<'PY'
import os
import stat
import sys

workspace = sys.argv[1]
try:
    if (
        not os.path.isabs(workspace)
        or os.path.realpath(workspace) != workspace
        or not os.path.isdir(workspace)
    ):
        raise OSError
    metadata = os.stat(workspace, follow_symlinks=False)
    # GitHub runner workspaces are runner-owned but commonly writable by the
    # trusted runner-service group. Keep the child mode 0700 and reject world write.
    if metadata.st_uid != os.getuid() or stat.S_IMODE(metadata.st_mode) & stat.S_IWOTH:
        raise OSError
except OSError:
    raise SystemExit(1)
print(workspace, end="")
PY
}

# Create a per-UID cache under validated cluster-local storage. Only the fixed
# /cx-cache mount enters the container; the operator host path does not.
# Prepare the backend cache directory beneath a configured stage parent.
#   $1 - absolute path of the stage parent directory.
# On success exports CX_PREPARED_BACKEND_CACHE (absolute cache path) and
# CX_BACKEND_CACHE_SENTINEL_SHA256 (SHA-256 of the 32-byte sentinel file).
# Returns 1 on any validation or filesystem failure.
#
# The embedded Python:
#   * rejects a parent that is not absolute or contains a newline/CR, resolves
#     it with realpath, and opens it as a directory descriptor;
#   * creates and removes a mode-0700 probe directory to learn which uid newly
#     created entries actually receive ("realized_owner");
#   * tries cache generation 3 and then 4, named
#     .collectivex-backend-cache-v<generation>-<uid>, requiring that owner and
#     mode 0700; an OSError on generation 3 falls through to generation 4;
#   * writes 32 random bytes to a temporary file and hard-links it to
#     .collectivex-mount-sentinel-v1 unless that name already exists, so an
#     existing sentinel is kept;
#   * re-reads the sentinel, requires a regular mode-0600 32-byte file with the
#     realized owner, and prints "<sha256> <cache path>".
cx_prepare_backend_cache() {
  local stage_parent="$1" cache info sentinel_sha256
  # Drop any inherited values so a failure cannot leave stale exports behind.
  unset CX_PREPARED_BACKEND_CACHE CX_BACKEND_CACHE_SENTINEL_SHA256
  info="$(python3 - "$stage_parent" <<'PY'
import hashlib
import os
import secrets
import stat
import sys

configured_parent = sys.argv[1]
try:
    if (
        not os.path.isabs(configured_parent)
        or "\n" in configured_parent
        or "\r" in configured_parent
    ):
        raise OSError
    parent = os.path.realpath(configured_parent)
    if not os.path.isdir(parent):
        raise OSError
    flags = os.O_RDONLY | os.O_DIRECTORY | getattr(os, "O_NOFOLLOW", 0)
    parent_fd = os.open(parent, flags)
    try:
        probe_name = f".collectivex-owner-probe-{os.getpid()}-{secrets.token_hex(8)}"
        os.mkdir(probe_name, 0o700, dir_fd=parent_fd)
        try:
            probe_fd = os.open(probe_name, flags, dir_fd=parent_fd)
            try:
                probe = os.fstat(probe_fd)
                if stat.S_IMODE(probe.st_mode) & 0o777 != 0o700:
                    raise OSError
                realized_owner = probe.st_uid
            finally:
                os.close(probe_fd)
        finally:
            os.rmdir(probe_name, dir_fd=parent_fd)
        for generation in (3, 4):
            name = f".collectivex-backend-cache-v{generation}-{os.getuid()}"
            try:
                os.mkdir(name, 0o700, dir_fd=parent_fd)
            except FileExistsError:
                pass
            try:
                cache_fd = os.open(name, flags, dir_fd=parent_fd)
                try:
                    metadata = os.fstat(cache_fd)
                    if (
                        metadata.st_uid != realized_owner
                        or stat.S_IMODE(metadata.st_mode) & 0o777 != 0o700
                    ):
                        raise OSError
                    sentinel_name = ".collectivex-mount-sentinel-v1"
                    temporary_name = (
                        f"{sentinel_name}.tmp.{os.getpid()}.{secrets.token_hex(8)}"
                    )
                    create_flags = (
                        os.O_WRONLY | os.O_CREAT | os.O_EXCL
                        | getattr(os, "O_NOFOLLOW", 0)
                    )
                    payload = secrets.token_bytes(32)
                    temporary_fd = os.open(
                        temporary_name, create_flags, 0o600, dir_fd=cache_fd
                    )
                    try:
                        try:
                            view = memoryview(payload)
                            try:
                                while view:
                                    written = os.write(temporary_fd, view)
                                    if written <= 0:
                                        raise OSError
                                    view = view[written:]
                                os.fsync(temporary_fd)
                            finally:
                                view.release()
                        finally:
                            os.close(temporary_fd)
                        try:
                            os.link(
                                temporary_name,
                                sentinel_name,
                                src_dir_fd=cache_fd,
                                dst_dir_fd=cache_fd,
                                follow_symlinks=False,
                            )
                        except FileExistsError:
                            pass
                    finally:
                        try:
                            os.unlink(temporary_name, dir_fd=cache_fd)
                        except FileNotFoundError:
                            pass
                    sentinel_fd = os.open(
                        sentinel_name,
                        os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0),
                        dir_fd=cache_fd,
                    )
                    try:
                        sentinel = os.fstat(sentinel_fd)
                        payload = os.read(sentinel_fd, 33)
                        if (
                            not stat.S_ISREG(sentinel.st_mode)
                            or sentinel.st_uid != realized_owner
                            or stat.S_IMODE(sentinel.st_mode) & 0o777 != 0o600
                            or sentinel.st_size != 32
                            or len(payload) != 32
                        ):
                            raise OSError
                        sentinel_sha256 = hashlib.sha256(payload).hexdigest()
                    finally:
                        os.close(sentinel_fd)
                finally:
                    os.close(cache_fd)
            except OSError:
                if generation == 3:
                    continue
                raise
            break
    finally:
        os.close(parent_fd)
except OSError:
    raise SystemExit(1)
print(sentinel_sha256, os.path.join(parent, name), end="")
PY
)" || return 1
  # The helper prints "<sha256> <path>"; split at the first space only so a
  # cache path containing spaces stays intact.
  sentinel_sha256="${info%% *}"
  cache="${info#* }"
  # "$cache" = "$info" means no space was present, i.e. the output is malformed.
  [ "$cache" != "$info" ] && [[ "$sentinel_sha256" =~ ^[0-9a-f]{64}$ ]] \
    && [[ "$cache" = /* ]] || return 1
  export CX_PREPARED_BACKEND_CACHE="$cache"
  export CX_BACKEND_CACHE_SENTINEL_SHA256="$sentinel_sha256"
}

# Verify that CX_BACKEND_CACHE_ROOT is the cache described by
# CX_BACKEND_CACHE_SENTINEL_SHA256. The root must be an absolute, already
# canonical path to a mode-0700 directory whose .collectivex-mount-sentinel-v1
# is a regular mode-0600 32-byte file, owned by the same uid as the directory,
# whose SHA-256 equals the expected value. Exits non-zero on any mismatch.
cx_verify_backend_cache_mount() {
  python3 - "${CX_BACKEND_CACHE_ROOT:-}" \
    "${CX_BACKEND_CACHE_SENTINEL_SHA256:-}" <<'PY'
import hashlib
import os
import re
import stat
import sys

root, expected = sys.argv[1:]
try:
    if (
        not os.path.isabs(root)
        or os.path.realpath(root) != root
        or re.fullmatch(r"[0-9a-f]{64}", expected) is None
    ):
        raise OSError
    flags = os.O_RDONLY | os.O_DIRECTORY | getattr(os, "O_NOFOLLOW", 0)
    root_fd = os.open(root, flags)
    try:
        root_item = os.fstat(root_fd)
        if (
            not stat.S_ISDIR(root_item.st_mode)
            or stat.S_IMODE(root_item.st_mode) & 0o777 != 0o700
        ):
            raise OSError
        sentinel_fd = os.open(
            ".collectivex-mount-sentinel-v1",
            os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0),
            dir_fd=root_fd,
        )
        try:
            sentinel = os.fstat(sentinel_fd)
            payload = os.read(sentinel_fd, 33)
            if (
                not stat.S_ISREG(sentinel.st_mode)
                or sentinel.st_uid != root_item.st_uid
                or stat.S_IMODE(sentinel.st_mode) & 0o777 != 0o600
                or sentinel.st_size != 32
                or len(payload) != 32
                or hashlib.sha256(payload).hexdigest() != expected
            ):
                raise OSError
        finally:
            os.close(sentinel_fd)
    finally:
        os.close(root_fd)
except OSError:
    raise SystemExit(1)
PY
}

# Run git with system config disabled, global config pointed at /dev/null,
# terminal prompts off, and no credential helper.
#   $@ - git arguments.
cx_git() {
  GIT_CONFIG_NOSYSTEM=1 GIT_CONFIG_GLOBAL=/dev/null GIT_TERMINAL_PROMPT=0 \
    git -c credential.helper= "$@"
}

# Run cx_git inside an existing work tree.
#   $1 - absolute path of a real (non-symlink) directory; remaining arguments
#        are passed to git.
# Returns 1 when the directory is not absolute, is missing, is a symlink, or
# contains '*', a newline, or a carriage return.
cx_git_in_tree() {
  local directory="$1" canonical
  shift
  [[ "$directory" = /* ]] && [ -d "$directory" ] && [ ! -L "$directory" ] \
    || return 1
  [[ "$directory" != *'*'* && "$directory" != *$'\n'* && "$directory" != *$'\r'* ]] \
    || return 1
  # Resolve to the physical path; it is used for both safe.directory and -C.
  canonical="$(cd -P -- "$directory" && pwd -P)" || return 1
  cx_git -c "safe.directory=$canonical" -C "$canonical" "$@"
}

# Fetch exactly one revision into a fresh checkout.
#   $1 - repository URL, $2 - revision, $3 - destination directory.
# Makes up to three attempts, removing the destination before each and
# sleeping 5s then 10s between them. Succeeds only when the checked-out HEAD
# equals the requested revision string.
cx_fetch_revision() {
  local repository="$1" revision="$2" destination="$3" attempt
  for attempt in 1 2 3; do
    rm -rf -- "$destination"
    if cx_git init -q "$destination" \
      && cx_git_in_tree "$destination" remote add origin "$repository" \
      && cx_git_in_tree "$destination" fetch -q --no-tags --depth 1 origin "$revision" \
      && cx_git_in_tree "$destination" -c advice.detachedHead=false \
        checkout -q --detach FETCH_HEAD \
      && [ "$(cx_git_in_tree "$destination" rev-parse HEAD)" = "$revision" ]; then
      return 0
    fi
    [ "$attempt" = 3 ] || sleep $((attempt * 5))
  done
  return 1
}

# Print the '|'-separated source pin for a from-source backend ($1):
#   deepep-v2     -> <commit>|<tree>|<fmt commit>
#   deepep-hybrid -> <commit>|<tree>||<nccl commit>   (third field left empty)
# Returns 1 for any other backend.
cx_backend_source_pin() {
  case "$1" in
    deepep-v2)
      printf '%s|%s|%s' \
        "$CX_DEEPEP_V2_COMMIT" "$CX_DEEPEP_V2_TREE" "$CX_DEEPEP_V2_FMT_COMMIT"
      ;;
    deepep-hybrid)
      printf '%s|%s||%s' "$CX_DEEPEP_HYBRID_COMMIT" "$CX_DEEPEP_HYBRID_TREE" \
        "$CX_DEEPEP_HYBRID_NCCL_COMMIT"
      ;;
    *) return 1 ;;
  esac
}

+cx_backend_source_path() { + local root="$1" backend="$2" revision tree fmt nccl pin + pin="$(cx_backend_source_pin "$backend")" || return 1 + IFS='|' read -r revision tree fmt nccl <<< "$pin" + printf '%s/%s-%s' "$root" "$backend" "$revision" +} + +cx_backend_source_is_valid() { + local backend="$1" source="$2" revision tree fmt nccl pin status ignored + pin="$(cx_backend_source_pin "$backend")" || return 1 + IFS='|' read -r revision tree fmt nccl <<< "$pin" + [ -d "$source" ] && [ ! -L "$source" ] \ + && [ "$(cx_git_in_tree "$source" rev-parse HEAD 2>/dev/null)" = "$revision" ] \ + && [ "$(cx_git_in_tree "$source" rev-parse 'HEAD^{tree}' 2>/dev/null)" = "$tree" ] \ + || return 1 + status="$(cx_git_in_tree "$source" status --porcelain --untracked-files=all \ + --ignore-submodules=none 2>/dev/null)" || return 1 + [ -z "$status" ] || return 1 + ignored="$(cx_git_in_tree "$source" ls-files --others --ignored --exclude-standard \ + 2>/dev/null)" || return 1 + [ -z "$ignored" ] || return 1 + [ -z "$fmt" ] \ + || [ "$(cx_git_in_tree "$source/third-party/fmt" rev-parse HEAD 2>/dev/null)" = "$fmt" ] \ + || return 1 + [ -z "$nccl" ] \ + || [ "$(cx_git_in_tree "$source/third-party/nccl" rev-parse HEAD 2>/dev/null)" = "$nccl" ] +} + +cx_extension_pair_sha256() { + python3 - "$1" "$2" "$3" <<'PY' +import hashlib +import os +from pathlib import Path +import stat +import sys + +root = Path(sys.argv[1]) +digest = hashlib.sha256() +try: + if root.is_symlink() or not root.is_dir(): + raise OSError + for pattern in sys.argv[2:]: + matches = list(root.glob(pattern)) + if len(matches) != 1 or matches[0].is_symlink(): + raise OSError + path = matches[0] + descriptor = os.open(path, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)) + try: + metadata = os.fstat(descriptor) + if not stat.S_ISREG(metadata.st_mode): + raise OSError + file_digest = hashlib.sha256() + with os.fdopen(descriptor, "rb", closefd=False) as stream: + for chunk in iter(lambda: stream.read(1024 * 1024), b""): + 
file_digest.update(chunk) + digest.update(path.name.encode("utf-8") + b"\0") + digest.update(str(metadata.st_size).encode("ascii") + b"\0") + digest.update(file_digest.digest()) + finally: + os.close(descriptor) +except (OSError, UnicodeError): + raise SystemExit(1) +print(digest.hexdigest(), end="") +PY +} + +# Acquire source before compute allocation, preferring the verified same-run GHA seed. +_cx_prepare_backend_source() { + local mount_src="$1" backend="$2" root source temporary revision tree fmt nccl pin + local root_mode stage_mode root_owner stage_owner + local seed_root="${CX_BACKEND_SOURCE_SEED_ROOT:-}" seed seed_mode + root="$mount_src/experimental/CollectiveX/.cx_sources" + CX_BACKEND_SOURCE_STEP="source mount creation" + if [ ! -e "$root" ] && [ ! -L "$root" ]; then + mkdir -m 700 -- "$root" || return 1 + fi + CX_BACKEND_SOURCE_STEP="source mount ownership validation" + [ -d "$mount_src" ] && [ ! -L "$mount_src" ] \ + && [ -d "$root" ] && [ ! -L "$root" ] || return 1 + stage_owner="$(stat -c '%u' "$mount_src" 2>/dev/null)" || return 1 + root_owner="$(stat -c '%u' "$root" 2>/dev/null)" || return 1 + [ "$root_owner" = "$stage_owner" ] || return 1 + stage_mode="$(stat -c '%a' "$mount_src" 2>/dev/null)" || return 1 + case "$stage_mode" in 700|[1-7]700) ;; *) return 1 ;; esac + # Shared stage parents may retain harmless special bits despite mkdir -m. 
+ CX_BACKEND_SOURCE_STEP="source mount permission inspection" + root_mode="$(stat -c '%a' "$root" 2>/dev/null)" || return 1 + case "$root_mode" in + 700|[1-7]700) ;; + *) + CX_BACKEND_SOURCE_STEP="source mount permission normalization" + chmod 700 "$root" || return 1 + CX_BACKEND_SOURCE_STEP="source mount permission validation" + root_mode="$(stat -c '%a' "$root" 2>/dev/null)" || return 1 + case "$root_mode" in 700|[1-7]700) ;; *) return 1 ;; esac + ;; + esac + CX_BACKEND_SOURCE_STEP="git lookup" + command -v git >/dev/null || return 1 + CX_BACKEND_SOURCE_STEP="source pin resolution" + source="$(cx_backend_source_path "$root" "$backend")" || return 1 + if [ -e "$source" ] || [ -L "$source" ]; then + CX_BACKEND_SOURCE_STEP="existing source validation" + cx_backend_source_is_valid "$backend" "$source" + return + fi + if [ -n "$seed_root" ]; then + CX_BACKEND_SOURCE_STEP="source seed validation" + [[ "$seed_root" = /* ]] && [ -d "$seed_root" ] && [ ! -L "$seed_root" ] \ + || return 1 + seed_mode="$(stat -c '%a' "$seed_root" 2>/dev/null)" || return 1 + case "$seed_mode" in 700|[1-7]700) ;; *) return 1 ;; esac + seed="$(cx_backend_source_path "$seed_root" "$backend")" || return 1 + cx_backend_source_is_valid "$backend" "$seed" || return 1 + CX_BACKEND_SOURCE_STEP="source seed copy" + temporary="$(mktemp -d "$root/.${backend}.XXXXXX")" || return 1 + if ! cp -R -- "$seed/." "$temporary/" \ + || ! cx_backend_source_is_valid "$backend" "$temporary" \ + || ! 
mv -- "$temporary" "$source"; then + rm -rf -- "$temporary" + return 1 + fi + return + fi + if [ "${COLLECTIVEX_CANONICAL_GHA:-0}" = 1 ]; then + CX_BACKEND_SOURCE_STEP="source seed validation" + return 1 + fi + CX_BACKEND_SOURCE_STEP="source checkout creation" + temporary="$(mktemp -d "$root/.${backend}.XXXXXX")" || return 1 + CX_BACKEND_SOURCE_STEP="source pin resolution" + pin="$(cx_backend_source_pin "$backend")" || { + rm -rf -- "$temporary" + return 1 + } + IFS='|' read -r revision tree fmt nccl <<< "$pin" + CX_BACKEND_SOURCE_STEP="revision fetch" + if ! cx_fetch_revision \ + https://github.com/deepseek-ai/DeepEP "$revision" "$temporary"; then + rm -rf -- "$temporary" + return 1 + fi + CX_BACKEND_SOURCE_STEP="submodule fetch" + if [ -n "$fmt" ] && ! cx_git_in_tree "$temporary" \ + -c "safe.directory=$temporary/third-party/fmt" \ + submodule update -q --init --depth 1 third-party/fmt; then + rm -rf -- "$temporary" + return 1 + fi + if [ -n "$nccl" ] && ! cx_git_in_tree "$temporary" \ + -c "safe.directory=$temporary/third-party/nccl" \ + submodule update -q --init --depth 1 third-party/nccl; then + rm -rf -- "$temporary" + return 1 + fi + CX_BACKEND_SOURCE_STEP="source publication validation" + if ! cx_backend_source_is_valid "$backend" "$temporary" \ + || ! 
mv -- "$temporary" "$source"; then + rm -rf -- "$temporary" + return 1 + fi +} + +cx_prepare_backend_source() { + local log backend="$2" CX_BACKEND_SOURCE_STEP="initialization" + log="$(cx_private_log_path "backend-source-$backend")" || return 1 + if _cx_prepare_backend_source "$@" > "$log" 2>&1; then + return 0 + fi + printf '%s failed\n' "$CX_BACKEND_SOURCE_STEP" >> "$log" + cx_log "ERROR: backend-source-step=${CX_BACKEND_SOURCE_STEP// /-}" + cx_fail_stage backend-setup "$log" +} + +cx_materialize_backend_source() { + local backend="$1" destination="$2" source parent temporary + [ -n "${CX_BACKEND_SOURCE_ROOT:-}" ] || return 1 + source="$(cx_backend_source_path "$CX_BACKEND_SOURCE_ROOT" "$backend")" || return 1 + cx_backend_source_is_valid "$backend" "$source" || return 1 + parent="${destination%/*}" + [ "$parent" != "$destination" ] && [ -d "$parent" ] && [ ! -L "$parent" ] \ + || return 1 + temporary="$(mktemp -d "$parent/.collectivex-source.XXXXXX")" || return 1 + if ! cp -R -- "$source/." "$temporary/" \ + || ! cx_backend_source_is_valid "$backend" "$temporary"; then + rm -rf -- "$temporary" + return 1 + fi + if ! rm -rf -- "$destination" || ! mv -- "$temporary" "$destination"; then + rm -rf -- "$temporary" + return 1 + fi + if ! 
cx_backend_source_is_valid "$backend" "$destination"; then + rm -rf -- "$destination" + return 1 + fi + return 0 +} + +cx_lock_canonical_gha_env() { + local runner="$1" expected_nodes expected_gpn expected_world trusted_lock_dir="" + local trusted_stage_dir="" + local trusted_socket_ifname="" trusted_rdma_devices="" + local trusted_ib_gid_index="" trusted_rdma_service_level="" + local trusted_audit_salt="" + [ "${COLLECTIVEX_CANONICAL_GHA:-0}" = 1 ] || return 0 + [ "${GITHUB_ACTIONS:-}" = true ] \ + || cx_die "canonical CollectiveX execution requires GitHub Actions" + [ -n "${CX_SHARD_FILE:-}" ] && [ "${CX_SHARD_SKU:-}" = "$runner" ] \ + || cx_die "canonical CollectiveX execution requires a matched shard" + [[ "${GITHUB_RUN_ID:-}" =~ ^[1-9][0-9]*$ \ + && "${GITHUB_RUN_ATTEMPT:-}" =~ ^[1-9][0-9]*$ \ + && "${COLLECTIVEX_SOURCE_SHA:-}" =~ ^[0-9a-f]{40,64}$ ]] \ + || cx_die "canonical CollectiveX workflow identity is incomplete" + + # cx_load_operator_config clears inherited values before setting this process marker. + # Preserve only values parsed from that private strict document. 
+ if [ "${COLLECTIVEX_OPERATOR_CONFIG_LOADED:-}" = "$$" ]; then + trusted_lock_dir="${CX_LOCK_DIR:-}" + trusted_stage_dir="${CX_STAGE_DIR:-}" + trusted_socket_ifname="${CX_SOCKET_IFNAME:-}" + trusted_rdma_devices="${CX_RDMA_DEVICES:-}" + trusted_ib_gid_index="${CX_IB_GID_INDEX:-}" + trusted_rdma_service_level="${CX_RDMA_SERVICE_LEVEL:-}" + trusted_audit_salt="${CX_AUDIT_SALT:-}" + fi + unset CX_NCCL_HOME CX_MASTER_PORT CX_MORI_KERNEL_TYPE CX_LOCK_DIR CX_STAGE_DIR + unset MASTER_ADDR MASTER_PORT RANK WORLD_SIZE LOCAL_RANK LOCAL_WORLD_SIZE + unset CX_SOCKET_IFNAME CX_RDMA_DEVICES CX_IB_GID_INDEX CX_RDMA_SERVICE_LEVEL + unset CX_AUDIT_SALT + unset NCCL_NET NCCL_SOCKET_IFNAME GLOO_SOCKET_IFNAME NCCL_IB_HCA + unset NCCL_IB_GID_INDEX NCCL_IB_SL + unset NVSHMEM_HCA_LIST NVSHMEM_IB_GID_INDEX NVSHMEM_IB_SL + unset NVSHMEM_IB_ENABLE_IBGDA NVSHMEM_IBGDA_NIC_HANDLER + unset EP_NIC_NAME EP_OVERRIDE_RDMA_SL + unset UCCL_SOCKET_IFNAME UCCL_IB_GID_INDEX UCCL_IB_SL MORI_RDMA_DEVICES + unset HYBRID_EP_MULTINODE USE_NIXL RDMA_CORE_HOME DEEPEP_HYBRID_BUILD_MODE + unset MORI_COMMIT MORI_DISABLE_AUTO_XGMI MORI_ENABLE_SDMA + unset MORI_APP_LOG_LEVEL MORI_SHMEM_LOG_LEVEL MORI_IO_LOG_LEVEL + unset NCCL_CUMEM_ENABLE NCCL_MNNVL_ENABLE MC_FORCE_MNNVL + unset CX_BACKEND_CACHE_ROOT CX_BACKEND_CACHE_SENTINEL_SHA256 + unset CX_PREPARED_BACKEND_CACHE CX_BACKEND_SOURCE_ROOT + + [ -n "${CX_SQUASH_DIR:-}" ] \ + || cx_die "canonical CollectiveX execution requires shared container storage" + [ -n "$trusted_stage_dir" ] \ + || cx_die "canonical CollectiveX execution requires a configured shared stage directory" + [[ "$trusted_audit_salt" =~ ^[0-9a-f]{64}$ ]] \ + || cx_die "canonical CollectiveX execution requires a private audit salt" + + case "$runner" in + h100-dgxc|h200-dgxc|b200-dgxc|b300) + expected_nodes="${CX_NODES:-}"; expected_gpn=8 + [ "$expected_nodes" = 1 ] || [ "$expected_nodes" = 2 ] \ + || cx_die "canonical NVIDIA execution requires one or two nodes" + CX_IMAGE="$CX_IMAGE_MULTIARCH" + 
CX_IMAGE_DIGEST="$CX_IMAGE_MULTIARCH_DIGEST" + CX_NCCL_HOME=/usr + ;; + gb200|gb300) + expected_nodes="${CX_NODES:-}"; expected_gpn=4 + [ "$expected_nodes" = 2 ] || [ "$expected_nodes" = 4 ] \ + || cx_die "canonical GB execution requires two or four trays" + CX_IMAGE="$CX_IMAGE_MULTIARCH" + CX_IMAGE_DIGEST="$CX_IMAGE_MULTIARCH_DIGEST" + CX_NCCL_HOME=/usr + CX_MASTER_PORT=29551 + ;; + mi325x) + expected_nodes="${CX_NODES:-}"; expected_gpn=8 + [ "$expected_nodes" = 1 ] || [ "$expected_nodes" = 2 ] \ + || cx_die "canonical AMD execution requires one or two nodes" + CX_IMAGE="$CX_IMAGE_AMD_MORI_MI325" + CX_IMAGE_DIGEST="$CX_IMAGE_AMD_MORI_MI325_DIGEST" + if [ "$expected_nodes" = 2 ]; then + CX_MORI_KERNEL_TYPE=internode-v1 + else + CX_MORI_KERNEL_TYPE=asyncll + fi + MORI_COMMIT="$CX_MORI_COMMIT_MI325" + MORI_DISABLE_AUTO_XGMI=0 + MORI_ENABLE_SDMA=1 + MORI_APP_LOG_LEVEL=info + MORI_SHMEM_LOG_LEVEL=info + MORI_IO_LOG_LEVEL=info + ;; + mi355x) + expected_nodes="${CX_NODES:-}"; expected_gpn=8 + [ "$expected_nodes" = 1 ] || [ "$expected_nodes" = 2 ] \ + || cx_die "canonical AMD execution requires one or two nodes" + CX_IMAGE="$CX_IMAGE_AMD_MORI" + CX_IMAGE_DIGEST="$CX_IMAGE_AMD_MORI_DIGEST" + if [ "$expected_nodes" = 2 ]; then + CX_MORI_KERNEL_TYPE=internode-v1 + else + CX_MORI_KERNEL_TYPE=intranode + fi + MORI_COMMIT="$CX_MORI_COMMIT_MI355" + ;; + *) cx_die "canonical CollectiveX runner is not registered" ;; + esac + case "$runner:$trusted_lock_dir" in + mi325x:?*|mi355x:?*) export CX_LOCK_DIR="$trusted_lock_dir" ;; + esac + CX_STAGE_DIR="$trusted_stage_dir" + [ -z "$trusted_socket_ifname" ] \ + || export CX_SOCKET_IFNAME="$trusted_socket_ifname" + [ -z "$trusted_rdma_devices" ] \ + || export CX_RDMA_DEVICES="$trusted_rdma_devices" + [ -z "$trusted_ib_gid_index" ] \ + || export CX_IB_GID_INDEX="$trusted_ib_gid_index" + [ -z "$trusted_rdma_service_level" ] \ + || export CX_RDMA_SERVICE_LEVEL="$trusted_rdma_service_level" + CX_AUDIT_SALT="$trusted_audit_salt" + export 
CX_STAGE_DIR CX_AUDIT_SALT + [ "${CX_NODES:-}" = "$expected_nodes" ] \ + && [ "${CX_GPUS_PER_NODE:-}" = "$expected_gpn" ] \ + || cx_die "canonical CollectiveX placement differs from the shard" + expected_world=$((expected_nodes * expected_gpn)) + CX_NGPUS="$expected_world" + CX_SEED=67 + case "$runner" in mi325x|mi355x) CX_RUN_TIMEOUT=1800 ;; *) CX_RUN_TIMEOUT=900 ;; esac + unset CX_PUBLIC_RUNNER CX_GB_PRODUCT CX_DRYRUN CX_TIMING CX_ALLOW_MNNVL + unset CX_ENROOT_LOCAL_IMPORT COLLECTIVEX_IMAGE COLLECTIVEX_IMAGE_DIGEST + unset COLLECTIVEX_IMAGE_DIGEST_VERIFIED COLLECTIVEX_SQUASH_SHA256 + export CX_IMAGE CX_IMAGE_DIGEST CX_NGPUS CX_SEED CX_RUN_TIMEOUT + case "$runner" in + h100-dgxc|h200-dgxc|b200-dgxc|b300) export CX_NCCL_HOME ;; + gb200|gb300) export CX_NCCL_HOME CX_MASTER_PORT ;; + mi325x) + export CX_MORI_KERNEL_TYPE MORI_COMMIT MORI_DISABLE_AUTO_XGMI MORI_ENABLE_SDMA + export MORI_APP_LOG_LEVEL MORI_SHMEM_LOG_LEVEL MORI_IO_LOG_LEVEL + ;; + mi355x) export CX_MORI_KERNEL_TYPE MORI_COMMIT ;; + esac +} + +cx_reverify_registry_image() { + local image="$1" actual + [[ "${COLLECTIVEX_IMAGE_DIGEST:-}" =~ ^sha256:[0-9a-f]{64}$ ]] \ + && [ "${COLLECTIVEX_IMAGE_DIGEST_VERIFIED:-0}" = 1 ] || return 1 + actual="$(cx_resolve_registry_digest "$image")" || return 1 + [ "$actual" = "$COLLECTIVEX_IMAGE_DIGEST" ] || { + cx_log "ERROR: configured image tag changed during container import" + return 1 + } +} + +cx_export_squash_identity() { + local image="$1" digest log + log="$(cx_private_log_path container-hash)" + digest="$(sha256sum "$image" 2>> "$log" | awk '{print $1}')" + [[ "$digest" =~ ^[0-9a-f]{64}$ ]] \ + || { cx_fail_stage container-hash "$log"; return 1; } + export COLLECTIVEX_SQUASH_SHA256="$digest" +} + +cx_squash_path() { + local squash_dir="$1" image="$2" key platform + [[ "${COLLECTIVEX_IMAGE_DIGEST:-}" =~ ^sha256:[0-9a-f]{64}$ ]] \ + || return 1 + case "${CX_IMAGE_PLATFORM:-}" in + linux/amd64) platform="" ;; + linux/arm64) platform="_linux_arm64" ;; + *) return 1 
;; + esac + key="${CX_SQUASH_FORMAT_VERSION}${platform}_${COLLECTIVEX_IMAGE_DIGEST#sha256:}_$( + printf '%s' "$image" | sed 's#[/:@#]#_#g' + )" + printf '%s' "$squash_dir/${key}.sqsh" +} + +# cx_ensure_squash -> echoes the squash file path. +# Imports via Enroot only if a valid squash is not already present, under a lock. +cx_ensure_squash() { + local squash_dir="$1" image="$2" key sq locks lock_fd log + local enroot_local="" import_rc=0 machine + log="$(cx_private_log_path container-import)" + machine="$(uname -m)" + case "${CX_IMAGE_PLATFORM:-}:$machine" in + linux/amd64:x86_64|linux/amd64:amd64|linux/arm64:aarch64|linux/arm64:arm64) ;; + *) cx_fail_stage container-import "$log"; return 1 ;; + esac + mkdir -p "$squash_dir" 2>> "$log" \ + || { cx_fail_stage container-import "$log"; return 1; } + sq="$(cx_squash_path "$squash_dir" "$image")" \ + || { cx_fail_stage container-import "$log"; return 1; } + key="${sq##*/}" + key="${key%.sqsh}" + locks="$squash_dir/.locks" + mkdir -p "$locks" 2>> "$log" \ + || { cx_fail_stage container-import "$log"; return 1; } + { exec {lock_fd}>"$locks/${key}.lock"; } 2>> "$log" \ + || { cx_fail_stage container-import "$log"; return 1; } + flock -w 900 "$lock_fd" 2>> "$log" \ + || { cx_fail_stage container-import "$log"; return 1; } + if unsquashfs -l "$sq" >/dev/null 2>&1; then + cx_log "container squash ready" + else + cx_log "importing configured container image" + rm -f "$sq" 2>> "$log" \ + || { cx_fail_stage container-import "$log"; return 1; } + # > "$log" 2>&1 || import_rc=$? + rm -rf -- "$enroot_local" >/dev/null 2>&1 || true + [ "$import_rc" = 0 ] \ + || { cx_fail_stage container-import "$log"; return 1; } + else + SOURCE_DATE_EPOCH="$CX_SQUASH_SOURCE_DATE_EPOCH" \ + enroot import -o "$sq" "docker://$image" > "$log" 2>&1 \ + || { cx_fail_stage container-import "$log"; return 1; } + fi + unsquashfs -l "$sq" >> "$log" 2>&1 \ + || { cx_fail_stage container-import "$log"; return 1; } + fi + if ! 
cx_reverify_registry_image "$image" >> "$log" 2>&1; then + flock -u "$lock_fd" >/dev/null 2>&1 || true + exec {lock_fd}>&- + cx_fail_stage container-import "$log" + return 1 + fi + flock -u "$lock_fd" + exec {lock_fd}>&- + echo "$sq" +} + +# Import on an allocated compute node so multiarch tags resolve for the target +# architecture. The squash directory must be shared with the submit host. +cx_ensure_squash_on_job() { + local job_id="$1" squash_dir="$2" image="$3" lock_dir="${4:-}" sq key lock log + [[ "$job_id" =~ ^[0-9]+$ ]] || return 1 + sq="$(cx_squash_path "$squash_dir" "$image")" || return 1 + key="${sq##*/}" + key="${key%.sqsh}" + [ -n "$lock_dir" ] || lock_dir="$squash_dir/.locks" + lock="$lock_dir/${key}.lock" + log="$(cx_private_log_path container-import)" + if ! srun --jobid="$job_id" --nodes=1 --ntasks=1 --chdir=/tmp \ + --export="$(cx_host_exports)" \ + bash -s -- "$sq" "$lock" "$image" "$CX_SQUASH_SOURCE_DATE_EPOCH" \ + "$CX_IMAGE_PLATFORM" \ + > "$log" 2>&1 <<'BASH' +set -euo pipefail +sq="$1"; lock="$2"; image="$3"; source_date_epoch="$4"; platform="$5" +machine="$(uname -m)" +case "$platform:$machine" in + linux/amd64:x86_64|linux/amd64:amd64|linux/arm64:aarch64|linux/arm64:arm64) ;; + *) exit 13 ;; +esac +compute_home="$(mktemp -d /tmp/inferencex-collectivex-home.XXXXXX)" +trap 'rm -rf -- "$compute_home"' EXIT +export HOME="$compute_home" XDG_CACHE_HOME="$compute_home/.cache" +export ENROOT_TEMP_PATH="$compute_home/enroot-tmp" +export ENROOT_CACHE_PATH="$compute_home/enroot-cache" +export ENROOT_DATA_PATH="$compute_home/enroot-data" +export ENROOT_RUNTIME_PATH="$compute_home/enroot-run" +mkdir -p "$(dirname "$sq")" "$(dirname "$lock")" \ + "$ENROOT_TEMP_PATH" "$ENROOT_CACHE_PATH" "$ENROOT_DATA_PATH" "$ENROOT_RUNTIME_PATH" +exec 9>"$lock" +flock -w 900 9 +if unsquashfs -l "$sq" >/dev/null 2>&1; then + echo 'container squash ready' +else + rm -f -- "$sq" + SOURCE_DATE_EPOCH="$source_date_epoch" \ + enroot import -o "$sq" "docker://$image" /dev/null 
2>&1 +fi +BASH + then + cx_fail_stage container-import "$log" + return 1 + fi + if ! cx_reverify_registry_image "$image" >> "$log" 2>&1; then + cx_fail_stage container-import "$log" + return 1 + fi + printf '%s' "$sq" +} + +cx_preflight_allocation() { + local job_id="$1" nodes="$2" mount_src="$3" squash="$4" shard="${5:-}" + local log rc=0 runtime shard_path="" probe_root probe_token index + runtime="$mount_src/experimental/CollectiveX/runtime/run_in_container.sh" + [ -z "$shard" ] || shard_path="$mount_src/experimental/CollectiveX/$shard" + log="$(cx_private_log_path allocation-preflight)" + probe_root="$mount_src/.collectivex-preflight" + probe_token="$probe_root/source" + if [ -e "$probe_root" ] || [ -L "$probe_root" ] \ + || ! mkdir -m 700 "$probe_root"; then + cx_fail_stage repository-stage "$log" + return 1 + fi + if ! printf '%s\n' "${COLLECTIVEX_EXECUTION_ID:-manual-$$}" > "$probe_token" \ + || ! chmod 600 "$probe_token"; then + chmod 700 "$probe_root" >/dev/null 2>&1 || true + rm -rf -- "$probe_root" >/dev/null 2>&1 || true + cx_fail_stage repository-stage "$log" + return 1 + fi + srun --jobid="$job_id" --nodes="$nodes" --ntasks="$nodes" --ntasks-per-node=1 \ + --chdir=/tmp --input=all \ + --export="$(cx_host_exports)" bash -s -- "$runtime" "$shard_path" "$squash" \ + "$CX_IMAGE_PLATFORM" "$probe_root" \ + > "$log" 2>&1 <<'BASH' || rc=$? +set -euo pipefail +machine="$(uname -m)" +case "$4:$machine" in + linux/amd64:x86_64|linux/amd64:amd64|linux/arm64:aarch64|linux/arm64:arm64) ;; + *) exit 13 ;; +esac +test -r "$1" || exit 10 +[ -z "$2" ] || test -r "$2" || exit 11 +test -r "$3" || exit 12 +unsquashfs -s "$3" >/dev/null 2>&1 || exit 12 +case "${SLURM_NODEID:-}" in ""|*[!0-9]*) exit 10 ;; esac +[ -d "$5" ] && [ ! 
-L "$5" ] && [ -r "$5/source" ] || exit 10 +(set -C; cat "$5/source" > "$5/node-$SLURM_NODEID") || exit 10 +cmp -s -- "$5/source" "$5/node-$SLURM_NODEID" || exit 10 +BASH + if [ "$rc" = 0 ]; then + for ((index = 0; index < nodes; index++)); do + if ! cmp -s -- "$probe_token" "$probe_root/node-$index"; then + rc=10 + break + fi + done + fi + if [ -d "$probe_root" ] && [ ! -L "$probe_root" ]; then + chmod 700 "$probe_root" >/dev/null 2>&1 || rc=10 + fi + rm -rf -- "$probe_root" >/dev/null 2>&1 || rc=10 + [ "$rc" = 0 ] && return 0 + case "$rc" in + 10|11) cx_fail_stage repository-stage "$log" ;; + 12) cx_fail_stage container-hash "$log" ;; + *) cx_fail_stage container-launch "$log" ;; + esac + return 1 +} + +# Resolve the exact per-execution child before any copy starts, so the parent +# EXIT trap can remove an interrupted partial stage. The configured base must +# already exist on compute-visible storage and must not traverse symlinks. +cx_stage_path() { + local repo_root="$1" stage_base="${2:-}" tag safe_tag stage_path + tag="${COLLECTIVEX_EXECUTION_ID:-${GITHUB_RUN_ID:-manual-$$}}" + [[ "$tag" =~ ^[A-Za-z0-9][A-Za-z0-9._-]*$ ]] \ + || cx_die "invalid staging execution identity" + safe_tag="$(printf '%s' "$tag" | tr -c 'A-Za-z0-9._-' '_')" + if [ -z "$stage_base" ] || [ "$stage_base" = "$repo_root" ]; then + [ "${COLLECTIVEX_CANONICAL_GHA:-0}" != 1 ] \ + || cx_die "canonical CollectiveX execution requires compute-visible staging" + [ -n "${CX_SQUASH_DIR:-}" ] \ + || cx_die "manual CollectiveX staging requires CX_SQUASH_DIR" + stage_base="$CX_SQUASH_DIR" + stage_path="${stage_base%/}/.collectivex-stage-$safe_tag" + else + stage_path="${stage_base%/}/job_$safe_tag" + fi + python3 - "$repo_root" "$stage_base" "$stage_path" \ + "${CX_JOB_ROOT:-}" "${GITHUB_WORKSPACE:-}" <<'PY' +import os +import stat +import sys + +repo, base, child, job_root, workspace = sys.argv[1:] +try: + if ( + not os.path.isabs(repo) + or os.path.realpath(repo) != repo + or not os.path.isabs(base) 
+ or os.path.realpath(base) != base + or not os.path.isabs(child) + or os.path.dirname(child) != base.rstrip("/") + or os.path.lexists(child) + ): + raise OSError + metadata = os.stat(base, follow_symlinks=False) + excluded = [repo] + excluded.extend(path for path in (job_root, workspace) if path) + for path in excluded: + resolved = os.path.realpath(path) + if os.path.commonpath((base, resolved)) == resolved: + raise OSError + if ( + not stat.S_ISDIR(metadata.st_mode) + or metadata.st_uid != os.getuid() + or stat.S_IMODE(metadata.st_mode) & (stat.S_IWGRP | stat.S_IWOTH) + or not os.access(base, os.W_OK | os.X_OK) + ): + raise OSError +except OSError: + raise SystemExit(1) +print(child, end="") +PY +} + +# Stage only the public benchmark tree into a pre-resolved, private execution +# child. A runner-owned marker makes recursive cleanup an explicit capability. +cx_stage_repo() { + local repo_root="$1" stage_dir="$2" expected log tag marker + cx_validate_shard_control "$repo_root/experimental/CollectiveX" + expected="$(cx_stage_path "$repo_root" "${CX_STAGE_DIR:-}")" \ + || cx_die "configured stage base is unavailable or unsafe" + [ "$stage_dir" = "$expected" ] \ + || cx_die "execution stage differs from the configured stage base" + tag="${COLLECTIVEX_EXECUTION_ID:-${GITHUB_RUN_ID:-manual-$$}}" + if [ -e "$stage_dir" ] || [ -L "$stage_dir" ]; then + cx_die "refusing to reuse a pre-existing execution stage" + fi + mkdir -m 700 "$stage_dir" 2>/dev/null \ + || cx_die "cannot create the configured stage directory" + chmod 700 "$stage_dir" 2>/dev/null \ + || cx_die "cannot protect the configured stage directory" + marker="$stage_dir/.collectivex-stage-v1" + umask 077 + (set -C; printf 'collectivex-stage-v1\n%s\n' "$tag" > "$marker") 2>/dev/null \ + || cx_die "cannot claim the configured stage directory" + chmod 600 "$marker" 2>/dev/null \ + || cx_die "cannot protect the configured stage directory" + mkdir -m 700 "$stage_dir/experimental" 2>/dev/null \ + || cx_die "cannot 
create the configured stage directory" + cx_log "staging CollectiveX on compute-visible storage" + log="$(cx_private_log_path repository-stage)" + if ! rsync -a --delete --delete-excluded \ + --exclude='__pycache__/' --exclude='results/' --exclude='.cx_workloads/' \ + --exclude='.cx_backend/' --exclude='.cx_sources/' \ + --exclude='configs/platforms.yaml' --exclude='private-infra.md' \ + --exclude='goal.md' --exclude='notes.md' \ + "$repo_root/experimental/CollectiveX" "$stage_dir/experimental/" > "$log" 2>&1; then + rm -rf -- "$stage_dir" >/dev/null 2>&1 \ + || cx_log "ERROR: cannot remove the incomplete execution stage" + cx_fail_stage repository-stage "$log" || true + return 1 + fi +} + +# cx_collect_results +# When the run used a staged (compute-visible) mount, copy result JSONs back to +# the original checkout's results/ so the workflow's upload-artifact (which reads +# the checkout, not the stage dir) finds them. No-op when no staging was used. +cx_collect_results() { + local mount_src="$1" repo_root="$2" dst log + local -a files + [ "$mount_src" = "$repo_root" ] && return 0 + log="$(cx_private_log_path "artifact-collection-$$-${RANDOM}")" + dst="$repo_root/experimental/CollectiveX/results" + mkdir -p "$dst" 2>> "$log" \ + || { cx_log "ERROR: cannot create checkout result directory"; return 1; } + shopt -s nullglob + files=("$mount_src/experimental/CollectiveX/results/"*.json) + shopt -u nullglob + [ "${#files[@]}" -gt 0 ] || { cx_log "ERROR: staged run produced no result JSON"; return 1; } + cp -- "${files[@]}" "$dst/" >> "$log" 2>&1 \ + || { cx_log "ERROR: staged result collection failed"; return 1; } + cx_log "collected staged results for artifact validation" +} + +cx_cleanup_stage() { + local mount_src="$1" repo_root="$2" base="${CX_STAGE_DIR:-}" tag safe_tag expected + tag="${COLLECTIVEX_EXECUTION_ID:-${GITHUB_RUN_ID:-manual-$$}}" + safe_tag="$(printf '%s' "$tag" | tr -c 'A-Za-z0-9._-' '_')" + [ "$mount_src" != "$repo_root" ] || return 0 + if [ -n 
"$base" ] && [ "$base" != "$repo_root" ]; then + expected="${base%/}/job_$safe_tag" + else + [ -n "${CX_SQUASH_DIR:-}" ] \ + || { cx_log "ERROR: cannot identify the generated stage directory"; return 1; } + expected="${CX_SQUASH_DIR%/}/.collectivex-stage-$safe_tag" + fi + if [ "$mount_src" != "$expected" ] || [ "$mount_src" = / ] \ + || { [ -n "$base" ] && [ "$mount_src" = "$base" ]; }; then + cx_log "ERROR: refusing to remove an unrecognized stage directory" + return 1 + fi + if ! python3 - "$mount_src" "$tag" <<'PY' +import os +from pathlib import Path +import stat +import sys + +root = Path(sys.argv[1]) +expected = f"collectivex-stage-v1\n{sys.argv[2]}\n" +try: + metadata = os.stat(root, follow_symlinks=False) + marker = root / ".collectivex-stage-v1" + if ( + not stat.S_ISDIR(metadata.st_mode) + or metadata.st_uid != os.getuid() + or (stat.S_IMODE(metadata.st_mode) & 0o777) != 0o700 + ): + raise OSError + entries = list(root.iterdir()) + if marker.exists(): + marker_metadata = os.stat(marker, follow_symlinks=False) + if ( + not stat.S_ISREG(marker_metadata.st_mode) + or marker_metadata.st_uid != os.getuid() + or stat.S_IMODE(marker_metadata.st_mode) != 0o600 + ): + raise OSError + marker_content = marker.read_text() + if marker_content != expected and entries != [marker]: + raise OSError + elif entries: + raise OSError +except (OSError, UnicodeError): + raise SystemExit(1) +PY + then + cx_log "ERROR: refusing to remove an unowned stage directory" + return 1 + fi + rm -rf -- "$mount_src" >/dev/null 2>&1 || { + cx_log "ERROR: cannot remove generated stage directory" + return 1 + } + cx_log "removed generated per-execution stage directory" +} + +# Return success only when a benchmark output is a complete JSON result object. +# Callers use this before synthesizing a terminal outcome so an emitted invalid result +# is not shadowed by a second record for the same attempt. 
cx_has_result_doc() {
  # Exit status is that of `contracts.py probe`; everything the probe prints is discarded.
  local doc_path="$1"
  python3 "$_CX_COMMON_ROOT/contracts.py" probe "$doc_path" &>/dev/null
}

# cx_result_doc_is <path> <expected-status>
# Same probe as cx_has_result_doc, additionally passing `--status <expected-status>`.
cx_result_doc_is() {
  local doc_path="$1" wanted_status="$2"
  python3 "$_CX_COMMON_ROOT/contracts.py" probe "$doc_path" \
    --status "$wanted_status" &>/dev/null
}

# A rank-zero result can be written before another rank or backend teardown fails. Preserve its
# measurements, but make the distributed command's nonzero terminal status authoritative.
cx_demote_result_doc() {
  local doc_path="$1" exit_code="$2"
  python3 "$_CX_COMMON_ROOT/contracts.py" demote "$doc_path" --return-code "$exit_code"
}

# cx_quarantine_result_doc <path>
# Delegates to `contracts.py quarantine-invalid`; its output and exit status pass through.
cx_quarantine_result_doc() {
  local doc_path="$1"
  python3 "$_CX_COMMON_ROOT/contracts.py" quarantine-invalid "$doc_path"
}

# cx_emit_ep_failed_case <out> <backend> <phase> <rc>
# Preserve failures from rack launchers that invoke run_ep.py directly and therefore cannot use
# run_in_container.sh's emitter. Case identity is read from the exported CX_* variables.
cx_emit_ep_failed_case() {
  local out="$1" backend="$2" phase="$3" rc="$4"
  local -a cli
  cli=(emit-terminal --out "$out" --backend "$backend" --phase "$phase" --return-code "$rc")
  # The failure mode is optional: forward it only when the caller exported a non-empty value.
  if [ -n "${CX_FAILURE_MODE:-}" ]; then
    cli+=(--failure-mode "$CX_FAILURE_MODE")
  fi
  python3 "$_CX_COMMON_ROOT/contracts.py" "${cli[@]}" || {
    cx_log "ERROR: could not preserve terminal outcome"
    return 1
  }
}

# cx_case_attempt_exists <out_dir> <case_id>
# Scans <out_dir>/*.json and succeeds when a raw-attempt or terminal document names <case_id>.
# As a side effect, documents that fail loading/validation and sample files that no raw attempt
# references are renamed with a ".quarantine" suffix.
cx_case_attempt_exists() {
  local results_dir="$1" wanted_case="$2"
  python3 - "$_CX_COMMON_ROOT" "$results_dir" "$wanted_case" <<'PY'
import pathlib
import sys

sys.path.insert(0, sys.argv[1])
import contracts

results_dir = pathlib.Path(sys.argv[2])
wanted_case = sys.argv[3]
seen_samples = set()
claimed_samples = set()
matched = False


def set_aside(target):
    # Rename in place by appending ".quarantine" to the file name.
    target.replace(target.with_name(target.name + ".quarantine"))


def quarantine(path, document):
    # Move a rejected document aside, together with the sample file it names (if any).
    sample = document.get("sample_artifact") if isinstance(document, dict) else None
    if isinstance(sample, dict):
        name = sample.get("path")
        # Only a bare file name is followed; a value with a directory part is ignored.
        if isinstance(name, str) and pathlib.Path(name).name == name:
            sibling = path.with_name(name)
            if sibling.is_file():
                set_aside(sibling)
    if path.is_file():
        set_aside(path)


for path in results_dir.glob("*.json"):
    document = None
    try:
        document = contracts.strict_load(path)
        if not isinstance(document, dict):
            continue
        kind = document.get("format")
        if kind == contracts.RAW_FORMAT:
            document = contracts.load_raw_attempt(path)
            claimed_samples.add(path.with_name(document["sample_artifact"]["path"]))
        elif kind == contracts.TERMINAL_FORMAT:
            document = contracts.validate_terminal_document(document)
        elif kind == contracts.SAMPLES_FORMAT:
            # Sample files carry no case identity; remember them for the orphan sweep below.
            contracts.validate_samples_document(document)
            seen_samples.add(path)
            continue
        else:
            continue
    except (contracts.ContractError, OSError, ValueError):
        quarantine(path, document)
        continue
    if document["identity"]["case_id"] == wanted_case:
        matched = True

# Valid sample files that no raw attempt pointed at are quarantined as orphans.
for orphan in seen_samples - claimed_samples:
    quarantine(orphan, {})
raise SystemExit(0 if matched else 1)
PY
}

# Emit one setup-failure record per requested case. Rack launchers call this when
# backend preparation fails before rank processes can start.
+cx_emit_setup_failures() { + local root="$1" out_dir="$2" backend="$3" rc="$4" shard="${CX_SHARD_FILE:-}" path + local phase case_id suite workload required routing eplb ep hidden topk experts nodes + local gpn domain ladder canonical timing mode scope scale_up_transport scale_out_transport + local warmup_semantics precision_profile + local transport topology_class + local cases_file expected emitted=0 covered=0 + mkdir -p "$out_dir" || return 1 + export CX_FAILURE_MODE="${CX_FAILSAFE_MODE:-setup}" CX_ATTEMPT_ID=1 + if [ -z "$shard" ]; then + local phases="${CX_PHASE:-decode}" + [ "$phases" = both ] && phases="decode prefill" + for phase in $phases; do + if [ -n "${CX_CASE_ID:-}" ] && cx_case_attempt_exists "$out_dir" "$CX_CASE_ID"; then + continue + fi + cx_emit_ep_failed_case "$out_dir/failed_${backend}_${phase}_${CX_TS:-setup}-a01.json" \ + "$backend" "$phase" "$rc" || return 1 + done + unset CX_FAILURE_MODE + return 0 + fi + path="$shard" + [ -f "$path" ] || path="${root%/}/$shard" + [ -f "$path" ] || { + unset CX_FAILURE_MODE + cx_log "ERROR: cannot emit setup failures without shard control" + return 1 + } + export COLLECTIVEX_CONTROL_SHA256 + COLLECTIVEX_CONTROL_SHA256="$(sha256sum "$path" | awk '{print $1}')" + [[ "$COLLECTIVEX_CONTROL_SHA256" =~ ^[0-9a-f]{64}$ ]] || { + unset CX_FAILURE_MODE COLLECTIVEX_CONTROL_SHA256 + cx_log "ERROR: cannot hash shard for setup-failure records" + return 1 + } + cases_file="$(mktemp)" || return 1 + if ! 
python3 - "$path" > "$cases_file" <<'PY' +import json, sys + +with open(sys.argv[1]) as handle: + cases = json.load(handle)["cases"] +for case in cases: + fields = ( + case["phase"], case["mode"], case["case_id"], case["suite"], case["workload"], + case["required_publication"], case["routing"], "1" if case["eplb"] else "", + case["ep"], case["hidden"], case["topk"], case["experts"], case["nodes"], + case["gpus_per_node"], case["scale_up_domain"], case["scope"], + case["scale_up_transport"], case.get("scale_out_transport") or "", + case["transport"], case["topology_class"], case["ladder"], + case["warmup_semantics"], + "1" if case["canonical"] else "", case["timing"], + case.get("precision_profile") or "", + ) + print("|".join(map(str, fields))) +PY + then + rm -f "$cases_file" + unset CX_FAILURE_MODE + return 1 + fi + expected="$(wc -l < "$cases_file" | tr -d ' ')" + [ "$expected" -gt 0 ] || { rm -f "$cases_file"; unset CX_FAILURE_MODE; return 1; } + while IFS='|' read -r phase mode case_id suite workload required routing eplb ep hidden topk \ + experts nodes gpn domain scope scale_up_transport scale_out_transport transport \ + topology_class ladder warmup_semantics canonical timing precision_profile; do + export CX_CASE_ID="$case_id" CX_SUITE="$suite" CX_WORKLOAD_NAME="$workload" + export CX_REQUIRED_PUBLICATION="$required" CX_ROUTING="$routing" CX_EPLB="$eplb" + export CX_EP="$ep" CX_NGPUS="$ep" CX_HIDDEN="$hidden" CX_TOPK="$topk" CX_EXPERTS="$experts" + export CX_MODE="$mode" CX_NODES="$nodes" CX_GPUS_PER_NODE="$gpn" + export CX_SCALE_UP_DOMAIN="$domain" CX_SCOPE="$scope" + export CX_SCALE_UP_TRANSPORT="$scale_up_transport" + export CX_SCALE_OUT_TRANSPORT="$scale_out_transport" + export CX_TRANSPORT="$transport" CX_TOPO="$topology_class" + export CX_TOKENS_LADDER="$ladder" CX_CANONICAL="$canonical" + export CX_PRECISION_PROFILE="$precision_profile" + export CX_WARMUP_SEMANTICS="$warmup_semantics" + IFS=: read -r CX_ITERS CX_TRIALS CX_WARMUP <<< "$timing" + 
export CX_ITERS CX_TRIALS CX_WARMUP CX_SAMPLES_PER_POINT="$((CX_ITERS * CX_TRIALS))" + if cx_case_attempt_exists "$out_dir" "$case_id"; then + covered=$((covered + 1)) + continue + fi + cx_emit_ep_failed_case "$out_dir/failed_${case_id}-a01.json" "$backend" "$phase" "$rc" || return 1 + emitted=$((emitted + 1)) + done < "$cases_file" + rm -f "$cases_file" + unset CX_FAILURE_MODE + [ "$((emitted + covered))" -eq "$expected" ] || { + cx_log "ERROR: covered $((emitted + covered))/$expected terminal cases" + return 1 + } +} + +# Run one validated shard with one Slurm task per GPU. Launchers provide only +# allocation/container policy through globals and CX_DISTRIBUTED_CONTAINER_ARGS. +# shellcheck disable=SC2153 +cx_run_distributed_shard() { + local build_log build_rc cases_file expected_cases ci=0 failed_cases=0 + local ph mode routing eplb hidden topk experts ladder suite workload required_pub + local canonical case_id ep timing case_iters case_trials case_warmup case_stem + local scope scale_up_transport scale_out_transport transport topology_class nodes gpn domain + local precision_profile + local workload_dir workload_ladder workload_log stage_rc attempt_tag out failure_out + local runtime_log run_rc expected_out case_ok summary_log + local -a container_args workload_args ep_args + [ "${NODES:-0}" -gt 1 ] && [ "${NGPUS:-0}" = "$((NODES * GPN))" ] \ + || cx_die "invalid distributed launcher placement" + [ -n "${JOB_ID:-}" ] && [ -n "${SQUASH_FILE:-}" ] \ + && [ -n "${CONTAINER_MOUNTS:-}" ] || cx_die "distributed launcher is incomplete" + [ -n "${SOURCE_BACKEND_ENV:-}" ] && [ -n "${BACKEND_PROBE:-}" ] \ + && [ -n "${WRAP:-}" ] || cx_die "distributed rank wrapper is incomplete" + + cx_resolve_slurm_rendezvous "$JOB_ID" + mkdir -p "$MOUNT_SRC/experimental/CollectiveX/results" + container_args=(--container-mounts="$CONTAINER_MOUNTS" --no-container-mount-home + --container-workdir=/ix/experimental/CollectiveX --no-container-entrypoint) + if declare -p 
CX_DISTRIBUTED_CONTAINER_ARGS >/dev/null 2>&1; then + container_args+=("${CX_DISTRIBUTED_CONTAINER_ARGS[@]}") + fi + local container_name="cxep_${JOB_ID}" + + cx_log "distributed backend preparation: bench=$CX_BENCH nodes=$NODES" + cx_set_failure_stage backend-setup + build_log="$(cx_private_log_path backend-prepare)" + set +e + srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks-per-node=1 --chdir=/tmp \ + --container-name="$container_name" --container-image="$SQUASH_FILE" \ + "${container_args[@]}" --export="$(cx_container_exports),CX_BUILD_ONLY=1" \ + bash /ix/experimental/CollectiveX/runtime/run_in_container.sh \ + "$build_log" 2>&1 + build_rc=$? + if [ "$build_rc" = 0 ]; then + srun --jobid="$JOB_ID" --nodes="$NODES" --ntasks-per-node=1 --chdir=/tmp \ + --container-name="$container_name" "${container_args[@]}" \ + --export="$(cx_container_exports)" bash -c "$BACKEND_PROBE" \ + >"$build_log" 2>&1 + build_rc=$? + fi + set -e + if [ "$build_rc" != 0 ]; then + cx_fail_stage backend-setup "$build_log" || true + [ "${CX_PRECISION_PROBE:-0}" != 1 ] || return "$build_rc" + cx_emit_setup_failures "$CX_DIR" "$MOUNT_SRC/experimental/CollectiveX/results" \ + "$CX_BENCH" "$build_rc" + return "$build_rc" + fi + cx_set_failure_stage execution + + if [ "${CX_PRECISION_PROBE:-0}" = 1 ]; then + local fields probe_id backend sku ep mode profile + fields="$(cx_precision_probe_control_fields "$CX_DIR")" || return 1 + IFS='|' read -r probe_id backend sku ep mode profile <<< "$fields" + [ "$backend" = "$CX_BENCH" ] && [ "$sku" = "$RUNNER" ] && [ "$ep" = "$NGPUS" ] \ + || cx_die "precision probe control differs from runtime placement" + out="results/${probe_id}.json" + expected_out="$MOUNT_SRC/experimental/CollectiveX/$out" + runtime_log="$(cx_private_log_path precision-probe)" + set +e + timeout -k 30 "${CX_RUN_TIMEOUT:-900}" srun --jobid="$JOB_ID" --nodes="$NODES" \ + --ntasks="$NGPUS" --ntasks-per-node="$GPN" --chdir=/tmp \ + --container-name="$container_name" "${container_args[@]}" 
\ + --export="$(cx_container_exports)" \ + bash -c "$WRAP" _ --backend "$backend" --sku "$sku" --ep "$ep" \ + --mode "$mode" --precision-profile "$profile" --out "$out" \ + "$runtime_log" 2>&1 + run_rc=$? + set -e + if [ "$run_rc" != 0 ] || ! python3 "$CX_DIR/tests/probe_precision.py" \ + --validate-manifest "$expected_out" >/dev/null 2>&1; then + [ "$run_rc" != 0 ] || run_rc=1 + cx_fail_stage execution "$runtime_log" || true + return "$run_rc" + fi + return 0 + fi + + cases_file="$(mktemp)" || return 1 + local shard="${CX_SHARD_FILE:-}" + [ -z "$shard" ] || [ -f "$shard" ] || shard="$CX_DIR/$shard" + if [ -n "$shard" ]; then + if [ ! -f "$shard" ] || ! python3 - "$shard" > "$cases_file" <<'PY' +import json +import sys + +with open(sys.argv[1]) as handle: + cases = json.load(handle)["cases"] +for case in cases: + get = lambda key, default="": str(case.get(key) or default) + fields = ( + get("phase", "decode"), get("mode", "normal"), get("routing", "uniform"), + "1" if case.get("eplb") else "", get("hidden", "7168"), + get("topk", "8"), get("experts", "256"), get("ladder"), + get("suite"), get("workload"), get("required_publication"), + "1" if case.get("canonical") else "", get("case_id"), get("ep"), + get("timing", "8:64:32"), get("nodes"), get("gpus_per_node"), + get("scale_up_domain"), get("scope"), get("scale_up_transport"), + get("scale_out_transport"), get("transport"), get("topology_class"), + get("precision_profile"), + ) + print("|".join(fields)) +PY + then + rm -f "$cases_file" + cx_die "could not enumerate validated shard cases" + fi + else + local phases="${CX_PHASE:-decode}" phase + [ "$phases" = both ] && phases="decode prefill" + cx_require_record_safe "$phases" "${CX_MODE:-normal}" "${CX_ROUTING:-uniform}" \ + "${CX_EPLB:-}" "${CX_HIDDEN:-7168}" "${CX_TOPK:-8}" "${CX_EXPERTS:-256}" \ + "${CX_TOKENS_LADDER:-}" "${CX_SUITE:-}" "${CX_WORKLOAD_NAME:-}" \ + "${CX_REQUIRED_PUBLICATION:-}" "${CX_CANONICAL:-}" "${CX_CASE_ID:-}" \ + 
"${CX_PRECISION_PROFILE:-}" \ + "${CX_ITERS:-8}" "${CX_TRIALS:-64}" "${CX_WARMUP:-32}" \ + "${CX_SCOPE:-scale-up}" \ + "${CX_SCALE_UP_TRANSPORT:-unknown}" "${CX_SCALE_OUT_TRANSPORT:-}" \ + "${CX_TRANSPORT:-unknown}" "${CX_TOPO:-manual}" + for phase in $phases; do + (IFS='|'; printf '%s\n' "$phase|${CX_MODE:-normal}|${CX_ROUTING:-uniform}|${CX_EPLB:-}|${CX_HIDDEN:-7168}|${CX_TOPK:-8}|${CX_EXPERTS:-256}|${CX_TOKENS_LADDER:-}|${CX_SUITE:-}|${CX_WORKLOAD_NAME:-}|${CX_REQUIRED_PUBLICATION:-}|${CX_CANONICAL:-}|${CX_CASE_ID:-}|$NGPUS|${CX_ITERS:-8}:${CX_TRIALS:-64}:${CX_WARMUP:-32}|$NODES|$GPN|$SCALE_UP_DOMAIN|${CX_SCOPE:-scale-up}|${CX_SCALE_UP_TRANSPORT:-unknown}|${CX_SCALE_OUT_TRANSPORT:-}|${CX_TRANSPORT:-unknown}|${CX_TOPO:-manual}|${CX_PRECISION_PROFILE:-}") + done > "$cases_file" + fi + expected_cases="$(wc -l < "$cases_file" | tr -d ' ')" + [ "$expected_cases" -gt 0 ] \ + || { rm -f "$cases_file"; cx_die "distributed case list is empty"; } + + while IFS='|' read -r ph mode routing eplb hidden topk experts ladder suite workload \ + required_pub canonical case_id ep timing nodes gpn domain scope scale_up_transport \ + scale_out_transport transport topology_class precision_profile; do + [ -n "$ph" ] || continue + ci=$((ci + 1)) + case_stem="${RUNNER}_${CX_BENCH}_${ph}_${TS}-c$(printf '%03d' "$ci")" + IFS=: read -r case_iters case_trials case_warmup <<< "${timing:-8:64:32}" + case_iters="${case_iters:-8}" + case_trials="${case_trials:-64}" + case_warmup="${case_warmup:-32}" + ep="${ep:-$NGPUS}" + export CX_MODE="$mode" CX_CASE_ID="$case_id" CX_SUITE="$suite" CX_WORKLOAD_NAME="$workload" + export CX_REQUIRED_PUBLICATION="$required_pub" CX_CANONICAL="$canonical" CX_EP="$ep" + export CX_PRECISION_PROFILE="$precision_profile" + export CX_ROUTING="$routing" CX_EPLB="$eplb" CX_TOKENS_LADDER="$ladder" + export CX_HIDDEN="$hidden" CX_TOPK="$topk" CX_EXPERTS="$experts" + export CX_NODES="$nodes" CX_GPUS_PER_NODE="$gpn" CX_SCALE_UP_DOMAIN="$domain" + export CX_SCOPE="$scope" 
CX_SCALE_UP_TRANSPORT="$scale_up_transport" + export CX_SCALE_OUT_TRANSPORT="$scale_out_transport" + export CX_TRANSPORT="$transport" CX_TOPO="$topology_class" + export CX_ITERS="$case_iters" CX_TRIALS="$case_trials" CX_WARMUP="$case_warmup" + export CX_SAMPLES_PER_POINT="$((case_iters * case_trials))" + export CX_WARMUP_SEMANTICS="full-roundtrip-before-each-component-trial-point-v1" + cx_log "EP${NGPUS}[$ci] id=${case_id:-manual} $mode/$ph $CX_BENCH" + if [ "$ep" != "$NGPUS" ] || [ "$nodes" != "$NODES" ] || [ "$gpn" != "$GPN" ] \ + || [ "$domain" != "$SCALE_UP_DOMAIN" ]; then + export CX_ATTEMPT_ID=1 + failure_out="$MOUNT_SRC/experimental/CollectiveX/results/failed_${case_stem}-a01.json" + cx_emit_ep_failed_case "$failure_out" "$CX_BENCH" "$ph" 5 + failed_cases=$((failed_cases + 1)) + continue + fi + + workload_dir="" + if cx_bool_enabled "$canonical"; then + workload_dir=".cx_workloads/c$(printf '%03d' "$ci")" + workload_ladder="$ladder" + [ -n "$workload_ladder" ] \ + || workload_ladder="1 2 4 8 16 32 64 128 256 512 1024 2048 4096" + workload_args=(python3 tests/make_workloads.py --out-dir "$workload_dir" + --routing "$routing" --ep "$ep" --hidden "$hidden" --topk "$topk" + --experts "$experts" --seed "${CX_SEED:-67}" --tokens-ladder "$workload_ladder") + workload_log="$(cx_private_log_path "workload-c$(printf '%03d' "$ci")")" + set +e + srun --jobid="$JOB_ID" --nodes=1 --ntasks=1 --chdir=/tmp \ + --container-name="$container_name" "${container_args[@]}" \ + --export="$(cx_container_exports)" "${workload_args[@]}" \ + "$workload_log" 2>&1 + stage_rc=$? 
+ set -e + if [ "$stage_rc" != 0 ]; then + export CX_ATTEMPT_ID=1 + failure_out="$MOUNT_SRC/experimental/CollectiveX/results/failed_${case_stem}-a01.json" + cx_emit_ep_failed_case "$failure_out" "$CX_BENCH" "$ph" "$stage_rc" + failed_cases=$((failed_cases + 1)) + continue + fi + fi + + ep_args=(--backend "$CX_BENCH" --mode "$mode" --phase "$ph" --routing "$routing" + --precision-profile "$precision_profile" + --gpus-per-node "$gpn" --scale-up-domain "$domain" --scope "$scope" + --scale-up-transport "$scale_up_transport" --scale-out-transport "$scale_out_transport" + --tokens-ladder "$ladder" --hidden "$hidden" --topk "$topk" --experts "$experts" + --warmup "$case_warmup" --iters "$case_iters" --trials "$case_trials" + --seed "${CX_SEED:-67}" --runner "$RUNNER" --topology-class "$topology_class" + --transport "$transport" --case-id "$case_id" --suite "$suite" + --workload-name "$workload" --required-publication "$required_pub" + --qualification-index "${CX_QUALIFICATION_INDEX:-1}") + cx_bool_enabled "$eplb" && ep_args+=(--eplb) + [ -z "$workload_dir" ] || ep_args+=(--workload-dir "$workload_dir") + export CX_ATTEMPT_ID=1 + attempt_tag=a01 + out="results/${case_stem}_${attempt_tag}.json" + failure_out="$MOUNT_SRC/experimental/CollectiveX/results/failed_${case_stem}-${attempt_tag}.json" + runtime_log="$(cx_private_log_path "runtime-c$(printf '%03d' "$ci")-$attempt_tag")" + set +e + timeout -k 30 "${CX_RUN_TIMEOUT:-900}" srun --jobid="$JOB_ID" --nodes="$NODES" \ + --ntasks="$NGPUS" --ntasks-per-node="$GPN" --chdir=/tmp \ + --container-name="$container_name" "${container_args[@]}" \ + --export="$(cx_container_exports)" \ + bash -c "$WRAP" _ "${ep_args[@]}" --out "$out" \ + "$runtime_log" 2>&1 + run_rc=$? 
+ set -e + expected_out="$MOUNT_SRC/experimental/CollectiveX/$out" + case_ok=0 + if [ "$run_rc" = 0 ] && cx_result_doc_is "$expected_out" success; then + case_ok=1 + elif [ "$run_rc" = 0 ] && cx_result_doc_is "$expected_out" invalid; then + cx_log "ERROR: EP${NGPUS}[$ci] completed with invalid semantic evidence" + else + [ "$run_rc" != 0 ] || run_rc=1 + if cx_has_result_doc "$expected_out"; then + cx_demote_result_doc "$expected_out" "$run_rc" \ + || { cx_quarantine_result_doc "$expected_out"; cx_emit_ep_failed_case "$failure_out" "$CX_BENCH" "$ph" "$run_rc"; } + else + cx_quarantine_result_doc "$expected_out" + cx_emit_ep_failed_case "$failure_out" "$CX_BENCH" "$ph" "$run_rc" + fi + fi + if [ "$case_ok" = 0 ]; then + [ "$run_rc" = 0 ] || cx_fail_stage execution "$runtime_log" || true + failed_cases=$((failed_cases + 1)) + fi + done < "$cases_file" + rm -f "$cases_file" + [ "$ci" -eq "$expected_cases" ] \ + || cx_die "enumerated $expected_cases cases but executed $ci" + if [ "$failed_cases" -ne 0 ]; then + summary_log="$(cx_private_log_path shard-summary)" + printf 'SHARD done: %s/%s case(s) failed\n' "$failed_cases" "$expected_cases" \ + > "$summary_log" + cx_fail_stage execution "$summary_log" || true + return 1 + fi + return 0 +} + +cx_launcher_cleanup() { + local rc="$1" stage_root="${MOUNT_SRC:-}" source_root out_dir allocation_stopped=1 + source_root="${stage_root:-${REPO_ROOT:-}}" + trap - EXIT + if [ -n "${COLLECTIVEX_EPHEMERAL_CONFIG_PATH:-}" ]; then + rm -f -- "$COLLECTIVEX_EPHEMERAL_CONFIG_PATH" >/dev/null 2>&1 || true + unset COLLECTIVEX_EPHEMERAL_CONFIG_PATH + fi + if [ -n "${JOB_ID:-}" ]; then + if ! 
cx_cancel_job "$JOB_ID"; then + allocation_stopped=0 + [ "$rc" != 0 ] || rc=1 + fi + elif [ "${CX_ALLOCATION_UNCERTAIN:-0}" = 1 ]; then + allocation_stopped=0 + [ "$rc" != 0 ] || rc=1 + fi + if [ "$allocation_stopped" = 1 ]; then + cx_write_cleanup_guard safe || true + else + cx_write_cleanup_guard unsafe || true + fi + [ "$allocation_stopped" = 1 ] || source_root="${REPO_ROOT:-$source_root}" + if [ "$rc" != 0 ] && [ "${CX_PRECISION_PROBE:-0}" != 1 ] \ + && [ -n "${REPO_ROOT:-}" ] && [ -n "${CX_BENCH:-}" ]; then + cx_log "ERROR: terminal-failure-class=${CX_FAILSAFE_MODE:-setup}" + [ -d "$source_root/experimental/CollectiveX" ] || source_root="$REPO_ROOT" + out_dir="$source_root/experimental/CollectiveX/results" + cx_emit_setup_failures \ + "$source_root/experimental/CollectiveX" "$out_dir" "$CX_BENCH" "$rc" || true + [ "$source_root" = "$REPO_ROOT" ] \ + || cx_collect_results "$source_root" "$REPO_ROOT" || true + fi + if [ "$allocation_stopped" = 1 ] && [ -n "${REPO_ROOT:-}" ] \ + && [ -n "$stage_root" ] && [ "$stage_root" != "$REPO_ROOT" ]; then + if ! cx_cleanup_stage "$stage_root" "$REPO_ROOT"; then + [ "$rc" != 0 ] || rc=1 + fi + fi + [ "${COLLECTIVEX_CANONICAL_GHA:-0}" = 1 ] || cx_cleanup_private_logs "$rc" + exit "$rc" +} + +cx_install_launcher_fail_safe() { + CX_ALLOCATION_UNCERTAIN=0 + trap 'cx_launcher_cleanup "$?"' EXIT +} diff --git a/experimental/CollectiveX/runtime/run_in_container.sh b/experimental/CollectiveX/runtime/run_in_container.sh new file mode 100644 index 000000000..eeb8f632f --- /dev/null +++ b/experimental/CollectiveX/runtime/run_in_container.sh @@ -0,0 +1,1116 @@ +#!/usr/bin/env bash +# CollectiveX — generic in-container benchmark dispatcher (single-node). +# +# Runs INSIDE the container under `srun` for single-node shards. The GB EP8 launcher invokes +# run_ep.py directly across nodes. The SKU adapter handles allocation/container/transport-env; +# this script selects one EP backend from CX_BENCH and writes result JSON under results/. 
+# +# Required env (exported by the adapter): CX_RUNNER CX_NGPUS CX_TS CX_TOPO +# Selector: CX_BENCH = deepep | deepep-v2 | mori | uccl | nccl-ep | deepep-hybrid +# EP knobs passed to tests/run_ep.py: +# CX_PHASE = decode | prefill | both (default decode) <- picks the token sweep +# CX_TOKENS_LADDER (space/comma sep; blank = phase default) +# CX_HIDDEN CX_TOPK CX_EXPERTS CX_ROUTING CX_SEED CX_ITERS +set -euo pipefail + +cd /ix/experimental/CollectiveX +# shellcheck source=../runtime/common.sh +source runtime/common.sh +mkdir -p results +cx_write_runtime_stage backend-setup || cx_die "cannot record runtime stage" + +: "${CX_RUNNER:?CX_RUNNER not set}" +: "${CX_NGPUS:?CX_NGPUS not set}" +: "${CX_TS:?CX_TS not set}" +: "${CX_TOPO:?CX_TOPO not set}" +CX_BENCH="${CX_BENCH:-deepep}" +CX_TRANSPORT="${CX_TRANSPORT:-}" + +cx_apply_timing_profile + +cx_log "in-container: runner=$CX_RUNNER ngpus=$CX_NGPUS bench=$CX_BENCH topo=$CX_TOPO" + +# Blank ladders use the phase default in tests/run_ep.py. +cx_ep_ladder() { + printf '%s' "${CX_TOKENS_LADDER:-}" +} + +# Canonical workload staging. Every SKU/backend generates identical canonical array bytes and +# content IDs in-container; the NPZ container bytes themselves are not an identity boundary. When CX_CANONICAL=1 +# (and CX_WORKLOAD_DIR not already provided) we generate routing traces for the run's ladder +# into a NON-results dir (.cx_workloads/ — so the *.manifest.json never pollute the results glob) and +# point run_ep at it. Raw attempts remain diagnostic until the publisher validates full coverage. +cx_stage_canonical() { + cx_bool_enabled "${CX_CANONICAL:-0}" || return 0 + [ -n "${CX_WORKLOAD_DIR:-}" ] && return 0 + local dir="$PWD/.cx_workloads" + local ladder; ladder="$(cx_ep_ladder)" + # cover both phase ladders when none is given, so either phase finds its files. 
+ [ -z "$ladder" ] && ladder="1 2 4 8 16 32 64 128 256 512 1024 2048 4096" + cx_log "staging canonical workloads (routing=${CX_ROUTING:-uniform} ep=$CX_NGPUS ladder='$ladder')" + python3 tests/make_workloads.py --out-dir "$dir" --routing "${CX_ROUTING:-uniform}" \ + --ep "$CX_NGPUS" --hidden "${CX_HIDDEN:-7168}" --topk "${CX_TOPK:-8}" \ + --experts "${CX_EXPERTS:-256}" --seed "${CX_SEED:-67}" --tokens-ladder "$ladder" \ + || { cx_log "ERROR: canonical workload staging failed"; return 1; } + export CX_WORKLOAD_DIR="$dir" + cx_log "canonical workloads staged at $dir" +} + +# run_ep_suite +# One tests/run_ep.py invocation per phase (decode/prefill/both); dispatch and +# combine are timed separately inside it. One JSON per (backend, phase). +# Preserve a failed case with its full scheduled identity instead of letting it vanish. +emit_failed_case() { # backend phase rc + cx_emit_ep_failed_case \ + "results/failed_${CX_RUNNER}_${1}_${2}_${CX_TS}.json" "$1" "$2" "$3" || true +} + +run_ep_suite() { + local backend="$1" phase phases ladder failure_kind rc=0 rc_run + ladder="$(cx_ep_ladder)" + phases="${CX_PHASE:-decode}" + [ "$phases" = "both" ] && phases="decode prefill" + if ! 
cx_stage_canonical; then + for phase in $phases; do + emit_failed_case "$backend" "$phase" 2 + done + return 1 + fi + for phase in $phases; do + cx_log "ep backend=$backend phase=$phase ngpus=$CX_NGPUS ladder='${ladder:-}'" + local out="results/${CX_RUNNER}_${backend}_${phase}_${CX_TS}.json" + local -a EPARGS=(--backend "$backend" --mode "${CX_MODE:-normal}" --phase "$phase" + --precision-profile "${CX_PRECISION_PROFILE:-}" + --tokens-ladder "$ladder" + --hidden "${CX_HIDDEN:-7168}" --topk "${CX_TOPK:-8}" --experts "${CX_EXPERTS:-256}" + --routing "${CX_ROUTING:-uniform}" --seed "${CX_SEED:-67}" --iters "${CX_ITERS:-8}" + --trials "${CX_TRIALS:-64}" --warmup "${CX_WARMUP:-32}" + --gpus-per-node "${CX_GPUS_PER_NODE:-0}" --scale-up-domain "${CX_SCALE_UP_DOMAIN:-0}" + --scope "${CX_SCOPE:-scale-up}" --scale-up-transport "${CX_SCALE_UP_TRANSPORT:-unknown}" + --scale-out-transport "${CX_SCALE_OUT_TRANSPORT:-}" + --case-id "${CX_CASE_ID:-}" --suite "${CX_SUITE:-}" --workload-name "${CX_WORKLOAD_NAME:-}" + --required-publication "${CX_REQUIRED_PUBLICATION:-}" + --qualification-index "${CX_QUALIFICATION_INDEX:-1}" + --runner "$CX_RUNNER" --topology-class "$CX_TOPO" --transport "$CX_TRANSPORT" + --out "$out") + cx_bool_enabled "${CX_EPLB:-0}" && EPARGS+=(--eplb) + [ -n "${CX_WORKLOAD_DIR:-}" ] && EPARGS+=(--workload-dir "$CX_WORKLOAD_DIR") + cx_write_runtime_stage execution || cx_die "cannot record runtime stage" + if timeout -k 30 "${CX_RUN_TIMEOUT:-900}" \ + torchrun --nproc_per_node="$CX_NGPUS" tests/run_ep.py "${EPARGS[@]}"; then + rc_run=0 + else + rc_run=$? + fi + if [ "$rc_run" = 0 ] && cx_result_doc_is "$out" invalid; then + cx_log "WARN: $backend $phase completed with invalid semantic evidence" + rc=1 + continue + fi + if [ "$rc_run" = 0 ] && ! 
cx_result_doc_is "$out" success; then + rc_run=1 + fi + if [ "$rc_run" != 0 ]; then + failure_kind=failed + [ "$rc_run" != 124 ] && [ "$rc_run" != 137 ] || failure_kind="timed out" + if [ "$failure_kind" = "timed out" ]; then + cx_log "WARN: $backend $phase run timed out rc=$rc_run (limit=${CX_RUN_TIMEOUT:-900}s)" + else + cx_log "WARN: $backend $phase run failed rc=$rc_run" + fi + if cx_has_result_doc "$out"; then + cx_demote_result_doc "$out" "$rc_run" \ + || { cx_quarantine_result_doc "$out"; emit_failed_case "$backend" "$phase" "$rc_run"; } + cx_log "preserved benchmark output as a failed attempt" + else + cx_quarantine_result_doc "$out" + emit_failed_case "$backend" "$phase" "$rc_run" + fi + rc=1 + fi + done + return "$rc" +} + +# Resolve and verify the actual CUDA target before compiling source kernels. +cx_cuda_arch() { + local expected detected + case "$CX_RUNNER" in + h100*|h200*) expected="9.0" ;; + b200*|gb200*) expected="10.0" ;; + b300*|gb300*) expected="10.3" ;; + *) cx_log "ERROR: no CUDA target registered for $CX_RUNNER"; return 1 ;; + esac + detected="$(python3 - <<'PY' +import torch + +major, minor = torch.cuda.get_device_capability() +print(f"{major}.{minor}") +PY +)" || return 1 + [ "$detected" = "$expected" ] || { + cx_log "ERROR: $CX_RUNNER expected CUDA target $expected, detected $detected" + return 1 + } + printf '%s' "$detected" +} + +cx_nvidia_package_root() { + local package="$1" component="$2" + python3 - "$package" "$component" <<'PY' +from importlib import metadata +from pathlib import Path, PurePosixPath +import sys + +package, component = sys.argv[1:] +try: + distribution = metadata.distribution(package) + prefix = f"nvidia/{component}/" + entries = [str(entry).replace("\\", "/") for entry in distribution.files or ()] + if not any(entry.startswith(prefix) for entry in entries): + raise ValueError + root = Path(distribution.locate_file(PurePosixPath("nvidia") / component)).resolve() + if not root.is_dir(): + raise ValueError +except 
(metadata.PackageNotFoundError, OSError, TypeError, ValueError): + raise SystemExit(1) +print(root, end="") +PY +} + +cx_prepare_cuda_cccl() { + local cccl="" candidate cuda_home nvcc + nvcc="$(command -v nvcc)" \ + || { cx_log "ERROR: CUDA nvcc is unavailable"; return 1; } + nvcc="$(readlink -f -- "$nvcc")" \ + || { cx_log "ERROR: CUDA nvcc cannot be resolved"; return 1; } + case "$nvcc" in + */bin/nvcc) cuda_home="${nvcc%/bin/nvcc}" ;; + *) cx_log "ERROR: CUDA nvcc has an unexpected path"; return 1 ;; + esac + [ -x "$cuda_home/bin/nvcc" ] && [ -d "$cuda_home/include" ] \ + && [ -d "$cuda_home/lib64" ] \ + || { cx_log "ERROR: CUDA toolkit root is incomplete"; return 1; } + for candidate in "$cuda_home"/targets/*/include/cccl; do + if [ -d "$candidate" ]; then + cccl="$candidate" + break + fi + done + [ -n "$cccl" ] || { cx_log "ERROR: CUDA CCCL headers are unavailable"; return 1; } + export CUDA_HOME="$cuda_home" CX_CUDA_CCCL="$cccl" + export CPATH="$cccl:${CPATH:-}" + export NVCC_PREPEND_FLAGS="-I$cccl ${NVCC_PREPEND_FLAGS:-}" +} + +cx_prepare_deepep_toolchain() { + local packaged overlay path root temporary + packaged="$(cx_nvidia_package_root nvidia-nvshmem-cu12 nvshmem)" \ + || { cx_log "ERROR: nvidia.nvshmem is unavailable"; return 1; } + root="$(cx_deepep_v2_root)" || return 1 + overlay="$root/nvshmem-overlay" + if ! ( + umask 077 + exec 8>"$root/nvshmem-overlay.lock" || exit 1 + flock 8 || exit 1 + if [ ! -d "$overlay" ]; then + temporary="$root/.nvshmem-overlay.$$" + rm -rf "$temporary" || exit 1 + mkdir -p "$temporary/lib" || exit 1 + ln -s "$packaged/include" "$temporary/include" || exit 1 + for path in "$packaged"/lib/*; do + ln -s "$path" "$temporary/lib/${path##*/}" || exit 1 + done + [ ! -e "$packaged/lib/libnvshmem_host.so.3" ] \ + || ln -sf "$packaged/lib/libnvshmem_host.so.3" \ + "$temporary/lib/libnvshmem_host.so" || exit 1 + mv "$temporary" "$overlay" || exit 1 + fi + [ ! 
-L "$overlay" ] \ + && [ "$(readlink -f "$overlay/include")" = "$(readlink -f "$packaged/include")" ] \ + && [ -e "$overlay/lib/libnvshmem_host.so" ] \ + && [ -e "$overlay/lib/libnvshmem_device.a" ] + ); then + cx_log "ERROR: DeepEP V2 NVSHMEM overlay is invalid" + return 1 + fi + NVSHMEM_DIR="$overlay" + export NVSHMEM_DIR + cx_prepare_cuda_cccl || return 1 + export LD_LIBRARY_PATH="$NVSHMEM_DIR/lib:${LD_LIBRARY_PATH:-}" +} + +cx_probe_deepep() { + local expected_record_sha256 expected_version expected_wheel_sha256 + if [ "${COLLECTIVEX_IMAGE:-}" != "$CX_IMAGE_MULTIARCH" ] \ + || [ "${COLLECTIVEX_IMAGE_DIGEST:-}" != "$CX_IMAGE_MULTIARCH_DIGEST" ] \ + || [ "${COLLECTIVEX_IMAGE_DIGEST_VERIFIED:-0}" != 1 ]; then + cx_log "ERROR: DeepEP V1 requires the exact pinned multi-architecture image" + return 1 + fi + cx_cuda_arch >/dev/null || return 1 + case "$CX_RUNNER" in + gb200|gb300) + expected_version="1.1.0+814e508" + expected_wheel_sha256="784dabec0877b6cf72619b7e93eda7e2f365648487bd37fc3ff6960e53669313" + expected_record_sha256="2671cff7baf8c2c214ff4bac721af875d513130670bec57601998bd1aae82882" + DEEPEP_COMMIT="814e508537c6ffc775d59f6f1b9ba43f3a65968c" + ;; + *) + expected_version="1.2.1" + expected_wheel_sha256="7c02c29306ea0fe2dd474618e72e0f310f260187a9c0700a656d2f6964e8c307" + expected_record_sha256="6548e9c504a12b2471af4b7f4d9546321210a57a456b5dc55bd4a8dad0f932ac" + DEEPEP_COMMIT="9af0e0d0e74f3577af1979c9b9e1ac2cad0104ee" + ;; + esac + export DEEPEP_COMMIT + python3 - "$expected_version" "$expected_wheel_sha256" "$expected_record_sha256" <<'PY' || { +import base64 +import csv +import hashlib +import importlib.metadata as metadata +import io +import json +from pathlib import Path +import sys + +import deep_ep +from deep_ep import Buffer + +distribution = metadata.distribution("deep_ep") +assert distribution.version == sys.argv[1] +assert Buffer.__name__ == "Buffer" +recorded_files = { + Path(distribution.locate_file(entry)).resolve() for entry in distribution.files 
or () +} +buffer_module = sys.modules.get(Buffer.__module__) +assert Path(deep_ep.__file__).resolve() in recorded_files +assert buffer_module is not None and Path(buffer_module.__file__).resolve() in recorded_files +direct_url = json.loads(distribution.read_text("direct_url.json")) +assert direct_url["archive_info"]["hashes"]["sha256"] == sys.argv[2] +record_entry = next( + entry for entry in distribution.files or () + if str(entry).endswith(".dist-info/RECORD") +) +record = distribution.locate_file(record_entry).read_bytes() +assert hashlib.sha256(record).hexdigest() == sys.argv[3] +for path, encoded_digest, size in csv.reader(io.StringIO(record.decode())): + if not encoded_digest: + continue + algorithm, expected = encoded_digest.split("=", 1) + assert algorithm == "sha256" + payload = distribution.locate_file(path).read_bytes() + observed = base64.urlsafe_b64encode(hashlib.sha256(payload).digest()).decode().rstrip("=") + assert observed == expected + assert not size or len(payload) == int(size) +PY + cx_log "ERROR: container DeepEP build does not match its pinned image contract" + return 1 + } + cx_log "DeepEP image build ready ($DEEPEP_COMMIT)" +} + +# DeepEP V2 is PR #605's ElasticBuffer implementation with upstream PR #630's pure scale-up +# initialization fix. Canonical launchers stage the pinned source and mount a private cluster-local +# build cache at /cx-cache. +cx_deepep_v2_root() { + local arch cpu base identity key image_digest + arch="$(cx_cuda_arch)" || return 1 + cpu="$(uname -m)" + [[ "$cpu" =~ ^[A-Za-z0-9._-]+$ ]] || return 1 + base="${CX_BACKEND_CACHE_ROOT:-}" + [[ "$base" = /* ]] || return 1 + image_digest="${COLLECTIVEX_IMAGE_DIGEST:-manual-unverified}" + [[ "$image_digest" = manual-unverified || "$image_digest" =~ ^sha256:[0-9a-f]{64}$ ]] \ + || return 1 + # Bump the recipe generation whenever the build procedure changes. Benchmark-only + # source revisions must reuse the same immutable environment instead of leaking GBs. 
+ identity="deepep-v2-cache-v2|$cpu|sm${arch/./}|image=$image_digest|recipe=aot-persistent-nvshmem-active-cuda-maxjobs16-v2|$CX_DEEPEP_V2_COMMIT|$CX_DEEPEP_V2_TREE|$CX_DEEPEP_V2_FMT_COMMIT|pip=26.1.2|setuptools=82.0.1|wheel=0.47.0|ninja=1.13.0|numpy=2.2.6|torch=2.10.0+cu130|nccl=2.30.4|nvshmem=3.3.9|max-jobs=16" + key="$(printf '%s' "$identity" | sha256sum | awk '{print $1}')" + [[ "$key" =~ ^[0-9a-f]{64}$ ]] || return 1 + printf '%s/deepep-v2-%s' "$base" "$key" +} + +cx_activate_deepep_v2() { + local root venv stage_root + root="$(cx_deepep_v2_root)" || return 1 + venv="$root/venv" + [ -x "$venv/bin/python" ] \ + || { cx_log "ERROR: DeepEP V2 venv interpreter is unavailable"; return 1; } + export VIRTUAL_ENV="$venv" + export PATH="$venv/bin:${PATH#"$venv/bin:"}" + EP_NCCL_ROOT_DIR="$(cx_nvidia_package_root nvidia-nccl-cu13 nccl)" \ + || { cx_log "ERROR: DeepEP V2 NCCL package root is unavailable"; return 1; } + EP_NVSHMEM_ROOT_DIR="$(cx_nvidia_package_root nvidia-nvshmem-cu12 nvshmem)" \ + || { cx_log "ERROR: DeepEP V2 NVSHMEM package root is unavailable"; return 1; } + export EP_NCCL_ROOT_DIR EP_NVSHMEM_ROOT_DIR + export LD_LIBRARY_PATH="$EP_NCCL_ROOT_DIR/lib:$EP_NVSHMEM_ROOT_DIR/lib:${LD_LIBRARY_PATH:-}" + case "${CX_BACKEND_SOURCE_ROOT:-}" in + /*/.cx_sources) stage_root="${CX_BACKEND_SOURCE_ROOT%/.cx_sources}" ;; + *) cx_log "ERROR: DeepEP V2 job-local source root is unavailable"; return 1 ;; + esac + [ -d "$stage_root" ] && [ ! -L "$stage_root" ] \ + || { cx_log "ERROR: DeepEP V2 job-local stage is invalid"; return 1; } + # JIT CUBINs are evidence from this shard, not part of the persistent AOT environment. + # Keeping them on the isolated staged tree prevents a prior driver/topology attempt + # from seeding a later run; all ranks and cases in this shard still share one cold build. 
+ export EP_JIT_CACHE_DIR="$stage_root/.cx_backend/deepep-v2-jit" + export EP_REUSE_NCCL_COMM=1 + export DEEPEP_V2_PR=605 DEEPEP_V2_FIX_PR=630 + DEEPEP_V2_COMMIT="$CX_DEEPEP_V2_COMMIT" + DEEPEP_V2_TREE="$CX_DEEPEP_V2_TREE" + DEEPEP_V2_FMT_COMMIT="$CX_DEEPEP_V2_FMT_COMMIT" + export DEEPEP_V2_COMMIT DEEPEP_V2_TREE DEEPEP_V2_FMT_COMMIT + [ ! -L "$stage_root/.cx_backend" ] && [ ! -L "$EP_JIT_CACHE_DIR" ] \ + || { cx_log "ERROR: DeepEP V2 JIT cache path is unsafe"; return 1; } + if ! mkdir -p "$EP_JIT_CACHE_DIR" \ + || ! chmod 700 "$stage_root/.cx_backend" "$EP_JIT_CACHE_DIR"; then + cx_log "ERROR: DeepEP V2 JIT cache is unavailable" + return 1 + fi + unset EP_SUPPRESS_NCCL_CHECK +} + +cx_enable_deepep_v2_jit_reproducibility() { + local seed="collectivex-deepep-v2-fa8a9b1" cccl + [ -n "${CUDA_HOME:-}" ] \ + || { cx_log "ERROR: active CUDA toolkit is unavailable"; return 1; } + cccl="${CX_CUDA_CCCL:-}" + case "$cccl" in + "$CUDA_HOME"/targets/*/include/cccl) ;; + *) cx_log "ERROR: CUDA CCCL headers differ from the active toolkit"; return 1 ;; + esac + [ -d "$cccl" ] || { cx_log "ERROR: CUDA CCCL headers are unavailable"; return 1; } + CPATH="$cccl" + NVCC_PREPEND_FLAGS="--frandom-seed=$seed -I$cccl" + DEEPEP_V2_JIT_RANDOM_SEED="$seed" + EP_JIT_DUMP_SASS=1 + unset EP_JIT_DEBUG EP_JIT_DUMP_ASM EP_JIT_DUMP_PTX EP_JIT_WITH_LINEINFO + unset EP_JIT_PTXAS_VERBOSE EP_JIT_PRINT_COMPILER_COMMAND EP_JIT_NVCC_COMPILER + unset EP_JIT_CPP_STANDARD EP_JIT_PTXAS_CHECK EP_GIN_GDAKI_DEBUG EP_NUM_TOPK_IDX_BITS + export CPATH DEEPEP_V2_JIT_RANDOM_SEED EP_JIT_DUMP_SASS NVCC_PREPEND_FLAGS +} + +cx_probe_deepep_v2() { + python3 - <<'PY' +import ctypes +import importlib.metadata as metadata +import inspect +import os + +import torch + +assert torch.__version__ == "2.10.0+cu130", torch.__version__ +assert metadata.version("nvidia-nccl-cu13") == "2.30.4" +assert metadata.version("nvidia-nvshmem-cu12") == "3.3.9" +assert metadata.version("numpy") == "2.2.6" + +import deep_ep +assert 
deep_ep.__version__ == "2.0.0", deep_ep.__version__ +assert metadata.version("deep_ep") == "2.0.0+fa8a9b1" +assert inspect.isclass(deep_ep.ElasticBuffer) +assert deep_ep.ElasticBuffer.__name__ == "ElasticBuffer" +assert os.environ.get("EP_SUPPRESS_NCCL_CHECK") is None +with open("/proc/self/maps", encoding="utf-8") as handle: + loaded_nccl = { + os.path.realpath(line.rstrip().split()[-1]) + for line in handle + if "libnccl.so" in line and os.path.isfile(line.rstrip().split()[-1]) + } +assert len(loaded_nccl) == 1 +runtime_version = ctypes.c_int() +assert ctypes.CDLL(loaded_nccl.pop()).ncclGetVersion(ctypes.byref(runtime_version)) == 0 +assert runtime_version.value == 23004, runtime_version.value +PY +} + +cx_deepep_v2_content_sha256() { + python3 - <<'PY' +import hashlib +from importlib import metadata +import os +from pathlib import Path, PurePosixPath +import stat + +distribution = metadata.distribution("deep_ep") +entries = sorted(distribution.files or (), key=lambda entry: entry.as_posix()) +if not entries: + raise SystemExit(1) +venv_path = Path(os.environ["VIRTUAL_ENV"]).absolute() +if venv_path.is_symlink() or not venv_path.is_dir(): + raise SystemExit(1) +venv = venv_path.resolve(strict=True) +digest = hashlib.sha256() +extension = False +for entry in entries: + relative = PurePosixPath(entry.as_posix()) + if ( + relative.is_absolute() + or ".." 
in relative.parts + or not relative.parts + or not ( + relative.parts[0] == "deep_ep" + or relative.parts[0].startswith("deep_ep-") + and relative.parts[0].endswith(".dist-info") + ) + ): + raise SystemExit(1) + path = Path(distribution.locate_file(entry)).absolute() + resolved = path.resolve(strict=True) + try: + path.relative_to(venv_path) + resolved.relative_to(venv) + except ValueError: + raise SystemExit(1) + parent = path.parent + while parent != venv_path: + if parent.is_symlink(): + raise SystemExit(1) + parent = parent.parent + item = os.lstat(path) + if not stat.S_ISREG(item.st_mode): + raise SystemExit(1) + descriptor = os.open(path, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)) + try: + opened = os.fstat(descriptor) + if (opened.st_dev, opened.st_ino) != (item.st_dev, item.st_ino): + raise SystemExit(1) + file_digest = hashlib.sha256() + while chunk := os.read(descriptor, 1024 * 1024): + file_digest.update(chunk) + finally: + os.close(descriptor) + name = relative.as_posix() + extension |= name.startswith("deep_ep/") and name.endswith(".so") + digest.update(name.encode()) + digest.update(b"\0") + digest.update(str(item.st_size).encode()) + digest.update(b"\0") + digest.update(file_digest.digest()) +if not extension: + raise SystemExit(1) +print(digest.hexdigest(), end="") +PY +} + +cx_deepep_v2_marker_content_sha256() { + local root="$1" marker="$2" revision="$3" tree="$4" fmt_revision="$5" cache_key="$6" + python3 - "$root" "$marker" "$revision" "$tree" "$fmt_revision" "$cache_key" <<'PY' +import os +import re +import stat +import sys + +root, marker, revision, tree, fmt_revision, cache_key = sys.argv[1:] +try: + root_item = os.lstat(root) + marker_item = os.lstat(marker) + children = [os.lstat(os.path.join(root, name)) for name in ("source", "venv")] + if ( + not stat.S_ISDIR(root_item.st_mode) + or stat.S_IMODE(root_item.st_mode) & 0o777 != 0o700 + or not stat.S_ISREG(marker_item.st_mode) + or marker_item.st_uid != root_item.st_uid + or 
stat.S_IMODE(marker_item.st_mode) & 0o777 != 0o600 + or marker_item.st_size > 1024 + or any( + not stat.S_ISDIR(child.st_mode) + or child.st_uid != root_item.st_uid + or stat.S_IMODE(child.st_mode) & 0o022 + for child in children + ) + ): + raise OSError + descriptor = os.open(marker, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)) + try: + opened = os.fstat(descriptor) + if (opened.st_dev, opened.st_ino) != (marker_item.st_dev, marker_item.st_ino): + raise OSError + payload = os.read(descriptor, 1025) + finally: + os.close(descriptor) + lines = payload.decode("ascii").splitlines() + if lines[:4] != [revision, tree, fmt_revision, cache_key] or len(lines) != 5: + raise ValueError + if not re.fullmatch(r"[0-9a-f]{64}", lines[4]): + raise ValueError +except (OSError, UnicodeError, ValueError): + raise SystemExit(1) +print(lines[4], end="") +PY +} + +cx_deepep_v2_cache_is_valid() { + local root="$1" marker="$2" revision="$3" tree="$4" fmt_revision="$5" cache_key="$6" + local expected_content actual_content + expected_content="$( + cx_deepep_v2_marker_content_sha256 \ + "$root" "$marker" "$revision" "$tree" "$fmt_revision" "$cache_key" + )" || return 1 + [ -d "$root/source" ] && [ ! 
-L "$root/source" ] \ + && [ "$(cx_git_in_tree "$root/source" rev-parse 'HEAD^{tree}' 2>/dev/null)" = "$tree" ] \ + && [ "$(cx_git_in_tree "$root/source/third-party/fmt" rev-parse HEAD 2>/dev/null)" = "$fmt_revision" ] \ + || return 1 + cx_activate_deepep_v2 || return 1 + actual_content="$(cx_deepep_v2_content_sha256)" || return 1 + [ "$actual_content" = "$expected_content" ] +} + +cx_build_deepep_v2() { + local root venv source marker marker_tmp lock_path arch cache_key cache_ready content_sha256 + local revision="fa8a9b16898204afd347c663b89e65ef87dc6ce6" + local tree="29809e75c5874e6609dac4804e7b651d5226959f" + local fmt_revision="a4c7e17133ee9cb6a2f45545f6e974dd3c393efa" + cx_verify_backend_cache_mount \ + || { cx_log "ERROR: DeepEP V2 cache mount identity validation failed"; return 1; } + arch="$(cx_cuda_arch)" || return 1 + root="$(cx_deepep_v2_root)" || return 1 + cache_key="${root##*/deepep-v2-}" + [[ "$cache_key" =~ ^[0-9a-f]{64}$ ]] || return 1 + venv="$root/venv"; source="$root/source"; marker="$root/.collectivex-complete" + lock_path="${root}.lock" + command -v flock >/dev/null || { cx_log "ERROR: flock is required for DeepEP V2"; return 1; } + mkdir -p "${root%/*}" || return 1 + cx_log "DeepEP V2: preparing PR #605 implementation with upstream PR #630 fix ($revision)" + if ! ( + [ ! 
-L "$lock_path" ] \ + || { cx_log "ERROR: DeepEP V2 cache lock is unsafe"; exit 1; } + (umask 077; : >> "$lock_path") && chmod 600 "$lock_path" \ + || { cx_log "ERROR: DeepEP V2 cache-lock-create failed"; exit 1; } + exec 9<>"$lock_path" \ + || { cx_log "ERROR: DeepEP V2 cache-lock-open failed"; exit 1; } + flock 9 \ + || { cx_log "ERROR: DeepEP V2 cache-lock-acquire failed"; exit 1; } + cache_ready=0 + if [ -e "$marker" ] || [ -L "$marker" ]; then + if ( + cx_deepep_v2_cache_is_valid \ + "$root" "$marker" "$revision" "$tree" "$fmt_revision" "$cache_key" + ); then + cache_ready=1 + else + cx_log "ERROR: published DeepEP V2 cache failed integrity validation; refusing reset" + exit 1 + fi + fi + if [ "$cache_ready" != 1 ]; then + if [ -e "$root" ] || [ -L "$root" ]; then + rm -rf "$root" \ + || { cx_log "ERROR: incomplete DeepEP V2 cache-reset failed"; exit 1; } + fi + mkdir -m 700 "$root" \ + || { cx_log "ERROR: DeepEP V2 cache-create failed"; exit 1; } + python3 -m venv "$venv" \ + || { cx_log "ERROR: DeepEP V2 venv creation failed"; exit 1; } + "$venv/bin/python" -m pip install -q --disable-pip-version-check --no-input \ + "pip==26.1.2" "setuptools==82.0.1" "wheel==0.47.0" "ninja==1.13.0" \ + "numpy==2.2.6" "nvidia-nvshmem-cu12==3.3.9" >&2 2>&1 \ + || { cx_log "ERROR: DeepEP V2 build-tool installation failed"; exit 1; } + "$venv/bin/python" -m pip install -q --disable-pip-version-check --no-input \ + --index-url https://download.pytorch.org/whl/cu130 \ + --extra-index-url https://pypi.org/simple "torch==2.10.0" >&2 2>&1 \ + || { cx_log "ERROR: torch 2.10.0+cu130 installation failed"; exit 1; } + # Torch pins NCCL 2.28.9; the PR #605 ElasticBuffer implementation requires 2.30.4. 
+ "$venv/bin/python" -m pip install -q --disable-pip-version-check --no-input \ + --force-reinstall --no-deps "nvidia-nccl-cu13==2.30.4" >&2 2>&1 \ + || { cx_log "ERROR: NCCL 2.30.4 installation failed"; exit 1; } + cx_activate_deepep_v2 \ + || { cx_log "ERROR: DeepEP V2 environment activation failed"; exit 1; } + cx_prepare_deepep_toolchain \ + || { cx_log "ERROR: DeepEP V2 toolchain preparation failed"; exit 1; } + EP_NVSHMEM_ROOT_DIR="$NVSHMEM_DIR" + export EP_NVSHMEM_ROOT_DIR + cx_materialize_backend_source deepep-v2 "$source" \ + || { cx_log "ERROR: DeepEP V2 staged source is invalid"; exit 1; } + (cd "$source" && SOURCE_DATE_EPOCH="$(cx_git_in_tree "$source" show -s --format=%ct HEAD)" \ + TORCH_CUDA_ARCH_LIST="$arch" MAX_JOBS=16 \ + python3 -m pip install -q --no-build-isolation --no-deps --force-reinstall .) >&2 2>&1 \ + || { cx_log "ERROR: DeepEP V2 build failed"; exit 1; } + cx_probe_deepep_v2 \ + || { cx_log "ERROR: DeepEP V2 ElasticBuffer/runtime probe failed"; exit 1; } + content_sha256="$(cx_deepep_v2_content_sha256)" \ + || { cx_log "ERROR: DeepEP V2 installed-content hashing failed"; exit 1; } + marker_tmp="$(mktemp "$root/.collectivex-complete.tmp.XXXXXX")" \ + || { cx_log "ERROR: DeepEP V2 cache-marker-create failed"; exit 1; } + chmod 600 "$marker_tmp" \ + || { cx_log "ERROR: DeepEP V2 cache-marker-permission failed"; exit 1; } + printf '%s\n%s\n%s\n%s\n%s\n' \ + "$revision" "$tree" "$fmt_revision" "$cache_key" "$content_sha256" > "$marker_tmp" \ + || { cx_log "ERROR: DeepEP V2 cache-marker-write failed"; exit 1; } + mv -f -- "$marker_tmp" "$marker" \ + || { cx_log "ERROR: DeepEP V2 cache-marker-publish failed"; exit 1; } + fi + cx_deepep_v2_cache_is_valid \ + "$root" "$marker" "$revision" "$tree" "$fmt_revision" "$cache_key" \ + || { cx_log "ERROR: DeepEP V2 cache validation failed"; exit 1; } + ); then + cx_log "ERROR: shared DeepEP V2 environment is incomplete" + return 1 + fi + cx_activate_deepep_v2 || return 1 + cx_prepare_deepep_toolchain 
|| return 1 + cx_enable_deepep_v2_jit_reproducibility || return 1 + EP_NVSHMEM_ROOT_DIR="$NVSHMEM_DIR" + export EP_NVSHMEM_ROOT_DIR + cx_probe_deepep_v2 || { cx_log "ERROR: DeepEP V2 shared runtime probe failed"; return 1; } + cx_log "DeepEP V2 ready ($DEEPEP_V2_COMMIT, ElasticBuffer, NCCL Device API; LSA/Gin selected by adapter)" +} + +# Build the pinned DeepEP `hybrid-ep` implementation. MNNVL remains one scale-up +# domain; true x86 scale-out uses the upstream DOCA/RDMA build explicitly. +cx_configure_deepep_hybrid_build() { + local interface device rdma_name + local -a interfaces devices + unset HYBRID_EP_MULTINODE USE_NIXL RDMA_CORE_HOME DEEPEP_HYBRID_BUILD_MODE + if [ "${CX_NODES:-1}" -le 1 ] || [ "${CX_TRANSPORT:-}" = mnnvl ]; then + export DEEPEP_HYBRID_BUILD_MODE=intradomain + return 0 + fi + [ "$(uname -m)" = x86_64 ] \ + || { cx_log "ERROR: hybrid-ep scale-out is registered only on x86_64"; return 1; } + [ -n "${GLOO_SOCKET_IFNAME:-}" ] && [ -n "${NCCL_IB_HCA:-}" ] \ + || { cx_log "ERROR: hybrid-ep scale-out network selectors are unavailable"; return 1; } + IFS=, read -r -a interfaces <<< "$GLOO_SOCKET_IFNAME" + for interface in "${interfaces[@]}"; do + [ -d "/sys/class/net/$interface" ] \ + || { cx_log "ERROR: configured hybrid-ep socket interface is absent"; return 1; } + done + IFS=, read -r -a devices <<< "$NCCL_IB_HCA" + for device in "${devices[@]}"; do + rdma_name="${device%%:*}" + [ -d "/sys/class/infiniband/$rdma_name" ] \ + || { cx_log "ERROR: configured hybrid-ep RDMA device is absent"; return 1; } + done + command -v make >/dev/null \ + || { cx_log "ERROR: make is required for hybrid-ep scale-out"; return 1; } + [ -r /usr/include/infiniband/verbs.h ] && [ -r /usr/include/infiniband/mlx5dv.h ] \ + || { cx_log "ERROR: pinned hybrid-ep RDMA headers are unavailable"; return 1; } + python3 - <<'PY' >/dev/null 2>&1 || { +import ctypes.util +import sys +sys.exit(0 if all(ctypes.util.find_library(name) for name in ("ibverbs", "mlx5")) else 1) +PY + 
cx_log "ERROR: pinned hybrid-ep RDMA libraries are unavailable" + return 1 + } + export HYBRID_EP_MULTINODE=1 USE_NIXL=0 RDMA_CORE_HOME=/usr + export DEEPEP_HYBRID_BUILD_MODE=multinode-doca +} + +cx_deepep_hybrid_marker_content_sha256() { + python3 - "$1" "$2" "$3" "$4" "${5:-}" <<'PY' +import os +import re +import stat +import sys + +root, marker, revision, tree, build_mode = sys.argv[1:] +try: + root_item = os.lstat(root) + marker_item = os.lstat(marker) + if ( + not stat.S_ISDIR(root_item.st_mode) + or stat.S_IMODE(root_item.st_mode) & 0o777 != 0o700 + or not stat.S_ISREG(marker_item.st_mode) + or marker_item.st_uid != root_item.st_uid + or stat.S_IMODE(marker_item.st_mode) & 0o777 != 0o600 + or marker_item.st_size > 512 + ): + raise OSError + descriptor = os.open(marker, os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)) + try: + opened = os.fstat(descriptor) + if (opened.st_dev, opened.st_ino) != (marker_item.st_dev, marker_item.st_ino): + raise OSError + payload = os.read(descriptor, 513) + finally: + os.close(descriptor) + lines = payload.decode("ascii").splitlines() + expected = [revision, tree, build_mode] if build_mode else [revision, tree] + if len(lines) != len(expected) + 1 or lines[:-1] != expected: + raise ValueError + if not re.fullmatch(r"[0-9a-f]{64}", lines[-1]): + raise ValueError +except (OSError, UnicodeError, ValueError): + raise SystemExit(1) +print(lines[-1], end="") +PY +} + +cx_deepep_hybrid_cache_is_valid() { + local root="$1" marker="$2" revision="$3" tree="$4" build_mode="${5:-}" + local expected actual status extra + expected="$(cx_deepep_hybrid_marker_content_sha256 \ + "$root" "$marker" "$revision" "$tree" "$build_mode")" || return 1 + [ "$(cx_git_in_tree "$root" rev-parse HEAD 2>/dev/null)" = "$revision" ] \ + && [ "$(cx_git_in_tree "$root" rev-parse 'HEAD^{tree}' 2>/dev/null)" = "$tree" ] \ + || return 1 + status="$(cx_git_in_tree "$root" status --porcelain --untracked-files=no \ + --ignore-submodules=none 2>/dev/null)" || return 1 + [ 
-z "$status" ] || return 1 + extra="$(cx_git_in_tree "$root" ls-files --others --exclude-standard -- \ + 'deep_ep/*.py' 'deep_ep/*.so' 2>/dev/null)" || return 1 + [ -z "$extra" ] || return 1 + extra="$(cx_git_in_tree "$root" ls-files --others --ignored --exclude-standard -- \ + 'deep_ep/*.py' 'deep_ep/*.so' 2>/dev/null)" || return 1 + [ -z "$extra" ] || return 1 + actual="$(cx_extension_pair_sha256 "$root" 'deep_ep_cpp*.so' 'hybrid_ep_cpp*.so')" \ + || return 1 + [ "$actual" = "$expected" ] +} + +cx_build_deepep_hybrid() { + local arch revision="$CX_DEEPEP_HYBRID_COMMIT" tree="$CX_DEEPEP_HYBRID_TREE" + local build_root marker marker_tmp lock_path content_sha256 cache_ready build_mode + export DEEPEP_COMMIT="$revision" DEEPEP_TREE="$tree" + arch="$(cx_cuda_arch)" || return 1 + cx_configure_deepep_hybrid_build || return 1 + build_mode="$DEEPEP_HYBRID_BUILD_MODE" + build_root="$PWD/.cx_backend/deepep-hybrid-${arch/./}-${build_mode}" + marker="$build_root/.collectivex-complete" + lock_path="${build_root}.lock" + cx_log "DeepEP hybrid-ep: building $revision for CUDA target $arch" + unset NVSHMEM_DIR + cx_prepare_cuda_cccl || return 1 + command -v flock >/dev/null || { cx_log "ERROR: flock is required for hybrid-ep"; return 1; } + mkdir -p "$PWD/.cx_backend" || return 1 + if ! ( + [ ! 
-L "$lock_path" ] || exit 1 + (umask 077; : >> "$lock_path") && chmod 600 "$lock_path" || exit 1 + exec 9<>"$lock_path" || exit 1 + flock 9 || exit 1 + cache_ready=0 + if [ -e "$marker" ] || [ -L "$marker" ]; then + cx_deepep_hybrid_cache_is_valid \ + "$build_root" "$marker" "$revision" "$tree" "$build_mode" \ + || exit 1 + cache_ready=1 + fi + if [ "$cache_ready" != 1 ]; then + cx_materialize_backend_source deepep-hybrid "$build_root" \ + || { cx_log "ERROR: hybrid-ep staged source is invalid"; exit 1; } + if [ "$build_mode" = multinode-doca ]; then + [ "$(cx_git_in_tree "$build_root/third-party/nccl" rev-parse HEAD 2>/dev/null)" \ + = "$CX_DEEPEP_HYBRID_NCCL_COMMIT" ] \ + || { cx_log "ERROR: pinned hybrid-ep NCCL transport source is absent"; exit 1; } + fi + (cd "$build_root" && \ + SOURCE_DATE_EPOCH="$(cx_git_in_tree "$build_root" show -s --format=%ct HEAD)" \ + TORCH_CUDA_ARCH_LIST="$arch" MAX_JOBS=16 \ + python3 setup.py build_ext --inplace) >&2 2>&1 \ + || { cx_log "ERROR: hybrid-ep build failed"; exit 1; } + content_sha256="$(cx_extension_pair_sha256 \ + "$build_root" 'deep_ep_cpp*.so' 'hybrid_ep_cpp*.so')" || exit 1 + marker_tmp="$(mktemp "$build_root/.collectivex-complete.tmp.XXXXXX")" || exit 1 + chmod 600 "$marker_tmp" || exit 1 + printf '%s\n%s\n%s\n%s\n' \ + "$revision" "$tree" "$build_mode" "$content_sha256" > "$marker_tmp" \ + || exit 1 + mv -f -- "$marker_tmp" "$marker" || exit 1 + fi + cx_deepep_hybrid_cache_is_valid \ + "$build_root" "$marker" "$revision" "$tree" "$build_mode" + ); then + cx_log "ERROR: shared hybrid-ep build is incomplete" + return 1 + fi + export PYTHONPATH="$build_root:${PYTHONPATH:-}" + python3 -c "import deep_ep; assert hasattr(deep_ep,'HybridEPBuffer'); print('built hybrid-ep deep_ep', getattr(deep_ep,'__version__','?'))" >&2 \ + || { cx_log "ERROR: hybrid-ep import / HybridEPBuffer missing after build"; return 1; } + cx_log "DeepEP hybrid-ep ready ($DEEPEP_COMMIT, mode=$build_mode)" +} + +# UCCL EP (uccl.ep.Buffer is a 
DeepEP-API clone). The prebuilt wheel is cu12; on a cu13 +# image its kernels need a cu12 CUDA runtime on LD_LIBRARY_PATH (probe-confirmed). PEP-668 +# images need PIP_BREAK_SYSTEM_PACKAGES. Best-effort; failure to import fails loudly. +cx_build_uccl() { + if [ -f /tmp/.cx_built_uccl ]; then + cx_log "UCCL EP already prepared this allocation — skip rebuild" + python3 -c "import torch; from uccl_deepep import Buffer" 2>/dev/null || return 1 + return 0 + fi + local version="0.1.1" tag="v0.1.1" + local wheel_sha256="390c1320918972206546e44d79b132988f2818ec07e23afcd0595f7183916cec" + cx_log "UCCL EP: installing uccl==$version + cu12 runtime shim" + export PIP_BREAK_SYSTEM_PACKAGES=1 + pip install -q --no-deps "sortedcontainers==2.4.0" "intervaltree==3.1.0" >&2 2>&1 \ + || { cx_log "ERROR: UCCL support dependency installation failed"; return 1; } + printf 'uccl==%s --hash=sha256:%s\n' "$version" "$wheel_sha256" \ + | pip install -q --no-deps --only-binary=:all: --require-hashes -r /dev/stdin >&2 2>&1 \ + || { cx_log "ERROR: pip install uccl==$version failed"; return 1; } + pip install -q --no-deps "nvidia-cuda-runtime-cu12==12.9.79" >&2 2>&1 \ + || { cx_log "ERROR: CUDA 12 runtime shim install failed"; return 1; } + local cu12lib + cu12lib="$(python3 -c "import nvidia.cuda_runtime as m, os; print(os.path.join(os.path.dirname(m.__file__),'lib'))" 2>/dev/null)" + [ -n "$cu12lib" ] && export LD_LIBRARY_PATH="$cu12lib:${LD_LIBRARY_PATH:-}" + local installed + installed="$(python3 -c 'import importlib.metadata as m; print(m.version("uccl"))')" \ + || { cx_log "ERROR: cannot read installed UCCL version"; return 1; } + [ "$installed" = "$version" ] \ + || { cx_log "ERROR: expected UCCL $version, installed $installed"; return 1; } + UCCL_COMMIT="pkg-$installed" + export UCCL_COMMIT + # import torch FIRST: uccl.ep's C extension links libc10.so (torch), which is only on the loader + # path once torch is imported (rpath). The adapter (ep_uccl.py) imports torch before uccl.ep too. 
+ python3 -c "import torch; from uccl.ep import Buffer; print('uccl.ep ready')" >&2 \ + || { cx_log "ERROR: uccl.ep import failed (cu12 runtime on LD_LIBRARY_PATH?)"; return 1; } + # Vendor UCCL's DeepEP-API wrapper (ep/deep_ep_wrapper/deep_ep) under a NON-conflicting name + # (uccl_deepep) so it doesn't shadow the container's real deep_ep. Its Buffer(group, num_nvl_bytes, + # ...) takes a torch ProcessGroup (matching DeepEP + ep_uccl.py's calls) and runs the full + # proxy/IPC-handle/runtime.sync bootstrap that the low-level uccl.ep.Buffer(rank,num_ranks) lacks. + rm -rf /tmp/uccl_src /tmp/uccl_deepep_pkg + # Pin the wrapper to the SAME tag as the installed wheel (pkg-0.1.1 -> v0.1.1): the wrapper's + # dispatch calls into uccl.ep (get_rdma_buffer etc.), so a main-branch wrapper vs a 0.1.1 wheel + # mismatches signatures. Match them. + if git clone --depth 1 --branch "$tag" https://github.com/uccl-project/uccl /tmp/uccl_src >&2 2>&1 \ + && [ "$(git -C /tmp/uccl_src rev-parse HEAD)" = "73ee4f12ba71717d6de34ba06806e1baaabe3f42" ] \ + && [ -d /tmp/uccl_src/ep/deep_ep_wrapper/deep_ep ]; then + mkdir -p /tmp/uccl_deepep_pkg/uccl_deepep + cp /tmp/uccl_src/ep/deep_ep_wrapper/deep_ep/*.py /tmp/uccl_deepep_pkg/uccl_deepep/ 2>/dev/null + export PYTHONPATH="/tmp/uccl_deepep_pkg:${PYTHONPATH:-}" + python3 -c "import torch; from uccl_deepep import Buffer; print('uccl_deepep wrapper ready')" >&2 \ + || { cx_log "ERROR: uccl_deepep wrapper import failed"; return 1; } + export CX_UCCL_WRAPPER=1 + export UCCL_WRAPPER_COMMIT="73ee4f12ba71717d6de34ba06806e1baaabe3f42" + else + cx_log "ERROR: uccl deep_ep_wrapper not available" + return 1 + fi + : > /tmp/.cx_built_uccl + cx_log "UCCL EP ready ($UCCL_COMMIT, wrapper=${CX_UCCL_WRAPPER:-0})" +} + +# Rack build and rank steps may enter different container instances. Persist each node's +# loader/import path and build identity on the shared staged mount, then require it from every rank. 
# Snapshot this node's backend environment into
# $PWD/.cx_backend/env/node-<SLURM_NODEID>.sh (node id defaults to 0).
#
# Every variable in the list below that is currently declared is written as one
# `export NAME=<shell-quoted value>` line. The file is assembled in a private
# temporary file (mode 600, inside a mode-700 directory) and renamed into place
# only once it is complete.
# Returns 1 on a non-numeric node id or on any filesystem failure; the temporary
# file is removed on failure.
cx_persist_backend_env() {
  local env_dir="$PWD/.cx_backend/env" node="${SLURM_NODEID:-0}"
  local staging target var
  local -a exported=(
    PATH VIRTUAL_ENV LD_LIBRARY_PATH PYTHONPATH CUDA_HOME CPATH NVCC_PREPEND_FLAGS
    NVSHMEM_DIR DEEPEP_COMMIT DEEPEP_TREE
    EP_NCCL_ROOT_DIR EP_NVSHMEM_ROOT_DIR EP_JIT_CACHE_DIR EP_REUSE_NCCL_COMM
    EP_JIT_DUMP_SASS
    DEEPEP_V2_PR DEEPEP_V2_FIX_PR DEEPEP_V2_COMMIT DEEPEP_V2_TREE DEEPEP_V2_FMT_COMMIT
    DEEPEP_V2_JIT_RANDOM_SEED
    HYBRID_EP_MULTINODE USE_NIXL RDMA_CORE_HOME DEEPEP_HYBRID_BUILD_MODE
    UCCL_COMMIT UCCL_WRAPPER_COMMIT CX_UCCL_WRAPPER
  )

  # The node id becomes part of a file name, so accept digits only.
  if ! [[ "$node" =~ ^[0-9]+$ ]]; then
    return 1
  fi

  mkdir -p "$env_dir" && chmod 700 "$env_dir" || return 1

  staging="$(mktemp "$env_dir/.node-${node}.XXXXXX")" || return 1
  if ! chmod 600 "$staging"; then
    rm -f "$staging"
    return 1
  fi

  for var in "${exported[@]}"; do
    # Undeclared variables are left out entirely rather than exported empty.
    declare -p "$var" >/dev/null 2>&1 || continue
    if ! printf 'export %s=%q\n' "$var" "${!var}" >> "$staging"; then
      rm -f "$staging"
      return 1
    fi
  done

  target="$env_dir/node-${node}.sh"
  if ! mv -f -- "$staging" "$target"; then
    rm -f "$staging"
    return 1
  fi
}

# Validate private scale-out selectors on every allocated compute node before a
# backend can initialize or build transport code.
# Check that the scale-out network selectors name devices present on this node.
#
# Single-node runs (CX_NODES <= 1) and the mnnvl transport are accepted without
# any check. Otherwise GLOO_SOCKET_IFNAME and NCCL_IB_HCA must both be set, and
# every comma-separated entry must be a plain device name with a directory under
# /sys/class/net (socket interfaces) or /sys/class/infiniband (RDMA devices; a
# ":port" suffix is ignored). Entries are taken literally: NCCL's "^" and "="
# prefix forms are not interpreted here.
# Returns 0 when the selectors are usable, 1 (after logging) otherwise.
cx_probe_scaleout_network() {
  local interface device rdma_name
  local -a interfaces devices
  if [ "${CX_NODES:-1}" -le 1 ] || [ "${CX_TRANSPORT:-}" = mnnvl ]; then
    return 0
  fi
  [ -n "${GLOO_SOCKET_IFNAME:-}" ] && [ -n "${NCCL_IB_HCA:-}" ] \
    || { cx_log "ERROR: scale-out network selectors are unavailable"; return 1; }
  IFS=, read -r -a interfaces <<< "$GLOO_SOCKET_IFNAME"
  for interface in "${interfaces[@]}"; do
    # An empty, ".", ".." or slash-containing entry would resolve to a directory
    # that always exists, so reject it before consulting sysfs.
    case "$interface" in
      ''|.|..|*/*) false ;;
      *) [ -d "/sys/class/net/$interface" ] ;;
    esac || { cx_log "ERROR: configured scale-out socket interface is absent"; return 1; }
  done
  IFS=, read -r -a devices <<< "$NCCL_IB_HCA"
  for device in "${devices[@]}"; do
    rdma_name="${device%%:*}"
    # Same plain-name requirement as above, applied after the port is stripped.
    case "$rdma_name" in
      ''|.|..|*/*) false ;;
      *) [ -d "/sys/class/infiniband/$rdma_name" ] ;;
    esac || { cx_log "ERROR: configured scale-out RDMA device is absent"; return 1; }
  done
}

# Prepare and probe one backend without running a benchmark. The same hook is used
# by normal in-container runs and by rack launchers' persistent build-only step.
#   $1  backend name: deepep, deepep-v2, deepep-hybrid, uccl, mori or nccl-ep
# Returns 0 when the backend is ready, 1 when preparation fails or the name is
# unknown. nccl-ep needs no preparation step.
cx_prepare_backend() {
  local backend="${1:-}"
  case "$backend" in
    deepep)
      cx_probe_deepep || return 1
      ;;
    deepep-v2)
      cx_build_deepep_v2 || return 1
      ;;
    deepep-hybrid)
      cx_build_deepep_hybrid || return 1
      ;;
    uccl)
      cx_build_uccl || return 1
      ;;
    mori)
      # Only importability is checked; the import error text is discarded.
      python3 -c "import mori" 2>/dev/null || return 1
      ;;
    nccl-ep)
      ;;
    *)
      cx_log "ERROR: unknown backend preparation request"
      return 1
      ;;
  esac
}

# Prepare a backend; if that fails, record one failed case per requested phase.
#   $1  backend name, passed through to cx_prepare_backend
# The phases come from CX_PHASE (default "decode"); "both" expands to decode and
# prefill. Each failure record is emitted with CX_FAILURE_MODE=backend-setup and
# code 6.
# Returns 0 when the backend is ready, 1 otherwise.
prepare_backend_or_record() {
  local backend="$1" phases="${CX_PHASE:-decode}" phase
  cx_write_runtime_stage backend-setup || return 1
  if cx_prepare_backend "$backend"; then
    return 0
  fi
  cx_log "WARN: $backend preparation failed"
  [ "$phases" = "both" ] && phases="decode prefill"
  # Unquoted on purpose: the phase list is split on whitespace.
  for phase in $phases; do
    CX_FAILURE_MODE=backend-setup emit_failed_case "$backend" "$phase" 6
  done
  return 1
}

# dispatch_bench runs the CURRENT CX_BENCH (+ CX_* config env) once.
# The sweep workflow runs many of these per allocation (SHARD mode below), reusing this single
# container + its built backend.
#
# Runs the suite selected by CX_BENCH. nccl-ep runs directly; the other known
# backends are prepared first and the suite runs only when preparation succeeds.
# Any other value is fatal (cx_die).
dispatch_bench() {
  case "$CX_BENCH" in
    nccl-ep)
      run_ep_suite "$CX_BENCH"
      ;;
    deepep|deepep-v2|deepep-hybrid|mori|uccl)
      prepare_backend_or_record "$CX_BENCH" && run_ep_suite "$CX_BENCH"
      ;;
    *)
      cx_die "unknown CX_BENCH=$CX_BENCH (want deepep|deepep-v2|mori|uccl|nccl-ep|deepep-hybrid)"
      ;;
  esac
}

# Run the precision probe described by the control fields for $PWD.
#
# The control record (probe id, backend, sku, ep, mode, profile; "|"-separated)
# must agree with CX_BENCH, CX_RUNNER and CX_NGPUS. The probe runs under torchrun
# with a timeout of CX_RUN_TIMEOUT seconds (default 900) and writes
# results/<probe_id>.json, which is then validated.
# Returns the probe exit status when it fails, otherwise the validator status.
run_precision_probe() {
  local fields probe_id backend sku ep mode profile out rc_run
  fields="$(cx_precision_probe_control_fields "$PWD")" || return 1
  IFS='|' read -r probe_id backend sku ep mode profile <<< "$fields"
  [ "$backend" = "$CX_BENCH" ] && [ "$sku" = "$CX_RUNNER" ] && [ "$ep" = "$CX_NGPUS" ] \
    || { cx_log "ERROR: precision probe control differs from runtime placement"; return 1; }
  out="results/${probe_id}.json"
  cx_write_runtime_stage execution || return 1
  if timeout -k 30 "${CX_RUN_TIMEOUT:-900}" torchrun --nproc_per_node="$CX_NGPUS" \
      tests/probe_precision.py --backend "$backend" --sku "$sku" --ep "$ep" \
      --mode "$mode" --precision-profile "$profile" --out "$out"; then
    rc_run=0
  else
    rc_run=$?
  fi
  [ "$rc_run" = 0 ] || return "$rc_run"
  python3 tests/probe_precision.py --validate-manifest "$out"
}

rc=0
cx_validate_shard_control "$PWD"
# Build-only mode: rack launchers run the shared backend preparation hook once per
# node inside a persistent named container, then direct rank processes reuse it.
if [ -n "${CX_BUILD_ONLY:-}" ]; then
  if cx_probe_scaleout_network && cx_prepare_backend "${CX_BENCH:-}"; then
    cx_persist_backend_env || rc=1
  else
    rc=1
  fi
  cx_log "backend preparation: bench=${CX_BENCH:-unknown} rc=$rc"
  exit "$rc"
fi
if [ "${CX_PRECISION_PROBE:-0}" = 1 ]; then
  if cx_probe_scaleout_network && cx_prepare_backend "${CX_BENCH:-}"; then
    run_precision_probe || rc=$?
  else
    rc=1
  fi
elif [ -n "${CX_SHARD_FILE:-}" ]; then
  # SHARD/SWEEP mode (collectivex-sweep.yml): run EVERY case of this shard in THIS one allocation.
  # All cases share (sku, backend, nodes), so backend preparation is paid once and cached.
  # The shard path is handed to Python as an argument, never pasted into the program text, so a
  # path containing quotes or backslashes cannot change what is executed.
  ncases="$(python3 -c 'import json, sys; print(len(json.load(open(sys.argv[1]))["cases"]))' "$CX_SHARD_FILE")"
  # A missing or non-numeric count means the shard file could not be read: run no cases and fail
  # the shard instead of feeding an empty operand to the loop test below.
  case "$ncases" in
    ''|*[!0-9]*)
      cx_log "ERROR: cannot read the case count from the shard file"
      ncases=0
      rc=1
      ;;
  esac
  cx_log "SHARD mode: $ncases case(s) in one allocation (shard=$CX_SHARD_FILE)"
  _cx_ts_base="$CX_TS"  # per-case CX_TS suffix below keeps each case's result file UNIQUE (else
                        # cases sharing backend+phase overwrite each other at the same timestamp).
  ci=0
  failed_cases=0
  while [ "$ci" -lt "$ncases" ]; do
    CX_TS="${_cx_ts_base}-c$(printf '%03d' "$ci")"
    export CX_TS
    # Map varying case fields plus the frozen v1 defaults into CX_* env.
    _exports="$(python3 - "$CX_SHARD_FILE" "$ci" <<'PY'
import json, sys, shlex
c = json.load(open(sys.argv[1]))["cases"][int(sys.argv[2])]
def g(k, d=""):
    v = c.get(k, d); return "" if v is None else str(v)
env = {
    "CX_BENCH": g("backend"),
    "CX_MODE": g("mode", "normal"),
    "CX_ROUTING": g("routing", "uniform"), "CX_PHASE": g("phase", "decode"),
    "CX_EP": g("ep", "1"),
    "CX_EPLB": "1" if c.get("eplb") else "",
    "CX_CASE_ID": g("case_id"), "CX_SUITE": g("suite"), "CX_WORKLOAD_NAME": g("workload"),
    "CX_REQUIRED_PUBLICATION": g("required_publication"),
    "CX_HIDDEN": g("hidden"), "CX_TOPK": g("topk"), "CX_EXPERTS": g("experts"),
    "CX_TOKENS_LADDER": g("ladder"), "CX_CANONICAL": ("1" if c.get("canonical") else ""),
    "CX_NODES": g("nodes"), "CX_GPUS_PER_NODE": g("gpus_per_node"),
    "CX_SCALE_UP_DOMAIN": g("scale_up_domain"), "CX_SCOPE": g("scope"),
    "CX_SCALE_UP_TRANSPORT": g("scale_up_transport"),
    "CX_SCALE_OUT_TRANSPORT": g("scale_out_transport"),
    "CX_TRANSPORT": g("transport"), "CX_TOPO": g("topology_class"),
    "CX_SAMPLES_PER_POINT": g("samples_per_point"),
    "CX_WARMUP_SEMANTICS": g("warmup_semantics"),
}
lines = [f"export {k}={shlex.quote(v)}" for k, v in env.items()]
# Per-case timing "iters:trials:warmup" (fixed-512-v1 requires 8:64:32 everywhere);
# cases without one must fall back to the harness defaults, so UNSET rather than export-empty
# (an empty CX_ITERS would defeat the 8-iter default and break the run_ep argparse; NOTE no
# apostrophes in this heredoc — bash command-substitution scanning chokes on unbalanced quotes).
timing = g("timing")
if timing:
    parts = (timing.split(":") + ["", "", ""])[:3]
    for k, v in zip(("CX_ITERS", "CX_TRIALS", "CX_WARMUP"), parts):
        if v:
            lines.append(f"export {k}={shlex.quote(v)}")
else:
    lines.append("unset CX_ITERS CX_TRIALS CX_WARMUP 2>/dev/null || true")
print("\n".join(lines))
PY
)"
    eval "$_exports"
    # Each case has its OWN routing/dims -> its own canonical workload manifest. cx_stage_canonical
    # short-circuits when CX_WORKLOAD_DIR is already set, so without this unset the first case's
    # staged dir is reused for the rest and run_ep.py can't find the later cases' manifests
    # (FileNotFoundError .cx_workloads/.manifest.json). Unset so every case re-stages its own.
    unset CX_WORKLOAD_DIR 2>/dev/null || true
    cx_log " [$((ci+1))/$ncases] $CX_BENCH $CX_MODE/$CX_PHASE routing=$CX_ROUTING eplb=${CX_EPLB:-0}"
    _cx_case_ts="$CX_TS"
    CX_TS="${_cx_case_ts}-a01"
    export CX_ATTEMPT_ID=1 CX_TS
    dispatch_bench || {
      failed_cases=$((failed_cases+1))
      cx_log " [$((ci+1))/$ncases] $CX_BENCH case FAILED; failed-case record preserved"
    }
    export CX_TS="$_cx_case_ts"
    ci=$((ci + 1))
  done
  if [ "${failed_cases:-0}" -gt 0 ]; then
    cx_log "SHARD done: $failed_cases/$ncases case(s) failed"
    rc=1
  fi
  # The base timestamp matches every per-case file, so the final summary covers the whole shard.
  export CX_TS="$_cx_ts_base"
else
  _cx_single_ts="$CX_TS"
  CX_TS="${_cx_single_ts}-a01"
  export CX_ATTEMPT_ID=1 CX_TS
  dispatch_bench || rc=1
fi

# Summary table for the log; also fails the job if no valid results were produced.
+if [ "${CX_PRECISION_PROBE:-0}" != 1 ]; then + python3 summarize.py --results-dir results --runner "$CX_RUNNER" --ts "$CX_TS" || rc=1 +fi +exit "$rc" diff --git a/experimental/CollectiveX/schemas/channel-v1.schema.json b/experimental/CollectiveX/schemas/channel-v1.schema.json new file mode 100644 index 000000000..87ffa86b0 --- /dev/null +++ b/experimental/CollectiveX/schemas/channel-v1.schema.json @@ -0,0 +1,23 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://inferencex.com/schemas/collectivex/channel-v1.schema.json", + "title": "CollectiveX public channel v1", + "type": "object", + "additionalProperties": false, + "required": ["format","channel","dataset","generated_at"], + "properties": { + "format": {"const": "collectivex.channel.v1"}, + "channel": {"const": "dev-latest"}, + "dataset": { + "type": "object", + "additionalProperties": false, + "required": ["path","sha256","bytes"], + "properties": { + "path": {"type": "string","pattern": "^datasets/[0-9a-f]{64}/dataset\\.json$"}, + "sha256": {"type": "string","pattern": "^[0-9a-f]{64}$"}, + "bytes": {"type": "integer","minimum": 1,"maximum": 33554432} + } + }, + "generated_at": {"type": "string","format": "date-time"} + } +} diff --git a/experimental/CollectiveX/schemas/private-bundle-v1.schema.json b/experimental/CollectiveX/schemas/private-bundle-v1.schema.json new file mode 100644 index 000000000..789119692 --- /dev/null +++ b/experimental/CollectiveX/schemas/private-bundle-v1.schema.json @@ -0,0 +1,163 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://inferencex.com/schemas/collectivex/private-bundle-v1.schema.json", + "title": "CollectiveX private attempt bundle v1", + "type": "object", + "additionalProperties": false, + "required": [ + "format", + "schema_version", + "created_at", + "ingest_id", + "run", + "matrix", + "sources", + "attempts", + "coverage", + "runtime_fingerprints", + "checksums", + "validation" + ], + "properties": { + 
"format": {"const": "collectivex.private.bundle.v1"}, + "schema_version": {"const": 1}, + "created_at": {"type": "string","format": "date-time"}, + "ingest_id": {"$ref": "#/$defs/sha256"}, + "run": { + "type": "object", + "additionalProperties": false, + "required": ["repository","run_id","run_attempt","qualification_index","source_sha"], + "properties": { + "repository": {"type": "string","pattern": "^[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+$"}, + "run_id": {"type": "string","pattern": "^[1-9][0-9]*$"}, + "run_attempt": {"type": "integer","minimum": 1}, + "qualification_index": {"type":"integer","minimum":1,"maximum":3}, + "source_sha": {"type": "string","pattern": "^[0-9a-f]{40}$"} + } + }, + "matrix": {"$ref": "#/$defs/file"}, + "sources": {"type": "array","minItems": 1,"uniqueItems": true,"items": {"$ref": "#/$defs/source"}}, + "attempts": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": [ + "attempt_id", + "allocation_id", + "case_id", + "outcome", + "reason", + "selected", + "document", + "samples", + "runtime_fingerprint_sha256", + "series_ids", + "evidence_ids" + ], + "properties": { + "attempt_id": {"$ref": "#/$defs/attemptId"}, + "allocation_id": {"$ref": "#/$defs/allocationId"}, + "case_id": {"$ref": "#/$defs/caseId"}, + "outcome": {"$ref": "#/$defs/outcome"}, + "reason": {"$ref": "#/$defs/reason"}, + "selected": {"type": "boolean"}, + "document": {"$ref": "#/$defs/file"}, + "samples": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/file"}]}, + "runtime_fingerprint_sha256": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/sha256"}]}, + "series_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/seriesId"}}, + "evidence_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/evidenceId"}} + } + } + }, + "coverage": { + "type": "object", + "additionalProperties": false, + "required": ["expected_cases","terminal_cases","complete","outcome_counts","selections"], + 
"properties": { + "expected_cases": {"type": "integer","minimum": 1}, + "terminal_cases": {"type": "integer","minimum": 0}, + "complete": {"type": "boolean"}, + "outcome_counts": {"$ref": "#/$defs/outcomeCounts"}, + "selections": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["case_id","selected_attempt_id","outcome"], + "properties": { + "case_id": {"$ref": "#/$defs/caseId"}, + "selected_attempt_id": {"$ref": "#/$defs/attemptId"}, + "outcome": {"$ref": "#/$defs/outcome"} + } + } + } + } + }, + "runtime_fingerprints": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/sha256"}}, + "checksums": {"$ref": "#/$defs/file"}, + "validation": { + "type": "object", + "additionalProperties": false, + "required": ["policy","passed","checks"], + "properties": { + "policy": {"const": "collectivex-publisher-v1"}, + "passed": {"const": true}, + "checks": { + "type": "array", + "minItems": 1, + "uniqueItems": true, + "items": {"type": "string","pattern": "^[a-z0-9][a-z0-9.-]*$"} + } + } + } + }, + "$defs": { + "sha256": {"type": "string","pattern": "^[0-9a-f]{64}$"}, + "caseId": {"type": "string","pattern": "^cxcase-v1-[0-9a-f]{64}$"}, + "seriesId": {"type": "string","pattern": "^cxseries-v1-[0-9a-f]{64}$"}, + "evidenceId": {"type": "string","pattern": "^cxevidence-v1-[0-9a-f]{64}$"}, + "allocationId": {"type": "string","pattern": "^cxallocation-v1-[0-9a-f]{64}$"}, + "attemptId": {"type": "string","pattern": "^cxattempt-v1-[0-9a-f]{64}$"}, + "reason": {"oneOf": [{"type": "null"},{"type": "string","pattern": "^[a-z0-9][a-z0-9.-]*$","maxLength": 96}]}, + "outcome": {"enum": ["success","unsupported","failed","invalid","diagnostic"]}, + "outcomeCounts": { + "type": "object", + "additionalProperties": false, + "required": ["success","unsupported","failed","invalid","diagnostic"], + "properties": { + "success": {"type": "integer","minimum": 0}, + "unsupported": {"type": "integer","minimum": 0}, + 
"failed": {"type": "integer","minimum": 0}, + "invalid": {"type": "integer","minimum": 0}, + "diagnostic": {"type": "integer","minimum": 0} + } + }, + "file": { + "type": "object", + "additionalProperties": false, + "required": ["path","sha256","bytes"], + "properties": { + "path": {"type": "string","pattern": "^[A-Za-z0-9_.-]+(?:/[A-Za-z0-9_.-]+)*$"}, + "sha256": {"$ref": "#/$defs/sha256"}, + "bytes": {"type": "integer","minimum": 1} + } + }, + "source": { + "type": "object", + "additionalProperties": false, + "required": ["path","sha256","bytes","artifact_name"], + "properties": { + "path": {"type": "string","pattern": "^[A-Za-z0-9_.-]+(?:/[A-Za-z0-9_.-]+)*$"}, + "sha256": {"$ref": "#/$defs/sha256"}, + "bytes": {"type": "integer","minimum": 1}, + "artifact_name": { + "type": "string", + "pattern": "^cx(?:unsupported|shard-[a-z0-9][a-z0-9_.-]{0,127})-[1-9][0-9]*-[1-9][0-9]*$" + } + } + } + } +} diff --git a/experimental/CollectiveX/schemas/public-dataset-v1.schema.json b/experimental/CollectiveX/schemas/public-dataset-v1.schema.json new file mode 100644 index 000000000..cf5a5eed4 --- /dev/null +++ b/experimental/CollectiveX/schemas/public-dataset-v1.schema.json @@ -0,0 +1,880 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://inferencex.com/schemas/collectivex/public-dataset-v1.schema.json", + "title": "CollectiveX sanitized public dataset v1", + "type": "object", + "additionalProperties": false, + "required": [ + "format", + "schema_version", + "generated_at", + "source_bundle_ids", + "promotion", + "coverage", + "attempts", + "series", + "cohorts", + "rankings", + "recommendations", + "sensitivities" + ], + "properties": { + "format": {"const": "collectivex.public.v1"}, + "schema_version": {"const": 1}, + "generated_at": {"type": "string","format": "date-time"}, + "source_bundle_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/sha256"}}, + "promotion": { + "type": "object", + "additionalProperties": false, + 
"required": [ + "status", + "reason", + "matrix_id", + "allocation_ids", + "required_allocations", + "qualification_indices", + "requested_cases", + "terminal_cases", + "measured_cases", + "unsupported_cases", + "requested_points", + "terminal_points", + "measured_points", + "unsupported_points", + "policy" + ], + "properties": { + "status": {"enum": ["promoted","diagnostic","quarantined"]}, + "reason": {"$ref": "#/$defs/reason"}, + "matrix_id": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/sha256"}]}, + "allocation_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/allocationId"}}, + "required_allocations": {"const": 3}, + "qualification_indices": {"type":"array","minItems":0,"maxItems":3,"uniqueItems":true,"items":{"enum":[1,2,3]}}, + "requested_cases": {"type": "integer","minimum": 0}, + "terminal_cases": {"type": "integer","minimum": 0}, + "measured_cases": {"type": "integer","minimum": 0}, + "unsupported_cases": {"type": "integer","minimum": 0}, + "requested_points": {"type": "integer","minimum": 0}, + "terminal_points": {"type": "integer","minimum": 0}, + "measured_points": {"type": "integer","minimum": 0}, + "unsupported_points": {"type": "integer","minimum": 0}, + "policy": {"const": "collectivex-decision-grade-v1"} + } + }, + "coverage": {"type": "array","uniqueItems":true,"items": {"$ref": "#/$defs/coverage"}}, + "attempts": {"type": "array","items": {"$ref": "#/$defs/attempt"}}, + "series": {"type": "array","items": {"$ref": "#/$defs/series"}}, + "cohorts": {"type": "array","items": {"$ref": "#/$defs/cohort"}}, + "rankings": {"type": "array","items": {"$ref": "#/$defs/ranking"}}, + "recommendations": {"type": "array","items": {"$ref": "#/$defs/recommendation"}}, + "sensitivities": {"type": "array","items": {"$ref": "#/$defs/sensitivity"}} + }, + "$defs": { + "sha256": {"type": "string","pattern": "^[0-9a-f]{64}$"}, + "caseId": {"type": "string","pattern": "^cxcase-v1-[0-9a-f]{64}$"}, + "workloadId": {"type": "string","pattern": 
"^cxwork-v1-[0-9a-f]{64}$"}, + "seriesId": {"type": "string","pattern": "^cxseries-v1-[0-9a-f]{64}$"}, + "pointId": {"type": "string","pattern": "^cxpoint-v1-[0-9a-f]{64}$"}, + "evidenceId": {"type": "string","pattern": "^cxevidence-v1-[0-9a-f]{64}$"}, + "allocationId": {"type": "string","pattern": "^cxallocation-v1-[0-9a-f]{64}$"}, + "attemptId": {"type": "string","pattern": "^cxattempt-v1-[0-9a-f]{64}$"}, + "safeId": {"type": "string","pattern": "^[a-z0-9][a-z0-9_.-]*$","maxLength": 128}, + "communicationAxis": { + "type":"object", + "additionalProperties":false, + "required": [ + "alignment_contract","api_input_dtype","api_output_dtype","communication_format", + "conversion_boundary","padding_contract","quantization_origin","scale_dtype", + "scale_group_size","scale_layout" + ], + "properties": { + "alignment_contract":{"enum":["native-bf16-vector-alignment","hidden-block-128","native-fp8-vector-alignment","value-block-64"]}, + "api_input_dtype":{"enum":["bf16","fp8-e4m3fn-with-f32-scale","fp8-e4m3fnuz-with-f32-scale"]}, + "api_output_dtype":{"enum":["bf16","fp8-e4m3fn-with-f32-scale","fp8-e4m3fnuz-with-f32-scale"]}, + "communication_format":{"enum":["bf16","fp8-e4m3fn","fp8-e4m3fnuz","logfmt10"]}, + "conversion_boundary":{"enum":["none","before-dispatch-timing","inside-dispatch-timing","inside-combine-timing"]}, + "padding_contract":{"enum":["none","right-zero-pad-hidden-to-128","right-zero-pad-values-to-64"]}, + "quantization_origin":{"enum":["none","caller-prequantized","backend-fused","backend-internal","backend-internal-direct-cast"]}, + "scale_dtype":{"oneOf":[{"type":"null"},{"enum":["f32","implicit-logfmt10"]}]}, + "scale_group_size":{"oneOf":[{"type":"null"},{"enum":[64,128]}]}, + "scale_layout":{"enum":["none","per-token-hidden-block","dynamic-per-64-values"]} + } + }, + "precisionProfile": { + "enum": [ + "d-bf16.c-bf16", + "d-fp8-e4m3fn-b128-f32-prequantized.c-bf16", + "d-fp8-e4m3fnuz-b128-f32-prequantized.c-bf16", + 
"d-fp8-e4m3fn-b128-f32-fused.c-bf16", + "d-bf16.c-logfmt10-dynamic64", + "d-fp8-e4m3fn-b128-f32-fused.c-logfmt10-dynamic64", + "d-bf16.c-fp8-e4m3fn-direct-cast-noscale", + "d-fp8-e4m3fn-b128-f32-prequantized.c-fp8-e4m3fn-direct-cast-noscale", + "d-bf16.c-fp8-e4m3fnuz-direct-cast-noscale", + "d-fp8-e4m3fnuz-b128-f32-prequantized.c-fp8-e4m3fnuz-direct-cast-noscale" + ] + }, + "byteAccounting": { + "type":"object", + "additionalProperties":false, + "required":["accounting_contract","activation_data_bytes","scale_bytes","total_logical_bytes"], + "properties": { + "accounting_contract":{"const":"activation-data-plus-scales-v1"}, + "activation_data_bytes":{"type":"integer","minimum":0}, + "scale_bytes":{"type":"integer","minimum":0}, + "total_logical_bytes":{"type":"integer","minimum":0} + } + }, + "publicationTier": {"enum": ["official","comparable-experimental"]}, + "label": {"type": "string","minLength": 1,"maxLength": 160}, + "nullableLabel": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/label"}]}, + "reason": {"oneOf": [{"type": "null"},{"type": "string","pattern": "^[a-z0-9][a-z0-9.-]*$","maxLength": 96}]}, + "reasonId": {"type":"string","pattern":"^[a-z0-9][a-z0-9.-]*$","maxLength":96}, + "outcome": {"enum": ["success","unsupported","failed","invalid","diagnostic"]}, + "precisionAxisEvidence": { + "type":"object", + "additionalProperties":false, + "required":[ + "dequantized_semantics","encoded_payload_valid","max_abs_error","max_rel_error", + "passed","saturation_count","saturation_rate","scales_finite","scales_positive" + ], + "properties": { + "dequantized_semantics":{"type":"boolean"}, + "encoded_payload_valid":{"type":"boolean"}, + "max_abs_error":{"type":"number","minimum":0}, + "max_rel_error":{"type":"number","minimum":0}, + "passed":{"type":"boolean"}, + "saturation_count":{"type":"integer","minimum":0}, + "saturation_rate":{"type":"number","minimum":0,"maximum":1}, + "scales_finite":{"oneOf":[{"type":"null"},{"type":"boolean"}]}, + 
"scales_positive":{"oneOf":[{"type":"null"},{"type":"boolean"}]} + } + }, + "precisionEvidence": { + "type":"object", + "additionalProperties":false, + "required":["combine","dispatch","passed","profile_id"], + "properties": { + "combine":{"$ref":"#/$defs/precisionAxisEvidence"}, + "dispatch":{"$ref":"#/$defs/precisionAxisEvidence"}, + "passed":{"type":"boolean"}, + "profile_id":{"$ref":"#/$defs/precisionProfile"} + } + }, + "pointCorrectness": { + "type":"object", + "additionalProperties":false, + "required":["semantic_pass","precision"], + "properties": { + "semantic_pass":{"type":"boolean"}, + "precision":{"$ref":"#/$defs/precisionEvidence"} + } + }, + "pointStability": { + "type":"object", + "additionalProperties":false, + "allOf":[ + { + "if":{"properties":{"complete":{"const":true}},"required":["complete"]}, + "then":{"properties":{ + "qualification_indices":{"const":[1,2,3]}, + "p50_max_min_ratio":{"type":"number","minimum":1}, + "p99_max_min_ratio":{"type":"number","minimum":1} + }}, + "else":{"properties":{ + "p50_max_min_ratio":{"type":"null"}, + "p99_max_min_ratio":{"type":"null"}, + "stable_p50":{"const":false}, + "stable_p99":{"const":false} + }} + } + ], + "required":[ + "complete","qualification_indices","p50_max_min_ratio","p99_max_min_ratio", + "stable_p50","stable_p99" + ], + "properties": { + "complete":{"type":"boolean"}, + "qualification_indices":{"type":"array","minItems":1,"maxItems":3,"uniqueItems":true,"items":{"enum":[1,2,3]}}, + "p50_max_min_ratio":{"oneOf":[{"type":"null"},{"type":"number","minimum":1}]}, + "p99_max_min_ratio":{"oneOf":[{"type":"null"},{"type":"number","minimum":1}]}, + "stable_p50":{"type":"boolean"}, + "stable_p99":{"type":"boolean"} + } + }, + "coverageTopology": { + "type": "object", + "additionalProperties": false, + "required": [ + "ep_size", + "nodes", + "gpus_per_node", + "scale_up_domain", + "scope", + "scale_up_transport", + "scale_out_transport", + "transport", + "topology_class" + ], + "properties": { + 
"ep_size": {"type": "integer","minimum": 1}, + "nodes": {"type": "integer","minimum": 1}, + "gpus_per_node": {"type": "integer","minimum": 1}, + "scale_up_domain": {"type": "integer","minimum": 1}, + "scope": {"enum": ["scale-up","scale-out"]}, + "scale_up_transport": {"$ref": "#/$defs/safeId"}, + "scale_out_transport": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/safeId"}]}, + "transport": {"$ref": "#/$defs/safeId"}, + "topology_class": {"$ref": "#/$defs/safeId"} + } + }, + "coverageResource": { + "type":"object", + "additionalProperties":false, + "required":["mode","profile","comm_units_kind","configured_units"], + "properties": { + "mode":{"const":"fixed-profile"}, + "profile":{"oneOf":[{"type":"null"},{"$ref":"#/$defs/safeId"}]}, + "comm_units_kind":{"$ref":"#/$defs/nullableLabel"}, + "configured_units":{"oneOf":[{"type":"null"},{"type":"integer","minimum":1}]} + } + }, + "coveragePoint": { + "type":"object", + "additionalProperties":false, + "allOf":[ + { + "if":{"properties":{"terminal_status":{"const":"measured"}},"required":["terminal_status"]}, + "then":{"properties":{"point_id":{"$ref":"#/$defs/pointId"},"series_id":{"$ref":"#/$defs/seriesId"},"reason":{"type":"null"}}} + }, + { + "if":{"properties":{"terminal_status":{"const":"unsupported"}},"required":["terminal_status"]}, + "then":{"properties":{"point_id":{"type":"null"},"series_id":{"type":"null"},"reason":{"$ref":"#/$defs/reasonId"}}} + }, + { + "if":{"properties":{"terminal_status":{"enum":["failed","invalid"]}},"required":["terminal_status"]}, + "then":{"properties":{"reason":{"$ref":"#/$defs/reasonId"}}} + } + ], + "required":["point_id","series_id","tokens_per_rank","global_tokens","terminal_status","reason"], + "properties": { + "point_id":{"oneOf":[{"type":"null"},{"$ref":"#/$defs/pointId"}]}, + "series_id":{"oneOf":[{"type":"null"},{"$ref":"#/$defs/seriesId"}]}, + "tokens_per_rank":{"type":"integer","minimum":1}, + "global_tokens":{"type":"integer","minimum":1}, + 
"terminal_status":{"enum":["measured","unsupported","failed","invalid","diagnostic"]}, + "reason":{"$ref":"#/$defs/reason"} + } + }, + "coverage": { + "type": "object", + "additionalProperties": false, + "required": [ + "case_id", + "label", + "required", + "sku", + "suite", + "workload", + "publication_tier", + "backend", + "backend_generation", + "mode", + "phase", + "routing", + "eplb", + "precision_profile", + "dispatch_precision", + "combine_precision", + "resource", + "topology", + "points", + "disposition", + "selected_attempt_id", + "outcome", + "failure_mode", + "reason", + "attempt_ids" + ], + "properties": { + "case_id": {"$ref": "#/$defs/caseId"}, + "label": {"$ref": "#/$defs/label"}, + "required": {"type": "boolean"}, + "sku": {"$ref": "#/$defs/safeId"}, + "suite": {"$ref":"#/$defs/safeId"}, + "workload": {"$ref":"#/$defs/safeId"}, + "publication_tier": {"$ref":"#/$defs/publicationTier"}, + "backend": {"$ref": "#/$defs/safeId"}, + "backend_generation": {"$ref":"#/$defs/nullableLabel"}, + "mode": {"enum": ["normal","low-latency"]}, + "phase": {"enum": ["decode","prefill"]}, + "routing": {"enum":["uniform","zipf"]}, + "eplb": {"type":"boolean"}, + "precision_profile": {"$ref":"#/$defs/precisionProfile"}, + "dispatch_precision": {"$ref":"#/$defs/communicationAxis"}, + "combine_precision": {"$ref":"#/$defs/communicationAxis"}, + "resource": {"$ref":"#/$defs/coverageResource"}, + "topology": {"$ref": "#/$defs/coverageTopology"}, + "points": {"type":"array","minItems":1,"uniqueItems":true,"items":{"$ref":"#/$defs/coveragePoint"}}, + "disposition": {"enum": ["runnable","unsupported"]}, + "selected_attempt_id": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/attemptId"}]}, + "outcome": {"$ref": "#/$defs/outcome"}, + "failure_mode": {"$ref": "#/$defs/reason"}, + "reason": {"$ref": "#/$defs/reason"}, + "attempt_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/attemptId"}} + } + }, + "attempt": { + "type": "object", + "additionalProperties": 
false, + "required": [ + "attempt_id", + "evidence", + "case_id", + "allocation_id", + "run_id", + "run_attempt", + "qualification_index", + "attempt_index", + "selected", + "outcome", + "failure_mode", + "reason", + "series_id", + "completed_at" + ], + "properties": { + "attempt_id": {"$ref": "#/$defs/attemptId"}, + "evidence": { + "type": "array", + "uniqueItems": true, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["evidence_id","point_id"], + "properties": {"evidence_id": {"$ref": "#/$defs/evidenceId"},"point_id": {"$ref": "#/$defs/pointId"}} + } + }, + "case_id": {"$ref": "#/$defs/caseId"}, + "allocation_id": {"$ref": "#/$defs/allocationId"}, + "run_id": {"type": "string","pattern": "^[1-9][0-9]*$"}, + "run_attempt": {"type": "integer","minimum": 1}, + "qualification_index": {"type":"integer","minimum":1,"maximum":3}, + "attempt_index": {"type": "integer","minimum": 1}, + "selected": {"type": "boolean"}, + "outcome": {"$ref": "#/$defs/outcome"}, + "failure_mode": {"$ref": "#/$defs/reason"}, + "reason": {"$ref": "#/$defs/reason"}, + "series_id": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/seriesId"}]}, + "completed_at": {"oneOf": [{"type": "null"},{"type": "string","format": "date-time"}]} + } + }, + "eligibility": { + "type": "object", + "additionalProperties": false, + "allOf": [{ + "if": {"properties": {"decision_grade": {"const": true}},"required": ["decision_grade"]}, + "then": {"properties": {"reasons": {"maxItems": 0}}}, + "else": {"properties": {"reasons": {"minItems": 1}}} + }], + "required": [ + "decision_grade", + "allocation_ids", + "complete", + "correct", + "measured_roundtrip_p99", + "stable_p50", + "stable_p99", + "stable_ordering", + "p50_max_min_ratio", + "p99_max_min_ratio", + "reasons" + ], + "properties": { + "decision_grade": {"type": "boolean"}, + "allocation_ids": {"type": "array","uniqueItems": true,"items": {"$ref": "#/$defs/allocationId"}}, + "complete": {"type": "boolean"}, + "correct": {"type": 
"boolean"}, + "measured_roundtrip_p99": {"type": "boolean"}, + "stable_p50": {"type": "boolean"}, + "stable_p99": {"type": "boolean"}, + "stable_ordering": {"type": "boolean"}, + "p50_max_min_ratio": {"oneOf": [{"type": "null"},{"type": "number","minimum": 1}]}, + "p99_max_min_ratio": {"oneOf": [{"type": "null"},{"type": "number","minimum": 1}]}, + "reasons": { + "type": "array", + "uniqueItems": true, + "items": {"type": "string","pattern": "^[a-z0-9][a-z0-9.-]*$","maxLength": 96} + } + } + }, + "percentiles": { + "type": "object", + "additionalProperties": false, + "required": ["p50","p90","p95","p99"], + "properties": { + "p50": {"type": "number","exclusiveMinimum": 0}, + "p90": {"type": "number","exclusiveMinimum": 0}, + "p95": {"type": "number","exclusiveMinimum": 0}, + "p99": {"type": "number","exclusiveMinimum": 0} + } + }, + "component": { + "type": "object", + "additionalProperties": false, + "allOf":[ + { + "if":{"properties":{"origin":{"const":"measured"}},"required":["origin"]}, + "then":{"properties":{"sample_count":{"const":512}}} + }, + { + "if":{"properties":{"origin":{"const":"derived"}},"required":["origin"]}, + "then":{"properties":{"sample_count":{"type":"null"}}} + } + ], + "required": ["origin","latency_us","byte_provenance","activation_data_rate_gbps_at_latency_percentile","total_logical_data_rate_gbps_at_latency_percentile","sample_count"], + "properties": { + "origin": {"enum": ["measured","derived"]}, + "latency_us": {"$ref": "#/$defs/percentiles"}, + "byte_provenance": {"$ref":"#/$defs/byteAccounting"}, + "activation_data_rate_gbps_at_latency_percentile": {"oneOf":[{"type":"null"},{"$ref":"#/$defs/percentiles"}]}, + "total_logical_data_rate_gbps_at_latency_percentile": {"oneOf":[{"type":"null"},{"$ref":"#/$defs/percentiles"}]}, + "sample_count": {"oneOf": [{"type": "null"},{"type": "integer","minimum": 1}]} + } + }, + "nullableComponent": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/component"}]}, + "trialDiagnosticComponent": { + 
"type":"object", + "additionalProperties":false, + "required":["drift_flagged","first_last_median_ratio","outlier_flagged","robust_outlier_fraction","trial_count"], + "properties": { + "drift_flagged":{"type":"boolean"}, + "first_last_median_ratio":{"type":"number","minimum":1}, + "outlier_flagged":{"type":"boolean"}, + "robust_outlier_fraction":{"type":"number","minimum":0,"maximum":1}, + "trial_count":{"const":192} + } + }, + "nullableTrialDiagnosticComponent": {"oneOf":[{"type":"null"},{"$ref":"#/$defs/trialDiagnosticComponent"}]}, + "trialDiagnostics": { + "type":"object", + "additionalProperties":false, + "required":["components","flagged","reasons"], + "properties": { + "components": { + "type":"object", + "additionalProperties":false, + "required":["dispatch","stage","combine","roundtrip"], + "properties": { + "dispatch":{"$ref":"#/$defs/nullableTrialDiagnosticComponent"}, + "stage":{"$ref":"#/$defs/nullableTrialDiagnosticComponent"}, + "combine":{"$ref":"#/$defs/nullableTrialDiagnosticComponent"}, + "roundtrip":{"$ref":"#/$defs/nullableTrialDiagnosticComponent"} + } + }, + "flagged":{"type":"boolean"}, + "reasons":{"type":"array","maxItems":2,"uniqueItems":true,"items":{"enum":["trial-drift","trial-outliers"]}} + } + }, + "point": { + "type": "object", + "additionalProperties": false, + "required": [ + "point_id", + "tokens_per_rank", + "global_tokens", + "anomalies", + "correctness", + "stability", + "trial_diagnostics", + "routing", + "components", + "roundtrip_token_rate_at_latency_percentile", + "evidence_ids" + ], + "properties": { + "point_id": {"$ref": "#/$defs/pointId"}, + "tokens_per_rank": {"type": "integer","minimum": 1}, + "global_tokens": {"type": "integer","minimum": 1}, + "anomalies": {"type":"array","maxItems":16,"uniqueItems":true,"items":{"$ref":"#/$defs/reasonId"}}, + "correctness": {"$ref":"#/$defs/pointCorrectness"}, + "stability": {"$ref":"#/$defs/pointStability"}, + "trial_diagnostics":{"$ref":"#/$defs/trialDiagnostics"}, + "routing": 
{ + "type": "object", + "additionalProperties": false, + "required": [ + "fanout_mean", + "recv_tokens_max", + "expert_load_cv", + "payload_rank_cv", + "hotspot_ratio", + "empty_expert_count", + "empty_rank_count", + "routed_copies" + ], + "properties": { + "fanout_mean": {"type": "number","minimum": 0}, + "recv_tokens_max": {"type": "integer","minimum": 0}, + "expert_load_cv": {"type": "number","minimum": 0}, + "payload_rank_cv": {"type": "number","minimum": 0}, + "hotspot_ratio": {"type": "number","minimum": 0}, + "empty_expert_count": {"type": "integer","minimum": 0}, + "empty_rank_count": {"type": "integer","minimum": 0}, + "routed_copies": {"type": "integer","minimum": 1} + } + }, + "components": { + "type": "object", + "additionalProperties": false, + "required": ["dispatch","stage","combine","roundtrip","isolated_sum"], + "properties": { + "dispatch": {"$ref": "#/$defs/nullableComponent"}, + "stage": {"$ref":"#/$defs/nullableComponent"}, + "combine": {"$ref": "#/$defs/nullableComponent"}, + "roundtrip": {"$ref": "#/$defs/nullableComponent"}, + "isolated_sum": {"$ref": "#/$defs/nullableComponent"} + } + }, + "roundtrip_token_rate_at_latency_percentile": {"$ref": "#/$defs/percentiles"}, + "evidence_ids": {"type": "array","minItems":1,"maxItems":3,"uniqueItems": true,"items": {"$ref": "#/$defs/evidenceId"}} + } + }, + "series": { + "type": "object", + "additionalProperties": false, + "required": [ + "series_id", + "label", + "status", + "case_ids", + "allocation_ids", + "model", + "suite", + "mode", + "phase", + "publication_tier", + "backend", + "build", + "system", + "workload", + "eplb", + "resource", + "measurement", + "points", + "eligibility" + ], + "properties": { + "series_id": {"$ref": "#/$defs/seriesId"}, + "label": {"$ref": "#/$defs/label"}, + "status": {"enum": ["decision-grade","diagnostic"]}, + "case_ids": {"type": "array","minItems": 1,"uniqueItems": true,"items": {"$ref": "#/$defs/caseId"}}, + "allocation_ids": {"type": "array","minItems": 
1,"uniqueItems": true,"items": {"$ref": "#/$defs/allocationId"}}, + "model": {"$ref": "#/$defs/safeId"}, + "suite": {"$ref": "#/$defs/safeId"}, + "mode": {"enum": ["normal","low-latency"]}, + "phase": {"enum": ["decode","prefill"]}, + "publication_tier": {"$ref": "#/$defs/publicationTier"}, + "backend": { + "type": "object", + "additionalProperties": false, + "required": ["id","label","role","generation","version"], + "properties": { + "id": {"$ref": "#/$defs/safeId"}, + "label": {"$ref": "#/$defs/label"}, + "role": {"enum": ["library","reference"]}, + "generation": {"$ref": "#/$defs/nullableLabel"}, + "version": {"$ref": "#/$defs/nullableLabel"} + } + }, + "build": { + "type": "object", + "additionalProperties": false, + "required": ["implementation_contract_sha256","public_config_sha256","routing_control_sha256","runtime_fingerprint_sha256","image_digest","source_sha","squash_sha256"], + "properties": { + "implementation_contract_sha256": {"$ref": "#/$defs/sha256"}, + "public_config_sha256": {"$ref": "#/$defs/sha256"}, + "routing_control_sha256": {"$ref": "#/$defs/sha256"}, + "runtime_fingerprint_sha256": {"$ref": "#/$defs/sha256"}, + "image_digest": {"type": "string","pattern": "^sha256:[0-9a-f]{64}$"}, + "source_sha": {"type": "string","pattern": "^[0-9a-f]{40,64}$"}, + "squash_sha256": {"$ref": "#/$defs/sha256"} + } + }, + "system": { + "type": "object", + "additionalProperties": false, + "required": ["sku","label","vendor","topology_class","transport","scale_up_transport","scale_out_transport","scope","nodes","gpus_per_node","scale_up_domain","world_size","ep_size","placement"], + "properties": { + "sku": {"$ref": "#/$defs/safeId"}, + "label": {"$ref": "#/$defs/label"}, + "vendor": {"enum": ["nvidia","amd"]}, + "topology_class": {"$ref": "#/$defs/safeId"}, + "transport": {"$ref": "#/$defs/safeId"}, + "scale_up_transport": {"$ref": "#/$defs/safeId"}, + "scale_out_transport": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/safeId"}]}, + "scope": {"enum": 
["scale-up","scale-out"]}, + "nodes": {"type": "integer","minimum": 1}, + "gpus_per_node": {"type": "integer","minimum": 1}, + "scale_up_domain": {"type": "integer","minimum": 1}, + "world_size": {"type": "integer","minimum": 1}, + "ep_size": {"type": "integer","minimum": 1}, + "placement": {"enum": ["packed"]} + } + }, + "workload": { + "type": "object", + "additionalProperties": false, + "required": [ + "workload_id", + "hidden", + "top_k", + "experts", + "routing", + "eplb", + "precision_profile", + "dispatch_precision", + "combine_precision", + "activation_profile" + ], + "properties": { + "workload_id": {"$ref": "#/$defs/workloadId"}, + "hidden": {"type": "integer","minimum": 1}, + "top_k": {"type": "integer","minimum": 1}, + "experts": {"type": "integer","minimum": 1}, + "routing": {"enum": ["uniform","zipf"]}, + "eplb": {"type": "boolean"}, + "precision_profile": {"$ref":"#/$defs/precisionProfile"}, + "dispatch_precision": {"$ref":"#/$defs/communicationAxis"}, + "combine_precision": {"$ref":"#/$defs/communicationAxis"}, + "activation_profile": {"const": "canonical-counter-source-v4"} + } + }, + "eplb": { + "type": "object", + "additionalProperties": false, + "allOf":[ + { + "if":{"properties":{"enabled":{"const":true}},"required":["enabled"]}, + "then":{"properties":{ + "calibration_workload_id":{"$ref":"#/$defs/workloadId"}, + "calibration_trace_sha256":{"$ref":"#/$defs/sha256"}, + "calibration_window":{"const":"collectivex-eplb-calibration-window-v1"}, + "calibration_token_offset":{"type":"integer","minimum":0} + }}, + "else":{"properties":{ + "calibration_workload_id":{"type":"null"}, + "calibration_trace_sha256":{"type":"null"}, + "calibration_window":{"type":"null"}, + "calibration_token_offset":{"type":"null"} + }} + } + ], + "required": [ + "enabled", + "calibration_workload_id", + "calibration_trace_sha256", + "calibration_window", + "calibration_token_offset", + "planner", + "mapping_sha256", + "logical_experts", + "physical_experts", + 
"redundant_experts", + "reference_tokens_per_rank", + "replicated_experts", + "max_replicas", + "imbalance_before", + "imbalance_after" + ], + "properties": { + "enabled": {"type": "boolean"}, + "calibration_workload_id": {"oneOf":[{"type":"null"},{"$ref":"#/$defs/workloadId"}]}, + "calibration_trace_sha256": {"oneOf":[{"type":"null"},{"$ref":"#/$defs/sha256"}]}, + "calibration_window": {"oneOf":[{"type":"null"},{"const":"collectivex-eplb-calibration-window-v1"}]}, + "calibration_token_offset": {"oneOf":[{"type":"null"},{"type":"integer","minimum":0}]}, + "planner": {"$ref": "#/$defs/nullableLabel"}, + "mapping_sha256": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/sha256"}]}, + "logical_experts": {"type": "integer","minimum": 1}, + "physical_experts": {"type": "integer","minimum": 1}, + "redundant_experts": {"type": "integer","minimum": 0}, + "reference_tokens_per_rank": {"oneOf": [{"type": "null"},{"type": "integer","minimum": 1}]}, + "replicated_experts": {"type": "integer","minimum": 0}, + "max_replicas": {"oneOf": [{"type": "null"},{"type": "integer","minimum": 0}]}, + "imbalance_before": {"oneOf": [{"type": "null"},{"type": "number","minimum": 0}]}, + "imbalance_after": {"oneOf": [{"type": "null"},{"type": "number","minimum": 0}]} + } + }, + "resource": { + "type": "object", + "additionalProperties": false, + "required": ["mode","profile","comm_units_kind","configured_units"], + "properties": { + "mode": {"const": "fixed-profile"}, + "profile": {"$ref": "#/$defs/safeId"}, + "comm_units_kind": {"$ref": "#/$defs/nullableLabel"}, + "configured_units": {"oneOf": [{"type": "null"},{"type": "integer","minimum": 1}]} + } + }, + "measurement": { + "type": "object", + "additionalProperties": false, + "required": [ + "contract", + "component_order_contract", + "combine_semantics", + "payload_unit", + "sampling_contract", + "iters", + "trials", + "warmups", + "samples_per_component", + "qualification_indices", + "headline_component", + "headline_percentile" + ], + 
"properties": { + "contract": {"enum": ["layout-and-dispatch-v1","expert-packed-weighted-combine-v1"]}, + "component_order_contract": {"const":"qualification-hash-rotated-components-v1"}, + "combine_semantics": {"enum": ["activation-only","gate-weighted"]}, + "payload_unit": {"enum": ["token-rank","token-expert"]}, + "sampling_contract": {"const": "fixed-512-v1"}, + "iters": {"const": 8}, + "trials": {"const": 64}, + "warmups": {"const": 32}, + "samples_per_component": {"const": 512}, + "qualification_indices": {"type":"array","minItems":1,"maxItems":3,"uniqueItems":true,"items":{"enum":[1,2,3]}}, + "headline_component": {"const": "roundtrip"}, + "headline_percentile": {"const": "p99"} + } + }, + "points": {"type": "array","minItems": 1,"items": {"$ref": "#/$defs/point"}}, + "eligibility": {"$ref": "#/$defs/eligibility"} + } + }, + "cohort": { + "type": "object", + "additionalProperties": false, + "required": [ + "cohort_id", + "kind", + "label", + "description", + "series_ids", + "controlled_factors", + "varying_factors", + "publication_tier", + "eligibility" + ], + "properties": { + "cohort_id": {"type": "string","pattern": "^cxcohort-v1-[0-9a-f]{64}$"}, + "kind": {"enum": ["library","chip","system","routing","dispatch-precision","combine-precision","precision-pair"]}, + "label": {"$ref": "#/$defs/label"}, + "description": {"$ref": "#/$defs/label"}, + "series_ids": {"type": "array","minItems": 2,"uniqueItems": true,"items": {"$ref": "#/$defs/seriesId"}}, + "controlled_factors": {"type": "array","minItems": 1,"uniqueItems": true,"items": {"$ref": "#/$defs/safeId"}}, + "varying_factors": {"type": "array","minItems": 1,"uniqueItems": true,"items": {"$ref": "#/$defs/safeId"}}, + "publication_tier": {"$ref": "#/$defs/publicationTier"}, + "eligibility": {"$ref": "#/$defs/eligibility"} + } + }, + "metric": { + "type": "object", + "additionalProperties": false, + "required": ["operation","statistic","measure","objective","tokens_per_rank","phase"], + "properties": { + 
"operation": {"const": "roundtrip"}, + "statistic": {"enum": ["p50","p99"]}, + "measure": {"enum": ["latency_us","activation_data_rate_gbps_at_latency_percentile","total_logical_data_rate_gbps_at_latency_percentile"]}, + "objective": {"enum": ["min","max"]}, + "tokens_per_rank": {"type": "integer","minimum": 1}, + "phase": {"enum": ["decode","prefill"]} + } + }, + "ranking": { + "type": "object", + "additionalProperties": false, + "required": ["ranking_id","cohort_id","label","metric","entries","publication_tier","eligibility"], + "properties": { + "ranking_id": {"type": "string","pattern": "^cxranking-v1-[0-9a-f]{64}$"}, + "cohort_id": {"type": "string","pattern": "^cxcohort-v1-[0-9a-f]{64}$"}, + "label": {"$ref": "#/$defs/label"}, + "metric": {"$ref": "#/$defs/metric"}, + "entries": { + "type": "array", + "minItems": 2, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["rank","series_id","point_id","value","unit"], + "properties": { + "rank": {"type": "integer","minimum": 1}, + "series_id": {"$ref": "#/$defs/seriesId"}, + "point_id": {"$ref": "#/$defs/pointId"}, + "value": {"type": "number","exclusiveMinimum": 0}, + "unit": {"enum": ["us","GB/s"]} + } + } + }, + "publication_tier": {"$ref": "#/$defs/publicationTier"}, + "eligibility": {"$ref": "#/$defs/eligibility"} + } + }, + "recommendation": { + "type": "object", + "additionalProperties": false, + "required": [ + "recommendation_id", + "cohort_id", + "label", + "objective", + "series_id", + "point_id", + "value", + "unit", + "rationale", + "publication_tier", + "eligibility" + ], + "properties": { + "recommendation_id": {"type": "string","pattern": "^cxrecommendation-v1-[0-9a-f]{64}$"}, + "cohort_id": {"type": "string","pattern": "^cxcohort-v1-[0-9a-f]{64}$"}, + "label": {"$ref": "#/$defs/label"}, + "objective": {"enum": 
["min-p50-latency","min-p99-latency","max-activation-data-rate-at-p50-latency","max-activation-data-rate-at-p99-latency","max-total-logical-data-rate-at-p50-latency","max-total-logical-data-rate-at-p99-latency"]}, + "series_id": {"$ref": "#/$defs/seriesId"}, + "point_id": {"$ref": "#/$defs/pointId"}, + "value": {"type": "number","exclusiveMinimum": 0}, + "unit": {"enum": ["us","GB/s"]}, + "rationale": {"$ref": "#/$defs/label"}, + "publication_tier": {"const": "official"}, + "eligibility": {"$ref": "#/$defs/eligibility"} + } + }, + "sensitivity": { + "type": "object", + "additionalProperties": false, + "required": [ + "sensitivity_id", + "cohort_id", + "label", + "baseline_series_id", + "candidate_series_id", + "metric", + "signed_change_ratio", + "publication_tier", + "eligibility" + ], + "properties": { + "sensitivity_id": {"type": "string","pattern": "^cxsensitivity-v1-[0-9a-f]{64}$"}, + "cohort_id": {"type": "string","pattern": "^cxcohort-v1-[0-9a-f]{64}$"}, + "label": {"$ref": "#/$defs/label"}, + "baseline_series_id": {"$ref": "#/$defs/seriesId"}, + "candidate_series_id": {"$ref": "#/$defs/seriesId"}, + "metric": {"$ref": "#/$defs/metric"}, + "signed_change_ratio": {"type": "number"}, + "publication_tier": {"$ref": "#/$defs/publicationTier"}, + "eligibility": {"$ref": "#/$defs/eligibility"} + } + } + } +} diff --git a/experimental/CollectiveX/schemas/raw-case-v1.schema.json b/experimental/CollectiveX/schemas/raw-case-v1.schema.json new file mode 100644 index 000000000..d5c8a73a5 --- /dev/null +++ b/experimental/CollectiveX/schemas/raw-case-v1.schema.json @@ -0,0 +1,1381 @@ +{ + "$id": "https://inferencex.com/schemas/collectivex/raw-case-v1.schema.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$defs": { + "deepep_v2_jit_cubin": { + "additionalProperties": false, + "properties": { + "cache_key": { + "pattern":"^kernel\\.[A-Za-z0-9_+-]+\\.[0-9a-f]{32}$", + "type":"string" + }, + "cubin_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"}, 
+ "sass_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "source_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"} + }, + "required": ["cache_key","cubin_sha256","sass_sha256","source_sha256"], + "type": "object" + }, + "hybrid_jit_rank_artifact": { + "additionalProperties": false, + "properties": { + "bytes": {"minimum":1,"type":"integer"}, + "rank": {"minimum":0,"type":"integer"}, + "sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"} + }, + "required": ["bytes","rank","sha256"], + "type": "object" + }, + "hybrid_realized_config": { + "additionalProperties": false, + "properties": { + "backward_combine_api": {"type":"boolean"}, + "device_side_sync_combine_api": {"type":"boolean"}, + "device_side_sync_dispatch_api": {"type":"boolean"}, + "forward_dispatch_api": {"type":"boolean"}, + "hidden_dim": {"minimum":1,"type":"integer"}, + "max_num_of_tokens_per_rank": {"minimum":1,"type":"integer"}, + "num_of_additional_in_flight_s2g_combine_api": {"minimum":0,"type":"integer"}, + "num_of_additional_in_flight_s2g_dispatch_api": {"minimum":0,"type":"integer"}, + "num_of_blocks_combine_api": {"minimum":0,"type":"integer"}, + "num_of_blocks_dispatch_api": {"minimum":0,"type":"integer"}, + "num_of_blocks_permute": {"minimum":0,"type":"integer"}, + "num_of_blocks_preprocessing_api": {"minimum":0,"type":"integer"}, + "num_of_blocks_unpermute": {"minimum":0,"type":"integer"}, + "num_of_experts_per_rank": {"minimum":1,"type":"integer"}, + "num_of_in_flight_s2g_dispatch_api": {"minimum":0,"type":"integer"}, + "num_of_in_flight_s2g_permute_block_dispatch_api": {"minimum":0,"type":"integer"}, + "num_of_nodes": {"minimum":1,"type":"integer"}, + "num_of_ranks_per_node": {"minimum":1,"type":"integer"}, + "num_of_stages_dispatch_api": {"minimum":0,"type":"integer"}, + "num_of_stages_g2s_combine_api": {"minimum":0,"type":"integer"}, + "num_of_stages_permute_block_dispatch_api": {"minimum":0,"type":"integer"}, + "num_of_stages_s2g_combine_api": {"minimum":0,"type":"integer"}, 
+ "num_of_threads_per_block_preprocessing_api": {"minimum":0,"type":"integer"}, + "num_of_tokens_per_chunk_combine_api": {"minimum":0,"type":"integer"}, + "num_of_tokens_per_chunk_dispatch_api": {"minimum":0,"type":"integer"}, + "num_of_tokens_per_chunk_preprocessing_api": {"minimum":0,"type":"integer"}, + "num_of_tokens_per_group_combine_api": {"minimum":0,"type":"integer"}, + "pad_multiple": {"minimum":0,"type":"integer"}, + "token_data_type": {"enum":["UINT8","UINT16"]} + }, + "required": [ + "backward_combine_api","device_side_sync_combine_api","device_side_sync_dispatch_api", + "forward_dispatch_api","hidden_dim","max_num_of_tokens_per_rank", + "num_of_additional_in_flight_s2g_combine_api", + "num_of_additional_in_flight_s2g_dispatch_api","num_of_blocks_combine_api", + "num_of_blocks_dispatch_api","num_of_blocks_permute","num_of_blocks_preprocessing_api", + "num_of_blocks_unpermute","num_of_experts_per_rank", + "num_of_in_flight_s2g_dispatch_api","num_of_in_flight_s2g_permute_block_dispatch_api", + "num_of_nodes","num_of_ranks_per_node","num_of_stages_dispatch_api", + "num_of_stages_g2s_combine_api","num_of_stages_permute_block_dispatch_api", + "num_of_stages_s2g_combine_api","num_of_threads_per_block_preprocessing_api", + "num_of_tokens_per_chunk_combine_api","num_of_tokens_per_chunk_dispatch_api", + "num_of_tokens_per_chunk_preprocessing_api","num_of_tokens_per_group_combine_api", + "pad_multiple","token_data_type" + ], + "type": "object" + }, + "nullable_sha256": {"oneOf":[{"type":"null"},{"pattern":"^[0-9a-f]{64}$","type":"string"}]}, + "precision_profile": { + "enum": [ + "d-bf16.c-bf16", + "d-fp8-e4m3fn-b128-f32-prequantized.c-bf16", + "d-fp8-e4m3fnuz-b128-f32-prequantized.c-bf16", + "d-fp8-e4m3fn-b128-f32-fused.c-bf16", + "d-bf16.c-logfmt10-dynamic64", + "d-fp8-e4m3fn-b128-f32-fused.c-logfmt10-dynamic64", + "d-bf16.c-fp8-e4m3fn-direct-cast-noscale", + "d-fp8-e4m3fn-b128-f32-prequantized.c-fp8-e4m3fn-direct-cast-noscale", + 
"d-bf16.c-fp8-e4m3fnuz-direct-cast-noscale", + "d-fp8-e4m3fnuz-b128-f32-prequantized.c-fp8-e4m3fnuz-direct-cast-noscale" + ] + }, + "communication_axis": { + "additionalProperties": false, + "properties": { + "alignment_contract": {"enum":["native-bf16-vector-alignment","hidden-block-128","native-fp8-vector-alignment","value-block-64"]}, + "api_input_dtype": {"enum":["bf16","fp8-e4m3fn-with-f32-scale","fp8-e4m3fnuz-with-f32-scale"]}, + "api_output_dtype": {"enum":["bf16","fp8-e4m3fn-with-f32-scale","fp8-e4m3fnuz-with-f32-scale"]}, + "communication_format": {"enum":["bf16","fp8-e4m3fn","fp8-e4m3fnuz","logfmt10"]}, + "conversion_boundary": {"enum":["none","before-dispatch-timing","inside-dispatch-timing","inside-combine-timing"]}, + "padding_contract": {"enum":["none","right-zero-pad-hidden-to-128","right-zero-pad-values-to-64"]}, + "quantization_origin": {"enum":["none","caller-prequantized","backend-fused","backend-internal","backend-internal-direct-cast"]}, + "scale_dtype": {"oneOf":[{"type":"null"},{"enum":["f32","implicit-logfmt10"]}]}, + "scale_group_size": {"oneOf":[{"type":"null"},{"enum":[64,128]}]}, + "scale_layout": {"enum":["none","per-token-hidden-block","dynamic-per-64-values"]} + }, + "required": [ + "alignment_contract","api_input_dtype","api_output_dtype","communication_format", + "conversion_boundary","padding_contract","quantization_origin","scale_dtype", + "scale_group_size","scale_layout" + ], + "type": "object" + }, + "communication_precision": { + "additionalProperties": false, + "properties": { + "combine": {"$ref":"#/$defs/communication_axis"}, + "dispatch": {"$ref":"#/$defs/communication_axis"}, + "modes": { + "items":{"enum":["normal","low-latency"]}, + "minItems":1, + "type":"array", + "uniqueItems":true + }, + "profile_id": {"$ref":"#/$defs/precision_profile"} + }, + "required": ["combine","dispatch","modes","profile_id"], + "type": "object" + }, + "byte_accounting": { + "additionalProperties": false, + "properties": { + 
"accounting_contract": {"const":"activation-data-plus-scales-v1"}, + "activation_data_bytes": {"minimum":0,"type":"integer"}, + "scale_bytes": {"minimum":0,"type":"integer"}, + "total_logical_bytes": {"minimum":0,"type":"integer"} + }, + "required": [ + "accounting_contract","activation_data_bytes","scale_bytes","total_logical_bytes" + ], + "type": "object" + }, + "precision_axis_evidence": { + "additionalProperties": false, + "properties": { + "dequantized_semantics": {"type":"boolean"}, + "encoded_payload_valid": {"type":"boolean"}, + "max_abs_error": {"minimum":0,"type":"number"}, + "max_rel_error": {"minimum":0,"type":"number"}, + "passed": {"type":"boolean"}, + "saturation_count": {"minimum":0,"type":"integer"}, + "saturation_rate": {"maximum":1,"minimum":0,"type":"number"}, + "scales_finite": {"oneOf":[{"type":"null"},{"type":"boolean"}]}, + "scales_positive": {"oneOf":[{"type":"null"},{"type":"boolean"}]} + }, + "required": [ + "dequantized_semantics","encoded_payload_valid","max_abs_error","max_rel_error", + "passed","saturation_count","saturation_rate","scales_finite","scales_positive" + ], + "type":"object" + }, + "precision_evidence": { + "additionalProperties": false, + "properties": { + "combine": {"$ref":"#/$defs/precision_axis_evidence"}, + "dispatch": {"$ref":"#/$defs/precision_axis_evidence"}, + "passed": {"type":"boolean"}, + "profile_id": {"$ref":"#/$defs/precision_profile"} + }, + "required": ["combine","dispatch","passed","profile_id"], + "type":"object" + }, + "case_profile": { + "additionalProperties": false, + "allOf": [ + { + "if": {"properties":{"mode":{"const":"normal"}},"required":["mode"]}, + "then": {"properties": { + "combine_semantics":{"const":"activation-only"}, + "component_order_contract":{"const":"qualification-hash-rotated-components-v1"}, + "contract":{"const":"layout-and-dispatch-v1"}, + "correctness_scope":{"const":"dispatch-metadata-and-transformed-combine"}, + "oracle_contract":{"const":"expert-specific-transform-v1"}, + 
"payload_unit":{"const":"token-rank"} + }} + }, + { + "if": {"properties":{"mode":{"const":"low-latency"}},"required":["mode"]}, + "then": {"properties": { + "combine_semantics":{"const":"gate-weighted"}, + "component_order_contract":{"const":"qualification-hash-rotated-components-v1"}, + "contract":{"const":"expert-packed-weighted-combine-v1"}, + "correctness_scope":{"const":"expert-assignment-and-weighted-combine"}, + "oracle_contract":{"const":"expert-assignment-transform-v1"}, + "payload_unit":{"const":"token-expert"} + }} + } + ], + "properties": { + "activation_generator": {"const":"collectivex-activation-counter-v4"}, + "activation_profile": {"const":"canonical-counter-source-v4"}, + "combine_dtype": {"const":"bf16"}, + "combine_quant_mode": {"const":"none"}, + "combine_semantics": {"enum":["activation-only","gate-weighted"]}, + "communication_precision": {"$ref":"#/$defs/communication_precision"}, + "component_order_contract": {"const":"qualification-hash-rotated-components-v1"}, + "conditioning_contract": {"const":"fixed-phase-ramp-8-roundtrips-v1"}, + "contract": {"enum":["layout-and-dispatch-v1","expert-packed-weighted-combine-v1"]}, + "correctness_scope": {"enum":["dispatch-metadata-and-transformed-combine","expert-assignment-and-weighted-combine"]}, + "dtype": {"const":"bf16"}, + "eplb_planner": {"const":"greedy-rank-major-v1"}, + "eplb_redundant_experts": {"const":32}, + "eplb_reference_tokens_per_rank": {"const":2048}, + "mode": {"enum":["normal","low-latency"]}, + "oracle_contract": {"enum":["expert-specific-transform-v1","expert-assignment-transform-v1"]}, + "oracle_tolerances": {"const":"rtol=0.05,atol=0.02"}, + "payload_unit": {"enum":["token-rank","token-expert"]}, + "placement": {"const":"packed"}, + "percentile_method": {"const":"nearest-rank"}, + "rank_reduction": {"const":"cross-rank-max-per-iteration"}, + "resource_mode": {"const":"fixed-profile"}, + "routing_generator": {"const":"collectivex-routing-counter-v3"}, + "sampling_contract": 
{"const":"fixed-512-v1"}, + "seed": {"const":67}, + "source_identity_contract": {"const":"bounded-sign-bit-source-v1"} + }, + "required": [ + "activation_generator","activation_profile","combine_dtype","combine_quant_mode", + "combine_semantics","component_order_contract","conditioning_contract","contract", + "correctness_scope","dtype","eplb_planner","eplb_redundant_experts", + "eplb_reference_tokens_per_rank","mode","oracle_contract","oracle_tolerances", + "payload_unit","placement","percentile_method","rank_reduction","resource_mode", + "routing_generator","sampling_contract","seed","source_identity_contract" + ], + "type": "object" + }, + "oracle": { + "additionalProperties": false, + "properties": { + "checks": { + "additionalProperties": false, + "properties": { + "combine_values": {"type":"boolean"}, + "counts": {"type":"boolean"}, + "metadata": {"type":"boolean"}, + "multiplicity": {"type":"boolean"}, + "payload": {"type":"boolean"}, + "source_set": {"type":"boolean"}, + "weights": {"type":"boolean"} + }, + "required": ["combine_values","counts","metadata","multiplicity","payload","source_set","weights"], + "type": "object" + }, + "atol": {"const":0.02}, + "combine_weight_semantics": {"enum":["unweighted-rank-sum","gate-weighted-sum"]}, + "contract": {"enum":["expert-specific-transform-v1","expert-assignment-transform-v1"]}, + "dispatch_sha256": {"$ref":"#/$defs/nullable_sha256"}, + "max_absolute_error": {"oneOf":[{"type":"null"},{"minimum":0,"type":"number"}]}, + "max_elementwise_relative_error": {"oneOf":[{"type":"null"},{"minimum":0,"type":"number"}]}, + "max_relative_error": {"oneOf":[{"type":"null"},{"minimum":0,"type":"number"}]}, + "max_weight_error": {"oneOf":[{"type":"null"},{"minimum":0,"type":"number"}]}, + "order_sha256": {"$ref":"#/$defs/nullable_sha256"}, + "ordering_contract": {"minLength":1,"type":"string"}, + "passed": {"type":"boolean"}, + "receive_count": {"minimum":0,"type":"integer"}, + "rtol": {"const":0.05} + }, + "required": [ + 
"atol", + "checks", + "combine_weight_semantics", + "contract", + "dispatch_sha256", + "max_absolute_error", + "max_elementwise_relative_error", + "max_relative_error", + "max_weight_error", + "order_sha256", + "ordering_contract", + "passed", + "receive_count", + "rtol" + ], + "type": "object" + }, + "percentiles": { + "additionalProperties": false, + "properties": { + "p50": {"minimum":0,"type":"number"}, + "p90": {"minimum":0,"type":"number"}, + "p95": {"minimum":0,"type":"number"}, + "p99": {"minimum":0,"type":"number"} + }, + "required": ["p50","p90","p95","p99"], + "type": "object" + }, + "component": { + "additionalProperties": false, + "allOf": [ + { + "if": {"properties":{"availability":{"const":"measured"}},"required":["availability"]}, + "then": { + "properties": { + "origin": {"const":"measured"}, + "percentiles_us": {"$ref":"#/$defs/percentiles"}, + "sample_count": {"const":512} + } + } + }, + { + "if": {"properties":{"availability":{"const":"unavailable"}},"required":["availability"]}, + "then": { + "properties": { + "percentiles_us": {"type":"null"}, + "sample_count": {"const":0} + } + } + } + ], + "properties": { + "availability": {"enum":["measured","derived","unavailable"]}, + "origin": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "percentiles_us": {"oneOf":[{"type":"null"},{"$ref":"#/$defs/percentiles"}]}, + "sample_count": {"minimum":0,"type":"integer"} + }, + "required": ["availability","origin","percentiles_us","sample_count"], + "type": "object" + }, + "histogram": { + "additionalProperties": false, + "properties": { + "bins": {"minimum":1,"type":"integer"}, + "counts": {"items":{"minimum":0,"type":"integer"},"minItems":1,"type":"array"}, + "max": {"minimum":0,"type":"number"}, + "min": {"minimum":0,"type":"number"}, + "n": {"minimum":1,"type":"integer"} + }, + "required": ["n","min","max","bins","counts"], + "type": "object" + }, + "scheduled_case": { + "additionalProperties": false, + "properties": { + "backend": 
{"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "canonical": {"const":true}, + "ep": {"minimum":1,"type":"integer"}, + "eplb": {"type":"boolean"}, + "experts": {"minimum":1,"type":"integer"}, + "gpus_per_node": {"minimum":1,"type":"integer"}, + "hidden": {"minimum":1,"type":"integer"}, + "ladder": {"pattern":"^[1-9][0-9]*( [1-9][0-9]*)*$","type":"string"}, + "mode": {"enum":["normal","low-latency"]}, + "nodes": {"minimum":1,"type":"integer"}, + "phase": {"enum":["decode","prefill"]}, + "precision_profile": {"$ref":"#/$defs/precision_profile"}, + "required_publication": {"enum":["official","comparable-experimental"]}, + "routing": {"enum":["uniform","zipf"]}, + "samples_per_point": {"const":512}, + "scale_out_transport": {"oneOf":[{"type":"null"},{"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}]}, + "scale_up_domain": {"minimum":1,"type":"integer"}, + "scale_up_transport": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "scope": {"enum":["scale-up","scale-out"]}, + "suite": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "timing": {"const":"8:64:32"}, + "topk": {"minimum":1,"type":"integer"}, + "topology_class": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "transport": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "warmup_semantics": {"const":"full-roundtrip-before-each-component-trial-point-v1"}, + "workload": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"} + }, + "required": [ + "backend", + "canonical", + "eplb", + "ep", + "experts", + "gpus_per_node", + "hidden", + "ladder", + "mode", + "nodes", + "phase", + "required_publication", + "routing", + "samples_per_point", + "scale_out_transport", + "scale_up_domain", + "scale_up_transport", + "scope", + "suite", + "timing", + "topk", + "topology_class", + "transport", + "warmup_semantics", + "workload" + ], + "type": "object" + }, + "git_run": { + 
"additionalProperties": false, + "properties": { + "artifact": {"minLength":1,"type":"string"}, + "job": {"minLength":1,"type":"string"}, + "ref": {"minLength":1,"type":"string"}, + "repo": {"pattern":"^[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+$","type":"string"}, + "run_attempt": {"pattern":"^[1-9][0-9]*$","type":"string"}, + "run_id": {"pattern":"^[1-9][0-9]*$","type":"string"}, + "qualification_index": {"maximum":3,"minimum":1,"type":"integer"}, + "source_sha": {"pattern":"^[0-9a-f]{40}$","type":"string"} + }, + "required": ["artifact","job","qualification_index","ref","repo","run_attempt","run_id","source_sha"], + "type": "object" + } + }, + "additionalProperties": false, + "allOf": [ + { + "if": { + "properties": { + "workload": { + "properties": {"source": {"const":"canonical-serialized"}}, + "required": ["source"] + } + }, + "required": ["workload"] + }, + "then": { + "properties": { + "provenance": { + "properties": { + "allocation_stratum_sha256": { + "pattern":"^[0-9a-f]{64}$", + "type":"string" + } + }, + "required": ["allocation_stratum_sha256"] + } + } + } + } + ], + "properties": { + "case": { + "additionalProperties": false, + "properties": { + "attempt_ordinal": {"minimum":1,"type":"integer"}, + "backend": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "ep_size": {"minimum":1,"type":"integer"}, + "eplb": { + "additionalProperties": false, + "allOf": [ + { + "if": {"properties":{"enabled":{"const":true}},"required":["enabled"]}, + "then": { + "properties": { + "calibration_token_offset": {"minimum":0,"type":"integer"}, + "calibration_trace_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "calibration_window": {"const":"collectivex-eplb-calibration-window-v1"}, + "calibration_workload_id": {"pattern":"^cxwork-v1-[0-9a-f]{64}$","type":"string"} + } + }, + "else": { + "properties": { + "calibration_token_offset": {"type":"null"}, + "calibration_trace_sha256": {"type":"null"}, + "calibration_window": {"type":"null"}, + 
"calibration_workload_id": {"type":"null"} + } + } + } + ], + "properties": { + "calibration_token_offset": {"oneOf":[{"type":"null"},{"minimum":0,"type":"integer"}]}, + "calibration_trace_sha256": {"$ref":"#/$defs/nullable_sha256"}, + "calibration_window": {"oneOf":[{"type":"null"},{"const":"collectivex-eplb-calibration-window-v1"}]}, + "calibration_workload_id": {"oneOf":[{"type":"null"},{"pattern":"^cxwork-v1-[0-9a-f]{64}$","type":"string"}]}, + "enabled": {"type":"boolean"}, + "imbalance_after": {"oneOf":[{"type":"null"},{"minimum":0,"type":"number"}]}, + "imbalance_before": {"oneOf":[{"type":"null"},{"minimum":0,"type":"number"}]}, + "mapping_hash": {"oneOf":[{"type":"null"},{"pattern":"^[0-9a-f]{64}$","type":"string"}]}, + "max_replicas": {"oneOf":[{"type":"null"},{"minimum":0,"type":"integer"}]}, + "num_logical_experts": {"minimum":1,"type":"integer"}, + "num_physical_experts": {"minimum":1,"type":"integer"}, + "num_redundant": {"minimum":0,"type":"integer"}, + "planner": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "reference_tokens_per_rank": {"oneOf":[{"type":"null"},{"minimum":1,"type":"integer"}]}, + "replicated_experts": {"minimum":0,"type":"integer"} + }, + "required": [ + "calibration_token_offset", + "calibration_trace_sha256", + "calibration_window", + "calibration_workload_id", + "enabled", + "imbalance_after", + "imbalance_before", + "mapping_hash", + "max_replicas", + "num_logical_experts", + "num_physical_experts", + "num_redundant", + "planner", + "reference_tokens_per_rank", + "replicated_experts" + ], + "type": "object" + }, + "mode": {"enum":["normal","low-latency"]}, + "phase": {"enum":["decode","prefill"]}, + "required_publication": {"enum":["official","comparable-experimental"]}, + "resource_mode": {"const":"fixed-profile"}, + "runner": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "shape": { + "additionalProperties": false, + "properties": { + "activation_profile": 
{"const":"canonical-counter-source-v4"}, + "combine_precision": {"$ref":"#/$defs/communication_axis"}, + "dispatch_precision": {"$ref":"#/$defs/communication_axis"}, + "eplb": {"type":"boolean"}, + "experts": {"minimum":1,"type":"integer"}, + "experts_per_rank": {"minimum":1,"type":"integer"}, + "hidden": {"minimum":1,"type":"integer"}, + "kernel_gen": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "num_logical_experts": {"minimum":1,"type":"integer"}, + "precision_profile": {"$ref":"#/$defs/precision_profile"}, + "routing": {"enum":["uniform","zipf"]}, + "topk": {"minimum":1,"type":"integer"} + }, + "required": [ + "activation_profile", + "combine_precision", + "dispatch_precision", + "eplb", + "experts", + "experts_per_rank", + "hidden", + "kernel_gen", + "num_logical_experts", + "precision_profile", + "routing", + "topk" + ], + "type": "object" + }, + "suite": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "workload_name": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"} + }, + "required": [ + "attempt_ordinal", + "backend", + "eplb", + "ep_size", + "mode", + "phase", + "required_publication", + "resource_mode", + "runner", + "shape", + "suite", + "workload_name" + ], + "type": "object" + }, + "format": {"const":"collectivex.ep.v1"}, + "generated_at": {"format":"date-time","type":"string"}, + "identity": { + "additionalProperties": false, + "properties": { + "allocation_factors": { + "additionalProperties": false, + "properties": { + "artifact": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "execution_id": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "job": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "repo": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "run_attempt": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "run_id": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "qualification_index": 
{"maximum":3,"minimum":1,"type":"integer"}, + "runner": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "source_sha": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]} + }, + "required": ["artifact","execution_id","job","qualification_index","repo","run_attempt","run_id","runner","source_sha"], + "type": "object" + }, + "allocation_id": {"pattern":"^cxallocation-v1-[0-9a-f]{64}$","type":"string"}, + "attempt_id": {"pattern":"^cxattempt-v1-[0-9a-f]{64}$","type":"string"}, + "attempt_ordinal": {"minimum":1,"type":"integer"}, + "case_factors": { + "additionalProperties": false, + "properties": { + "case": {"$ref":"#/$defs/scheduled_case"}, + "profile": {"$ref":"#/$defs/case_profile"}, + "sku": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"} + }, + "required": ["case","profile","sku"], + "type": "object" + }, + "case_id": {"pattern":"^cxcase-v1-[0-9a-f]{64}$","type":"string"}, + "series_factors": { + "additionalProperties": false, + "properties": { + "backend": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "implementation_contract_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "public_config_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "routing_control_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "case_id": {"pattern":"^cxcase-v1-[0-9a-f]{64}$","type":"string"}, + "image_digest": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "runtime_fingerprint_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "source_sha": {"oneOf":[{"type":"null"},{"pattern":"^[0-9a-f]{40}$","type":"string"}]}, + "squash_sha256": {"oneOf":[{"type":"null"},{"pattern":"^[0-9a-f]{64}$","type":"string"}]}, + "workload_id": {"pattern":"^cxwork-v1-[0-9a-f]{64}$","type":"string"} + }, + "required": [ + "backend", + "implementation_contract_sha256", + "public_config_sha256", + "routing_control_sha256", + "case_id", + "image_digest", + "runtime_fingerprint_sha256", + "source_sha", 
+ "squash_sha256", + "workload_id" + ], + "type": "object" + }, + "series_id": {"pattern":"^cxseries-v1-[0-9a-f]{64}$","type":"string"} + }, + "required": [ + "allocation_factors", + "allocation_id", + "attempt_id", + "attempt_ordinal", + "case_factors", + "case_id", + "series_factors", + "series_id" + ], + "type": "object" + }, + "implementation": { + "additionalProperties": false, + "properties": { + "kernel_generation": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "name": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "provenance": { + "properties": { + "allow_hybrid_mode": {"type":"boolean"}, + "communication_backend": {"enum":["nccl-device-lsa","nccl-gin"]}, + "deepep_fix_pr": {"const":630}, + "deepep_pr": {"const":605}, + "deterministic": {"type": "boolean"}, + "gin_enabled": {"type":"boolean"}, + "jit_cubins": { + "items": {"$ref":"#/$defs/deepep_v2_jit_cubin"}, + "maxItems": 5, + "minItems": 5, + "type": "array", + "uniqueItems": true + }, + "jit_kernel_keys": { + "items": {"maxLength":512,"pattern":"^[A-Za-z0-9][A-Za-z0-9_.+-]*$","type":"string"}, + "maxItems": 3, + "minItems": 3, + "type": "array", + "uniqueItems": true + }, + "jit_random_seed": {"const":"collectivex-deepep-v2-fa8a9b1"}, + "jit_shared_objects": { + "items": { + "additionalProperties": false, + "properties": { + "kernel_key": {"maxLength":512,"pattern":"^[A-Za-z0-9][A-Za-z0-9_.+-]*$","type":"string"}, + "rank_artifacts": { + "items": {"$ref":"#/$defs/hybrid_jit_rank_artifact"}, + "minItems": 1, + "type": "array" + } + }, + "required": ["kernel_key","rank_artifacts"], + "type": "object" + }, + "maxItems": 3, + "minItems": 3, + "type": "array" + }, + "num_experts": {"minimum": 1, "type": "integer"}, + "num_nvl_bytes": {"minimum": 0, "type": "integer"}, + "num_qps_per_rank": {"minimum": 1, "type": "integer"}, + "num_rdma_bytes": {"minimum": 0, "type": "integer"}, + "rdma_block_num": {"minimum": 0, "type": "integer"}, + "realized_config": 
{"$ref":"#/$defs/hybrid_realized_config"}, + "tuning_num_experts": {"minimum": 1, "type": "integer"}, + "uccl_dependency_versions": { + "additionalProperties": false, + "properties": { + "intervaltree": {"const":"3.1.0"}, + "nvidia-cuda-runtime-cu12": {"const":"12.9.79"}, + "sortedcontainers": {"const":"2.4.0"} + }, + "required": ["intervaltree","nvidia-cuda-runtime-cu12","sortedcontainers"], + "type": "object" + }, + "use_external_inp_buf": {"type": "boolean"} + }, + "type": "object", + "propertyNames": { + "enum": [ + "allocated_qps", + "allow_hybrid_mode", + "allow_mnnvl", + "allow_multiple_reduction", + "api", + "api_signature_sha256", + "backend", + "backend_lineage", + "block_num", + "block_num_floored", + "block_num_target", + "branch", + "collective_library", + "combine_dtype", + "combine_warps", + "communication_backend", + "cuda_version", + "deepep_commit", + "deepep_distribution_version", + "deepep_fix_pr", + "deepep_pr", + "deepep_tree", + "deepep_version", + "deterministic", + "device_cus", + "device_sms", + "dispatch_dtype", + "dispatch_warps", + "enable_sdma", + "fmt_commit", + "gpus_per_node", + "gin_enabled", + "heap_size", + "impl", + "jit_cache_key", + "jit_cubins", + "jit_kernel_keys", + "jit_random_seed", + "jit_shared_objects", + "kernel_type", + "loaded_libraries", + "local_experts", + "logical_scaleout_ranks", + "logical_scaleup_ranks", + "mapping_variant", + "max_num_inp_token_per_rank", + "max_num_tokens", + "max_total_recv_tokens", + "mnnvl_comm", + "mode", + "mori_commit", + "nccl_communicator", + "nccl_package_version", + "nccl_version", + "num_experts", + "num_max_tokens_per_rank", + "num_nvl_bytes", + "num_qps", + "num_qps_per_rank", + "num_rdma_bytes", + "num_sms", + "nvshmem_package_version", + "path", + "physical_nvlink_ranks", + "physical_rdma_ranks", + "prefer_overlap_with_compute", + "rdma_block_num", + "reference_semantics", + "realized_config", + "requested_num_sms", + "resource_mode", + "routing_factor", + "routing_metadata", 
+ "sm_fraction", + "top_k", + "torch_git_version", + "torch_version", + "transport", + "trtllm", + "tuned_source", + "tuning_num_experts", + "uccl_commit", + "uccl_dependency_versions", + "uccl_version", + "uccl_wrapper_commit", + "use_external_inp_buf", + "workspace" + ] + } + }, + "resource_profile": { + "additionalProperties": false, + "properties": { + "achieved_fraction": {}, + "comm_units_kind": {}, + "configured_units": {}, + "conformance_class": {}, + "device_units": {}, + "fixed_kernel": {}, + "nonconforming": {}, + "pareto_eligible": {}, + "persistent_bytes": {}, + "qps_per_rank": {}, + "requested_fraction": {}, + "tuned_source": {}, + "target_achieved_within_tol": {}, + "tolerance": {}, + "resource_class": {}, + "warps_combine": {}, + "warps_dispatch": {} + }, + "required": [ + "comm_units_kind", + "requested_fraction", + "configured_units", + "device_units", + "achieved_fraction", + "warps_dispatch", + "warps_combine", + "qps_per_rank", + "persistent_bytes", + "tuned_source", + "resource_class", + "conformance_class", + "tolerance", + "target_achieved_within_tol", + "nonconforming", + "fixed_kernel", + "pareto_eligible" + ], + "type": "object" + } + }, + "required": ["kernel_generation","name","provenance","resource_profile"], + "type": "object" + }, + "measurement": { + "additionalProperties": false, + "properties": { + "component_order_contract": {"const":"qualification-hash-rotated-components-v1"}, + "conditioning": { + "additionalProperties": false, + "properties": { + "contract": {"const":"fixed-phase-ramp-8-roundtrips-v1"}, + "ladder": {"items":{"minimum":1,"type":"integer"},"minItems":1,"type":"array"}, + "roundtrips_per_shape": {"const":8} + }, + "required": ["contract","ladder","roundtrips_per_shape"], + "type": "object" + }, + "contract": {"enum":["layout-and-dispatch-v1","expert-packed-weighted-combine-v1"]}, + "execution_order_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "qualification_index": 
{"maximum":3,"minimum":1,"type":"integer"}, + "rows": { + "items": { + "additionalProperties": false, + "properties": { + "anomalies": { + "items": { + "additionalProperties": false, + "properties": { + "T": {"minimum":1,"type":"integer"}, + "component_floor_p50": {"minimum":0,"type":"number"}, + "isolated_sum_p99": {"minimum":0,"type":"number"}, + "ratio": {"minimum":0,"type":"number"}, + "roundtrip_p50": {"minimum":0,"type":"number"}, + "roundtrip_p99": {"minimum":0,"type":"number"}, + "threshold": {"minimum":0,"type":"number"}, + "type": {"enum":["roundtrip_gt_isolated_sum","roundtrip_lt_component_floor"]} + }, + "required": ["type","T"], + "type": "object" + }, + "type": "array" + }, + "components": { + "additionalProperties": false, + "properties": { + "combine": {"$ref":"#/$defs/component"}, + "dispatch": {"$ref":"#/$defs/component"}, + "isolated_sum": {"$ref":"#/$defs/component"}, + "roundtrip": {"$ref":"#/$defs/component"}, + "stage": {"$ref":"#/$defs/component"} + }, + "required": ["combine","dispatch","isolated_sum","roundtrip","stage"], + "type": "object" + }, + "correctness": { + "additionalProperties": false, + "properties": { + "contract": {"enum":["expert-specific-transform-v1","expert-assignment-transform-v1"]}, + "max_relative_error": {"minimum":0,"type":"number"}, + "passed": {"type":"boolean"}, + "precision": {"$ref":"#/$defs/precision_evidence"}, + "rank_evidence": { + "items": { + "additionalProperties": false, + "properties": { + "input_unchanged": {"type":"boolean"}, + "order_stable": {"type":"boolean"}, + "post_timing": {"$ref":"#/$defs/oracle"}, + "pre_timing": {"$ref":"#/$defs/oracle"}, + "rank": {"minimum":0,"type":"integer"} + }, + "required": ["input_unchanged","order_stable","post_timing","pre_timing","rank"], + "type": "object" + }, + "minItems": 1, + "type": "array" + }, + "scope": {"enum":["dispatch-metadata-and-transformed-combine","expert-assignment-and-weighted-combine"]} + }, + "required": 
["contract","max_relative_error","passed","precision","rank_evidence","scope"], + "type": "object" + }, + "evidence_id": {"pattern":"^cxevidence-v1-[0-9a-f]{64}$","type":"string"}, + "global_tokens": {"minimum":1,"type":"integer"}, + "byte_provenance": { + "additionalProperties": false, + "properties": { + "combine": {"$ref":"#/$defs/byte_accounting"}, + "dispatch": {"$ref":"#/$defs/byte_accounting"}, + "roundtrip": {"$ref":"#/$defs/byte_accounting"}, + "stage": {"$ref":"#/$defs/byte_accounting"} + }, + "required": ["combine","dispatch","roundtrip","stage"], + "type": "object" + }, + "point_id": {"pattern":"^cxpoint-v1-[0-9a-f]{64}$","type":"string"}, + "receive": { + "additionalProperties": false, + "properties": { + "max": {"minimum":0,"type":"integer"}, + "mean": {"minimum":0,"type":"number"}, + "min": {"minimum":0,"type":"integer"}, + "total": {"minimum":0,"type":"integer"} + }, + "required": ["max","mean","min","total"], + "type": "object" + }, + "routing": { + "additionalProperties": false, + "properties": { + "empty_expert_count": {"minimum":0,"type":"integer"}, + "empty_rank_count": {"minimum":0,"type":"integer"}, + "expert_assignment_rank_cv": {"minimum":0,"type":"number"}, + "expert_assignments_per_rank": {"items":{"minimum":0,"type":"integer"},"type":"array"}, + "expert_load_cv": {"minimum":0,"type":"number"}, + "expert_load_max": {"minimum":0,"type":"integer"}, + "expert_load_mean": {"minimum":0,"type":"number"}, + "expert_load_min": {"minimum":0,"type":"integer"}, + "fanout_histogram": {"items":{"minimum":0,"type":"integer"},"type":"array"}, + "fanout_max": {"minimum":1,"type":"integer"}, + "fanout_mean": {"minimum":0,"type":"number"}, + "fanout_min": {"minimum":1,"type":"integer"}, + "hash": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "hotspot_ratio": {"minimum":0,"type":"number"}, + "locality": { + "oneOf": [ + {"type":"null"}, + { + "additionalProperties": false, + "properties": { + "copies": {"minimum":0,"type":"integer"}, + 
"cross_domain_fraction": {"minimum":0,"type":"number"}, + "cross_node_fraction": {"minimum":0,"type":"number"}, + "gpus_per_node": {"minimum":1,"type":"integer"}, + "local_rank_fraction": {"minimum":0,"type":"number"}, + "placement": {"const":"packed"}, + "same_node_fraction": {"minimum":0,"type":"number"}, + "same_scaleup_domain_fraction": {"minimum":0,"type":"number"}, + "scale_up_domain": {"minimum":1,"type":"integer"} + }, + "required": [ + "placement", + "local_rank_fraction", + "same_node_fraction", + "same_scaleup_domain_fraction", + "cross_node_fraction", + "cross_domain_fraction", + "gpus_per_node", + "scale_up_domain", + "copies" + ], + "type": "object" + } + ] + }, + "payload_copies_per_rank": {"items":{"minimum":0,"type":"integer"},"type":"array"}, + "payload_rank_cv": {"minimum":0,"type":"number"}, + "routed_copies": {"minimum":1,"type":"integer"}, + "source_token_stats": { + "oneOf": [ + {"type":"null"}, + { + "additionalProperties": false, + "properties": { + "cv": {"minimum":0,"type":"number"}, + "empty_ranks": {"minimum":0,"type":"integer"}, + "max": {"minimum":0,"type":"integer"}, + "mean": {"minimum":0,"type":"number"}, + "min": {"minimum":0,"type":"integer"}, + "ranks": {"minimum":1,"type":"integer"}, + "total": {"minimum":0,"type":"integer"} + }, + "required": ["min","mean","max","cv","empty_ranks","total","ranks"], + "type": "object" + } + ] + } + }, + "required": [ + "empty_expert_count", + "empty_rank_count", + "expert_assignment_rank_cv", + "expert_assignments_per_rank", + "expert_load_cv", + "expert_load_max", + "expert_load_mean", + "expert_load_min", + "fanout_histogram", + "fanout_max", + "fanout_mean", + "fanout_min", + "hash", + "hotspot_ratio", + "locality", + "payload_copies_per_rank", + "payload_rank_cv", + "routed_copies", + "source_token_stats" + ], + "type": "object" + }, + "sample_histograms": { + "additionalProperties": false, + "properties": { + "combine": {"oneOf":[{"type":"null"},{"$ref":"#/$defs/histogram"}]}, + 
"dispatch": {"oneOf":[{"type":"null"},{"$ref":"#/$defs/histogram"}]}, + "roundtrip": {"$ref":"#/$defs/histogram"}, + "stage": {"oneOf":[{"type":"null"},{"$ref":"#/$defs/histogram"}]} + }, + "required": ["dispatch","combine","roundtrip","stage"], + "type": "object" + }, + "sample_sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "token_rate_at_latency_percentile": {"$ref":"#/$defs/percentiles"}, + "tokens_per_rank": {"minimum":1,"type":"integer"} + }, + "required": [ + "anomalies", + "components", + "correctness", + "byte_provenance", + "evidence_id", + "global_tokens", + "point_id", + "receive", + "routing", + "sample_histograms", + "sample_sha256", + "token_rate_at_latency_percentile", + "tokens_per_rank" + ], + "type": "object" + }, + "minItems": 1, + "type": "array" + }, + "sampling": { + "additionalProperties": false, + "properties": { + "contract": {"const":"fixed-512-v1"}, + "iterations_per_trial": {"const":8}, + "percentile_method": {"const":"nearest-rank"}, + "reduction": {"const":"cross-rank-max-per-iteration"}, + "samples_per_component": {"const":512}, + "trials": {"const":64}, + "warmup_iterations": {"const":32}, + "warmup_semantics": {"const":"full-roundtrip-before-each-component-trial-point-v1"} + }, + "required": [ + "contract", + "iterations_per_trial", + "percentile_method", + "reduction", + "samples_per_component", + "trials", + "warmup_iterations", + "warmup_semantics" + ], + "type": "object" + }, + "source_allocation": {"const":"even"} + }, + "required": [ + "component_order_contract", + "conditioning", + "contract", + "execution_order_sha256", + "qualification_index", + "rows", + "sampling", + "source_allocation" + ], + "type": "object" + }, + "outcome": { + "additionalProperties": false, + "properties": { + "publication_status": {"enum":["diagnostic","invalid"]}, + "reasons": {"items":{"type":"string"},"type":"array"}, + "status": {"enum":["success","invalid"]}, + "validity": { + "additionalProperties": false, + "properties": { + 
"anomaly_free": {"type":"boolean"}, + "execution_status": {"enum":["complete","failed"]}, + "measurement_conformance": {"enum":["conformant","nonconformant"]}, + "provenance_complete": {"type":"boolean"}, + "resource_conformance": {"minLength":1,"type":"string"}, + "sampling_conformance": {"enum":["conformant","nonconformant"]}, + "semantic_correctness": {"enum":["pass","fail"]}, + "workload_identity": {"enum":["consistent-across-ranks","inconsistent"]}, + "workload_source": {"enum":["canonical-serialized","seeded-runtime"]} + }, + "required": [ + "execution_status", + "semantic_correctness", + "workload_identity", + "workload_source", + "measurement_conformance", + "sampling_conformance", + "resource_conformance", + "provenance_complete", + "anomaly_free" + ], + "type": "object" + } + }, + "required": ["publication_status","reasons","status","validity"], + "type": "object" + }, + "provenance": { + "additionalProperties": false, + "properties": { + "allocation_stratum_sha256": { + "oneOf": [ + {"type":"null"}, + {"pattern":"^[0-9a-f]{64}$","type":"string"} + ] + }, + "command": {"minLength":1,"type":"string"}, + "distributed_launcher": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "git_run": {"oneOf":[{"type":"null"},{"$ref":"#/$defs/git_run"}]}, + "image": { + "additionalProperties": false, + "properties": { + "arch": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "digest": { + "oneOf": [{"type":"null"},{"pattern":"^sha256:[0-9a-f]{64}$","type":"string"}] + }, + "digest_verified": {"type":"boolean"}, + "reference": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "squash_sha256": {"oneOf":[{"type":"null"},{"pattern":"^[0-9a-f]{64}$","type":"string"}]} + }, + "required": ["arch","digest","digest_verified","reference","squash_sha256"], + "type": "object" + }, + "redaction": {"const":"sanitized-v1"} + }, + "required": ["allocation_stratum_sha256","command","distributed_launcher","git_run","image","redaction"], + 
"type": "object" + }, + "record_type": {"const":"case-attempt"}, + "runtime_fingerprint": { + "additionalProperties": false, + "properties": { + "accelerator_runtime": { + "additionalProperties": false, + "properties": { + "kind": {"enum":["cuda","hip"]}, + "version": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]} + }, + "required": ["kind","version"], + "type": "object" + }, + "collective_library": { + "additionalProperties": false, + "properties": { + "kind": {"enum":["nccl","rccl"]}, + "version": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]} + }, + "required": ["kind","version"], + "type": "object" + }, + "device": { + "additionalProperties": false, + "properties": { + "arch": {"minLength":1,"type":"string"}, + "compute_units": {"minimum":1,"type":"integer"}, + "memory_bytes": {"minimum":1,"type":"integer"}, + "product": {"minLength":1,"type":"string"}, + "warp_size": {"minimum":1,"type":"integer"} + }, + "required": ["arch","compute_units","memory_bytes","product","warp_size"], + "type": "object" + }, + "driver_version": {"oneOf":[{"type":"null"},{"minLength":1,"type":"string"}]}, + "framework": { + "additionalProperties": false, + "properties": {"kind":{"const":"torch"},"version":{"minLength":1,"type":"string"}}, + "required": ["kind","version"], + "type": "object" + }, + "machine": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "python_version": {"minLength":1,"type":"string"}, + "vendor": {"enum":["nvidia","amd"]} + }, + "required": [ + "accelerator_runtime", + "collective_library", + "device", + "driver_version", + "framework", + "machine", + "python_version", + "vendor" + ], + "type": "object" + }, + "sample_artifact": { + "additionalProperties": false, + "properties": { + "bytes": {"minimum":1,"type":"integer"}, + "format": {"const":"collectivex.samples.v1"}, + "path": {"pattern":"^[A-Za-z0-9_.-]+$","type":"string"}, + "sha256": {"pattern":"^[0-9a-f]{64}$","type":"string"} + }, + "required": 
["bytes","format","path","sha256"], + "type": "object" + }, + "schema_version": {"const":1}, + "topology": { + "additionalProperties": false, + "properties": { + "device_count": {"minimum":1,"type":"integer"}, + "device_product": {"minLength":1,"type":"string"}, + "gpus_per_node": {"minimum":1,"type":"integer"}, + "nodes": {"minimum":1,"type":"integer"}, + "placement": {"const":"packed"}, + "realized_placement": { + "additionalProperties": false, + "properties": { + "gpus_per_node": {"minimum":1,"type":"integer"}, + "nodes": {"minimum":1,"type":"integer"}, + "ranks_per_node": {"minimum":1,"type":"integer"}, + "unique_local_ranks": {"const":true}, + "valid": {"const":true} + }, + "required": ["gpus_per_node","nodes","ranks_per_node","unique_local_ranks","valid"], + "type": "object" + }, + "scale_out_transport": {"oneOf":[{"type":"null"},{"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}]}, + "scale_up_domain": {"minimum":1,"type":"integer"}, + "scale_up_transport": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "scope": {"enum":["scale-up","scale-out"]}, + "topology_class": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "transport": {"maxLength":128,"pattern":"^[a-z0-9][a-z0-9_.-]*$","type":"string"}, + "world_size": {"minimum":1,"type":"integer"} + }, + "required": [ + "device_count", + "device_product", + "gpus_per_node", + "nodes", + "placement", + "realized_placement", + "scale_out_transport", + "scale_up_domain", + "scale_up_transport", + "scope", + "topology_class", + "transport", + "world_size" + ], + "type": "object" + }, + "workload": { + "additionalProperties": false, + "properties": { + "activation_generator": {"const":"collectivex-activation-counter-v4"}, + "activation_identity": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "activation_profile": {"const":"canonical-counter-source-v4"}, + "cross_rank_consistent": {"const":true}, + "manifest_checksums": { + "oneOf": [ + {"type":"null"}, 
+ { + "additionalProperties": { + "additionalProperties": false, + "properties": { + "topk_idx": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "topk_weights": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "trace": {"pattern":"^[0-9a-f]{64}$","type":"string"} + }, + "required": ["topk_idx", "topk_weights", "trace"], + "type": "object" + }, + "type": "object" + } + ] + }, + "members": { + "oneOf": [ + {"type":"null"}, + { + "items": {"pattern":"^cxwork-v1-[0-9a-f]{64}$","type":"string"}, + "minItems": 1, + "uniqueItems": true, + "type": "array" + } + ] + }, + "routing_generator": {"const":"collectivex-routing-counter-v3"}, + "source": {"enum":["canonical-serialized","seeded-runtime"]}, + "trace_hashes": { + "items": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "minItems": 1, + "type": "array" + }, + "trace_signature": {"pattern":"^[0-9a-f]{64}$","type":"string"}, + "workload_id": { + "oneOf": [{"type":"null"},{"pattern":"^cxwork-v1-[0-9a-f]{64}$","type":"string"}] + } + }, + "required": [ + "activation_generator", + "activation_identity", + "activation_profile", + "cross_rank_consistent", + "manifest_checksums", + "members", + "routing_generator", + "source", + "trace_hashes", + "trace_signature", + "workload_id" + ], + "type": "object" + } + }, + "required": [ + "case", + "format", + "generated_at", + "identity", + "implementation", + "measurement", + "outcome", + "provenance", + "record_type", + "runtime_fingerprint", + "sample_artifact", + "schema_version", + "topology", + "workload" + ], + "title": "CollectiveX raw case attempt v1", + "type": "object" +} diff --git a/experimental/CollectiveX/schemas/samples-v1.schema.json b/experimental/CollectiveX/schemas/samples-v1.schema.json new file mode 100644 index 000000000..f216860a8 --- /dev/null +++ b/experimental/CollectiveX/schemas/samples-v1.schema.json @@ -0,0 +1,93 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://inferencex.com/schemas/collectivex/samples-v1.schema.json", 
+ "title": "CollectiveX exact private samples v1", + "type": "object", + "additionalProperties": false, + "required": ["allocation_id","attempt_id","case_id","format","points","qualification_index","sampling","schema_version","series_id"], + "properties": { + "allocation_id": {"$ref": "#/$defs/allocationId"}, + "attempt_id": {"$ref": "#/$defs/attemptId"}, + "case_id": {"$ref": "#/$defs/caseId"}, + "format": {"const": "collectivex.samples.v1"}, + "points": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "additionalProperties": false, + "required": ["components","evidence_id","point_id","sample_sha256","tokens_per_rank"], + "properties": { + "components": { + "type": "object", + "additionalProperties": false, + "required": ["combine","dispatch","roundtrip","stage"], + "properties": { + "combine": {"$ref": "#/$defs/component"}, + "dispatch": {"$ref": "#/$defs/component"}, + "roundtrip": {"$ref": "#/$defs/component"}, + "stage": {"$ref":"#/$defs/component"} + } + }, + "evidence_id": {"$ref": "#/$defs/evidenceId"}, + "point_id": {"$ref": "#/$defs/pointId"}, + "sample_sha256": {"$ref": "#/$defs/sha256"}, + "tokens_per_rank": {"type": "integer","minimum": 1} + } + } + }, + "qualification_index": {"type":"integer","minimum":1,"maximum":3}, + "sampling": { + "type": "object", + "additionalProperties": false, + "required": ["iterations_per_trial","reduction","trials"], + "properties": { + "iterations_per_trial": {"const": 8}, + "reduction": {"const": "cross-rank-max-per-iteration"}, + "trials": {"const": 64} + } + }, + "schema_version": {"const": 1}, + "series_id": {"$ref": "#/$defs/seriesId"} + }, + "$defs": { + "sha256": {"type": "string","pattern": "^[0-9a-f]{64}$"}, + "caseId": {"type": "string","pattern": "^cxcase-v1-[0-9a-f]{64}$"}, + "seriesId": {"type": "string","pattern": "^cxseries-v1-[0-9a-f]{64}$"}, + "pointId": {"type": "string","pattern": "^cxpoint-v1-[0-9a-f]{64}$"}, + "evidenceId": {"type": "string","pattern": 
"^cxevidence-v1-[0-9a-f]{64}$"}, + "allocationId": {"type": "string","pattern": "^cxallocation-v1-[0-9a-f]{64}$"}, + "attemptId": {"type": "string","pattern": "^cxattempt-v1-[0-9a-f]{64}$"}, + "trials": { + "type":"array", + "minItems":64, + "maxItems":64, + "items":{"type":"array","minItems":8,"maxItems":8,"items":{"type":"number","minimum":0}} + }, + "measuredComponent": { + "type":"object", + "additionalProperties":false, + "required":["availability","sample_count","trials"], + "properties": { + "availability":{"const":"measured"}, + "sample_count":{"const":512}, + "trials":{"$ref":"#/$defs/trials"} + } + }, + "component": { + "oneOf": [ + {"$ref":"#/$defs/measuredComponent"}, + { + "type":"object", + "additionalProperties":false, + "required":["availability","sample_count","trials"], + "properties": { + "availability":{"const":"unavailable"}, + "sample_count":{"const":0}, + "trials":{"type":"null"} + } + } + ] + } + } +} diff --git a/experimental/CollectiveX/schemas/terminal-outcome-v1.schema.json b/experimental/CollectiveX/schemas/terminal-outcome-v1.schema.json new file mode 100644 index 000000000..f4a9d99ca --- /dev/null +++ b/experimental/CollectiveX/schemas/terminal-outcome-v1.schema.json @@ -0,0 +1,337 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://inferencex.com/schemas/collectivex/terminal-outcome-v1.schema.json", + "title": "CollectiveX terminal outcome v1", + "type": "object", + "additionalProperties": false, + "required": ["case","format","generated_at","identity","outcome","provenance","record_type","schema_version"], + "properties": { + "case": {"$ref": "#/$defs/case"}, + "format": {"const": "collectivex.terminal.v1"}, + "generated_at": {"type": "string","format": "date-time"}, + "identity": { + "type": "object", + "additionalProperties": false, + "required": ["allocation_factors","allocation_id","attempt_id","attempt_ordinal","case_factors","case_id"], + "properties": { + "allocation_factors": {"$ref": 
"#/$defs/allocationFactors"}, + "allocation_id": {"$ref": "#/$defs/allocationId"}, + "attempt_id": {"$ref": "#/$defs/attemptId"}, + "attempt_ordinal": {"type": "integer","minimum": 1}, + "case_factors": { + "type": "object", + "additionalProperties": false, + "required": ["case","profile","sku"], + "properties": { + "case": {"$ref": "#/$defs/case"}, + "profile": {"$ref":"#/$defs/caseProfile"}, + "sku": {"$ref": "#/$defs/safeId"} + } + }, + "case_id": {"$ref": "#/$defs/caseId"} + } + }, + "outcome": { + "type": "object", + "additionalProperties": false, + "required": ["failure_mode","reason","return_code","status"], + "properties": { + "failure_mode": {"$ref": "#/$defs/safeId"}, + "reason": {"type": "string","minLength": 1,"maxLength": 240}, + "return_code": {"type": "integer","minimum": 0}, + "status": {"enum": ["failed","invalid","unsupported"]} + } + }, + "provenance": { + "type": "object", + "additionalProperties": false, + "required": ["git_run","control_sha256","redaction","source"], + "properties": { + "git_run": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/gitRun"}]}, + "control_sha256": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/sha256"}]}, + "redaction": {"const": "sanitized-v1"}, + "source": { + "enum": [ + "runtime-emitter", + "post-emit-command", + "matrix-capability-resolver" + ] + } + } + }, + "record_type": {"const": "terminal-outcome"}, + "schema_version": {"const": 1} + }, + "allOf": [ + { + "oneOf": [ + { + "properties": { + "provenance": { + "properties": {"source": {"const": "runtime-emitter"}} + }, + "outcome": {"$ref": "#/$defs/runtimeOutcome"} + } + }, + { + "properties": { + "provenance": { + "properties": {"source": {"const": "post-emit-command"}} + }, + "outcome": {"$ref": "#/$defs/postEmitOutcome"} + } + }, + { + "properties": { + "provenance": { + "properties": {"source": {"const": "matrix-capability-resolver"}} + }, + "outcome": {"$ref": "#/$defs/capabilityOutcome"} + } + } + ] + } + ], + "$defs": { + "runtimeOutcome": { + "type": 
"object", + "properties": {"status": {"const": "failed"}}, + "allOf": [ + { + "oneOf": [ + {"properties": {"failure_mode": {"const": "setup"}, "reason": {"const": "launcher-setup-failed"}}}, + {"properties": {"failure_mode": {"const": "repository-stage"}, "reason": {"const": "repository-staging-failed"}}}, + {"properties": {"failure_mode": {"const": "registry-verification"}, "reason": {"const": "container-registry-verification-failed"}}}, + {"properties": {"failure_mode": {"const": "scheduler-allocation"}, "reason": {"const": "scheduler-allocation-failed"}}}, + {"properties": {"failure_mode": {"const": "container-import"}, "reason": {"const": "container-image-preparation-failed"}}}, + {"properties": {"failure_mode": {"const": "container-hash"}, "reason": {"const": "container-image-identity-failed"}}}, + {"properties": {"failure_mode": {"const": "container-launch"}, "reason": {"const": "container-runtime-launch-failed"}}}, + {"properties": {"failure_mode": {"const": "backend-setup"}, "reason": {"const": "backend-setup-failed"}}}, + {"properties": {"failure_mode": {"const": "artifact-collection"}, "reason": {"const": "artifact-collection-failed"}}}, + {"properties": {"failure_mode": {"const": "runtime-identity"}, "reason": {"const": "runtime-identity-mismatch"}}}, + {"properties": {"failure_mode": {"const": "timeout"}, "reason": {"const": "execution-timeout"}}}, + {"properties": {"failure_mode": {"const": "deadlock"}, "reason": {"const": "execution-deadlock"}}}, + {"properties": {"failure_mode": {"const": "execution"}, "reason": {"const": "distributed-command-failed"}}} + ] + } + ] + }, + "postEmitOutcome": { + "type": "object", + "properties": { + "status": {"const": "failed"}, + "failure_mode": {"enum": ["runtime-identity", "timeout", "deadlock", "execution"]}, + "reason": {"const": "post-emit-distributed-command-failed"} + } + }, + "capabilityOutcome": { + "type": "object", + "properties": { + "status": {"const": "unsupported"}, + "failure_mode": {"const": 
"capability"}, + "reason": { + "enum": [ + "backend-platform-unsupported", + "backend-token-capacity", + "precision-profile-unsupported" + ] + } + } + }, + "sha256": {"type": "string","pattern": "^[0-9a-f]{64}$"}, + "safeId": {"type": "string","pattern": "^[a-z0-9][a-z0-9_.-]*$","maxLength": 128}, + "precisionProfile": { + "enum":[ + "d-bf16.c-bf16", + "d-fp8-e4m3fn-b128-f32-prequantized.c-bf16", + "d-fp8-e4m3fnuz-b128-f32-prequantized.c-bf16", + "d-fp8-e4m3fn-b128-f32-fused.c-bf16", + "d-bf16.c-logfmt10-dynamic64", + "d-fp8-e4m3fn-b128-f32-fused.c-logfmt10-dynamic64", + "d-bf16.c-fp8-e4m3fn-direct-cast-noscale", + "d-fp8-e4m3fn-b128-f32-prequantized.c-fp8-e4m3fn-direct-cast-noscale", + "d-bf16.c-fp8-e4m3fnuz-direct-cast-noscale", + "d-fp8-e4m3fnuz-b128-f32-prequantized.c-fp8-e4m3fnuz-direct-cast-noscale" + ] + }, + "caseId": {"type": "string","pattern": "^cxcase-v1-[0-9a-f]{64}$"}, + "allocationId": {"type": "string","pattern": "^cxallocation-v1-[0-9a-f]{64}$"}, + "attemptId": {"type": "string","pattern": "^cxattempt-v1-[0-9a-f]{64}$"}, + "nullableText": {"oneOf": [{"type": "null"},{"type": "string","minLength": 1}]}, + "communicationAxis": { + "type":"object", + "additionalProperties":false, + "required":["alignment_contract","api_input_dtype","api_output_dtype","communication_format","conversion_boundary","padding_contract","quantization_origin","scale_dtype","scale_group_size","scale_layout"], + "properties": { + "alignment_contract":{"enum":["native-bf16-vector-alignment","hidden-block-128","native-fp8-vector-alignment","value-block-64"]}, + "api_input_dtype":{"enum":["bf16","fp8-e4m3fn-with-f32-scale","fp8-e4m3fnuz-with-f32-scale"]}, + "api_output_dtype":{"enum":["bf16","fp8-e4m3fn-with-f32-scale","fp8-e4m3fnuz-with-f32-scale"]}, + "communication_format":{"enum":["bf16","fp8-e4m3fn","fp8-e4m3fnuz","logfmt10"]}, + "conversion_boundary":{"enum":["none","before-dispatch-timing","inside-dispatch-timing","inside-combine-timing"]}, + 
"padding_contract":{"enum":["none","right-zero-pad-hidden-to-128","right-zero-pad-values-to-64"]}, + "quantization_origin":{"enum":["none","caller-prequantized","backend-fused","backend-internal","backend-internal-direct-cast"]}, + "scale_dtype":{"oneOf":[{"type":"null"},{"enum":["f32","implicit-logfmt10"]}]}, + "scale_group_size":{"oneOf":[{"type":"null"},{"enum":[64,128]}]}, + "scale_layout":{"enum":["none","per-token-hidden-block","dynamic-per-64-values"]} + } + }, + "communicationPrecision": { + "type":"object", + "additionalProperties":false, + "required":["combine","dispatch","modes","profile_id"], + "properties": { + "combine":{"$ref":"#/$defs/communicationAxis"}, + "dispatch":{"$ref":"#/$defs/communicationAxis"}, + "modes":{"type":"array","minItems":1,"uniqueItems":true,"items":{"enum":["normal","low-latency"]}}, + "profile_id":{"$ref":"#/$defs/precisionProfile"} + } + }, + "caseProfile": { + "type":"object", + "additionalProperties":false, + "allOf":[ + { + "if":{"properties":{"mode":{"const":"normal"}},"required":["mode"]}, + "then":{"properties":{ + "combine_semantics":{"const":"activation-only"}, + "component_order_contract":{"const":"qualification-hash-rotated-components-v1"}, + "contract":{"const":"layout-and-dispatch-v1"}, + "correctness_scope":{"const":"dispatch-metadata-and-transformed-combine"}, + "oracle_contract":{"const":"expert-specific-transform-v1"}, + "payload_unit":{"const":"token-rank"} + }} + }, + { + "if":{"properties":{"mode":{"const":"low-latency"}},"required":["mode"]}, + "then":{"properties":{ + "combine_semantics":{"const":"gate-weighted"}, + "component_order_contract":{"const":"qualification-hash-rotated-components-v1"}, + "contract":{"const":"expert-packed-weighted-combine-v1"}, + "correctness_scope":{"const":"expert-assignment-and-weighted-combine"}, + "oracle_contract":{"const":"expert-assignment-transform-v1"}, + "payload_unit":{"const":"token-expert"} + }} + } + ], + 
"required":["activation_generator","activation_profile","combine_dtype","combine_quant_mode","combine_semantics","component_order_contract","conditioning_contract","contract","correctness_scope","dtype","eplb_planner","eplb_redundant_experts","eplb_reference_tokens_per_rank","mode","oracle_contract","oracle_tolerances","payload_unit","placement","percentile_method","rank_reduction","resource_mode","routing_generator","sampling_contract","seed","source_identity_contract"], + "properties": { + "activation_generator":{"const":"collectivex-activation-counter-v4"}, + "activation_profile":{"const":"canonical-counter-source-v4"}, + "combine_dtype":{"const":"bf16"}, + "combine_quant_mode":{"const":"none"}, + "combine_semantics":{"enum":["activation-only","gate-weighted"]}, + "communication_precision":{"$ref":"#/$defs/communicationPrecision"}, + "component_order_contract":{"const":"qualification-hash-rotated-components-v1"}, + "conditioning_contract":{"const":"fixed-phase-ramp-8-roundtrips-v1"}, + "contract":{"enum":["layout-and-dispatch-v1","expert-packed-weighted-combine-v1"]}, + "correctness_scope":{"enum":["dispatch-metadata-and-transformed-combine","expert-assignment-and-weighted-combine"]}, + "dtype":{"const":"bf16"}, + "eplb_planner":{"const":"greedy-rank-major-v1"}, + "eplb_redundant_experts":{"const":32}, + "eplb_reference_tokens_per_rank":{"const":2048}, + "mode":{"enum":["normal","low-latency"]}, + "oracle_contract":{"enum":["expert-specific-transform-v1","expert-assignment-transform-v1"]}, + "oracle_tolerances":{"const":"rtol=0.05,atol=0.02"}, + "payload_unit":{"enum":["token-rank","token-expert"]}, + "placement":{"const":"packed"}, + "percentile_method":{"const":"nearest-rank"}, + "rank_reduction":{"const":"cross-rank-max-per-iteration"}, + "resource_mode":{"const":"fixed-profile"}, + "routing_generator":{"const":"collectivex-routing-counter-v3"}, + "sampling_contract":{"const":"fixed-512-v1"}, + "seed":{"const":67}, + 
"source_identity_contract":{"const":"bounded-sign-bit-source-v1"} + } + }, + "allocationFactors": { + "type": "object", + "additionalProperties": false, + "required": ["artifact","execution_id","job","qualification_index","repo","run_attempt","run_id","runner","source_sha"], + "properties": { + "artifact": {"$ref": "#/$defs/nullableText"}, + "execution_id": {"$ref": "#/$defs/nullableText"}, + "job": {"$ref": "#/$defs/nullableText"}, + "qualification_index": {"type":"integer","minimum":1,"maximum":3}, + "repo": {"$ref": "#/$defs/nullableText"}, + "run_attempt": {"$ref": "#/$defs/nullableText"}, + "run_id": {"$ref": "#/$defs/nullableText"}, + "runner": {"$ref": "#/$defs/nullableText"}, + "source_sha": {"$ref": "#/$defs/nullableText"} + } + }, + "gitRun": { + "type": "object", + "additionalProperties": false, + "required": ["artifact","job","qualification_index","ref","repo","run_attempt","run_id","source_sha"], + "properties": { + "artifact": {"type": "string","minLength": 1}, + "job": {"type": "string","minLength": 1}, + "qualification_index": {"type":"integer","minimum":1,"maximum":3}, + "ref": {"type": "string","minLength": 1}, + "repo": {"type": "string","pattern": "^[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+$"}, + "run_attempt": {"type": "string","pattern": "^[1-9][0-9]*$"}, + "run_id": {"type": "string","pattern": "^[1-9][0-9]*$"}, + "source_sha": {"type": "string","pattern": "^[0-9a-f]{40}$"} + } + }, + "case": { + "type": "object", + "additionalProperties": false, + "required": [ + "backend", + "canonical", + "eplb", + "ep", + "experts", + "gpus_per_node", + "hidden", + "ladder", + "mode", + "nodes", + "phase", + "required_publication", + "routing", + "samples_per_point", + "scale_out_transport", + "scale_up_domain", + "scale_up_transport", + "scope", + "suite", + "timing", + "topk", + "topology_class", + "transport", + "warmup_semantics", + "workload" + ], + "properties": { + "backend": {"$ref": "#/$defs/safeId"}, + "canonical": {"type": "boolean"}, + "eplb": {"type": 
"boolean"}, + "ep": {"type": "integer","minimum": 1}, + "experts": {"type": "integer","minimum": 1}, + "gpus_per_node": {"type": "integer","minimum": 1}, + "hidden": {"type": "integer","minimum": 1}, + "ladder": {"type": "string","pattern": "^[1-9][0-9]*( [1-9][0-9]*)*$"}, + "mode": {"enum": ["normal","low-latency"]}, + "nodes": {"type": "integer","minimum": 1}, + "phase": {"enum": ["decode","prefill"]}, + "precision_profile": {"$ref":"#/$defs/precisionProfile"}, + "required_publication": {"enum": ["official","comparable-experimental","diagnostic"]}, + "routing": {"enum": ["uniform","zipf"]}, + "samples_per_point": {"const": 512}, + "scale_out_transport": {"oneOf": [{"type": "null"},{"$ref": "#/$defs/safeId"}]}, + "scale_up_domain": {"type": "integer","minimum": 1}, + "scale_up_transport": {"$ref": "#/$defs/safeId"}, + "scope": {"enum": ["scale-up","scale-out"]}, + "suite": {"$ref": "#/$defs/safeId"}, + "timing": {"const": "8:64:32"}, + "topk": {"type": "integer","minimum": 1}, + "topology_class": {"$ref": "#/$defs/safeId"}, + "transport": {"$ref": "#/$defs/safeId"}, + "warmup_semantics": {"const": "full-roundtrip-before-each-component-trial-point-v1"}, + "workload": {"$ref": "#/$defs/safeId"} + } + } + } +} diff --git a/experimental/CollectiveX/source_archive.py b/experimental/CollectiveX/source_archive.py new file mode 100644 index 000000000..c027490a6 --- /dev/null +++ b/experimental/CollectiveX/source_archive.py @@ -0,0 +1,349 @@ +#!/usr/bin/env python3 +"""Validate and extract one pinned backend from a shared source tar.""" +from __future__ import annotations + +import argparse +import os +from pathlib import Path, PurePosixPath +import stat +import tarfile +from typing import Optional, Sequence + + +PathParts = tuple[str, ...] 
+_DIRECTORY_FLAGS = os.O_RDONLY | os.O_DIRECTORY | os.O_NOFOLLOW | os.O_CLOEXEC +_FILE_FLAGS = os.O_WRONLY | os.O_CREAT | os.O_EXCL | os.O_NOFOLLOW | os.O_CLOEXEC +MAX_ARCHIVE_MEMBERS = 20_000 +MAX_MEMBER_BYTES = 512 * 1024 * 1024 +MAX_EXPANDED_BYTES = 2 * 1024 * 1024 * 1024 +MAX_ARCHIVE_BYTES = 4 * 1024 * 1024 * 1024 +MAX_ARCHIVE_HEADERS = 40_000 +MAX_EXTENSION_BYTES = 64 * 1024 * 1024 +MAX_EXTENSION_MEMBER_BYTES = 1024 * 1024 +MAX_EXTENSION_CHAIN = 8 +_TAR_BLOCK = 512 +_EXTENSION_TYPES = {b"L", b"K", b"x", b"g", b"X"} + + +class SourceArchiveError(ValueError): + """The backend source archive cannot be extracted safely.""" + + +def _tar_size(field: bytes) -> int: + if field[0] in (0o200, 0o377): + value = int.from_bytes(field[1:], "big") + if field[0] == 0o377: + value -= 256 ** (len(field) - 1) + return value + try: + text = field.split(b"\0", 1)[0].decode("ascii").strip() + return int(text or "0", 8) + except (UnicodeDecodeError, ValueError) as exc: + raise SourceArchiveError("archive contains an invalid size field") from exc + + +def _preflight_archive(descriptor: int, archive_size: int) -> None: + if archive_size <= 0 or archive_size > MAX_ARCHIVE_BYTES: + raise SourceArchiveError("backend source archive exceeds the raw size limit") + offset = headers = extension_bytes = extension_chain = 0 + while offset < archive_size: + header = os.pread(descriptor, _TAR_BLOCK, offset) + if len(header) != _TAR_BLOCK: + raise SourceArchiveError("archive header is truncated") + if not any(header): + return + headers += 1 + if headers > MAX_ARCHIVE_HEADERS: + raise SourceArchiveError("archive has too many physical headers") + size = _tar_size(header[124:136]) + if size < 0: + raise SourceArchiveError("archive contains a negative payload size") + type_flag = header[156:157] + if type_flag in _EXTENSION_TYPES: + extension_chain += 1 + extension_bytes += size + if ( + extension_chain > MAX_EXTENSION_CHAIN + or size > MAX_EXTENSION_MEMBER_BYTES + or extension_bytes > 
MAX_EXTENSION_BYTES + ): + raise SourceArchiveError("archive extension metadata exceeds its limit") + if type_flag in {b"x", b"g", b"X"}: + payload = os.pread(descriptor, size, offset + _TAR_BLOCK) + if len(payload) != size: + raise SourceArchiveError("archive extension metadata is truncated") + if b"GNU.sparse." in payload: + raise SourceArchiveError("archive contains sparse extension metadata") + else: + extension_chain = 0 + if type_flag == b"S": + raise SourceArchiveError("archive contains a sparse member") + blocks = (size + _TAR_BLOCK - 1) // _TAR_BLOCK + offset += _TAR_BLOCK + blocks * _TAR_BLOCK + if offset > archive_size: + raise SourceArchiveError("archive payload is truncated") + + +def _member_parts(name: str) -> PathParts: + if not name or "\\" in name or "\0" in name: + raise SourceArchiveError("archive contains a noncanonical member path") + path = PurePosixPath(name) + if ( + path.is_absolute() + or path.as_posix() != name + or not path.parts + or path.parts[0] != ".cx_sources" + or any(part in {"", ".", ".."} for part in path.parts) + ): + raise SourceArchiveError("archive contains a noncanonical member path") + return path.parts + + +def _root_parts(root_basename: str) -> PathParts: + path = PurePosixPath(root_basename) + if ( + not root_basename + or "\\" in root_basename + or "\0" in root_basename + or path.is_absolute() + or path.as_posix() != root_basename + or len(path.parts) != 1 + or path.parts[0] in {"", ".", ".."} + ): + raise SourceArchiveError("invalid backend source root") + return (".cx_sources", root_basename) + + +def _read_members(archive: tarfile.TarFile) -> list[tarfile.TarInfo]: + members: list[tarfile.TarInfo] = [] + for member in archive: + if len(members) >= MAX_ARCHIVE_MEMBERS: + raise SourceArchiveError("archive has an invalid member count") + members.append(member) + return members + + +def _validate_members( + members: list[tarfile.TarInfo], selected_root: PathParts +) -> dict[PathParts, tarfile.TarInfo]: + if not members 
or len(members) > MAX_ARCHIVE_MEMBERS: + raise SourceArchiveError("archive has an invalid member count") + entries: dict[PathParts, tarfile.TarInfo] = {} + expanded_bytes = 0 + for member in members: + parts = _member_parts(member.name) + if parts in entries: + raise SourceArchiveError("archive contains duplicate member paths") + if member.sparse is not None: + raise SourceArchiveError("archive contains a sparse member") + if member.isdir(): + if member.size != 0: + raise SourceArchiveError("archive contains an invalid directory") + elif member.isfile(): + if member.size < 0 or member.size > MAX_MEMBER_BYTES: + raise SourceArchiveError("archive member exceeds the size limit") + expanded_bytes += member.size + if expanded_bytes > MAX_EXPANDED_BYTES: + raise SourceArchiveError("archive exceeds the expanded size limit") + elif member.issym(): + if member.size != 0: + raise SourceArchiveError("archive contains an invalid symbolic link") + else: + raise SourceArchiveError("archive contains a non-file member") + entries[parts] = member + + source_parent = entries.get((".cx_sources",)) + selected = entries.get(selected_root) + if source_parent is None or not source_parent.isdir(): + raise SourceArchiveError("archive is missing its source directory") + if selected is None or not selected.isdir(): + raise SourceArchiveError("archive is missing the selected backend source") + + for parts in entries: + for depth in range(1, len(parts)): + parent = entries.get(parts[:depth]) + if parent is None or not parent.isdir(): + raise SourceArchiveError("archive member has an unsafe parent") + + for parts, member in entries.items(): + if not member.issym(): + continue + target_name = member.linkname + target_path = PurePosixPath(target_name) + if ( + not target_name + or "\\" in target_name + or "\0" in target_name + or target_path.is_absolute() + or target_path.as_posix() != target_name + ): + raise SourceArchiveError("archive contains an unsafe symbolic link") + target = 
list(parts[:-1]) + for component in target_path.parts: + if component == "..": + if len(target) <= 2: + raise SourceArchiveError("symbolic link escapes its backend source") + target.pop() + else: + target.append(component) + resolved = tuple(target) + if resolved[:2] != parts[:2]: + raise SourceArchiveError("symbolic link crosses backend sources") + target_member = entries.get(resolved) + if target_member is None or not target_member.isfile(): + raise SourceArchiveError("symbolic link target is not a regular archive file") + return entries + + +def _open_directory(root_fd: int, parts: PathParts) -> int: + descriptor = os.dup(root_fd) + try: + for part in parts: + child = os.open(part, _DIRECTORY_FLAGS, dir_fd=descriptor) + os.close(descriptor) + descriptor = child + return descriptor + except BaseException: + os.close(descriptor) + raise + + +def _create_directory(root_fd: int, parts: PathParts) -> None: + parent_fd = _open_directory(root_fd, parts[:-1]) + try: + os.mkdir(parts[-1], mode=0o700, dir_fd=parent_fd) + finally: + os.close(parent_fd) + + +def _extract_file( + archive: tarfile.TarFile, root_fd: int, parts: PathParts, member: tarfile.TarInfo +) -> None: + parent_fd = _open_directory(root_fd, parts[:-1]) + descriptor = -1 + source = None + try: + mode = 0o700 if member.mode & 0o111 else 0o600 + descriptor = os.open(parts[-1], _FILE_FLAGS, mode, dir_fd=parent_fd) + source = archive.extractfile(member) + if source is None: + raise SourceArchiveError("archive file has no readable payload") + remaining = member.size + while remaining: + chunk = source.read(min(1024 * 1024, remaining)) + if not chunk: + raise SourceArchiveError("archive file payload is truncated") + view = memoryview(chunk) + while view: + written = os.write(descriptor, view) + view = view[written:] + remaining -= len(chunk) + os.fchmod(descriptor, mode) + finally: + if source is not None: + source.close() + if descriptor >= 0: + os.close(descriptor) + os.close(parent_fd) + + +def 
_extract_symlink(root_fd: int, parts: PathParts, member: tarfile.TarInfo) -> None: + parent_fd = _open_directory(root_fd, parts[:-1]) + try: + os.symlink(member.linkname, parts[-1], dir_fd=parent_fd) + finally: + os.close(parent_fd) + + +def _extract_selected( + archive: tarfile.TarFile, + destination_fd: int, + entries: dict[PathParts, tarfile.TarInfo], + selected_root: PathParts, +) -> None: + try: + os.stat(".cx_sources", dir_fd=destination_fd, follow_symlinks=False) + except FileNotFoundError: + pass + else: + raise SourceArchiveError("backend source output already exists") + + selected = { + parts: member + for parts, member in entries.items() + if parts[: len(selected_root)] == selected_root + } + _create_directory(destination_fd, (".cx_sources",)) + directories = sorted( + (parts for parts, member in selected.items() if member.isdir()), + key=lambda parts: (len(parts), parts), + ) + for parts in directories: + _create_directory(destination_fd, parts) + for parts, member in sorted(selected.items()): + if member.isfile(): + _extract_file(archive, destination_fd, parts, member) + for parts, member in sorted(selected.items()): + if member.issym(): + _extract_symlink(destination_fd, parts, member) + + +def extract_source_archive( + archive_path: Path, destination: Path, root_basename: str +) -> None: + """Validate the complete tar, then safely extract one backend source root.""" + selected_root = _root_parts(root_basename) + archive_fd = os.open(archive_path, os.O_RDONLY | os.O_NOFOLLOW | os.O_CLOEXEC) + try: + metadata = os.fstat(archive_fd) + if ( + not stat.S_ISREG(metadata.st_mode) + or metadata.st_uid != os.getuid() + or stat.S_IMODE(metadata.st_mode) & 0o022 + ): + raise SourceArchiveError("backend source archive has unsafe metadata") + _preflight_archive(archive_fd, metadata.st_size) + with os.fdopen(os.dup(archive_fd), "rb") as stream: + try: + with tarfile.open(fileobj=stream, mode="r:") as archive: + entries = _validate_members(_read_members(archive), 
selected_root) + destination_fd = os.open(destination, _DIRECTORY_FLAGS) + try: + destination_metadata = os.fstat(destination_fd) + if ( + destination_metadata.st_uid != os.getuid() + or stat.S_IMODE(destination_metadata.st_mode) != 0o700 + ): + raise SourceArchiveError("backend source destination is unsafe") + previous_umask = os.umask(0o077) + try: + _extract_selected( + archive, destination_fd, entries, selected_root + ) + finally: + os.umask(previous_umask) + finally: + os.close(destination_fd) + except RecursionError as exc: + raise SourceArchiveError("archive extension metadata is recursive") from exc + finally: + os.close(archive_fd) + + +def main(argv: Optional[Sequence[str]] = None) -> int: + parser = argparse.ArgumentParser( + description="Safely install one pinned backend source archive" + ) + parser.add_argument("archive", type=Path) + parser.add_argument("destination", type=Path) + parser.add_argument("root_basename") + args = parser.parse_args(argv) + try: + extract_source_archive(args.archive, args.destination, args.root_basename) + except (OSError, SourceArchiveError, tarfile.TarError) as exc: + parser.error(f"backend source archive rejected: {exc}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/summarize.py b/experimental/CollectiveX/summarize.py new file mode 100644 index 000000000..3752db6b9 --- /dev/null +++ b/experimental/CollectiveX/summarize.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 +"""Render a small native-v1 shard summary and gate on a successful case.""" +from __future__ import annotations + +import argparse +from pathlib import Path + +import contracts + + +def load_results(directory: str, runner: str | None, timestamp: str | None) -> list[dict]: + documents: list[dict] = [] + for path in sorted(Path(directory).glob("*.json")): + if runner and not path.name.startswith(f"{runner}_"): + continue + if timestamp and timestamp not in path.name: + continue + try: + document = 
contracts.strict_load(path) + if document.get("format") == contracts.RAW_FORMAT: + documents.append(contracts.load_raw_attempt(path)) + elif document.get("format") == contracts.TERMINAL_FORMAT: + documents.append(contracts.validate_terminal_document(document)) + except (contracts.ContractError, OSError): + continue + return documents + + +def _identity(document: dict) -> tuple[str, str, str, str, bool, str, int]: + case = document["case"] + if document["format"] == contracts.RAW_FORMAT: + routing = case["shape"]["routing"] + eplb = case["eplb"]["enabled"] + else: + routing = case["routing"] + eplb = case["eplb"] + sku = document["identity"]["case_factors"]["sku"] + return ( + sku, case["suite"], routing, case["phase"], eplb, + case["required_publication"], case.get("ep_size", case.get("ep", 0)), + ) + + +def _headline(document: dict) -> tuple[int | str, float | str, float | str]: + if document["format"] != contracts.RAW_FORMAT: + return "-", "-", "-" + rows = document["measurement"]["rows"] + row = next((item for item in rows if item["tokens_per_rank"] == 64), rows[len(rows) // 2]) + latency = row["components"]["roundtrip"]["percentiles_us"] + return row["tokens_per_rank"], latency["p50"], latency["p99"] + + +def render(documents: list[dict], markdown: bool) -> str: + documents = sorted(documents, key=_identity) + if markdown: + lines = [ + "## CollectiveX EP results", "", + "| sku | backend | suite | phase | routing | tier | ep | outcome | T* | p50 us | p99 us |", + "|---|---|---|---|---|---|--:|---|--:|--:|--:|", + ] + for document in documents: + sku, suite, routing, phase, eplb, tier, ep = _identity(document) + backend = document["case"]["backend"] + token, p50, p99 = _headline(document) + lines.append( + f"| {sku} | `{backend}` | {suite} | {phase} | " + f"{routing}{'+eplb' if eplb else ''} | {tier} | {ep} | " + f"{document['outcome']['status']} | {token} | {p50} | {p99} |" + ) + if not documents: + lines.append("\n> No valid native v1 outcome documents 
found.") + return "\n".join(lines) + lines = ["CollectiveX EP results", "======================"] + for document in documents: + sku, suite, routing, phase, eplb, tier, ep = _identity(document) + backend = document["case"]["backend"] + token, _, p99 = _headline(document) + lines.append( + f" {sku:<10} {backend:<16} {suite:<13} {phase:<7} " + f"{routing}{'+eplb' if eplb else ''} {tier} ep{ep} " + f"{document['outcome']['status']} T={token} roundtrip_p99_us={p99}" + ) + return "\n".join(lines) + + +def main() -> int: + parser = argparse.ArgumentParser(description="Summarize CollectiveX native v1 outcomes") + parser.add_argument("--results-dir", default="results") + parser.add_argument("--runner") + parser.add_argument("--ts") + parser.add_argument("--markdown", action="store_true") + args = parser.parse_args() + documents = load_results(args.results_dir, args.runner, args.ts) + print(render(documents, args.markdown)) + if args.markdown: + return 0 + return 0 if any( + document["format"] == contracts.RAW_FORMAT + and document["outcome"]["status"] == "success" + for document in documents + ) else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/sweep_matrix.py b/experimental/CollectiveX/sweep_matrix.py new file mode 100644 index 000000000..309199fe2 --- /dev/null +++ b/experimental/CollectiveX/sweep_matrix.py @@ -0,0 +1,1453 @@ +#!/usr/bin/env python3 +"""Resolve CollectiveX v1 suites and extract validated execution shards. + +Mode changes measurement semantics and therefore participates in case identity. +Precision sensitivity uses allowlisted communication profiles; provisional native +paths remain outside the executable matrix until their probes are resolved. 
+""" +from __future__ import annotations + +import argparse +import datetime as dt +import hashlib +import itertools +import json +import os +from pathlib import Path +import re +import sys +from typing import Any + +HERE = Path(__file__).resolve().parent +sys.path.insert(0, str(HERE)) +sys.path.insert(0, str(HERE / "tests")) + +try: # Shard extraction on GPU runners is intentionally stdlib-only. + import yaml # type: ignore +except ModuleNotFoundError: # pragma: no cover - exercised by the workflow environment + yaml = None + +import capability as cap # noqa: E402 +import contracts # noqa: E402 +import ep_harness # noqa: E402 +import identity # noqa: E402 + + +EP_TIMING_PROFILE = ( + f"{ep_harness.TIMED_ITERS_PER_TRIAL}:{ep_harness.TRIALS_PER_POINT}:" + f"{ep_harness.WARMUP_ITERS_PER_TRIAL}" +) +V1_WORKLOAD = ("deepseek-v3-v1", 7168, 8, 256) +V1_SUITE_CONTRACTS = { + "ep-core-v1": { + "mode": "normal", + "publication": "official", + "coordinates": { + ("normal", "decode", "uniform", False), + ("normal", "prefill", "uniform", False), + }, + "ladders": { + "decode": tuple(ep_harness.DECODE_LADDER), + "prefill": (256, 512), + }, + }, + "ep-routing-v1": { + "mode": "normal", + "publication": "comparable-experimental", + "coordinates": { + ("normal", "decode", "zipf", False), + ("normal", "decode", "zipf", True), + ("normal", "prefill", "zipf", False), + ("normal", "prefill", "zipf", True), + }, + "ladders": {"decode": (128,), "prefill": (512,)}, + }, + "ep-low-latency-v1": { + "mode": "low-latency", + "publication": "official", + "backends": {"deepep", "uccl"}, + "coordinates": {("low-latency", "decode", "uniform", False)}, + "ladders": {"decode": tuple(ep_harness.DECODE_LADDER)}, + }, + "ep-precision-normal-v1": { + "mode": "normal", + "publication": "comparable-experimental", + "backends": {"deepep", "deepep-v2", "uccl", "deepep-hybrid", "mori"}, + "precision_profiles": identity.V1_NORMAL_PRECISION_PROFILE_IDS, + "coordinates": { + ("normal", "decode", "uniform", 
False), + ("normal", "prefill", "uniform", False), + }, + "ladders": {"decode": (128,), "prefill": (512,)}, + }, + "ep-precision-low-latency-v1": { + "mode": "low-latency", + "publication": "comparable-experimental", + "backends": {"deepep", "uccl"}, + "precision_profiles": identity.V1_LOW_LATENCY_PRECISION_PROFILE_IDS, + "coordinates": {("low-latency", "decode", "uniform", False)}, + "ladders": {"decode": (128,)}, + }, +} +IDENTIFIER = re.compile(r"[a-z0-9][a-z0-9.-]*") +SUITE_FIELDS = { + "backends", "ep_degrees", "eplb", "mode", "phases", "platforms", + "precision_profiles", "provisional", "required_publication", "routings", + "token_points", "token_points_decode", "token_points_prefill", "workloads", +} +SUITE_REQUIRED = { + "ep_degrees", "mode", "phases", "platforms", "required_publication", "routings", + "workloads", +} +TOPOLOGY_FIELDS = ( + "nodes", "gpus_per_node", "scale_up_domain", "scope", "scale_up_transport", + "scale_out_transport", "transport", "topology_class", +) +QUALIFICATION_INDICES = range(1, 4) + + +class MatrixError(ValueError): + """A matrix or shard-control document violates the execution contract.""" + + +if yaml is not None: + class _UniqueKeyLoader(yaml.SafeLoader): + pass + + def _unique_mapping(loader: Any, node: Any, deep: bool = False) -> dict[Any, Any]: + result: dict[Any, Any] = {} + for key_node, value_node in node.value: + key = loader.construct_object(key_node, deep=deep) + if key in result: + raise SystemExit(f"duplicate YAML key {key!r} at line {key_node.start_mark.line + 1}") + result[key] = loader.construct_object(value_node, deep=deep) + return result + + _UniqueKeyLoader.add_constructor( + yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG, _unique_mapping + ) + + +def _load(name: str) -> dict[str, Any]: + if yaml is None: + raise SystemExit("matrix generation requires PyYAML; shard extraction does not") + try: + with (HERE / "configs" / name).open() as fh: + document = yaml.load(fh, Loader=_UniqueKeyLoader) + except 
yaml.YAMLError as exc: + raise SystemExit(f"configs/{name} is not valid YAML: {exc}") from exc + if not isinstance(document, dict): + raise SystemExit(f"configs/{name} must contain a YAML object") + return document + + +def _workload_registry(workloads: dict[str, Any]) -> dict[str, dict[str, Any]]: + return { + name: cfg + for section in ("synthetic", "model_derived") + for name, cfg in (workloads.get(section) or {}).items() + } + + +def _fields(value: Any, path: str, allowed: set[str], required: set[str]) -> dict[str, Any]: + if not isinstance(value, dict): + raise SystemExit(f"{path} must be an object") + if any(not isinstance(key, str) for key in value): + raise SystemExit(f"{path} field names must be strings") + unknown, missing = set(value) - allowed, required - set(value) + if unknown or missing: + raise SystemExit(f"{path} fields: unknown={sorted(unknown)}, missing={sorted(missing)}") + return value + + +def _list(value: Any, path: str, item_type: type, allowed: set[Any] | None = None) -> list[Any]: + if (not isinstance(value, list) or not value + or any(type(item) is not item_type for item in value) + or len(value) != len(set(value)) + or (allowed is not None and any(item not in allowed for item in value))): + raise SystemExit(f"{path} must be a non-empty unique list of valid {item_type.__name__}s") + return value + + +def validate_config_documents( + suites_document: dict[str, Any], workloads: dict[str, Any] +) -> None: + """Reject configuration that is ambiguous, unused, or outside the v1 grid.""" + _fields( + suites_document, "configs/suites.yaml", + {"schema_version", "suites"}, {"schema_version", "suites"}, + ) + _fields( + workloads, "configs/workloads.yaml", + {"schema_version", "synthetic", "model_derived"}, {"schema_version"}, + ) + if type(suites_document["schema_version"]) is not int or suites_document["schema_version"] != 1: + raise SystemExit("configs/suites.yaml schema_version must be integer 1") + if type(workloads["schema_version"]) is not 
int or workloads["schema_version"] != 1: + raise SystemExit("configs/workloads.yaml schema_version must be integer 1") + registry: dict[str, dict[str, Any]] = {} + for section, expert_field in ( + ("synthetic", "experts"), + ("model_derived", "routed_experts"), + ): + entries = workloads.get(section, {}) + if not isinstance(entries, dict): + raise SystemExit(f"workloads.{section} must be an object") + for name, value in entries.items(): + if not isinstance(name, str) or not IDENTIFIER.fullmatch(name) or name in registry: + raise SystemExit(f"workloads.{section} has invalid or duplicate name {name!r}") + fields = {"hidden", "topk", expert_field, "verified_against"} + config = _fields(value, f"workload {name}", fields, fields - {"verified_against"}) + dimensions = [config[key] for key in ("hidden", "topk", expert_field)] + if any(type(item) is not int or item <= 0 for item in dimensions): + raise SystemExit(f"workload {name} dimensions must be positive integers") + if dimensions[1] > dimensions[2]: + raise SystemExit(f"workload {name}.topk exceeds its expert count") + source = config.get("verified_against") + if source is not None and (not isinstance(source, str) or not source.strip()): + raise SystemExit(f"workload {name}.verified_against must be a non-empty string") + registry[name] = config + if not registry: + raise SystemExit("configs/workloads.yaml must define at least one workload") + + suites = suites_document["suites"] + if not isinstance(suites, dict) or not suites: + raise SystemExit("configs/suites.yaml suites must be a non-empty object") + referenced: set[str] = set() + for name, value in suites.items(): + if not isinstance(name, str) or not IDENTIFIER.fullmatch(name): + raise SystemExit(f"invalid suite name {name!r}") + suite = _fields(value, f"suite {name}", SUITE_FIELDS, SUITE_REQUIRED) + contract = V1_SUITE_CONTRACTS.get(name) + if contract is None: + raise SystemExit(f"suite {name} is outside the frozen v1 catalog") + mode = suite["mode"] + if mode 
not in identity.V1_CASE_PROFILES or mode != contract["mode"]: + raise SystemExit(f"suite {name}.mode differs from the frozen v1 catalog") + suite_backends = _list( + suite.get("backends", list(cap.SWEEP_BACKENDS)), + f"suite {name}.backends", + str, + set(cap.SWEEP_BACKENDS), + ) + expected_backends = contract.get("backends") + if expected_backends is not None and set(suite_backends) != expected_backends: + raise SystemExit(f"suite {name}.backends differs from the frozen v1 catalog") + if expected_backends is None and "backends" in suite: + raise SystemExit(f"suite {name}.backends must be omitted") + expected_profiles = contract.get("precision_profiles") + if expected_profiles is None: + if "precision_profiles" in suite or "provisional" in suite: + raise SystemExit( + f"suite {name} cannot add precision fields to a baseline suite" + ) + precision_profiles: list[str] = [] + else: + precision_profiles = _list( + suite.get("precision_profiles"), + f"suite {name}.precision_profiles", + str, + set(identity.V1_PRECISION_PROFILES), + ) + if tuple(precision_profiles) != expected_profiles: + raise SystemExit( + f"suite {name}.precision_profiles differs from the frozen v1 catalog" + ) + if identity.V1_CONTROL_PRECISION_PROFILE in precision_profiles: + raise SystemExit( + f"suite {name} must reference existing BF16 evidence, not duplicate it" + ) + if any( + mode not in identity.V1_PRECISION_PROFILES[profile]["modes"] + for profile in precision_profiles + ): + raise SystemExit(f"suite {name} contains a precision profile for another mode") + if type(suite.get("provisional")) is not bool: + raise SystemExit(f"suite {name}.provisional must be a boolean") + unresolved = cap.provisional_precision_targets(precision_profiles) + if suite["provisional"] != bool(unresolved): + raise SystemExit( + f"suite {name}.provisional must track unresolved capability targets" + ) + candidates = cap.precision_targets(precision_profiles) + covered_candidates = [ + target for target in candidates + 
if target["backend"] in suite_backends + and target["sku"] in suite["platforms"] + and target["ep"] in suite["ep_degrees"] + and target["mode"] == mode + ] + if covered_candidates != candidates: + raise SystemExit( + f"suite {name} does not cover every declared precision target" + ) + suite_workloads = _list(suite["workloads"], f"suite {name}.workloads", str) + unknown = sorted(set(suite_workloads) - set(registry)) + if unknown: + raise SystemExit(f"suite {name}: unknown workloads {unknown}") + referenced.update(suite_workloads) + platforms = _list( + suite["platforms"], f"suite {name}.platforms", str, set(cap.PLATFORMS) + ) + phases = _list(suite["phases"], f"suite {name}.phases", str, {"decode", "prefill"}) + routings = _list(suite["routings"], f"suite {name}.routings", str, {"uniform", "zipf"}) + eplb = _list(suite.get("eplb", [False]), f"suite {name}.eplb", bool) + if True in eplb and routings != ["zipf"]: + raise SystemExit(f"suite {name}: EPLB is only valid for Zipf routing") + if suite["required_publication"] not in {"official", "comparable-experimental"}: + raise SystemExit(f"suite {name}.required_publication is invalid") + if suite["required_publication"] != contract["publication"]: + raise SystemExit( + f"suite {name}.required_publication differs from the frozen v1 catalog" + ) + if suite["required_publication"] == "official": + unverified = [item for item in suite_workloads if not registry[item].get("verified_against")] + if unverified: + raise SystemExit(f"suite {name}: official workloads need verified_against: {unverified}") + degrees = _list(suite["ep_degrees"], f"suite {name}.ep_degrees", int) + if degrees != [8, 16]: + raise SystemExit(f"suite {name}.ep_degrees must be exactly [8, 16]") + for platform in platforms: + if not set(degrees).issubset(cap.PLATFORMS[platform]["ep_degrees"]): + raise SystemExit(f"suite {name}: invalid EP degree for {platform}") + for phase in {"decode", "prefill"} - set(phases): + if f"token_points_{phase}" in suite: + 
raise SystemExit(f"suite {name}.token_points_{phase} is unreachable") + if "token_points" in suite and all( + f"token_points_{phase}" in suite for phase in phases + ): + raise SystemExit(f"suite {name}.token_points is unreachable") + for phase in phases: + _ladder(suite, phase) + coordinates = { + (mode, phase, routing, enabled) + for phase, routing, enabled in itertools.product(phases, routings, eplb) + } + if coordinates != contract["coordinates"] or any( + tuple(map(int, _ladder(suite, phase).split())) != contract["ladders"][phase] + for phase in phases + ): + raise SystemExit(f"suite {name} coordinates differ from the frozen v1 catalog") + unused = sorted(set(registry) - referenced) + if unused: + raise SystemExit(f"unreferenced workloads: {unused}") + + +def _dims(workloads: dict[str, Any], name: str) -> tuple[int, int, int]: + config = _workload_registry(workloads)[name] + values = ( + config.get("hidden"), + config.get("topk"), + config.get("experts", config.get("routed_experts")), + ) + return values # type: ignore[return-value] + + +def _ladder(suite: dict[str, Any], phase: str) -> str: + points = suite.get(f"token_points_{phase}", suite.get("token_points")) + if points is None: + points = ep_harness.DECODE_LADDER if phase == "decode" else ep_harness.PREFILL_LADDER + if (not isinstance(points, list) or not points + or any(isinstance(point, bool) or not isinstance(point, int) or point <= 0 + for point in points) + or points != sorted(set(points))): + raise SystemExit(f"invalid {phase} token ladder: {points!r}") + return " ".join(map(str, points)) + + +def _v1_requested_ladder(case: dict[str, Any]) -> str: + """Bind extracted controls to the frozen v1 suite and workload catalog.""" + suite = V1_SUITE_CONTRACTS.get(case.get("suite")) + expected_profiles = None if suite is None else suite.get("precision_profiles") + precision_profile = case.get("precision_profile") + coordinate = ( + case.get("mode"), case.get("phase"), case.get("routing"), case.get("eplb") + 
) + if ( + suite is None + or coordinate not in suite["coordinates"] + or case.get("required_publication") != suite["publication"] + or ( + case.get("workload"), case.get("hidden"), case.get("topk"), case.get("experts") + ) != V1_WORKLOAD + or (expected_profiles is None and precision_profile is not None) + or (expected_profiles is not None and precision_profile not in expected_profiles) + ): + raise MatrixError("case differs from the frozen v1 suite/workload catalog") + return " ".join(map(str, suite["ladders"][case["phase"]])) + + +def _expected_disposition( + sku: str, case: dict[str, Any] +) -> tuple[str, str | None, str | None]: + requested_ladder = _v1_requested_ladder(case) + precision_profile = case.get("precision_profile") + if precision_profile is not None and not cap.precision_target_declared( + precision_profile, + sku=sku, + backend=case["backend"], + ep=case["ep"], + mode=case["mode"], + ): + raise MatrixError("precision case is not an exact native capability target") + disposition, detail = cap.resolve_disposition( + sku, case["backend"], ep=case["ep"], nodes=case["nodes"], + routing=case["routing"], eplb=case["eplb"], mode=case["mode"], + precision_profile=precision_profile, + ) + if disposition == "supported": + if case["ladder"] != requested_ladder: + raise MatrixError("case ladder differs from the frozen v1 suite catalog") + return "runnable", None, None + if case["ladder"] != requested_ladder: + raise MatrixError("unsupported case ladder differs from the frozen v1 suite catalog") + if disposition == "unsupported": + reason = ( + "precision-profile-unsupported" + if precision_profile is not None + else "backend-platform-unsupported" + ) + return "unsupported", reason, detail + if disposition == "provisional": + raise MatrixError("provisional precision target entered the executable matrix") + raise MatrixError("not-applicable precision tuple entered the requested matrix") + + +def _case_id(sku: str, case: dict[str, Any]) -> str: + return 
identity.case_id( + sku=sku, profile=identity.profile_for_case(case), case=case + ) + + +def _semantic_points(sku: str, case: dict[str, Any]) -> list[str]: + execution = { + key: value for key, value in case.items() + if key not in {"canonical", "case_id", "ladder", "required_publication", "suite", "workload"} + } + return [ + json.dumps( + {"sku": sku, "tokens_per_rank": int(point), **execution}, + sort_keys=True, + separators=(",", ":"), + ) + for point in case["ladder"].split() + ] + + +def _select_backends(backend: str, backends: str) -> list[str]: + available = list(cap.SWEEP_BACKENDS) + if backend and backends: + raise SystemExit("--backend and --backends are mutually exclusive") + if backends: + names = available if backends == "all" else [ + value.strip() for value in backends.split(",") if value.strip() + ] + else: + names = [backend or "deepep"] + unknown = sorted(set(names) - set(available)) + if unknown: + raise SystemExit(f"unknown backend values {unknown}; have {available}") + if len(names) != len(set(names)): + raise SystemExit("backend selection contains duplicates") + return names + + +def resolve_matrix( + suites: str = "all", + backend: str = "", + backends: str = "", + only_sku: str = "", + min_nodes: int = 0, + max_nodes: int = 0, + max_cases: int = 128, +) -> dict[str, Any]: + """Resolve suite configuration into allocation-sized workflow shards.""" + if max_cases <= 0: + raise SystemExit("--max-cases must be positive") + if min_nodes < 0 or max_nodes < 0 or (min_nodes and max_nodes and min_nodes > max_nodes): + raise SystemExit("invalid node bounds") + if only_sku and only_sku not in cap.PLATFORMS: + raise SystemExit(f"unknown --only-sku {only_sku!r}; have {sorted(cap.PLATFORMS)}") + + workloads = _load("workloads.yaml") + suites_document = _load("suites.yaml") + validate_config_documents(suites_document, workloads) + registry = suites_document["suites"] + select_all = suites == "all" + names = ( + [name for name, suite in registry.items() if 
not suite.get("provisional", False)] + if select_all + else [value.strip() for value in suites.split(",") if value.strip()] + ) + if not names or len(names) != len(set(names)): + raise SystemExit("suite selection must be non-empty and unique") + unknown = sorted(set(names) - set(registry)) + if unknown: + raise SystemExit(f"unknown suites {unknown}; have {sorted(registry)}") + blocked = [name for name in names if registry[name].get("provisional", False)] + if blocked: + unresolved = sum( + len(cap.provisional_precision_targets(registry[name]["precision_profiles"])) + for name in blocked + ) + raise SystemExit( + f"provisional precision suites cannot be scheduled: {blocked}; " + f"resolve {unresolved} capability targets first" + ) + targets = _select_backends(backend, backends) + + shards: dict[tuple[str, str, int], list[dict[str, Any]]] = {} + requested_cases: list[dict[str, Any]] = [] + scheduled: set[str] = set() + for suite_name in names: + suite = registry[suite_name] + mode = suite["mode"] + phases = suite["phases"] + routings = suite["routings"] + eplb_values = suite.get("eplb", [False]) + precision_profiles = suite.get("precision_profiles", [None]) + suite_backends = set(suite.get("backends", cap.SWEEP_BACKENDS)) + suite_targets = [target for target in targets if target in suite_backends] + if not suite_targets: + continue + for platform_name in suite["platforms"]: + if only_sku and platform_name != only_sku: + continue + ep_degrees = suite["ep_degrees"] + for workload, ep, phase, routing, eplb, target, precision_profile in itertools.product( + suite["workloads"], ep_degrees, phases, routings, eplb_values, + suite_targets, precision_profiles, + ): + if precision_profile is not None and not cap.precision_target_declared( + precision_profile, + sku=platform_name, + backend=target, + ep=ep, + mode=mode, + ): + continue + topology = cap.topology_for(platform_name, ep) + if topology is None: + raise SystemExit( + f"suite {suite_name}: {platform_name} EP{ep} is 
not registered" + ) + nodes = int(topology["nodes"]) + if min_nodes and nodes < min_nodes: + continue + if max_nodes and nodes > max_nodes: + continue + capability_disposition, capability_detail = cap.resolve_disposition( + platform_name, + target, + ep=ep, + nodes=nodes, + routing=routing, + eplb=bool(eplb), + mode=mode, + precision_profile=precision_profile, + ) + hidden, topk, experts = _dims(workloads, workload) + + def add_case( + case_ladder: str, + disposition: str, + reason: str | None, + detail: str | None, + ) -> None: + case: dict[str, Any] = { + "suite": suite_name, + "workload": workload, + "required_publication": suite["required_publication"], + "backend": target, + "routing": routing, + "phase": phase, + "ep": ep, + "eplb": eplb, + "hidden": hidden, + "topk": topk, + "experts": experts, + "samples_per_point": ep_harness.TIMED_SAMPLES_PER_POINT, + "warmup_semantics": ep_harness.WARMUP_SEMANTICS, + "ladder": case_ladder, + "mode": mode, + "timing": EP_TIMING_PROFILE, + "canonical": True, + **{field: topology[field] for field in TOPOLOGY_FIELDS}, + } + if precision_profile is not None: + case["precision_profile"] = precision_profile + for signature in _semantic_points(platform_name, case): + if signature in scheduled: + raise SystemExit( + f"suite {suite_name}: duplicate semantic point for {platform_name}" + ) + scheduled.add(signature) + case["case_id"] = _case_id(platform_name, case) + requested_cases.append( + { + "sku": platform_name, + "case": case, + "disposition": disposition, + "reason": reason, + "detail": detail, + } + ) + if disposition == "runnable": + shards.setdefault((platform_name, target, nodes), []).append(case) + + requested_ladder = _ladder(suite, phase) + if capability_disposition == "not-applicable": + continue + if capability_disposition == "provisional": + raise SystemExit( + f"suite {suite_name}: provisional target escaped its suite gate: " + f"{precision_profile} {target} {platform_name} EP{ep}" + ) + if capability_disposition 
== "unsupported": + add_case( + requested_ladder, + "unsupported", + ( + "precision-profile-unsupported" + if precision_profile is not None + else "backend-platform-unsupported" + ), + capability_detail, + ) + continue + if capability_disposition != "supported": + raise SystemExit( + f"suite {suite_name}: invalid capability disposition " + f"{capability_disposition!r}" + ) + add_case(requested_ladder, "runnable", None, None) + + shards_by_sku: dict[str, list[dict[str, Any]]] = {} + for (sku, target, nodes), cases in sorted(shards.items()): + chunk_size = max_cases + for offset in range(0, len(cases), chunk_size): + chunk = cases[offset:offset + chunk_size] + part = offset // chunk_size + shard_id = f"{sku}-{target}-n{nodes}" + if len(cases) > chunk_size: + shard_id += f"-p{part}" + shards_by_sku.setdefault(sku, []).append({ + "id": shard_id, + "sku": sku, + "backend": target, + "launcher": cap.PLATFORMS[sku]["launcher"], + **{field: chunk[0][field] for field in TOPOLOGY_FIELDS}, + "n": len(chunk), + "execution_weight": execution_weight(chunk), + "case_ids": [case["case_id"] for case in chunk], + }) + include = [ + shards_by_sku[sku][round_index] + for round_index in range(max(map(len, shards_by_sku.values()), default=0)) + for sku in sorted(shards_by_sku) + if round_index < len(shards_by_sku[sku]) + ] + return { + "format": "collectivex.matrix.v1", + "schema_version": 1, + "requested_cases": requested_cases, + "include": include, + } + + +def _strict_json_load(path: Path) -> Any: + def reject_constant(value: str) -> None: + raise MatrixError(f"non-finite JSON number {value}") + + def reject_duplicates(pairs: list[tuple[str, Any]]) -> dict[str, Any]: + result: dict[str, Any] = {} + for key, value in pairs: + if key in result: + raise MatrixError(f"duplicate JSON key {key!r}") + result[key] = value + return result + + if not path.is_file(): + raise MatrixError(f"matrix does not exist: {path}") + if path.stat().st_size == 0: + raise MatrixError(f"matrix is empty: 
{path}") + try: + with path.open() as fh: + return json.load( + fh, parse_constant=reject_constant, object_pairs_hook=reject_duplicates + ) + except (OSError, json.JSONDecodeError) as exc: + raise MatrixError(f"matrix is not valid JSON: {exc}") from exc + + +def _positive_int(value: Any, field: str) -> int: + if type(value) is not int: + raise MatrixError(f"{field} must be a positive integer") + if value <= 0: + raise MatrixError(f"{field} must be a positive integer") + return value + + +def _qualification_index(value: Any, field: str = "qualification_index") -> int: + if type(value) is not int or value not in QUALIFICATION_INDICES: + raise MatrixError(f"{field} must be an integer in 1..3") + return value + + +def _requested_qualification_index(value: int | None = None) -> int: + if value is not None: + return _qualification_index(value) + raw = os.environ.get("CX_QUALIFICATION_INDEX", "1") + if raw not in {"1", "2", "3"}: + raise MatrixError("CX_QUALIFICATION_INDEX must be an integer in 1..3") + return int(raw) + + +def _case_precision_profile(case: dict[str, Any]) -> str: + profile = case.get("precision_profile", identity.V1_CONTROL_PRECISION_PROFILE) + if not isinstance(profile, str) or profile not in identity.V1_PRECISION_PROFILES: + raise MatrixError("qualification case has an invalid precision profile") + return profile + + +def _qualification_digest( + shard_id: str, + case_id: str, + profile_id: str, + qualification_index: int, +) -> bytes: + return hashlib.sha256( + "\0".join((shard_id, case_id, profile_id, str(qualification_index))).encode() + ).digest() + + +def _rotate(values: list[Any], offset: int) -> list[Any]: + if not values: + return [] + position = offset % len(values) + return values[position:] + values[:position] + + +def _seeded_qualification_order( + shard_id: str, + cases: list[dict[str, Any]], + qualification_index: int, +) -> list[dict[str, Any]]: + by_profile: dict[str, list[dict[str, Any]]] = {} + for case in cases: + 
by_profile.setdefault(_case_precision_profile(case), []).append(case) + profiles = sorted( + by_profile, + key=lambda profile: _qualification_digest( + shard_id, "profile-order", profile, 1 + ), + ) + profiles = _rotate(profiles, qualification_index - 1) + groups: dict[str, list[dict[str, Any]]] = {} + for profile in profiles: + group = sorted( + by_profile[profile], + key=lambda case: _qualification_digest( + shard_id, case["case_id"], profile, qualification_index + ), + ) + groups[profile] = _rotate(group, qualification_index - 1) + interleaved = [ + groups[profile][position] + for position in range(max(map(len, groups.values()))) + for profile in profiles + if position < len(groups[profile]) + ] + return interleaved + + +def qualification_execution_order( + shard_id: str, + cases: list[dict[str, Any]], + qualification_index: int, +) -> list[dict[str, Any]]: + """Return one deterministic, repeat-specific permutation of a shard's cases.""" + index = _qualification_index(qualification_index) + if not isinstance(shard_id, str) or not IDENTIFIER.fullmatch(shard_id): + raise MatrixError("qualification shard ID is invalid") + if not isinstance(cases, list) or not cases: + raise MatrixError("qualification planning requires at least one case") + selected: list[dict[str, Any]] = [] + seen: set[tuple[str, ...]] = set() + for current in range(1, index + 1): + candidate = _seeded_qualification_order(shard_id, cases, current) + signature = tuple(case["case_id"] for case in candidate) + if len(cases) >= current and signature in seen: + for offset in range(1, len(candidate)): + rotated = _rotate(candidate, offset) + rotated_signature = tuple(case["case_id"] for case in rotated) + if rotated_signature not in seen: + candidate = rotated + signature = rotated_signature + break + seen.add(signature) + selected = candidate + return selected + + +def execution_plan_sha256(cases: list[dict[str, Any]]) -> str: + """Bind an execution plan to only its ordered case and precision-profile 
IDs.""" + plan = [ + [case["case_id"], _case_precision_profile(case)] + for case in cases + ] + payload = json.dumps(plan, ensure_ascii=True, separators=(",", ":")).encode() + return hashlib.sha256(payload).hexdigest() + + +def execution_weight(cases: list[dict[str, Any]]) -> int: + """Return deterministic GPU-point work used to bound workflow parallelism.""" + if not isinstance(cases, list) or not cases: + raise MatrixError("execution weight requires at least one case") + weight = 0 + for case in cases: + ep = _positive_int(case.get("ep"), "execution-weight.ep") + ladder = case.get("ladder") + if not isinstance(ladder, str) or not ladder.split(): + raise MatrixError("execution weight requires a token ladder") + weight += ep * len(ladder.split()) + return weight + + +def qualification_execution_plan_sha256( + matrix: dict[str, Any], qualification_index: int +) -> str: + """Bind one qualification repeat to every shard's ordered case plan.""" + index = _qualification_index(qualification_index) + document = validate_matrix_document(matrix) + requested = { + item["case"]["case_id"]: item["case"] + for item in document["requested_cases"] + } + plan = [] + for shard in sorted(document["include"], key=lambda item: item["id"]): + ordered = qualification_execution_order( + shard["id"], + [requested[case_id] for case_id in shard["case_ids"]], + index, + ) + plan.append([ + shard["id"], shard["execution_weight"], execution_plan_sha256(ordered), + ]) + payload = json.dumps( + {"qualification_index": index, "shards": plan}, + ensure_ascii=True, + sort_keys=True, + separators=(",", ":"), + ).encode() + return hashlib.sha256(payload).hexdigest() + + +def validate_shard_control( + shard: dict[str, Any], + *, + sku: str, + backend: str, + nodes: int, + require_runnable: bool = True, + qualification_index: int | None = None, +) -> None: + """Validate one shard against the workflow cell that requested it.""" + if not isinstance(shard, dict): + raise MatrixError("shard must be a JSON 
object") + if sku not in cap.PLATFORMS or backend not in cap.SWEEP_BACKENDS: + raise MatrixError("shard platform/backend is not registered") + top_fields = { + "schema_version", "id", "sku", "backend", "nodes", "n", "cases", + "qualification_index", "execution_plan_sha256", "execution_weight", + } + if ( + set(shard) != top_fields + or type(shard.get("schema_version")) is not int + or shard["schema_version"] != 1 + ): + raise MatrixError("shard fields or schema version differ from v1 contract") + if not isinstance(shard.get("id"), str) or not IDENTIFIER.fullmatch(shard["id"]): + raise MatrixError("shard has invalid id") + observed_qualification = _qualification_index( + shard.get("qualification_index"), "shard.qualification_index" + ) + if ( + qualification_index is not None + and observed_qualification != _qualification_index(qualification_index) + ): + raise MatrixError("shard qualification_index differs from the requested repeat") + if ( + not isinstance(shard.get("execution_plan_sha256"), str) + or re.fullmatch(r"[0-9a-f]{64}", shard["execution_plan_sha256"]) is None + ): + raise MatrixError("shard execution_plan_sha256 is invalid") + for field, expected in (("sku", sku), ("backend", backend)): + if shard.get(field) != expected: + raise MatrixError( + f"shard {field} mismatch: expected {expected!r}, got {shard.get(field)!r}" + ) + if _positive_int(shard.get("nodes"), "shard.nodes") != nodes: + raise MatrixError( + f"shard nodes mismatch: expected {nodes}, got {shard.get('nodes')!r}" + ) + cases = shard.get("cases") + if not isinstance(cases, list) or not cases: + raise MatrixError("shard must contain at least one case") + if _positive_int(shard.get("n"), "shard.n") != len(cases): + raise MatrixError("shard.n does not match the number of cases") + seen: set[str] = set() + base_required = { + "case_id", "suite", "workload", "required_publication", "backend", "routing", + "mode", "phase", "ep", "eplb", "hidden", "topk", "experts", + "samples_per_point", + 
"warmup_semantics", "ladder", "timing", "canonical", + } | set(TOPOLOGY_FIELDS) + for index, case in enumerate(cases): + if not isinstance(case, dict): + raise MatrixError(f"case {index} must be a JSON object") + suite_contract = V1_SUITE_CONTRACTS.get(case.get("suite")) + required = base_required | ( + {"precision_profile"} + if suite_contract is not None and "precision_profiles" in suite_contract + else set() + ) + fields = set(case) + if fields != required: + raise MatrixError( + f"case {index} fields differ from v1 contract: " + f"missing={sorted(required - fields)}, extra={sorted(fields - required)}" + ) + case_id = case["case_id"] + if not identity.is_typed_id(case_id, "case"): + raise MatrixError(f"case {index} has invalid case_id") + if case_id in seen: + raise MatrixError(f"duplicate case_id {case_id}") + seen.add(case_id) + string_fields = [ + "suite", "workload", "required_publication", "backend", "mode", "routing", + "phase", "warmup_semantics", "ladder", "timing", + ] + if "precision_profile" in required: + string_fields.append("precision_profile") + for field in string_fields: + if not isinstance(case[field], str) or not case[field]: + raise MatrixError(f"case {index}.{field} must be a non-empty string") + identifier_fields = [ + "suite", "workload", "required_publication", "backend", "routing", "phase", + ] + if "precision_profile" in required: + identifier_fields.append("precision_profile") + for field in identifier_fields: + if not IDENTIFIER.fullmatch(case[field]): + raise MatrixError(f"case {index}.{field} is not a safe identifier") + if case["required_publication"] not in {"official", "comparable-experimental"}: + raise MatrixError(f"case {index} has invalid publication requirement") + case_identity = {key: value for key, value in case.items() if key != "case_id"} + if case_id != _case_id(sku, case_identity): + raise MatrixError(f"case {index} case_id does not match its contents") + if case["backend"] != backend: + raise MatrixError(f"case 
{index} backend does not match shard") + if case["mode"] not in identity.V1_CASE_PROFILES: + raise MatrixError(f"case {index} mode is invalid") + if _positive_int(case["nodes"], f"case {index}.nodes") != nodes: + raise MatrixError(f"case {index} nodes does not match shard") + ep = _positive_int(case["ep"], f"case {index}.ep") + gpus_per_node = _positive_int( + case["gpus_per_node"], f"case {index}.gpus_per_node" + ) + topology = cap.topology_for(sku, ep) + if topology is None or any(case[field] != topology[field] for field in TOPOLOGY_FIELDS): + raise MatrixError(f"case {index} differs from the platform registry") + if ep != nodes * gpus_per_node: + raise MatrixError(f"case {index} ep does not equal nodes * gpus_per_node") + if case["samples_per_point"] != ep_harness.TIMED_SAMPLES_PER_POINT: + raise MatrixError(f"case {index} violates fixed-512-v1") + if case["timing"] != EP_TIMING_PROFILE: + raise MatrixError(f"case {index} has invalid timing profile") + if case["warmup_semantics"] != ep_harness.WARMUP_SEMANTICS: + raise MatrixError(f"case {index} has invalid warmup semantics") + if case["phase"] not in {"decode", "prefill"}: + raise MatrixError(f"case {index} has invalid phase") + if case["routing"] not in {"uniform", "zipf"}: + raise MatrixError(f"case {index} has invalid routing") + if not isinstance(case["eplb"], bool) or (case["eplb"] and case["routing"] != "zipf"): + raise MatrixError(f"case {index} has invalid EPLB setting") + if not isinstance(case["canonical"], bool) or not case["canonical"]: + raise MatrixError(f"case {index} must use a canonical workload") + for field in ("ep", "nodes", "gpus_per_node", "hidden", "topk", "experts", + "samples_per_point", "scale_up_domain"): + if isinstance(case[field], bool) or not isinstance(case[field], int): + raise MatrixError(f"case {index}.{field} must be an integer") + _positive_int(case[field], f"case {index}.{field}") + scale_up_domain = _positive_int( + case["scale_up_domain"], f"case {index}.scale_up_domain" 
+ ) + expected_scope = "scale-up" if ep <= scale_up_domain else "scale-out" + if case["scope"] != expected_scope or ( + expected_scope == "scale-out" and ep % scale_up_domain + ): + raise MatrixError(f"case {index} has invalid scale-up/scale-out geometry") + try: + ladder = [int(value) for value in case["ladder"].split()] + except (AttributeError, ValueError) as exc: + raise MatrixError(f"case {index} has invalid token ladder") from exc + if (not ladder or any(value <= 0 for value in ladder) + or ladder != sorted(set(ladder)) + or case["ladder"] != " ".join(map(str, ladder))): + raise MatrixError(f"case {index} has invalid token ladder") + if require_runnable: + disposition, reason, _ = _expected_disposition(sku, case) + if disposition != "runnable": + raise MatrixError(f"case {index} violates capability registry: {reason}") + else: + _v1_requested_ladder(case) + if _positive_int( + shard.get("execution_weight"), "shard.execution_weight" + ) != execution_weight(cases): + raise MatrixError("shard execution_weight differs from its cases") + expected_order = qualification_execution_order( + shard["id"], cases, observed_qualification + ) + if [case["case_id"] for case in cases] != [ + case["case_id"] for case in expected_order + ]: + raise MatrixError("shard cases differ from the qualification execution order") + if shard["execution_plan_sha256"] != execution_plan_sha256(cases): + raise MatrixError("shard execution_plan_sha256 differs from its ordered cases") + + +def validate_matrix_document(document: Any) -> dict[str, Any]: + """Validate the complete requested grid and its runnable shard partition.""" + if not isinstance(document, dict) or set(document) != { + "format", "schema_version", "requested_cases", "include" + }: + raise MatrixError("matrix fields differ from the v1 contract") + if ( + document["format"] != "collectivex.matrix.v1" + or type(document["schema_version"]) is not int + or document["schema_version"] != 1 + ): + raise MatrixError("matrix 
format/schema differs from v1") + requested = document["requested_cases"] + include = document["include"] + if not isinstance(requested, list) or not requested: + raise MatrixError("matrix.requested_cases must be non-empty") + if not isinstance(include, list): + raise MatrixError("matrix.include must be an array") + + cases_by_id: dict[str, dict[str, Any]] = {} + runnable_ids: set[str] = set() + semantic_points: set[str] = set() + for index, value in enumerate(requested): + path = f"matrix.requested_cases[{index}]" + if not isinstance(value, dict) or set(value) != { + "sku", "case", "disposition", "reason", "detail" + }: + raise MatrixError(f"{path} fields differ from the v1 contract") + sku = value["sku"] + case = value["case"] + disposition = value["disposition"] + if sku not in cap.PLATFORMS: + raise MatrixError(f"{path}.sku is unknown") + if disposition not in {"runnable", "unsupported"}: + raise MatrixError(f"{path}.disposition is invalid") + if disposition == "runnable": + if value["reason"] is not None or value["detail"] is not None: + raise MatrixError(f"{path} runnable cases cannot have a reason") + else: + if ( + not isinstance(value["reason"], str) + or not IDENTIFIER.fullmatch(value["reason"]) + or not isinstance(value["detail"], str) + or not value["detail"] + ): + raise MatrixError(f"{path} unsupported cases need a public reason and detail") + if not isinstance(case, dict): + raise MatrixError(f"{path}.case must be an object") + backend = case.get("backend") + nodes = case.get("nodes") + if not isinstance(backend, str) or type(nodes) is not int: + raise MatrixError(f"{path}.case backend/nodes are invalid") + requested_case_plan = [case] + validate_shard_control( + { + "schema_version": 1, + "id": "requested-case", + "sku": sku, + "backend": backend, + "nodes": nodes, + "n": 1, + "execution_weight": execution_weight(requested_case_plan), + "qualification_index": 1, + "execution_plan_sha256": execution_plan_sha256( + requested_case_plan + ), + "cases": 
requested_case_plan, + }, + sku=sku, + backend=backend, + nodes=nodes, + require_runnable=disposition == "runnable", + ) + case_id = case["case_id"] + if case_id in cases_by_id: + raise MatrixError(f"duplicate requested case_id {case_id}") + for signature in _semantic_points(sku, case): + if signature in semantic_points: + raise MatrixError(f"{path} duplicates a semantic token point") + semantic_points.add(signature) + cases_by_id[case_id] = value + expected = _expected_disposition(sku, case) + if (disposition, value["reason"], value["detail"]) != expected: + raise MatrixError(f"{path} disposition differs from the frozen v1 catalog") + if disposition == "runnable": + runnable_ids.add(case_id) + + shard_ids: set[str] = set() + assigned: list[str] = [] + for index, shard in enumerate(include): + path = f"matrix.include[{index}]" + expected = { + "id", "sku", "backend", "launcher", "n", "execution_weight", "case_ids", + } | set(TOPOLOGY_FIELDS) + if not isinstance(shard, dict) or set(shard) != expected: + raise MatrixError(f"{path} fields differ from the v1 contract") + shard_id = shard["id"] + if not isinstance(shard_id, str) or not IDENTIFIER.fullmatch(shard_id): + raise MatrixError(f"{path}.id is invalid") + if shard_id in shard_ids: + raise MatrixError(f"duplicate shard id {shard_id}") + shard_ids.add(shard_id) + sku = shard["sku"] + if sku not in cap.PLATFORMS: + raise MatrixError(f"{path}.sku is unknown") + platform = cap.PLATFORMS[sku] + if shard["launcher"] != platform["launcher"]: + raise MatrixError(f"{path}.launcher differs from the platform registry") + case_ids = shard["case_ids"] + if not isinstance(case_ids, list) or not case_ids or len(case_ids) != len(set(case_ids)): + raise MatrixError(f"{path}.case_ids must be a non-empty unique array") + if _positive_int(shard["n"], f"{path}.n") != len(case_ids): + raise MatrixError(f"{path}.n differs from case_ids") + nodes = _positive_int(shard["nodes"], f"{path}.nodes") + for case_id in case_ids: + wrapper = 
cases_by_id.get(case_id) + if wrapper is None or wrapper["disposition"] != "runnable": + raise MatrixError(f"{path} references a missing or unsupported case") + case = wrapper["case"] + if ( + wrapper["sku"] != sku + or case["backend"] != shard["backend"] + or case["nodes"] != nodes + or any(shard[field] != case[field] for field in TOPOLOGY_FIELDS) + ): + raise MatrixError(f"{path} case does not match shard coordinates") + assigned.append(case_id) + if shard["execution_weight"] != execution_weight( + [cases_by_id[case_id]["case"] for case_id in case_ids] + ): + raise MatrixError(f"{path}.execution_weight differs from its cases") + if len(assigned) != len(set(assigned)): + raise MatrixError("a runnable case is assigned to more than one shard") + if set(assigned) != runnable_ids: + raise MatrixError("runnable requested cases and shard assignments differ") + return document + + +def extract_shard( + matrix_path: str | os.PathLike[str], + shard_id: str, + output_path: str | os.PathLike[str], + *, + sku: str, + backend: str, + nodes: int, + qualification_index: int | None = None, +) -> dict[str, Any]: + """Extract one strictly matched shard control file, writing it atomically.""" + qualification = _requested_qualification_index(qualification_index) + document = validate_matrix_document(_strict_json_load(Path(matrix_path))) + include = document["include"] + matches = [item for item in include if isinstance(item, dict) and item.get("id") == shard_id] + if len(matches) != 1: + raise MatrixError(f"expected exactly one shard {shard_id!r}, found {len(matches)}") + source = matches[0] + requested = { + item["case"]["case_id"]: item + for item in document["requested_cases"] + } + cases = qualification_execution_order( + source["id"], + [requested[case_id]["case"] for case_id in source["case_ids"]], + qualification, + ) + control = { + "schema_version": 1, + "id": source.get("id"), + "sku": source.get("sku"), + "backend": source.get("backend"), + "nodes": source.get("nodes"), + 
"n": source.get("n"), + "execution_weight": source.get("execution_weight"), + "qualification_index": qualification, + "execution_plan_sha256": execution_plan_sha256(cases), + "cases": cases, + } + validate_shard_control( + control, + sku=sku, + backend=backend, + nodes=nodes, + qualification_index=qualification, + ) + output = Path(output_path) + output.parent.mkdir(parents=True, exist_ok=True) + temporary = output.with_name(f".{output.name}.tmp-{os.getpid()}") + try: + with temporary.open("w") as fh: + json.dump(control, fh, sort_keys=True, separators=(",", ":")) + fh.write("\n") + os.replace(temporary, output) + finally: + temporary.unlink(missing_ok=True) + return control + + +def emit_unsupported( + matrix_path: str | os.PathLike[str], output_dir: str | os.PathLike[str] +) -> list[Path]: + """Materialize one strict terminal outcome for each unsupported requested case.""" + source = Path(matrix_path) + document = validate_matrix_document(_strict_json_load(source)) + control_sha256 = hashlib.sha256(source.read_bytes()).hexdigest() + generated_at = dt.datetime.now(dt.timezone.utc).isoformat() + try: + qualification_index = int(os.environ.get("CX_QUALIFICATION_INDEX", "1")) + except ValueError as exc: + raise MatrixError("CX_QUALIFICATION_INDEX must be an integer in 1..3") from exc + if qualification_index not in range(1, 4): + raise MatrixError("CX_QUALIFICATION_INDEX must be in 1..3") + git_run = { + "run_id": os.environ.get("GITHUB_RUN_ID"), + "run_attempt": os.environ.get("GITHUB_RUN_ATTEMPT"), + "qualification_index": qualification_index, + "ref": os.environ.get("GITHUB_REF_NAME") or os.environ.get("GITHUB_REF"), + "source_sha": os.environ.get("COLLECTIVEX_SOURCE_SHA") or os.environ.get("GITHUB_SHA"), + "repo": os.environ.get("GITHUB_REPOSITORY"), + "job": os.environ.get("GITHUB_JOB"), + "artifact": os.environ.get("COLLECTIVEX_ARTIFACT_NAME"), + } + allocation_factors = { + "artifact": git_run["artifact"], + "execution_id": 
os.environ.get("COLLECTIVEX_EXECUTION_ID"), + "job": git_run["job"], + "qualification_index": qualification_index, + "repo": git_run["repo"], + "run_attempt": git_run["run_attempt"], + "run_id": git_run["run_id"], + "runner": "capability-resolver", + "source_sha": git_run["source_sha"], + } + destination = Path(output_dir) + destination.mkdir(parents=True, exist_ok=True) + written: list[Path] = [] + for wrapper in document["requested_cases"]: + if wrapper["disposition"] != "unsupported": + continue + scheduled = wrapper["case"] + case = {key: value for key, value in scheduled.items() if key != "case_id"} + case_factors = { + "case": case, + "profile": identity.profile_for_case(case), + "sku": wrapper["sku"], + } + case_id = identity.digest("case", case_factors) + if case_id != scheduled["case_id"]: + raise MatrixError(f"unsupported case identity differs for {scheduled['case_id']}") + attempt_ordinal = 1 + record = contracts.make_terminal_document( + allocation_factors=allocation_factors, + attempt_ordinal=attempt_ordinal, + case=case, + case_factors=case_factors, + control_sha256=control_sha256, + failure_mode="capability", + generated_at=generated_at, + git_run=git_run, + reason=wrapper["reason"], + return_code=5, + source="matrix-capability-resolver", + status="unsupported", + expected_case_id=case_id, + ) + path = destination / f"unsupported_{case_id}.json" + temporary = path.with_name(f".{path.name}.tmp-{os.getpid()}") + try: + with temporary.open("x") as handle: + json.dump(record, handle, allow_nan=False, sort_keys=True, separators=(",", ":")) + handle.write("\n") + handle.flush() + os.fsync(handle.fileno()) + os.replace(temporary, path) + finally: + temporary.unlink(missing_ok=True) + written.append(path) + return written + + +def frontend_catalog(matrix: dict[str, Any]) -> dict[str, Any]: + """Project the validated requested graph into a compact frontend test fixture.""" + document = validate_matrix_document(matrix) + matrix_bytes = 
contracts.canonical_json_bytes(document) + b"\n" + cases = [] + for wrapper in document["requested_cases"]: + case = wrapper["case"] + precision_profile = case.get( + "precision_profile", identity.V1_CONTROL_PRECISION_PROFILE + ) + cases.append({ + "backend": case["backend"], + "case_id": case["case_id"], + "disposition": wrapper["disposition"], + "eplb": case["eplb"], + "mode": case["mode"], + "phase": case["phase"], + "precision_profile": precision_profile, + "publication_tier": case["required_publication"], + "reason": wrapper["reason"], + "routing": case["routing"], + "sku": wrapper["sku"], + "suite": case["suite"], + "topology": { + "ep_size": case["ep"], + **{field: case[field] for field in TOPOLOGY_FIELDS}, + }, + "workload": case["workload"], + "points": [ + { + "global_tokens": int(token) * case["ep"], + "tokens_per_rank": int(token), + } + for token in case["ladder"].split() + ], + }) + return { + "case_count": len(cases), + "format": "collectivex.frontend-catalog.v1", + "matrix_sha256": hashlib.sha256(matrix_bytes).hexdigest(), + "point_count": sum(len(case["points"]) for case in cases), + "schema_version": 1, + "cases": cases, + } + + +def main() -> int: + parser = argparse.ArgumentParser(description="CollectiveX v1 matrix resolver") + parser.add_argument("--suites", default="all", help="'all' or comma-list of suites") + parser.add_argument("--backend", default="", help="select one EP backend") + parser.add_argument("--backends", default="", help="'all' or comma-list of EP backends") + parser.add_argument("--only-sku", default="") + parser.add_argument("--min-nodes", type=int, default=0) + parser.add_argument("--max-nodes", type=int, default=0) + parser.add_argument("--max-cases", type=int, default=128) + parser.add_argument("--extract-from", default="", metavar="MATRIX") + parser.add_argument("--validate-control", default="", metavar="SHARD") + parser.add_argument("--emit-unsupported-from", default="", metavar="MATRIX") + 
parser.add_argument("--out-dir", default="") + parser.add_argument("--frontend-catalog", action="store_true") + parser.add_argument("--shard-id", default="") + parser.add_argument("--expect-sku", default="") + parser.add_argument("--expect-backend", default="") + parser.add_argument("--expect-nodes", type=int, default=0) + parser.add_argument("--qualification-index", type=int, default=None) + parser.add_argument("--out", default="") + args = parser.parse_args() + + if args.emit_unsupported_from: + if not args.out_dir: + parser.error("unsupported outcome emission requires --out-dir") + try: + written = emit_unsupported(args.emit_unsupported_from, args.out_dir) + except MatrixError as exc: + parser.error(str(exc)) + print(f"emitted {len(written)} unsupported terminal outcomes", file=sys.stderr) + return 0 + + if args.validate_control: + if not all((args.expect_sku, args.expect_backend, args.expect_nodes)): + parser.error( + "control validation requires --expect-sku, --expect-backend, and --expect-nodes" + ) + try: + control = _strict_json_load(Path(args.validate_control)) + qualification = _requested_qualification_index( + args.qualification_index + ) + validate_shard_control( + control, + sku=args.expect_sku, + backend=args.expect_backend, + nodes=args.expect_nodes, + qualification_index=qualification, + ) + except MatrixError as exc: + parser.error(str(exc)) + print(f"validated {control.get('id')}: {control['n']} cases", file=sys.stderr) + return 0 + + if args.extract_from: + if not all((args.shard_id, args.expect_sku, args.expect_backend, args.expect_nodes, args.out)): + parser.error( + "shard extraction requires --shard-id, --expect-sku, --expect-backend, " + "--expect-nodes, and --out" + ) + try: + control = extract_shard( + args.extract_from, + args.shard_id, + args.out, + sku=args.expect_sku, + backend=args.expect_backend, + nodes=args.expect_nodes, + qualification_index=args.qualification_index, + ) + except MatrixError as exc: + parser.error(str(exc)) + 
print(f"extracted {control['id']}: {control['n']} cases", file=sys.stderr) + print(json.dumps(control, separators=(",", ":"))) + return 0 + + matrix = resolve_matrix( + suites=args.suites, + backend=args.backend, + backends=args.backends, + only_sku=args.only_sku, + min_nodes=args.min_nodes, + max_nodes=args.max_nodes, + max_cases=args.max_cases, + ) + try: + validate_matrix_document(matrix) + except MatrixError as exc: + parser.error(str(exc)) + output_document = frontend_catalog(matrix) if args.frontend_catalog else matrix + if args.out: + with open(args.out, "w") as fh: + json.dump(output_document, fh, sort_keys=True, separators=(",", ":")) + fh.write("\n") + runnable = sum( + item["disposition"] == "runnable" for item in matrix["requested_cases"] + ) + unsupported = len(matrix["requested_cases"]) - runnable + print( + f"resolved {len(matrix['include'])} shard-cells, " + f"{runnable} runnable and {unsupported} unsupported cases", + file=sys.stderr, + ) + print(json.dumps(output_document)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/tests/ep_deepep.py b/experimental/CollectiveX/tests/ep_deepep.py new file mode 100644 index 000000000..5ca5f47ed --- /dev/null +++ b/experimental/CollectiveX/tests/ep_deepep.py @@ -0,0 +1,424 @@ +#!/usr/bin/env python3 +"""CollectiveX DeepEP adapter for native V1 dispatch/combine precision profiles.""" +from __future__ import annotations + +import inspect +import os +import sys +import types + +import torch +import torch.distributed as dist +import contracts +import ep_precision + +try: + import deep_ep + from deep_ep import Buffer # type: ignore +except Exception as exc: # pragma: no cover - requires the benchmark image + print(f"ERROR: deep_ep import failed: {exc!r}", file=sys.stderr) + raise + + +def _deepep_version() -> str: + try: + import importlib.metadata as metadata + + return metadata.version("deep_ep") + except Exception: + return getattr(deep_ep, 
"__version__", "unknown") + + +def _mnnvl_buffer_configuration() -> tuple[dict[str, bool], str]: + """Resolve the explicit DeepEP MNNVL API contract.""" + requested_value = os.environ.get("CX_ALLOW_MNNVL") + if requested_value not in {None, "", "0", "1"}: + raise RuntimeError("CX_ALLOW_MNNVL must be unset, 0, or 1") + requested = requested_value == "1" + if not requested: + return contracts.resolve_deepep_mnnvl( + requested=False, signature_parameters=(), + deepep_commit=os.environ.get("DEEPEP_COMMIT"), + ) + try: + parameters = inspect.signature(Buffer.__init__).parameters + except (TypeError, ValueError) as exc: + raise RuntimeError("cannot inspect DeepEP Buffer MNNVL API") from exc + try: + return contracts.resolve_deepep_mnnvl( + requested=True, signature_parameters=parameters, + deepep_commit=os.environ.get("DEEPEP_COMMIT"), + ) + except contracts.ContractError as exc: + raise RuntimeError(str(exc)) from exc + + +def _normal_buffer_sizes(hidden: int, world_size: int) -> tuple[int, int]: + """Apply DeepEP's dispatch/combine buffer sizing contract for this EP world.""" + hidden_bytes = hidden * torch.tensor([], dtype=torch.bfloat16).element_size() + configs = (Buffer.get_dispatch_config(world_size), Buffer.get_combine_config(world_size)) + num_nvl_bytes = max( + int(config.get_nvl_buffer_size_hint(hidden_bytes, world_size)) for config in configs + ) + num_rdma_bytes = max( + int(config.get_rdma_buffer_size_hint(hidden_bytes, world_size)) for config in configs + ) + if num_nvl_bytes <= 0 or num_rdma_bytes < 0: + raise RuntimeError("DeepEP returned invalid normal-mode buffer size hints") + return num_nvl_bytes, num_rdma_bytes + + +class DeepEPBackend: + name = "deepep" + stage_device_work = False + combine_needs_redispatch = False + # DeepEP reduces activations and top-k weights independently. The activation + # tensor must therefore carry the complete local weighted expert sum. 
+ combine_weight_semantics = "unweighted-rank-sum" + oracle_layout = "token-rank" + payload_unit = "token-rank" + + def __init__(self, args, rank, world_size, local_rank, device): + self.args = args + self.rank = rank + self.world_size = world_size + self.device = device + self.mode = getattr(args, "mode", "normal") + if self.mode not in {"normal", "low-latency"}: + raise ValueError(f"unsupported DeepEP mode {self.mode!r}") + supported_profiles = { + "normal": { + "d-bf16.c-bf16", + "d-fp8-e4m3fn-b128-f32-prequantized.c-bf16", + }, + "low-latency": { + "d-bf16.c-bf16", + "d-fp8-e4m3fn-b128-f32-fused.c-bf16", + "d-bf16.c-logfmt10-dynamic64", + "d-fp8-e4m3fn-b128-f32-fused.c-logfmt10-dynamic64", + }, + } + self.precision_profile_id, self.communication_precision = ( + ep_precision.resolve_precision( + args, + backend=self.name, + mode=self.mode, + supported_profiles=supported_profiles[self.mode], + ) + ) + self._fp8_dispatch = ep_precision.is_low_precision_dispatch( + self.communication_precision + ) + self._use_logfmt = ep_precision.uses_logfmt_combine( + self.communication_precision + ) + self.stage_device_work = self._fp8_dispatch + + self.group = dist.group.WORLD + device_sms = torch.cuda.get_device_properties(device).multi_processor_count + mnnvl_kwargs, mnnvl_comm = _mnnvl_buffer_configuration() + if self.mode == "low-latency": + ep_precision.require_keyword( + Buffer.low_latency_dispatch, + "use_fp8", + api="deep_ep.Buffer.low_latency_dispatch", + ) + ep_precision.require_keyword( + Buffer.low_latency_combine, + "use_logfmt", + api="deep_ep.Buffer.low_latency_combine", + ) + if args.phase != "decode": + raise ValueError("DeepEP low-latency mode only supports the decode ladder") + if args.experts % world_size: + raise ValueError("DeepEP low-latency experts must divide the EP group") + self.combine_needs_redispatch = True + self.combine_weight_semantics = "gate-weighted-sum" + self.oracle_layout = "expert-packed" + self.payload_unit = "token-expert" + 
self.max_tokens_per_rank = 128 + num_qps_per_rank = args.experts // world_size + num_rdma_bytes = Buffer.get_low_latency_rdma_size_hint( + self.max_tokens_per_rank, args.hidden, world_size, args.experts + ) + self.buffer = Buffer( + self.group, + num_nvl_bytes=0, + num_rdma_bytes=num_rdma_bytes, + low_latency_mode=True, + num_qps_per_rank=num_qps_per_rank, + allow_nvlink_for_low_latency_mode=True, + explicitly_destroy=True, + **mnnvl_kwargs, + ) + self.buffer.clean_low_latency_buffer( + self.max_tokens_per_rank, args.hidden, args.experts + ) + resource_provenance = { + "requested_num_sms": None, + "num_sms": None, + "sm_fraction": None, + "tuned_source": "deepep-low-latency-fixed-kernel", + "num_max_tokens_per_rank": self.max_tokens_per_rank, + "num_nvl_bytes": 0, + "num_rdma_bytes": num_rdma_bytes, + "num_qps_per_rank": num_qps_per_rank, + } + else: + ep_precision.require_keyword( + Buffer.dispatch, + "async_finish", + api="deep_ep.Buffer.dispatch", + ) + ep_precision.require_keyword( + Buffer.combine, + "async_finish", + api="deep_ep.Buffer.combine", + ) + num_nvl_bytes, num_rdma_bytes = _normal_buffer_sizes(args.hidden, world_size) + if world_size > args.scale_up_domain and num_rdma_bytes == 0: + raise RuntimeError("DeepEP scale-out configuration returned no RDMA buffer") + self.buffer = Buffer( + self.group, num_nvl_bytes, num_rdma_bytes, **mnnvl_kwargs + ) + num_sms = int(getattr(Buffer, "num_sms", args.num_sms)) + try: + Buffer.set_num_sms(num_sms) + except Exception as exc: # pragma: no cover - version dependent + raise RuntimeError( + f"DeepEP did not apply requested num_sms={num_sms}: {exc!r}" + ) from exc + applied_num_sms = int(getattr(Buffer, "num_sms", num_sms)) + if applied_num_sms != num_sms: + raise RuntimeError( + f"DeepEP num_sms mismatch: requested={num_sms} applied={applied_num_sms}" + ) + resource_provenance = { + "requested_num_sms": num_sms, + "num_sms": applied_num_sms, + "sm_fraction": applied_num_sms / device_sms, + "tuned_source": 
"deepep-default-num_sms", + "num_nvl_bytes": num_nvl_bytes, + "num_rdma_bytes": num_rdma_bytes, + } + version = _deepep_version() + self.backend_provenance = { + "deepep_version": version, + "deepep_commit": os.environ.get("DEEPEP_COMMIT") or f"pkg-{version}", + "backend_lineage": "deepep-v1", + "mode": self.mode, + "dispatch_dtype": ep_precision.communication_format( + self.communication_precision, "dispatch" + ), + "combine_dtype": ep_precision.communication_format( + self.communication_precision, "combine" + ), + "resource_mode": "fixed-profile", + "device_sms": device_sms, + "allow_mnnvl": bool(mnnvl_kwargs), + "mnnvl_comm": mnnvl_comm, + **resource_provenance, + } + + def buffer_cap(self, args): + return self.max_tokens_per_rank if self.mode == "low-latency" else None + + def make_problem(self, T, idx, weights, x): + encoding = ep_precision.encode_dispatch( + torch, x, self.communication_precision + ) + return types.SimpleNamespace( + T=T, + x=x, + dispatch_x=encoding.native_input, + oracle_x=encoding.semantic, + dispatch_precision_evidence=encoding.evidence, + topk_idx=idx.to(torch.int64), + topk_weights=weights.to(torch.float32), + ) + + def dispatch(self, p): + if self.mode == "low-latency": + recv_x, recv_counts, handle, _, _ = self.buffer.low_latency_dispatch( + p.x, + p.topk_idx, + self.max_tokens_per_rank, + self.args.experts, + use_fp8=self._fp8_dispatch, # BF16 control realizes use_fp8=False. 
+ async_finish=False, + return_recv_hook=False, + ) + return types.SimpleNamespace( + recv_x=recv_x, + recv_counts=recv_counts, + handle=handle, + ) + ( + num_tokens_per_rank, + num_tokens_per_rdma_rank, + num_tokens_per_expert, + is_token_in_rank, + _, + ) = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts) + recv_x, recv_topk_idx, recv_topk_weights, recv_counts, handle, _ = self.buffer.dispatch( + p.dispatch_x, + topk_idx=p.topk_idx, + topk_weights=p.topk_weights, + num_tokens_per_rank=num_tokens_per_rank, + num_tokens_per_rdma_rank=num_tokens_per_rdma_rank, + is_token_in_rank=is_token_in_rank, + num_tokens_per_expert=num_tokens_per_expert, + async_finish=False, + ) + return types.SimpleNamespace( + recv_x=recv_x, + recv_topk_idx=recv_topk_idx, + recv_topk_weights=recv_topk_weights, + recv_counts=recv_counts, + handle=handle, + ) + + def stage(self, p, h): + h.combine_input = self._semantic_recv(h, p) + + def combine(self, p, h): + if self.mode == "low-latency": + combined_x, _, _ = self.buffer.low_latency_combine( + h.combine_input, + p.topk_idx, + p.topk_weights, + h.handle, + use_logfmt=self._use_logfmt, + async_finish=False, + return_recv_hook=False, + ) + return combined_x + combined_x, _, _ = self.buffer.combine( + h.combine_input, h.handle, async_finish=False + ) + return combined_x + + def inspect_dispatch(self, p, h): + valid = h.recv_topk_idx >= 0 + expert_ids = torch.where( + valid, + h.recv_topk_idx + self.rank * (self.args.experts // self.world_size), + h.recv_topk_idx, + ) + return types.SimpleNamespace( + payload=self._semantic_recv(h, p), + encoded_payload=self._encoded_recv(h), + scales=self._recv_scales(h), + expert_ids=expert_ids, + weights=h.recv_topk_weights.masked_fill(~valid, 0), + local_expert_counts=torch.tensor(h.recv_counts, device=self.device, dtype=torch.int64), + ordering_contract="source-rank-major-stable-v1", + ) + + def inspect_expert_dispatch(self, p, h): + if self.mode != "low-latency": + raise 
RuntimeError("expert-packed inspection requires low-latency mode") + p.recv_counts = tuple(int(value) for value in h.recv_counts.tolist()) + return types.SimpleNamespace( + payload=self._semantic_recv(h, p), + encoded_payload=self._encoded_recv(h), + scales=self._recv_scales(h), + local_expert_counts=h.recv_counts, + source_info=h.handle[0], + layout_range=h.handle[1], + ) + + def combine_transformed(self, p, h, transformed): + if self.mode == "low-latency": + packed = torch.zeros( + self._encoded_recv(h).shape, + dtype=torch.bfloat16, + device=self._encoded_recv(h).device, + ) + packed[h.oracle_local_expert_slots, h.oracle_packed_positions] = transformed.to( + packed.dtype + ) + combined, _, _ = self.buffer.low_latency_combine( + packed, + p.topk_idx, + p.topk_weights, + h.handle, + use_logfmt=self._use_logfmt, + async_finish=False, + return_recv_hook=False, + ) + return combined + semantic = self._semantic_recv(h, p) + combined, _, _ = self.buffer.combine( + transformed.to(semantic.dtype), h.handle, async_finish=False + ) + return combined + + def recv_tokens(self, h): + if self.mode == "low-latency": + return int(h.recv_counts.to(torch.int64).sum().item()) + return int(self._encoded_recv(h).shape[0]) + + def _encoded_recv(self, h): + return h.recv_x[0] if isinstance(h.recv_x, tuple) else h.recv_x + + def _recv_scales(self, h): + return h.recv_x[1] if isinstance(h.recv_x, tuple) else None + + def _semantic_recv(self, h, problem=None): + if not self._fp8_dispatch: + return h.recv_x + if not hasattr(h, "recv_semantic"): + if self.mode == "low-latency": + counts = getattr(problem, "recv_counts", None) + if counts is None: + counts = tuple(int(value) for value in h.recv_counts.tolist()) + if problem is not None: + problem.recv_counts = counts + workspace = getattr(self, "_ll_semantic_workspace", None) + if workspace is None: + encoded = self._encoded_recv(h) + workspace = torch.empty( + encoded.shape, dtype=torch.bfloat16, device=encoded.device + ) + 
def finalize(self, rc):
    """Tear down the communication state and hand back ``rc`` unchanged.

    Teardown is best-effort: a failure in the barrier, the low-latency buffer
    destruction, or the process-group destruction never changes the returned
    code. Unlike a bare ``pass``, the failure is reported on stderr so a broken
    teardown is visible in the job log instead of disappearing.

    Args:
        rc: exit code computed by the caller.

    Returns:
        ``rc``, untouched.
    """
    try:
        dist.barrier()
        if self.mode == "low-latency":
            self.buffer.destroy()
        dist.destroy_process_group()
    except Exception as exc:
        # Local import keeps this block independent of the file's import list.
        import sys

        print(
            f"[{type(self).__name__}] finalize teardown failed (ignored): {exc!r}",
            file=sys.stderr,
        )
    return rc
+The container build is done by runtime/run_in_container.sh `cx_build_deepep_hybrid` (CUDA-13 CCCL +include path, without the V2 NVSHMEM overlay). + +API (pinned on B300, branch e0a5b1d): + HybridEPBuffer(group, hidden_dim, max_num_of_tokens_per_rank, num_local_experts, use_fp8=False, ...) + .dispatch(hidden, topk_idx=, topk_weights=, num_of_experts=) -> (recv_hidden, recv_x2, None, handle) + .combine(hidden, handle=) -> [T, hidden] + +CORRECTNESS: identity expert (no expert compute), combine WITHOUT probs -> each source token is +reconstructed as x * (distinct ranks among its top_k experts) — verified: an 8-rank uniform top_k=8 +round trip gives relerr(combined, x) = 4.28, matching E[distinct ranks] ~ 5.26 exactly. So this uses +the same per-rank-sum combine contract (no gate re-weight). BF16 tolerance is 5e-2. + +STATUS: BF16 or native block-scaled FP8 dispatch, BF16 combine, normal mode. The v1 scope covers +one MNNVL domain or x86 scale-out between two eight-GPU NVLink domains. +""" +from __future__ import annotations + +import hashlib +import importlib +import json +import os +from pathlib import Path +import re +import shutil +import sys +import tempfile +import types + +import torch +import torch.distributed as dist +import contracts +import ep_precision + +try: + import deep_ep + HybridEPBuffer = deep_ep.HybridEPBuffer +except Exception as exc: # pragma: no cover - needs the hybrid-ep build + print("ERROR: deep_ep.HybridEPBuffer import failed — the hybrid-ep branch must be built at job " + "setup (cx_build_deepep_hybrid). 
" + f"{exc!r}", file=sys.stderr) + raise + + +def _deepep_hybrid_version() -> str: + return os.environ.get("DEEPEP_COMMIT", getattr(deep_ep, "__version__", "hybrid-ep")) + + +def _hybrid_build_evidence() -> list[dict[str, str]]: + records = [] + for module_name, role in ( + ("deep_ep_cpp", "deepep-extension"), + ("hybrid_ep_cpp", "deepep-hybrid-extension"), + ): + module = importlib.import_module(module_name) + path = getattr(module, "__file__", None) + if not path: + raise RuntimeError(f"{module_name} has no loaded extension path") + records.append(contracts.content_manifest_evidence( + role=role, + name=module_name, + files=[(os.path.basename(path), path)], + )) + return sorted(records, key=lambda item: (item["role"], item["name"])) + + +HYBRID_CONFIG_FIELDS = ( + "hidden_dim", "max_num_of_tokens_per_rank", "num_of_experts_per_rank", + "num_of_ranks_per_node", "num_of_nodes", "pad_multiple", + "num_of_tokens_per_chunk_preprocessing_api", + "num_of_threads_per_block_preprocessing_api", "num_of_blocks_preprocessing_api", + "num_of_blocks_permute", "num_of_blocks_unpermute", "token_data_type", + "num_of_stages_dispatch_api", "num_of_stages_permute_block_dispatch_api", + "num_of_in_flight_s2g_dispatch_api", + "num_of_in_flight_s2g_permute_block_dispatch_api", + "num_of_additional_in_flight_s2g_dispatch_api", + "num_of_tokens_per_chunk_dispatch_api", "num_of_blocks_dispatch_api", + "forward_dispatch_api", "device_side_sync_dispatch_api", + "num_of_stages_g2s_combine_api", "num_of_stages_s2g_combine_api", + "num_of_tokens_per_chunk_combine_api", "num_of_tokens_per_group_combine_api", + "num_of_blocks_combine_api", "num_of_additional_in_flight_s2g_combine_api", + "backward_combine_api", "device_side_sync_combine_api", +) + + +def _hybrid_realized_config(config) -> dict[str, str | int | bool]: + """Project the Python-visible, post-autotune HybridEP config to JSON scalars.""" + realized = {} + for field in HYBRID_CONFIG_FIELDS: + try: + value = getattr(config, field) + 
except AttributeError as exc: + raise RuntimeError(f"HybridEP realized config omits {field}") from exc + if field == "token_data_type": + token_type = getattr(value, "name", None) + if token_type not in {"UINT8", "UINT16"}: + token_type = {"uint8_t": "UINT8", "uint16_t": "UINT16"}.get(str(value)) + if token_type is None: + raise RuntimeError("HybridEP realized token_data_type is invalid") + realized[field] = token_type + continue + if type(value) is bool: + realized[field] = value + continue + try: + realized[field] = int(value) + except (TypeError, ValueError) as exc: + raise RuntimeError(f"HybridEP realized config {field} is not integral") from exc + return realized + + +def _sha256_with_size(path: Path) -> tuple[str, int]: + digest = hashlib.sha256() + size = 0 + with path.open("rb") as handle: + for chunk in iter(lambda: handle.read(1024 * 1024), b""): + digest.update(chunk) + size += len(chunk) + return digest.hexdigest(), size + + +def _hybrid_jit_evidence(root: Path) -> list[dict[str, str | int]]: + """Hash final JIT libraries without exposing rank-specific cache paths.""" + if not root.is_dir(): + raise RuntimeError("DeepEP Hybrid produced no JIT cache directory") + artifacts = [] + for path in sorted(root.iterdir(), key=lambda item: item.name): + if path.suffix != ".so": + continue + if path.is_symlink() or not path.is_file(): + raise RuntimeError("DeepEP Hybrid JIT artifact is not a regular file") + kernel_key = path.stem + if not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9_.+-]{0,511}", kernel_key): + raise RuntimeError("DeepEP Hybrid JIT kernel key is invalid") + digest, size = _sha256_with_size(path) + if size <= 0: + raise RuntimeError("DeepEP Hybrid JIT artifact is empty") + artifacts.append({ + "bytes": size, + "kernel_key": kernel_key, + "sha256": digest, + }) + if len(artifacts) != 3: + raise RuntimeError( + f"DeepEP Hybrid expected 3 final JIT libraries, found {len(artifacts)}" + ) + return artifacts + + +def _require_cross_rank_equal(value, label: str) 
-> None: + gathered = [None] * dist.get_world_size() + dist.all_gather_object(gathered, value) + canonical = {json.dumps(item, sort_keys=True, separators=(",", ":")) for item in gathered} + if len(canonical) != 1: + raise RuntimeError(f"DeepEP Hybrid {label} differs across ranks") + + +def _hybrid_topology(args, world_size: int) -> dict[str, int | str]: + """Translate physical placement into HybridEP communication-domain geometry.""" + gpus_per_node = int(args.gpus_per_node or world_size) + scale_up_domain = int(args.scale_up_domain or gpus_per_node) + key = ( + world_size, gpus_per_node, scale_up_domain, args.scope, + args.scale_up_transport, args.scale_out_transport or None, args.transport, + ) + fixed = { + (8, 8, 8, "scale-up", "nvlink", None, "nvlink"): (8, 1), + (16, 8, 8, "scale-out", "nvlink", "rdma", "nvlink-rdma"): (8, 2), + (8, 4, 72, "scale-up", "mnnvl", None, "mnnvl"): (8, 1), + (16, 4, 72, "scale-up", "mnnvl", None, "mnnvl"): (16, 1), + } + if key not in fixed: + raise RuntimeError("DeepEP Hybrid topology is outside the fixed v1 matrix") + domain_ranks, communication_domains = fixed[key] + + return { + "communication_domains": communication_domains, + "domain_ranks": domain_ranks, + "physical_nodes": world_size // gpus_per_node, + "transport": str(args.transport), + } + + +class DeepEPHybridBackend: + name = "deepep-hybrid" + stage_device_work = False + # HybridEPBuffer.combine consumes the recv payload + the dispatch handle (no re-dispatch needed + # before a timed combine); the harness times dispatch and combine separately (like ep_deepep). 
+ combine_needs_redispatch = False + combine_weight_semantics = "unweighted-rank-sum" + + def __init__(self, args, rank, world_size, local_rank, device): + self.args = args + self.rank = rank + self.world_size = world_size + self.device = device + self.mode = "normal" + self.precision_profile_id, self.communication_precision = ( + ep_precision.resolve_precision( + args, + backend=self.name, + mode=self.mode, + supported_profiles={ + "d-bf16.c-bf16", + "d-fp8-e4m3fn-b128-f32-prequantized.c-bf16", + }, + ) + ) + self._fp8_dispatch = ep_precision.is_low_precision_dispatch( + self.communication_precision + ) + self.stage_device_work = self._fp8_dispatch + ep_precision.require_keyword( + HybridEPBuffer.__init__, + "use_fp8", + api="deep_ep.HybridEPBuffer.__init__", + ) + ep_precision.require_keyword( + HybridEPBuffer.dispatch, + "scaling_factor", + api="deep_ep.HybridEPBuffer.dispatch", + ) + self.group = dist.group.WORLD + self.tolerance = 5e-2 + self.top_k = int(args.topk) + self.num_experts = int(args.experts) + self.hidden = int(args.hidden) + self.local_experts = max(1, self.num_experts // world_size) + topology = _hybrid_topology(args, world_size) + self.domain_ranks = int(topology["domain_ranks"]) + self.communication_domains = int(topology["communication_domains"]) + build_mode = os.environ.get("DEEPEP_HYBRID_BUILD_MODE", "") + if self.communication_domains > 1: + if ( + os.environ.get("HYBRID_EP_MULTINODE") != "1" + or build_mode != "multinode-doca" + or os.environ.get("USE_NIXL", "0") != "0" + ): + raise RuntimeError("DeepEP Hybrid scale-out build mode is not realized") + elif build_mode != "intradomain": + raise RuntimeError("DeepEP Hybrid scale-up requires the intradomain build") + if args.scale_up_transport == "mnnvl" and any( + os.environ.get(name) != "1" + for name in ("NCCL_CUMEM_ENABLE", "NCCL_MNNVL_ENABLE", "MC_FORCE_MNNVL") + ): + raise RuntimeError("DeepEP Hybrid MNNVL runtime enablement is incomplete") + # Token cap (per rank) for the symmetric 
buffer; the sweep is capped here (buffer_cap). + self.max_tokens = 4096 + dev_sms = torch.cuda.get_device_properties(device).multi_processor_count + ver = _deepep_hybrid_version() + loaded_libraries = _hybrid_build_evidence() + _require_cross_rank_equal(loaded_libraries, "loaded extension identities") + + # HybridEP's compiler uses a process-specific child of HYBRID_EP_CACHE_DIR. Give every + # rank a fresh private base so stale kernels cannot enter this attempt's evidence. + self._previous_jit_cache_dir = os.environ.get("HYBRID_EP_CACHE_DIR") + self._previous_domain_ranks = os.environ.get( + "NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN" + ) + self._jit_cache_dir = tempfile.mkdtemp(prefix=f"collectivex-hybrid-r{rank}-") + os.environ["HYBRID_EP_CACHE_DIR"] = self._jit_cache_dir + os.environ["NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN"] = str(self.domain_ranks) + self._jit_root = ( + Path(self._jit_cache_dir) / ".deepep" / "hybrid_ep" / "jit" + / f"proc-{os.getpid()}" + ) + self._realized_config = None + self._deferred_semantic_snapshot = None + self._deferred_jit_diagnostics = None + + try: + self.buffer = HybridEPBuffer( + self.group, hidden_dim=self.hidden, + max_num_of_tokens_per_rank=self.max_tokens, + num_local_experts=self.local_experts, + use_fp8=self._fp8_dispatch, + ) + realized_geometry = ( + int(self.buffer.num_of_hybrid_ep_ranks_per_nvlink_domain), + int(self.buffer.num_of_nodes), + int(self.buffer.local_rank), + int(self.buffer.node_rank), + ) + expected_geometry = ( + self.domain_ranks, + self.communication_domains, + rank % self.domain_ranks, + rank // self.domain_ranks, + ) + buffer_config = self.buffer.configurer.buffer_config + if realized_geometry != expected_geometry or ( + int(buffer_config.num_of_ranks_per_node) != self.domain_ranks + or int(buffer_config.num_of_nodes) != self.communication_domains + ): + raise RuntimeError( + "HybridEPBuffer communication-domain geometry differs from the case" + ) + except Exception as exc: + 
shutil.rmtree(self._jit_cache_dir, ignore_errors=True) + if self._previous_jit_cache_dir is None: + os.environ.pop("HYBRID_EP_CACHE_DIR", None) + else: + os.environ["HYBRID_EP_CACHE_DIR"] = self._previous_jit_cache_dir + if self._previous_domain_ranks is None: + os.environ.pop("NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN", None) + else: + os.environ[ + "NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN" + ] = self._previous_domain_ranks + raise RuntimeError( + f"HybridEPBuffer construction failed (hidden={self.hidden} max_tokens={self.max_tokens} " + f"local_experts={self.local_experts} world={world_size}): {exc!r}") from exc + update_template_config = self.buffer.update_template_config + + def tracked_update_template_config(*call_args, **call_kwargs): + config = update_template_config(*call_args, **call_kwargs) + realized = _hybrid_realized_config(config) + if ( + realized["num_of_ranks_per_node"] != self.domain_ranks + or realized["num_of_nodes"] != self.communication_domains + ): + raise RuntimeError("DeepEP Hybrid realized topology changed within one case") + expected_token_type = "UINT8" if self._fp8_dispatch else "UINT16" + if realized["token_data_type"] != expected_token_type: + raise RuntimeError( + "DeepEP Hybrid realized token dtype differs from the precision profile" + ) + if self._realized_config is not None and realized != self._realized_config: + raise RuntimeError("DeepEP Hybrid realized autotune config changed within one case") + self._realized_config = realized + return config + + self.buffer.update_template_config = tracked_update_template_config + self.domain_rank = int(self.buffer.local_rank) + if rank == 0: + print( + "[deepep-hybrid] HybridEPBuffer constructed " + f"(domains={self.communication_domains} ranks_per_domain={self.domain_ranks} " + f"world={world_size} local_experts={self.local_experts} hidden={self.hidden})", + file=sys.stderr, + ) + + self.backend_provenance = { + "deepep_commit": ver, "branch": "hybrid-ep", + "deepep_tree": 
def stage(self, p, h):
    """Prepare ``h.combine_input`` from the received payload (identity expert).

    The received hidden states are used directly as the "expert output";
    ``combine`` later reduces them per source token.

    The row count is taken from ``p.recv_tokens`` when the caller stored one.
    The problem namespace built by this adapter's ``make_problem`` does not
    carry that attribute, so a missing value no longer raises AttributeError:
    on the FP8 path, where the row count is actually consumed, it is read from
    the dispatch handle via ``self.recv_tokens(h)``; on the BF16 path
    ``_semantic_recv`` ignores the argument and ``None`` is passed.

    Args:
        p: problem namespace; ``recv_tokens`` is optional.
        h: dispatch namespace; ``combine_input`` is written.

    Returns:
        None.
    """
    rows = getattr(p, "recv_tokens", None)
    if rows is None and self._fp8_dispatch:
        # Fallback only: this reads the handle's receive count on the host.
        rows = self.recv_tokens(h)
    h.combine_input = self._semantic_recv(h, rows)
    return None
def inspect_dispatch(self, p, h):
    """Build an inspection view of one dispatch result.

    The view exposes the received payload (semantic and encoded forms), the
    receive scales if any, and per-row expert ids / weights laid out in
    ``top_k`` columns, plus the number of received rows per local expert.

    Args:
        p: problem namespace (not read here).
        h: dispatch namespace; ``handle``, ``recv_probs``, ``recv_payload`` and
            ``recv_scales`` are read.

    Returns:
        A ``types.SimpleNamespace`` with ``payload``, ``encoded_payload``,
        ``scales``, ``expert_ids``, ``weights``, ``local_expert_counts`` and
        ``ordering_contract``.

    Raises:
        RuntimeError: ``h.recv_probs`` has too few columns to contain this
            rank's block of local experts.
    """
    # Row count comes from the dispatch handle (see recv_tokens).
    count = self.recv_tokens(h)
    # NOTE(review): assumes h.handle[4] is a [rows, local_experts] routing mask
    # whose first `count` rows are valid — confirm against the HybridEP handle.
    routing_map = h.handle[4][:count]
    # Coordinates of every set entry: received row and local expert column.
    rows, local_expert_ids = routing_map.nonzero(as_tuple=True)
    # Running count of set entries along each row, minus one: the k-th routed
    # local expert of a row is written to output column k.
    positions = routing_map.to(torch.int64).cumsum(dim=1)[rows, local_expert_ids] - 1
    # Probability columns are offset by the rank's index inside its domain,
    # whereas global expert ids below are offset by the global rank.
    probability_columns = self.domain_rank * self.local_experts + local_expert_ids
    if h.recv_probs.shape[1] < (self.domain_rank + 1) * self.local_experts:
        raise RuntimeError("HybridEPBuffer probability tensor omits this NVLink-domain rank")
    # Unused slots keep expert id -1 and weight 0.
    expert_ids = torch.full(
        (count, self.top_k), -1, dtype=torch.int64, device=self.device
    )
    weights = torch.zeros(
        (count, self.top_k), dtype=torch.float32, device=self.device
    )
    expert_ids[rows, positions] = local_expert_ids + self.rank * self.local_experts
    weights[rows, positions] = h.recv_probs[:count][rows, probability_columns]
    return types.SimpleNamespace(
        payload=self._semantic_recv(h, count)[:count],
        encoded_payload=h.recv_payload[:count],
        scales=(h.recv_scales[:count] if h.recv_scales is not None else None),
        expert_ids=expert_ids,
        weights=weights,
        # Column sums of the mask: received rows per local expert.
        local_expert_counts=routing_map.sum(dim=0, dtype=torch.int64),
        ordering_contract="global-source-filter-stable-v1",
    )
def finalize(self, rc):
    """Tear down distributed state, drop the private JIT cache, restore the environment.

    The barrier and process-group destruction are best-effort: a failure there
    is reported on stderr (instead of being silently discarded) and never
    prevents the cache removal or the environment restore that follow.

    ``HYBRID_EP_CACHE_DIR`` and ``NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN`` are
    put back to the values saved on the instance; a saved ``None`` means the
    variable was unset and it is removed again.

    Args:
        rc: exit code computed by the caller.

    Returns:
        ``rc``, untouched.
    """
    try:
        dist.barrier()
        dist.destroy_process_group()
    except Exception as exc:
        print(
            f"[deepep-hybrid] finalize teardown failed (ignored): {exc!r}",
            file=sys.stderr,
        )
    shutil.rmtree(self._jit_cache_dir, ignore_errors=True)
    saved_environment = (
        ("HYBRID_EP_CACHE_DIR", self._previous_jit_cache_dir),
        ("NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN", self._previous_domain_ranks),
    )
    for variable, previous in saved_environment:
        if previous is None:
            os.environ.pop(variable, None)
        else:
            os.environ[variable] = previous
    return rc
def _loaded_library_paths() -> set[str]:
    """Return real paths of the DeepEP extension and the mapped NCCL / NVSHMEM host libraries.

    The extension path comes from ``deep_ep._C.__file__``; the communication
    libraries are discovered by scanning ``/proc/self/maps`` for mapped files
    whose basename contains ``libnccl.so`` or ``libnvshmem_host.so``.

    Each maps record is ``address perms offset dev inode [pathname]``. The
    pathname is the optional sixth field and may itself contain spaces, so it
    is taken with a bounded split rather than as the last whitespace-separated
    token; records without a pathname (anonymous mappings, empty lines) are
    skipped instead of being indexed.

    Raises:
        RuntimeError: the extension is not loaded from a regular file, or
            ``/proc/self/maps`` cannot be read.
    """
    extension = getattr(getattr(deep_ep, "_C", None), "__file__", None)
    if not extension or not os.path.isfile(extension):
        raise RuntimeError("DeepEP V2 extension library is not loaded")
    paths = {os.path.realpath(extension)}
    try:
        with open("/proc/self/maps", encoding="utf-8") as handle:
            for line in handle:
                fields = line.split(None, 5)
                if len(fields) < 6:
                    continue  # no pathname column on this record
                path = fields[5].rstrip("\n")
                name = os.path.basename(path)
                if ("libnccl.so" in name or "libnvshmem_host.so" in name) and os.path.isfile(path):
                    paths.add(os.path.realpath(path))
    except OSError as exc:  # pragma: no cover - benchmark runtime is Linux
        raise RuntimeError("cannot inspect loaded communication libraries") from exc
    return paths
def _loaded_library_evidence() -> list[dict[str, str]]:
    """Describe every loaded library by role, display name and SHA-256 — never by path.

    Exactly one NCCL and one NVSHMEM host library must be among the loaded
    paths. Any path matching neither marker is reported with role
    ``deepep-extension`` and the fixed name ``deep_ep._C``; the others are
    named by their basename.

    Raises:
        RuntimeError: a dependency is loaded zero times or more than once.
    """
    markers = (("nccl", "libnccl.so"), ("nvshmem", "libnvshmem_host.so"))
    loaded = _loaded_library_paths()

    problems = []
    for dependency, marker in markers:
        hits = sum(1 for candidate in loaded if marker in os.path.basename(candidate))
        if hits != 1:
            problems.append(f"{dependency}={hits}")
    if problems:
        raise RuntimeError("expected one loaded library for each dependency: " + ", ".join(problems))

    records = []
    for candidate in loaded:
        filename = os.path.basename(candidate)
        # The first matching marker wins, in the order nccl then nvshmem.
        kind = next(
            (dependency for dependency, marker in markers if marker in filename),
            "deepep-extension",
        )
        display = "deep_ep._C" if kind == "deepep-extension" else filename
        records.append({"role": kind, "name": display, "sha256": _sha256(candidate)})
    records.sort(key=lambda entry: (entry["role"], entry["name"], entry["sha256"]))
    return records
entry") + if {path.name for path in directory.iterdir()} != { + "kernel.cu", "kernel.cubin", "kernel.sass", + }: + raise RuntimeError("DeepEP V2 JIT kernel evidence is incomplete") + source = directory / "kernel.cu" + cubin = directory / "kernel.cubin" + sass = directory / "kernel.sass" + if any(path.is_symlink() or not path.is_file() for path in (source, cubin, sass)): + raise RuntimeError("DeepEP V2 JIT evidence is not a regular file") + if any(path.stat().st_size <= 0 for path in (source, cubin, sass)): + raise RuntimeError("DeepEP V2 JIT evidence is empty") + kernel_names.add(match.group(1)) + artifacts.append({ + "cache_key": directory.name, + "source_sha256": _sha256(str(source)), + "sass_sha256": _sha256(str(sass)), + "cubin_sha256": _sha256(str(cubin)), + }) + if ( + len(artifacts) != len(DEEPEP_V2_JIT_KERNELS) + or kernel_names != DEEPEP_V2_JIT_KERNELS + ): + raise RuntimeError("DeepEP V2 JIT kernel set differs from the v1 contract") + return sorted(artifacts, key=lambda item: item["cache_key"]) + + +def _jit_cache_key( + args, + world_size: int, + max_tokens: int, + allow_hybrid_mode: bool, + realized: dict[str, int | bool], + precision_profile_id: str = "d-bf16.c-bf16", + communication_precision: dict[str, object] | None = None, +) -> str: + """Key generated kernels by codegen inputs, not routing data or case identity.""" + if communication_precision is None: + communication_precision = { + "dispatch": { + "communication_format": "bf16", + "api_input_dtype": "bf16", + }, + "combine": {"communication_format": "bf16"}, + } + payload = { + "contract": "deepep-v2-jit-config-v3", + "runner": args.runner, + "world_size": world_size, + "hidden": args.hidden, + "topk": args.topk, + "physical_experts": args.experts, + "tuning_experts": getattr(args, "num_logical_experts", args.experts), + "max_tokens": max_tokens, + "precision_profile": precision_profile_id, + "dispatch_dtype": communication_precision["dispatch"]["communication_format"], + "combine_dtype": 
communication_precision["combine"]["communication_format"], + "input_layout": communication_precision["dispatch"]["api_input_dtype"], + "expert_alignment": 1, + "do_cpu_sync": True, + "cached_mode": False, + "do_expand": False, + "use_expanded_layout": False, + "allow_hybrid_mode": allow_hybrid_mode, + "allow_multiple_reduction": True, + "prefer_overlap_with_compute": True, + "deterministic": False, + **realized, + } + return "jitcfg-v3-" + hashlib.sha256( + json.dumps(payload, sort_keys=True, separators=(",", ":")).encode() + ).hexdigest() + + +def _require_cross_rank_equal(value, label: str) -> None: + gathered = [None] * dist.get_world_size() + dist.all_gather_object(gathered, value) + canonical = {json.dumps(item, sort_keys=True, separators=(",", ":")) for item in gathered} + if len(canonical) != 1: + raise RuntimeError(f"DeepEP V2 {label} differs across ranks") + + +def _configure_gin_mode(args, world_size: int) -> bool: + scale_up_domain = int( + getattr(args, "scale_up_domain", None) + or getattr(args, "gpus_per_node", None) + or world_size + ) + allow_hybrid_mode = world_size > scale_up_domain + if allow_hybrid_mode: + os.environ.pop("EP_DISABLE_GIN", None) + else: + os.environ["EP_DISABLE_GIN"] = "1" + return allow_hybrid_mode + + +def _lsa_topology_is_valid( + gin_enabled: bool, + world_size: int, + scale_up_domain: int, + config: dict[str, int | bool], +) -> bool: + if gin_enabled: + domains = world_size // scale_up_domain + return ( + world_size % scale_up_domain == 0 + and domains > 1 + and config["physical_rdma_ranks"] == domains + and config["physical_nvlink_ranks"] == scale_up_domain + and config["logical_scaleout_ranks"] == domains + and config["logical_scaleup_ranks"] == scale_up_domain + and config["is_scaleup_nvlink"] is True + ) + return ( + config["physical_rdma_ranks"] == 1 + and config["physical_nvlink_ranks"] == world_size + and config["logical_scaleout_ranks"] == 1 + and config["logical_scaleup_ranks"] == world_size + and 
def _require_runtime() -> tuple[str, str]:
    """Check the pinned DeepEP V2 runtime and return ``(torch version, loaded NCCL version)``.

    All detected deviations are collected and reported together in a single
    error: pinned environment variables, module and distribution versions, the
    presence of ``ElasticBuffer``, the ``EP_SUPPRESS_NCCL_CHECK`` override, and
    the version reported by the NCCL library that is actually loaded.

    Raises:
        RuntimeError: at least one check failed.
    """
    pinned_environment = (
        ("DEEPEP_V2_PR", str(DEEPEP_V2_PR)),
        ("DEEPEP_V2_FIX_PR", str(DEEPEP_V2_FIX_PR)),
        ("DEEPEP_V2_COMMIT", DEEPEP_V2_COMMIT),
        ("DEEPEP_V2_TREE", DEEPEP_V2_TREE),
        ("DEEPEP_V2_FMT_COMMIT", DEEPEP_V2_FMT_COMMIT),
        ("DEEPEP_V2_JIT_RANDOM_SEED", DEEPEP_V2_JIT_RANDOM_SEED),
        ("EP_JIT_DUMP_SASS", "1"),
    )
    problems = []
    for variable, wanted in pinned_environment:
        found = os.environ.get(variable)
        if found != wanted:
            problems.append(f"{variable}={found!r}, expected {wanted!r}")

    torch_version = str(torch.__version__)
    # NOTE(review): NCCL is looked up as a cu13 wheel but NVSHMEM as a cu12
    # wheel — confirm this mix is intended for the benchmark image.
    nccl_wheel_version = importlib.metadata.version("nvidia-nccl-cu13")
    nvshmem_wheel_version = importlib.metadata.version("nvidia-nvshmem-cu12")
    module_version = str(getattr(deep_ep, "__version__", ""))
    distribution_version = importlib.metadata.version("deep_ep")
    version_checks = (
        ("deep_ep", module_version, DEEPEP_V2_VERSION),
        ("deep_ep distribution", distribution_version, DEEPEP_V2_DISTRIBUTION),
        ("torch", torch_version, TORCH_VERSION),
        ("nvidia-nccl-cu13", nccl_wheel_version, NCCL_VERSION),
        ("nvidia-nvshmem-cu12", nvshmem_wheel_version, NVSHMEM_VERSION),
    )
    for label, found, wanted in version_checks:
        if found != wanted:
            problems.append(f"{label}={found!r}, expected {wanted!r}")

    buffer_class_present = (
        inspect.isclass(ElasticBuffer) and ElasticBuffer.__name__ == "ElasticBuffer"
    )
    if not buffer_class_present:
        problems.append("deep_ep.ElasticBuffer is absent")
    if os.environ.get("EP_SUPPRESS_NCCL_CHECK"):
        problems.append("EP_SUPPRESS_NCCL_CHECK must be unset")

    # The wheel version alone is not trusted; ask the loaded library as well.
    nccl_runtime_version = _loaded_nccl_version()
    if nccl_runtime_version != NCCL_VERSION:
        problems.append(
            f"loaded NCCL={nccl_runtime_version!r}, expected {NCCL_VERSION!r}"
        )

    if problems:
        raise RuntimeError("invalid DeepEP V2 runtime: " + "; ".join(problems))
    return torch_version, nccl_runtime_version
combine_needs_redispatch = False + combine_weight_semantics = "unweighted-rank-sum" + + def __init__(self, args, rank, world_size, local_rank, device): + self.args = args + self.rank = rank + self.world_size = world_size + self.device = device + self.mode = "normal" + self.precision_profile_id, self.communication_precision = ( + ep_precision.resolve_precision( + args, + backend=self.name, + mode=self.mode, + supported_profiles={ + "d-bf16.c-bf16", + "d-fp8-e4m3fn-b128-f32-prequantized.c-bf16", + }, + ) + ) + self._fp8_dispatch = ep_precision.is_low_precision_dispatch( + self.communication_precision + ) + self.stage_device_work = self._fp8_dispatch + ep_precision.require_keyword( + ElasticBuffer.__init__, + "use_fp8_dispatch", + api="deep_ep.ElasticBuffer.__init__", + ) + self.group = dist.group.WORLD + torch_version, nccl_runtime_version = _require_runtime() + ladder, _ = ep_harness.token_ladder(args.tokens_ladder, args.phase, None) + conditioning = ep_harness.CONDITIONING_LADDERS[args.phase] + self.max_tokens = max([*ladder, *conditioning]) + jit_root = Path(os.environ["EP_JIT_CACHE_DIR"]) + scale_up_domain = int( + getattr(args, "scale_up_domain", None) + or getattr(args, "gpus_per_node", None) + or world_size + ) + allow_hybrid_mode = _configure_gin_mode(args, world_size) + gin_enabled = allow_hybrid_mode + communication_backend = "nccl-gin" if gin_enabled else "nccl-device-lsa" + self._deferred_jit_snapshot = None + self.buffer = ElasticBuffer( + self.group, + num_max_tokens_per_rank=self.max_tokens, + hidden=args.hidden, + num_topk=args.topk, + use_fp8_dispatch=self._fp8_dispatch, + deterministic=False, + allow_hybrid_mode=allow_hybrid_mode, + allow_multiple_reduction=True, + prefer_overlap_with_compute=True, + num_gpu_timeout_secs=100, + explicitly_destroy=True, + ) + tuning_num_experts = int(getattr(args, "num_logical_experts", args.experts)) + self.num_sms = int( + self.buffer.get_theoretical_num_sms(tuning_num_experts, args.topk) + ) + self.num_qps = 
int(self.buffer.get_theoretical_num_qps(self.num_sms)) + properties = torch.cuda.get_device_properties(device) + device_sms = int(properties.multi_processor_count) + jit_config = { + "num_sms": self.num_sms, + "num_qps": self.num_qps, + "allocated_qps": int(self.buffer.num_allocated_qps), + "logical_scaleout_ranks": int(self.buffer.num_scaleout_ranks), + "logical_scaleup_ranks": int(self.buffer.num_scaleup_ranks), + "physical_rdma_ranks": int(self.buffer.num_rdma_ranks), + "physical_nvlink_ranks": int(self.buffer.num_nvlink_ranks), + "is_scaleup_nvlink": self.buffer.num_scaleup_ranks == self.buffer.num_nvlink_ranks, + "device_arch_major": int(properties.major), + "device_arch_minor": int(properties.minor), + "device_sms": device_sms, + "device_smem_bytes": int(properties.shared_memory_per_block_optin), + "gpu_timeout_cycles": 100 * int(properties.clock_rate) * 1000, + } + _require_cross_rank_equal(jit_config, "JIT configuration") + if not _lsa_topology_is_valid( + gin_enabled, world_size, scale_up_domain, jit_config + ): + raise RuntimeError("DeepEP V2 realized communication domains differ from topology") + self.jit_cache_key = _jit_cache_key( + args, + world_size, + self.max_tokens, + allow_hybrid_mode, + jit_config, + self.precision_profile_id, + self.communication_precision, + ) + os.environ["EP_JIT_CACHE_DIR"] = str(jit_root / self.jit_cache_key) + realized_config = { + "jit_cache_key": self.jit_cache_key, + "num_max_tokens_per_rank": self.max_tokens, + **jit_config, + } + _require_cross_rank_equal(realized_config, "realized tuning/topology") + comm = getattr(self.buffer, "nccl_comm_handle", None) + communicator = ( + "deepep-managed" if getattr(comm, "managed", True) else "pytorch-reused" + ) + + loaded_libraries = _loaded_library_evidence() + _require_cross_rank_equal(loaded_libraries, "loaded libraries") + self.backend_provenance = { + "deepep_version": DEEPEP_V2_VERSION, + "deepep_distribution_version": importlib.metadata.version("deep_ep"), + 
"deepep_commit": DEEPEP_V2_COMMIT, + "deepep_tree": DEEPEP_V2_TREE, + "deepep_pr": DEEPEP_V2_PR, + "deepep_fix_pr": DEEPEP_V2_FIX_PR, + "fmt_commit": DEEPEP_V2_FMT_COMMIT, + "api": "deep_ep.ElasticBuffer", + "api_signature_sha256": _api_sha256(), + "communication_backend": communication_backend, + "gin_enabled": gin_enabled, + "nccl_communicator": communicator, + "torch_version": torch_version, + "torch_git_version": str(torch.version.git_version), + "cuda_version": str(torch.version.cuda), + "nccl_package_version": importlib.metadata.version("nvidia-nccl-cu13"), + "nccl_version": nccl_runtime_version, + "nvshmem_package_version": importlib.metadata.version("nvidia-nvshmem-cu12"), + "loaded_libraries": loaded_libraries, + "jit_cache_key": self.jit_cache_key, + "jit_cubins": [], + "jit_random_seed": DEEPEP_V2_JIT_RANDOM_SEED, + "num_experts": int(args.experts), + "mode": "normal", + "dispatch_dtype": ep_precision.communication_format( + self.communication_precision, "dispatch" + ), + "combine_dtype": ep_precision.communication_format( + self.communication_precision, "combine" + ), + "deterministic": False, + "resource_mode": "fixed-profile", + "requested_num_sms": self.num_sms, + "tuning_num_experts": tuning_num_experts, + "num_sms": self.num_sms, + "num_qps": self.num_qps, + "allocated_qps": int(self.buffer.num_allocated_qps), + "device_sms": device_sms, + "sm_fraction": self.num_sms / device_sms, + "tuned_source": "deepep-v2-analytical-sm-qp-logical-experts-v1", + "num_max_tokens_per_rank": self.max_tokens, + "allow_hybrid_mode": bool(self.buffer.allow_hybrid_mode), + "allow_multiple_reduction": bool(self.buffer.allow_multiple_reduction), + "prefer_overlap_with_compute": bool( + self.buffer.prefer_overlap_with_compute + ), + "logical_scaleout_ranks": int(self.buffer.num_scaleout_ranks), + "logical_scaleup_ranks": int(self.buffer.num_scaleup_ranks), + "physical_rdma_ranks": int(self.buffer.num_rdma_ranks), + "physical_nvlink_ranks": 
def dispatch(self, p):
    """Run one buffer dispatch for problem ``p`` and wrap what was received.

    Forwards ``p.dispatch_x`` with the problem's top-k indices/weights and the
    sizing fixed at construction (``max_tokens``, ``num_sms``, ``num_qps``).
    The call is made with CPU sync and handle copy on, and without expansion
    or async compute-stream overlap.

    Returns:
        A namespace with ``recv_x``, ``recv_topk_idx``, ``recv_topk_weights``
        and ``handle``; the fifth value returned by the buffer is discarded.
    """
    options = dict(
        topk_idx=p.topk_idx,
        topk_weights=p.topk_weights,
        num_experts=self.args.experts,
        num_max_tokens_per_rank=self.max_tokens,
        expert_alignment=1,
        num_sms=self.num_sms,
        num_qps=self.num_qps,
        async_with_compute_stream=False,
        do_handle_copy=True,
        do_cpu_sync=True,
        do_expand=False,
    )
    outcome = self.buffer.dispatch(p.dispatch_x, **options)
    # Exactly five values are expected; the last one is not used here.
    received, received_idx, received_weights, comm_handle, _unused = outcome
    return types.SimpleNamespace(
        recv_x=received,
        recv_topk_idx=received_idx,
        recv_topk_weights=received_weights,
        handle=comm_handle,
    )
def inspect_dispatch(self, p, h):
    """Build the normalized view of what this rank received from one dispatch.

    Only the first ``recv_tokens(h)`` rows are reported. Non-negative local
    expert indices are shifted by ``rank * (experts // world_size)`` to global
    IDs; negative entries are kept as-is and their weights are zeroed.

    Returns:
        A namespace with the semantic and encoded payload rows, the scales
        (``None`` when the receive buffer carries none), global expert IDs,
        masked weights, per-local-expert counts and the ordering contract tag.
    """
    rows = self.recv_tokens(h)
    experts_per_rank = self.args.experts // self.world_size

    received_idx = h.recv_topk_idx[:rows]
    is_assigned = received_idx >= 0
    global_ids = torch.where(
        is_assigned,
        received_idx + self.rank * experts_per_rank,
        received_idx,
    )
    assigned_local = received_idx[is_assigned].to(torch.int64)

    semantic_rows = self._semantic_recv(h, rows)[:rows]
    encoded_rows = self._encoded_recv(h)[:rows]
    # _recv_scales is a plain accessor on h.recv_x, so one lookup suffices.
    scale_rows = self._recv_scales(h)
    if scale_rows is not None:
        scale_rows = scale_rows[:rows]

    return types.SimpleNamespace(
        payload=semantic_rows,
        encoded_payload=encoded_rows,
        scales=scale_rows,
        expert_ids=global_ids,
        weights=h.recv_topk_weights[:rows].masked_fill(~is_assigned, 0),
        local_expert_counts=torch.bincount(
            assigned_local, minlength=experts_per_rank
        ),
        ordering_contract="elastic-source-metadata-v1",
    )
def finalize(self, rc):
    """Tear down the buffer and the process group, then report the exit code.

    Order: barrier, ``buffer.destroy()``, barrier, ``destroy_process_group()``.

    Args:
        rc: exit code to return when teardown succeeds.

    Returns:
        ``rc`` on clean teardown, ``1`` if any teardown step raised. The
        exception is still absorbed, but its traceback is written to stderr so
        a failing teardown can be told apart from a failing benchmark.
    """
    import traceback  # local import: the only use is this failure path

    try:
        dist.barrier()
        self.buffer.destroy()
        dist.barrier()
        dist.destroy_process_group()
    except Exception:
        # Previously swallowed silently; keep the exit code, surface the cause.
        traceback.print_exc()
        return 1
    return rc
So every platform runs + the *same* problem (no per-rank/per-platform RNG in the adapters). + * **Explicit measurement contract**: layout-and-dispatch-v1 includes routing-layout + generation in dispatch timing. Combine excludes staging. + Isolated sum is derived independently at each percentile and is not a measured chained op. + * **Correct collective percentile**: each iteration's latency is reduced MAX across + ranks first (a collective finishes with its slowest rank), THEN percentiled — + `median_i(max_r)`, not `max_r(median_i)`. + * **One line = one fixed config**; only T varies. Both `tokens_per_rank` and + `global_tokens = T * ep_size` are recorded as explicit chart coordinates. + +stdlib-only at module top (torch is passed in by the entrypoint; `routing` is imported +lazily inside run_sweep) so this file `py_compile`s without torch. + +Backend protocol: + name, mode, combine_needs_redispatch, backend_provenance(dict) + buffer_cap(args) -> int|None + make_problem(T, idx, weights, x) -> problem # materialize this rank's trace slice + dispatch(problem) -> handle # pure dispatch comm (timed) + stage(problem, handle) # expert-output placement + stage_device_work # true only when stage launches device work + combine(problem, handle) -> tensor # pure combine comm (timed) + inspect_dispatch(problem, handle) -> view # normalized payload/expert/weight metadata + combine_transformed(problem, handle, tensor) -> tensor + recv_tokens(handle) -> int # realized tokens received this rank + finalize(rc) -> int|NoReturn +""" +from __future__ import annotations + +import argparse +import datetime as _dt +import hashlib +import json +import math +import os +import types + +import contracts +import identity +import workload as workload_contract + +# Raw v1 result emitted by one benchmark case. Publication uses a separate contract. +SCHEMA_VERSION = 1 + +# Every comparison-grade EP point uses the same literal timing profile on every SKU/backend. 
+# Eight timed iterations keep each MoRI burst well below its sustained-iteration wedge, 64 trials +# provide 512 observations per operation, and 32 warmups meet Blackwell's measured clock-ramp floor. +SAMPLING_CONTRACT = identity.V1_CASE_PROFILE["sampling_contract"] +TIMED_SAMPLES_PER_POINT = 512 +TIMED_ITERS_PER_TRIAL = 8 +TRIALS_PER_POINT = 64 +WARMUP_ITERS_PER_TRIAL = 32 +WARMUP_SEMANTICS = "full-roundtrip-before-each-component-trial-point-v1" +QUALIFICATION_RUNS = 3 +ROUTING_SEED = 67 +ROUTING_GENERATOR = workload_contract.GENERATOR_VERSION +ACTIVATION_PROFILE = "canonical-counter-source-v4" +ACTIVATION_GENERATOR = workload_contract.ACTIVATION_GENERATOR +PLACEMENT = "packed" +COMPONENT_ORDER_CONTRACT = "qualification-hash-rotated-components-v1" +LOW_LATENCY_MODE = "low-latency" +LOW_LATENCY_MAX_TOKENS_PER_RANK = 128 +LOW_LATENCY_MEASUREMENT_CONTRACT = "expert-packed-weighted-combine-v1" +LOW_LATENCY_COMPONENT_ORDER_CONTRACT = "qualification-hash-rotated-components-v1" +LOW_LATENCY_ORACLE_CONTRACT = "expert-assignment-transform-v1" +LOW_LATENCY_CORRECTNESS_SCOPE = "expert-assignment-and-weighted-combine" + +# Phase-default sweeps — token-size regimes, NOT distinct kernels (both run normal +# mode; "decode"/"prefill" name the small/large-token regime). Powers of two for a +# clean log x-axis; clamped to the backend buffer ceiling (MoRI's registerable heap). 
+DECODE_LADDER = [1, 2, 4, 8, 16, 32, 64, 128] +PREFILL_LADDER = [128, 256, 512, 1024, 2048, 4096] +CONDITIONING_LADDERS = { + phase: list(ladder) for phase, ladder in contracts.V1_CONDITIONING_LADDERS.items() +} +CONDITIONING_ROUNDS_PER_SHAPE = contracts.V1_CONDITIONING_ROUNDS_PER_SHAPE +CONDITIONING_CONTRACT = identity.V1_CASE_PROFILE["conditioning_contract"] +ORACLE_CONTRACT = identity.V1_CASE_PROFILE["oracle_contract"] +ORACLE_RTOL = 5e-2 +ORACLE_ATOL = 2e-2 + +EPLB_REDUNDANT_EXPERTS = 32 +EPLB_REFERENCE_TOKENS_PER_RANK = 2048 +EPLB_PLANNER = "greedy-rank-major-v1" +V1_PROFILE = { + "dispatch_dtype": "bf16", + "combine_dtype": "bf16", + "combine_quant_mode": "none", + "mode": "normal", + "measurement_contract": "layout-and-dispatch-v1", + "resource_mode": "fixed-profile", + "placement": PLACEMENT, + "activation_profile": ACTIVATION_PROFILE, + "activation_generator": ACTIVATION_GENERATOR, + "routing_generator": ROUTING_GENERATOR, + "component_order_contract": COMPONENT_ORDER_CONTRACT, + "conditioning_contract": CONDITIONING_CONTRACT, + "eplb_reference_tokens_per_rank": EPLB_REFERENCE_TOKENS_PER_RANK, + "eplb_redundant_experts": EPLB_REDUNDANT_EXPERTS, + "eplb_planner": EPLB_PLANNER, + # DeepEP/UCCL use this only as the fallback when their tuned default is not exported. 
+ "num_sms": 24, +} + + +def precision_byte_provenance( + axis: dict, logical_copies: int, hidden: int +) -> dict[str, int | str]: + """Return comparable logical activation and required scale bytes for one direction.""" + if logical_copies < 0 or hidden < 0: + raise ValueError("logical precision byte dimensions must be non-negative") + bits_per_value = { + "bf16": 16, + "fp8-e4m3fn": 8, + "fp8-e4m3fnuz": 8, + "logfmt10": 10, + }.get(axis["communication_format"]) + if bits_per_value is None: + raise ValueError(f"unknown communication format {axis['communication_format']!r}") + activation_data_bytes = logical_copies * math.ceil(hidden * bits_per_value / 8) + scale_bytes_per_value = {None: 0, "f32": 4, "implicit-logfmt10": 0}.get( + axis["scale_dtype"] + ) + if scale_bytes_per_value is None: + raise ValueError(f"unknown communication scale dtype {axis['scale_dtype']!r}") + group_size = axis["scale_group_size"] + scale_groups = math.ceil(hidden / group_size) if group_size is not None else 0 + scale_bytes = logical_copies * scale_groups * scale_bytes_per_value + return { + "accounting_contract": "activation-data-plus-scales-v1", + "activation_data_bytes": activation_data_bytes, + "scale_bytes": scale_bytes, + "total_logical_bytes": activation_data_bytes + scale_bytes, + } + +def format_collective_version(raw) -> str: + """Normalize PyTorch's tuple or packed NCCL/RCCL version representation.""" + if isinstance(raw, int): + if raw < 10_000: + return f"{raw // 1000}.{raw // 100 % 10}.{raw % 100}" + return f"{raw // 10_000}.{raw // 100 % 100}.{raw % 100}" + if isinstance(raw, (tuple, list)): + return ".".join(map(str, raw)) + return str(raw) if raw not in (None, "") else "unknown" + + +def add_common_args(ap: argparse.ArgumentParser) -> None: + """Add the varying v1 inputs; fixed profile values are not CLI axes.""" + ap.set_defaults(**V1_PROFILE) + ap.add_argument("--mode", default="normal", choices=["normal", LOW_LATENCY_MODE]) + ap.add_argument( + "--precision-profile", + 
default="", + choices=("", *identity.V1_PRECISION_PROFILES), + help="exact native dispatch/combine communication profile; blank selects BF16 control", + ) + ap.add_argument("--phase", default="decode", choices=["decode", "prefill"], + help="token-size regime: decode (small T) / prefill (large T) — picks the default ladder") + ap.add_argument("--tokens-ladder", default="", + help="space/comma-separated source-tokens-per-rank sweep; blank = phase default") + ap.add_argument("--hidden", type=int, default=7168) + ap.add_argument("--topk", type=int, default=8) + ap.add_argument("--experts", type=int, default=256, help="TOTAL experts (fixed across EP degrees)") + ap.add_argument("--routing", default="uniform", choices=["uniform", "zipf"]) + # EPLB (Expert-Parallel Load Balancer): replicate hot experts onto redundant physical + # slots + balanced-place so per-rank load equalizes. A pure routing-trace transform + # (tests/eplb.py); experts becomes num_logical+redundant. The remedy for `zipf` skew. + ap.add_argument("--eplb", action="store_true", + help="apply EPLB expert replication/placement to the routing trace") + # Canonical workloads consume pre-generated trace bytes instead of the + # seeded runtime generator, so a result is provably the SAME workload as another machine's + # (checksum match). Points at a dir of .npz/.manifest.json (make_workloads.py). 
+ ap.add_argument("--workload-dir", default="", + help="dir of canonical workload traces; empty = seeded runtime generation (dev)") + ap.add_argument("--case-id", default="") + ap.add_argument("--suite", default="") + ap.add_argument("--workload-name", default="") + ap.add_argument("--required-publication", default="") + ap.add_argument("--seed", type=int, default=ROUTING_SEED) + ap.add_argument( + "--qualification-index", + type=int, + choices=range(1, QUALIFICATION_RUNS + 1), + default=os.environ.get("COLLECTIVEX_QUALIFICATION_INDEX", "1"), + help="one-based qualification repeat used for deterministic measurement ordering", + ) + # 32: B300/Blackwell needs ~30 untimed iters to reach steady-state GPU clocks + + # establish NVLink/NVSHMEM connections — at warmup=8 its dispatch read ~1787us + # (cold), at warmup>=30 it settles to ~85us (faster than H100, reproducible within + # ~2.5%). H100/MI355X reach steady state much sooner; the extra iters are harmless. + ap.add_argument("--warmup", type=int, default=WARMUP_ITERS_PER_TRIAL, + help=f"untimed full roundtrips before each trial/point; fixed by " + f"{SAMPLING_CONTRACT} to {WARMUP_ITERS_PER_TRIAL}") + ap.add_argument("--iters", type=int, default=TIMED_ITERS_PER_TRIAL, + help=f"timed iterations per trial; fixed by {SAMPLING_CONTRACT} to " + f"{TIMED_ITERS_PER_TRIAL}") + ap.add_argument("--trials", type=int, default=TRIALS_PER_POINT, + help=f"timed trials; fixed by {SAMPLING_CONTRACT} to {TRIALS_PER_POINT}") + # provenance / output + ap.add_argument("--runner", required=True) + ap.add_argument("--topology-class", required=True) + ap.add_argument("--transport", default="") + ap.add_argument("--scope", required=True, choices=["scale-up", "scale-out"]) + ap.add_argument("--scale-up-transport", required=True) + ap.add_argument("--scale-out-transport", default="") + # gpus-per-node=0 means one node containing the whole EP group. 
+ ap.add_argument("--gpus-per-node", type=int, default=0) + ap.add_argument("--scale-up-domain", type=int, default=0, help="0 = gpus_per_node*ep (one domain)") + ap.add_argument("--timestamp") + ap.add_argument("--out", required=True) + + +def token_ladder(spec: str, phase: str, cap: int | None) -> tuple[list[int], list[int]]: + """Return (ladder, dropped): explicit spec else the phase default; positive ints; + clamped to `cap` with dropped points reported (never silently truncated).""" + if spec and spec.strip(): + want = [int(t) for t in spec.replace(",", " ").split() if t] + else: + want = DECODE_LADDER if phase == "decode" else PREFILL_LADDER + want = sorted({t for t in want if t > 0}) + if cap is not None: + return [t for t in want if t <= cap], [t for t in want if t > cap] + return want, [] + + +def sampling_contract_error(iters: int, trials: int, warmup: int) -> str | None: + """Return a user-facing error unless the exact cross-SKU timing profile is used.""" + expected = (TIMED_ITERS_PER_TRIAL, TRIALS_PER_POINT, WARMUP_ITERS_PER_TRIAL) + observed = (iters, trials, warmup) + if observed != expected: + return (f"{SAMPLING_CONTRACT} requires exactly iters:trials:warmup=" + f"{expected[0]}:{expected[1]}:{expected[2]} on every SKU/backend; got " + f"{observed[0]}:{observed[1]}:{observed[2]} " + f"({iters * trials if iters > 0 and trials > 0 else 'invalid'} timed samples)") + return None + + +def qualification_order( + values: list, qualification_index: int, trial_index: int, *, identity_key: str = "" +) -> list: + """Return a deterministic, position-balanced order for one qualification trial. + + Official runs bind the base permutation to the case identity. The cyclic schedule then gives + every value every position equally often over 64 trials while qualification repeats start at + different offsets. Keeping the empty-key behavior stable preserves local diagnostic fixtures. 
+ """ + if not values or len(values) != len(set(values)): + raise ValueError("qualification order requires non-empty unique values") + if qualification_index not in range(1, QUALIFICATION_RUNS + 1): + raise ValueError(f"qualification_index must be in 1..{QUALIFICATION_RUNS}") + if type(trial_index) is not int or trial_index < 0: + raise ValueError("trial_index must be a non-negative integer") + if not isinstance(identity_key, str): + raise ValueError("qualification identity_key must be a string") + base_values = list(values) + if identity_key: + base_values.sort( + key=lambda value: hashlib.sha256( + f"{identity_key}\0{qualification_index}\0{value}".encode("utf-8") + ).digest() + ) + position = trial_index + qualification_index - 1 + cycle, offset = divmod(position, len(values)) + base = base_values if cycle % 2 == 0 else list(reversed(base_values)) + return base[offset:] + base[:offset] + + +def sampled_component_evidence(trials: list[list[float]]) -> dict: + """Validate and copy private 64x8 trial blocks without flattening their boundaries.""" + if not trials: + return {"availability": "unavailable", "sample_count": 0, "trials": None} + if len(trials) != TRIALS_PER_POINT: + raise ValueError( + f"measured component needs {TRIALS_PER_POINT} trial blocks; got {len(trials)}" + ) + normalized: list[list[float]] = [] + for trial in trials: + if len(trial) != TIMED_ITERS_PER_TRIAL: + raise ValueError( + f"measured trial needs {TIMED_ITERS_PER_TRIAL} samples; got {len(trial)}" + ) + block = [] + for sample in trial: + if isinstance(sample, bool) or not isinstance(sample, (int, float)): + raise ValueError("measured samples must be numeric") + value = float(sample) + if not math.isfinite(value) or value < 0: + raise ValueError("measured samples must be finite and non-negative") + block.append(value) + normalized.append(block) + count = sum(map(len, normalized)) + if count != TIMED_SAMPLES_PER_POINT: + raise ValueError( + f"measured component needs {TIMED_SAMPLES_PER_POINT} 
samples; got {count}" + ) + return {"availability": "measured", "sample_count": count, "trials": normalized} + + +def _stats_vec(xs: list[int]) -> dict: + """min/mean/max/CV (+ empty count) of a per-rank count vector — self-describing source-token + or load summary without dumping the full vector.""" + n = len(xs) or 1 + mean = sum(xs) / n + var = sum((x - mean) ** 2 for x in xs) / n + cv = (var ** 0.5 / mean) if mean > 0 else 0.0 + return {"min": min(xs) if xs else 0, "mean": round(mean, 3), + "max": max(xs) if xs else 0, "cv": round(cv, 4), + "empty_ranks": sum(1 for x in xs if x == 0), "total": sum(xs), "ranks": n} + + +def percentile(xs: list[float], q: float) -> float: + if not xs: + return float("nan") + s = sorted(xs) + i = max(0, min(len(s) - 1, math.ceil(q / 100.0 * len(s)) - 1)) + return s[i] + + +def _sha256_json(value) -> str: + payload = json.dumps( + value, allow_nan=False, ensure_ascii=False, sort_keys=True, separators=(",", ":") + ).encode() + return hashlib.sha256(payload).hexdigest() + + +def _series_provenance(provenance: dict) -> dict: + """Retain stable semantic build identity while keeping raw binaries diagnostic.""" + return contracts.series_provenance(provenance) + + +def _write_bytes_atomic(path: str, payload: bytes) -> tuple[str, int]: + os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True) + temporary = f"{path}.tmp-{os.getpid()}" + try: + with open(temporary, "wb") as handle: + handle.write(payload) + handle.flush() + os.fsync(handle.fileno()) + os.replace(temporary, path) + finally: + try: + os.unlink(temporary) + except FileNotFoundError: + pass + return hashlib.sha256(payload).hexdigest(), len(payload) + + +def _write_json_atomic(path: str, value) -> tuple[str, int]: + payload = ( + json.dumps(value, allow_nan=False, ensure_ascii=False, indent=2) + "\n" + ).encode() + return _write_bytes_atomic(path, payload) + + +def time_us(torch, fn, warmup: int, iters: int, pre=None) -> list[float]: + """Per-iteration CUDA-event 
latencies (µs) for THIS rank. + + Without `pre`: times `fn()`. With `pre`: runs `pre()` UNTIMED each iteration (sync + before the start event so its GPU work can't bleed in), then times `fn(pre_result)` + — how combine is isolated when it consumes the dispatch state and needs a fresh + untimed dispatch+stage before every sample. Returns the raw per-iteration series; + the caller reduces across ranks per iteration before percentiling. + """ + def sample(): + arg = pre() if pre is not None else None + if pre is not None: + torch.cuda.synchronize() + s = torch.cuda.Event(enable_timing=True) + e = torch.cuda.Event(enable_timing=True) + s.record() + fn(arg) if pre is not None else fn() + e.record() + torch.cuda.synchronize() + return s.elapsed_time(e) * 1000.0 # ms -> us + + for _ in range(max(0, warmup)): + if pre is not None: + a = pre() + torch.cuda.synchronize() + fn(a) + else: + fn() + # sync EACH warmup iteration, not just once after the loop: the measured-roundtrip fn + # interleaves dispatch+combine on a backend's persistent comm buffer, so back-to-back + # un-synced warmup iterations let iter N+1's dispatch race iter N's combine (CUDA abort + # on a rank -> NCCL-watchdog SIGABRT). Cheap (warmup is small); timed samples already sync. 
def expert_packed_slot_map(
    counts,
    src_info,
    layout_range,
    *,
    tokens_per_rank: int,
    experts_per_rank: int,
    world_size: int,
) -> list[tuple[int, int, int]]:
    """Decode and validate DeepEP's expert-packed receive metadata.

    ``src_info`` stores a source-local token index. The source rank is carried by
    the corresponding packed ``layout_range`` interval, so neither field is
    independently sufficient to identify a source token.

    Each ``layout_range[expert][rank]`` entry packs ``begin`` in the bits above
    bit 31 and ``span`` in the low 32 bits. For every local expert the
    intervals must tile slots ``0..counts[expert]-1`` exactly once.

    Returns:
        ``(local_expert, packed_position, source_id)`` triples in expert-major,
        source-rank, position order, with
        ``source_id = source_rank * tokens_per_rank + local token index``.

    Raises:
        ValueError: on non-positive dimensions, shape mismatches, invalid
            counts/ranges/indices, overlapping or incomplete coverage, or a
            repeated (source, expert) assignment.
    """
    if any(extent <= 0 for extent in (tokens_per_rank, experts_per_rank, world_size)):
        raise ValueError("expert-packed dimensions must be positive")
    if len(counts) != experts_per_rank:
        raise ValueError("expert-packed count shape differs from local experts")
    if not (len(src_info) == experts_per_rank == len(layout_range)):
        raise ValueError("expert-packed metadata shape differs from local experts")

    assignments: list[tuple[int, int, int]] = []
    seen_pairs: set[tuple[int, int]] = set()
    for expert in range(experts_per_rank):
        valid_slots = counts[expert]
        # Exact int only: bools and other numeric types are rejected.
        if not (type(valid_slots) is int and valid_slots >= 0):
            raise ValueError("expert-packed receive count is invalid")
        per_rank_ranges = layout_range[expert]
        if len(per_rank_ranges) != world_size:
            raise ValueError("expert-packed layout rank dimension is invalid")
        sources = src_info[expert]
        if len(sources) < valid_slots:
            raise ValueError("expert-packed source metadata is truncated")

        claimed = [False] * valid_slots
        for source_rank, packed in enumerate(per_rank_ranges):
            if not (type(packed) is int and packed >= 0):
                raise ValueError("expert-packed layout range is invalid")
            # High word is the first slot, low 32 bits the number of slots.
            first, length = divmod(packed, 1 << 32)
            stop = first + length
            if not (first <= valid_slots and stop <= valid_slots):
                raise ValueError("expert-packed layout range exceeds valid slots")
            for position in range(first, stop):
                if claimed[position]:
                    raise ValueError("expert-packed layout ranges overlap")
                claimed[position] = True
                token = sources[position]
                if not (type(token) is int and 0 <= token < tokens_per_rank):
                    raise ValueError("expert-packed source token index is invalid")
                global_source = source_rank * tokens_per_rank + token
                key = (global_source, expert)
                if key in seen_pairs:
                    raise ValueError("expert-packed source/expert assignment is duplicated")
                seen_pairs.add(key)
                assignments.append((expert, position, global_source))
        if False in claimed:
            raise ValueError("expert-packed layout ranges omit valid receive slots")
    return assignments
device = packed_payload.device + local_expert_slots = torch.tensor( + [slot[0] for slot in slots], device=device, dtype=torch.int64 + ) + packed_positions = torch.tensor( + [slot[1] for slot in slots], device=device, dtype=torch.int64 + ) + source_ids = torch.tensor( + [slot[2] for slot in slots], device=device, dtype=torch.int64 + ) + expert_ids = local_expert_slots + rank * experts_per_rank + payload = packed_payload[local_expert_slots, packed_positions] + return types.SimpleNamespace( + payload=payload, + source_ids=source_ids, + expert_ids=expert_ids, + local_expert_counts=packed_counts.to(torch.int64), + local_expert_slots=local_expert_slots, + packed_positions=packed_positions, + ordering_contract="expert-major/layout-addressed-packed-slot-v1", + ) + + +def _expert_transform(torch, payload, expert_ids, weights, combine_weight_semantics): + """Build one local expert aggregate for the v1 unweighted combine contract.""" + if combine_weight_semantics != "unweighted-rank-sum": + raise ValueError("v1 requires unweighted rank-sum combine") + valid = expert_ids >= 0 + expert = expert_ids.clamp(min=0).to(torch.int64) + gate = weights.to(torch.float32).masked_fill(~valid, 0) + scale = ((expert * 17 + 5) % 31 + 1).to(torch.float32) / 32 + offset_a = (((expert * 29 + 7) % 37) - 18).to(torch.float32) / 64 + offset_b = (((expert * 43 + 11) % 41) - 20).to(torch.float32) / 128 + scale_sum = (gate * scale).sum(dim=1, keepdim=True) + offset_a_sum = (gate * offset_a).sum(dim=1, keepdim=True) + offset_b_sum = (gate * offset_b).sum(dim=1, keepdim=True) + columns = torch.arange(payload.shape[1], device=payload.device, dtype=torch.int64) + pattern = (((columns * 13) % 17) - 8).to(torch.float32) / 8 + transformed = ( + payload.float() * scale_sum + offset_a_sum + offset_b_sum * pattern.unsqueeze(0) + ) + return transformed.to(payload.dtype) + + +def _expert_transform_expanded(torch, payload, expert_ids): + """Apply the oracle transform to one row per token/expert assignment.""" + 
expert = expert_ids.to(torch.int64) + scale = (((expert * 17 + 5) % 31 + 1).to(torch.float32) / 32).unsqueeze(1) + offset_a = ((((expert * 29 + 7) % 37) - 18).to(torch.float32) / 64).unsqueeze(1) + offset_b = ((((expert * 43 + 11) % 41) - 20).to(torch.float32) / 128).unsqueeze(1) + columns = torch.arange(payload.shape[1], device=payload.device, dtype=torch.int64) + pattern = (((columns * 13) % 17) - 8).to(torch.float32) / 8 + transformed = payload.float() * scale + offset_a + offset_b * pattern.unsqueeze(0) + return transformed.to(payload.dtype) + + +def _expected_transformed_combine(torch, problem): + """Independently derive sum_i gate_i * expert_i(x) for each source token.""" + semantic_x = getattr(problem, "oracle_x", problem.x) + expected = torch.zeros_like(semantic_x, dtype=torch.float32) + expert_ids = problem.topk_idx.to(torch.int64) + weights = problem.topk_weights.to(torch.float32) + columns = torch.arange(semantic_x.shape[1], device=semantic_x.device, dtype=torch.int64) + pattern = (((columns * 13) % 17) - 8).to(torch.float32) / 8 + for slot in range(expert_ids.shape[1]): + expert = expert_ids[:, slot] + gate = weights[:, slot].unsqueeze(1) + scale = (((expert * 17 + 5) % 31 + 1).to(torch.float32) / 32).unsqueeze(1) + offset_a = ((((expert * 29 + 7) % 37) - 18).to(torch.float32) / 64).unsqueeze(1) + offset_b = ((((expert * 43 + 11) % 41) - 20).to(torch.float32) / 128).unsqueeze(1) + expert_output = semantic_x.float() * scale + offset_a + offset_b * pattern.unsqueeze(0) + expected.add_(gate * expert_output) + return expected + + +def _baseline_precision_axis() -> dict: + return { + "encoded_payload_valid": True, + "scales_finite": None, + "scales_positive": None, + "dequantized_semantics": True, + "saturation_count": 0, + "saturation_rate": 0.0, + "max_abs_error": 0.0, + "max_rel_error": 0.0, + "passed": True, + } + + +def _precision_evidence(backend, problem, view, combined, expected_combined) -> dict: + method = getattr(backend, "precision_evidence", 
None) + if method is not None: + evidence = method(problem, view) + combine_axis = backend.communication_precision["combine"] + if combine_axis["communication_format"] != "bf16": + if combined.shape == expected_combined.shape and combined.numel(): + absolute = (combined.float() - expected_combined.float()).abs() + max_abs_error = float(absolute.max().item()) + max_rel_error = max_abs_error / ( + float(expected_combined.float().abs().max().item()) + 1e-6 + ) + tolerance = ORACLE_ATOL + float( + getattr(backend, "tolerance", ORACLE_RTOL) + ) * expected_combined.float().abs() + semantics = bool((absolute <= tolerance).all().item()) + elif combined.shape == expected_combined.shape: + max_abs_error = max_rel_error = 0.0 + semantics = True + else: + max_abs_error = max_rel_error = 1e30 + semantics = False + direction = evidence["combine"] + direction.update({ + "dequantized_semantics": semantics, + "max_abs_error": max_abs_error, + "max_rel_error": max_rel_error, + }) + scale_ok = ( + direction["scales_finite"] is not False + and direction["scales_positive"] is not False + ) + direction["passed"] = bool( + direction["encoded_payload_valid"] and semantics and scale_ok + ) + evidence["passed"] = bool( + evidence["dispatch"]["passed"] and direction["passed"] + ) + return evidence + profile_id = getattr(backend, "precision_profile_id", None) + if profile_id != identity.V1_CONTROL_PRECISION_PROFILE: + failed = _baseline_precision_axis() + failed.update({"encoded_payload_valid": False, "dequantized_semantics": False, + "passed": False}) + return {"profile_id": profile_id, "dispatch": failed, "combine": dict(failed), + "passed": False} + return { + "profile_id": profile_id, + "dispatch": _baseline_precision_axis(), + "combine": _baseline_precision_axis(), + "passed": True, + } + + +def _failed_precision_evidence(backend) -> dict: + failed = _baseline_precision_axis() + failed.update({"encoded_payload_valid": False, "dequantized_semantics": False, + "passed": False}) + return { 
+ "profile_id": getattr(backend, "precision_profile_id", None), + "dispatch": failed, + "combine": dict(failed), + "passed": False, + } + + +def aggregate_precision_evidence(evidence_by_rank: list[dict]) -> dict: + """Collapse pre/post rank evidence without hiding any direction's worst observation.""" + records = [record[phase] for record in evidence_by_rank for phase in ("pre", "post")] + profile_ids = {record["profile_id"] for record in records} + if len(profile_ids) != 1: + raise ValueError("precision evidence profiles differ across ranks or oracle passes") + result = {"profile_id": profile_ids.pop()} + for direction in ("dispatch", "combine"): + axes = [record[direction] for record in records] + rank_counts = [] + for rank_index in range(len(evidence_by_rank)): + rank_counts.append(max( + evidence_by_rank[rank_index][phase][direction]["saturation_count"] + for phase in ("pre", "post") + )) + scale_finite = [axis["scales_finite"] for axis in axes] + scale_positive = [axis["scales_positive"] for axis in axes] + result[direction] = { + "encoded_payload_valid": all(axis["encoded_payload_valid"] for axis in axes), + "scales_finite": ( + None if all(value is None for value in scale_finite) + else all(value is True for value in scale_finite) + ), + "scales_positive": ( + None if all(value is None for value in scale_positive) + else all(value is True for value in scale_positive) + ), + "dequantized_semantics": all(axis["dequantized_semantics"] for axis in axes), + "saturation_count": sum(rank_counts), + "saturation_rate": max(axis["saturation_rate"] for axis in axes), + "max_abs_error": max(axis["max_abs_error"] for axis in axes), + "max_rel_error": max(axis["max_rel_error"] for axis in axes), + "passed": all(axis["passed"] for axis in axes), + } + result["passed"] = all(result[direction]["passed"] for direction in ("dispatch", "combine")) + return result + + +def _run_expert_packed_oracle( + torch, + routing, + backend, + problem, + global_idx, + global_weights, + 
rank: int, + experts_per_rank: int, + seed: int, +): + """Verify an expert-packed dispatch and native gate-weighted combine.""" + contract = LOW_LATENCY_ORACLE_CONTRACT + handle = backend.dispatch(problem) + torch.cuda.synchronize() + try: + packed = backend.inspect_expert_dispatch(problem, handle) + view = expert_packed_dispatch_view( + torch, + packed.payload, + packed.local_expert_counts, + packed.source_info, + packed.layout_range, + rank=rank, + tokens_per_rank=problem.T, + experts_per_rank=experts_per_rank, + world_size=backend.world_size, + ) + decoded_source_ids = routing.decode_source_ids(view.payload, seed) + except Exception as inspection_error: + try: + problem.recv_tokens = backend.recv_tokens(handle) + backend.stage(problem, handle) + backend.combine(problem, handle) + torch.cuda.synchronize() + except Exception as cleanup_error: + raise inspection_error from cleanup_error + return { + "_precision": _failed_precision_evidence(backend), + "contract": contract, + "passed": False, + "ordering_contract": "adapter-inspection-failed", + "order_sha256": None, + "dispatch_sha256": None, + "combine_weight_semantics": getattr( + backend, "combine_weight_semantics", "undeclared" + ), + "receive_count": 0, + "atol": ORACLE_ATOL, + "max_absolute_error": None, + "max_elementwise_relative_error": None, + "max_relative_error": None, + "max_weight_error": None, + "rtol": ORACLE_RTOL, + "checks": { + "combine_values": False, + "counts": False, + "metadata": False, + "multiplicity": False, + "payload": False, + "source_set": False, + "weights": False, + }, + } + + device = problem.x.device + world_size = backend.world_size + total_experts = experts_per_rank * world_size + global_idx_device = global_idx.to(device=device, dtype=torch.int64) + global_weights_device = global_weights.to(device=device, dtype=torch.float32) + source_grid = torch.arange( + global_idx_device.shape[0], device=device, dtype=torch.int64 + ).unsqueeze(1).expand_as(global_idx_device) + local_mask = 
(global_idx_device // experts_per_rank) == rank + expected_sources = source_grid[local_mask] + expected_experts = global_idx_device[local_mask] + expected_pair_weights = global_weights_device[local_mask] + + receive_count = int(view.payload.shape[0]) + shape_ok = ( + view.payload.ndim == 2 + and view.source_ids.shape == (receive_count,) + and view.expert_ids.shape == (receive_count,) + and view.local_expert_counts.shape == (experts_per_rank,) + ) + source_range = bool( + receive_count == 0 + or ( + (view.source_ids >= 0) + & (view.source_ids < global_idx_device.shape[0]) + ).all().item() + ) + expected_payload = ( + routing.activations_for_source_ids( + view.source_ids, problem.x.shape[1], seed, problem.x.dtype + ) + if source_range + else torch.empty_like(view.payload) + ) + normalize_payload = getattr(backend, "oracle_dispatch_payload", None) + if source_range and normalize_payload is not None: + expected_payload = normalize_payload(expected_payload) + payload_ok = bool( + source_range + and torch.equal(decoded_source_ids.to(torch.int64), view.source_ids) + and torch.equal(view.payload, expected_payload) + ) + + actual_keys = view.source_ids * total_experts + view.expert_ids + expected_keys = expected_sources * total_experts + expected_experts + actual_order = torch.argsort(actual_keys, stable=True) + expected_order = torch.argsort(expected_keys, stable=True) + canonical_sources = view.source_ids.index_select(0, actual_order) + canonical_experts = view.expert_ids.index_select(0, actual_order) + canonical_expected_weights = expected_pair_weights.index_select(0, expected_order) + expected_local_idx = global_idx_device[ + rank * problem.T:(rank + 1) * problem.T + ] + metadata_ok = bool( + shape_ok + and torch.equal(problem.topk_idx.to(torch.int64), expected_local_idx) + and torch.equal( + actual_keys.index_select(0, actual_order), + expected_keys.index_select(0, expected_order), + ) + ) + expected_counts = torch.bincount( + expected_experts - rank * 
experts_per_rank, minlength=experts_per_rank + ) + counts_ok = torch.equal( + view.local_expert_counts.to(torch.int64), expected_counts.to(torch.int64) + ) + actual_multiplicity = torch.bincount( + view.source_ids, minlength=global_idx_device.shape[0] + ) + expected_multiplicity = torch.bincount( + expected_sources, minlength=global_idx_device.shape[0] + ) + multiplicity_ok = torch.equal(actual_multiplicity, expected_multiplicity) + source_set_ok = torch.equal( + torch.sort(torch.unique(view.source_ids)).values, + torch.sort(torch.unique(expected_sources)).values, + ) + + expected_local_weights = global_weights_device[ + rank * problem.T:(rank + 1) * problem.T + ] + if problem.topk_weights.shape == expected_local_weights.shape: + max_weight_error = ( + float((problem.topk_weights.float() - expected_local_weights).abs().max().item()) + if expected_local_weights.numel() + else 0.0 + ) + else: + max_weight_error = None + weights_ok = max_weight_error == 0.0 + ordering_contract = f"canonical-source-expert-v1/{view.ordering_contract}" + order_sha256 = _tensor_sha256(canonical_sources, canonical_experts) + dispatch_sha256 = _tensor_sha256( + canonical_sources, canonical_experts, canonical_expected_weights + ) + + handle.oracle_local_expert_slots = view.local_expert_slots + handle.oracle_packed_positions = view.packed_positions + problem.recv_tokens = receive_count + transformed = _expert_transform_expanded(torch, view.payload, view.expert_ids) + combined = backend.combine_transformed(problem, handle, transformed) + torch.cuda.synchronize() + expected_combined = _expected_transformed_combine(torch, problem) + if combined.shape == expected_combined.shape and combined.numel(): + absolute_error = (combined.float() - expected_combined).abs() + max_absolute_error = float(absolute_error.max().item()) + max_relative_error = max_absolute_error / ( + float(expected_combined.abs().max().item()) + 1e-6 + ) + max_elementwise_relative_error = float( + (absolute_error / 
expected_combined.abs().clamp_min(ORACLE_ATOL)).max().item() + ) + combine_values_ok = bool(torch.allclose( + combined.float(), expected_combined, rtol=ORACLE_RTOL, atol=ORACLE_ATOL + )) + elif combined.shape == expected_combined.shape: + max_absolute_error = 0.0 + max_elementwise_relative_error = 0.0 + max_relative_error = 0.0 + combine_values_ok = True + else: + max_absolute_error = None + max_elementwise_relative_error = None + max_relative_error = None + combine_values_ok = False + tolerance = float(getattr(backend, "tolerance", ORACLE_RTOL)) + precision = _precision_evidence(backend, problem, view, combined, expected_combined) + checks = { + "combine_values": combine_values_ok, + "counts": counts_ok, + "metadata": metadata_ok, + "multiplicity": multiplicity_ok, + "payload": payload_ok, + "source_set": source_set_ok, + "weights": weights_ok, + } + return { + "_precision": precision, + "contract": contract, + "passed": bool( + all(checks.values()) + and precision["passed"] + and ordering_contract + and max_relative_error is not None + and max_relative_error < tolerance + ), + "atol": ORACLE_ATOL, + "combine_weight_semantics": backend.combine_weight_semantics, + "ordering_contract": ordering_contract, + "order_sha256": order_sha256, + "dispatch_sha256": dispatch_sha256, + "receive_count": receive_count, + "max_absolute_error": max_absolute_error, + "max_elementwise_relative_error": max_elementwise_relative_error, + "max_relative_error": max_relative_error, + "max_weight_error": max_weight_error, + "rtol": ORACLE_RTOL, + "checks": checks, + } + + +def _run_expert_oracle( + torch, + routing, + backend, + problem, + global_idx, + global_weights, + rank: int, + experts_per_rank: int, + seed: int, +): + """Verify one real dispatch/transform/combine without entering a timed region.""" + if getattr(backend, "oracle_layout", "token-rank") == "expert-packed": + return _run_expert_packed_oracle( + torch, + routing, + backend, + problem, + global_idx, + global_weights, + 
rank, + experts_per_rank, + seed, + ) + handle = backend.dispatch(problem) + torch.cuda.synchronize() + try: + view = backend.inspect_dispatch(problem, handle) + source_ids = routing.decode_source_ids(view.payload, seed) + except Exception as inspection_error: + try: + problem.recv_tokens = backend.recv_tokens(handle) + backend.stage(problem, handle) + backend.combine(problem, handle) + torch.cuda.synchronize() + except Exception as cleanup_error: + raise inspection_error from cleanup_error + return { + "_precision": _failed_precision_evidence(backend), + "contract": ORACLE_CONTRACT, + "passed": False, + "ordering_contract": "adapter-inspection-failed", + "order_sha256": None, + "dispatch_sha256": None, + "combine_weight_semantics": getattr( + backend, "combine_weight_semantics", "undeclared" + ), + "receive_count": 0, + "atol": ORACLE_ATOL, + "max_absolute_error": None, + "max_elementwise_relative_error": None, + "max_relative_error": None, + "max_weight_error": None, + "rtol": ORACLE_RTOL, + "checks": { + "combine_values": False, + "counts": False, + "metadata": False, + "multiplicity": False, + "payload": False, + "source_set": False, + "weights": False, + }, + } + + receive_count = int(view.payload.shape[0]) + shape_ok = ( + view.payload.ndim == 2 + and view.expert_ids.shape == (receive_count, problem.topk_idx.shape[1]) + and view.weights.shape == view.expert_ids.shape + ) + source_range = bool( + receive_count == 0 + or ((source_ids >= 0) & (source_ids < global_idx.shape[0])).all().item() + ) + if source_range: + expected_idx = global_idx.to(problem.x.device).index_select(0, source_ids) + expected_weights = global_weights.to(problem.x.device).index_select(0, source_ids) + local = (expected_idx // experts_per_rank) == rank + expected_ids = torch.where(local, expected_idx, torch.full_like(expected_idx, -1)) + expected_weights = expected_weights.masked_fill(~local, 0) + expected_payload = routing.activations_for_source_ids( + source_ids, problem.x.shape[1], seed, 
problem.x.dtype + ) + normalize_payload = getattr(backend, "oracle_dispatch_payload", None) + if normalize_payload is not None: + expected_payload = normalize_payload(expected_payload) + else: + expected_ids = torch.full_like(view.expert_ids, -1) + expected_weights = torch.zeros_like(view.weights) + expected_payload = torch.empty_like(view.payload) + actual_ids, actual_weights = _normalized_expert_metadata( + torch, view.expert_ids, view.weights + ) + expected_ids, expected_weights = _normalized_expert_metadata( + torch, expected_ids, expected_weights + ) + expected_sources = ( + ((global_idx // experts_per_rank) == rank).any(dim=1).nonzero(as_tuple=True)[0] + ).to(problem.x.device) + source_set_ok = ( + source_range + and source_ids.numel() == torch.unique(source_ids).numel() + and torch.equal(torch.sort(source_ids).values, expected_sources) + ) + payload_ok = source_range and torch.equal(view.payload, expected_payload) + metadata_ok = shape_ok and torch.equal(actual_ids, expected_ids) + max_weight_error = ( + float((actual_weights - expected_weights).abs().max().item()) + if actual_weights.numel() + else 0.0 + ) + weights_ok = max_weight_error == 0.0 + valid_expected = expected_ids >= 0 + expected_local = expected_ids[valid_expected] - rank * experts_per_rank + expected_counts = torch.bincount(expected_local, minlength=experts_per_rank) + counts_ok = torch.equal( + view.local_expert_counts.to(torch.int64), expected_counts.to(torch.int64) + ) + multiplicity_ok = torch.equal( + (actual_ids >= 0).sum(dim=1), (expected_ids >= 0).sum(dim=1) + ) + # Receive-slot assignment may use atomics and is not a semantic EP guarantee. Compare + # pre/post dispatch evidence in canonical source-token order without changing the native path. 
+ canonical_order = torch.argsort(source_ids.to(torch.int64), stable=True) + canonical_sources = source_ids.to(torch.int64).index_select(0, canonical_order) + canonical_ids = actual_ids.to(torch.int64).index_select(0, canonical_order) + canonical_weights = actual_weights.index_select(0, canonical_order) + ordering_contract = f"canonical-source-id-v1/{view.ordering_contract}" + order_sha256 = _tensor_sha256(canonical_sources) + dispatch_sha256 = _tensor_sha256( + canonical_sources, canonical_ids, canonical_weights + ) + + problem.recv_tokens = receive_count + combine_weight_semantics = backend.combine_weight_semantics + transformed = _expert_transform( + torch, view.payload, actual_ids, actual_weights, combine_weight_semantics + ) + combined = backend.combine_transformed(problem, handle, transformed) + torch.cuda.synchronize() + expected_combined = _expected_transformed_combine(torch, problem) + if combined.shape == expected_combined.shape and combined.numel(): + absolute_error = (combined.float() - expected_combined).abs() + max_absolute_error = float(absolute_error.max().item()) + max_relative_error = max_absolute_error / ( + float(expected_combined.abs().max().item()) + 1e-6 + ) + max_elementwise_relative_error = float( + (absolute_error / expected_combined.abs().clamp_min(ORACLE_ATOL)).max().item() + ) + combine_values_ok = bool(torch.allclose( + combined.float(), expected_combined, rtol=ORACLE_RTOL, atol=ORACLE_ATOL + )) + elif combined.shape == expected_combined.shape: + max_absolute_error = 0.0 + max_elementwise_relative_error = 0.0 + max_relative_error = 0.0 + combine_values_ok = True + else: + max_absolute_error = None + max_elementwise_relative_error = None + max_relative_error = None + combine_values_ok = False + tolerance = float(getattr(backend, "tolerance", 5e-2)) + precision = _precision_evidence(backend, problem, view, combined, expected_combined) + checks = { + "combine_values": combine_values_ok, + "counts": counts_ok, + "metadata": metadata_ok, + 
"multiplicity": multiplicity_ok, + "payload": payload_ok, + "source_set": source_set_ok, + "weights": weights_ok, + } + return { + "_precision": precision, + "contract": ORACLE_CONTRACT, + "passed": bool( + all(checks.values()) + and precision["passed"] + and ordering_contract + and max_relative_error is not None + and max_relative_error < tolerance + ), + "atol": ORACLE_ATOL, + "combine_weight_semantics": combine_weight_semantics, + "ordering_contract": ordering_contract, + "order_sha256": order_sha256, + "dispatch_sha256": dispatch_sha256, + "receive_count": receive_count, + "max_absolute_error": max_absolute_error, + "max_elementwise_relative_error": max_elementwise_relative_error, + "max_relative_error": max_relative_error, + "max_weight_error": max_weight_error, + "rtol": ORACLE_RTOL, + "checks": checks, + } + + +def _histogram(xs: list[float], nbins: int = 40) -> dict: + """Compact equal-width summary of the exact private cross-rank-max samples.""" + if not xs: + return {"n": 0} + lo, hi = min(xs), max(xs) + if hi <= lo: + return {"n": len(xs), "min": lo, "max": hi, "bins": nbins, "counts": [len(xs)]} + counts = [0] * nbins + span = hi - lo + for x in xs: + b = min(nbins - 1, int((x - lo) / span * nbins)) + counts[b] += 1 + return {"n": len(xs), "min": round(lo, 3), "max": round(hi, 3), "bins": nbins, "counts": counts} + + +def _derive_publication_status(v: dict) -> str: + """Classify raw attempts; only the isolated coverage publisher may promote evidence.""" + if v["execution_status"] != "complete": + return "failed" + if v["semantic_correctness"] != "pass" or v["measurement_conformance"] != "conformant" \ + or v["workload_identity"] == "inconsistent": + return "invalid" + # Per-case producers cannot prove exact matrix coverage, repeat stability, or controlled + # cohorts. Keep even sound attempts diagnostic until the isolated publisher validates them. 
+ return "diagnostic" + + +def run_sweep(args, backend, torch, dist, device, rank: int, world_size: int) -> int: + """Drive the source-tokens-per-rank sweep for one fully-specified line.""" + mode = getattr(args, "mode", "normal") + requested_precision = getattr(args, "precision_profile", "") or None + resolved_precision_id = requested_precision or identity.V1_CONTROL_PRECISION_PROFILE + try: + profile_case = {"mode": mode} + if requested_precision is not None: + profile_case["precision_profile"] = requested_precision + case_profile = identity.profile_for_case(profile_case) + communication_precision = identity.precision_profile(resolved_precision_id) + except identity.IdentityError as exc: + if rank == 0: + print(f"ERROR: {exc}") + return 2 + sampling_error = sampling_contract_error(args.iters, args.trials, args.warmup) + if sampling_error: + if rank == 0: + print(f"ERROR: {sampling_error}") + return 2 + import routing # torch-based; imported lazily so the module byte-compiles without torch + import eplb # stdlib planner + torch remap (the EPLB transform) + + ep_size = world_size + # EPLB (if on): run_ep.py already bumped args.experts to the PHYSICAL count and stashed the + # logical count, so experts_per_rank below is physical. The trace is built over LOGICAL + # experts then remapped to physical (build_trace), so the whole sweep runs over the + # balanced physical placement with no adapter change. 
+ eplb_on = getattr(args, "eplb", False) + num_logical = getattr(args, "num_logical_experts", args.experts) + if args.experts % ep_size != 0: + if rank == 0: + print(f"ERROR: experts ({args.experts}) must divide ep_size ({ep_size})") + return 2 + experts_per_rank = args.experts // ep_size + if getattr(backend, "mode", None) != mode: + if rank == 0: + print(f"ERROR: backend mode {getattr(backend, 'mode', None)!r} != {mode!r}") + return 2 + if ( + getattr(backend, "precision_profile_id", None) != resolved_precision_id + or getattr(backend, "communication_precision", None) != communication_precision + ): + if rank == 0: + print("ERROR: backend did not realize the requested communication precision") + return 2 + expected_weight_semantics = ( + "gate-weighted-sum" + if case_profile["combine_semantics"] == "gate-weighted" + else "unweighted-rank-sum" + ) + if getattr(backend, "combine_weight_semantics", None) != expected_weight_semantics: + if rank == 0: + print( + f"ERROR: {mode} requires combine semantics {expected_weight_semantics}" + ) + return 2 + if mode == LOW_LATENCY_MODE and ( + args.phase != "decode" + or getattr(backend, "oracle_layout", None) != "expert-packed" + or getattr(backend, "payload_unit", None) != "token-expert" + ): + if rank == 0: + print("ERROR: low-latency requires decode expert-packed token-expert execution") + return 2 + + cap = backend.buffer_cap(args) + conditioning_ladder = CONDITIONING_LADDERS[args.phase] + if cap is not None and cap < conditioning_ladder[-1]: + if rank == 0: + print(f"ERROR: {backend.name} buffer cap {cap} cannot run the v1 conditioning ladder") + return 2 + ladder, dropped = token_ladder(args.tokens_ladder, args.phase, cap) + if rank == 0 and dropped: + print(f"NOTE: dropped tokens/rank {dropped} — exceed {backend.name} buffer cap {cap} " + f"(hidden={args.hidden}); not silently truncated.") + if not ladder: + if rank == 0: + print(f"ERROR: empty token ladder (phase={args.phase}, cap={cap})") + return 2 + MAX, MIN, SUM = 
dist.ReduceOp.MAX, dist.ReduceOp.MIN, dist.ReduceOp.SUM + + # EPLB plan (once): estimate logical load from the global logical trace at the largest + # ladder T (most samples), then replicate+place. Held fixed across all T (as real EPLB + # plans from an observed load estimate). build_trace builds the LOGICAL trace and remaps + # to physical when the plan is present; otherwise it's the identity (logical == physical). + eplb_plan = None + eplb_calibration = None + if eplb_on: + calibration_id, calibration_checksums, calibration_rows, _ = ( + workload_contract.canonical_eplb_calibration_member( + args.routing, + args.hidden, + args.topk, + num_logical, + ep_size, + EPLB_REFERENCE_TOKENS_PER_RANK, + args.seed, + ) + ) + ref_idx = torch.tensor( + calibration_rows, + dtype=torch.int64, + ) + eplb_calibration = { + "token_offset": workload_contract.EPLB_CALIBRATION_TOKEN_OFFSET, + "trace_sha256": calibration_checksums["trace"], + "window": workload_contract.EPLB_CALIBRATION_WINDOW, + "workload_id": calibration_id, + } + if ref_idx.shape != ( + EPLB_REFERENCE_TOKENS_PER_RANK * ep_size, + args.topk, + ): + raise RuntimeError("EPLB calibration trace dimensions differ from the contract") + load = torch.bincount(ref_idx.reshape(-1), minlength=num_logical).float().tolist() + eplb_plan = eplb.build_plan(load, args.experts, ep_size) + if rank == 0: + print(f"NOTE: EPLB {num_logical}->{args.experts} experts ({ep_size}x{experts_per_rank}); " + f"per-rank load imbalance {eplb_plan['imbalance_before']:.2f}x -> " + f"{eplb_plan['imbalance_after']:.2f}x; {eplb_plan['replicated_experts']} experts " + f"replicated (hottest {eplb_plan['max_replicas']}x)") + + canonical = bool(getattr(args, "workload_dir", "")) + loaded_workload_ids, loaded_checksums = [], {} + if canonical: + import workload as _wl + + def build_trace(gt): + # canonical: load pre-serialized trace bytes (verified by checksum) so this run is + # provably the SAME workload as any other consuming the same files. 
else: seeded gen. + if canonical: + wid = _wl.compute_workload_id( + args.routing, args.hidden, args.topk, num_logical, ep_size, gt, args.seed + ) + idx_np, w_np, man = _wl.load_workload(os.path.join(args.workload_dir, f"{wid}.npz"), verify=True) + idx_l = torch.from_numpy(idx_np).to(torch.int64) + w = torch.from_numpy(w_np).to(torch.float32) + if wid not in loaded_workload_ids: + loaded_workload_ids.append(wid) + loaded_checksums[wid] = man.get("checksums") + else: + idx_l, w = routing.build_global_routing( + gt, num_logical, args.topk, args.routing, args.seed + ) + return (eplb.remap_idx(idx_l, eplb_plan) if eplb_plan is not None else idx_l), w + + # Fabric/clock warm-up BEFORE any timed point (review: H200 had an anomalous cold + # first point and a 40% decode-vs-prefill mismatch at the shared T=128). Gradually + # ramp through the small ladder shapes untimed — warms clocks/fabric for everyone + # and is also cold-jump-safe for MoRI. + def warm_roundtrips(problem, count): + for _ in range(count): + handle = backend.dispatch(problem) + if not hasattr(problem, "recv_tokens"): + # Dynamic receive cardinality is stable for this fixed routing trace. Cache it + # during untimed conditioning so adapters never read a device scalar in timing. + problem.recv_tokens = backend.recv_tokens(handle) + backend.stage(problem, handle) + backend.combine(problem, handle) + torch.cuda.synchronize() + + for wt in conditioning_ladder: + # Warm-only shapes need not have canonical manifests: they are never measured or emitted. 
+ wi, ww = routing.build_global_routing( + wt * ep_size, num_logical, args.topk, args.routing, args.seed, + ) + if eplb_plan is not None: + wi = eplb.remap_idx(wi, eplb_plan) + wsi, wsw = routing.rank_slice(wi, ww, rank, wt) + wx = routing.rank_activations(wt, args.hidden, args.seed, rank, device, torch.bfloat16) + wp = backend.make_problem(wt, wsi.to(device), wsw.to(device), wx) + warm_roundtrips(wp, CONDITIONING_ROUNDS_PER_SHAPE) + torch.cuda.synchronize() + dist.barrier() + # Setup may materialize deferred provenance such as DeepEP V2 JIT CUBINs. + # Resolve it after conditioning but before correctness or timed measurements. + capture_deferred_provenance = getattr(backend, "capture_deferred_provenance", None) + if capture_deferred_provenance is not None: + capture_deferred_provenance() + provenance_issues = contracts.backend_provenance_issues( + backend.name, backend.backend_provenance + ) + if provenance_issues: + if rank == 0: + print( + f"ERROR: unpinned provenance {provenance_issues} " + f"in {backend.backend_provenance}" + ) + return 4 + # ---- Pass 1: build each deterministic problem and run the expert oracle. 
---- + problems, gate, gts, global_traces, input_snapshots = {}, {}, {}, {}, {} + routing_hashes = set() + for T in ladder: + counts = [T] * ep_size + gt = T * ep_size + gts[T] = gt + idx_g, w_g = build_trace(gt) + rstats = routing.routing_stats(idx_g, args.experts, experts_per_rank, weights=w_g) + gpn = args.gpus_per_node or ep_size + rstats["locality"] = routing.routing_locality(idx_g, experts_per_rank, ep_size, max(1, T), + gpn, args.scale_up_domain or None) + rstats["source_token_stats"] = _stats_vec(counts) + routing_hashes.add(rstats["routing_hash"]) + my_off, my_cnt = rank * T, T + idx_s = idx_g[my_off:my_off + my_cnt].contiguous() + w_s = w_g[my_off:my_off + my_cnt].contiguous() + x = routing.rank_activations(my_cnt, args.hidden, args.seed, rank, device, torch.bfloat16) + problem = backend.make_problem(my_cnt, idx_s.to(device), w_s.to(device), x) + input_snapshots[T] = ( + problem.x.clone(), problem.topk_idx.clone(), problem.topk_weights.clone() + ) + oracle = _run_expert_oracle( + torch, routing, backend, problem, idx_g, w_g, rank, experts_per_rank, + args.seed, + ) + precision_pre = oracle.pop("_precision") + before_x, before_idx, before_weights = input_snapshots[T] + pre_input_unchanged = ( + torch.equal(problem.x, before_x) + and torch.equal(problem.topk_idx, before_idx) + and torch.equal(problem.topk_weights, before_weights) + ) + problems[T] = problem + global_traces[T] = (idx_g, w_g) + gate[T] = { + "rstats": rstats, + "recv_local": oracle["receive_count"], + "max_rel": oracle["max_relative_error"] or 0.0, + "local_ok": int(oracle["passed"]), + "oracle_pre": oracle, + "precision_pre": precision_pre, + "pre_input_unchanged": pre_input_unchanged, + } + + # ---- Pass 2: every backend uses the same ascending point order and conditioning ramp. + # Per-iteration cross-rank MAX samples are pooled across trials. 
---- + disp_pool = {T: [] for T in ladder} # pooled per-iteration cross-rank MAX (dispatch) + stage_pool = {T: [] for T in ladder} # measured only when stage launches device work + comb_pool = {T: [] for T in ladder} # ... combine + rt_pool = {T: [] for T in ladder} # independently measured round trip + disp_trials = {T: [] for T in ladder} + stage_trials = {T: [] for T in ladder} + comb_trials = {T: [] for T in ladder} + rt_trials = {T: [] for T in ladder} + stage_device_work = getattr(backend, "stage_device_work", False) + if type(stage_device_work) is not bool: + raise ValueError("backend.stage_device_work must be a boolean") + order_identity = args.case_id or _sha256_json({ + "backend": backend.name, + "ep_size": ep_size, + "mode": mode, + "phase": args.phase, + "precision_profile": resolved_precision_id, + "runner": args.runner, + "suite": args.suite, + }) + observed_component_orders = [] + for trial_index in range(args.trials): + order = qualification_order( + list(ladder), args.qualification_index, trial_index, + identity_key=f"{order_identity}:tokens", + ) + for T in order: + problem = problems[T] + # Stateful paired APIs may expose only a measured round trip. + # Do not synthesize component latency from that measurement. 
+ roundtrip_only = getattr(backend, "roundtrip_only", False) + + def rt_once(p=problem): + hh = backend.dispatch(p) + backend.stage(p, hh) + return backend.combine(p, hh) + + available_components = ["roundtrip"] + if not roundtrip_only: + available_components.extend(["dispatch", "combine"]) + if stage_device_work: + available_components.append("stage") + component_order = qualification_order( + available_components, + args.qualification_index, + trial_index, + identity_key=f"{order_identity}:components:{T}", + ) + observed_component_orders.append({ + "components": component_order, + "tokens_per_rank": T, + "trial_index": trial_index, + }) + measured = {name: [] for name in ("dispatch", "stage", "combine", "roundtrip")} + + def prep_stage(p=problem): + return backend.dispatch(p) + + def prep_combine(p=problem): + hh = backend.dispatch(p) + backend.stage(p, hh) + return hh + + for component_name in component_order: + # Every measured component receives the same 32 synchronized full-roundtrip + # warmups immediately before its timed trial. 
+ warm_roundtrips(problem, args.warmup) + if component_name == "roundtrip": + measured[component_name] = time_us( + torch, lambda p=problem: rt_once(p), 0, args.iters + ) + elif component_name == "dispatch": + measured[component_name] = time_us( + torch, lambda p=problem: backend.dispatch(p), 0, args.iters + ) + elif component_name == "stage": + measured[component_name] = time_us( + torch, + lambda hh, p=problem: backend.stage(p, hh), + 0, + args.iters, + pre=prep_stage, + ) + elif component_name == "combine": + if backend.combine_needs_redispatch: + measured[component_name] = time_us( + torch, + lambda hh, p=problem: backend.combine(p, hh), + 0, + args.iters, + pre=prep_combine, + ) + else: + hh = prep_combine() + torch.cuda.synchronize() + measured[component_name] = time_us( + torch, + lambda p=problem, hx=hh: backend.combine(p, hx), + 0, + args.iters, + ) + else: # pragma: no cover - generated from the fixed list above + raise RuntimeError(f"unknown timed component {component_name!r}") + disp_iters = measured["dispatch"] + stage_iters = measured["stage"] + comb_iters = measured["combine"] + rt_iters = measured["roundtrip"] + # per-iteration cross-rank MAX (the distributed-op latency per iter), pooled. + if disp_iters: + reduced_dispatch = _reduce_vec(torch, dist, device, disp_iters, MAX) + reduced_combine = _reduce_vec(torch, dist, device, comb_iters, MAX) + disp_trials[T].append(reduced_dispatch) + comb_trials[T].append(reduced_combine) + disp_pool[T] += reduced_dispatch + comb_pool[T] += reduced_combine + if stage_iters: + reduced_stage = _reduce_vec(torch, dist, device, stage_iters, MAX) + stage_trials[T].append(reduced_stage) + stage_pool[T] += reduced_stage + reduced_roundtrip = _reduce_vec(torch, dist, device, rt_iters, MAX) + rt_trials[T].append(reduced_roundtrip) + rt_pool[T] += reduced_roundtrip + + # ---- Pass 3: prove timed inputs were immutable and repeat the full oracle. 
---- + for T in ladder: + problem = problems[T] + before_x, before_idx, before_weights = input_snapshots[T] + input_unchanged = gate[T]["pre_input_unchanged"] and ( + torch.equal(problem.x, before_x) + and torch.equal(problem.topk_idx, before_idx) + and torch.equal(problem.topk_weights, before_weights) + ) + idx_g, w_g = global_traces[T] + post = _run_expert_oracle( + torch, routing, backend, problem, idx_g, w_g, rank, experts_per_rank, + args.seed, + ) + precision_post = post.pop("_precision") + pre = gate[T]["oracle_pre"] + order_stable = ( + pre["ordering_contract"] == post["ordering_contract"] + and pre["order_sha256"] == post["order_sha256"] + and pre["dispatch_sha256"] == post["dispatch_sha256"] + ) + gate[T].update({ + "input_unchanged": input_unchanged, + "local_ok": int(pre["passed"] and post["passed"] and input_unchanged and order_stable), + "max_rel": max(pre["max_relative_error"] or 0.0, post["max_relative_error"] or 0.0), + "oracle_post": post, + "precision_post": precision_post, + "order_stable": order_stable, + }) + + # ---- Pass 4: percentiles (p50/p90/p95/p99, nearest-rank) from pooled samples + bytes + row ---- + def pcts(xs): + return ({"p50": percentile(xs, 50), "p90": percentile(xs, 90), + "p95": percentile(xs, 95), "p99": percentile(xs, 99)} if xs else None) + + def component(percentiles, count, *, derived=False): + if percentiles is None: + return {"availability": "unavailable", "origin": None, + "percentiles_us": None, "sample_count": 0} + return { + "availability": "derived" if derived else "measured", + "origin": "derived-percentile-sum" if derived else "measured", + "percentiles_us": percentiles, + "sample_count": 0 if derived else count, + } + rows = [] + all_anomalies = [] + thr_rt = 3.0 + for T in ladder: + gt = gts[T] + g = gate[T] + rstats = g["rstats"] + d, s, c, rt = disp_pool[T], stage_pool[T], comb_pool[T], rt_pool[T] + dp, sp, cp, rtp = pcts(d), pcts(s), pcts(c), pcts(rt) + # isolated_sum = SUM of the isolated 
dispatch+stage+combine percentiles. Stage contributes + # zero when it is explicitly not applicable. This is NOT a measured chained operation + # (can't reveal shared sync / launch amortization / overlap) — do NOT use for throughput + # or SLO capacity. The MEASURED round trip (rtp) is the real chained latency. + isum = ( + {key: dp[key] + (sp[key] if sp is not None else 0.0) + cp[key] for key in dp} + if dp and cp else None + ) + recv_total = _reduce_int(torch, dist, device, g["recv_local"], SUM) + recv_max = _reduce_int(torch, dist, device, g["recv_local"], MAX) + recv_min = _reduce_int(torch, dist, device, g["recv_local"], MIN) + global_ok = _reduce_int(torch, dist, device, g["local_ok"], MIN) + max_rel = _reduce_vec(torch, dist, device, [g["max_rel"]], MAX)[0] + point_ok = bool(global_ok) and recv_total > 0 + rank_evidence = [None] * world_size + dist.all_gather_object( + rank_evidence, + { + "input_unchanged": g["input_unchanged"], + "order_stable": g["order_stable"], + "post_timing": g["oracle_post"], + "pre_timing": g["oracle_pre"], + "rank": rank, + }, + ) + precision_rank_evidence = [None] * world_size + dist.all_gather_object( + precision_rank_evidence, + {"pre": g["precision_pre"], "post": g["precision_post"]}, + ) + precision_evidence = aggregate_precision_evidence(precision_rank_evidence) + # Canonical LOGICAL payload byte contracts (from the routing trace, NOT backend recv + # tensors): token-rank = one copy per unique (token,dest-rank); token-expert = one copy + # per routed (token,expert). routed_copies = token-rank copies; gt*topk = token-expert. 
+ token_rank_copies = rstats["routed_copies"] + logical_copies = ( + sum(rstats["expert_assignments_per_rank"]) + if case_profile["payload_unit"] == "token-expert" + else token_rank_copies + ) + H = args.hidden + throughput = { + percentile_name: gt / (latency_us * 1e-6) + for percentile_name, latency_us in rtp.items() + } + dispatch_bytes = precision_byte_provenance( + communication_precision["dispatch"], logical_copies, H + ) + combine_bytes = precision_byte_provenance( + communication_precision["combine"], logical_copies, H + ) + stage_bytes = { + "accounting_contract": "activation-data-plus-scales-v1", + "activation_data_bytes": 0, + "scale_bytes": 0, + "total_logical_bytes": 0, + } + roundtrip_bytes = { + "accounting_contract": "activation-data-plus-scales-v1", + **{ + field: dispatch_bytes[field] + combine_bytes[field] + for field in ( + "activation_data_bytes", "scale_bytes", "total_logical_bytes" + ) + }, + } + # Contract-level anomalies are attached to the row and rolled into validity. + # roundtrip_gt_isolated_sum: measured RT p99 >> Σ(isolated dispatch+combine) p99. + # roundtrip_lt_component_floor: measured RT p50 < max(dispatch,combine) p50 — a chained + # op can't finish faster than its slowest required component (sync semantics violated). 
+ row_anoms = [] + if isum and isum["p99"] > 0 and rtp["p99"] > thr_rt * isum["p99"]: + row_anoms.append({"type": "roundtrip_gt_isolated_sum", "T": T, + "roundtrip_p99": round(rtp["p99"], 2), "isolated_sum_p99": round(isum["p99"], 2), + "ratio": round(rtp["p99"] / isum["p99"], 2), "threshold": thr_rt}) + floor = ( + max(dp["p50"], cp["p50"], sp["p50"] if sp is not None else 0.0) + if dp and cp else None + ) + if floor and rtp["p50"] > 0 and rtp["p50"] < 0.95 * floor: + row_anoms.append({"type": "roundtrip_lt_component_floor", "T": T, + "roundtrip_p50": round(rtp["p50"], 2), "component_floor_p50": round(floor, 2)}) + all_anomalies.extend(row_anoms) + rows.append({ + "anomalies": row_anoms, + "components": { + "combine": component(cp, len(c)), + "dispatch": component(dp, len(d)), + "isolated_sum": component(isum, 0, derived=True), + "roundtrip": component(rtp, len(rt)), + "stage": component(sp, len(s)), + }, + "correctness": { + "contract": case_profile["oracle_contract"], + "max_relative_error": max_rel, + "passed": point_ok, + "precision": precision_evidence, + "rank_evidence": rank_evidence, + "scope": case_profile["correctness_scope"], + }, + "global_tokens": gt, + "byte_provenance": { + "combine": combine_bytes, + "dispatch": dispatch_bytes, + "roundtrip": roundtrip_bytes, + "stage": stage_bytes, + }, + "receive": { + "max": recv_max, + "mean": recv_total / world_size, + "min": recv_min, + "total": recv_total, + }, + "routing": { + "empty_expert_count": rstats["empty_expert_count"], + "empty_rank_count": rstats["empty_rank_count"], + "expert_assignment_rank_cv": rstats["expert_assignment_rank_cv"], + "expert_assignments_per_rank": rstats["expert_assignments_per_rank"], + "expert_load_cv": rstats["expert_load_cv"], + "expert_load_max": rstats["expert_load_max"], + "expert_load_mean": rstats["expert_load_mean"], + "expert_load_min": rstats["expert_load_min"], + "fanout_histogram": rstats["fanout_hist"], + "fanout_max": rstats["fanout_max"], + "fanout_mean": 
rstats["fanout_mean"], + "fanout_min": rstats["fanout_min"], + "hash": rstats["routing_hash"], + "hotspot_ratio": rstats["hotspot_ratio"], + "locality": rstats.get("locality"), + "payload_copies_per_rank": rstats["payload_copies_per_rank"], + "payload_rank_cv": rstats["payload_rank_cv"], + "routed_copies": rstats["routed_copies"], + "source_token_stats": rstats.get("source_token_stats"), + }, + "sample_histograms": { + "dispatch": _histogram(d) if d else None, + "stage": _histogram(s) if s else None, + "combine": _histogram(c) if c else None, + "roundtrip": _histogram(rt), + }, + "token_rate_at_latency_percentile": throughput, + "tokens_per_rank": T, + }) + if rank == 0: + component_log = (f"disp p50/p99={dp['p50']:7.1f}/{dp['p99']:7.1f} " + f"comb {cp['p50']:6.1f}/{cp['p99']:6.1f} " if dp and cp + else "components=unavailable ") + print(f" T={T:<5} {component_log}" + f"RT p50/p99={rtp['p50']:7.1f}/{rtp['p99']:7.1f}us n={len(rt)} fanout={rstats['fanout_mean']:.2f} " + f"recv[min/mean/max]={recv_min}/{recv_total // world_size}/{recv_max} " + f"correct={point_ok}") + + # Cross-rank workload-identity proof: every rank must have built the SAME global routing + # (one hash per T here); confirm all ranks agree by hashing the per-T hash set and + # MIN/MAX-reducing it — a mismatch means NVIDIA and AMD did NOT run identical routing. + trace_sig = hashlib.sha256("|".join(sorted(routing_hashes)).encode()).hexdigest() + routing_consistent = _same_hash_across_ranks(torch, dist, device, trace_sig) + + # Capture again after correctness and timing so no lazily generated kernel can escape + # the implementation identity recorded in the artifact. + if capture_deferred_provenance is not None: + capture_deferred_provenance() + + if rank != 0: + return 0 + + # status=valid requires correctness AND a proven-identical routing trace across ranks. 
+ all_ok = bool(rows) and all(r["correctness"]["passed"] for r in rows) and routing_consistent + + # Adapters never self-label official; status is derived from these gates. + prov = backend.backend_provenance + allocation_stratum_sha256 = getattr(args, "allocation_stratum_sha256", None) + provenance_complete = contracts.provenance_complete( + prov, + backend.name, + getattr(args, "git_run", None), + allocation_stratum_sha256=allocation_stratum_sha256, + image_digest=getattr(args, "image_digest", None), + image_verified=getattr(args, "image_digest_verified", False), + squash_sha256=getattr(args, "squash_sha256", None), + ) + resource_profile = contracts.project_resource_profile(prov) + resource_conformance = resource_profile["conformance_class"] + # record the canonical workload identity consumed (one trace per T -> set of ids/checksums). + if canonical and loaded_workload_ids: + args.workload_id = identity.workload_id( + { + "members": [ + {"checksums": loaded_checksums[member], "workload_id": member} + for member in sorted(loaded_workload_ids) + ] + } + ) + args.workload_members = sorted(loaded_workload_ids) + args.workload_checksums = loaded_checksums + canonical_workload = bool(getattr(args, "workload_id", None)) + activation_identity = workload_contract.compute_activation_identity(args.seed, args.hidden) + # EPLB identity covers replica placement, not only counts. 
+ eplb_mapping_hash = None + if eplb_plan is not None: + eplb_mapping_hash = eplb.mapping_hash(eplb_plan) + anomaly_free = len(all_anomalies) == 0 + validity = { + "execution_status": "complete" if rows else "failed", + "semantic_correctness": ( + "pass" if rows and all(r["correctness"]["passed"] for r in rows) else "fail" + ), + "workload_identity": "consistent-across-ranks" if routing_consistent else "inconsistent", + "workload_source": "canonical-serialized" if canonical_workload else "seeded-runtime", + "measurement_conformance": "conformant", # run_ep gate rejects nonconformant pre-run + "sampling_conformance": "conformant", # fixed-512-v1 gate rejects any other profile + "resource_conformance": resource_conformance, + "provenance_complete": provenance_complete, + # anomaly-free unless a contract-level timing anomaly fired (then diagnostic, see above). + "anomaly_free": anomaly_free, + } + publication_status = _derive_publication_status(validity) + + shape = { # FIXED line identity (no T, no per-backend resource knobs) + "hidden": args.hidden, "topk": args.topk, "experts": args.experts, + "experts_per_rank": experts_per_rank, + "precision_profile": resolved_precision_id, + "dispatch_precision": communication_precision["dispatch"], + "combine_precision": communication_precision["combine"], + "routing": args.routing, "eplb": bool(eplb_plan), "num_logical_experts": num_logical, + # V2 is reserved for the PR #605 ElasticBuffer adapter; package versions never imply it. 
+ "kernel_gen": kernel_generation(backend), + "activation_profile": ACTIVATION_PROFILE, + } + generated_at = args.timestamp or _dt.datetime.now().astimezone().isoformat() + realized_placement = getattr(args, "realized_placement", None) + nodes = ( + realized_placement["nodes"] + if realized_placement is not None + else int(os.environ.get("SLURM_NNODES", "1")) + ) + scheduled_case = { + "backend": backend.name, + "canonical": canonical, + "eplb": bool(eplb_plan), + "ep": ep_size, + "experts": num_logical, + "gpus_per_node": args.gpus_per_node or ep_size, + "hidden": args.hidden, + "ladder": " ".join(map(str, ladder)), + "mode": mode, + "nodes": nodes, + "phase": args.phase, + "required_publication": args.required_publication or "diagnostic", + "routing": args.routing, + "samples_per_point": TIMED_SAMPLES_PER_POINT, + "scale_up_domain": args.scale_up_domain or (args.gpus_per_node or ep_size), + "scale_up_transport": args.scale_up_transport, + "scale_out_transport": args.scale_out_transport or None, + "scope": args.scope, + "suite": args.suite or "manual", + "timing": f"{args.iters}:{args.trials}:{args.warmup}", + "topk": args.topk, + "topology_class": args.topology_class, + "transport": args.transport, + "warmup_semantics": WARMUP_SEMANTICS, + "workload": args.workload_name or "manual", + } + if requested_precision is not None: + scheduled_case["precision_profile"] = requested_precision + case_factors = { + "case": scheduled_case, + "profile": case_profile, + "sku": args.runner, + } + computed_case_id = identity.digest("case", case_factors) + if args.case_id and args.case_id != computed_case_id: + raise ValueError( + f"scheduled case ID does not match realized factors: {args.case_id} != {computed_case_id}" + ) + case_identifier = args.case_id or computed_case_id + git_run = getattr(args, "git_run", None) or {} + allocation_factors = { + "artifact": git_run.get("artifact"), + "execution_id": getattr(args, "allocation_execution_id", None), + "job": git_run.get("job"), 
+ "qualification_index": args.qualification_index, + "repo": git_run.get("repo"), + "run_attempt": git_run.get("run_attempt"), + "run_id": git_run.get("run_id"), + "runner": args.runner, + "source_sha": git_run.get("source_sha"), + } + allocation_identifier = identity.allocation_id(allocation_factors) + try: + attempt_ordinal = int(os.environ.get("CX_ATTEMPT_ID", "1")) + except ValueError: + attempt_ordinal = 0 + if attempt_ordinal <= 0: + raise ValueError("CX_ATTEMPT_ID must be a positive integer") + attempt_identifier = identity.attempt_id( + allocation=allocation_identifier, case=case_identifier, ordinal=attempt_ordinal + ) + runtime_fingerprint = getattr(args, "runtime_fingerprint", None) or {} + implementation_contract = { + "kernel_generation": kernel_generation(backend), + "name": backend.name, + "provenance": _series_provenance(backend.backend_provenance), + "resource_profile": resource_profile, + } + public_config = contracts.public_series_config( + kernel_generation=implementation_contract["kernel_generation"], + provenance=backend.backend_provenance, + resource_profile=resource_profile, + resource_mode=args.resource_mode, + device_product=getattr(args, "runtime_device_product", None), + ) + series_factors = { + "backend": backend.name, + "implementation_contract_sha256": _sha256_json(implementation_contract), + "public_config_sha256": contracts.public_series_config_sha256(public_config), + "routing_control_sha256": contracts.routing_implementation_control_sha256( + implementation_contract + ), + "case_id": case_identifier, + "image_digest": getattr(args, "image_digest", None), + "runtime_fingerprint_sha256": _sha256_json(runtime_fingerprint), + "source_sha": git_run.get("source_sha"), + "squash_sha256": getattr(args, "squash_sha256", None), + "workload_id": getattr(args, "workload_id", None) or trace_sig, + } + series_identifier = identity.series_id(series_factors) + + sample_points = [] + for row in rows: + token_count = row["tokens_per_rank"] + + 
sample_point = { + "components": { + "combine": sampled_component_evidence(comb_trials[token_count]), + "dispatch": sampled_component_evidence(disp_trials[token_count]), + "roundtrip": sampled_component_evidence(rt_trials[token_count]), + "stage": sampled_component_evidence(stage_trials[token_count]), + }, + "tokens_per_rank": token_count, + } + sample_sha256 = _sha256_json(sample_point) + point_identifier = identity.point_id( + series=series_identifier, tokens_per_rank=token_count + ) + evidence_identifier = identity.evidence_id( + point=point_identifier, + allocation=allocation_identifier, + attempt=attempt_identifier, + sample_sha256=sample_sha256, + ) + sample_point.update( + { + "evidence_id": evidence_identifier, + "point_id": point_identifier, + "sample_sha256": sample_sha256, + } + ) + sample_points.append(sample_point) + row.update({ + "evidence_id": evidence_identifier, + "point_id": point_identifier, + "sample_sha256": sample_sha256, + }) + + samples_path = args.out[:-5] + ".samples.json" if args.out.endswith(".json") else args.out + ".samples.json" + samples_document = { + "allocation_id": allocation_identifier, + "attempt_id": attempt_identifier, + "case_id": case_identifier, + "format": "collectivex.samples.v1", + "points": sample_points, + "qualification_index": args.qualification_index, + "sampling": { + "iterations_per_trial": args.iters, + "reduction": case_profile["rank_reduction"], + "trials": args.trials, + }, + "schema_version": 1, + "series_id": series_identifier, + } + samples_payload = contracts.canonical_json_bytes(samples_document) + samples_sha256 = hashlib.sha256(samples_payload).hexdigest() + samples_bytes = len(samples_payload) + sample_artifact = { + "bytes": samples_bytes, + "format": "collectivex.samples.v1", + "path": os.path.basename(samples_path), + "sha256": samples_sha256, + } + headline = next((r for r in rows if r["tokens_per_rank"] == 64), rows[len(rows) // 2]) + eplb_record = ( + { + "calibration_token_offset": 
eplb_calibration["token_offset"], + "calibration_trace_sha256": eplb_calibration["trace_sha256"], + "calibration_window": eplb_calibration["window"], + "calibration_workload_id": eplb_calibration["workload_id"], + "enabled": True, + "imbalance_after": eplb_plan["imbalance_after"], + "imbalance_before": eplb_plan["imbalance_before"], + "mapping_hash": eplb_mapping_hash, + "max_replicas": eplb_plan["max_replicas"], + "num_logical_experts": num_logical, + "num_physical_experts": args.experts, + "num_redundant": args.experts - num_logical, + "planner": EPLB_PLANNER, + "reference_tokens_per_rank": EPLB_REFERENCE_TOKENS_PER_RANK, + "replicated_experts": eplb_plan["replicated_experts"], + } + if eplb_plan + else { + "calibration_token_offset": None, + "calibration_trace_sha256": None, + "calibration_window": None, + "calibration_workload_id": None, + "enabled": False, + "imbalance_after": None, + "imbalance_before": None, + "mapping_hash": None, + "max_replicas": None, + "num_logical_experts": num_logical, + "num_physical_experts": args.experts, + "num_redundant": 0, + "planner": None, + "reference_tokens_per_rank": None, + "replicated_experts": 0, + } + ) + doc = { + "format": "collectivex.ep.v1", + "schema_version": SCHEMA_VERSION, + "record_type": "case-attempt", + "generated_at": generated_at, + "identity": { + "allocation_factors": allocation_factors, + "allocation_id": allocation_identifier, + "attempt_id": attempt_identifier, + "attempt_ordinal": attempt_ordinal, + "case_factors": case_factors, + "case_id": case_identifier, + "series_factors": series_factors, + "series_id": series_identifier, + }, + "case": { + "attempt_ordinal": attempt_ordinal, + "backend": backend.name, + "eplb": eplb_record, + "ep_size": ep_size, + "mode": mode, + "phase": args.phase, + "required_publication": args.required_publication or "diagnostic", + "resource_mode": "fixed-profile", + "runner": args.runner, + "shape": shape, + "suite": args.suite or "manual", + "workload_name": 
args.workload_name or "manual", + }, + "workload": { + "activation_generator": ACTIVATION_GENERATOR, + "activation_identity": activation_identity, + "activation_profile": ACTIVATION_PROFILE, + "cross_rank_consistent": routing_consistent, + "manifest_checksums": getattr(args, "workload_checksums", None), + "members": getattr(args, "workload_members", None), + "routing_generator": ROUTING_GENERATOR, + "source": validity["workload_source"], + "trace_hashes": sorted(routing_hashes), + "trace_signature": trace_sig, + "workload_id": getattr(args, "workload_id", None), + }, + "measurement": { + "component_order_contract": case_profile["component_order_contract"], + "conditioning": { + "contract": CONDITIONING_CONTRACT, + "ladder": conditioning_ladder, + "roundtrips_per_shape": CONDITIONING_ROUNDS_PER_SHAPE, + }, + "contract": case_profile["contract"], + "execution_order_sha256": _sha256_json(observed_component_orders), + "qualification_index": args.qualification_index, + "rows": rows, + "sampling": { + "contract": SAMPLING_CONTRACT, + "iterations_per_trial": args.iters, + "percentile_method": case_profile["percentile_method"], + "reduction": case_profile["rank_reduction"], + "samples_per_component": TIMED_SAMPLES_PER_POINT, + "trials": args.trials, + "warmup_iterations": args.warmup, + "warmup_semantics": WARMUP_SEMANTICS, + }, + "source_allocation": "even", + }, + "implementation": { + "kernel_generation": kernel_generation(backend), + "name": backend.name, + "provenance": backend.backend_provenance, + "resource_profile": resource_profile, + }, + "topology": { + "device_count": getattr(args, "runtime_device_count", None), + "device_product": getattr(args, "runtime_device_product", None), + "gpus_per_node": args.gpus_per_node or ep_size, + "nodes": nodes, + "placement": "packed", + "realized_placement": realized_placement, + "scale_up_domain": args.scale_up_domain or (args.gpus_per_node or ep_size), + "scale_up_transport": args.scale_up_transport, + "scale_out_transport": 
args.scale_out_transport or None, + "scope": args.scope, + "topology_class": args.topology_class, + "transport": args.transport, + "world_size": world_size, + }, + "runtime_fingerprint": runtime_fingerprint, + "provenance": { + "allocation_stratum_sha256": allocation_stratum_sha256, + "command": getattr(args, "reproduction_command", ""), + "distributed_launcher": getattr(args, "distributed_launcher", None), + "git_run": getattr(args, "git_run", None), + "image": { + "arch": getattr(args, "image_arch", None), + "digest": getattr(args, "image_digest", "") or None, + "digest_verified": getattr(args, "image_digest_verified", False), + "reference": getattr(args, "image", "") or None, + "squash_sha256": getattr(args, "squash_sha256", None), + }, + "redaction": "sanitized-v1", + }, + "sample_artifact": sample_artifact, + "outcome": { + "publication_status": publication_status, + "reasons": [] if all_ok else ["semantic correctness or routing identity failed"], + "status": "success" if all_ok else "invalid", + "validity": validity, + }, + } + contracts.validate_raw_document(doc, samples_document) + _write_bytes_atomic(samples_path, samples_payload) + _write_json_atomic(args.out, doc) + dispatch_percentiles = headline["components"]["dispatch"]["percentiles_us"] + dispatch_p99 = dispatch_percentiles["p99"] if dispatch_percentiles else None + component_summary = (f"disp_p99={dispatch_p99:.1f}us " + if dispatch_p99 is not None + else "components=unavailable ") + print(f"{backend.name} ep-dispatch-combine [{args.phase}/{mode}/{case_profile['contract']}]: " + f"status={doc['outcome']['status']} {len(rows)} pts, routing_consistent={routing_consistent}, " + f"headline T={headline['tokens_per_rank']} {component_summary}" + f"-> {args.out}") + # A complete invalid document is still a successfully captured terminal outcome. Launchers + # inspect its status to fail the case without conflating it with an execution failure. 
def _project_local_metadata(torch_module, raw_expert_ids, raw_weights, rank, experts_per_rank):
    """Mask routing metadata down to the experts owned by ``rank``.

    Returns a 3-tuple:
      * expert ids with every non-local entry replaced by -1,
      * weights with every non-local entry replaced by 0,
      * a flat tensor of the local expert ids rebased so that the first
        expert of ``rank`` is 0.
    """
    # Experts in [local_start, local_start + experts_per_rank) belong to this rank.
    local_start = rank * experts_per_rank
    local = (raw_expert_ids >= local_start) & (
        raw_expert_ids < local_start + experts_per_rank
    )
    expert_ids = torch_module.where(
        local, raw_expert_ids, torch_module.full_like(raw_expert_ids, -1)
    )
    weights = torch_module.where(local, raw_weights, torch_module.zeros_like(raw_weights))
    # Boolean indexing flattens the selection, so the third value is 1-D.
    return expert_ids, weights, raw_expert_ids[local] - local_start


def _mori_source_commit() -> str:
    """Return the 40-hex commit recorded in the ``.git/HEAD`` above the mori package.

    Walks the parent directories of the imported ``mori`` module and inspects
    the first regular, non-symlink ``.git/HEAD`` of at most 128 bytes.

    Raises:
        RuntimeError: if that HEAD does not hold a bare 40-hex commit id, or
            if no qualifying HEAD file is found in any parent directory.
    """
    module_path = Path(mori.__file__).resolve()
    for root in module_path.parents:
        head = root / ".git" / "HEAD"
        # Symlinks and oversized files are skipped rather than read.
        if not head.is_symlink() and head.is_file() and head.stat().st_size <= 128:
            value = head.read_text(encoding="ascii").strip()
            if re.fullmatch(r"[0-9a-f]{40}", value):
                return value
            # Anything other than a bare commit id (e.g. a "ref: ..." line) is rejected.
            raise RuntimeError("MoRI image source is not pinned to a detached commit")
    raise RuntimeError("MoRI image source revision is unavailable")


class MoRIBackend:
    """CollectiveX adapter around ``mori.ops.EpDispatchCombineOp``.

    The constructor validates the precision profile, GPU architecture and
    placement against fixed expectations, builds the MoRI op with an explicit
    launch configuration, and records that configuration in
    ``backend_provenance``.
    """

    name = "mori"
    # Class-level default; __init__ overwrites it per instance.
    stage_device_work = False
    combine_needs_redispatch = True
    combine_weight_semantics = "unweighted-rank-sum"

    def __init__(self, args, rank, world_size, local_rank, device):
        """Validate the requested run and construct the MoRI dispatch/combine op.

        ``local_rank`` is accepted but not read by this constructor.

        Raises:
            ep_precision.PrecisionError: unknown runner, FP8 format or GPU
                architecture mismatch, missing direct-cast enum, missing torch
                FP8 dtype, or hidden size not divisible by 128 for FP8 dispatch.
            RuntimeError: placement/scope/transport mismatch, unknown or
                incompatible ``CX_MORI_KERNEL_TYPE``, too few realized QPs,
                unrealized configuration, or source-commit mismatch.
        """
        self.args = args
        self.rank = rank
        self.world_size = world_size
        self.device = device
        self.mode = "normal"
        # --- Precision profile: the runner prefix selects the FP8 flavour. ---
        runner = str(getattr(args, "runner", ""))
        if runner.startswith("mi355x"):
            fp8_format = "fp8-e4m3fn"
            supported_profiles = {
                "d-bf16.c-bf16",
                "d-fp8-e4m3fn-b128-f32-prequantized.c-bf16",
            }
            # Direct-cast FP8 combine profiles are only offered at world_size 8.
            if world_size == 8:
                supported_profiles.update({
                    "d-bf16.c-fp8-e4m3fn-direct-cast-noscale",
                    "d-fp8-e4m3fn-b128-f32-prequantized.c-fp8-e4m3fn-direct-cast-noscale",
                })
        elif runner.startswith("mi325x"):
            fp8_format = "fp8-e4m3fnuz"
            supported_profiles = {
                "d-bf16.c-bf16",
                "d-fp8-e4m3fnuz-b128-f32-prequantized.c-bf16",
            }
            if world_size == 8:
                supported_profiles.update({
                    "d-bf16.c-fp8-e4m3fnuz-direct-cast-noscale",
                    "d-fp8-e4m3fnuz-b128-f32-prequantized.c-fp8-e4m3fnuz-direct-cast-noscale",
                })
        else:
            raise ep_precision.PrecisionError(
                f"MoRI precision contract has no pinned FP8 format for runner {runner!r}"
            )
        self.precision_profile_id, self.communication_precision = (
            ep_precision.resolve_precision(
                args,
                backend=self.name,
                mode=self.mode,
                supported_profiles=supported_profiles,
            )
        )
        self._fp8_dispatch = ep_precision.is_low_precision_dispatch(
            self.communication_precision
        )
        self._direct_cast_combine = ep_precision.uses_direct_cast_combine(
            self.communication_precision
        )
        # The profile's dispatch wire format must be the runner's FP8 flavour.
        if self._fp8_dispatch and ep_precision.communication_format(
            self.communication_precision, "dispatch"
        ) != fp8_format:
            raise ep_precision.PrecisionError(
                "MoRI dispatch FP8 format differs from the pinned GPU architecture"
            )
        if self._direct_cast_combine:
            quant_enum = getattr(mori.ops, "EpDispatchCombineQuantType", None)
            if quant_enum is None or not hasattr(quant_enum, "Fp8DirectCast"):
                raise ep_precision.PrecisionError(
                    "pinned MoRI API omits EpDispatchCombineQuantType.Fp8DirectCast"
                )

        # --- Architecture and placement checks. ---
        self.ep_size = world_size
        # NOTE(review): floor division with no remainder check here — presumably
        # args.experts is validated upstream; confirm.
        self.experts_per_rank = args.experts // self.ep_size
        device_properties = torch.cuda.get_device_properties(device)
        device_cus = device_properties.multi_processor_count
        # Keep only the part of gcnArchName before the first ':'.
        realized_arch = str(getattr(device_properties, "gcnArchName", "")).split(":", 1)[0]
        expected_arch = "gfx950" if runner.startswith("mi355x") else "gfx942"
        if realized_arch != expected_arch:
            raise ep_precision.PrecisionError(
                f"MoRI runner {runner!r} realized architecture {realized_arch!r}, "
                f"expected {expected_arch!r}"
            )
        # Falsy values fall back to world_size / gpus_per_node respectively.
        gpus_per_node = int(args.gpus_per_node or world_size)
        scale_up_domain = int(args.scale_up_domain or gpus_per_node)
        scale_out = world_size > scale_up_domain
        if (
            gpus_per_node <= 0
            or scale_up_domain <= 0
            or world_size % gpus_per_node
            or world_size % scale_up_domain
        ):
            raise RuntimeError("MoRI placement is not divisible into complete domains")
        if scale_out != (args.scope == "scale-out"):
            raise RuntimeError("MoRI requested scope differs from the EP topology")
        if not scale_out and (
            world_size != 8
            or gpus_per_node != 8
            or scale_up_domain != 8
            or args.scale_up_transport != "xgmi"
            or args.scale_out_transport
            or args.transport != "xgmi"
        ):
            raise RuntimeError("MoRI scale-up is pinned to EP8 over one XGMI domain")
        if scale_out and (
            world_size != 16
            or gpus_per_node != 8
            or scale_up_domain != 8
            or args.scale_up_transport != "xgmi"
            or args.scale_out_transport != "rdma"
            or args.transport != "xgmi-rdma"
        ):
            raise RuntimeError(
                "MoRI InterNodeV1 is pinned to EP16 over two 8-GPU XGMI/RDMA nodes"
            )
        # Launch defaults; the AsyncLL / InterNodeV1 branches below override them.
        self.block_num = self._block_target = 80
        self.rdma_block_num = 0
        self.num_qps = 1
        self._block_floored = False
        self._tuned_source = "default-80"
        self.dispatch_warps = 16
        self.combine_warps = 8

        # MI355X uses the direct intranode kernel. MI325X uses MoRI's split
        # AsyncLL send/receive kernel as its normal-mode XGMI transport.
        kernel_request = os.environ.get("CX_MORI_KERNEL_TYPE", "intranode").strip().lower()
        self._kernel_type = None
        self._kernel_type_label = "IntraNode"
        self._async_ll = False
        self._inter_node = False
        if kernel_request in ("asyncll", "async_ll", "async-ll"):
            if scale_out:
                raise RuntimeError("MoRI EP16 must use InterNodeV1, not AsyncLL")
            kernel_enum = getattr(mori.ops, "EpDispatchCombineKernelType", None)
            if kernel_enum is None or not hasattr(kernel_enum, "AsyncLL"):
                raise RuntimeError(
                    "CX_MORI_KERNEL_TYPE=asyncll requires "
                    "EpDispatchCombineKernelType.AsyncLL"
                )
            self._kernel_type = kernel_enum.AsyncLL
            self._kernel_type_label = "AsyncLL"
            self._async_ll = True
            self.block_num = self._block_target = 64
            self.dispatch_warps = self.combine_warps = 8
            self._tuned_source = "upstream-asyncll-64x8-external-input"
        elif kernel_request in ("internode-v1", "internode_v1", "internodev1"):
            if not scale_out:
                raise RuntimeError("MoRI InterNodeV1 is valid only for scale-out EP16")
            kernel_enum = getattr(mori.ops, "EpDispatchCombineKernelType", None)
            if kernel_enum is None or not hasattr(kernel_enum, "InterNodeV1"):
                raise RuntimeError(
                    "CX_MORI_KERNEL_TYPE=internode-v1 requires "
                    "EpDispatchCombineKernelType.InterNodeV1"
                )
            self._kernel_type = kernel_enum.InterNodeV1
            self._kernel_type_label = "InterNodeV1"
            self._inter_node = True
            self.block_num = self._block_target = 96
            self.rdma_block_num = 64
            self.dispatch_warps = self.combine_warps = 8
            self._tuned_source = "upstream-internode-v1-96-64x8-qps1"
        elif kernel_request not in ("intranode", "intra_node", "intra-node", ""):
            raise RuntimeError(
                f"unknown CX_MORI_KERNEL_TYPE={kernel_request!r} "
                "(expected intranode|asyncll|internode-v1)"
            )
        elif scale_out:
            # Reached only for an intranode request on a scale-out topology.
            raise RuntimeError("MoRI scale-out EP16 requires CX_MORI_KERNEL_TYPE=internode-v1")
        self.kernel_generation = (
            "inter-node-v1" if self._inter_node
            else "async-ll" if self._async_ll
            else "intranode"
        )
        self._external_input = (
            self._async_ll or self._inter_node or self._direct_cast_combine
        )
        # Registered-input MoRI copies expert output into a device-side symmetric buffer. External
        # input kernels consume the dispatch output directly, so their stage is not applicable.
        self.stage_device_work = self._fp8_dispatch or not self._external_input

        # --- Shared-memory initialisation on the default process group. ---
        world_group = torch.distributed.group.WORLD
        torch._C._distributed_c10d._register_process_group("default", world_group)
        mori.shmem.shmem_torch_process_group_init("default")
        realized_qps = int(mori.shmem.shmem_num_qp_per_pe())
        if realized_qps < self.num_qps:
            raise RuntimeError(
                f"MoRI realized {realized_qps} QPs per PE; {self.num_qps} required"
            )

        # --- Op configuration. ---
        self._cap = self.buffer_cap(args)
        # getattr(..., None) lets a torch build without the FP8 dtype be
        # reported as a PrecisionError below instead of an AttributeError.
        dispatch_dtype = (
            getattr(
                torch,
                "float8_e4m3fn"
                if fp8_format == "fp8-e4m3fn"
                else "float8_e4m3fnuz",
                None,
            )
            if self._fp8_dispatch
            else torch.bfloat16
        )
        if dispatch_dtype is None:
            raise ep_precision.PrecisionError(
                f"active torch build does not expose {fp8_format}"
            )
        # One scale per 128 hidden elements when dispatching FP8, otherwise none.
        scale_dim = args.hidden // 128 if self._fp8_dispatch else 0
        if self._fp8_dispatch and args.hidden % 128:
            raise ep_precision.PrecisionError(
                "MoRI native FP8 dispatch requires hidden divisible by 128"
            )
        config_kwargs = {
            "data_type": dispatch_dtype,
            "rank": rank,
            "world_size": world_size,
            "hidden_dim": args.hidden,
            "scale_dim": scale_dim,
            "scale_type_size": 4 if self._fp8_dispatch else 1,
            "max_token_type_size": (
                torch.tensor([], dtype=torch.bfloat16).element_size()
                if self._inter_node
                else torch.tensor([], dtype=torch.float32).element_size()
            ),
            "max_num_inp_token_per_rank": max(512, self._cap),
            "num_experts_per_rank": self.experts_per_rank,
            "num_experts_per_token": args.topk,
            "use_external_inp_buf": self._external_input,
            "quant_type": (
                "fp8_direct_cast" if self._direct_cast_combine else "none"
            ),
        }
        # Kernel-specific keys are passed only when that kernel was selected.
        if self._kernel_type is not None:
            config_kwargs["kernel_type"] = self._kernel_type
        if self._async_ll:
            config_kwargs["max_total_recv_tokens"] = 0
        if self._async_ll or self._inter_node:
            config_kwargs["block_num"] = self.block_num
            config_kwargs["warp_num_per_block"] = self.dispatch_warps
        if self._inter_node:
            config_kwargs.update({
                "gpu_per_node": gpus_per_node,
                "rdma_block_num": self.rdma_block_num,
                "num_qp_per_pe": self.num_qps,
            })
        self.config = mori.ops.EpDispatchCombineConfig(**config_kwargs)
        # Read the values back from the config object and compare them with
        # what was requested; a missing attribute counts as a mismatch.
        expected_config = {
            "data_type": dispatch_dtype,
            "scale_dim": scale_dim,
            "scale_type_size": 4 if self._fp8_dispatch else 1,
            "use_external_inp_buf": self._external_input,
            "quant_type": config_kwargs["quant_type"],
        }
        if self._async_ll or self._inter_node:
            expected_config.update({
                "block_num": self.block_num,
                "warp_num_per_block": self.dispatch_warps,
            })
        if self._inter_node:
            expected_config.update({
                "gpu_per_node": 8,
                "rdma_block_num": 64,
                "num_qp_per_pe": 1,
            })
        if any(getattr(self.config, key, None) != value for key, value in expected_config.items()):
            raise RuntimeError("MoRI requested launch/topology configuration was not realized")
        # The newer pinned MoRI revision can otherwise replace explicit values
        # with token-dependent tuning rules from the image.
        os.environ["MORI_EP_LAUNCH_CONFIG_MODE"] = "MANUAL"
        self.op = mori.ops.EpDispatchCombineOp(self.config)
        if getattr(self.op, "launch_config_mode", None) != "MANUAL":
            raise RuntimeError("MoRI explicit launch configuration was not applied")

        # --- Provenance. MORI_COMMIT is only enforced when it is set and non-empty. ---
        expected_mori_commit = os.environ.get("MORI_COMMIT")
        mori_commit = _mori_source_commit()
        if expected_mori_commit and mori_commit != expected_mori_commit:
            raise RuntimeError("MoRI image source revision differs from canonical provenance")
        self.backend_provenance = {
            "mori_commit": mori_commit,
            "api": (
                "mori.ops.EpDispatchCombineOp/external-input"
                if self._external_input
                else "mori.ops.EpDispatchCombineOp/registered-input"
            ),
            "mode": "normal",
            "dispatch_dtype": ep_precision.communication_format(
                self.communication_precision, "dispatch"
            ),
            "combine_dtype": ep_precision.communication_format(
                self.communication_precision, "combine"
            ),
            "kernel_type": self._kernel_type_label,
            "enable_sdma": os.environ.get("MORI_ENABLE_SDMA"),
            "heap_size": os.environ.get("MORI_SHMEM_HEAP_SIZE"),
            "max_num_inp_token_per_rank": max(512, self._cap),
            "max_total_recv_tokens": config_kwargs.get("max_total_recv_tokens"),
            "gpus_per_node": gpus_per_node,
            "rdma_block_num": self.rdma_block_num,
            "use_external_inp_buf": self._external_input,
            "num_qps": self.num_qps,
            "resource_mode": "fixed-profile",
            "block_num": self.block_num,
            "block_num_target": self._block_target,
            "block_num_floored": self._block_floored,
            "dispatch_warps": self.dispatch_warps,
            "combine_warps": self.combine_warps,
            "device_cus": device_cus,
            # Recorded as None for AsyncLL, otherwise blocks per compute unit.
            "sm_fraction": None if self._async_ll else self.block_num / device_cus,
            "tuned_source": self._tuned_source,
        }

    def buffer_cap(self, args):
        """Return the fixed per-rank token capacity (512); ``args`` is not read."""
        return 512

    def make_problem(self, T, idx, weights, x):
        """Bundle one routing slice into the namespace consumed by ``dispatch``.

        ``dispatch_x`` is the first element of the encoder's native input for
        FP8 dispatch and ``x`` itself otherwise. When the encoder produced no
        scales, an empty ``(T, 0)`` uint8 tensor is used as a placeholder.
        """
        encoding = ep_precision.encode_dispatch(
            torch, x, self.communication_precision
        )
        indices = idx.to(torch.int32)
        gate_weights = weights.to(torch.float32)
        # The routing tensors are exposed under two names each
        # (topk_idx/indices and topk_weights/weights).
        return types.SimpleNamespace(
            T=T,
            x=x,
            dispatch_x=encoding.native_input[0] if self._fp8_dispatch else x,
            oracle_x=encoding.semantic,
            dispatch_precision_evidence=encoding.evidence,
            topk_idx=indices,
            topk_weights=gate_weights,
            indices=indices,
            weights=gate_weights,
            scales=(
                encoding.scales
                if encoding.scales is not None
                else torch.empty((T, 0), dtype=torch.uint8, device=self.device)
            ),
        )

    def dispatch(self, p):
        """Run the MoRI dispatch for problem ``p`` and return a handle namespace.

        For the AsyncLL kernel the separate ``dispatch_recv`` call is issued
        after the dispatch call. ``combine_input`` starts as ``None`` and is
        filled in by ``stage`` or ``combine_transformed``.
        """
        dispatch_output, dispatch_weights, _scales, dispatch_indices, recv_num = (
            self.op.dispatch(
                p.dispatch_x,
                p.weights,
                p.scales,
                p.indices,
                block_num=self.block_num,
                rdma_block_num=self.rdma_block_num,
                warp_per_block=self.dispatch_warps,
            )
        )
        if self._async_ll:
            self.op.dispatch_recv(warp_per_block=self.dispatch_warps)
        return types.SimpleNamespace(
            dispatch_output=dispatch_output,
            dispatch_weights=dispatch_weights,
            dispatch_scales=_scales,
            dispatch_indices=dispatch_indices,
            # Only the first element of recv_num is kept; see recv_tokens().
            recv_num=recv_num[0],
            combine_input=None,
        )

    def stage(self, p, h):
        """Prepare ``h.combine_input`` from the received payload.

        Requires ``p.recv_tokens`` to already be an int within
        ``[0, h.dispatch_output.size(0)]``. External-input kernels use the
        semantic receive tensor directly; otherwise the first ``rows`` rows are
        copied into MoRI's registered combine input buffer.

        Raises:
            RuntimeError: if ``p.recv_tokens`` is missing, not an int, or out of range.
        """
        rows = getattr(p, "recv_tokens", None)
        if not isinstance(rows, int) or rows < 0 or rows > h.dispatch_output.size(0):
            raise RuntimeError("MoRI receive count was not validated before staging")
        h.combine_input = self._semantic_recv(h, rows)
        if self._external_input:
            return None
        buffer = self.op.get_registered_combine_input_buffer(
            torch.bfloat16, hidden_dim=h.combine_input.size(1)
        )
        buffer[:rows, :].copy_(h.combine_input[:rows, :])
        h.combine_input = buffer

    def combine(self, p, h):
        """Run the MoRI combine and return the first ``p.T`` rows of its output.

        AsyncLL combines with the caller's original indices (``p.indices``);
        the other kernels use the indices returned by dispatch. The weights
        argument is passed as ``None``.
        """
        combine_indices = p.indices if self._async_ll else h.dispatch_indices
        combined, _weights = self.op.combine(
            h.combine_input,
            None,
            combine_indices,
            block_num=self.block_num,
            rdma_block_num=self.rdma_block_num,
            warp_per_block=self.combine_warps,
        )
        if self._async_ll:
            self.op.combine_recv(warp_per_block=self.combine_warps)
        return combined[:p.T]

    def inspect_dispatch(self, p, h):
        """Return a view of the received rows with metadata masked to local experts.

        Raises:
            RuntimeError: if dispatch returned no gate weights, or the receive
                count is negative or exceeds the first dimension of the
                dispatch output, indices or weights.
        """
        count = self.recv_tokens(h)
        if h.dispatch_weights is None:
            raise RuntimeError("MoRI dispatch did not expose gate weights")
        if count < 0 or any(
            tensor.ndim == 0 or count > tensor.size(0)
            for tensor in (h.dispatch_output, h.dispatch_indices, h.dispatch_weights)
        ):
            raise RuntimeError("MoRI receive count exceeds dispatch metadata")
        raw_expert_ids = h.dispatch_indices[:count].to(torch.int64)
        expert_ids, weights, local_expert_ids = _project_local_metadata(
            torch,
            raw_expert_ids,
            h.dispatch_weights[:count].to(torch.float32),
            self.rank,
            self.experts_per_rank,
        )
        return types.SimpleNamespace(
            payload=self._semantic_recv(h, count)[:count],
            encoded_payload=h.dispatch_output[:count],
            scales=(
                h.dispatch_scales[:count]
                if h.dispatch_scales is not None
                else None
            ),
            expert_ids=expert_ids,
            weights=weights,
            local_expert_counts=torch.bincount(
                local_expert_ids, minlength=self.experts_per_rank
            ),
            ordering_contract="mori-global-topk-masked-v1",
        )

    def combine_transformed(self, p, h, transformed):
        """Combine a caller-supplied tensor in place of the staged receive payload.

        ``transformed`` is cast to BF16 and, for registered-input kernels, its
        first ``p.recv_tokens`` rows are copied into the registered buffer
        before ``combine`` is called.

        Raises:
            RuntimeError: if ``p.recv_tokens`` is missing, not an int, or out of range.
        """
        h.combine_input = transformed.to(torch.bfloat16)
        rows = getattr(p, "recv_tokens", None)
        if not isinstance(rows, int) or rows < 0 or rows > h.combine_input.size(0):
            raise RuntimeError("MoRI receive count was not validated before transformed combine")
        if not self._external_input:
            buffer = self.op.get_registered_combine_input_buffer(
                torch.bfloat16, hidden_dim=h.combine_input.size(1)
            )
            buffer[:rows, :].copy_(h.combine_input[:rows, :])
            h.combine_input = buffer
        return self.combine(p, h)

    def recv_tokens(self, h):
        """Return the receive count stored on the handle as a Python int."""
        return int(h.recv_num.item())

    def _semantic_recv(self, h, rows):
        """Return the BF16 view of the received payload, decoding FP8 once per handle.

        Without FP8 dispatch the raw dispatch output is returned. With FP8
        dispatch the first ``rows`` rows are dequantized into a buffer cached on
        ``h``; rows past ``rows`` come from ``torch.empty`` and are never written.

        Raises:
            ep_precision.PrecisionError: FP8 dispatch returned no scales.
            RuntimeError: a later call passes a different ``rows`` for the same handle.
        """
        if not self._fp8_dispatch:
            return h.dispatch_output
        if not hasattr(h, "recv_semantic"):
            if h.dispatch_scales is None:
                raise ep_precision.PrecisionError(
                    "MoRI FP8 dispatch did not return scaling factors"
                )
            semantic = torch.empty(
                h.dispatch_output.shape,
                dtype=torch.bfloat16,
                device=h.dispatch_output.device,
            )
            semantic[:rows].copy_(ep_precision.dequantize_dispatch(
                torch,
                h.dispatch_output[:rows],
                h.dispatch_scales[:rows],
                self.communication_precision["dispatch"],
            ))
            h.recv_semantic = semantic
            h.recv_semantic_rows = rows
        elif h.recv_semantic_rows != rows:
            raise RuntimeError("MoRI receive count changed for one dispatch handle")
        return h.recv_semantic

    def oracle_dispatch_payload(self, payload):
        """Return ``payload`` as it reads after this profile's dispatch codec."""
        return ep_precision.encode_dispatch(
            torch, payload, self.communication_precision
        ).semantic

    def precision_evidence(self, problem, view=None):
        """Delegate to ``ep_precision.precision_evidence`` for this backend's profile."""
        return ep_precision.precision_evidence(
            torch,
            profile_id=self.precision_profile_id,
            profile=self.communication_precision,
            problem=problem,
            view=view,
        )

    def finalize(self, rc):
        """Flush output and terminate the process with ``rc``; this method does not return.

        The barrier is best-effort: any exception from it is ignored. Exit
        codes outside 0..255 are replaced by 1. ``os._exit`` ends the process
        immediately.
        """
        try:
            dist.barrier()
        except Exception:
            pass
        sys.stdout.flush()
        sys.stderr.flush()
        os._exit(rc if 0 <= rc <= 255 else 1)
def _runtime_collective(args, torch_module) -> tuple[str, str]:
    """Return ``(kind, version)`` of the collective library recorded for this run.

    ``kind`` is ``"rccl"`` when ``torch_module.version.hip`` is truthy and
    ``"nccl"`` otherwise. The value is read from
    ``args.runtime_fingerprint["collective_library"]`` and must match that
    kind and carry an ``X.Y.Z`` numeric version string.

    Raises:
        RuntimeError: if the fingerprint is missing, malformed, or names a
            different library kind.
    """
    expected = "rccl" if torch_module.version.hip else "nccl"
    fingerprint = getattr(args, "runtime_fingerprint", None)
    collective = fingerprint.get("collective_library") if isinstance(fingerprint, dict) else None
    if (
        not isinstance(collective, dict)
        or collective.get("kind") != expected
        or not isinstance(collective.get("version"), str)
        or not re.fullmatch(r"[0-9]+\.[0-9]+\.[0-9]+", collective["version"])
    ):
        raise RuntimeError("loaded collective runtime identity is unavailable")
    return expected, collective["version"]


class NCCLBackend:
    """Reference expert-parallel backend built on ``dist.all_to_all_single``.

    Dispatch sends one copy of a token to each distinct destination rank;
    combine reverses the shuffle and sums those copies without weights.
    """

    name = "nccl-ep"
    stage_device_work = False
    combine_needs_redispatch = False  # dispatch saves the permutation + splits
    combine_weight_semantics = "unweighted-rank-sum"

    @staticmethod
    def _experts_per_rank(experts, world_size):
        """Return ``experts // world_size``, rejecting a non-zero remainder.

        Raises:
            ValueError: if ``experts`` is not an exact multiple of ``world_size``.
        """
        if experts % world_size:
            raise ValueError(
                f"experts({experts}) must be divisible by world_size({world_size})"
            )
        return experts // world_size

    def __init__(self, args, rank, world_size, local_rank, device):
        """Validate mode, precision profile, expert split and transport pinning.

        ``local_rank`` is accepted but not read by this constructor.

        Raises:
            ep_precision.PrecisionError: mode is not ``"normal"`` or the
                precision profile is unsupported.
            ValueError: ``args.experts`` is not a multiple of ``world_size``.
            RuntimeError: the collective runtime identity is unavailable, or a
                scale-out run does not have ``NCCL_NET=IB`` and an exact
                (``=``-prefixed) ``NCCL_IB_HCA`` list.
        """
        self.args = args
        self.rank = rank
        self.world_size = world_size
        self.device = device
        self.experts = args.experts
        self.mode = getattr(args, "mode", "normal")
        if self.mode != "normal":
            raise ep_precision.PrecisionError("NCCL/RCCL EP supports normal mode only")
        self.precision_profile_id, self.communication_precision = (
            ep_precision.resolve_precision(
                args,
                backend=self.name,
                mode=self.mode,
                supported_profiles={"d-bf16.c-bf16"},
            )
        )
        self.experts_per_rank = self._experts_per_rank(args.experts, world_size)
        self.tolerance = 5e-2  # bf16 round-trip
        _library, _version = _runtime_collective(args, torch)
        if args.scale_out_transport:
            # The HCA list must start with '=' and hold comma-separated device
            # names, each with an optional ':<port>' suffix.
            hcas = os.environ.get("NCCL_IB_HCA", "")
            if os.environ.get("NCCL_NET") != "IB" or not re.fullmatch(
                r"=[A-Za-z][A-Za-z0-9_.-]{0,31}(?::[1-9][0-9]*)?"
                r"(?:,[A-Za-z][A-Za-z0-9_.-]{0,31}(?::[1-9][0-9]*)?)*",
                hcas,
            ):
                raise RuntimeError("scale-out collective transport is not pinned to RDMA")
        self.kernel_generation = contracts.collective_kernel_generation(_library)
        self.backend_provenance = {
            "backend": f"{_library}-all2all",
            "backend_lineage": _library,
            "collective_library": _library,
            "nccl_version": _version,
            "transport": f"{_library}-all_to_all_single",
            "resource_mode": "fixed-profile",
            "num_sms": None,
            "device_sms": torch.cuda.get_device_properties(device).multi_processor_count,
            "tuned_source": "nccl-collective",
            "reference_semantics": "rank-deduplicated-payload-plus-routing-metadata-v2",
            "routing_metadata": "expert-index-gate-weight-source-token",
            "dispatch_dtype": "bf16",
            "combine_dtype": "bf16",
        }

    def buffer_cap(self, args):
        """Return ``None``: this backend has no fixed receive capacity."""
        return None  # no fixed pre-allocated buffer; all-to-all sizes itself per step

    def make_problem(self, T, idx, weights, x):
        """Bundle one routing slice into the namespace consumed by ``dispatch``."""
        encoding = ep_precision.encode_dispatch(
            torch, x, self.communication_precision
        )
        # idx[T,topk] int64, weights[T,topk] f32, x[T,hidden] bf16 — the shared routing-trace slice.
        return types.SimpleNamespace(
            T=T,
            x=x,
            oracle_x=encoding.semantic,
            dispatch_precision_evidence=encoding.evidence,
            topk_idx=idx.to(torch.int64),
            topk_weights=weights.to(torch.float32),
            layout=None,
        )

    def dispatch(self, p):
        """Shuffle tokens and routing metadata to their destination ranks.

        Issues one all-to-all for the per-rank counts and four more for the
        payload, rank-local expert indices, gate weights and packed source
        metadata. Returns a handle holding the received tensors plus the
        permutation and split lists that ``combine`` needs.
        """
        ws = self.world_size
        x = p.x  # [T, H] bf16
        idx = p.topk_idx  # [T, topk]
        T, H = int(x.shape[0]), int(x.shape[1])
        dev = x.device
        # DeepEP dispatches one token per destination rank, not one copy per expert. Build the same
        # rank-deduplicated routing map so NCCL traffic and combine semantics are comparable.
        # NOTE(review): a negative expert id floor-divides to a negative rank and
        # is clamped to rank 0 here, which would add a rank-0 copy for that token —
        # confirm callers never pass masked (-1) indices.
        destinations = (idx // self.experts_per_rank).clamp_(0, ws - 1)
        present = torch.zeros((T, ws), dtype=torch.bool, device=dev)
        present.scatter_(1, destinations, True)
        flat_token, flat_dest = present.nonzero(as_tuple=True)
        # Group rank copies by destination (stable -> deterministic, invertible permutation).
        order = torch.argsort(flat_dest, stable=True)
        ordered_token = flat_token.index_select(0, order)
        ordered_dest = flat_dest.index_select(0, order)
        send_counts = torch.bincount(flat_dest, minlength=ws)  # [ws]
        send_x = x.index_select(0, ordered_token).contiguous()
        send_topk_idx = idx.index_select(0, ordered_token).contiguous()
        # Rebase expert ids to the destination rank; entries owned by another
        # rank become -1 and their gate weight becomes 0.
        expert_start = ordered_dest.unsqueeze(1) * self.experts_per_rank
        local_mask = ((send_topk_idx >= expert_start)
                      & (send_topk_idx < expert_start + self.experts_per_rank))
        send_topk_idx = torch.where(
            local_mask, send_topk_idx - expert_start, torch.full_like(send_topk_idx, -1)
        )
        send_topk_weights = p.topk_weights.index_select(0, ordered_token).contiguous()
        send_topk_weights.masked_fill_(~local_mask, 0)
        # Pack (source rank, source token) into one int64: rank in the bits
        # above 32, token index in the low 32 bits.
        send_src_metadata = (ordered_token.to(torch.int64) | (self.rank << 32)).contiguous()
        # Exchange per-rank counts so every rank can size its receive buffer.
        recv_counts = torch.empty_like(send_counts)
        dist.all_to_all_single(recv_counts, send_counts)
        sc = send_counts.tolist()
        rc = recv_counts.tolist()
        total_recv = int(sum(rc))
        recv_x = torch.empty((total_recv, H), dtype=x.dtype, device=dev)
        recv_topk_idx = torch.empty((total_recv, int(idx.shape[1])), dtype=idx.dtype, device=dev)
        recv_topk_weights = torch.empty((total_recv, int(idx.shape[1])),
                                        dtype=p.topk_weights.dtype, device=dev)
        recv_src_metadata = torch.empty((total_recv,), dtype=torch.int64, device=dev)
        # Dispatch the uneven per-rank splits over the configured collective transport.
        dist.all_to_all_single(recv_x, send_x, rc, sc)
        dist.all_to_all_single(recv_topk_idx, send_topk_idx, rc, sc)
        dist.all_to_all_single(recv_topk_weights, send_topk_weights, rc, sc)
        dist.all_to_all_single(recv_src_metadata, send_src_metadata, rc, sc)
        return types.SimpleNamespace(
            recv_x=recv_x, combine_input=None, order=order, flat_token=flat_token,
            recv_topk_idx=recv_topk_idx,
            recv_topk_weights=recv_topk_weights, recv_src_rank=recv_src_metadata >> 32,
            recv_src_token=recv_src_metadata & ((1 << 32) - 1), send_counts=sc, recv_counts=rc,
            T=T, H=H, topk=int(idx.shape[1]), total_recv=total_recv)

    def stage(self, p, h):
        """Use the received tokens unchanged as the combine input."""
        # No expert compute: the expert "output" is the received tokens as-is (the round-trip identity).
        h.combine_input = h.recv_x
        return None

    def combine(self, p, h):
        """Return tokens to their source ranks and sum the per-rank copies.

        The sum is accumulated in float32 and cast back to ``p.x.dtype``.
        """
        # Reverse all-to-all: ship expert outputs back to their origin ranks (swap the split lists).
        send_back = torch.empty((int(h.order.shape[0]), h.H), dtype=h.combine_input.dtype,
                                device=h.combine_input.device)
        dist.all_to_all_single(send_back, h.combine_input.contiguous(),
                               h.send_counts, h.recv_counts)
        # send_back is in send (sorted) order; invert the argsort to token-copy order.
        copies = torch.empty_like(send_back)
        copies[h.order] = send_back
        # Sum one copy per destination rank under this reference's explicit unweighted contract.
        out = torch.zeros((h.T, h.H), dtype=torch.float32, device=send_back.device)
        out.index_add_(0, h.flat_token, copies.float())
        return out.to(p.x.dtype)

    def inspect_dispatch(self, p, h):
        """Return a view of the received rows with global expert ids.

        Rank-local ids (>= 0) are shifted by ``rank * experts_per_rank``;
        masked entries keep their negative id and get weight 0.
        """
        valid = h.recv_topk_idx >= 0
        expert_ids = torch.where(
            valid,
            h.recv_topk_idx + self.rank * self.experts_per_rank,
            h.recv_topk_idx,
        )
        return types.SimpleNamespace(
            payload=h.recv_x,
            expert_ids=expert_ids,
            weights=h.recv_topk_weights.masked_fill(~valid, 0),
            local_expert_counts=torch.bincount(
                h.recv_topk_idx[valid], minlength=self.experts_per_rank
            ),
            ordering_contract="source-rank-major-stable-v1",
        )

    def combine_transformed(self, p, h, transformed):
        """Combine ``transformed`` (cast to the receive dtype) instead of the staged payload."""
        h.combine_input = transformed.to(h.recv_x.dtype)
        return self.combine(p, h)

    def recv_tokens(self, h):
        """Return the number of rows received by this rank."""
        return int(h.total_recv)

    def oracle_dispatch_payload(self, payload):
        """Return ``payload`` unchanged."""
        return payload

    def precision_evidence(self, problem, view=None):
        """Delegate to ``ep_precision.precision_evidence`` for this backend's profile."""
        return ep_precision.precision_evidence(
            torch,
            profile_id=self.precision_profile_id,
            profile=self.communication_precision,
            problem=problem,
            view=view,
        )

    def finalize(self, rc):
        """Best-effort barrier and process-group teardown; returns ``rc`` unchanged."""
        try:
            dist.barrier()
            dist.destroy_process_group()
        except Exception:
            # Deliberately ignored so that teardown errors do not replace ``rc``.
            pass
        return rc
+ encoded_payload: Any | None + scales: Any | None + semantic: Any + evidence: dict[str, Any] + + +def resolve_precision( + args, + *, + backend: str, + mode: str, + supported_profiles: Iterable[str], +) -> tuple[str, dict[str, Any]]: + """Resolve and validate the exact profile requested for one adapter.""" + profile_id = ( + getattr(args, "precision_profile", "") + or identity.V1_CONTROL_PRECISION_PROFILE + ) + try: + profile = identity.precision_profile(profile_id) + except identity.IdentityError as exc: + raise PrecisionError(str(exc)) from exc + if mode not in profile["modes"]: + raise PrecisionError( + f"precision profile {profile_id!r} is not valid in mode {mode!r}" + ) + supported = frozenset(supported_profiles) + if profile_id not in supported: + raise PrecisionError( + f"{backend} does not realize precision profile {profile_id!r} in mode {mode!r}" + ) + return profile_id, profile + + +def require_keyword(callable_object, keyword: str, *, api: str) -> None: + """Fail closed when a pinned Python API does not expose a required control.""" + try: + parameters = inspect.signature(callable_object).parameters + except (TypeError, ValueError) as exc: + raise PrecisionError(f"cannot inspect required precision API {api}") from exc + if keyword not in parameters: + raise PrecisionError(f"required precision API {api} omits {keyword!r}") + + +def communication_format(profile: dict[str, Any], component: str) -> str: + """Return the exact wire format for dispatch or combine.""" + return str(profile[component]["communication_format"]) + + +def is_low_precision_dispatch(profile: dict[str, Any]) -> bool: + return communication_format(profile, "dispatch").startswith("fp8-") + + +def is_caller_prequantized(profile: dict[str, Any]) -> bool: + return profile["dispatch"]["quantization_origin"] == "caller-prequantized" + + +def uses_logfmt_combine(profile: dict[str, Any]) -> bool: + return communication_format(profile, "combine") == "logfmt10" + + +def 
uses_direct_cast_combine(profile: dict[str, Any]) -> bool: + return profile["combine"]["quantization_origin"] == "backend-internal-direct-cast" + + +def _fp8_dtype(torch_module, axis: dict[str, Any]): + fmt = axis["communication_format"] + attribute = { + "fp8-e4m3fn": "float8_e4m3fn", + "fp8-e4m3fnuz": "float8_e4m3fnuz", + }.get(fmt) + if attribute is None: + raise PrecisionError(f"unsupported FP8 communication format {fmt!r}") + dtype = getattr(torch_module, attribute, None) + if dtype is None: + raise PrecisionError(f"active torch build does not expose torch.{attribute}") + return dtype + + +def _axis_evidence( + *, + dequantized_semantics: bool, + encoded_payload_valid: bool, + max_abs_error: float, + max_rel_error: float, + saturation_count: int, + saturation_rate: float, + scales_finite: bool | None, + scales_positive: bool | None, + passed: bool, +) -> dict[str, Any]: + return { + "encoded_payload_valid": bool(encoded_payload_valid), + "scales_finite": scales_finite, + "scales_positive": scales_positive, + "dequantized_semantics": bool(dequantized_semantics), + "saturation_count": int(saturation_count), + "saturation_rate": float(saturation_rate), + "max_abs_error": float(max_abs_error), + "max_rel_error": float(max_rel_error), + "passed": bool(passed), + } + + +def exact_axis_evidence() -> dict[str, Any]: + """Evidence for an unquantized BF16 communication axis.""" + return _axis_evidence( + encoded_payload_valid=True, + scales_finite=None, + scales_positive=None, + dequantized_semantics=True, + saturation_count=0, + saturation_rate=0.0, + max_abs_error=0.0, + max_rel_error=0.0, + passed=True, + ) + + +def _quantize_fp8(torch_module, x, axis: dict[str, Any]) -> DispatchEncoding: + group_size = axis["scale_group_size"] + if group_size != 128 or axis["scale_dtype"] != "f32": + raise PrecisionError("v1 FP8 dispatch requires block-128 FP32 scales") + if x.ndim != 2 or x.shape[1] % group_size: + raise PrecisionError( + "v1 native FP8 dispatch requires a 2D 
hidden dimension divisible by 128" + ) + dtype = _fp8_dtype(torch_module, axis) + fp8_max = float(torch_module.finfo(dtype).max) + blocks = x.float().reshape(x.shape[0], x.shape[1] // group_size, group_size) + amax = blocks.abs().amax(dim=-1) + # Match the pinned DeepEP/HybridEP block codec, including its nonzero scale floor. + scales = (amax.clamp_min(1e-4) / fp8_max).to(torch_module.float32) + normalized = blocks / scales.unsqueeze(-1) + saturation_mask = normalized.abs() > fp8_max + encoded = normalized.clamp(min=-fp8_max, max=fp8_max).to(dtype).reshape_as(x).contiguous() + semantic = dequantize_dispatch( + torch_module, encoded, scales, axis + ).to(x.dtype).contiguous() + absolute = (semantic.float() - x.float()).abs() + max_abs = float(absolute.max().item()) if absolute.numel() else 0.0 + reference_max = float(x.float().abs().max().item()) if x.numel() else 0.0 + max_rel = max_abs / (reference_max + 1e-6) + saturation_count = int(saturation_mask.sum().item()) + saturation_rate = saturation_count / max(1, int(x.numel())) + finite = bool(torch_module.isfinite(scales).all().item()) + positive = bool((scales > 0).all().item()) + semantic_ok = bool( + torch_module.isfinite(semantic.float()).all().item() + and torch_module.allclose( + semantic.float(), x.float(), rtol=0.05, atol=0.02 + ) + ) + valid = encoded.dtype == dtype and encoded.shape == x.shape + evidence = _axis_evidence( + encoded_payload_valid=valid, + scales_finite=finite, + scales_positive=positive, + dequantized_semantics=semantic_ok, + saturation_count=saturation_count, + saturation_rate=saturation_rate, + max_abs_error=max_abs, + max_rel_error=max_rel, + passed=valid and finite and positive and semantic_ok, + ) + return DispatchEncoding( + native_input=(encoded, scales), + encoded_payload=encoded, + scales=scales, + semantic=semantic, + evidence=evidence, + ) + + +def encode_dispatch(torch_module, x, profile: dict[str, Any]) -> DispatchEncoding: + """Build caller-prequantized input or a fused-codec 
def dequantize_dispatch(
    torch_module,
    encoded_payload,
    scales,
    axis: dict[str, Any],
    *,
    uint8_storage: bool = False,
):
    """Decode one native block-scaled FP8 payload to BF16 semantics.

    ``encoded_payload`` is reinterpreted as the axis FP8 dtype first when
    ``uint8_storage`` is set. The last dimension is split into groups of
    ``axis["scale_group_size"]`` values and each group is multiplied by its
    float32 scale; the result is returned as a contiguous bfloat16 tensor with the
    payload's shape.

    Raises:
        PrecisionError: scales are missing or not block-128, the payload has the
            wrong dtype/rank/width, or the scales have the wrong dtype/shape.
    """
    block = axis["scale_group_size"]
    if block != 128 or scales is None:
        raise PrecisionError("FP8 dispatch payload is missing block-128 scales")
    fp8 = _fp8_dtype(torch_module, axis)
    if uint8_storage:
        payload = encoded_payload.view(fp8)
    else:
        payload = encoded_payload
    # Short-circuit order matters: the width is only inspected once rank >= 2.
    payload_ok = (
        payload.dtype == fp8
        and payload.ndim >= 2
        and not payload.shape[-1] % block
    )
    if not payload_ok:
        raise PrecisionError("native FP8 dispatch payload has an invalid dtype or shape")
    groups = payload.shape[-1] // block
    expected_scale_shape = (*payload.shape[:-1], groups)
    scales_ok = (
        tuple(scales.shape) == expected_scale_shape
        and scales.dtype == torch_module.float32
    )
    if not scales_ok:
        raise PrecisionError("native FP8 dispatch scales have an invalid dtype or shape")
    grouped = payload.float().reshape(*payload.shape[:-1], groups, block)
    per_group = scales.float().reshape(*expected_scale_shape, 1)
    restored = (grouped * per_group).reshape(payload.shape)
    return restored.to(torch_module.bfloat16).contiguous()


def validate_received_encoding(
    torch_module,
    *,
    encoded_payload,
    scales,
    semantic,
    axis: dict[str, Any],
    uint8_storage: bool = False,
) -> bool:
    """Validate that received bytes/scales exactly decode to the semantic view.

    Returns False when decoding is rejected with PrecisionError; otherwise the
    result of an exact ``equal`` comparison between the decode and ``semantic``.
    """
    try:
        redecoded = dequantize_dispatch(
            torch_module,
            encoded_payload,
            scales,
            axis,
            uint8_storage=uint8_storage,
        )
    except PrecisionError:
        return False
    matches = torch_module.equal(redecoded, semantic)
    return bool(matches)
def dequantize_expert_prefixes(
    torch_module,
    encoded_payload,
    scales,
    axis: dict[str, Any],
    counts: tuple[int, ...],
    output,
):
    """Decode only valid expert-packed prefixes into a reusable BF16 workspace.

    Args:
        torch_module: the torch module (passed in rather than imported here).
        encoded_payload: 3D FP8 payload; dim 0 is indexed per entry of ``counts``
            and dim 1 is treated as the per-expert capacity.
        scales: block scales, sliced with the same ``[expert, :count]`` prefix.
        axis: dispatch axis description forwarded to ``dequantize_dispatch``.
        counts: number of valid leading rows for each expert.
        output: bfloat16 tensor with the same shape as ``encoded_payload``;
            written in place and returned.

    Rows at or beyond ``count`` for an expert are left untouched in ``output``.

    Raises:
        PrecisionError: the payload is not 3D, ``counts`` does not match dim 0,
            ``output`` has the wrong shape/dtype, or a count lies outside
            ``[0, capacity]``.
    """
    if encoded_payload.ndim != 3 or len(counts) != encoded_payload.shape[0]:
        raise PrecisionError("expert-packed FP8 receive counts have an invalid shape")
    if output.shape != encoded_payload.shape or output.dtype != torch_module.bfloat16:
        raise PrecisionError("expert-packed BF16 stage workspace has an invalid shape")
    capacity = encoded_payload.shape[1]
    for expert, count in enumerate(counts):
        # Negative counts are rejected with the same message as over-capacity ones.
        if count < 0 or count > capacity:
            raise PrecisionError("expert-packed FP8 receive count exceeds capacity")
        if count:
            # Skip empty experts entirely so no zero-row slice is decoded.
            output[expert, :count].copy_(dequantize_dispatch(
                torch_module,
                encoded_payload[expert, :count],
                scales[expert, :count],
                axis,
            ))
    return output


def _direct_cast_saturation(torch_module, profile: dict[str, Any], view) -> tuple[int, float]:
    """Count values clipped by MoRI's unscaled BF16-to-FP8 combine cast.

    Rebuilds a per-row affine transform of ``view.payload`` from the routed expert
    ids and gate weights, then counts elements whose magnitude exceeds the maximum
    of the combine axis' FP8 dtype.

    Returns:
        ``(count, rate)`` where ``rate`` is ``count`` divided by the number of
        transformed elements (at least 1). ``(0, 0.0)`` is returned when ``view``
        lacks ``payload``/``expert_ids``/``weights`` or when the ids are not 2D
        with weights of the same shape.
    """
    if not all(hasattr(view, field) for field in ("payload", "expert_ids", "weights")):
        return 0, 0.0
    expert_ids = view.expert_ids
    weights = view.weights
    if expert_ids.ndim != 2 or weights.shape != expert_ids.shape:
        return 0, 0.0
    # Negative expert ids mark unused slots: clamp them for the arithmetic below
    # and zero their gate weight so they contribute nothing.
    valid = expert_ids >= 0
    expert = expert_ids.clamp(min=0).to(torch_module.int64)
    gate = weights.to(torch_module.float32).masked_fill(~valid, 0)
    # Deterministic per-expert scale and offsets derived from the expert id.
    # NOTE(review): presumably these constants mirror the harness's
    # transformed-combine oracle and must stay in sync with it; confirm.
    scale = ((expert * 17 + 5) % 31 + 1).to(torch_module.float32) / 32
    offset_a = (((expert * 29 + 7) % 37) - 18).to(torch_module.float32) / 64
    offset_b = (((expert * 43 + 11) % 41) - 20).to(torch_module.float32) / 128
    columns = torch_module.arange(
        view.payload.shape[1], device=view.payload.device, dtype=torch_module.int64
    )
    # Per-column pattern so the second offset varies across dim 1 of the payload.
    pattern = (((columns * 13) % 17) - 8).to(torch_module.float32) / 8
    transformed = (
        view.payload.float() * (gate * scale).sum(dim=1, keepdim=True)
        + (gate * offset_a).sum(dim=1, keepdim=True)
        + (gate * offset_b).sum(dim=1, keepdim=True) * pattern.unsqueeze(0)
    )
    dtype = _fp8_dtype(torch_module, profile["combine"])
    fp8_max = float(torch_module.finfo(dtype).max)
    count = int((transformed.abs() > fp8_max).sum().item())
    return count, count / max(1, int(transformed.numel()))
def precision_evidence(
    torch_module,
    *,
    profile_id: str,
    profile: dict[str, Any],
    problem,
    view=None,
    uint8_storage: bool = False,
) -> dict[str, Any]:
    """Return schema-shaped codec evidence; the harness adds combine-oracle errors.

    The dispatch record starts as a deep copy of
    ``problem.dispatch_precision_evidence``. For a low-precision dispatch profile
    with a ``view`` exposing ``encoded_payload``, ``scales`` and ``payload``, the
    received encoding is re-validated and folded into ``encoded_payload_valid``
    and ``passed``. The combine record starts from exact evidence and, for a
    direct-cast combine with a ``view``, gains the measured saturation figures.
    """
    dispatch_record = deepcopy(problem.dispatch_precision_evidence)
    if is_low_precision_dispatch(profile) and view is not None:
        required = ("encoded_payload", "scales", "payload")
        if all(hasattr(view, field) for field in required):
            received_ok = validate_received_encoding(
                torch_module,
                encoded_payload=view.encoded_payload,
                scales=view.scales,
                semantic=view.payload,
                axis=profile["dispatch"],
                uint8_storage=uint8_storage,
            )
            for key in ("encoded_payload_valid", "passed"):
                dispatch_record[key] = dispatch_record[key] and received_ok

    combine_record = exact_axis_evidence()
    # Internal quantizers are validated by native configuration here. The harness
    # replaces the error fields and pass bit with the transformed-combine oracle.
    if communication_format(profile, "combine") != "bf16":
        for key in ("scales_finite", "scales_positive"):
            combine_record[key] = None
    if uses_direct_cast_combine(profile) and view is not None:
        clipped, clipped_rate = _direct_cast_saturation(torch_module, profile, view)
        combine_record["saturation_count"] = clipped
        combine_record["saturation_rate"] = clipped_rate

    overall = bool(dispatch_record["passed"] and combine_record["passed"])
    return {
        "profile_id": profile_id,
        "dispatch": dispatch_record,
        "combine": combine_record,
        "passed": overall,
    }
+ if communication_format(profile, "combine") != "bf16": + combine["scales_finite"] = None + combine["scales_positive"] = None + if uses_direct_cast_combine(profile) and view is not None: + count, rate = _direct_cast_saturation(torch_module, profile, view) + combine["saturation_count"] = count + combine["saturation_rate"] = rate + return { + "profile_id": profile_id, + "dispatch": dispatch, + "combine": combine, + "passed": bool(dispatch["passed"] and combine["passed"]), + } diff --git a/experimental/CollectiveX/tests/ep_uccl.py b/experimental/CollectiveX/tests/ep_uccl.py new file mode 100644 index 000000000..8eb514741 --- /dev/null +++ b/experimental/CollectiveX/tests/ep_uccl.py @@ -0,0 +1,535 @@ +#!/usr/bin/env python3 +"""CollectiveX UCCL adapter for native V1 dispatch/combine precision profiles.""" +from __future__ import annotations + +import importlib.metadata as metadata +import json +import os +from pathlib import Path +from pathlib import PurePosixPath +import sys +import types + +import torch +import torch.distributed as dist +import contracts +import ep_precision + +try: + import uccl + import uccl_deepep + from uccl_deepep import Buffer # type: ignore +except Exception as exc: # pragma: no cover - requires the benchmark image + print(f"ERROR: uccl.ep import failed: {exc!r}", file=sys.stderr) + raise + + +def _uccl_version() -> str: + try: + return metadata.version("uccl") + except Exception: + return getattr(uccl, "__version__", "unknown") + + +def _uccl_dependency_versions() -> dict[str, str]: + versions = { + package: metadata.version(package) + for package in contracts.UCCL_DEPENDENCY_VERSIONS + } + if versions != contracts.UCCL_DEPENDENCY_VERSIONS: + raise RuntimeError( + "UCCL runtime dependency versions differ from the v1 contract" + ) + return versions + + +def _is_uccl_runtime_payload(name: str) -> bool: + path = PurePosixPath(name) + return ( + bool(path.parts) + and path.parts[0] in {"uccl", "uccl.libs"} + and "__pycache__" not in path.parts + 
def _python_dependency_evidence(package: str, version: str) -> dict[str, str]:
    """Build content-manifest evidence for one installed pure-Python dependency.

    Only files that the distribution records under a top-level directory named
    exactly ``package`` are included; bytecode caches are excluded, as are
    recorded entries that no longer exist on disk.
    """
    distribution = metadata.distribution(package)
    runtime_files = []
    # ``distribution.files`` may be None when the install has no file record.
    for entry in distribution.files or ():
        logical = PurePosixPath(entry.as_posix())
        path = Path(distribution.locate_file(entry))
        if (
            logical.parts
            and logical.parts[0] == package
            and "__pycache__" not in logical.parts
            and logical.suffix != ".pyc"
            and path.is_file()
        ):
            # Keep the distribution-relative name alongside the on-disk path.
            runtime_files.append((entry.as_posix(), path))
    return contracts.content_manifest_evidence(
        role=f"{package}-distribution",
        name=f"{package}-{version}",
        files=runtime_files,
    )


def _loaded_libcudart_evidence(
    version: str, maps_path: Path = Path("/proc/self/maps")
) -> dict[str, str]:
    """Identify the single libcudart mapped into this process and fingerprint it.

    Args:
        version: version string used only to name the evidence record.
        maps_path: memory-map listing to parse; defaults to this process' own.

    Raises:
        RuntimeError: the ``nvidia-cuda-runtime-cu12`` distribution ships no
            libcudart file, the map listing cannot be read, a mapped libcudart is
            deleted/missing or not owned by that distribution, or the number of
            distinct mapped libcudart files is not exactly one.
    """
    distribution = metadata.distribution("nvidia-cuda-runtime-cu12")
    # Resolved on-disk paths of every libcudart.so* file the pinned package owns.
    candidates = {
        Path(distribution.locate_file(entry)).resolve()
        for entry in distribution.files or ()
        if PurePosixPath(entry.as_posix()).name.startswith("libcudart.so")
        and Path(distribution.locate_file(entry)).is_file()
    }
    # Names are taken from the resolved paths, so a mapping is matched by the
    # resolved file's basename.
    candidate_names = {path.name for path in candidates}
    if not candidates or not candidate_names:
        raise RuntimeError("pinned CUDA runtime distribution has no libcudart payload")

    loaded: set[Path] = set()
    try:
        mappings = maps_path.read_text().splitlines()
    except OSError as exc:
        raise RuntimeError("cannot inspect mapped UCCL runtime libraries") from exc
    for mapping in mappings:
        # Only lines with all six whitespace-separated fields carry a path;
        # maxsplit=5 keeps a path containing spaces in one piece.
        columns = mapping.split(maxsplit=5)
        if len(columns) != 6:
            continue
        raw_path = columns[5]
        deleted = raw_path.endswith(" (deleted)")
        if deleted:
            raw_path = raw_path.removesuffix(" (deleted)")
        mapped = Path(raw_path)
        if mapped.name not in candidate_names:
            continue
        # A libcudart that was unlinked after being mapped cannot be hashed.
        if deleted or not mapped.is_file():
            raise RuntimeError(
                "mapped libcudart is unavailable for content verification"
            )
        resolved = mapped.resolve()
        # Same basename but a different file: a foreign libcudart is loaded.
        if resolved not in candidates:
            raise RuntimeError(
                "mapped libcudart is not owned by the pinned CUDA runtime package"
            )
        # The set collapses repeated mappings of the same file.
        loaded.add(resolved)
    if len(loaded) != 1:
        raise RuntimeError(
            "expected exactly one mapped libcudart from the pinned CUDA runtime"
        )
    return contracts.content_manifest_evidence(
        role="cuda-runtime",
        name=f"nvidia-cuda-runtime-cu12-{version}",
        files=[("libcudart.so", loaded.pop())],
    )
def _uccl_build_evidence(
    version: str, dependency_versions: dict[str, str]
) -> list[dict[str, str]]:
    """Collect content evidence for everything UCCL runs on in this process.

    The list holds, in order: the ``uccl`` distribution payload, the
    ``uccl_deepep`` wrapper sources, the ``intervaltree`` and
    ``sortedcontainers`` distributions, and the mapped CUDA runtime library.
    """
    uccl_distribution = metadata.distribution("uccl")
    distribution_files = []
    for entry in uccl_distribution.files or ():
        logical_name = entry.as_posix()
        if not _is_uccl_runtime_payload(logical_name):
            continue
        if not Path(uccl_distribution.locate_file(entry)).is_file():
            continue
        distribution_files.append((logical_name, uccl_distribution.locate_file(entry)))

    wrapper_root = Path(uccl_deepep.__file__).resolve().parent
    wrapper_files = []
    for source in wrapper_root.rglob("*.py"):
        if source.is_file():
            wrapper_files.append((source.relative_to(wrapper_root).as_posix(), source))

    evidence = []
    evidence.append(
        contracts.content_manifest_evidence(
            role="uccl-distribution",
            name=f"uccl-{version}",
            files=distribution_files,
        )
    )
    evidence.append(
        contracts.content_manifest_evidence(
            role="uccl-wrapper",
            name="uccl-deepep-wrapper",
            files=wrapper_files,
        )
    )
    for package in ("intervaltree", "sortedcontainers"):
        evidence.append(
            _python_dependency_evidence(package, dependency_versions[package])
        )
    evidence.append(
        _loaded_libcudart_evidence(dependency_versions["nvidia-cuda-runtime-cu12"])
    )
    return evidence


def _require_cross_rank_equal(value, label: str) -> None:
    """Fail unless every rank holds the same (JSON-canonicalized) ``value``.

    Raises:
        RuntimeError: the gathered values do not all serialize identically.
    """
    per_rank = [None for _ in range(dist.get_world_size())]
    dist.all_gather_object(per_rank, value)
    encodings = set()
    for item in per_rank:
        # Sorted keys and fixed separators make equal values serialize equally.
        encodings.add(json.dumps(item, sort_keys=True, separators=(",", ":")))
    if len(encodings) != 1:
        raise RuntimeError(f"UCCL {label} differs across ranks")


def _normal_buffer_sizes(hidden: int, world_size: int) -> tuple[int, int]:
    """Apply the wrapped DeepEP dispatch/combine sizing contract for this EP world."""
    bf16_bytes = torch.tensor([], dtype=torch.bfloat16).element_size()
    hidden_bytes = hidden * bf16_bytes
    configs = (
        Buffer.get_dispatch_config(world_size),
        Buffer.get_combine_config(world_size),
    )
    # One buffer serves both directions, so take the larger hint of the two.
    nvl_hints = [
        int(config.get_nvl_buffer_size_hint(hidden_bytes, world_size))
        for config in configs
    ]
    rdma_hints = [
        int(config.get_rdma_buffer_size_hint(hidden_bytes, world_size))
        for config in configs
    ]
    num_nvl_bytes = max(nvl_hints)
    num_rdma_bytes = max(rdma_hints)
    sizes_ok = num_nvl_bytes > 0 and num_rdma_bytes >= 0
    if not sizes_ok:
        raise RuntimeError("UCCL returned invalid normal-mode buffer size hints")
    return num_nvl_bytes, num_rdma_bytes
class UCCLBackend:
    """CollectiveX adapter that drives the UCCL ``uccl_deepep.Buffer`` API.

    Two modes are supported, chosen from ``args.mode``: ``"normal"`` (token-rank
    dispatch/combine) and ``"low-latency"`` (expert-packed dispatch/combine).
    The class-level attributes below are the normal-mode defaults; ``__init__``
    overrides several of them on the instance for low-latency mode.
    """

    name = "uccl"
    # Replaced per instance: staging is enabled only for FP8 dispatch profiles.
    stage_device_work = False
    combine_needs_redispatch = False
    combine_weight_semantics = "unweighted-rank-sum"
    oracle_layout = "token-rank"
    payload_unit = "token-rank"

    def __init__(self, args, rank, world_size, local_rank, device):
        """Resolve the precision profile, allocate the buffer, record provenance.

        ``local_rank`` is accepted but not used by this adapter.

        Raises:
            ValueError: unsupported mode, or low-latency mode requested with a
                non-decode phase or an expert count not divisible by
                ``world_size``.
            RuntimeError: invalid buffer sizing, ``num_sms`` could not be applied,
                or installed content differs across ranks.
        """
        self.args = args
        self.rank = rank
        self.world_size = world_size
        self.device = device
        self.mode = getattr(args, "mode", "normal")
        if self.mode not in {"normal", "low-latency"}:
            raise ValueError(f"unsupported UCCL mode {self.mode!r}")
        # Precision profile ids this adapter accepts, per mode.
        supported_profiles = {
            "normal": {
                "d-bf16.c-bf16",
                "d-fp8-e4m3fn-b128-f32-prequantized.c-bf16",
            },
            "low-latency": {
                "d-bf16.c-bf16",
                "d-fp8-e4m3fn-b128-f32-fused.c-bf16",
                "d-bf16.c-logfmt10-dynamic64",
                "d-fp8-e4m3fn-b128-f32-fused.c-logfmt10-dynamic64",
            },
        }
        self.precision_profile_id, self.communication_precision = (
            ep_precision.resolve_precision(
                args,
                backend=self.name,
                mode=self.mode,
                supported_profiles=supported_profiles[self.mode],
            )
        )
        self._fp8_dispatch = ep_precision.is_low_precision_dispatch(
            self.communication_precision
        )
        self._use_logfmt = ep_precision.uses_logfmt_combine(
            self.communication_precision
        )
        # FP8 receives must be dequantized before combine, which stage() does.
        self.stage_device_work = self._fp8_dispatch

        self.group = dist.group.WORLD
        device_sms = torch.cuda.get_device_properties(device).multi_processor_count
        if self.mode == "low-latency":
            # Fail early if the installed wrapper lacks the keywords used below.
            ep_precision.require_keyword(
                Buffer.low_latency_dispatch,
                "use_fp8",
                api="uccl_deepep.Buffer.low_latency_dispatch",
            )
            ep_precision.require_keyword(
                Buffer.low_latency_combine,
                "use_logfmt",
                api="uccl_deepep.Buffer.low_latency_combine",
            )
            if args.phase != "decode":
                raise ValueError("UCCL low-latency mode only supports the decode ladder")
            # The check requires the expert count to be a multiple of world_size.
            if args.experts % world_size:
                raise ValueError("UCCL low-latency experts must divide the EP group")
            self.combine_needs_redispatch = True
            self.combine_weight_semantics = "gate-weighted-sum"
            self.oracle_layout = "expert-packed"
            self.payload_unit = "token-expert"
            # Fixed per-rank token capacity; also reported by buffer_cap().
            self.max_tokens_per_rank = 128
            # One queue pair per local expert.
            num_qps_per_rank = args.experts // world_size
            num_rdma_bytes = Buffer.get_low_latency_rdma_size_hint(
                self.max_tokens_per_rank, args.hidden, world_size, args.experts
            )
            self.buffer = Buffer(
                self.group,
                num_nvl_bytes=0,
                num_rdma_bytes=num_rdma_bytes,
                low_latency_mode=True,
                num_qps_per_rank=num_qps_per_rank,
                allow_nvlink_for_low_latency_mode=True,
            )
            self.buffer.clean_low_latency_buffer(
                self.max_tokens_per_rank, args.hidden, args.experts
            )
            # No SM count is set in this mode, so the SM fields are recorded as None.
            resource_provenance = {
                "requested_num_sms": None,
                "num_sms": None,
                "sm_fraction": None,
                "tuned_source": "uccl-low-latency-fixed-kernel",
                "num_max_tokens_per_rank": self.max_tokens_per_rank,
                "num_nvl_bytes": 0,
                "num_rdma_bytes": num_rdma_bytes,
            }
        else:
            ep_precision.require_keyword(
                Buffer.dispatch,
                "async_finish",
                api="uccl_deepep.Buffer.dispatch",
            )
            ep_precision.require_keyword(
                Buffer.combine,
                "async_finish",
                api="uccl_deepep.Buffer.combine",
            )
            num_nvl_bytes, num_rdma_bytes = _normal_buffer_sizes(args.hidden, world_size)
            # More ranks than one scale-up domain holds needs an RDMA buffer.
            if world_size > args.scale_up_domain and num_rdma_bytes == 0:
                raise RuntimeError("UCCL scale-out configuration returned no RDMA buffer")
            self.buffer = Buffer(self.group, num_nvl_bytes, num_rdma_bytes)
            # The Buffer class' own num_sms wins when it exists; args.num_sms is
            # only the fallback.
            num_sms = int(getattr(Buffer, "num_sms", args.num_sms))
            try:
                Buffer.set_num_sms(num_sms)
            except Exception as exc:  # pragma: no cover - version dependent
                raise RuntimeError(
                    f"UCCL did not apply requested num_sms={num_sms}: {exc!r}"
                ) from exc
            # Read the value back to prove the setter actually took effect.
            applied_num_sms = int(getattr(Buffer, "num_sms", num_sms))
            if applied_num_sms != num_sms:
                raise RuntimeError(
                    f"UCCL num_sms mismatch: requested={num_sms} applied={applied_num_sms}"
                )
            resource_provenance = {
                "requested_num_sms": num_sms,
                "num_sms": applied_num_sms,
                "sm_fraction": applied_num_sms / device_sms,
                "tuned_source": "uccl-default-num_sms",
                "num_nvl_bytes": num_nvl_bytes,
                "num_rdma_bytes": num_rdma_bytes,
            }
        version = _uccl_version()
        dependency_versions = _uccl_dependency_versions()
        loaded_libraries = _uccl_build_evidence(version, dependency_versions)
        # Every rank must be running the same installed bytes.
        _require_cross_rank_equal(loaded_libraries, "installed content identities")
        self.backend_provenance = {
            "uccl_version": version,
            # Falls back to a package-derived tag when UCCL_COMMIT is unset/empty.
            "uccl_commit": os.environ.get("UCCL_COMMIT") or f"pkg-{version}",
            "uccl_wrapper_commit": os.environ.get("UCCL_WRAPPER_COMMIT"),
            "backend_lineage": "uccl",
            "uccl_dependency_versions": dependency_versions,
            "loaded_libraries": loaded_libraries,
            "mode": self.mode,
            "dispatch_dtype": ep_precision.communication_format(
                self.communication_precision, "dispatch"
            ),
            "combine_dtype": ep_precision.communication_format(
                self.communication_precision, "combine"
            ),
            "resource_mode": "fixed-profile",
            "device_sms": device_sms,
            **resource_provenance,
        }

    def buffer_cap(self, args):
        """Return the per-rank token cap in low-latency mode, otherwise None."""
        return self.max_tokens_per_rank if self.mode == "low-latency" else None

    def make_problem(self, T, idx, weights, x):
        """Bundle one workload point with its dispatch encoding.

        The encoding is computed here so it sits outside the timed calls;
        ``dispatch_x`` is what normal-mode dispatch sends and ``oracle_x`` is the
        semantic tensor.
        """
        encoding = ep_precision.encode_dispatch(
            torch, x, self.communication_precision
        )
        return types.SimpleNamespace(
            T=T,
            x=x,
            dispatch_x=encoding.native_input,
            oracle_x=encoding.semantic,
            dispatch_precision_evidence=encoding.evidence,
            topk_idx=idx.to(torch.int64),
            topk_weights=weights.to(torch.float32),
        )

    def dispatch(self, p):
        """Run one dispatch and return a namespace holding the receive state.

        The low-latency result has ``recv_x``, ``recv_counts`` and ``handle``; the
        normal-mode result additionally has ``recv_topk_idx`` and
        ``recv_topk_weights``.
        """
        if self.mode == "low-latency":
            # The raw tensor is sent: in this mode quantization is done by the
            # backend and switched with use_fp8.
            recv_x, recv_counts, handle, _, _ = self.buffer.low_latency_dispatch(
                p.x,
                p.topk_idx,
                self.max_tokens_per_rank,
                self.args.experts,
                use_fp8=self._fp8_dispatch,  # BF16 control realizes use_fp8=False.
                async_finish=False,
                return_recv_hook=False,
            )
            return types.SimpleNamespace(
                recv_x=recv_x,
                recv_counts=recv_counts,
                handle=handle,
            )
        (
            num_tokens_per_rank,
            num_tokens_per_rdma_rank,
            num_tokens_per_expert,
            is_token_in_rank,
            _,
        ) = self.buffer.get_dispatch_layout(p.topk_idx, self.args.experts)
        recv_x, recv_topk_idx, recv_topk_weights, recv_counts, handle, _ = self.buffer.dispatch(
            p.dispatch_x,
            topk_idx=p.topk_idx,
            topk_weights=p.topk_weights,
            num_tokens_per_rank=num_tokens_per_rank,
            num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
            is_token_in_rank=is_token_in_rank,
            num_tokens_per_expert=num_tokens_per_expert,
            async_finish=False,
        )
        return types.SimpleNamespace(
            recv_x=recv_x,
            recv_topk_idx=recv_topk_idx,
            recv_topk_weights=recv_topk_weights,
            recv_counts=recv_counts,
            handle=handle,
        )

    def stage(self, p, h):
        """Prepare the combine input: the semantic (dequantized) receive tensor."""
        h.combine_input = self._semantic_recv(h, p)

    def combine(self, p, h):
        """Combine ``h.combine_input`` back to the source ranks; return the result."""
        if self.mode == "low-latency":
            combined_x, _, _ = self.buffer.low_latency_combine(
                h.combine_input,
                p.topk_idx,
                p.topk_weights,
                h.handle,
                use_logfmt=self._use_logfmt,
                async_finish=False,
                return_recv_hook=False,
            )
            return combined_x
        combined_x, _, _ = self.buffer.combine(
            h.combine_input, h.handle, async_finish=False
        )
        return combined_x

    def inspect_dispatch(self, p, h):
        """Expose the token-rank receive state for the oracle.

        Reads ``h.recv_topk_idx`` / ``h.recv_topk_weights``, which only the
        normal-mode dispatch result provides.
        """
        valid = h.recv_topk_idx >= 0
        # Shift valid ids by this rank's first expert; negative (invalid) ids are
        # passed through and their weights zeroed below.
        # NOTE(review): presumably recv_topk_idx is rank-local and this makes it
        # global; confirm against the wrapper.
        expert_ids = torch.where(
            valid,
            h.recv_topk_idx + self.rank * (self.args.experts // self.world_size),
            h.recv_topk_idx,
        )
        return types.SimpleNamespace(
            payload=self._semantic_recv(h, p),
            encoded_payload=self._encoded_recv(h),
            scales=self._recv_scales(h),
            expert_ids=expert_ids,
            weights=h.recv_topk_weights.masked_fill(~valid, 0),
            local_expert_counts=torch.tensor(h.recv_counts, device=self.device, dtype=torch.int64),
            ordering_contract="source-rank-major-stable-v1",
        )

    def inspect_expert_dispatch(self, p, h):
        """Expose the expert-packed receive state (low-latency mode only).

        Side effect: caches the per-expert receive counts on ``p.recv_counts``.
        """
        if self.mode != "low-latency":
            raise RuntimeError("expert-packed inspection requires low-latency mode")
        p.recv_counts = tuple(int(value) for value in h.recv_counts.tolist())
        return types.SimpleNamespace(
            payload=self._semantic_recv(h, p),
            encoded_payload=self._encoded_recv(h),
            scales=self._recv_scales(h),
            local_expert_counts=h.recv_counts,
            source_info=h.handle[0],
            layout_range=h.handle[1],
        )

    def combine_transformed(self, p, h, transformed):
        """Combine an oracle-transformed payload in place of the received one."""
        if self.mode == "low-latency":
            # Scatter the transformed rows into a zeroed expert-packed BF16 buffer
            # shaped like the receive payload, at the oracle-recorded positions.
            packed = torch.zeros(
                self._encoded_recv(h).shape,
                dtype=torch.bfloat16,
                device=self._encoded_recv(h).device,
            )
            packed[h.oracle_local_expert_slots, h.oracle_packed_positions] = transformed.to(
                packed.dtype
            )
            combined, _, _ = self.buffer.low_latency_combine(
                packed,
                p.topk_idx,
                p.topk_weights,
                h.handle,
                use_logfmt=self._use_logfmt,
                async_finish=False,
                return_recv_hook=False,
            )
            return combined
        # Only the dtype of the semantic receive is needed here.
        semantic = self._semantic_recv(h, p)
        combined, _, _ = self.buffer.combine(
            transformed.to(semantic.dtype), h.handle, async_finish=False
        )
        return combined

    def recv_tokens(self, h):
        """Count received rows: summed counts (low-latency) or payload rows."""
        if self.mode == "low-latency":
            return int(h.recv_counts.to(torch.int64).sum().item())
        return int(self._encoded_recv(h).shape[0])

    def _encoded_recv(self, h):
        """Return the payload tensor; element 0 when ``recv_x`` is a tuple."""
        return h.recv_x[0] if isinstance(h.recv_x, tuple) else h.recv_x

    def _recv_scales(self, h):
        """Return the scales (element 1 of a tuple ``recv_x``), or None."""
        return h.recv_x[1] if isinstance(h.recv_x, tuple) else None

    def _semantic_recv(self, h, problem=None):
        """Return the BF16 view of the received payload, decoding FP8 once.

        The decode is cached on ``h.recv_semantic``. In low-latency mode the
        receive counts are also cached on ``problem`` when one is given.
        """
        if not self._fp8_dispatch:
            return h.recv_x
        if not hasattr(h, "recv_semantic"):
            if self.mode == "low-latency":
                counts = getattr(problem, "recv_counts", None)
                if counts is None:
                    counts = tuple(int(value) for value in h.recv_counts.tolist())
                    if problem is not None:
                        problem.recv_counts = counts
                # A single workspace is kept on the backend and reused, so the
                # semantic tensors of successive handles share storage.
                workspace = getattr(self, "_ll_semantic_workspace", None)
                if workspace is None:
                    encoded = self._encoded_recv(h)
                    workspace = torch.empty(
                        encoded.shape, dtype=torch.bfloat16, device=encoded.device
                    )
                    self._ll_semantic_workspace = workspace
                h.recv_semantic = ep_precision.dequantize_expert_prefixes(
                    torch,
                    self._encoded_recv(h),
                    self._recv_scales(h),
                    self.communication_precision["dispatch"],
                    counts,
                    workspace,
                )
            else:
                h.recv_semantic = ep_precision.dequantize_dispatch(
                    torch,
                    self._encoded_recv(h),
                    self._recv_scales(h),
                    self.communication_precision["dispatch"],
                )
        return h.recv_semantic

    def oracle_dispatch_payload(self, payload):
        """Return what ``payload`` looks like after this profile's dispatch codec."""
        return ep_precision.encode_dispatch(
            torch, payload, self.communication_precision
        ).semantic

    def precision_evidence(self, problem, view=None):
        """Delegate to ``ep_precision.precision_evidence`` for this profile."""
        return ep_precision.precision_evidence(
            torch,
            profile_id=self.precision_profile_id,
            profile=self.communication_precision,
            problem=problem,
            view=view,
        )

    def finalize(self, rc):
        """Synchronize best-effort, flush output and hard-exit with ``rc``.

        Exit codes outside 0..255 are reported as 1. This never returns.
        """
        # UCCL's proxy teardown can crash after results are written; preserve the real rc.
        try:
            dist.barrier()
        except Exception:
            # Deliberately ignored: a failed barrier must not mask the real rc.
            pass
        sys.stdout.flush()
        sys.stderr.flush()
        # os._exit skips interpreter teardown, which is the crash being avoided.
        os._exit(rc if 0 <= rc <= 255 else 1)
def _check_plan_input(condition: bool, message: str) -> None:
    """Raise AssertionError(message) unless ``condition`` holds.

    AssertionError is kept for backward compatibility with the ``assert``
    statements this replaces, but it is raised explicitly so the validation is
    not stripped when Python runs with ``-O``.
    """
    if not condition:
        raise AssertionError(message)


def physical_count(num_logical: int, num_redundant: int, ep_size: int) -> int:
    """num_logical + redundant, with redundant rounded UP to a multiple of ep_size so the
    physical experts divide evenly across ranks (symmetric dispatch).

    A negative ``num_redundant`` is treated as 0.

    Raises:
        AssertionError: ``ep_size`` is not positive.
    """
    _check_plan_input(ep_size > 0, "ep_size must be a positive integer")
    r = ((max(0, num_redundant) + ep_size - 1) // ep_size) * ep_size
    return num_logical + r


def _contiguous_rank_load(logical_load, ep_size):
    """Per-rank received load WITHOUT EPLB: logical experts placed contiguously
    (experts_per_rank = num_logical/ep_size), so rank r carries its block's total.

    The block size is ``len(logical_load) // ep_size``; callers are expected to
    pass a length that is a multiple of ``ep_size`` (build_plan checks this),
    otherwise the trailing remainder is not counted.
    """
    n = len(logical_load)
    per = n // ep_size
    return [sum(logical_load[r * per:(r + 1) * per]) for r in range(ep_size)]


def build_plan(logical_load, num_physical: int, ep_size: int) -> dict:
    """logical_load: list[float] length num_logical (token-copies per logical expert).
    Returns the replication+placement plan (all pure-Python lists) + before/after balance.

    Args:
        logical_load: load per logical expert; must be non-empty.
        num_physical: total physical slots; >= num_logical and a multiple of ep_size.
        ep_size: number of ranks; positive, and must divide num_logical.

    Returns:
        dict with the sizes, ``replicas`` per logical expert, the
        ``phys2log`` / ``rank_of_phys`` / ``log2phys`` mappings, per-rank load
        before and after, and the max/mean imbalance before and after.

    Raises:
        AssertionError: any of the argument constraints above is violated.
    """
    num_logical = len(logical_load)
    _check_plan_input(ep_size > 0, "ep_size must be a positive integer")
    _check_plan_input(num_logical > 0, "logical_load must not be empty")
    _check_plan_input(num_physical >= num_logical, "num_physical must be >= num_logical")
    _check_plan_input(num_physical % ep_size == 0,
                      "num_physical must be a multiple of ep_size")
    _check_plan_input(num_logical % ep_size == 0,
                      "num_logical must be a multiple of ep_size")
    spp = num_physical // ep_size  # physical slots per rank (fixed)

    # 1) Replica allocation — start one slot per logical expert, then hand each redundant
    #    slot to the expert with the highest CURRENT per-replica load (greedy min-max).
    #    Ties go to the lowest expert id (strict '>' keeps the first maximum).
    replicas = [1] * num_logical
    for _ in range(num_physical - num_logical):
        best, best_lps = 0, -1.0
        for e in range(num_logical):
            lps = logical_load[e] / replicas[e]
            if lps > best_lps:
                best, best_lps = e, lps
        replicas[best] += 1

    # 2) Slots = (per-replica load, logical expert), one per replica.
    slots = []
    for e in range(num_logical):
        lps = logical_load[e] / replicas[e]
        slots.extend((lps, e) for _ in range(replicas[e]))

    # 3) Balanced packing into ep_size bins of EQUAL cardinality (spp each), minimizing the
    #    max per-rank load: heaviest slot first -> least-loaded rank that still has capacity.
    slots.sort(reverse=True)
    rank_slots = [[] for _ in range(ep_size)]
    rank_load = [0.0] * ep_size
    for lps, e in slots:
        # There are exactly ep_size * spp slots, so a rank with room always exists.
        r = min((r for r in range(ep_size) if len(rank_slots[r]) < spp),
                key=lambda r: rank_load[r])
        rank_slots[r].append(e)
        rank_load[r] += lps

    # 4) Rank-major physical numbering -> contiguous placement == this balanced placement.
    phys2log, rank_of_phys = [], []
    for r in range(ep_size):
        for e in rank_slots[r]:
            phys2log.append(e)
            rank_of_phys.append(r)
    log2phys = [[] for _ in range(num_logical)]
    for pid, e in enumerate(phys2log):
        log2phys[e].append(pid)

    before = _contiguous_rank_load(logical_load, ep_size)
    # An all-zero load would make the mean 0; fall back to 1.0 to keep ratios finite.
    total = sum(logical_load) or 1.0
    mean = total / ep_size
    return {
        "num_logical": num_logical, "num_physical": num_physical, "ep_size": ep_size,
        "slots_per_rank": spp, "replicas": replicas, "max_replicas": max(replicas),
        "phys2log": phys2log, "rank_of_phys": rank_of_phys, "log2phys": log2phys,
        "rank_load_after": rank_load, "rank_load_before": before,
        # imbalance = busiest rank / mean (1.0 = perfect). This is the number EPLB cuts.
        "imbalance_before": max(before) / mean, "imbalance_after": max(rank_load) / mean,
        "replicated_experts": sum(1 for r in replicas if r > 1),
    }
def mapping_hash(plan: dict) -> str:
    """Hash the placement fields that fully determine the logical-to-physical remap.

    Returns the SHA-256 hex digest of the key-sorted JSON encoding of the plan's
    ``phys2log``, ``rank_of_phys`` and ``replicas`` entries.
    """
    payload = {
        "phys2log": plan["phys2log"],
        "rank_of_phys": plan["rank_of_phys"],
        "replicas": plan["replicas"],
    }
    return hashlib.sha256(json.dumps(payload, sort_keys=True).encode()).hexdigest()


def remap_rows(indices: list[list[int]], plan: dict) -> list[list[int]]:
    """Pure-Python equivalent of remap_idx for contract verification.

    Row ``t`` maps each logical expert ``e`` to ``log2phys[e][t % len(log2phys[e])]``.
    """
    replicas = plan["log2phys"]
    return [
        [replicas[expert][token % len(replicas[expert])] for expert in row]
        for token, row in enumerate(indices)
    ]


def remap_idx(idx_logical, plan):
    """idx_logical: torch [gt, topk] int64 logical-expert ids (global trace).
    Returns idx_physical [gt, topk]: each token's logical target -> one of that expert's
    physical replicas, SPREAD by global token id (row) so a hot expert's tokens fan out
    across its replicas (= across ranks). Replicas of distinct logical experts are disjoint,
    so a token's top-k physical ids stay distinct (dispatch invariant preserved).

    The lookup tables are created on ``idx_logical.device`` so the function works
    for traces that live on an accelerator as well as on CPU; the result is an
    int64 tensor on that same device.
    """
    import torch
    replicas = plan["replicas"]
    num_logical = len(replicas)
    max_rc = plan["max_replicas"]
    device = idx_logical.device
    rc = torch.tensor(replicas, dtype=torch.int64, device=device)
    # padded [num_logical, max_rc] table of physical ids (pad with replica 0; never indexed
    # past rc[e] because the replica index is taken mod rc[e]). Built as a nested list and
    # converted once instead of writing tensor elements one at a time.
    table = [
        [phys[k] if k < len(phys) else phys[0] for k in range(max_rc)]
        for phys in plan["log2phys"]
    ]
    padded = torch.tensor(table, dtype=torch.int64, device=device).reshape(
        num_logical, max_rc
    )
    gt = idx_logical.shape[0]
    rows = torch.arange(gt, dtype=torch.int64, device=device).unsqueeze(1)  # [gt,1] global token id
    e = idx_logical.to(torch.int64)                                         # [gt,topk]
    ridx = rows % rc[e]                                                     # [gt,topk] replica index
    return padded[e, ridx]                                                  # [gt,topk] physical ids
# --------------------------------------------------------------------------- self-test
# Run as a script: builds a plan for a synthetic skewed load, prints the balance
# figures, asserts the planner invariants, and (when torch imports) checks the
# tensor remap on a sampled trace. Exits 0 on success; a failed gate raises.
if __name__ == "__main__":
    # Synthetic zipf load (popularity ∝ 1/(e+1)) — the case EPLB targets. No torch needed.
    import sys
    NUM_LOGICAL, EP, REDUNDANT = 256, 8, 32
    load = [1.0 / (e + 1) for e in range(NUM_LOGICAL)]
    nphys = physical_count(NUM_LOGICAL, REDUNDANT, EP)
    plan = build_plan(load, nphys, EP)
    print(f"num_logical={NUM_LOGICAL} ep={EP} num_physical={nphys} slots/rank={plan['slots_per_rank']}")
    print(f"replicated experts={plan['replicated_experts']} max_replicas={plan['max_replicas']} "
          f"(hottest expert 0 replicas={plan['replicas'][0]})")
    print(f"per-rank load BEFORE (contiguous): {[round(x,3) for x in plan['rank_load_before']]}")
    print(f"per-rank load AFTER (EPLB): {[round(x,3) for x in plan['rank_load_after']]}")
    print(f"imbalance (max/mean) BEFORE={plan['imbalance_before']:.2f}x AFTER={plan['imbalance_after']:.2f}x")
    # Gates: equal slot cardinality, every logical expert placed, big imbalance cut.
    assert all(plan["replicas"][e] >= 1 for e in range(NUM_LOGICAL))
    assert sum(plan["replicas"]) == nphys
    assert len(plan["phys2log"]) == nphys
    assert all(len(plan["log2phys"][e]) == plan["replicas"][e] for e in range(NUM_LOGICAL))
    # rank-major numbering => contiguous block per rank => rank_of_phys is non-decreasing
    assert plan["rank_of_phys"] == sorted(plan["rank_of_phys"])
    assert plan["imbalance_after"] < plan["imbalance_before"], "EPLB must reduce imbalance"
    assert plan["imbalance_after"] < 1.30, f"EPLB should get within ~30% of perfect, got {plan['imbalance_after']:.2f}"
    # remap (if torch present): distinctness + balanced receive on a sampled zipf trace.
    try:
        import torch
        # Fixed seed so the sampled trace, and therefore the gate, is reproducible.
        g = torch.Generator().manual_seed(0)
        p = torch.tensor(load)
        # Same sampling distribution for each of the 4096 tokens.
        p = (p / p.sum()).expand(4096, NUM_LOGICAL)
        # Sampling without replacement gives 8 distinct logical experts per token.
        idx_l = torch.multinomial(p, 8, replacement=False, generator=g).to(torch.int64)
        idx_p = remap_idx(idx_l, plan)
        assert idx_p.shape == idx_l.shape
        # top-k physical ids distinct per token
        assert all(len(set(row.tolist())) == 8 for row in idx_p), "physical top-k must stay distinct"
        spp = plan["slots_per_rank"]
        recv_before = [0] * EP
        recv_after = [0] * EP
        per_log = NUM_LOGICAL // EP
        # Tally token-copies per rank under contiguous logical placement (before)
        # and under the rank-major physical placement (after).
        for row_l, row_p in zip(idx_l.tolist(), idx_p.tolist()):
            for e in row_l:
                recv_before[e // per_log] += 1
            for pid in row_p:
                recv_after[pid // spp] += 1
        ib = max(recv_before) / (sum(recv_before) / EP)
        ia = max(recv_after) / (sum(recv_after) / EP)
        print(f"sampled-trace receive imbalance BEFORE={ib:.2f}x AFTER={ia:.2f}x")
        assert ia < ib and ia < 1.35, "remap must balance per-rank receive load"
        print("remap self-test: OK")
    except ImportError:
        # Only a missing torch is tolerated; a failed assertion above still propagates.
        print("(torch absent — skipped remap self-test; planner gates passed)")
    print("EPLB self-test: PASS")
    sys.exit(0)
b/experimental/CollectiveX/tests/make_workloads.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 +"""Generate canonical serialized workloads. Runs the stdlib counter generator for +each (routing, global_tokens) in a ladder and writes .npz + .manifest.json into a +dir that runs then consume via `run_ep.py --workload-dir`. One trace is emitted per global-token +count because global token count is part of workload identity. + + python3 tests/make_workloads.py --out-dir /path/to/cx_workloads \\ + --routing uniform --ep 8 --hidden 7168 --topk 8 --experts 256 --seed 67 \\ + --tokens-ladder "1 2 4 8 16 32 64 128 256 512" + +Or by the named v1 workload in configs/workloads.yaml. Explicit dimension flags still override it: + + python3 tests/make_workloads.py --out-dir /path/to/cx_workloads --workload deepseek-v3-v1 --routing uniform --ep 8 + +--id-only prints the content-bound workload_id per ladder point without torch/numpy: + + python3 tests/make_workloads.py --workload deepseek-v3-v1 --ep 8 --id-only + +Generate every routing the suites need by running once per --routing. Idempotent (same id => same +file). The dir is the cross-hardware artifact: copy it to each cluster so all consume identical bytes. +""" +from __future__ import annotations + +import argparse +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +import workload as wl # noqa: E402 + +# Repo root holds configs/ (this file is in tests/). Used only for --workload name resolution. +_REPO = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + + +def resolve_manifest(name): + """Look a workload name up in configs/workloads.yaml and return (hidden, topk, experts). + Searches synthetic + model_derived; expert count = `experts` or (for model-derived) `routed_experts`. + Raises SystemExit with the known names if the manifest is absent. 
Pure PyYAML + stdlib.""" + import yaml + path = os.path.join(_REPO, "configs", "workloads.yaml") + with open(path) as handle: + cfg = yaml.safe_load(handle) + known = [] + for section in ("synthetic", "model_derived"): + sec = cfg.get(section) or {} + known += list(sec) + m = sec.get(name) + if m is None: + continue + experts = m.get("experts", m.get("routed_experts")) + if m.get("hidden") is None or m.get("topk") is None or experts is None: + raise SystemExit(f"workload '{name}' is missing hidden/topk/experts in {path}") + return int(m["hidden"]), int(m["topk"]), int(experts) + raise SystemExit(f"unknown --workload '{name}'; known: {sorted(known)}") + + +def main() -> int: + ap = argparse.ArgumentParser(description="Generate canonical CollectiveX workloads") + ap.add_argument("--out-dir", help="required unless --id-only") + ap.add_argument("--workload", help="named manifest in configs/workloads.yaml (sets hidden/topk/experts)") + ap.add_argument("--routing", default="uniform", choices=["uniform", "zipf"]) + ap.add_argument("--ep", type=int, required=True, help="ep_size (global_tokens = T * ep)") + ap.add_argument("--hidden", type=int, help="override (default 7168, or the --workload's hidden)") + ap.add_argument("--topk", type=int, help="override (default 8, or the --workload's topk)") + ap.add_argument("--experts", type=int, help="override (default 256, or the --workload's experts)") + ap.add_argument("--seed", type=int, default=67) + ap.add_argument("--tokens-ladder", default="1 2 4 8 16 32 64 128 256 512") + ap.add_argument("--id-only", action="store_true", + help="print content-bound workload_id per point without torch/numpy") + a = ap.parse_args() + + # Resolve dims: a named --workload supplies defaults; explicit --hidden/--topk/--experts override + # per field. With neither, fall back to the v1 DeepSeek dimensions (7168/8/256). 
+ base_h, base_t, base_e = (7168, 8, 256) + if a.workload: + base_h, base_t, base_e = resolve_manifest(a.workload) + hidden = a.hidden if a.hidden is not None else base_h + topk = a.topk if a.topk is not None else base_t + experts = a.experts if a.experts is not None else base_e + + if not a.id_only and not a.out_dir: + ap.error("--out-dir is required unless --id-only") + + raw_ladder = [int(token) for token in a.tokens_ladder.replace(",", " ").split()] + if (a.ep <= 0 or min(hidden, topk, experts) <= 0 or topk > experts or experts % a.ep + or not raw_ladder or any(token <= 0 for token in raw_ladder) + or len(raw_ladder) != len(set(raw_ladder))): + ap.error("shape, EP, and token ladder must be positive, divisible, and unique") + ladder = sorted(raw_ladder) + epr = experts // a.ep + label = f"workload={a.workload} " if a.workload else "" + + if a.id_only: + # The stdlib counter generator derives the same content-bound ID on every runtime. + made = [] + for T in ladder: + gt = T * a.ep + wid = wl.compute_workload_id(a.routing, hidden, topk, experts, a.ep, gt, a.seed) + made.append((T, gt, wid)) + print(f" T={T:<5} gt={gt:<6} routing={a.routing} -> {wid}") + print(f"{label}id-only: {len(made)} workload_id(s) " + f"(hidden={hidden} topk={topk} experts={experts} ep={a.ep} routing={a.routing} seed={a.seed})") + return 0 + + os.makedirs(a.out_dir, exist_ok=True) + made = [] + for T in ladder: + gt = T * a.ep + idx, w, man = wl.build_workload(hidden, topk, experts, a.routing, gt, a.seed, epr) + wid = wl.save_workload(a.out_dir, idx, w, man) + made.append((T, gt, wid)) + print(f" T={T:<5} gt={gt:<6} routing={a.routing} -> {wid} " + f"(trace sha {man['checksums']['trace'][:12]})") + print(f"{label}wrote {len(made)} canonical workloads to {a.out_dir} (routing={a.routing}, ep={a.ep})") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/experimental/CollectiveX/tests/probe_precision.py b/experimental/CollectiveX/tests/probe_precision.py new file 
mode 100644 index 000000000..2a92e7079 --- /dev/null +++ b/experimental/CollectiveX/tests/probe_precision.py @@ -0,0 +1,1162 @@ +#!/usr/bin/env python3 +"""Bounded real-hardware capability probe for provisional CollectiveX precision cells.""" +from __future__ import annotations + +import argparse +import datetime as dt +import hashlib +import inspect +import json +import os +import platform +import re +import socket +import sys +import tempfile +from pathlib import Path +from types import SimpleNamespace +from typing import Any + + +HERE = Path(__file__).resolve().parent +ROOT = HERE.parent +sys.path[:0] = [str(HERE), str(ROOT)] + +import artifact_safety # noqa: E402 +import capability # noqa: E402 +import ep_harness # noqa: E402 + + +FORMAT = "collectivex.precision-probe.v1" +PLAN_FORMAT = "collectivex.precision-probe-plan.v1" +CONTROL_FORMAT = "collectivex.precision-probe-control.v1" +RECORD_TYPE = "precision-capability-probe" +PROBE_CONTRACT = "bounded-native-cell-v1" +FENCE_CONTRACT = "caller-event-cross-stream-v1" +SUPPORTED_REASON = "native-probe-passed" +UNSUPPORTED_REASONS = frozenset({ + "backend-construction-failed", + "completion-fence-failed", + "cross-rank-evidence-mismatch", + "native-operation-failed", + "precision-contract-mismatch", + "runtime-identity-mismatch", + "target-not-provisional", + "transport-fallback-detected", + "unsupported-native-api", + "unverified-execution-identity", +}) +BACKENDS = frozenset({ + "deepep", "deepep-v2", "deepep-hybrid", "mori", "uccl", +}) +SHA40 = re.compile(r"[0-9a-f]{40}") +SHA256 = re.compile(r"[0-9a-f]{64}") +IMAGE_DIGEST = re.compile(r"sha256:[0-9a-f]{64}") + + +class ProbeError(RuntimeError): + """A provisional cell did not produce complete native runtime evidence.""" + + def __init__(self, reason: str): + if reason not in UNSUPPORTED_REASONS: + raise ValueError(f"unknown precision probe reason {reason!r}") + super().__init__(reason) + self.reason = reason + + +def _canonical(value: Any) -> bytes: + return 
json.dumps( + value, allow_nan=False, ensure_ascii=True, sort_keys=True, separators=(",", ":") + ).encode("ascii") + + +def _sha(value: Any) -> str: + return hashlib.sha256(_canonical(value)).hexdigest() + + +def _exact_keys(value: Any, expected: set[str], path: str) -> dict[str, Any]: + if not isinstance(value, dict) or set(value) != expected: + raise ValueError(f"{path} fields differ from {FORMAT}") + return value + + +def _text(value: Any, path: str, *, nullable: bool = False) -> str | None: + if nullable and value is None: + return None + if ( + not isinstance(value, str) + or not value + or len(value) > 4096 + or any(ord(character) < 0x20 or ord(character) > 0x7E for character in value) + ): + raise ValueError(f"{path} is not bounded printable ASCII") + return value + + +def _boolean(value: Any, path: str) -> bool: + if type(value) is not bool: + raise ValueError(f"{path} is not boolean") + return value + + +def _integer(value: Any, path: str, minimum: int = 0) -> int: + if type(value) is not int or value < minimum: + raise ValueError(f"{path} is not an integer >= {minimum}") + return value + + +def validate_manifest(document: Any) -> dict[str, Any]: + """Validate the closed probe format without extending publication schemas.""" + doc = _exact_keys(document, { + "evidence", "format", "generated_at", "privacy", "probe_contract", + "record_type", "result", "schema_version", "target", "topology", + }, "probe") + if ( + doc["format"] != FORMAT + or doc["record_type"] != RECORD_TYPE + or doc["schema_version"] != 1 + or doc["probe_contract"] != PROBE_CONTRACT + ): + raise ValueError("probe format, record type, schema, or contract differs") + _text(doc["generated_at"], "probe.generated_at") + target = _exact_keys(doc["target"], { + "backend", "basis", "ep", "mode", "precision_profile", "registry_disposition", + "sku", + }, "probe.target") + if ( + target["backend"] not in BACKENDS + or target["registry_disposition"] != "provisional" + or target["mode"] not in 
{"normal", "low-latency"} + ): + raise ValueError("probe target is not a provisional native adapter cell") + for field in ("basis", "precision_profile", "sku"): + _text(target[field], f"probe.target.{field}") + _integer(target["ep"], "probe.target.ep", 1) + topology = _exact_keys(doc["topology"], { + "gpus_per_node", "nodes", "placement_valid", "scale_up_domain", + "scale_up_transport", "scale_out_transport", "scope", "topology_class", + "transport", "world_size", + }, "probe.topology") + for field in ("gpus_per_node", "nodes", "scale_up_domain", "world_size"): + _integer(topology[field], f"probe.topology.{field}", 1) + for field in ("scale_up_transport", "scope", "topology_class", "transport"): + _text(topology[field], f"probe.topology.{field}") + _text(topology["scale_out_transport"], "probe.topology.scale_out_transport", nullable=True) + _boolean(topology["placement_valid"], "probe.topology.placement_valid") + result = _exact_keys(doc["result"], { + "disposition", "reason", "registry_mutation", "runtime_executed", + "static_inspection_sufficient", + }, "probe.result") + if result["disposition"] not in {"supported", "unsupported"}: + raise ValueError("probe result disposition is invalid") + expected_reason = ( + SUPPORTED_REASON if result["disposition"] == "supported" else result["reason"] + ) + if result["reason"] != expected_reason or ( + result["disposition"] == "unsupported" + and result["reason"] not in UNSUPPORTED_REASONS + ): + raise ValueError("probe result reason is invalid") + if result["registry_mutation"] is not False or result["static_inspection_sufficient"] is not False: + raise ValueError("probe must never mutate or statically promote the registry") + _boolean(result["runtime_executed"], "probe.result.runtime_executed") + privacy = _exact_keys(doc["privacy"], {"contract", "sanitized"}, "probe.privacy") + if privacy != {"contract": "artifact-safety-v1", "sanitized": True}: + raise ValueError("probe privacy contract differs") + if 
result["disposition"] == "supported": + _validate_evidence(doc["evidence"]) + elif doc["evidence"] is not None: + _validate_evidence(doc["evidence"]) + artifact_safety.assert_publication_safe([doc]) + return doc + + +def _validate_evidence(value: Any) -> None: + evidence = _exact_keys(value, { + "api", "completion", "identity", "precision", "transport", + }, "probe.evidence") + api = _exact_keys(evidence["api"], {"calls", "signature_sha256"}, "probe.evidence.api") + if not isinstance(api["calls"], list) or not api["calls"]: + raise ValueError("probe API calls are empty") + for index, call in enumerate(api["calls"]): + item = _exact_keys(call, {"name", "signature"}, f"probe.evidence.api.calls[{index}]") + _text(item["name"], "probe API name") + _text(item["signature"], "probe API signature") + if not isinstance(api["signature_sha256"], str) or not SHA256.fullmatch(api["signature_sha256"]): + raise ValueError("probe API signature digest is invalid") + completion = _exact_keys(evidence["completion"], { + "caller_event_complete", "contract", "mode", "output_finite", + "verifier_stream_complete", + }, "probe.evidence.completion") + if completion["contract"] != FENCE_CONTRACT: + raise ValueError("probe completion contract differs") + _text(completion["mode"], "probe completion mode") + if not all( + _boolean(completion[field], f"probe completion {field}") + for field in ("caller_event_complete", "output_finite", "verifier_stream_complete") + ): + raise ValueError("probe completion evidence did not pass") + identity_record = _exact_keys(evidence["identity"], { + "backend_components", "backend_provenance_sha256", "image_digest", + "image_digest_verified", "image_reference", "source_sha", + }, "probe.evidence.identity") + if not SHA40.fullmatch(str(identity_record["source_sha"])): + raise ValueError("probe source SHA is invalid") + if not IMAGE_DIGEST.fullmatch(str(identity_record["image_digest"])): + raise ValueError("probe image digest is invalid") + 
_text(identity_record["image_reference"], "probe image reference") + if identity_record["image_digest_verified"] is not True: + raise ValueError("probe image digest is unverified") + if not SHA256.fullmatch(str(identity_record["backend_provenance_sha256"])): + raise ValueError("probe backend provenance digest is invalid") + components = identity_record["backend_components"] + if not isinstance(components, list) or not components: + raise ValueError("probe backend component identity is empty") + for component in components: + item = _exact_keys(component, {"revision", "role", "version"}, "probe backend component") + _text(item["role"], "probe backend component role") + _text(item["revision"], "probe backend component revision", nullable=True) + _text(item["version"], "probe backend component version", nullable=True) + if item["revision"] is None and item["version"] is None: + raise ValueError("probe backend component has no identity") + precision = _exact_keys(evidence["precision"], { + "combine", "correctness", "dispatch", "profile_id", + }, "probe.evidence.precision") + _text(precision["profile_id"], "probe precision profile") + for direction in ("dispatch", "combine"): + axis = _exact_keys(precision[direction], { + "accumulator_dtype", "accumulator_evidence", "api_input_dtype", + "api_output_dtype", "communication_format", "runtime_input", + "runtime_output", "scale_contract", "semantic_output", + }, f"probe.evidence.precision.{direction}") + for field in ( + "accumulator_dtype", "accumulator_evidence", "api_input_dtype", + "api_output_dtype", "communication_format", + ): + _text(axis[field], f"probe precision {direction} {field}") + _validate_tensor_summary(axis["runtime_input"], f"probe precision {direction} input") + _validate_tensor_summary(axis["runtime_output"], f"probe precision {direction} output") + _validate_tensor_summary(axis["semantic_output"], f"probe precision {direction} semantic") + scale = _exact_keys(axis["scale_contract"], { + "alignment", 
"dtype", "finite", "group_size", "layout", "padding", + "positive", "runtime_shapes", "runtime_storage_dtype", + }, f"probe precision {direction} scales") + for field in ("alignment", "layout", "padding"): + _text(scale[field], f"probe precision {direction} scale {field}") + _text(scale["dtype"], "probe scale dtype", nullable=True) + _text(scale["runtime_storage_dtype"], "probe scale storage", nullable=True) + if scale["group_size"] is not None: + _integer(scale["group_size"], "probe scale group", 1) + for field in ("finite", "positive"): + if scale[field] is not None: + _boolean(scale[field], f"probe scale {field}") + if not isinstance(scale["runtime_shapes"], list): + raise ValueError("probe scale shapes are invalid") + correctness = precision["correctness"] + if not isinstance(correctness, dict) or correctness.get("passed") is not True: + raise ValueError("probe precision correctness did not pass") + transport = _exact_keys(evidence["transport"], { + "evidence", "fallback_used", "native_backend", "requested", "runtime_route", + }, "probe.evidence.transport") + for field in ("native_backend", "requested", "runtime_route"): + _text(transport[field], f"probe transport {field}") + if transport["fallback_used"] is not False: + raise ValueError("probe transport fallback is present") + if not isinstance(transport["evidence"], list) or not transport["evidence"]: + raise ValueError("probe transport evidence is empty") + for item in transport["evidence"]: + _text(item, "probe transport evidence item") + + +def _validate_tensor_summary(value: Any, path: str) -> None: + summary = _exact_keys(value, {"finite", "rank", "shapes", "storage_dtype"}, path) + _text(summary["storage_dtype"], f"{path}.storage_dtype") + _integer(summary["rank"], f"{path}.rank", 0) + if summary["finite"] is not True: + raise ValueError(f"{path} is not finite") + if not isinstance(summary["shapes"], list) or not summary["shapes"]: + raise ValueError(f"{path} shapes are empty") + for shape in 
summary["shapes"]: + if not isinstance(shape, list) or any(type(item) is not int or item < 0 for item in shape): + raise ValueError(f"{path} shape is invalid") + + +def provisional_targets() -> list[dict[str, Any]]: + """Return deterministic probe cells without changing their dispositions.""" + return sorted( + capability.provisional_precision_targets(), + key=lambda item: ( + item["sku"], item["backend"], item["ep"], item["mode"], + item["precision_profile"], + ), + ) + + +def _probe_id(target: dict[str, Any]) -> str: + return f"probe-{_sha({key: target[key] for key in ('backend', 'sku', 'ep', 'mode', 'precision_profile')})[:20]}" + + +def _workflow_row(target: dict[str, Any]) -> dict[str, Any]: + topology = capability.topology_for(target["sku"], target["ep"]) + if topology is None: + raise ValueError("precision probe target has no registered topology") + return { + "backend": target["backend"], + "basis": target["basis"], + "disposition": target["disposition"], + "ep": target["ep"], + "execution_weight": target["ep"], + "gpus_per_node": topology["gpus_per_node"], + "id": _probe_id(target), + "launcher": capability.PLATFORMS[target["sku"]]["launcher"], + "mode": target["mode"], + "n": 1, + "nodes": topology["nodes"], + "precision_profile": target["precision_profile"], + "scale_up_domain": topology["scale_up_domain"], + "sku": target["sku"], + } + + +def workflow_plan(*, backend: str = "all", only_sku: str = "") -> dict[str, Any]: + targets = [ + target for target in provisional_targets() + if (backend == "all" or target["backend"] == backend) + and (not only_sku or target["sku"] == only_sku) + ] + if backend != "all" and backend not in BACKENDS: + raise ValueError("precision probe backend is not registered") + if only_sku and only_sku not in capability.PLATFORMS: + raise ValueError("precision probe SKU is not registered") + if not targets: + raise ValueError("precision probe filters select no provisional cells") + return { + "format": PLAN_FORMAT, + "include": 
[_workflow_row(target) for target in targets], + "schema_version": 1, + } + + +def validate_workflow_plan(document: Any) -> dict[str, Any]: + plan = _exact_keys(document, {"format", "include", "schema_version"}, "probe plan") + if plan["format"] != PLAN_FORMAT or plan["schema_version"] != 1: + raise ValueError("precision probe plan format differs") + if not isinstance(plan["include"], list): + raise ValueError("precision probe plan include is not a list") + expected = {_probe_id(target): _workflow_row(target) for target in provisional_targets()} + seen: set[str] = set() + for row in plan["include"]: + if not isinstance(row, dict) or row.get("id") not in expected or row != expected[row["id"]]: + raise ValueError("precision probe plan row differs from the capability registry") + if row["id"] in seen: + raise ValueError("precision probe plan contains a duplicate row") + seen.add(row["id"]) + return plan + + +def extract_control( + plan: Any, *, probe_id: str, sku: str, backend: str, nodes: int, +) -> dict[str, Any]: + rows = [row for row in validate_workflow_plan(plan)["include"] if row["id"] == probe_id] + if len(rows) != 1: + raise ValueError("precision probe ID is not unique in the plan") + row = rows[0] + if (row["sku"], row["backend"], row["nodes"]) != (sku, backend, nodes): + raise ValueError("precision probe control differs from the workflow matrix") + target = select_target( + backend=row["backend"], sku=row["sku"], ep=row["ep"], mode=row["mode"], + precision_profile=row["precision_profile"], + ) + topology = capability.topology_for(row["sku"], row["ep"]) + if topology is None: + raise ValueError("precision probe control has no topology") + return { + "format": CONTROL_FORMAT, + "id": row["id"], + "launcher": row["launcher"], + "schema_version": 1, + "target": target, + "topology": topology, + } + + +def validate_control( + document: Any, *, sku: str, backend: str, nodes: int, +) -> dict[str, Any]: + control = _exact_keys( + document, {"format", "id", 
"launcher", "schema_version", "target", "topology"}, + "probe control", + ) + if control["format"] != CONTROL_FORMAT or control["schema_version"] != 1: + raise ValueError("precision probe control format differs") + expected = extract_control( + {"format": PLAN_FORMAT, "include": [_workflow_row(control["target"])], "schema_version": 1}, + probe_id=control["id"], sku=sku, backend=backend, nodes=nodes, + ) + if control != expected: + raise ValueError("precision probe control differs from the capability registry") + return control + + +def validate_bundle(plan: Any, manifests: list[Any]) -> None: + rows = validate_workflow_plan(plan)["include"] + expected = { + (row["backend"], row["sku"], row["ep"], row["mode"], row["precision_profile"]) + for row in rows + } + observed = [] + for manifest in manifests: + target = validate_manifest(manifest)["target"] + observed.append(( + target["backend"], target["sku"], target["ep"], target["mode"], + target["precision_profile"], + )) + if len(observed) != len(set(observed)) or set(observed) != expected: + raise ValueError("precision probe bundle does not cover the exact workflow plan") + + +def select_target( + *, backend: str, sku: str, ep: int, mode: str, precision_profile: str +) -> dict[str, Any]: + matches = [ + item for item in provisional_targets() + if item["backend"] == backend and item["sku"] == sku and item["ep"] == ep + and item["mode"] == mode and item["precision_profile"] == precision_profile + ] + if len(matches) != 1: + raise ProbeError("target-not-provisional") + return matches[0] + + +def _target_record(target: dict[str, Any]) -> dict[str, Any]: + return { + "backend": target["backend"], + "basis": target["basis"], + "ep": target["ep"], + "mode": target["mode"], + "precision_profile": target["precision_profile"], + "registry_disposition": "provisional", + "sku": target["sku"], + } + + +def build_manifest( + *, target: dict[str, Any], topology: dict[str, Any], disposition: str, + reason: str, runtime_executed: 
bool, evidence: dict[str, Any] | None, +) -> dict[str, Any]: + document = { + "evidence": evidence, + "format": FORMAT, + "generated_at": dt.datetime.now(dt.timezone.utc).isoformat(), + "privacy": {"contract": "artifact-safety-v1", "sanitized": True}, + "probe_contract": PROBE_CONTRACT, + "record_type": RECORD_TYPE, + "result": { + "disposition": disposition, + "reason": reason, + "registry_mutation": False, + "runtime_executed": runtime_executed, + "static_inspection_sufficient": False, + }, + "schema_version": 1, + "target": _target_record(target), + "topology": topology, + } + return validate_manifest(document) + + +def _write_atomic(path: Path, document: dict[str, Any]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + descriptor, temporary = tempfile.mkstemp(prefix=f".{path.name}.", dir=path.parent) + try: + os.fchmod(descriptor, 0o600) + with os.fdopen(descriptor, "wb") as handle: + handle.write(_canonical(document) + b"\n") + handle.flush() + os.fsync(handle.fileno()) + os.replace(temporary, path) + finally: + if os.path.exists(temporary): + os.unlink(temporary) + + +def _dtype_name(dtype: Any) -> str: + return str(dtype).removeprefix("torch.") + + +def _local_tensor_summary(torch_module, tensor) -> dict[str, Any]: + return { + "finite": bool(torch_module.isfinite(tensor.float()).all().item()), + "rank": int(tensor.ndim), + "shape": [int(item) for item in tensor.shape], + "storage_dtype": _dtype_name(tensor.dtype), + } + + +def _aggregate_tensor_summaries(records: list[dict[str, Any]]) -> dict[str, Any]: + dtypes = {record["storage_dtype"] for record in records} + ranks = {record["rank"] for record in records} + if len(dtypes) != 1 or len(ranks) != 1: + raise ProbeError("cross-rank-evidence-mismatch") + return { + "finite": all(record["finite"] for record in records), + "rank": ranks.pop(), + "shapes": sorted({tuple(record["shape"]) for record in records}), + "storage_dtype": dtypes.pop(), + } + + +def _scale_contract(torch_module, axis: dict[str, 
Any], scales) -> dict[str, Any]: + return { + "alignment": axis["alignment_contract"], + "dtype": axis["scale_dtype"], + "finite": ( + bool(torch_module.isfinite(scales.float()).all().item()) + if scales is not None else None + ), + "group_size": axis["scale_group_size"], + "layout": axis["scale_layout"], + "padding": axis["padding_contract"], + "positive": bool((scales > 0).all().item()) if scales is not None else None, + "runtime_shape": [int(item) for item in scales.shape] if scales is not None else None, + "runtime_storage_dtype": _dtype_name(scales.dtype) if scales is not None else None, + } + + +def _aggregate_scale_contracts(records: list[dict[str, Any]]) -> dict[str, Any]: + fixed_fields = ( + "alignment", "dtype", "group_size", "layout", "padding", "runtime_storage_dtype", + ) + result: dict[str, Any] = {} + for field in fixed_fields: + values = {_canonical(record[field]) for record in records} + if len(values) != 1: + raise ProbeError("cross-rank-evidence-mismatch") + result[field] = records[0][field] + for field in ("finite", "positive"): + values = [record[field] for record in records] + result[field] = None if all(value is None for value in values) else all(value is True for value in values) + result["runtime_shapes"] = sorted({ + tuple(record["runtime_shape"] or ()) for record in records + }) if any(record["runtime_shape"] is not None for record in records) else [] + return result + + +def _signature(callable_object: Any, name: str) -> dict[str, str]: + try: + signature = str(inspect.signature(callable_object)) + except (TypeError, ValueError) as exc: + raise ProbeError("unsupported-native-api") from exc + _text(signature, f"native API {name} signature") + return {"name": name, "signature": signature} + + +def _api_evidence(backend_name: str, backend) -> dict[str, Any]: + if backend_name in {"deepep", "uccl"}: + native = type(backend.buffer) + dispatch_name = "low_latency_dispatch" if backend.mode == "low-latency" else "dispatch" + combine_name = 
"low_latency_combine" if backend.mode == "low-latency" else "combine" + calls = [ + _signature(native.__init__, f"{native.__name__}.__init__"), + _signature(getattr(native, dispatch_name), f"{native.__name__}.{dispatch_name}"), + _signature(getattr(native, combine_name), f"{native.__name__}.{combine_name}"), + ] + elif backend_name in {"deepep-v2", "deepep-hybrid"}: + native = type(backend.buffer) + calls = [ + _signature(native.__init__, f"{native.__name__}.__init__"), + _signature(native.dispatch, f"{native.__name__}.dispatch"), + _signature(native.combine, f"{native.__name__}.combine"), + ] + elif backend_name == "mori": + native = type(backend.op) + calls = [ + _signature(type(backend.config).__init__, "EpDispatchCombineConfig.__init__"), + _signature(native.dispatch, f"{native.__name__}.dispatch"), + _signature(native.combine, f"{native.__name__}.combine"), + ] + else: # pragma: no cover - guarded by target registry + raise ProbeError("unsupported-native-api") + return {"calls": calls, "signature_sha256": _sha(calls)} + + +def _completion_mode(backend_name: str, mode: str) -> str: + if backend_name in {"deepep", "uccl"}: + return "async_finish=false;return_recv_hook=false" + if backend_name == "deepep-v2": + return "async_with_compute_stream=false;do_cpu_sync=true" + if backend_name == "deepep-hybrid": + return "metadata-nonblocking=false;caller-stream-ordered" + if backend_name == "mori": + return "current-stream-ordered" + raise ProbeError("unsupported-native-api") + + +def _transport_evidence(backend_name: str, backend, args) -> dict[str, Any]: + provenance = backend.backend_provenance + fallback = False + facts: list[str] + if backend_name == "deepep": + if args.scope == "scale-out" and int(provenance["num_rdma_bytes"]) <= 0: + fallback = True + if args.scale_up_transport == "mnnvl" and provenance["mnnvl_comm"] != "explicit-allow-mnnvl": + fallback = True + route = f"deepep-{backend.mode}" + facts = [ + f"mnnvl={provenance['mnnvl_comm']}", + 
f"nvl-buffer={int(provenance['num_nvl_bytes']) > 0}", + f"rdma-buffer={int(provenance['num_rdma_bytes']) > 0}", + ] + elif backend_name == "uccl": + scratch_location = str(backend.buffer.scratch.device.type) + rdma_active = int(provenance["num_rdma_bytes"]) > 0 + fallback = rdma_active and scratch_location != "cuda" + route = f"uccl-proxy-{backend.mode}" + facts = [f"rdma-buffer={rdma_active}", f"rdma-memory={scratch_location}"] + elif backend_name == "deepep-v2": + expected_gin = args.scope == "scale-out" + fallback = bool(provenance["gin_enabled"]) != expected_gin + route = str(provenance["communication_backend"]) + facts = [ + f"gin-enabled={bool(provenance['gin_enabled'])}", + f"nccl-communicator={provenance['nccl_communicator']}", + ] + elif backend_name == "deepep-hybrid": + route = str(provenance["transport"]) + expected_build = "multinode-doca" if args.scope == "scale-out" else "intradomain" + realized_build = os.environ.get("DEEPEP_HYBRID_BUILD_MODE") + fallback = realized_build != expected_build + facts = [f"build-mode={realized_build or 'missing'}", f"domains={backend.communication_domains}"] + elif backend_name == "mori": + route = str(backend.kernel_generation) + expected_kernel = ( + "inter-node-v1" if args.scope == "scale-out" + else "async-ll" if args.runner == "mi325x" else "intranode" + ) + fallback = route != expected_kernel + facts = [ + f"kernel={route}", + f"external-input={bool(provenance['use_external_inp_buf'])}", + f"qps={int(provenance['num_qps'])}", + ] + else: # pragma: no cover - guarded by registry + raise ProbeError("unsupported-native-api") + return { + "evidence": sorted(facts), + "fallback_used": fallback, + "native_backend": backend_name, + "requested": str(args.transport), + "runtime_route": route, + } + + +def _component_identities(backend_name: str, provenance: dict[str, Any]) -> list[dict[str, Any]]: + if backend_name == "deepep": + values = [("deepep", provenance.get("deepep_commit"), provenance.get("deepep_version"))] + 
elif backend_name == "deepep-v2": + values = [ + ("deepep-v2", provenance.get("deepep_commit"), provenance.get("deepep_version")), + ("deepep-tree", provenance.get("deepep_tree"), None), + ("fmt", provenance.get("fmt_commit"), None), + ] + elif backend_name == "deepep-hybrid": + values = [ + ("deepep-hybrid", provenance.get("deepep_commit"), None), + ("deepep-tree", provenance.get("deepep_tree"), None), + ] + elif backend_name == "uccl": + values = [ + ("uccl", provenance.get("uccl_commit"), provenance.get("uccl_version")), + ("uccl-wrapper", provenance.get("uccl_wrapper_commit"), None), + ] + elif backend_name == "mori": + values = [("mori", provenance.get("mori_commit"), None)] + else: # pragma: no cover + raise ProbeError("unsupported-native-api") + result = [ + {"revision": revision, "role": role, "version": version} + for role, revision, version in values + ] + for item in result: + if item["revision"] is None and item["version"] is None: + raise ProbeError("unverified-execution-identity") + return result + + +def _execution_identity(backend_name: str, backend) -> dict[str, Any]: + source_sha = os.environ.get("COLLECTIVEX_SOURCE_SHA") or os.environ.get("GITHUB_SHA") + image_reference = os.environ.get("COLLECTIVEX_IMAGE") + image_digest = os.environ.get("COLLECTIVEX_IMAGE_DIGEST") + verified = os.environ.get("COLLECTIVEX_IMAGE_DIGEST_VERIFIED") == "1" + if ( + not isinstance(source_sha, str) or not SHA40.fullmatch(source_sha) + or not isinstance(image_reference, str) or not image_reference + or not isinstance(image_digest, str) or not IMAGE_DIGEST.fullmatch(image_digest) + or not verified + ): + raise ProbeError("unverified-execution-identity") + provenance = backend.backend_provenance + return { + "backend_components": _component_identities(backend_name, provenance), + "backend_provenance_sha256": _sha(provenance), + "image_digest": image_digest, + "image_digest_verified": True, + "image_reference": image_reference, + "source_sha": source_sha, + } + + +def 
_correctness_aggregate(records: list[dict[str, Any]]) -> dict[str, Any]: + profile_ids = {record["profile_id"] for record in records} + if len(profile_ids) != 1: + raise ProbeError("cross-rank-evidence-mismatch") + result: dict[str, Any] = {"profile_id": profile_ids.pop()} + for direction in ("dispatch", "combine"): + axes = [record[direction] for record in records] + scale_finite = [axis["scales_finite"] for axis in axes] + scale_positive = [axis["scales_positive"] for axis in axes] + result[direction] = { + "dequantized_semantics": all(axis["dequantized_semantics"] for axis in axes), + "encoded_payload_valid": all(axis["encoded_payload_valid"] for axis in axes), + "max_abs_error": max(float(axis["max_abs_error"]) for axis in axes), + "max_rel_error": max(float(axis["max_rel_error"]) for axis in axes), + "passed": all(axis["passed"] for axis in axes), + "saturation_count": sum(int(axis["saturation_count"]) for axis in axes), + "saturation_rate": max(float(axis["saturation_rate"]) for axis in axes), + "scales_finite": ( + None if all(value is None for value in scale_finite) + else all(value is True for value in scale_finite) + ), + "scales_positive": ( + None if all(value is None for value in scale_positive) + else all(value is True for value in scale_positive) + ), + } + result["passed"] = all(record["passed"] for record in records) and all( + result[direction]["passed"] for direction in ("dispatch", "combine") + ) + return result + + +def _topology_record(topology: dict[str, Any], placement_valid: bool) -> dict[str, Any]: + return { + "gpus_per_node": topology["gpus_per_node"], + "nodes": topology["nodes"], + "placement_valid": placement_valid, + "scale_up_domain": topology["scale_up_domain"], + "scale_up_transport": topology["scale_up_transport"], + "scale_out_transport": topology["scale_out_transport"], + "scope": topology["scope"], + "topology_class": topology["topology_class"], + "transport": topology["transport"], + "world_size": topology["nodes"] * 
topology["gpus_per_node"], + } + + +def _backend_class(name: str): + if name == "deepep": + from ep_deepep import DeepEPBackend + return DeepEPBackend + if name == "deepep-v2": + from ep_deepep_v2 import DeepEPV2Backend + return DeepEPV2Backend + if name == "deepep-hybrid": + from ep_deepep_hybrid import DeepEPHybridBackend + return DeepEPHybridBackend + if name == "uccl": + from ep_uccl import UCCLBackend + return UCCLBackend + if name == "mori": + from ep_mori import MoRIBackend + return MoRIBackend + raise ProbeError("unsupported-native-api") + + +def _runtime_args(target: dict[str, Any], topology: dict[str, Any], fingerprint: dict[str, Any]): + return SimpleNamespace( + backend=target["backend"], + eplb=False, + experts=256, + gpus_per_node=topology["gpus_per_node"], + hidden=7168, + mode=target["mode"], + num_logical_experts=256, + num_sms=24, + phase="decode", + precision_profile=target["precision_profile"], + runner=target["sku"], + runtime_fingerprint=fingerprint, + scale_out_transport=topology["scale_out_transport"] or "", + scale_up_domain=topology["scale_up_domain"], + scale_up_transport=topology["scale_up_transport"], + scope=topology["scope"], + tokens_ladder="8", + topk=8, + topology_class=topology["topology_class"], + transport=topology["transport"], + ) + + +def _init_distributed(torch_module, dist, backend_name: str, device, rank: int, world_size: int) -> None: + if dist.is_initialized(): + return + if backend_name == "mori": + dist.init_process_group( + backend="cpu:gloo,cuda:nccl", rank=rank, world_size=world_size, device_id=device + ) + elif backend_name == "deepep-v2": + dist.init_process_group("nccl", device_id=device) + else: + dist.init_process_group("nccl") + + +def _runtime_context(torch_module, dist, target: dict[str, Any], device, local_rank: int): + import run_ep + + world_size = dist.get_world_size() + topology = capability.topology_for(target["sku"], target["ep"]) + if topology is None or world_size != target["ep"]: + raise 
def _local_probe(torch_module, dist, target: dict[str, Any], backend, args, rank: int):
    """Run one dispatch/stage/combine round on this rank and collect its evidence.

    Builds an 8-token-per-rank uniform routing problem, checks it against the
    expert oracle, executes the backend on a dedicated caller stream, and has a
    second verifier stream consume the combined output behind a CUDA event.
    Returns a dict with ``api``, ``completion``, ``identity``, ``precision``,
    and ``transport`` evidence for this rank. ``dist`` is accepted but not
    used here.

    Raises:
        ProbeError: ``precision-contract-mismatch`` when the oracle or its
            precision record did not pass, ``completion-fence-failed`` when
            any completion flag is false, ``transport-fallback-detected`` when
            the transport evidence reports a fallback.
    """
    import routing

    tokens = 8
    global_idx, global_weights = routing.build_global_routing(
        tokens * target["ep"], args.experts, args.topk, "uniform", ep_harness.ROUTING_SEED
    )
    local_idx, local_weights = routing.rank_slice(
        global_idx, global_weights, rank, tokens
    )
    x = routing.rank_activations(
        tokens, args.hidden, ep_harness.ROUTING_SEED, rank,
        torch_module.device(f"cuda:{int(os.environ.get('LOCAL_RANK', '0'))}"),
        torch_module.bfloat16,
    )
    problem = backend.make_problem(
        tokens, local_idx.to(x.device), local_weights.to(x.device), x
    )
    oracle = ep_harness._run_expert_oracle(
        torch_module, routing, backend, problem, global_idx, global_weights, rank,
        args.experts // target["ep"], ep_harness.ROUTING_SEED,
    )
    if not oracle["passed"] or not oracle["_precision"]["passed"]:
        raise ProbeError("precision-contract-mismatch")

    caller = torch_module.cuda.Stream(device=x.device)
    verifier = torch_module.cuda.Stream(device=x.device)
    completion_event = torch_module.cuda.Event()
    with torch_module.cuda.stream(caller):
        handle = backend.dispatch(problem)
        problem.recv_tokens = backend.recv_tokens(handle)
        view = (
            backend.inspect_expert_dispatch(problem, handle)
            if target["mode"] == "low-latency"
            else backend.inspect_dispatch(problem, handle)
        )
        backend.stage(problem, handle)
        combined = backend.combine(problem, handle)
        completion_event.record(caller)
    # The verifier stream may only touch `combined` after the caller-side event.
    with torch_module.cuda.stream(verifier):
        verifier.wait_event(completion_event)
        verifier_sentinel = combined.float().abs().sum()
    verifier.synchronize()
    completion = {
        "caller_event_complete": bool(completion_event.query()),
        "contract": FENCE_CONTRACT,
        "mode": _completion_mode(target["backend"], target["mode"]),
        "output_finite": bool(torch_module.isfinite(combined.float()).all().item()),
        "verifier_stream_complete": bool(torch_module.isfinite(verifier_sentinel).item()),
    }
    if not all(
        completion[field]
        for field in ("caller_event_complete", "output_finite", "verifier_stream_complete")
    ):
        raise ProbeError("completion-fence-failed")

    deferred = getattr(backend, "capture_deferred_provenance", None)
    if deferred is not None:
        deferred()

    if isinstance(problem.dispatch_x, tuple):
        dispatch_input = problem.dispatch_x[0]
        dispatch_input_scales = problem.dispatch_x[1]
    else:
        dispatch_input = problem.dispatch_x
        # Select by identity, never by truthiness: `a or b` on a tensor with
        # more than one element raises, and a falsy tensor would be silently
        # replaced by the fallback attribute.
        dispatch_input_scales = getattr(problem, "dispatch_scales", None)
        if dispatch_input_scales is None:
            dispatch_input_scales = getattr(problem, "scales", None)

    dispatch_axis = backend.communication_precision["dispatch"]
    combine_axis = backend.communication_precision["combine"]
    local = {
        "api": _api_evidence(target["backend"], backend),
        "completion": completion,
        "identity": _execution_identity(target["backend"], backend),
        "precision": {
            "profile_id": backend.precision_profile_id,
            "correctness": oracle["_precision"],
            "dispatch": {
                "accumulator_dtype": "not-applicable",
                "accumulator_evidence": "not-applicable",
                "api_input_dtype": dispatch_axis["api_input_dtype"],
                "api_output_dtype": dispatch_axis["api_output_dtype"],
                "communication_format": dispatch_axis["communication_format"],
                "runtime_input": _local_tensor_summary(torch_module, dispatch_input),
                "runtime_output": _local_tensor_summary(torch_module, view.encoded_payload),
                # Scales reported by the dispatch view win over the input-side scales.
                "scale_contract": _scale_contract(
                    torch_module, dispatch_axis,
                    view.scales if view.scales is not None else dispatch_input_scales,
                ),
                "semantic_output": _local_tensor_summary(torch_module, view.payload),
            },
            "combine": {
                "accumulator_dtype": "fp32",
                "accumulator_evidence": "pinned-source-image-plus-runtime-oracle",
                "api_input_dtype": combine_axis["api_input_dtype"],
                "api_output_dtype": combine_axis["api_output_dtype"],
                "communication_format": combine_axis["communication_format"],
                "runtime_input": _local_tensor_summary(torch_module, handle.combine_input),
                "runtime_output": _local_tensor_summary(torch_module, combined),
                "scale_contract": _scale_contract(torch_module, combine_axis, None),
                "semantic_output": _local_tensor_summary(torch_module, combined),
            },
        },
        "transport": _transport_evidence(target["backend"], backend, args),
    }
    if local["transport"]["fallback_used"]:
        raise ProbeError("transport-fallback-detected")
    return local
raise ProbeError("cross-rank-evidence-mismatch") + profile_ids = {record["precision"]["profile_id"] for record in records} + if len(profile_ids) != 1: + raise ProbeError("cross-rank-evidence-mismatch") + precision: dict[str, Any] = { + "profile_id": profile_ids.pop(), + "correctness": _correctness_aggregate([ + record["precision"]["correctness"] for record in records + ]), + } + for direction in ("dispatch", "combine"): + axes = [record["precision"][direction] for record in records] + fixed = ( + "accumulator_dtype", "accumulator_evidence", "api_input_dtype", + "api_output_dtype", "communication_format", + ) + result: dict[str, Any] = {} + for field in fixed: + if len({axis[field] for axis in axes}) != 1: + raise ProbeError("cross-rank-evidence-mismatch") + result[field] = axes[0][field] + for field in ("runtime_input", "runtime_output", "semantic_output"): + result[field] = _aggregate_tensor_summaries([axis[field] for axis in axes]) + result["scale_contract"] = _aggregate_scale_contracts([ + axis["scale_contract"] for axis in axes + ]) + precision[direction] = result + return { + "api": records[0]["api"], + "completion": records[0]["completion"], + "identity": records[0]["identity"], + "precision": precision, + "transport": records[0]["transport"], + } + + +def _finalize(backend, dist) -> None: + if backend is not None: + backend.finalize(0) + return + if dist.is_initialized(): + dist.destroy_process_group() + + +def run_target(target: dict[str, Any], output: Path) -> int: + try: + import torch + import torch.distributed as dist + except Exception as exc: # pragma: no cover - diagnostic runtime requirement + raise ProbeError("runtime-identity-mismatch") from exc + rank = int(os.environ.get("RANK", "0")) + world_size = int(os.environ.get("WORLD_SIZE", "1")) + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + os.environ.setdefault("MASTER_ADDR", "localhost") + os.environ.setdefault("MASTER_PORT", "12355") + torch.cuda.set_device(local_rank) + device = 
torch.device(f"cuda:{local_rank}") + _init_distributed(torch, dist, target["backend"], device, rank, world_size) + backend = None + topology = capability.topology_for(target["sku"], target["ep"]) + topology_record = _topology_record(topology, False) if topology is not None else { + "gpus_per_node": 1, "nodes": target["ep"], "placement_valid": False, + "scale_up_domain": 1, "scale_up_transport": "unknown", + "scale_out_transport": None, "scope": "scale-out", + "topology_class": "unknown", "transport": "unknown", "world_size": target["ep"], + } + try: + topology, placement, fingerprint = _runtime_context( + torch, dist, target, device, local_rank + ) + topology_record = _topology_record(topology, bool(placement["valid"])) + args = _runtime_args(target, topology, fingerprint) + try: + backend = _backend_class(target["backend"])( + args, rank, world_size, local_rank, device + ) + construction = {"ok": True} + except Exception: + construction = {"ok": False, "reason": "backend-construction-failed"} + gathered: list[Any] = [None] * world_size + dist.all_gather_object(gathered, construction) + if not all(record.get("ok") is True for record in gathered): + manifest = build_manifest( + target=target, topology=topology_record, disposition="unsupported", + reason="backend-construction-failed", runtime_executed=True, evidence=None, + ) + else: + try: + local = {"ok": True, "evidence": _local_probe( + torch, dist, target, backend, args, rank + )} + except ProbeError as exc: + local = {"ok": False, "reason": exc.reason} + except Exception: + local = {"ok": False, "reason": "native-operation-failed"} + gathered = [None] * world_size + dist.all_gather_object(gathered, local) + if not all(record.get("ok") is True for record in gathered): + reasons = {record.get("reason") for record in gathered} + reason = reasons.pop() if len(reasons) == 1 else "cross-rank-evidence-mismatch" + manifest = build_manifest( + target=target, topology=topology_record, disposition="unsupported", + 
def main() -> int:
    """Command-line entry point: perform exactly one probe action.

    Actions are checked in this order and the first match wins: list targets,
    emit the workflow plan, extract a control record, validate a control
    record, validate a bundle against manifests, validate manifests, and
    finally run one exact probe cell. Missing companion options are reported
    through ``parser.error``; a ``ProbeError`` from target selection or the
    run is reported the same way using its ``reason``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    option_table = (
        ("--list-targets", {"action": "store_true"}),
        ("--workflow-plan", {"action": "store_true"}),
        ("--extract-from", {"type": Path}),
        ("--probe-id", {}),
        ("--validate-control", {"type": Path}),
        ("--validate-manifest", {"type": Path, "nargs": "+"}),
        ("--validate-bundle", {"type": Path}),
        ("--backend", {"choices": sorted(BACKENDS | {"all"})}),
        ("--sku", {}),
        ("--only-sku", {"default": ""}),
        ("--expect-sku", {}),
        ("--expect-backend", {}),
        ("--expect-nodes", {"type": int}),
        ("--ep", {"type": int}),
        ("--mode", {"choices": ("normal", "low-latency")}),
        ("--precision-profile", {}),
        ("--out", {"type": Path}),
    )
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)
    args = parser.parse_args()

    def compact(document) -> str:
        # Canonical one-line JSON used for everything printed to stdout.
        return json.dumps(document, allow_nan=False, sort_keys=True, separators=(",", ":"))

    def load(path):
        return json.loads(path.read_text())

    expected_placement = (args.expect_sku, args.expect_backend, args.expect_nodes)

    if args.list_targets:
        print(compact(provisional_targets()))
        return 0

    if args.workflow_plan:
        plan = workflow_plan(backend=args.backend or "all", only_sku=args.only_sku)
        if args.out is not None:
            _write_atomic(args.out, plan)
        else:
            print(compact(plan))
        return 0

    if args.extract_from is not None:
        if None in (args.probe_id, *expected_placement, args.out):
            parser.error("probe extraction requires ID, expected placement, and --out")
        control = extract_control(
            load(args.extract_from),
            probe_id=args.probe_id,
            sku=args.expect_sku,
            backend=args.expect_backend,
            nodes=args.expect_nodes,
        )
        _write_atomic(args.out, control)
        return 0

    if args.validate_control is not None:
        if None in expected_placement:
            parser.error("control validation requires expected placement")
        validate_control(
            load(args.validate_control),
            sku=args.expect_sku,
            backend=args.expect_backend,
            nodes=args.expect_nodes,
        )
        return 0

    if args.validate_bundle is not None:
        if not args.validate_manifest:
            parser.error("bundle validation requires manifest paths")
        bundle = load(args.validate_bundle)
        manifests = [load(manifest_path) for manifest_path in args.validate_manifest]
        validate_bundle(bundle, manifests)
        return 0

    if args.validate_manifest is not None:
        for manifest_path in args.validate_manifest:
            validate_manifest(load(manifest_path))
        return 0

    cell = (args.backend, args.sku, args.ep, args.mode, args.precision_profile, args.out)
    if any(part is None for part in cell):
        parser.error("one exact --backend/--sku/--ep/--mode/--precision-profile/--out cell is required")
    try:
        chosen = select_target(
            backend=args.backend,
            sku=args.sku,
            ep=args.ep,
            mode=args.mode,
            precision_profile=args.precision_profile,
        )
        return run_target(chosen, args.out)
    except ProbeError as failure:
        parser.error(failure.reason)
+1,220 @@ +#!/usr/bin/env python3 +"""CollectiveX — deterministic, platform-independent MoE routing trace. + +Fair-comparison fix #1: routing (per-token expert IDs + gate weights) is generated +ONCE from a fixed seed over the *global* token batch, indexed by global token id, and +is identical on every SKU for the same (seed, routing, global_tokens, experts, top-k). +Each rank materializes its slice `[rank*T,(rank+1)*T)`. Activations +are per-rank (same rank ⇒ same x on any platform), so a given global token id has +identical activation everywhere without materializing a global activation tensor. + +The v1 suite keeps two routing distributions: + + * uniform — top-k distinct experts drawn uniformly per token. The DEFAULT. + Expected fan-out for top-k=8, 256 experts, EP8 (32 experts/rank) ≈ + 8·(1 − C(224,8)/C(256,8)) ≈ 5.3 ranks/token. Load ~ Poisson. + * zipf — expert popularity proportional to 1/rank, producing expert/rank load skew. + +Always publish the realized fan-out so the workload is never misread again +(`routing_stats`). 
def rank_slice(idx, weights, rank: int, tokens_per_rank: int):
    """Return one rank's window of the global routing tensors.

    Selects rows ``[rank * tokens_per_rank, (rank + 1) * tokens_per_rank)``
    from both ``idx`` and ``weights`` and returns each as a contiguous tensor.
    """
    start = rank * tokens_per_rank
    window = slice(start, start + tokens_per_rank)
    local_idx = idx[window].contiguous()
    local_weights = weights[window].contiguous()
    return local_idx, local_weights
def decode_source_ids(payload, seed: int):
    """Recover and validate the source-token IDs carried in a payload prefix.

    The first ``SOURCE_ID_COLUMNS`` columns are read as sign bits
    (non-negative means 1): ``SOURCE_ID_BITS`` bits of source ID, least
    significant first, followed by ``SOURCE_CHECKSUM_BITS`` bits of checksum.

    Raises:
        ValueError: if ``payload`` is not 2-D or is too narrow for the prefix,
            if any prefix value is non-finite or has magnitude below 0.25, or
            if the checksum recomputed from the decoded ID and ``seed``
            differs from the checksum carried in the payload.
    """
    wide_enough = payload.ndim == 2 and payload.shape[1] >= SOURCE_ID_COLUMNS
    if not wide_enough:
        raise ValueError("received payload cannot carry the source-token prefix")

    signs = payload[:, :SOURCE_ID_COLUMNS].float()
    all_finite = bool(torch.isfinite(signs).all().item())
    if not all_finite or bool((signs.abs() < 0.25).any().item()):
        raise ValueError("received source-token prefix is not quantization-stable")

    def pack(columns, width):
        # Weight column k by 2**k and sum each row into one integer.
        place_values = 1 << torch.arange(width, device=payload.device, dtype=torch.int64)
        return (columns.to(torch.int64) * place_values).sum(dim=1)

    is_set = signs >= 0
    source = pack(is_set[:, :SOURCE_ID_BITS], SOURCE_ID_BITS)
    carried = pack(is_set[:, SOURCE_ID_BITS:SOURCE_ID_COLUMNS], SOURCE_CHECKSUM_BITS)

    mask = (1 << SOURCE_CHECKSUM_BITS) - 1
    recomputed = (source * 0x9E37 + int(seed) * 0xA24B) & mask
    if not torch.equal(recomputed, carried):
        raise ValueError("received source-token checksum differs")
    return source
def routing_stats(idx, experts: int, experts_per_rank: int, weights=None) -> dict:
    """Summarize the realized routing of a global trace.

    ``idx`` is indexed as ``[token, k]`` with expert IDs; ``weights``, when
    given, is hashed alongside it. The result reports per-token fan-out over
    destination ranks, per-expert load, expert assignments per rank (compute
    load) separately from rank-deduplicated payload copies per rank (network
    load), coefficient-of-variation and hotspot summaries, empty expert/rank
    counts, and SHA-256 hashes of the int32 index bytes, the float32 weight
    bytes, and their concatenation. Without ``weights``, ``weights_hash`` is
    ``None`` and ``routing_hash`` equals ``idx_hash``.
    """
    rank_count = max(1, experts // max(1, experts_per_rank))
    # Destination rank per (token, k) assignment, clamped into the valid range.
    destination = (idx // experts_per_rank).clamp(max=rank_count - 1)

    reaches = torch.zeros(idx.shape[0], rank_count, dtype=torch.bool)
    reaches.scatter_(1, destination, True)
    fanout = reaches.sum(dim=1)
    # Entry k-1 counts tokens whose fan-out is k, for k in 1..rank_count.
    fanout_hist = torch.bincount(fanout, minlength=rank_count + 1)[1:rank_count + 1].tolist()

    expert_load = torch.bincount(idx.reshape(-1), minlength=experts).float()
    # Two experts on one rank are two assignments but only one payload copy.
    assignments_per_rank = torch.bincount(
        destination.reshape(-1), minlength=rank_count
    ).float()
    copies_per_rank = reaches.sum(dim=0).float()

    def coefficient_of_variation(values):
        mean = float(values.mean())
        if mean > 0:
            return float(values.std(unbiased=False) / mean)
        return 0.0

    if float(expert_load.mean()) > 0:
        hotspot = float(expert_load.max() / expert_load.mean())
    else:
        hotspot = 0.0

    raw_idx = idx.to(torch.int32).cpu().numpy().tobytes()
    idx_digest = hashlib.sha256(raw_idx).hexdigest()
    if weights is None:
        weights_digest = None
        combined_digest = idx_digest
    else:
        raw_weights = weights.to(torch.float32).cpu().numpy().tobytes()
        weights_digest = hashlib.sha256(raw_weights).hexdigest()
        combined_digest = hashlib.sha256(raw_idx + raw_weights).hexdigest()

    return {
        "fanout_mean": float(fanout.float().mean()),
        "fanout_min": int(fanout.min()),
        "fanout_max": int(fanout.max()),
        "fanout_hist": fanout_hist,
        "expert_assignments_per_rank": [int(count) for count in assignments_per_rank.tolist()],
        "payload_copies_per_rank": [int(count) for count in copies_per_rank.tolist()],
        "routed_copies": int(fanout.sum()),
        "expert_load_min": int(expert_load.min()),
        "expert_load_max": int(expert_load.max()),
        "expert_load_mean": float(expert_load.mean()),
        "expert_load_cv": coefficient_of_variation(expert_load),
        "expert_assignment_rank_cv": coefficient_of_variation(assignments_per_rank),
        "payload_rank_cv": coefficient_of_variation(copies_per_rank),
        "hotspot_ratio": hotspot,
        "empty_expert_count": int((expert_load == 0).sum()),
        "empty_rank_count": int((copies_per_rank == 0).sum()),
        "routing_hash": combined_digest,
        "idx_hash": idx_digest,
        "weights_hash": weights_digest,
    }
a/experimental/CollectiveX/tests/run_ep.py b/experimental/CollectiveX/tests/run_ep.py new file mode 100644 index 000000000..7f3ca79d0 --- /dev/null +++ b/experimental/CollectiveX/tests/run_ep.py @@ -0,0 +1,529 @@ +#!/usr/bin/env python3 +"""CollectiveX v1 EP benchmark entrypoint for torchrun or rank environments.""" + +from __future__ import annotations + +import argparse +import ctypes +import hashlib +import hmac +import json +import os +import platform +import re +import shlex +import socket +import subprocess +import sys + +# Make the sibling tests/ modules importable when run as `tests/run_ep.py` under +# torchrun (it executes the file as __main__, not as a package). +HERE = os.path.dirname(os.path.abspath(__file__)) +sys.path[:0] = [HERE, os.path.dirname(HERE)] + +import ep_harness # noqa: E402 (stdlib-only; safe before torch) +import identity # noqa: E402 + + +ALLOCATION_STRATUM_CONTRACT = "collectivex-allocation-stratum-v1" +PRIVATE_FABRIC_ENV = { + "ib_gid_index": "CX_IB_GID_INDEX", + "rdma_devices": "CX_RDMA_DEVICES", + "rdma_service_level": "CX_RDMA_SERVICE_LEVEL", + "socket_ifname": "CX_SOCKET_IFNAME", +} + + +def _numeric_version(command: list[str]) -> str | None: + try: + result = subprocess.run( + command, capture_output=True, check=False, text=True, timeout=10 + ) + except (OSError, subprocess.TimeoutExpired): + return None + if result.returncode != 0: + return None + match = re.search(r"\b[0-9]+(?:\.[0-9]+){1,3}\b", result.stdout) + return match.group(0) if match else None + + +def _loaded_collective_version() -> str | None: + try: + with open("/proc/self/maps", encoding="utf-8") as handle: + paths = { + os.path.realpath(line.rstrip().split()[-1]) + for line in handle + if any(name in line for name in ("libnccl.so", "librccl.so")) + and os.path.isfile(line.rstrip().split()[-1]) + } + if len(paths) != 1: + return None + version = ctypes.c_int() + library = ctypes.CDLL(paths.pop()) + if library.ncclGetVersion(ctypes.byref(version)) != 0: + return 
None + return ep_harness.format_collective_version(version.value) + except (AttributeError, OSError): + return None + + +def _runtime_fingerprint( + torch, device, *, machine: str, vendor: str, arch: str +) -> dict: + """Return strict runtime facts without hosts, addresses, UUIDs, or paths.""" + properties = torch.cuda.get_device_properties(device) + if vendor == "nvidia": + driver = _numeric_version( + ["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"] + ) + runtime_kind, runtime_version, collective_kind = ( + "cuda", + torch.version.cuda, + "nccl", + ) + else: + driver = _numeric_version(["rocm-smi", "--showdriverversion"]) + runtime_kind, runtime_version, collective_kind = ( + "hip", + torch.version.hip, + "rccl", + ) + return { + "accelerator_runtime": {"kind": runtime_kind, "version": runtime_version}, + "collective_library": { + "kind": collective_kind, + "version": _loaded_collective_version(), + }, + "device": { + "arch": arch, + "compute_units": int(properties.multi_processor_count), + "memory_bytes": int(properties.total_memory), + "product": torch.cuda.get_device_name(device), + "warp_size": int(properties.warp_size), + }, + "driver_version": driver, + "framework": {"kind": "torch", "version": str(torch.__version__)}, + "machine": machine, + "python_version": platform.python_version(), + "vendor": vendor, + } + + +def _summarize_realized_placement( + records: list[tuple[str, int]], + *, + expected_nodes: int, + expected_gpus_per_node: int, + expected_world_size: int, +) -> dict: + """Validate private host/rank records and return only publication-safe aggregates.""" + if expected_nodes < 1 or expected_gpus_per_node < 1: + raise ValueError("requested placement dimensions must be positive") + if expected_nodes * expected_gpus_per_node != expected_world_size: + raise ValueError("requested nodes x GPUs per node differs from world size") + if len(records) != expected_world_size: + raise ValueError("realized rank count differs from world 
size") + + by_host: dict[str, list[int]] = {} + for host, local_rank in records: + if not isinstance(host, str) or not host or type(local_rank) is not int: + raise ValueError("realized placement record has invalid types") + by_host.setdefault(host, []).append(local_rank) + + counts = sorted(len(local_ranks) for local_ranks in by_host.values()) + complete_local_ranks = all( + sorted(local_ranks) == list(range(expected_gpus_per_node)) + for local_ranks in by_host.values() + ) + unique_pairs = len(set(records)) == len(records) + if len(by_host) != expected_nodes: + raise ValueError( + f"realized node count {len(by_host)} differs from requested {expected_nodes}" + ) + if counts != [expected_gpus_per_node] * expected_nodes: + raise ValueError("realized ranks per node differ from requested GPUs per node") + if not complete_local_ranks or not unique_pairs: + raise ValueError("realized local ranks are incomplete or duplicated") + return { + "gpus_per_node": expected_gpus_per_node, + "nodes": expected_nodes, + "ranks_per_node": expected_gpus_per_node, + "unique_local_ranks": True, + "valid": True, + } + + +def _common_runtime_fingerprint(records: list[dict]) -> dict: + """Return the shared sanitized fingerprint, rejecting heterogeneous ranks.""" + if not records: + raise ValueError("runtime fingerprint evidence is empty") + canonical = { + json.dumps(record, allow_nan=False, sort_keys=True, separators=(",", ":")) + for record in records + } + if len(canonical) != 1: + raise ValueError("runtime fingerprint differs across distributed ranks") + return records[0] + + +def _allocation_stratum_sha256( + physical_hosts: list[str], + *, + audit_salt: str | None, + fabric_selectors: dict[str, str | None], + required: bool, +) -> str | None: + """Commit private allocation/fabric identity without exposing its inputs.""" + if audit_salt in (None, ""): + if required: + raise ValueError("canonical execution requires a private allocation audit salt") + return None + if not 
def main() -> int:
    """Validate one scheduled EP case against the live runtime and run the sweep.

    Flow: parse the CLI arguments, enforce the argument contract, import torch,
    check runner identity, topology, and backend capability through
    ``capability``, record provenance on ``args``, initialise
    ``torch.distributed``, gather per-rank placement, fingerprint, and
    allocation-stratum evidence, then construct the backend and call
    ``ep_harness.run_sweep``.

    Returns:
        2 when an argument-contract check fails, 3 when torch cannot be
        imported, 5 when the runner, topology, or backend capability does not
        match the runtime; otherwise the value of ``backend.finalize(rc)``.

    Raises:
        ValueError: if ``SLURM_NNODES`` is not an integer, or if the placement,
            fingerprint, or allocation-stratum helpers reject the gathered
            evidence.
        Exception: any failure from backend construction or ``run_sweep`` is
            re-raised after its traceback has been printed to stdout.
    """
    ap = argparse.ArgumentParser(description="CollectiveX EP dispatch/combine sweep")
    ap.add_argument(
        "--backend",
        required=True,
        choices=[
            "deepep",
            "deepep-v2",
            "deepep-hybrid",
            "mori",
            "uccl",
            "nccl-ep",
        ],
    )
    ep_harness.add_common_args(ap)
    args = ap.parse_args()

    # Argument-contract violations exit with status 2 before torch is imported.
    if args.mode == ep_harness.LOW_LATENCY_MODE:
        if args.backend not in {"deepep", "uccl"}:
            print(
                "ERROR: low-latency mode is supported only by deepep and uccl",
                file=sys.stderr,
            )
            return 2
        if args.phase != "decode":
            print("ERROR: low-latency mode requires --phase decode", file=sys.stderr)
            return 2
    if args.case_id and not identity.is_typed_id(args.case_id, "case"):
        print(f"ERROR: invalid native case ID {args.case_id!r}", file=sys.stderr)
        return 2
    if args.case_id and args.seed != ep_harness.ROUTING_SEED:
        print(
            f"ERROR: scheduled v1 cases require seed={ep_harness.ROUTING_SEED}; got {args.seed}",
            file=sys.stderr,
        )
        return 2
    if args.qualification_index not in range(1, ep_harness.QUALIFICATION_RUNS + 1):
        print(
            f"ERROR: qualification index must be in 1..{ep_harness.QUALIFICATION_RUNS}",
            file=sys.stderr,
        )
        return 2

    sampling_error = ep_harness.sampling_contract_error(
        args.iters, args.trials, args.warmup
    )
    if sampling_error:
        print(f"ERROR: {sampling_error}", file=sys.stderr)
        return 2

    try:
        import torch
        import torch.distributed as dist
    except Exception as exc:  # pragma: no cover
        print(f"ERROR: torch unavailable: {exc!r}", file=sys.stderr)
        return 3

    # Missing rank variables default to a single-process world (rank 0 of 1).
    rank = int(os.environ.get("RANK", "0"))
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    torch.cuda.set_device(local_rank)
    device = torch.device(f"cuda:{local_rank}")
    # Only fill rendezvous defaults when the launcher did not set them.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "12355")

    import capability

    sku = capability.PLATFORMS.get(args.runner)
    if sku is None:
        print(f"ERROR: unknown runner identity {args.runner!r}", file=sys.stderr)
        return 5
    # Map platform.machine() onto amd64/arm64; other values pass through unchanged.
    machine = {"x86_64": "amd64", "aarch64": "arm64"}.get(
        platform.machine(), platform.machine()
    )
    props = torch.cuda.get_device_properties(device)
    # HIP builds take gcnArchName (text before the first ':'); otherwise the
    # architecture label is sm<major><minor> from the device capability.
    if torch.version.hip:
        vendor = "amd"
        accelerator = str(getattr(props, "gcnArchName", "")).split(":", 1)[0]
    else:
        vendor = "nvidia"
        major, minor = torch.cuda.get_device_capability(device)
        accelerator = f"sm{major}{minor}"
    device_name = torch.cuda.get_device_name(device)
    device_count = torch.cuda.device_count()
    identity_issues = capability.runtime_identity_issues(
        args.runner,
        vendor=vendor,
        arch=accelerator,
        machine=machine,
        device_name=device_name,
        device_count=device_count,
        world_size=world_size,
    )
    if identity_issues:
        print(
            f"ERROR: runtime identity does not match {args.runner}: "
            + "; ".join(identity_issues),
            file=sys.stderr,
        )
        return 5
    # A truthy args.gpus_per_node wins; otherwise use the visible device count.
    observed_gpus_per_node = args.gpus_per_node or device_count
    if observed_gpus_per_node != sku["gpus_per_node"]:
        print(
            f"ERROR: {args.runner} requires {sku['gpus_per_node']} GPUs per node",
            file=sys.stderr,
        )
        return 5
    if world_size % observed_gpus_per_node:
        print("ERROR: distributed world is not divisible by GPUs per node", file=sys.stderr)
        return 5
    observed_nodes = world_size // observed_gpus_per_node
    topology = capability.topology_for(args.runner, world_size)
    observed_topology = {
        "nodes": observed_nodes,
        "gpus_per_node": observed_gpus_per_node,
        "scale_up_domain": args.scale_up_domain or observed_gpus_per_node,
        "scope": args.scope,
        "scale_up_transport": args.scale_up_transport,
        "scale_out_transport": args.scale_out_transport or None,
        "transport": args.transport,
        "topology_class": args.topology_class,
    }
    # Every observed field must equal the registered topology for this runner
    # and world size; a missing registration is also a mismatch.
    if topology is None or any(
        observed_topology[field] != topology[field] for field in observed_topology
    ):
        print(
            f"ERROR: runtime topology does not match {args.runner} EP{world_size}",
            file=sys.stderr,
        )
        return 5
    schedulable, reason = capability.resolve(
        args.runner,
        args.backend,
        ep=world_size,
        nodes=observed_nodes,
        routing=args.routing,
        eplb=args.eplb,
        mode=args.mode,
    )
    if not schedulable:
        print(f"ERROR: scheduled case is unsupported: {reason}", file=sys.stderr)
        return 5
    args.runtime_device_product = device_name
    args.runtime_device_count = device_count
    args.allocation_execution_id = os.environ.get("COLLECTIVEX_EXECUTION_ID")

    # EPLB bumps the expert count to PHYSICAL (logical + redundant) BEFORE backend construction
    # so the backend sizes its buffers for the replicated set; ep_harness builds the LOGICAL
    # routing trace and remaps it to the balanced physical placement (a pure routing transform,
    # tests/eplb.py — no adapter change). Deterministic, so every rank agrees on the count.
    if getattr(args, "eplb", False):
        import eplb

        args.num_logical_experts = args.experts
        args.experts = eplb.physical_count(
            args.experts, ep_harness.EPLB_REDUNDANT_EXPERTS, world_size
        )

    # Reproduction provenance (recorded in the artifact). Rack launchers provide ranks directly
    # through srun, while single-node launchers use torchrun; do not claim torchrun for both.
    if os.environ.get("TORCHELASTIC_RUN_ID"):
        args.distributed_launcher = "torchrun"
        prefix = f"torchrun --nproc_per_node={world_size}"
    else:
        args.distributed_launcher = "rank-environment"
        prefix = f"RANK={rank} WORLD_SIZE={world_size} LOCAL_RANK={local_rank} python3"
    args.reproduction_command = f"{prefix} tests/run_ep.py {shlex.join(sys.argv[1:])}"
    args.image = os.environ.get("COLLECTIVEX_IMAGE", "")
    args.image_digest = os.environ.get("COLLECTIVEX_IMAGE_DIGEST", "")
    args.image_digest_verified = (
        os.environ.get("COLLECTIVEX_IMAGE_DIGEST_VERIFIED") == "1"
    )
    # Container architecture and local squash hash for Enroot/Pyxis.
    args.image_arch = machine
    args.squash_sha256 = os.environ.get("COLLECTIVEX_SQUASH_SHA256")
    # GitHub provenance: repo, run ID, attempt, ref, source SHA, job,
    # artifact. A result is only publication-'official' when these are present (validity gate).
    _run = {
        "run_id": os.environ.get("GITHUB_RUN_ID"),
        "run_attempt": os.environ.get("GITHUB_RUN_ATTEMPT"),
        "ref": os.environ.get("GITHUB_REF_NAME") or os.environ.get("GITHUB_REF"),
        "source_sha": os.environ.get("COLLECTIVEX_SOURCE_SHA")
        or os.environ.get("GITHUB_SHA"),
        "repo": os.environ.get("GITHUB_REPOSITORY"),
        "job": os.environ.get("GITHUB_JOB"),
        "artifact": os.environ.get("COLLECTIVEX_ARTIFACT_NAME"),
    }
    # With no provenance value set at all, record None instead of a mapping
    # whose values are all empty.
    if any(_run.values()):
        _run["qualification_index"] = args.qualification_index
        args.git_run = _run
    else:
        args.git_run = None

    # Import the backend class only after torch initializes. The selected mode is an
    # explicit case dimension; adapters do not infer it from the token ladder.
    if args.backend == "mori":
        from ep_mori import MoRIBackend as Backend
    elif args.backend == "nccl-ep":
        from ep_nccl import NCCLBackend as Backend
    elif args.backend == "uccl":
        from ep_uccl import UCCLBackend as Backend
    elif args.backend == "deepep-hybrid":
        from ep_deepep_hybrid import DeepEPHybridBackend as Backend
    elif args.backend == "deepep-v2":
        from ep_deepep_v2 import DeepEPV2Backend as Backend
    else:
        from ep_deepep import DeepEPBackend as Backend

    # MoRI uses the gloo+NCCL group shape from its reference; other adapters use NCCL/RCCL.
    if not dist.is_initialized():
        if args.backend == "mori":
            dist.init_process_group(
                backend="cpu:gloo,cuda:nccl",
                rank=rank,
                world_size=world_size,
                device_id=device,
            )
        elif args.backend == "deepep-v2":
            # PR #605 reuses PyTorch's NCCL communicator through ``_comm_ptr``. Supplying
            # device_id eagerly forms it before ElasticBuffer construction.
            dist.init_process_group("nccl", device_id=device)
        else:
            dist.init_process_group("nccl")

    args.runtime_fingerprint = _runtime_fingerprint(
        torch, device, machine=machine, vendor=vendor, arch=accelerator
    )

    gpus_per_node = args.gpus_per_node or sku["gpus_per_node"]
    # SLURM_NNODES, when set, replaces the node count derived from world size.
    # Only integer parsing is checked here; _summarize_realized_placement
    # rejects values below 1.
    try:
        expected_nodes = int(
            os.environ.get("SLURM_NNODES", str(world_size // gpus_per_node))
        )
    except ValueError as exc:
        raise ValueError("SLURM_NNODES must be a positive integer") from exc
    # Every rank contributes (hostname, local rank, fingerprint).
    realized_records: list[tuple[str, int, dict] | None] = [None] * world_size
    dist.all_gather_object(
        realized_records,
        (socket.gethostname(), local_rank, args.runtime_fingerprint),
    )
    complete_records = [record for record in realized_records if record is not None]
    args.realized_placement = _summarize_realized_placement(
        [(record[0], record[1]) for record in complete_records],
        expected_nodes=expected_nodes,
        expected_gpus_per_node=gpus_per_node,
        expected_world_size=world_size,
    )
    args.runtime_fingerprint = _common_runtime_fingerprint(
        [record[2] for record in complete_records]
    )
    # A truthy args.workload_dir marks the run as canonical, which makes the
    # audit salt (and therefore the stratum digest) mandatory.
    canonical = bool(args.workload_dir)
    local_stratum = _allocation_stratum_sha256(
        [record[0] for record in complete_records],
        audit_salt=os.environ.get("CX_AUDIT_SALT"),
        fabric_selectors={
            field: os.environ.get(environment) or None
            for field, environment in PRIVATE_FABRIC_ENV.items()
        },
        required=canonical,
    )
    # Gather each rank's digest so disagreement between ranks is rejected.
    stratum_records: list[str | None] = [None] * world_size
    dist.all_gather_object(stratum_records, local_stratum)
    args.allocation_stratum_sha256 = _common_allocation_stratum(
        stratum_records, required=canonical
    )

    # Construct + run inside a try so a backend exception (esp. a new adapter on GPU) prints its
    # FULL traceback to STDOUT — torchrun captures per-rank stdout but only summarizes stderr, so an
    # uncaught exception is otherwise invisible in CI. Print on every rank (prefixed) then re-raise.
    try:
        backend = Backend(args, rank, world_size, local_rank, device)
        if rank == 0:
            print(
                f"[run_ep] backend={args.backend} phase={args.phase} mode={args.mode} "
                f"world={world_size} ep_size={world_size} hidden={args.hidden} "
                f"topk={args.topk} experts={args.experts} dtype=bf16 "
                f"routing={args.routing} seed={args.seed} "
                f"qualification_index={args.qualification_index}"
            )
        rc = ep_harness.run_sweep(args, backend, torch, dist, device, rank, world_size)
    except Exception:
        import traceback

        print(
            f"[run_ep][rank{rank}] backend={args.backend} FAILED:\n"
            + traceback.format_exc(),
            flush=True,
        )
        raise
    # finalize() handles backend-specific teardown: DeepEP returns rc cleanly;
    # MoRI hard-exits past its post-shmem_finalize teardown assertion.
    return backend.finalize(rc)
def hybrid_jit_provenance(ranks: int = 2) -> tuple[list[str], list[dict[str, object]]]:
    """Build synthetic hybrid JIT provenance for contract tests.

    Args:
        ranks: Number of per-rank artifact records generated for each kernel.

    Returns:
        ``(keys, artifacts)``: the three kernel keys, and one artifact mapping
        per key. Each mapping carries ``ranks`` records whose byte count and
        repeated-hex-digit digest are derived from the key's position.
    """
    keys = ["combine-key", "dispatch-key", "preprocess-key"]
    artifacts: list[dict[str, object]] = []
    for position, kernel_key in enumerate(keys, start=1):
        # position is 1-based: digest digit == position, bytes == 9 + position.
        digest = format(position, "x") * 64
        per_rank = []
        for rank in range(ranks):
            per_rank.append({"bytes": 9 + position, "rank": rank, "sha256": digest})
        artifacts.append({"kernel_key": kernel_key, "rank_artifacts": per_rank})
    return keys, artifacts
"gb300": { + "partition": "test", "account": "test", "squash_dir": path, + "stage_dir": path, "enroot_cache_path": path, + }, + "mi325x": { + "partition": "test", "squash_dir": path, "stage_dir": path, **network, + }, + "mi355x": { + "partition": "test", "squash_dir": path, "stage_dir": path, **network, + }, + } + return {"schema_version": 1, "audit_salt": "a" * 64, "runners": runners} + + +class DeepEPV2ContractTests(unittest.TestCase): + @classmethod + def setUpClass(cls) -> None: + cls.path = HERE / "ep_deepep_v2.py" + cls.tree = ast.parse(cls.path.read_text(), str(cls.path)) + + def test_capability_is_explicit_for_every_sku(self) -> None: + backend = capability.BACKENDS["deepep-v2"] + self.assertEqual( + (backend["implementation"], backend["commit"], backend["torch"], backend["nccl"]), + ("deep_ep.ElasticBuffer", COMMIT, "2.10.0+cu130", "2.30.4"), + ) + self.assertEqual(backend["source"], "deepseek-ai/DeepEP#605+#630") + self.assertEqual(backend["communication_backend"], "nccl-device-lsa") + self.assertEqual(set(backend["sku_capabilities"]), set(capability.PLATFORMS)) + for sku, platform in capability.PLATFORMS.items(): + ok, _ = capability.resolve(sku, "deepep-v2") + self.assertEqual(ok, platform["vendor"] == "nvidia" and sku != "h100-dgxc") + self.assertEqual( + set(backend["sku_capabilities"][sku]), {"basis", "schedulable"} + ) + self.assertEqual( + backend["sku_capabilities"]["h100-dgxc"], + { + "schedulable": False, + "basis": "current-runner-nccl-device-api-symmetric-memory-unavailable", + }, + ) + + def test_adapter_ast_pins_elastic_api_and_weight_semantics(self) -> None: + imports = { + alias.name + for node in ast.walk(self.tree) + if isinstance(node, ast.ImportFrom) and node.module == "deep_ep" + for alias in node.names + } + self.assertEqual(imports, {"ElasticBuffer"}) + constants = { + node.targets[0].id: ast.literal_eval(node.value) + for node in self.tree.body + if isinstance(node, ast.Assign) + and len(node.targets) == 1 + and 
isinstance(node.targets[0], ast.Name) + and isinstance(node.value, ast.Constant) + } + self.assertEqual(constants["DEEPEP_V2_COMMIT"], COMMIT) + self.assertEqual(constants["DEEPEP_V2_TREE"], TREE) + self.assertEqual(constants["DEEPEP_V2_FMT_COMMIT"], FMT_COMMIT) + self.assertEqual(constants["DEEPEP_V2_PR"], 605) + self.assertEqual(constants["DEEPEP_V2_FIX_PR"], 630) + self.assertEqual( + constants["DEEPEP_V2_JIT_RANDOM_SEED"], + "collectivex-deepep-v2-fa8a9b1", + ) + self.assertEqual(constants["NCCL_VERSION"], "2.30.4") + self.assertEqual(constants["NVSHMEM_VERSION"], "3.3.9") + backend = next( + node for node in self.tree.body + if isinstance(node, ast.ClassDef) and node.name == "DeepEPV2Backend" + ) + assignments = { + node.targets[0].id: ast.literal_eval(node.value) + for node in backend.body + if isinstance(node, ast.Assign) + and isinstance(node.targets[0], ast.Name) + and isinstance(node.value, ast.Constant) + } + self.assertEqual(assignments["combine_weight_semantics"], "unweighted-rank-sum") + methods = {node.name for node in backend.body if isinstance(node, ast.FunctionDef)} + self.assertTrue({ + "dispatch", "inspect_dispatch", "combine_transformed", "capture_deferred_provenance", + "finalize", + } <= methods) + self.assertNotIn("expected", methods) + constructor = next( + node for node in ast.walk(backend) + if isinstance(node, ast.Call) + and isinstance(node.func, ast.Name) + and node.func.id == "ElasticBuffer" + ) + deterministic = next( + keyword for keyword in constructor.keywords if keyword.arg == "deterministic" + ) + self.assertIs(ast.literal_eval(deterministic.value), False) + self.assertIn("deterministic", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"]) + self.assertIn("num_experts", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"]) + self.assertIn("tuning_num_experts", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"]) + self.assertIn("jit_random_seed", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"]) + self.assertIn("gin_enabled", 
contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"]) + self.assertIn("communication_backend", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"]) + self.assertIn("deepep_pr", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"]) + self.assertIn("deepep_fix_pr", contracts.REQUIRED_BACKEND_PROVENANCE["deepep-v2"]) + source = self.path.read_text() + self.assertIn('getattr(args, "num_logical_experts", args.experts)', source) + self.assertIn('"use_expanded_layout": False', source) + self.assertIn("allow_hybrid_mode = _configure_gin_mode(args, world_size)", source) + self.assertIn("get_theoretical_num_sms(tuning_num_experts, args.topk)", source) + + jit_function = next( + node for node in self.tree.body + if isinstance(node, ast.FunctionDef) and node.name == "_jit_cache_key" + ) + namespace = {"hashlib": __import__("hashlib"), "json": json} + exec(compile(ast.Module(body=[jit_function], type_ignores=[]), str(self.path), "exec"), namespace) + key = namespace["_jit_cache_key"] + baseline = types.SimpleNamespace( + runner="h100-dgxc", hidden=7168, topk=8, experts=256, + routing="uniform", eplb=False, case_id="uniform", + ) + zipf = types.SimpleNamespace(**{**vars(baseline), "routing": "zipf", "case_id": "zipf"}) + eplb = types.SimpleNamespace( + **{**vars(zipf), "experts": 288, "num_logical_experts": 256, "eplb": True} + ) + realized = { + "num_sms": 24, + "num_qps": 9, + "allocated_qps": 17, + "logical_scaleout_ranks": 1, + "logical_scaleup_ranks": 8, + "physical_rdma_ranks": 2, + "physical_nvlink_ranks": 4, + "is_scaleup_nvlink": False, + "device_arch_major": 9, + "device_arch_minor": 0, + "device_sms": 132, + "device_smem_bytes": 232448, + "gpu_timeout_cycles": 198000000000, + } + direct = key(baseline, 8, 128, False, realized) + self.assertTrue(direct.startswith("jitcfg-v3-")) + self.assertEqual(direct, key(zipf, 8, 128, False, realized)) + self.assertNotEqual(direct, key(zipf, 8, 128, True, realized)) + self.assertNotEqual(direct, key(eplb, 8, 128, False, realized)) + for 
field, value in realized.items(): + changed = not value if type(value) is bool else value + 1 + self.assertNotEqual( + direct, + key(baseline, 8, 128, False, {**realized, field: changed}), + field, + ) + init = next( + node for node in backend.body + if isinstance(node, ast.FunctionDef) and node.name == "__init__" + ) + buffer_call = next( + node for node in ast.walk(init) + if isinstance(node, ast.Call) + and isinstance(node.func, ast.Name) + and node.func.id == "ElasticBuffer" + ) + jit_config_check = next( + node for node in ast.walk(init) + if isinstance(node, ast.Call) + and isinstance(node.func, ast.Name) + and node.func.id == "_require_cross_rank_equal" + and ast.literal_eval(node.args[1]) == "JIT configuration" + ) + cache_assignment = next( + node for node in ast.walk(init) + if isinstance(node, ast.Assign) + and isinstance(node.targets[0], ast.Subscript) + and ast.unparse(node.targets[0].value) == "os.environ" + and ast.literal_eval(node.targets[0].slice) == "EP_JIT_CACHE_DIR" + ) + self.assertLess(buffer_call.lineno, jit_config_check.lineno) + self.assertLess(jit_config_check.lineno, cache_assignment.lineno) + capture = next( + node for node in backend.body + if isinstance(node, ast.FunctionDef) + and node.name == "capture_deferred_provenance" + ) + calls = [node for node in ast.walk(capture) if isinstance(node, ast.Call)] + barrier = next( + node for node in calls + if isinstance(node.func, ast.Attribute) and node.func.attr == "barrier" + ) + self.assertEqual( + {keyword.arg: ast.literal_eval(keyword.value) for keyword in barrier.keywords}, + {"use_comm_stream": True, "with_cpu_sync": True}, + ) + scan = next( + node for node in calls + if isinstance(node.func, ast.Name) and node.func.id == "_jit_artifact_evidence" + ) + self.assertLess(barrier.lineno, scan.lineno) + realized_check = next( + node for node in ast.walk(backend) + if isinstance(node, ast.Call) + and isinstance(node.func, ast.Name) + and node.func.id == "_require_cross_rank_equal" + and 
len(node.args) > 1 + and isinstance(node.args[1], ast.Constant) + and node.args[1].value == "realized tuning/topology" + ) + self.assertIsInstance(realized_check, ast.Call) + self.assertEqual( + (ROOT / "tests" / "ep_harness.py").read_text().count( + "capture_deferred_provenance()" + ), + 2, + ) + schema = json.loads((ROOT / "schemas" / "raw-case-v1.schema.json").read_text()) + provenance = schema["properties"]["implementation"]["properties"]["provenance"] + self.assertEqual(provenance["properties"]["deterministic"], {"type": "boolean"}) + self.assertEqual( + provenance["properties"]["num_experts"], + {"minimum": 1, "type": "integer"}, + ) + self.assertEqual( + provenance["properties"]["tuning_num_experts"], + {"minimum": 1, "type": "integer"}, + ) + self.assertEqual( + provenance["properties"]["jit_cubins"]["items"], + {"$ref": "#/$defs/deepep_v2_jit_cubin"}, + ) + self.assertEqual( + ( + provenance["properties"]["jit_cubins"]["minItems"], + provenance["properties"]["jit_cubins"]["maxItems"], + ), + (5, 5), + ) + self.assertEqual( + provenance["properties"]["jit_random_seed"], + {"const": "collectivex-deepep-v2-fa8a9b1"}, + ) + self.assertEqual(provenance["properties"]["allow_hybrid_mode"], {"type": "boolean"}) + self.assertEqual(provenance["properties"]["gin_enabled"], {"type": "boolean"}) + self.assertEqual(provenance["properties"]["deepep_pr"], {"const": 605}) + self.assertEqual(provenance["properties"]["deepep_fix_pr"], {"const": 630}) + self.assertEqual( + provenance["properties"]["communication_backend"], + {"enum": ["nccl-device-lsa", "nccl-gin"]}, + ) + self.assertEqual( + provenance["properties"]["num_rdma_bytes"], + {"minimum": 0, "type": "integer"}, + ) + self.assertEqual( + provenance["properties"]["num_qps_per_rank"], + {"minimum": 1, "type": "integer"}, + ) + for field, value in ( + ("num_experts", "288"), + ("tuning_num_experts", "not-an-integer"), + ("tuning_num_experts", 0), + ): + with self.subTest(provenance_field=field, value=value): + 
self.assertIn( + field, + contracts.backend_provenance_issues( + "deepep-v2", {field: value} + ), + ) + + def test_v2_gin_mode_uses_the_scale_up_domain_and_safe_fallbacks(self) -> None: + functions = { + node.name: node for node in self.tree.body if isinstance(node, ast.FunctionDef) + } + namespace = {"os": os} + exec( + compile( + ast.Module( + body=[ + functions["_configure_gin_mode"], + functions["_lsa_topology_is_valid"], + ], + type_ignores=[], + ), + str(self.path), + "exec", + ), + namespace, + ) + configure = namespace["_configure_gin_mode"] + topology_is_valid = namespace["_lsa_topology_is_valid"] + original = os.environ.get("EP_DISABLE_GIN") + try: + args = types.SimpleNamespace(scale_up_domain=72, gpus_per_node=4) + self.assertFalse(configure(args, 8)) + self.assertEqual(os.environ.get("EP_DISABLE_GIN"), "1") + + os.environ["EP_DISABLE_GIN"] = "stale" + args = types.SimpleNamespace(scale_up_domain=8, gpus_per_node=4) + self.assertTrue(configure(args, 16)) + self.assertNotIn("EP_DISABLE_GIN", os.environ) + + args = types.SimpleNamespace(gpus_per_node=4) + self.assertTrue(configure(args, 8)) + self.assertNotIn("EP_DISABLE_GIN", os.environ) + + self.assertFalse(configure(types.SimpleNamespace(), 8)) + self.assertEqual(os.environ.get("EP_DISABLE_GIN"), "1") + + topology = { + "physical_rdma_ranks": 1, + "physical_nvlink_ranks": 8, + "logical_scaleout_ranks": 1, + "logical_scaleup_ranks": 8, + "is_scaleup_nvlink": True, + } + self.assertTrue(topology_is_valid(False, 8, 8, topology)) + topology["physical_rdma_ranks"] = 2 + topology["logical_scaleout_ranks"] = 2 + self.assertTrue(topology_is_valid(True, 16, 8, topology)) + topology["physical_nvlink_ranks"] = 4 + self.assertFalse(topology_is_valid(False, 8, 8, topology)) + finally: + if original is None: + os.environ.pop("EP_DISABLE_GIN", None) + else: + os.environ["EP_DISABLE_GIN"] = original + + def test_ep_adapters_declare_unweighted_rank_sum(self) -> None: + adapters = { + "ep_deepep.py": "DeepEPBackend", + 
"ep_deepep_v2.py": "DeepEPV2Backend", + "ep_deepep_hybrid.py": "DeepEPHybridBackend", + "ep_mori.py": "MoRIBackend", + "ep_nccl.py": "NCCLBackend", + "ep_uccl.py": "UCCLBackend", + } + for filename, class_name in adapters.items(): + with self.subTest(adapter=filename): + tree = ast.parse((HERE / filename).read_text()) + backend = next( + node for node in tree.body + if isinstance(node, ast.ClassDef) and node.name == class_name + ) + assignment = next( + node for node in backend.body + if isinstance(node, ast.Assign) + and isinstance(node.targets[0], ast.Name) + and node.targets[0].id == "combine_weight_semantics" + ) + self.assertEqual(ast.literal_eval(assignment.value), "unweighted-rank-sum") + combine_methods = [ + item for item in backend.body + if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)) + and item.name in {"combine", "combine_transformed"} + ] + self.assertEqual(len(combine_methods), 2) + for method in combine_methods: + source = ast.unparse(method) + if filename in {"ep_deepep.py", "ep_uccl.py"}: + self.assertIn("self.mode == 'low-latency'", source) + else: + self.assertNotIn("topk_weights", source) + self.assertNotIn("combine_topk_weights", source) + + def test_low_latency_mode_parser_and_profile_are_explicit(self) -> None: + parser = argparse.ArgumentParser() + ep_harness.add_common_args(parser) + required = [ + "--runner", "test", "--topology-class", "test", + "--scope", "scale-up", "--scale-up-transport", "nvlink", + "--out", "test.json", + ] + self.assertEqual(parser.parse_args(required).mode, "normal") + self.assertEqual( + parser.parse_args([*required, "--mode", "low-latency"]).mode, + "low-latency", + ) + profile = identity.case_profile("low-latency") + self.assertEqual(profile["contract"], "expert-packed-weighted-combine-v1") + self.assertEqual( + profile["component_order_contract"], + "qualification-hash-rotated-components-v1", + ) + self.assertEqual( + profile["correctness_scope"], + "expert-assignment-and-weighted-combine", + ) + 
self.assertEqual(profile["payload_unit"], "token-expert") + + def test_expert_packed_slot_map_reconstructs_exact_sources(self) -> None: + pack = lambda begin, count: (begin << 32) | count + slots = ep_harness.expert_packed_slot_map( + [2, 1], + [[1, 0, 0, 0], [1, 0, 0, 0]], + [[pack(0, 1), pack(1, 1)], [pack(0, 0), pack(0, 1)]], + tokens_per_rank=2, + experts_per_rank=2, + world_size=2, + ) + self.assertEqual(slots, [(0, 0, 1), (0, 1, 2), (1, 0, 3)]) + + invalid = ( + ([1], [[0]], [[pack(1, 1), pack(0, 0)]]), + ([1], [[2]], [[pack(0, 1), pack(1, 0)]]), + ([2], [[1, 1]], [[pack(0, 2), pack(2, 0)]]), + ) + for counts, source, layout in invalid: + with self.subTest(counts=counts, source=source, layout=layout): + with self.assertRaises(ValueError): + ep_harness.expert_packed_slot_map( + counts, + source, + layout, + tokens_per_rank=2, + experts_per_rank=1, + world_size=2, + ) + + def test_deepep_and_uccl_expose_genuine_low_latency_calls(self) -> None: + required_fragments = ( + "Buffer.get_low_latency_rdma_size_hint(", + "low_latency_mode=True", + "num_qps_per_rank=num_qps_per_rank", + "self.buffer.clean_low_latency_buffer(", + "self.buffer.low_latency_dispatch(", + "use_fp8=False", + "self.buffer.low_latency_combine(", + "p.topk_weights", + 'self.combine_weight_semantics = "gate-weighted-sum"', + "self.combine_needs_redispatch = True", + "def inspect_expert_dispatch(", + ) + for filename in ("ep_deepep.py", "ep_uccl.py"): + source = (HERE / filename).read_text() + with self.subTest(adapter=filename): + for fragment in required_fragments: + self.assertIn(fragment, source) + self.assertIn("self.max_tokens_per_rank = 128", source) + self.assertIn("async_finish=False", source) + self.assertIn("return_recv_hook=False", source) + + run_ep_source = (HERE / "run_ep.py").read_text() + self.assertIn('args.backend not in {"deepep", "uccl"}', run_ep_source) + self.assertIn('args.phase != "decode"', run_ep_source) + + def test_deepep_v2_jit_evidence_is_strict_and_stable(self) -> 
None: + valid = deepep_v2_jit_provenance() + self.assertTrue(contracts._deepep_v2_jit_cubins_are_valid(valid)) + for invalid in ( + [], + [{**valid[0], "path": "/private/kernel.cubin"}], + [{**item, "cache_key": "dispatch"} for item in valid], + [{**item, "cubin_sha256": "invalid"} for item in valid], + valid[:-1], + [*valid, valid[0]], + [ + *valid, + { + **valid[0], + "cache_key": valid[0]["cache_key"][:-32] + "f" * 32, + }, + ], + ): + with self.subTest(invalid=invalid): + self.assertFalse(contracts._deepep_v2_jit_cubins_are_valid(invalid)) + + backend = next( + node for node in self.tree.body + if isinstance(node, ast.ClassDef) and node.name == "DeepEPV2Backend" + ) + capture = next( + node for node in backend.body + if isinstance(node, ast.FunctionDef) + and node.name == "capture_deferred_provenance" + ) + artifacts = copy.deepcopy(valid) + + class FakeBuffer: + @staticmethod + def barrier(*, use_comm_stream: bool, with_cpu_sync: bool) -> None: + self.assertTrue(use_comm_stream) + self.assertTrue(with_cpu_sync) + + namespace = { + "torch": types.SimpleNamespace( + cuda=types.SimpleNamespace(synchronize=lambda: None) + ), + "_jit_artifact_evidence": lambda: copy.deepcopy(artifacts), + "_require_cross_rank_equal": lambda _value, _label: None, + } + exec( + compile(ast.Module(body=[capture], type_ignores=[]), str(self.path), "exec"), + namespace, + ) + state = types.SimpleNamespace( + buffer=FakeBuffer(), + _deferred_jit_snapshot=None, + backend_provenance={"jit_cubins": []}, + ) + namespace["capture_deferred_provenance"](state) + namespace["capture_deferred_provenance"](state) + artifacts[0]["cubin_sha256"] = "f" * 64 + with self.assertRaisesRegex(RuntimeError, "changed after measurement"): + namespace["capture_deferred_provenance"](state) + + def test_deepep_v2_jit_files_are_complete_regular_and_content_bound(self) -> None: + functions = [ + node for node in self.tree.body + if isinstance(node, ast.FunctionDef) + and node.name in {"_sha256", 
"_jit_artifact_evidence"} + ] + namespace = { + "hashlib": hashlib, + "os": os, + "Path": Path, + "re": __import__("re"), + "DEEPEP_V2_JIT_KERNELS": contracts.DEEPEP_V2_JIT_KERNELS, + } + exec(compile(ast.Module(body=functions, type_ignores=[]), str(self.path), "exec"), namespace) + with tempfile.TemporaryDirectory() as temporary: + cache = Path(temporary) / "cache" + cache.mkdir() + for index, name in enumerate(sorted(contracts.DEEPEP_V2_JIT_KERNELS)): + kernel = cache / f"kernel.{name}.{index:032x}" + kernel.mkdir() + for suffix in ("cu", "cubin", "sass"): + (kernel / f"kernel.{suffix}").write_bytes(f"{name}-{suffix}".encode()) + old_cache = os.environ.get("EP_JIT_CACHE_DIR") + os.environ["EP_JIT_CACHE_DIR"] = temporary + try: + evidence = namespace["_jit_artifact_evidence"]() + self.assertEqual(len(evidence), len(contracts.DEEPEP_V2_JIT_KERNELS)) + self.assertEqual( + set(evidence[0]), + {"cache_key", "cubin_sha256", "sass_sha256", "source_sha256"}, + ) + first = cache / evidence[0]["cache_key"] + duplicate = cache / (evidence[0]["cache_key"][:-32] + "f" * 32) + duplicate.mkdir() + for suffix in ("cu", "cubin", "sass"): + (duplicate / f"kernel.{suffix}").write_bytes(b"duplicate") + with self.assertRaisesRegex(RuntimeError, "kernel set"): + namespace["_jit_artifact_evidence"]() + shutil.rmtree(duplicate) + (first / "kernel.sass").unlink() + with self.assertRaisesRegex(RuntimeError, "incomplete"): + namespace["_jit_artifact_evidence"]() + (first / "kernel.sass").symlink_to(first / "kernel.cubin") + with self.assertRaisesRegex(RuntimeError, "regular file"): + namespace["_jit_artifact_evidence"]() + finally: + if old_cache is None: + os.environ.pop("EP_JIT_CACHE_DIR", None) + else: + os.environ["EP_JIT_CACHE_DIR"] = old_cache + + def test_runtime_and_shared_version_formatter_are_valid(self) -> None: + subprocess.run( + ["bash", "-n", str(ROOT / "runtime" / "run_in_container.sh")], + check=True, + ) + self.assertEqual(ep_harness.format_collective_version(23004), 
"2.30.4") + self.assertEqual(ep_harness.format_collective_version((2, 30, 4)), "2.30.4") + source = self.path.read_text() + version_function = next( + node for node in self.tree.body + if isinstance(node, ast.FunctionDef) and node.name == "_loaded_nccl_version" + ) + + class FakeNccl: + @staticmethod + def ncclGetVersion(pointer) -> int: + pointer._obj.value = 23004 + return 0 + + namespace = { + "ctypes": types.SimpleNamespace( + CDLL=lambda _path: FakeNccl(), byref=ctypes.byref, c_int=ctypes.c_int, + ), + "ep_harness": ep_harness, + "os": os, + "_loaded_library_paths": lambda: {"/safe/libnccl.so.2"}, + } + exec( + compile(ast.Module(body=[version_function], type_ignores=[]), str(self.path), "exec"), + namespace, + ) + self.assertEqual(namespace["_loaded_nccl_version"](), "2.30.4") + for paths in (set(), {"/safe/libnccl.so.2", "/other/libnccl.so.2"}): + namespace["_loaded_library_paths"] = lambda paths=paths: paths + with self.assertRaisesRegex(RuntimeError, "exactly one"): + namespace["_loaded_nccl_version"]() + evidence_function = next( + node for node in self.tree.body + if isinstance(node, ast.FunctionDef) and node.name == "_loaded_library_evidence" + ) + paths = { + "/safe/_C.cpython-310-x86_64-linux-gnu.so", + "/safe/libnccl.so.2", + "/safe/libnvshmem_host.so.3", + } + namespace.update( + _loaded_library_paths=lambda: paths, + _sha256=lambda _path: "a" * 64, + ) + exec( + compile(ast.Module(body=[evidence_function], type_ignores=[]), str(self.path), "exec"), + namespace, + ) + evidence = namespace["_loaded_library_evidence"]() + self.assertIn( + {"name": "deep_ep._C", "role": "deepep-extension", "sha256": "a" * 64}, + evidence, + ) + self.assertTrue( + contracts._content_evidence_is_valid( + evidence, {"deepep-extension", "nccl", "nvshmem"} + ) + ) + self.assertNotIn("torch.cuda.nccl.version()", source) + fingerprint = {"runtime": "cuda", "version": "13.0"} + self.assertIs( + run_ep._common_runtime_fingerprint([fingerprint, dict(fingerprint)]), + 
fingerprint, + ) + with self.assertRaises(ValueError): + run_ep._common_runtime_fingerprint([fingerprint, {"runtime": "cuda", "version": "12.8"}]) + + def test_conditioning_contract_is_exact_for_each_phase(self) -> None: + expected = { + "decode": [1, 2, 4, 8, 16, 32, 64, 128], + "prefill": [1, 2, 4, 8, 16, 32, 64, 128, 256, 512], + } + for phase, ladder in expected.items(): + valid = { + "contract": "fixed-phase-ramp-8-roundtrips-v1", + "ladder": ladder, + "roundtrips_per_shape": 8, + } + self.assertIs(contracts.validate_conditioning_contract(valid, phase), valid) + for mutate in ( + lambda item: item["ladder"].reverse(), + lambda item: item["ladder"].pop(), + lambda item: item.update(ladder=[1.0, *item["ladder"][1:]]), + lambda item: item.update(roundtrips_per_shape=7), + lambda item: item.update(roundtrips_per_shape=8.0), + ): + changed = copy.deepcopy(valid) + mutate(changed) + with self.assertRaises(contracts.ContractError): + contracts.validate_conditioning_contract(changed, phase) + other = "prefill" if phase == "decode" else "decode" + with self.assertRaises(contracts.ContractError): + contracts.validate_conditioning_contract(valid, other) + + def test_content_manifest_evidence_is_stable_and_content_sensitive(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + first, second = root / "first", root / "second" + first.write_bytes(b"first") + second.write_bytes(b"second") + files = [("pkg/first", first), ("pkg/second", second)] + evidence = contracts.content_manifest_evidence( + role="test-content", name="test-build", files=files, + ) + self.assertNotIn(temporary, json.dumps(evidence)) + self.assertEqual( + evidence, + contracts.content_manifest_evidence( + role="test-content", name="test-build", files=reversed(files), + ), + ) + self.assertRegex(evidence["sha256"], r"^[0-9a-f]{64}$") + second.write_bytes(b"changed") + self.assertNotEqual( + evidence, + contracts.content_manifest_evidence( + role="test-content", 
name="test-build", files=files, + ), + ) + for invalid in ( + [("../first", first)], + [("same", first), ("same", second)], + [("missing", root / "missing")], + ): + with self.assertRaises(contracts.ContractError): + contracts.content_manifest_evidence( + role="test-content", name="test-build", files=invalid, + ) + + def test_hybrid_realized_config_and_jit_evidence_are_path_free(self) -> None: + path = HERE / "ep_deepep_hybrid.py" + tree = ast.parse(path.read_text(), str(path)) + selected = [ + node for node in tree.body + if ( + isinstance(node, ast.Assign) + and any( + isinstance(target, ast.Name) and target.id == "HYBRID_CONFIG_FIELDS" + for target in node.targets + ) + ) + or isinstance(node, ast.FunctionDef) + and node.name in { + "_hybrid_realized_config", "_sha256_with_size", "_hybrid_jit_evidence", + } + ] + namespace = {"Path": Path, "hashlib": hashlib, "re": __import__("re")} + exec(compile(ast.Module(body=selected, type_ignores=[]), str(path), "exec"), namespace) + fields = namespace["HYBRID_CONFIG_FIELDS"] + self.assertEqual(set(fields), contracts.HYBRID_REALIZED_CONFIG_FIELDS) + + class TokenType: + def __init__(self, label: str, name: str | None = None) -> None: + self.label = label + if name is not None: + self.name = name + + def __str__(self) -> str: + return self.label + + values = {field: 1 for field in fields} + values.update({field: True for field in contracts.HYBRID_REALIZED_BOOL_FIELDS}) + for raw, expected in (("uint16_t", "UINT16"), ("uint8_t", "UINT8")): + values["token_data_type"] = TokenType(raw) + config = types.SimpleNamespace(**values) + realized = namespace["_hybrid_realized_config"](config) + self.assertEqual(realized["token_data_type"], expected) + self.assertEqual(set(realized), contracts.HYBRID_REALIZED_CONFIG_FIELDS) + values["token_data_type"] = TokenType("opaque-enum", "UINT16") + self.assertEqual( + namespace["_hybrid_realized_config"](types.SimpleNamespace(**values))[ + "token_data_type" + ], + "UINT16", + ) + 
values["token_data_type"] = TokenType("UINT16") + with self.assertRaisesRegex(RuntimeError, "token_data_type is invalid"): + namespace["_hybrid_realized_config"](types.SimpleNamespace(**values)) + values["token_data_type"] = TokenType("uint16_t") + config = types.SimpleNamespace(**values) + delattr(config, "hidden_dim") + with self.assertRaisesRegex(RuntimeError, "omits hidden_dim"): + namespace["_hybrid_realized_config"](config) + + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + for key, payload in ( + ("preprocess-key", b"pre"), + ("combine-key", b"combine"), + ("dispatch-key", b"dispatch"), + ): + (root / f"{key}.so").write_bytes(payload) + evidence = namespace["_hybrid_jit_evidence"](root) + self.assertEqual( + [item["kernel_key"] for item in evidence], + ["combine-key", "dispatch-key", "preprocess-key"], + ) + self.assertNotIn(temporary, json.dumps(evidence)) + (root / "dispatch-key.so").write_bytes(b"changed") + self.assertNotEqual(evidence, namespace["_hybrid_jit_evidence"](root)) + (root / "extra-key.so").write_bytes(b"extra") + with self.assertRaisesRegex(RuntimeError, "expected 3"): + namespace["_hybrid_jit_evidence"](root) + (root / "extra-key.so").unlink() + (root / "bad key.so").write_bytes(b"bad") + with self.assertRaisesRegex(RuntimeError, "kernel key"): + namespace["_hybrid_jit_evidence"](root) + (root / "bad key.so").unlink() + (root / "combine-key.so").unlink() + (root / "combine-key.so").symlink_to(root / "dispatch-key.so") + with self.assertRaisesRegex(RuntimeError, "regular file"): + namespace["_hybrid_jit_evidence"](root) + empty = root / "empty" + empty.mkdir() + with self.assertRaisesRegex(RuntimeError, "expected 3"): + namespace["_hybrid_jit_evidence"](empty) + + def test_hybrid_uses_communication_domains_not_physical_hosts(self) -> None: + path = HERE / "ep_deepep_hybrid.py" + function = next( + node for node in ast.parse(path.read_text(), str(path)).body + if isinstance(node, ast.FunctionDef) and node.name == 
"_hybrid_topology" + ) + namespace: dict[str, object] = {} + exec(compile(ast.Module(body=[function], type_ignores=[]), str(path), "exec"), namespace) + resolve = namespace["_hybrid_topology"] + cases = ( + (8, 8, 8, "scale-up", "nvlink", "", 8, 1, 1), + (16, 8, 8, "scale-out", "nvlink", "rdma", 8, 2, 2), + (8, 4, 72, "scale-up", "mnnvl", "", 8, 1, 2), + (16, 4, 72, "scale-up", "mnnvl", "", 16, 1, 4), + ) + for world, gpn, domain, scope, up, out, ranks, domains, hosts in cases: + with self.subTest(world=world, gpus_per_node=gpn, transport=up): + topology = resolve(types.SimpleNamespace( + gpus_per_node=gpn, + scale_up_domain=domain, + scope=scope, + scale_up_transport=up, + scale_out_transport=out, + transport=up if not out else f"{up}-{out}", + ), world) + self.assertEqual( + (topology["domain_ranks"], topology["communication_domains"], + topology["physical_nodes"]), + (ranks, domains, hosts), + ) + with self.assertRaisesRegex(RuntimeError, "outside the fixed v1 matrix"): + resolve(types.SimpleNamespace( + gpus_per_node=8, scale_up_domain=8, scope="scale-up", + scale_up_transport="nvlink", scale_out_transport="", transport="nvlink", + ), 16) + + def test_mori_ep16_pins_upstream_internode_v1_resources(self) -> None: + source = (HERE / "ep_mori.py").read_text() + for fragment in ( + 'kernel_enum.InterNodeV1', + 'self.block_num = self._block_target = 96', + 'self.rdma_block_num = 64', + 'self.dispatch_warps = self.combine_warps = 8', + 'self.num_qps = 1', + '"gpu_per_node": gpus_per_node', + '"rdma_block_num": self.rdma_block_num', + '"num_qp_per_pe": self.num_qps', + '"use_external_inp_buf": self._external_input', + 'os.environ["MORI_EP_LAUNCH_CONFIG_MODE"] = "MANUAL"', + 'rdma_block_num=self.rdma_block_num', + ): + self.assertIn(fragment, source) + self.assertGreaterEqual(source.count("rdma_block_num=self.rdma_block_num"), 2) + + def test_hybrid_deferred_provenance_wraps_before_conditioning_and_recaptures(self) -> None: + path = HERE / "ep_deepep_hybrid.py" + 
source = path.read_text() + tree = ast.parse(source, str(path)) + backend = next( + node for node in tree.body + if isinstance(node, ast.ClassDef) and node.name == "DeepEPHybridBackend" + ) + methods = {node.name for node in backend.body if isinstance(node, ast.FunctionDef)} + self.assertIn("capture_deferred_provenance", methods) + constructor = next(node for node in backend.body if isinstance(node, ast.FunctionDef) and node.name == "__init__") + buffer_call = next( + node for node in ast.walk(constructor) + if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) + and node.func.id == "HybridEPBuffer" + ) + wrapper_install = next( + node for node in ast.walk(constructor) + if isinstance(node, ast.Assign) + and any( + isinstance(target, ast.Attribute) + and target.attr == "update_template_config" + for target in node.targets + ) + ) + cache_line = source[:source.index('os.environ["HYBRID_EP_CACHE_DIR"]')].count("\n") + 1 + self.assertLess(cache_line, buffer_call.lineno) + self.assertLess(buffer_call.lineno, wrapper_install.lineno) + + capture = next( + node for node in backend.body + if isinstance(node, ast.FunctionDef) and node.name == "capture_deferred_provenance" + ) + called = { + node.func.id if isinstance(node.func, ast.Name) else node.func.attr + for node in ast.walk(capture) if isinstance(node, ast.Call) + and isinstance(node.func, (ast.Name, ast.Attribute)) + } + self.assertTrue({"_hybrid_jit_evidence", "_require_cross_rank_equal", "all_gather_object"} <= called) + self.assertIn("changed after measurement", ast.get_source_segment(source, capture)) + + artifacts = [[ + {"bytes": 1, "kernel_key": key, "sha256": digit * 64} + for key, digit in (("a", "1"), ("b", "2"), ("c", "3")) + ]] + + class FakeCuda: + @staticmethod + def synchronize() -> None: + return None + + class FakeDist: + @staticmethod + def barrier() -> None: + return None + + @staticmethod + def get_world_size() -> int: + return 2 + + @staticmethod + def all_gather_object(output, value) 
-> None: + output[:] = [copy.deepcopy(value), copy.deepcopy(value)] + + namespace = { + "torch": types.SimpleNamespace(cuda=FakeCuda), + "dist": FakeDist, + "_hybrid_jit_evidence": lambda _root: copy.deepcopy(artifacts[0]), + "_require_cross_rank_equal": lambda _value, _label: None, + } + exec(compile(ast.Module(body=[capture], type_ignores=[]), str(path), "exec"), namespace) + state = types.SimpleNamespace( + _deferred_jit_diagnostics=None, + _deferred_semantic_snapshot=None, + _jit_root=Path("private-cache"), + _realized_config=hybrid_realized_config(), + backend_provenance={}, + ) + namespace["capture_deferred_provenance"](state) + artifacts[0][0]["kernel_key"] = "changed" + with self.assertRaisesRegex(RuntimeError, "kernel set changed"): + namespace["capture_deferred_provenance"](state) + artifacts[0][0]["kernel_key"] = "a" + artifacts[0][0]["sha256"] = "f" * 64 + with self.assertRaisesRegex(RuntimeError, "artifacts changed"): + namespace["capture_deferred_provenance"](state) + + harness = (HERE / "ep_harness.py").read_text() + captures = [ + index for index in range(len(harness)) + if harness.startswith("capture_deferred_provenance()", index) + ] + self.assertEqual(len(captures), 2) + self.assertLess(harness.index("for wt in conditioning_ladder:"), captures[0]) + self.assertLess(captures[0], harness.index("oracle = _run_expert_oracle(")) + self.assertLess(harness.index("trace_sig = hashlib.sha256"), captures[1]) + + def test_hybrid_diagnostic_hashes_do_not_split_series_identity(self) -> None: + keys, artifacts = hybrid_jit_provenance() + provenance = { + "deepep_tree": "b" * 40, + "jit_kernel_keys": keys, + "jit_shared_objects": artifacts, + "loaded_libraries": [{ + "name": "hybrid_ep_cpp", "role": "deepep-hybrid-extension", + "sha256": "a" * 64, + }], + "realized_config": hybrid_realized_config(), + } + baseline = ep_harness._series_provenance(provenance) + changed = copy.deepcopy(provenance) + changed["jit_shared_objects"][0]["rank_artifacts"][0]["sha256"] = 
"f" * 64 + self.assertEqual(ep_harness._series_provenance(changed), baseline) + changed = copy.deepcopy(provenance) + changed["loaded_libraries"][0]["sha256"] = "f" * 64 + self.assertEqual(ep_harness._series_provenance(changed), baseline) + changed = copy.deepcopy(provenance) + changed["jit_kernel_keys"][0] = "changed-key" + self.assertNotEqual(ep_harness._series_provenance(changed), baseline) + changed = copy.deepcopy(provenance) + changed["realized_config"]["num_of_blocks_dispatch_api"] += 1 + self.assertNotEqual(ep_harness._series_provenance(changed), baseline) + changed = copy.deepcopy(provenance) + changed["deepep_tree"] = "c" * 40 + self.assertNotEqual(ep_harness._series_provenance(changed), baseline) + + def test_v2_series_identity_uses_source_and_sass_not_container_metadata(self) -> None: + provenance = { + "deepep_tree": "a" * 40, + "loaded_libraries": [ + {"name": "deep_ep._C.so", "role": "deepep-extension", "sha256": "1" * 64}, + {"name": "libnccl.so.2", "role": "nccl", "sha256": "2" * 64}, + ], + "jit_cubins": deepep_v2_jit_provenance(), + "jit_random_seed": "collectivex-deepep-v2-fa8a9b1", + } + baseline = contracts.series_provenance(provenance) + changed = copy.deepcopy(provenance) + changed["loaded_libraries"][0]["sha256"] = "f" * 64 + changed["jit_cubins"][0]["cubin_sha256"] = "e" * 64 + self.assertEqual(contracts.series_provenance(changed), baseline) + for mutate in ( + lambda item: item["loaded_libraries"][1].update(sha256="f" * 64), + lambda item: item["jit_cubins"][0].update(source_sha256="f" * 64), + lambda item: item["jit_cubins"][0].update(sass_sha256="f" * 64), + lambda item: item.update(deepep_tree="f" * 40), + ): + changed = copy.deepcopy(provenance) + mutate(changed) + self.assertNotEqual(contracts.series_provenance(changed), baseline) + + def test_mnnvl_resolution_has_no_ambiguous_signature_fallback(self) -> None: + self.assertEqual( + contracts.resolve_deepep_mnnvl( + requested=False, signature_parameters=(), deepep_commit=None, + ), + 
({}, "not-requested"), + ) + self.assertEqual( + contracts.resolve_deepep_mnnvl( + requested=True, signature_parameters=("allow_mnnvl",), + deepep_commit="a" * 40, + ), + ({"allow_mnnvl": True}, "explicit-allow-mnnvl"), + ) + with self.assertRaises(contracts.ContractError): + contracts.resolve_deepep_mnnvl( + requested=True, signature_parameters=(), + deepep_commit="814e508537c6ffc775d59f6f1b9ba43f3a65968c", + ) + + def test_backend_provenance_requires_lineage_and_content_hashes(self) -> None: + def record(role: str, name: str, digit: str) -> dict[str, str]: + return {"role": role, "name": name, "sha256": digit * 64} + + hybrid_keys, hybrid_artifacts = hybrid_jit_provenance() + v2 = { + **contracts.DEEPEP_V2_V1_PROVENANCE, + "api_signature_sha256": "c" * 64, + "loaded_libraries": [ + record("deepep-extension", "deep_ep._C", "1"), + record("nccl", "libnccl.so.2", "2"), + record("nvshmem", "libnvshmem_host.so.3", "3"), + ], + "jit_cubins": deepep_v2_jit_provenance(), + "jit_random_seed": "collectivex-deepep-v2-fa8a9b1", + "deterministic": False, + "num_experts": 256, + "tuning_num_experts": 256, + "allow_hybrid_mode": False, + "gin_enabled": False, + "communication_backend": "nccl-device-lsa", + } + deepep = { + "deepep_version": "1.1.0", "deepep_commit": "a" * 40, + "backend_lineage": "deepep-v1", "allow_mnnvl": False, + "mnnvl_comm": "not-requested", "mode": "normal", + "num_nvl_bytes": 1024, "num_rdma_bytes": 0, + } + hybrid = { + "deepep_commit": "a" * 40, "deepep_tree": "b" * 40, + "branch": "hybrid-ep", "backend_lineage": "deepep-hybrid", + "loaded_libraries": [ + record("deepep-extension", "deep_ep_cpp", "1"), + record("deepep-hybrid-extension", "hybrid_ep_cpp", "2"), + ], + "jit_kernel_keys": hybrid_keys, + "jit_shared_objects": hybrid_artifacts, + "realized_config": hybrid_realized_config(), + } + uccl = { + "uccl_version": "0.1.1", "uccl_commit": "pkg-0.1.1", + "uccl_wrapper_commit": "c" * 40, "backend_lineage": "uccl", + "uccl_dependency_versions": 
dict(contracts.UCCL_DEPENDENCY_VERSIONS), + "loaded_libraries": [ + record("uccl-distribution", "uccl-0.1.1", "3"), + record("uccl-wrapper", "uccl-deepep-wrapper", "4"), + record("intervaltree-distribution", "intervaltree-3.1.0", "5"), + record("sortedcontainers-distribution", "sortedcontainers-2.4.0", "6"), + record("cuda-runtime", "nvidia-cuda-runtime-cu12-12.9.79", "7"), + ], + "mode": "normal", "num_nvl_bytes": 1024, "num_rdma_bytes": 0, + } + reference = { + "nccl_version": "2.30.4", "collective_library": "nccl", + "backend_lineage": "nccl", + } + for backend, provenance in ( + ("deepep", deepep), ("deepep-v2", v2), ("deepep-hybrid", hybrid), + ("uccl", uccl), ("nccl-ep", reference), + ): + self.assertEqual(contracts.backend_provenance_issues(backend, provenance), []) + changed = copy.deepcopy(provenance) + if "loaded_libraries" in changed: + changed["loaded_libraries"][0]["sha256"] = "invalid" + expected = "loaded_libraries" + else: + changed["backend_lineage"] = "wrong" + expected = "backend_lineage" + self.assertIn(expected, contracts.backend_provenance_issues(backend, changed)) + + changed = copy.deepcopy(uccl) + changed["uccl_dependency_versions"]["intervaltree"] = "3.2.0" + self.assertIn( + "uccl_dependency_versions", + contracts.backend_provenance_issues("uccl", changed), + ) + changed = copy.deepcopy(uccl) + changed["loaded_libraries"] = [ + item + for item in changed["loaded_libraries"] + if item["role"] != "sortedcontainers-distribution" + ] + self.assertIn( + "loaded_libraries", contracts.backend_provenance_issues("uccl", changed) + ) + + for field, mutate in ( + ("realized_config", lambda item: item["realized_config"].pop("hidden_dim")), + ("jit_kernel_keys", lambda item: item["jit_kernel_keys"].reverse()), + ( + "jit_shared_objects", + lambda item: item["jit_shared_objects"][0]["rank_artifacts"][0].update( + sha256="invalid" + ), + ), + ): + with self.subTest(hybrid_field=field): + changed = copy.deepcopy(hybrid) + mutate(changed) + self.assertIn( 
+ field, + contracts.backend_provenance_issues("deepep-hybrid", changed), + ) + + for field, value in ( + ("jit_cubins", [{"cache_key": "invalid", "cubin_sha256": "4" * 64}]), + ("jit_random_seed", "different-seed"), + ): + with self.subTest(v2_field=field): + changed = copy.deepcopy(v2) + changed[field] = value + self.assertIn( + field, + contracts.backend_provenance_issues("deepep-v2", changed), + ) + + changed = copy.deepcopy(v2) + changed["gin_enabled"] = True + self.assertIn("gin_enabled", contracts.backend_provenance_issues("deepep-v2", changed)) + changed = copy.deepcopy(v2) + changed["communication_backend"] = "nccl-gin" + self.assertIn( + "communication_backend", contracts.backend_provenance_issues("deepep-v2", changed) + ) + changed = copy.deepcopy(v2) + changed.update( + allow_hybrid_mode=True, + gin_enabled=True, + communication_backend="nccl-gin", + ) + self.assertEqual( + contracts.backend_provenance_issues("deepep-v2", changed), + [], + ) + changed["allow_hybrid_mode"] = False + self.assertEqual( + contracts.backend_provenance_issues("deepep-v2", changed), + ["allow_hybrid_mode", "communication_backend", "gin_enabled"], + ) + for field, expected in contracts.DEEPEP_V2_V1_PROVENANCE.items(): + with self.subTest(v2_pin_field=field): + changed = copy.deepcopy(v2) + changed[field] = not expected if type(expected) is bool else "wrong" + self.assertIn( + field, + contracts.backend_provenance_issues("deepep-v2", changed), + ) + + schema = json.loads((ROOT / "schemas" / "raw-case-v1.schema.json").read_text()) + provenance_schema = schema["properties"]["implementation"]["properties"]["provenance"] + self.assertEqual( + provenance_schema["properties"]["realized_config"], + {"$ref": "#/$defs/hybrid_realized_config"}, + ) + self.assertFalse(schema["$defs"]["hybrid_realized_config"]["additionalProperties"]) + self.assertEqual(provenance_schema["properties"]["jit_kernel_keys"]["minItems"], 3) + 
self.assertEqual(provenance_schema["properties"]["jit_shared_objects"]["minItems"], 3) + + self.assertEqual(contracts.collective_kernel_generation("nccl"), "nccl") + self.assertEqual(contracts.collective_kernel_generation("rccl"), "rccl") + with self.assertRaises(contracts.ContractError): + contracts.collective_kernel_generation("unknown") + + def test_transport_resource_provenance_is_exact(self) -> None: + self.assertEqual(contracts.hybrid_communication_domains(8, 8), (8, 1)) + self.assertEqual(contracts.hybrid_communication_domains(16, 8), (8, 2)) + self.assertEqual(contracts.hybrid_communication_domains(8, 72), (8, 1)) + self.assertEqual(contracts.hybrid_communication_domains(16, 72), (16, 1)) + + profile = contracts.project_resource_profile({ + "num_nvl_bytes": 1024, "num_rdma_bytes": 2048, + "num_qps_per_rank": 32, "heap_size": "6G", + }) + self.assertEqual(profile["persistent_bytes"], 3072) + self.assertEqual(profile["qps_per_rank"], 32) + self.assertEqual( + contracts.project_resource_profile({ + "num_nvl_bytes": 0, "num_rdma_bytes": 0, "heap_size": "6G", + })["persistent_bytes"], + 0, + ) + self.assertEqual( + contracts.project_resource_profile({"heap_size": "6G"})[ + "persistent_bytes" + ], + "6G", + ) + + mori = { + "mori_commit": "a" * 40, "kernel_type": "InterNodeV1", + "block_num": 96, "rdma_block_num": 64, + "dispatch_warps": 8, "combine_warps": 8, "num_qps": 1, + "use_external_inp_buf": True, "gpus_per_node": 8, + } + self.assertEqual(contracts.backend_provenance_issues("mori", mori), []) + for field in ( + "block_num", "rdma_block_num", "dispatch_warps", "combine_warps", + "num_qps", "use_external_inp_buf", "gpus_per_node", + ): + changed = copy.deepcopy(mori) + changed[field] = False if field == "use_external_inp_buf" else 0 + with self.subTest(mori_field=field): + self.assertIn( + field, contracts.backend_provenance_issues("mori", changed) + ) + + def test_routing_control_binds_binary_but_allows_treatment_configuration(self) -> None: + 
hybrid_keys, hybrid_artifacts = hybrid_jit_provenance() + implementation = { + "kernel_generation": "hybrid", + "name": "deepep-hybrid", + "provenance": { + "deepep_tree": "a" * 40, + "loaded_libraries": [{ + "role": "deepep-extension", "name": "deep_ep_cpp", "sha256": "1" * 64, + }], + "local_experts": 32, + "num_experts": 256, + "num_sms": 24, + "jit_cache_key": "case-one", + "jit_cubins": [{"cache_key": "one", "cubin_sha256": "2" * 64}], + "jit_kernel_keys": hybrid_keys, + "jit_shared_objects": hybrid_artifacts, + "realized_config": hybrid_realized_config(), + }, + "resource_profile": {"configured_units": 24}, + } + baseline = contracts.routing_implementation_control_sha256(implementation) + treatment = copy.deepcopy(implementation) + treatment["provenance"].update({ + "local_experts": 36, + "num_experts": 288, + "jit_cache_key": "case-two", + "jit_cubins": [{"cache_key": "two", "cubin_sha256": "3" * 64}], + "jit_kernel_keys": ["changed-a", "changed-b", "changed-c"], + "jit_shared_objects": hybrid_jit_provenance(3)[1], + "realized_config": { + **hybrid_realized_config(), + "num_of_experts_per_rank": 36, + }, + }) + self.assertEqual( + contracts.routing_implementation_control_sha256(treatment), baseline + ) + changed = copy.deepcopy(implementation) + changed["provenance"]["loaded_libraries"][0]["sha256"] = "4" * 64 + self.assertEqual( + contracts.routing_implementation_control_sha256(changed), baseline + ) + changed = copy.deepcopy(implementation) + changed["provenance"]["deepep_tree"] = "b" * 40 + self.assertNotEqual( + contracts.routing_implementation_control_sha256(changed), baseline + ) + changed = copy.deepcopy(implementation) + changed["provenance"]["num_sms"] = 20 + self.assertNotEqual( + contracts.routing_implementation_control_sha256(changed), baseline + ) + + def test_runtime_pins_uccl_wheel_and_hybrid_source_tree(self) -> None: + runtime = (ROOT / "runtime" / "run_in_container.sh").read_text() + common = (ROOT / "runtime" / "common.sh").read_text() + 
self.assertIn("cd /ix/experimental/CollectiveX", runtime) + for launcher_name in ("launch_single-slurm.sh", "launch_gb-nv.sh"): + launcher = (ROOT / "launchers" / launcher_name).read_text() + self.assertIn("$MOUNT_SRC:/ix", launcher) + self.assertIn("cx_prepare_backend_cache", launcher) + self.assertNotIn('$(cx_prepare_backend_cache', launcher) + self.assertIn("$CX_PREPARED_BACKEND_CACHE:/cx-cache", launcher) + self.assertIn("CX_BACKEND_CACHE_ROOT=/cx-cache", launcher) + self.assertIn("CX_BACKEND_SOURCE_ROOT=/ix/experimental/CollectiveX/.cx_sources", launcher) + self.assertIn('|| [ "$CX_BENCH" = deepep-hybrid ]', launcher) + self.assertIn("cx_prepare_backend_source", launcher) + cache_block = launcher[launcher.index('if [ "$CX_BENCH" = deepep-v2 ]'):] + self.assertLess( + cache_block.index("cx_set_failure_stage backend-setup"), + cache_block.index("cx_prepare_backend_cache"), + ) + self.assertLess( + cache_block.index("cx_prepare_backend_source"), + cache_block.index("cx_set_failure_stage scheduler-allocation"), + ) + self.assertIn("--frandom-seed=$seed", runtime) + self.assertIn("DEEPEP_V2_JIT_RANDOM_SEED", runtime) + persisted = runtime[runtime.index("cx_persist_backend_env()") :] + self.assertIn("CUDA_HOME CPATH NVCC_PREPEND_FLAGS", persisted) + self.assertIn( + "390c1320918972206546e44d79b132988f2818ec07e23afcd0595f7183916cec", + runtime, + ) + self.assertIn("--require-hashes", runtime) + self.assertIn("d77aeab7f1bb52b615666fe178d26ced41fae08e", common) + self.assertIn("HEAD^{tree}", runtime) + self.assertIn("$PWD/.cx_backend/deepep-hybrid-", runtime) + self.assertIn("cx_materialize_backend_source deepep-hybrid", runtime) + self.assertIn("cx_materialize_backend_source deepep-v2", runtime) + self.assertIn("cx_deepep_hybrid_marker_content_sha256", runtime) + self.assertIn("cx_deepep_hybrid_cache_is_valid", runtime) + self.assertIn("cx_extension_pair_sha256", runtime) + self.assertIn(".collectivex-complete.tmp.", runtime) + self.assertNotIn("cx_fetch_revision", 
runtime) + self.assertIn("cx_fetch_revision", common) + self.assertIn("third-party/fmt", common) + hybrid = runtime[ + runtime.index("cx_build_deepep_hybrid()"): + runtime.index("# UCCL EP") + ] + configure = runtime[ + runtime.index("cx_configure_deepep_hybrid_build()"): + runtime.index("cx_deepep_hybrid_marker_content_sha256()") + ] + self.assertIn("cx_prepare_cuda_cccl", hybrid) + self.assertIn("unset NVSHMEM_DIR", hybrid) + self.assertIn( + "unset HYBRID_EP_MULTINODE USE_NIXL RDMA_CORE_HOME", configure + ) + self.assertIn("cx_configure_deepep_hybrid_build || return 1", hybrid) + self.assertIn('[ "$(uname -m)" = x86_64 ]', configure) + self.assertIn('[ -n "${GLOO_SOCKET_IFNAME:-}" ]', configure) + self.assertIn('[ -d "/sys/class/infiniband/$rdma_name" ]', configure) + self.assertIn("command -v make", configure) + self.assertIn("/usr/include/infiniband/verbs.h", configure) + self.assertIn("export HYBRID_EP_MULTINODE=1 USE_NIXL=0", configure) + self.assertNotIn("cx_prepare_deepep_toolchain", hybrid) + toolchain = runtime[ + runtime.index("cx_prepare_deepep_toolchain()"): + runtime.index("cx_probe_deepep()") + ] + self.assertIn('overlay="$root/nvshmem-overlay"', toolchain) + self.assertIn("flock 8 || exit 1", toolchain) + self.assertIn('mv "$temporary" "$overlay" || exit 1', toolchain) + self.assertNotIn("/tmp/collectivex-nvshmem", toolchain) + jit = runtime[ + runtime.index("cx_enable_deepep_v2_jit_reproducibility()"): + runtime.index("cx_probe_deepep_v2()") + ] + self.assertIn('cccl="${CX_CUDA_CCCL:-}"', jit) + self.assertNotIn("/usr/local/cuda*", jit) + self.assertIn("deepep-v2-cache-v2|$cpu|sm${arch/./}", runtime) + self.assertNotIn("deepep-v2-cache-v1|", runtime) + self.assertIn('base="${CX_BACKEND_CACHE_ROOT:-}"', runtime) + self.assertNotIn("${CX_BACKEND_CACHE_ROOT:-$PWD/.cx_backend}", runtime) + self.assertIn( + "recipe=aot-persistent-nvshmem-active-cuda-maxjobs16-v2", runtime + ) + self.assertNotIn("recipe=aot-source-date-epoch-arch-maxjobs16-v1", runtime) 
+ self.assertNotIn("recipe=$source_sha", runtime) + self.assertIn("pip=26.1.2|setuptools=82.0.1|wheel=0.47.0|ninja=1.13.0", runtime) + self.assertIn("manual-unverified", runtime) + self.assertIn("cx_deepep_v2_content_sha256", runtime) + self.assertIn("DeepEP V2 cache validation failed", runtime) + probe = runtime[ + runtime.index("cx_probe_deepep_v2()"): + runtime.index("cx_deepep_v2_content_sha256()") + ] + self.assertNotIn("torch.cuda.nccl.version", probe) + self.assertIn("ncclGetVersion", probe) + self.assertIn("runtime_version.value == 23004", probe) + self.assertIn("cx_nvidia_package_root nvidia-nccl-cu13 nccl", runtime) + self.assertIn("cx_nvidia_package_root nvidia-nvshmem-cu12 nvshmem", runtime) + self.assertNotIn("import os,nvidia.nccl", runtime) + self.assertNotIn("import os,nvidia.nvshmem", runtime) + self.assertIn( + 'export EP_JIT_CACHE_DIR="$stage_root/.cx_backend/deepep-v2-jit"', runtime + ) + self.assertIn('stage_root="${CX_BACKEND_SOURCE_ROOT%/.cx_sources}"', runtime) + self.assertNotIn('export EP_JIT_CACHE_DIR="$root/jit"', runtime) + self.assertIn('EP_NVSHMEM_ROOT_DIR="$NVSHMEM_DIR"', runtime) + reference = (HERE / "ep_nccl.py").read_text() + self.assertIn("self.kernel_generation = contracts.collective_kernel_generation", reference) + + def test_deepep_v2_cache_recovers_from_an_unpublished_partial_build(self) -> None: + runtime = ROOT / "runtime" / "run_in_container.sh" + with tempfile.TemporaryDirectory() as temporary: + cache_key = "a" * 64 + content_hash = "b" * 64 + root = Path(temporary) / f"deepep-v2-{cache_key}" + root.mkdir(mode=0o700) + marker = root / ".collectivex-complete" + stale = root / "stale-partial-build" + stale.write_text("partial\n") + command = r''' + set -euo pipefail + eval "$(sed -n '/^cx_build_deepep_v2()/,/^}/p' "$1")" + cache_root="$2"; expected_revision="$3"; expected_tree="$4"; expected_fmt="$5" + expected_content="$6" + cx_log() { :; } + cx_verify_backend_cache_mount() { return 0; } + cx_cuda_arch() { printf '9.0'; 
} + cx_deepep_v2_root() { printf '%s' "$cache_root"; } + cx_activate_deepep_v2() { export DEEPEP_V2_COMMIT="$expected_revision"; } + cx_prepare_deepep_toolchain() { export NVSHMEM_DIR=/tmp/cx-test-nvshmem; } + cx_probe_deepep_v2() { return 0; } + cx_deepep_v2_content_sha256() { printf '%s' "$expected_content"; } + cx_deepep_v2_cache_is_valid() { + test -f "$2" && test "$(wc -l < "$2" | tr -d ' ')" = 5 + } + cx_enable_deepep_v2_jit_reproducibility() { return 0; } + cx_materialize_backend_source() { mkdir -p "$2/third-party/fmt"; } + flock() { return 0; } + python3() { + if [ "${1:-}" = -m ] && [ "${2:-}" = venv ]; then + mkdir -p "$3/bin" + printf '#!/bin/sh\nexit 0\n' > "$3/bin/python" + chmod 700 "$3/bin/python" + fi + return 0 + } + git() { + case " $* " in + *' third-party/fmt rev-parse HEAD '*) printf '%s\n' "$expected_fmt" ;; + *' rev-parse HEAD^{tree} '*) printf '%s\n' "$expected_tree" ;; + *' show -s --format=%ct HEAD '*) printf '1\n' ;; + *) return 0 ;; + esac + } + cx_git_in_tree() { shift; git "$@"; } + cx_build_deepep_v2 + ''' + subprocess.run( + [ + "bash", "-c", command, "_", str(runtime), str(root), + COMMIT, TREE, FMT_COMMIT, content_hash, + ], + check=True, + ) + self.assertFalse(stale.exists()) + self.assertEqual( + marker.read_text(), + f"{COMMIT}\n{TREE}\n{FMT_COMMIT}\n{cache_key}\n{content_hash}\n", + ) + self.assertEqual(list(root.glob(".collectivex-complete.tmp.*")), []) + + def test_deepep_v2_published_cache_is_never_deleted_after_probe_failure(self) -> None: + runtime = ROOT / "runtime" / "run_in_container.sh" + with tempfile.TemporaryDirectory() as temporary: + cache_key = "a" * 64 + root = Path(temporary) / f"deepep-v2-{cache_key}" + root.mkdir(mode=0o700) + marker = root / ".collectivex-complete" + marker.write_text("published\n") + sentinel = root / "active-reader" + sentinel.write_text("active\n") + command = r''' + set -euo pipefail + eval "$(sed -n '/^cx_build_deepep_v2()/,/^}/p' "$1")" + cache_root="$2" + cx_log() { :; } + 
cx_verify_backend_cache_mount() { return 0; } + cx_cuda_arch() { printf '9.0'; } + cx_deepep_v2_root() { printf '%s' "$cache_root"; } + cx_deepep_v2_cache_is_valid() { return 0; } + cx_activate_deepep_v2() { return 0; } + cx_prepare_deepep_toolchain() { return 0; } + cx_enable_deepep_v2_jit_reproducibility() { return 0; } + cx_probe_deepep_v2() { return 1; } + ! cx_build_deepep_v2 + ''' + subprocess.run( + ["bash", "-c", command, "_", str(runtime), str(root)], + check=True, + ) + self.assertEqual(sentinel.read_text(), "active\n") + self.assertEqual(marker.read_text(), "published\n") + + def test_deepep_v2_corrupt_published_cache_fails_without_reset(self) -> None: + runtime = ROOT / "runtime" / "run_in_container.sh" + with tempfile.TemporaryDirectory() as temporary: + cache_key = "a" * 64 + root = Path(temporary) / f"deepep-v2-{cache_key}" + root.mkdir(mode=0o700) + marker = root / ".collectivex-complete" + marker.write_text("corrupt\n") + sentinel = root / "active-reader" + sentinel.write_text("active\n") + command = r''' + set -euo pipefail + eval "$(sed -n '/^cx_build_deepep_v2()/,/^}/p' "$1")" + cache_root="$2" + cx_log() { :; } + cx_verify_backend_cache_mount() { return 0; } + cx_cuda_arch() { printf '9.0'; } + cx_deepep_v2_root() { printf '%s' "$cache_root"; } + cx_deepep_v2_cache_is_valid() { return 1; } + flock() { return 0; } + ! 
cx_build_deepep_v2 + ''' + subprocess.run( + ["bash", "-c", command, "_", str(runtime), str(root)], + check=True, + ) + self.assertEqual(sentinel.read_text(), "active\n") + self.assertEqual(marker.read_text(), "corrupt\n") + + def test_deepep_v2_marker_requires_private_owned_cache_objects(self) -> None: + runtime = ROOT / "runtime" / "run_in_container.sh" + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) / "cache" + root.mkdir(mode=0o700) + (root / "source").mkdir(mode=0o700) + (root / "venv").mkdir(mode=0o700) + marker = root / ".collectivex-complete" + cache_key = "a" * 64 + content_hash = "b" * 64 + marker.write_text( + f"{COMMIT}\n{TREE}\n{FMT_COMMIT}\n{cache_key}\n{content_hash}\n" + ) + root.chmod(0o2700) + marker.chmod(0o600) + command = r''' + set -euo pipefail + eval "$(sed -n '/^cx_deepep_v2_marker_content_sha256()/,/^}/p' "$1")" + cx_deepep_v2_marker_content_sha256 "$2" "$3" "$4" "$5" "$6" "$7" + ''' + args = [ + "bash", "-c", command, "_", str(runtime), str(root), str(marker), + COMMIT, TREE, FMT_COMMIT, cache_key, + ] + valid = subprocess.run(args, text=True, capture_output=True, check=True) + self.assertEqual(valid.stdout, content_hash) + marker.chmod(0o644) + self.assertNotEqual(subprocess.run(args).returncode, 0) + + def test_deepep_hybrid_marker_requires_a_private_regular_file(self) -> None: + runtime = ROOT / "runtime" / "run_in_container.sh" + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) / "cache" + root.mkdir(mode=0o700) + marker = root / ".collectivex-complete" + content_hash = "b" * 64 + marker.write_text(f"{COMMIT}\n{TREE}\n{content_hash}\n") + root.chmod(0o2700) + marker.chmod(0o600) + command = r''' + set -euo pipefail + eval "$(sed -n '/^cx_deepep_hybrid_marker_content_sha256()/,/^}/p' "$1")" + cx_deepep_hybrid_marker_content_sha256 "$2" "$3" "$4" "$5" + ''' + args = [ + "bash", "-c", command, "_", str(runtime), str(root), str(marker), + COMMIT, TREE, + ] + valid = subprocess.run(args, 
text=True, capture_output=True, check=True) + self.assertEqual(valid.stdout, content_hash) + marker_contract = runtime.read_text() + marker_contract = marker_contract[ + marker_contract.index("cx_deepep_hybrid_marker_content_sha256()"): + marker_contract.index("cx_deepep_hybrid_cache_is_valid()") + ] + self.assertIn("marker_item.st_uid != root_item.st_uid", marker_contract) + self.assertNotIn("st_uid != os.getuid()", marker_contract) + marker.chmod(0o644) + self.assertNotEqual(subprocess.run(args).returncode, 0) + + def test_deepep_v2_installed_content_digest_binds_every_distribution_file(self) -> None: + runtime = ROOT / "runtime" / "run_in_container.sh" + with tempfile.TemporaryDirectory() as temporary: + site = Path(temporary) / "venv" / "lib" / "python3.11" / "site-packages" + package = site / "deep_ep" + info = site / "deep_ep-2.0.0.dist-info" + package.mkdir(parents=True) + info.mkdir() + (package / "__init__.py").write_text("__version__ = '2.0.0'\n") + extension = package / "_C.so" + extension.write_bytes(b"extension-one") + (info / "METADATA").write_text( + "Metadata-Version: 2.1\nName: deep_ep\nVersion: 2.0.0\n" + ) + (info / "RECORD").write_text( + "deep_ep/__init__.py,,\n" + "deep_ep/_C.so,,\n" + "deep_ep-2.0.0.dist-info/METADATA,,\n" + "deep_ep-2.0.0.dist-info/RECORD,,\n" + ) + command = r''' + set -euo pipefail + eval "$(sed -n '/^cx_deepep_v2_content_sha256()/,/^}/p' "$1")" + cx_deepep_v2_content_sha256 + ''' + env = { + **os.environ, + "PYTHONPATH": str(site), + "VIRTUAL_ENV": str(Path(temporary) / "venv"), + } + first = subprocess.run( + ["bash", "-c", command, "_", str(runtime)], + text=True, capture_output=True, check=True, env=env, + ).stdout + extension.write_bytes(b"extension-two") + second = subprocess.run( + ["bash", "-c", command, "_", str(runtime)], + text=True, capture_output=True, check=True, env=env, + ).stdout + self.assertRegex(first, r"^[0-9a-f]{64}$") + self.assertRegex(second, r"^[0-9a-f]{64}$") + self.assertNotEqual(first, second) 
def test_uccl_content_identity_excludes_install_generated_files(self) -> None:
    """Check which installed paths ``_is_uccl_runtime_payload`` accepts.

    The predicate is obtained through ``load_uccl_function`` with only
    ``PurePosixPath`` injected. The extension module and the ``uccl.libs``
    shared object must be accepted; the bytecode cache entry and the
    dist-info ``RECORD`` must be rejected.
    """
    is_payload = load_uccl_function(
        "_is_uccl_runtime_payload", {"PurePosixPath": PurePosixPath}
    )
    for retained in ("uccl/ep.abi3.so", "uccl.libs/libnuma.so"):
        self.assertTrue(is_payload(retained))
    for dropped in (
        "uccl/__pycache__/collective.cpython-312.pyc",
        "uccl-0.1.1.dist-info/RECORD",
    ):
        self.assertFalse(is_payload(dropped))


def test_uccl_dependency_versions_are_exact(self) -> None:
    """Check that reported UCCL dependency versions must equal the pins.

    ``_uccl_dependency_versions`` is loaded with a stub ``metadata`` whose
    ``version`` reads a local, mutable table seeded from
    ``contracts.UCCL_DEPENDENCY_VERSIONS``. With the table untouched the
    function returns the contract mapping; after one version is changed it
    must raise ``RuntimeError``. The JSON schema is then checked to pin the
    same versions as ``const`` values and to forbid additional properties.
    """
    reported = dict(contracts.UCCL_DEPENDENCY_VERSIONS)

    def reported_version(package):
        # Stand-in for ``metadata.version``; looks the package up at call
        # time so the later mutation of ``reported`` is observed.
        return reported[package]

    resolve = load_uccl_function(
        "_uccl_dependency_versions",
        {
            "contracts": contracts,
            "metadata": types.SimpleNamespace(version=reported_version),
        },
    )
    self.assertEqual(resolve(), contracts.UCCL_DEPENDENCY_VERSIONS)

    # Move one package away from its pinned version; this must be refused.
    reported["intervaltree"] = "3.2.0"
    with self.assertRaisesRegex(RuntimeError, "differ from the v1 contract"):
        resolve()

    schema_path = ROOT / "schemas" / "raw-case-v1.schema.json"
    node = json.loads(schema_path.read_text())
    # Walk down to the ``uccl_dependency_versions`` sub-schema.
    for key in (
        "properties",
        "implementation",
        "properties",
        "provenance",
        "properties",
        "uccl_dependency_versions",
    ):
        node = node[key]
    self.assertFalse(node["additionalProperties"])

    pinned = {}
    for package, definition in node["properties"].items():
        pinned[package] = definition["const"]
    self.assertEqual(pinned, contracts.UCCL_DEPENDENCY_VERSIONS)
def test_uccl_hashes_the_mapped_pinned_libcudart_without_exposing_paths(
    self,
) -> None:
    """Check ``_loaded_libcudart_evidence`` against a fake maps file.

    A stub distribution lists a single ``libcudart.so.12`` file under a
    temporary root, and a maps file names that library. The evidence must
    equal ``contracts.content_manifest_evidence`` for that file and must not
    contain the temporary root once serialized. A byte-identical copy at a
    path the stub distribution does not list must raise ``RuntimeError``
    matching "not owned", and the error text must not contain the root.
    """
    with tempfile.TemporaryDirectory() as directory:
        root = Path(directory)
        entry = PurePosixPath("nvidia/cuda_runtime/lib/libcudart.so.12")
        library = root / entry
        library.parent.mkdir(parents=True)
        library.write_bytes(b"pinned CUDA 12 runtime")

        def locate_file(item):
            # Resolve distribution-relative entries under the temp root.
            return root / item

        distribution = types.SimpleNamespace(
            files=[entry], locate_file=locate_file
        )

        def stub_distribution(package):
            # Every package name resolves to the one fake distribution.
            return distribution

        injected = {
            "Path": Path,
            "PurePosixPath": PurePosixPath,
            "contracts": contracts,
            "metadata": types.SimpleNamespace(distribution=stub_distribution),
        }
        evidence_for = load_uccl_function("_loaded_libcudart_evidence", injected)

        maps = root / "maps"
        maps.write_text(f"7f00-7f10 r-xp 00000000 00:00 0 {library}\n")
        evidence = evidence_for("12.9.79", maps)
        expected = contracts.content_manifest_evidence(
            role="cuda-runtime",
            name="nvidia-cuda-runtime-cu12-12.9.79",
            files=[("libcudart.so", library)],
        )
        self.assertEqual(evidence, expected)
        self.assertNotIn(str(root), json.dumps(evidence))

        # Same bytes, but at a location the stub distribution does not list.
        unowned = root / "unowned" / library.name
        unowned.parent.mkdir()
        unowned.write_bytes(library.read_bytes())
        maps.write_text(f"7f00-7f10 r-xp 00000000 00:00 0 {unowned}\n")
        with self.assertRaisesRegex(RuntimeError, "not owned") as raised:
            evidence_for("12.9.79", maps)
        self.assertNotIn(str(root), str(raised.exception))


def test_private_runtime_logs_are_not_public_artifacts(self) -> None:
    """Check the path printed by ``cx_private_log_path`` is private.

    ``runtime/common.sh`` is sourced in bash with a fixed execution id and
    the printed path is inspected: the log must have mode 0o600, its parent
    directory mode 0o700, and it must not lie under ``ROOT``. The parent
    directory is removed afterwards whether or not the checks pass.
    """
    script = 'source "$1"; cx_private_log_path test'
    environment = dict(os.environ)
    environment["COLLECTIVEX_EXECUTION_ID"] = "contract-test"
    printed = subprocess.check_output(
        ["bash", "-c", script, "_", str(ROOT / "runtime" / "common.sh")],
        text=True,
        env=environment,
    )
    path = printed.strip()
    try:
        log = Path(path)
        log_mode = stat.S_IMODE(log.stat().st_mode)
        self.assertEqual(log_mode, 0o600)
        parent_mode = stat.S_IMODE(log.parent.stat().st_mode)
        self.assertEqual(parent_mode, 0o700)
        self.assertFalse(log.is_relative_to(ROOT))
    finally:
        shutil.rmtree(Path(path).parent, ignore_errors=True)
def test_operator_config_failure_is_value_free(self) -> None:
    """Check a failing operator config does not leak what it printed.

    The config file is a script that writes a token to stderr and then runs
    ``false``. ``cx_load_operator_config`` must exit non-zero and report
    "runner-local configuration failed" on stderr, and the token must not
    appear in the captured stderr.
    """
    with tempfile.TemporaryDirectory() as temporary:
        config = Path(temporary) / "operator.env"
        config.write_text("printf 'private-config-token\\n' >&2\nfalse\n")
        config.chmod(0o600)

        script = (
            'export COLLECTIVEX_EXECUTION_ID="operator-failure-$$"; '
            "trap 'cx_cleanup_private_logs 0' EXIT; source \"$1\"; "
            "cx_load_operator_config"
        )
        environment = dict(os.environ)
        environment["CX_RUNNER"] = "h100-dgxc"
        environment["COLLECTIVEX_OPERATOR_CONFIG"] = str(config)
        result = subprocess.run(
            ["bash", "-c", script, "_", str(ROOT / "runtime" / "common.sh")],
            text=True,
            capture_output=True,
            env=environment,
        )

        self.assertNotEqual(result.returncode, 0)
        self.assertIn("runner-local configuration failed", result.stderr)
        self.assertNotIn("private-config-token", result.stderr)


def test_ephemeral_operator_config_is_removed_after_source(self) -> None:
    """Check an ephemeral operator config file is deleted once loaded.

    With ``COLLECTIVEX_OPERATOR_CONFIG_EPHEMERAL=1`` the shell script itself
    verifies that the config path no longer exists, that ``CX_PARTITION`` is
    ``test`` and that ``COLLECTIVEX_OPERATOR_CONFIG`` is unset. The Python
    side then confirms the file is gone and a sibling file is untouched.
    """
    with tempfile.TemporaryDirectory() as temporary:
        workdir = Path(temporary)
        config = workdir / "operator.env"
        decoy = workdir / "decoy"
        decoy.write_text("keep")
        document = operator_config(workdir / "storage")
        config.write_text(json.dumps(document))
        config.chmod(0o600)

        script = (
            'export COLLECTIVEX_EXECUTION_ID="operator-ephemeral-$$"; '
            "trap 'cx_cleanup_private_logs 0' EXIT; "
            'config="$COLLECTIVEX_OPERATOR_CONFIG"; source "$1"; '
            'cx_load_operator_config; test ! -e "$config"; '
            'test "$CX_PARTITION" = test; '
            'test -z "${COLLECTIVEX_OPERATOR_CONFIG+x}"'
        )
        environment = dict(os.environ)
        environment["CX_RUNNER"] = "h100-dgxc"
        environment["COLLECTIVEX_OPERATOR_CONFIG"] = str(config)
        environment["COLLECTIVEX_OPERATOR_CONFIG_EPHEMERAL"] = "1"
        result = subprocess.run(
            ["bash", "-c", script, "_", str(ROOT / "runtime" / "common.sh")],
            text=True,
            capture_output=True,
            env=environment,
        )

        self.assertEqual(result.returncode, 0, result.stderr)
        self.assertFalse(config.exists())
        # Only the config file may disappear; its neighbour must survive.
        self.assertEqual(decoy.read_text(), "keep")
cx_load_operator_config; " + 'cx_lock_canonical_gha_env mi355x; test "$CX_LOCK_DIR" = "$2"', + "_", + str(ROOT / "runtime" / "common.sh"), + str(lock_dir), + ], + text=True, + capture_output=True, + env={ + **os.environ, + "CX_RUNNER": "mi355x", + "CX_SHARD_FILE": ".shards/test.json", + "CX_SHARD_SKU": "mi355x", + "CX_NODES": "1", + "CX_GPUS_PER_NODE": "8", + "COLLECTIVEX_CANONICAL_GHA": "1", + "COLLECTIVEX_OPERATOR_CONFIG": str(config), + "COLLECTIVEX_SOURCE_SHA": "a" * 40, + "GITHUB_ACTIONS": "true", + "GITHUB_RUN_ATTEMPT": "1", + "GITHUB_RUN_ID": "1", + "GITHUB_WORKSPACE": str(root.resolve()), + }, + ) + self.assertEqual(canonical.returncode, 0, canonical.stderr) + + selected_only = { + "schema_version": 1, + "runners": {"h100-dgxc": document["runners"]["h100-dgxc"]}, + } + result = subprocess.run( + [ + "bash", "-c", command + '; test "$CX_STAGE_DIR" = "$2"', "_", + str(ROOT / "runtime" / "common.sh"), str(root / "storage"), + ], + text=True, + capture_output=True, + env={ + **os.environ, + "CX_RUNNER": "h100-dgxc", + "CX_STAGE_DIR": "/private/stale-stage", + "ENROOT_CACHE_PATH": "/private/stale-enroot-cache", + "COLLECTIVEX_OPERATOR_CONFIG_LOADED": "1", + "COLLECTIVEX_OPERATOR_CONFIG_CONTENT": json.dumps(selected_only), + "COLLECTIVEX_OPERATOR_CONFIG_REQUIRED": "1", + }, + ) + self.assertEqual(result.returncode, 0, result.stderr) + + rejected = json.loads(json.dumps(document)) + rejected["runners"]["h100-dgxc"]["shell"] = "private-command" + boolean_version = {**document, "schema_version": True} + missing_socket = json.loads(json.dumps(document)) + del missing_socket["runners"]["h100-dgxc"]["socket_ifname"] + missing_rdma = json.loads(json.dumps(document)) + del missing_rdma["runners"]["mi355x"]["rdma_devices"] + missing_amd_stage = json.loads(json.dumps(document)) + del missing_amd_stage["runners"]["mi325x"]["stage_dir"] + missing_nvidia_stage = json.loads(json.dumps(document)) + del missing_nvidia_stage["runners"]["h100-dgxc"]["stage_dir"] + for invalid in 
(rejected, boolean_version, missing_nvidia_stage): + config.write_text(json.dumps(invalid)) + config.chmod(0o600) + result = subprocess.run( + ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh")], + text=True, + capture_output=True, + env={ + **os.environ, + "CX_RUNNER": "h100-dgxc", + "COLLECTIVEX_OPERATOR_CONFIG": str(config), + }, + ) + self.assertNotEqual(result.returncode, 0) + self.assertNotIn("private-command", result.stderr) + + for valid, runner in ( + (missing_socket, "h100-dgxc"), + (missing_rdma, "h100-dgxc"), + (missing_amd_stage, "h100-dgxc"), + ): + config.write_text(json.dumps(valid)) + config.chmod(0o600) + result = subprocess.run( + [ + "bash", "-c", command + "; cx_apply_network_profile 1 nvlink", + "_", str(ROOT / "runtime" / "common.sh"), + ], + text=True, + capture_output=True, + env={ + **os.environ, + "CX_RUNNER": runner, + "COLLECTIVEX_OPERATOR_CONFIG": str(config), + }, + ) + self.assertEqual(result.returncode, 0, result.stderr) + + config.write_text(json.dumps(missing_socket)) + config.chmod(0o600) + scaleout = subprocess.run( + [ + "bash", "-c", command + "; cx_apply_network_profile 2 nvlink-rdma", + "_", str(ROOT / "runtime" / "common.sh"), + ], + text=True, + capture_output=True, + env={ + **os.environ, + "CX_RUNNER": "h100-dgxc", + "COLLECTIVEX_OPERATOR_CONFIG": str(config), + }, + ) + self.assertNotEqual(scaleout.returncode, 0) + + config.write_text(json.dumps(missing_amd_stage)) + config.chmod(0o600) + selected_missing_stage = subprocess.run( + ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh")], + text=True, + capture_output=True, + env={ + **os.environ, + "CX_RUNNER": "mi325x", + "COLLECTIVEX_OPERATOR_CONFIG": str(config), + }, + ) + self.assertNotEqual(selected_missing_stage.returncode, 0) + + config.write_text(json.dumps(document)) + config.chmod(0o644) + result = subprocess.run( + ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh")], + text=True, + capture_output=True, + env={ + 
**os.environ, + "CX_RUNNER": "h100-dgxc", + "COLLECTIVEX_OPERATOR_CONFIG": str(config), + }, + ) + self.assertNotEqual(result.returncode, 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/experimental/CollectiveX/tests/test_ep_precision_adapters.py b/experimental/CollectiveX/tests/test_ep_precision_adapters.py new file mode 100644 index 000000000..a9655cdcd --- /dev/null +++ b/experimental/CollectiveX/tests/test_ep_precision_adapters.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 +"""CPU-only contract tests for native EP precision adapter wiring.""" +from __future__ import annotations + +import ast +import sys +import unittest +from pathlib import Path +from types import SimpleNamespace + + +ROOT = Path(__file__).resolve().parents[1] +TESTS = ROOT / "tests" +sys.path.insert(0, str(ROOT)) +sys.path.insert(0, str(TESTS)) + +import ep_precision # noqa: E402 + + +class PrecisionResolutionTests(unittest.TestCase): + def test_blank_profile_resolves_to_bf16_control(self): + profile_id, profile = ep_precision.resolve_precision( + SimpleNamespace(precision_profile=""), + backend="nccl-ep", + mode="normal", + supported_profiles={"d-bf16.c-bf16"}, + ) + self.assertEqual(profile_id, "d-bf16.c-bf16") + self.assertEqual(profile["dispatch"]["communication_format"], "bf16") + self.assertEqual(profile["combine"]["communication_format"], "bf16") + + def test_adapter_profile_rejection_is_fail_closed(self): + with self.assertRaisesRegex( + ep_precision.PrecisionError, + "nccl-ep does not realize precision profile", + ): + ep_precision.resolve_precision( + SimpleNamespace( + precision_profile=( + "d-fp8-e4m3fn-b128-f32-prequantized.c-bf16" + ) + ), + backend="nccl-ep", + mode="normal", + supported_profiles={"d-bf16.c-bf16"}, + ) + + def test_profile_mode_mismatch_is_rejected(self): + with self.assertRaisesRegex(ep_precision.PrecisionError, "not valid in mode"): + ep_precision.resolve_precision( + SimpleNamespace( + precision_profile="d-bf16.c-logfmt10-dynamic64" + ), + 
backend="deepep", + mode="normal", + supported_profiles={"d-bf16.c-logfmt10-dynamic64"}, + ) + + def test_required_native_keyword_is_checked(self): + def native_api(*, use_fp8=False): + return use_fp8 + + ep_precision.require_keyword(native_api, "use_fp8", api="native_api") + with self.assertRaisesRegex(ep_precision.PrecisionError, "omits 'use_logfmt'"): + ep_precision.require_keyword( + native_api, "use_logfmt", api="native_api" + ) + + def test_bf16_evidence_is_exact_and_has_no_scale_checks(self): + evidence = ep_precision.exact_axis_evidence() + self.assertTrue(evidence["passed"]) + self.assertEqual(evidence["max_abs_error"], 0.0) + self.assertEqual(evidence["max_rel_error"], 0.0) + self.assertEqual(evidence["saturation_count"], 0) + self.assertEqual(evidence["saturation_rate"], 0.0) + self.assertIsNone(evidence["scales_finite"]) + self.assertIsNone(evidence["scales_positive"]) + + +class NativeAdapterWiringTests(unittest.TestCase): + @staticmethod + def _tree(name: str) -> ast.Module: + return ast.parse((TESTS / name).read_text(encoding="utf-8")) + + @staticmethod + def _keywords(tree: ast.AST, attribute: str) -> list[set[str]]: + result = [] + for node in ast.walk(tree): + if not isinstance(node, ast.Call): + continue + function = node.func + if ( + isinstance(function, ast.Attribute) and function.attr == attribute + ) or ( + isinstance(function, ast.Name) and function.id == attribute + ): + result.append({keyword.arg for keyword in node.keywords}) + return result + + def test_deepep_and_uccl_wire_native_low_latency_controls(self): + for filename in ("ep_deepep.py", "ep_uccl.py"): + with self.subTest(filename=filename): + tree = self._tree(filename) + dispatch_calls = self._keywords(tree, "low_latency_dispatch") + combine_calls = self._keywords(tree, "low_latency_combine") + self.assertTrue(any("use_fp8" in call for call in dispatch_calls)) + self.assertTrue(any("use_logfmt" in call for call in combine_calls)) + + def 
test_elastic_and_hybrid_constructors_enable_native_fp8(self): + v2 = self._tree("ep_deepep_v2.py") + hybrid = self._tree("ep_deepep_hybrid.py") + self.assertTrue( + any("use_fp8_dispatch" in call for call in self._keywords(v2, "ElasticBuffer")) + ) + self.assertTrue( + any("use_fp8" in call for call in self._keywords(hybrid, "HybridEPBuffer")) + ) + self.assertTrue( + any("scaling_factor" in call for call in self._keywords(hybrid, "dispatch")) + ) + + def test_mori_wires_dispatch_scales_and_direct_cast_config(self): + source = (TESTS / "ep_mori.py").read_text(encoding="utf-8") + self.assertIn('"fp8_direct_cast" if self._direct_cast_combine', source) + self.assertIn("p.scales,", source) + self.assertIn("dispatch_scales=_scales", source) + + +if __name__ == "__main__": + unittest.main() diff --git a/experimental/CollectiveX/tests/test_precision_scheduling.py b/experimental/CollectiveX/tests/test_precision_scheduling.py new file mode 100644 index 000000000..f10385537 --- /dev/null +++ b/experimental/CollectiveX/tests/test_precision_scheduling.py @@ -0,0 +1,347 @@ +#!/usr/bin/env python3 +"""CPU-only tests for CollectiveX communication-precision scheduling.""" +from __future__ import annotations + +import copy +from pathlib import Path +import sys +import unittest +from unittest import mock + + +HERE = Path(__file__).resolve().parent +ROOT = HERE.parent +sys.path[:0] = [str(ROOT), str(HERE)] + +import capability # noqa: E402 +import identity # noqa: E402 +import probe_precision # noqa: E402 +import sweep_matrix # noqa: E402 + + +class PrecisionSchedulingTest(unittest.TestCase): + def test_precision_probe_inventory_is_exact_and_non_mutating(self) -> None: + before = copy.deepcopy(capability.PRECISION_CAPABILITIES) + targets = probe_precision.provisional_targets() + key = lambda item: ( + item["sku"], item["backend"], item["ep"], item["mode"], + item["precision_profile"], + ) + self.assertEqual(targets, sorted(capability.provisional_precision_targets(), key=key)) + 
def test_precision_probe_selects_only_one_exact_provisional_cell(self) -> None:
    """Check ``select_target`` round-trips a provisional target exactly.

    Selecting by the first provisional target's own fields must return that
    target; swapping only the precision profile for ``d-bf16.c-bf16`` must
    raise ``ProbeError`` matching "target-not-provisional".
    """
    target = probe_precision.provisional_targets()[0]
    cell = {field: target[field] for field in ("backend", "sku", "ep", "mode")}

    chosen = probe_precision.select_target(
        **cell, precision_profile=target["precision_profile"]
    )
    self.assertEqual(chosen, target)

    with self.assertRaisesRegex(probe_precision.ProbeError, "target-not-provisional"):
        probe_precision.select_target(**cell, precision_profile="d-bf16.c-bf16")


def test_precision_probe_workflow_plan_binds_exact_controls(self) -> None:
    """Check workflow-plan rows and their extracted controls agree.

    The plan for ``deepep`` on ``b200-dgxc`` must be non-empty and contain
    only that backend and SKU. The control extracted for the first row must
    pass ``validate_control`` unchanged. Asking for a different node count
    must raise ``ValueError`` matching "workflow matrix", and a plan for
    ``mori`` on the same SKU must raise ``ValueError`` matching
    "select no provisional".
    """
    plan = probe_precision.workflow_plan(backend="deepep", only_sku="b200-dgxc")
    rows = plan["include"]
    self.assertTrue(rows)
    only_requested = all(
        entry["backend"] == "deepep" and entry["sku"] == "b200-dgxc"
        for entry in rows
    )
    self.assertTrue(only_requested)

    first = rows[0]
    selector = {"sku": first["sku"], "backend": first["backend"]}
    control = probe_precision.extract_control(
        plan, probe_id=first["id"], nodes=first["nodes"], **selector
    )
    validated = probe_precision.validate_control(
        control, nodes=first["nodes"], **selector
    )
    self.assertEqual(validated, control)

    # A node count that differs from the row's own must be rejected.
    with self.assertRaisesRegex(ValueError, "workflow matrix"):
        probe_precision.extract_control(
            plan, probe_id=first["id"], nodes=first["nodes"] + 1, **selector
        )
    with self.assertRaisesRegex(ValueError, "select no provisional"):
        probe_precision.workflow_plan(backend="mori", only_sku="b200-dgxc")
def test_precision_profiles_bind_exact_formats_and_timing_boundaries(self) -> None:
    """Check the V1 precision profile table and a few exact field values.

    The profile table must be exactly the scheduled normal and low-latency
    ids plus the control profile, which itself is not scheduled, and no
    profile may mention "fp4". Every profile must echo its id, expose the
    same ten fields on both ``dispatch`` and ``combine``, and have an id
    matching the lowercase pattern. Selected profiles are then checked for
    their conversion boundary, scale group size and communication format,
    and a case with a precision profile must get a different profile and
    digest than the plain normal-mode case.
    """
    normal_ids = set(identity.V1_NORMAL_PRECISION_PROFILE_IDS)
    scheduled = normal_ids.union(identity.V1_LOW_LATENCY_PRECISION_PROFILE_IDS)
    control_id = identity.V1_CONTROL_PRECISION_PROFILE
    self.assertEqual(set(identity.V1_PRECISION_PROFILES), scheduled | {control_id})
    self.assertNotIn(identity.V1_CONTROL_PRECISION_PROFILE, scheduled)
    self.assertNotIn("fp4", repr(identity.V1_PRECISION_PROFILES).lower())

    required_axis_fields = set(
        (
            "api_input_dtype",
            "api_output_dtype",
            "communication_format",
            "scale_dtype",
            "scale_layout",
            "scale_group_size",
            "padding_contract",
            "alignment_contract",
            "quantization_origin",
            "conversion_boundary",
        )
    )
    for name in identity.V1_PRECISION_PROFILES:
        with self.subTest(profile=name):
            profile = identity.precision_profile(name)
            self.assertEqual(profile["profile_id"], name)
            for axis in ("dispatch", "combine"):
                self.assertEqual(set(profile[axis]), required_axis_fields)
            self.assertRegex(name, r"^[a-z0-9][a-z0-9.-]*$")

    prequantized_id = "d-fp8-e4m3fn-b128-f32-prequantized.c-bf16"
    prequantized = identity.precision_profile(prequantized_id)["dispatch"]
    fused = identity.precision_profile("d-fp8-e4m3fn-b128-f32-fused.c-bf16")[
        "dispatch"
    ]
    # Pre-quantized input is converted outside the timed region; fused
    # conversion happens inside it.
    self.assertEqual(prequantized["conversion_boundary"], "before-dispatch-timing")
    self.assertEqual(fused["conversion_boundary"], "inside-dispatch-timing")
    self.assertEqual(prequantized["scale_group_size"], 128)

    mi325 = identity.precision_profile(
        "d-fp8-e4m3fnuz-b128-f32-prequantized.c-bf16"
    )["dispatch"]
    self.assertEqual(mi325["communication_format"], "fp8-e4m3fnuz")
    logfmt = identity.precision_profile("d-bf16.c-logfmt10-dynamic64")["combine"]
    observed = (logfmt["communication_format"], logfmt["scale_group_size"])
    self.assertEqual(observed, ("logfmt10", 64))

    base = {"mode": "normal"}
    precision_case = dict(base)
    precision_case["precision_profile"] = "d-fp8-e4m3fn-b128-f32-prequantized.c-bf16"
    self.assertIs(identity.profile_for_case(base), identity.V1_NORMAL_CASE_PROFILE)
    self.assertIn("communication_precision", identity.profile_for_case(precision_case))
    base_digest = identity.digest("case", identity.profile_for_case(base))
    precision_digest = identity.digest(
        "case", identity.profile_for_case(precision_case)
    )
    self.assertNotEqual(base_digest, precision_digest)
"normal", normal), "not-applicable"), + (("mi355x", "mori", 8, "normal", direct), "provisional"), + (("mi355x", "mori", 16, "normal", direct), "not-applicable"), + (("mi325x", "mori", 8, "normal", fnuz_direct), "provisional"), + (("h200-dgxc", "deepep", 8, "low-latency", low_latency), "provisional"), + (("h200-dgxc", "deepep-hybrid", 8, "low-latency", low_latency), + "not-applicable"), + ) + for (sku, backend, ep, mode, profile), expected in cases: + with self.subTest(sku=sku, backend=backend, profile=profile): + topology = capability.topology_for(sku, ep) + self.assertIsNotNone(topology) + disposition, _ = capability.resolve_disposition( + sku, + backend, + ep=ep, + nodes=topology["nodes"], # type: ignore[index] + mode=mode, + precision_profile=profile, + ) + self.assertEqual(disposition, expected) + + control, _ = capability.resolve_disposition( + "h200-dgxc", + "deepep", + ep=8, + nodes=1, + precision_profile=identity.V1_CONTROL_PRECISION_PROFILE, + ) + self.assertEqual(control, "supported") + + def test_split_suites_are_provisional_and_do_not_duplicate_bf16(self) -> None: + suites = sweep_matrix._load("suites.yaml") + workloads = sweep_matrix._load("workloads.yaml") + sweep_matrix.validate_config_documents(suites, workloads) + normal = suites["suites"]["ep-precision-normal-v1"] + low_latency = suites["suites"]["ep-precision-low-latency-v1"] + self.assertEqual( + ( + normal["mode"], + normal["phases"], + normal["token_points_decode"], + normal["token_points_prefill"], + ), + ("normal", ["decode", "prefill"], [128], [512]), + ) + self.assertEqual( + ( + low_latency["mode"], + low_latency["phases"], + low_latency["token_points_decode"], + ), + ("low-latency", ["decode"], [128]), + ) + self.assertTrue(normal["provisional"]) + self.assertTrue(low_latency["provisional"]) + self.assertEqual( + normal["required_publication"], "comparable-experimental" + ) + self.assertEqual( + low_latency["required_publication"], "comparable-experimental" + ) + listed = 
    def test_resolved_profiles_schedule_without_cartesian_fill(self) -> None:
        # Checks the matrix produced once every precision rule has a resolved
        # disposition: the requested-case count must equal the per-target phase
        # count computed below, and unsupported targets surface as
        # "unsupported" wrappers.
        suites = sweep_matrix._load("suites.yaml")
        workloads = sweep_matrix._load("workloads.yaml")

        # Work on a deep copy of the capability table: mark every rule
        # "supported", then flip the first rule of the normal FP8 profile to
        # "unsupported".
        promoted = copy.deepcopy(capability.PRECISION_CAPABILITIES)
        for rules in promoted.values():
            for rule in rules:
                rule["disposition"] = "supported"
        normal_profile = "d-fp8-e4m3fn-b128-f32-prequantized.c-bf16"
        promoted[normal_profile][0]["disposition"] = "unsupported"

        # Copy of the suites document with the provisional flag cleared on both
        # precision suites.
        resolved_suites = copy.deepcopy(suites)
        for name in ("ep-precision-normal-v1", "ep-precision-low-latency-v1"):
            resolved_suites["suites"][name]["provisional"] = False

        def load_config(name: str) -> dict[str, object]:
            # Stand-in for sweep_matrix._load that serves the in-memory
            # documents; any other file name is treated as a test error.
            if name == "suites.yaml":
                return resolved_suites
            if name == "workloads.yaml":
                return workloads
            raise AssertionError(name)

        suite_names = "ep-precision-normal-v1,ep-precision-low-latency-v1"
        # Expected size: each target contributes the number of phases of the
        # suite matching its mode. Computed before the capability table is
        # patched.
        expected_cases = sum(
            len(resolved_suites["suites"][
                "ep-precision-normal-v1"
                if target["mode"] == "normal"
                else "ep-precision-low-latency-v1"
            ]["phases"])
            for target in capability.precision_targets()
        )
        # NOTE(review): presumably rule [0] of the normal profile is the deepep
        # rule, which is why only deepep targets are counted here — confirm
        # against capability.PRECISION_CAPABILITIES.
        unsupported_targets = [
            target for target in capability.precision_targets([normal_profile])
            if target["backend"] == "deepep"
        ]
        with mock.patch.object(capability, "PRECISION_CAPABILITIES", promoted):
            # The on-disk suites still carry the provisional flag while every
            # rule is resolved, so config validation must reject them.
            with self.assertRaisesRegex(SystemExit, "must track unresolved"):
                sweep_matrix.validate_config_documents(suites, workloads)
            # With the loader swapped for the resolved documents, the matrix
            # resolves and validates.
            with mock.patch.object(sweep_matrix, "_load", side_effect=load_config):
                matrix = sweep_matrix.validate_matrix_document(
                    sweep_matrix.resolve_matrix(suites=suite_names, backends="all")
                )

        # One unsupported wrapper per (unsupported target, normal-suite phase),
        # each carrying the precision-specific reason.
        unsupported = [
            item for item in matrix["requested_cases"]
            if item["disposition"] == "unsupported"
        ]
        self.assertEqual(
            len(unsupported),
            len(unsupported_targets)
            * len(resolved_suites["suites"]["ep-precision-normal-v1"]["phases"]),
        )
        self.assertTrue(all(
            item["reason"] == "precision-profile-unsupported" for item in unsupported
        ))
        self.assertTrue(any(
            item["disposition"] == "runnable" for item in matrix["requested_cases"]
        ))

        # The matrix holds exactly the expected number of cases, covers exactly
        # the scheduled profiles, and never schedules the control profile.
        self.assertEqual(len(matrix["requested_cases"]), expected_cases)
        self.assertEqual(
            {item["case"]["precision_profile"] for item in matrix["requested_cases"]},
            set(identity.V1_NORMAL_PRECISION_PROFILE_IDS)
            | set(identity.V1_LOW_LATENCY_PRECISION_PROFILE_IDS),
        )
        self.assertFalse(any(
            item["case"]["precision_profile"] == identity.V1_CONTROL_PRECISION_PROFILE
            for item in matrix["requested_cases"]
        ))
        # Direct-cast profiles must be present and scheduled at EP 8 only.
        direct_cases = [
            item for item in matrix["requested_cases"]
            if "direct-cast" in item["case"]["precision_profile"]
        ]
        self.assertTrue(direct_cases)
        self.assertEqual({item["case"]["ep"] for item in direct_cases}, {8})
def _unsupported_delivery(
    root: Path, ordinals: tuple[int, ...] = (1,), run: dict = RUN,
) -> tuple[Path, Path]:
    """Materialise a one-case matrix plus its "unsupported" terminal documents.

    The first requested case whose disposition is "unsupported" is taken from
    the all-backend matrix. A matrix document holding only that case is
    written to ``root / "matrix.json"``, and one terminal document per entry
    of ``ordinals`` is written into a new artifact directory under ``root``.

    Args:
        root: Existing directory that receives the matrix file and the
            artifact directory.
        ordinals: Attempt ordinals; one ``unsupported-<ordinal>.json`` is
            written for each.
        run: Run identity mapping; the keys read are ``repository``,
            ``run_id``, ``run_attempt``, ``qualification_index`` and
            ``source_sha``.

    Returns:
        ``(matrix_path, artifact_dir)``.

    Raises:
        StopIteration: if the resolved matrix has no unsupported case.
        FileExistsError: if the artifact directory already exists.
    """
    resolved = sweep_matrix.resolve_matrix(backends="all")
    wrapper = next(
        entry
        for entry in resolved["requested_cases"]
        if entry["disposition"] == "unsupported"
    )
    # Key order is kept as-is: this document is serialised and then hashed.
    matrix_document = {
        "format": "collectivex.matrix.v1",
        "schema_version": 1,
        "requested_cases": [wrapper],
        "include": [],
    }

    # The case factors are the wrapper's case without its derived identifier.
    case = dict(wrapper["case"])
    case.pop("case_id", None)

    run_id = run["run_id"]
    attempt_text = str(run["run_attempt"])
    artifact_name = f"cxunsupported-{run['run_id']}-{run['run_attempt']}"
    git_run = {
        "artifact": artifact_name,
        "job": "setup",
        "ref": "collectivex",
        "repo": run["repository"],
        "qualification_index": run["qualification_index"],
        "run_attempt": attempt_text,
        "run_id": run_id,
        "source_sha": run["source_sha"],
    }
    allocation = {
        "artifact": artifact_name,
        "execution_id": f"{run['run_id']}_{run['run_attempt']}_unsupported",
        "job": "setup",
        "qualification_index": run["qualification_index"],
        "repo": run["repository"],
        "run_attempt": attempt_text,
        "run_id": run_id,
        "runner": "capability-resolver",
        "source_sha": run["source_sha"],
    }

    matrix_path = root / "matrix.json"
    artifact_dir = root / artifact_name
    artifact_dir.mkdir()
    matrix_path.write_text(json.dumps(matrix_document))
    # Hash the bytes actually on disk so the digest matches the written file.
    control_sha256 = hashlib.sha256(matrix_path.read_bytes()).hexdigest()

    for ordinal in ordinals:
        terminal = contracts.make_terminal_document(
            allocation_factors=allocation,
            attempt_ordinal=ordinal,
            case=case,
            case_factors={
                "case": case,
                "profile": identity.V1_CASE_PROFILE,
                "sku": wrapper["sku"],
            },
            control_sha256=control_sha256,
            failure_mode="capability",
            generated_at="2026-07-04T00:00:00Z",
            git_run=git_run,
            reason=wrapper["reason"],
            return_code=5,
            source="matrix-capability-resolver",
            status="unsupported",
            expected_case_id=wrapper["case"]["case_id"],
        )
        target = artifact_dir / f"unsupported-{ordinal}.json"
        target.write_text(json.dumps(terminal))
    return matrix_path, artifact_dir
byte_provenance["activation_data_bytes"] / (value * 1000.0) + for name, value in latency.items() + }, + "total_logical_data_rate_gbps_at_latency_percentile": { + name: byte_provenance["total_logical_bytes"] / (value * 1000.0) + for name, value in latency.items() + }, + "sample_count": 512, + } + + +def _precision_axis_evidence() -> dict: + return { + "encoded_payload_valid": True, + "scales_finite": None, + "scales_positive": None, + "dequantized_semantics": True, + "saturation_count": 0, + "saturation_rate": 0.0, + "max_abs_error": 0.0, + "max_rel_error": 0.0, + "passed": True, + } + + +def _precision_evidence( + profile_id: str = identity.V1_CONTROL_PRECISION_PROFILE, +) -> dict: + axis = _precision_axis_evidence() + return { + "profile_id": profile_id, + "dispatch": axis, + "combine": copy.deepcopy(axis), + "passed": True, + } + + +def _hybrid_provenance(ep_size: int = 1) -> dict: + realized = {field: 1 for field in contracts.HYBRID_REALIZED_CONFIG_FIELDS} + for field in contracts.HYBRID_REALIZED_BOOL_FIELDS: + realized[field] = True + realized.update({ + "num_of_experts_per_rank": 1, + "num_of_nodes": 1, + "num_of_ranks_per_node": ep_size, + "token_data_type": "UINT16", + }) + kernel_keys = ["combine-key", "dispatch-key", "preprocess-key"] + return { + "backend_lineage": "deepep-hybrid", "branch": "hybrid-ep", + "deepep_commit": "a" * 40, "deepep_tree": "b" * 40, + "device_sms": 1, + "jit_kernel_keys": kernel_keys, + "jit_shared_objects": [ + { + "kernel_key": key, + "rank_artifacts": [ + {"bytes": 1, "rank": rank, "sha256": f"{index + 1:x}" * 64} + for rank in range(ep_size) + ], + } + for index, key in enumerate(kernel_keys) + ], + "loaded_libraries": [ + {"name": "deep_ep_cpp", "role": "deepep-extension", "sha256": "4" * 64}, + {"name": "hybrid_ep_cpp", "role": "deepep-hybrid-extension", "sha256": "5" * 64}, + ], + "realized_config": realized, + "resource_mode": "fixed-profile", + "tuned_source": "deepep-hybrid-configurer-autotune-v1", + } + + +def 
_native_fixture(backend: str = "nccl-ep") -> tuple[dict, dict]: + def digest(value: object) -> str: + return hashlib.sha256(contracts.canonical_json_bytes(value)).hexdigest() + + scheduled = { + "backend": backend, "canonical": True, "eplb": False, "ep": 1, + "experts": 1, "gpus_per_node": 1, "hidden": 1, "ladder": "1", "nodes": 1, + "mode": "normal", "phase": "decode", "required_publication": "official", + "routing": "uniform", "samples_per_point": 512, + "scale_out_transport": None, "scale_up_domain": 1, + "scale_up_transport": "nvlink", "scope": "scale-up", "suite": "ep-core-v1", + "timing": "8:64:32", "topk": 1, + "topology_class": "fixture", "transport": "nvlink", + "warmup_semantics": "full-roundtrip-before-each-component-trial-point-v1", + "workload": "deepseek-v3-v1", + } + communication_precision = identity.precision_profile( + identity.V1_CONTROL_PRECISION_PROFILE + ) + case_factors = { + "case": scheduled, "profile": identity.V1_NORMAL_CASE_PROFILE, "sku": "fixture" + } + case_id = identity.digest("case", case_factors) + git_run = { + "artifact": "cxshard-fixture-999-1", "job": "sweep", "ref": "collectivex", + "repo": RUN["repository"], "qualification_index": 1, + "run_attempt": "1", "run_id": "999", + "source_sha": RUN["source_sha"], + } + allocation_factors = { + "artifact": git_run["artifact"], "execution_id": "999_1_fixture", + "job": git_run["job"], "qualification_index": 1, + "repo": git_run["repo"], "run_attempt": "1", + "run_id": "999", "runner": "fixture", "source_sha": git_run["source_sha"], + } + allocation_id = identity.allocation_id(allocation_factors) + attempt_id = identity.attempt_id(allocation=allocation_id, case=case_id, ordinal=1) + member_id, member_checksums, routing_hash, routing_rows, routing_weights = ( + contracts._expected_canonical_trace( + "uniform", hidden=1, topk=1, logical_experts=1, physical_experts=1, + ep_size=1, tokens_per_rank=1, seed=67, eplb_enabled=False, + reference_tokens_per_rank=2048, + ) + ) + workload_id = 
identity.workload_id({ + "members": [{"checksums": member_checksums, "workload_id": member_id}] + }) + runtime = { + "accelerator_runtime": {"kind": "cuda", "version": "13.0"}, + "collective_library": {"kind": "nccl", "version": "2.30.4"}, + "device": { + "arch": "sm100", "compute_units": 1, "memory_bytes": 1, + "product": "Fixture GPU", "warp_size": 32, + }, + "driver_version": "1", "framework": {"kind": "torch", "version": "2.10.0"}, + "machine": "fixture", "python_version": "3.12", "vendor": "nvidia", + } + implementation_provenance = ( + { + "backend": "nccl-ep", "backend_lineage": "nccl", + "collective_library": "nccl", "nccl_version": "2.30.4", + "reference_semantics": "fixture-v1", + } + if backend == "nccl-ep" + else _hybrid_provenance() + ) + kernel_generation = "nccl" if backend == "nccl-ep" else "hybrid" + implementation = { + "kernel_generation": kernel_generation, + "name": backend, + "provenance": implementation_provenance, + "resource_profile": contracts.project_resource_profile(implementation_provenance), + } + public_config = contracts.public_series_config( + kernel_generation=implementation["kernel_generation"], + provenance=implementation_provenance, + resource_profile=implementation["resource_profile"], + resource_mode="fixed-profile", + device_product=runtime["device"]["product"], + ) + series_factors = { + "backend": backend, "case_id": case_id, + "image_digest": "sha256:" + "d" * 64, + "implementation_contract_sha256": digest({ + **implementation, + "provenance": contracts.series_provenance(implementation_provenance), + }), + "public_config_sha256": contracts.public_series_config_sha256(public_config), + "routing_control_sha256": contracts.routing_implementation_control_sha256( + implementation + ), + "runtime_fingerprint_sha256": digest(runtime), + "source_sha": RUN["source_sha"], "squash_sha256": "e" * 64, + "workload_id": workload_id, + } + series_id = identity.series_id(series_factors) + point_id = identity.point_id(series=series_id, 
tokens_per_rank=1) + sample_components = { + name: { + "availability": "measured", "sample_count": 512, + "trials": [[latency] * 8 for _ in range(64)], + } + for name, latency in (("combine", 20.0), ("dispatch", 10.0), ("roundtrip", 40.0)) + } + sample_components["stage"] = { + "availability": "unavailable", "sample_count": 0, "trials": None, + } + sample_sha = digest({"components": sample_components, "tokens_per_rank": 1}) + evidence_id = identity.evidence_id( + point=point_id, allocation=allocation_id, attempt=attempt_id, + sample_sha256=sample_sha, + ) + samples = { + "allocation_id": allocation_id, "attempt_id": attempt_id, "case_id": case_id, + "format": contracts.SAMPLES_FORMAT, + "points": [{ + "components": sample_components, "evidence_id": evidence_id, + "point_id": point_id, "sample_sha256": sample_sha, "tokens_per_rank": 1, + }], + "sampling": { + "iterations_per_trial": 8, "reduction": "cross-rank-max-per-iteration", + "trials": 64, + }, + "qualification_index": 1, "schema_version": 1, "series_id": series_id, + } + sample_bytes = contracts.canonical_json_bytes(samples) + oracle = { + "atol": 0.02, + "checks": {name: True for name in ( + "combine_values", "counts", "metadata", "multiplicity", "payload", + "source_set", "weights", + )}, + "combine_weight_semantics": "unweighted-rank-sum", + "contract": "expert-specific-transform-v1", "dispatch_sha256": "1" * 64, + "max_absolute_error": 0.0, "max_elementwise_relative_error": 0.0, + "max_relative_error": 0.0, "max_weight_error": 0.0, + "order_sha256": "2" * 64, "ordering_contract": "fixture-order-v1", + "passed": True, "receive_count": 1, "rtol": 0.05, + } + def pct(value: float) -> dict[str, float]: + return {name: value for name in ("p50", "p90", "p95", "p99")} + + def measured(value: float) -> dict: + return { + "availability": "measured", "origin": "measured", + "percentiles_us": pct(value), "sample_count": 512, + } + row = { + "anomalies": [], + "components": { + "combine": measured(20.0), "dispatch": 
measured(10.0), + "isolated_sum": { + "availability": "derived", "origin": "derived-percentile-sum", + "percentiles_us": pct(30.0), "sample_count": 0, + }, + "roundtrip": measured(40.0), + "stage": { + "availability": "unavailable", "origin": None, + "percentiles_us": None, "sample_count": 0, + }, + }, + "correctness": { + "contract": "expert-specific-transform-v1", "max_relative_error": 0.0, + "passed": True, "precision": _precision_evidence(), + "rank_evidence": [{ + "input_unchanged": True, "order_stable": True, + "post_timing": copy.deepcopy(oracle), "pre_timing": copy.deepcopy(oracle), + "rank": 0, + }], + "scope": "dispatch-metadata-and-transformed-combine", + }, + "evidence_id": evidence_id, "global_tokens": 1, + "byte_provenance": { + "combine": { + "accounting_contract": "activation-data-plus-scales-v1", + "activation_data_bytes": 2, "scale_bytes": 0, + "total_logical_bytes": 2, + }, + "dispatch": { + "accounting_contract": "activation-data-plus-scales-v1", + "activation_data_bytes": 2, "scale_bytes": 0, + "total_logical_bytes": 2, + }, + "roundtrip": { + "accounting_contract": "activation-data-plus-scales-v1", + "activation_data_bytes": 4, "scale_bytes": 0, + "total_logical_bytes": 4, + }, + "stage": { + "accounting_contract": "activation-data-plus-scales-v1", + "activation_data_bytes": 0, "scale_bytes": 0, + "total_logical_bytes": 0, + }, + }, + "point_id": point_id, + "receive": {"max": 1, "mean": 1.0, "min": 1, "total": 1}, + "routing": contracts._expected_routing_summary( + routing_rows, + routing_weights, + physical_experts=1, + ep_size=1, + tokens_per_rank=1, + gpus_per_node=1, + scale_up_domain=1, + ), + "sample_histograms": { + name: contracts._expected_histogram([value] * 512) + for name, value in (("combine", 20.0), ("dispatch", 10.0), ("roundtrip", 40.0)) + }, + "sample_sha256": sample_sha, + "token_rate_at_latency_percentile": pct(25_000.0), "tokens_per_rank": 1, + } + row["sample_histograms"]["stage"] = None + raw = { + "case": { + 
"attempt_ordinal": 1, "backend": backend, + "eplb": { + "calibration_token_offset": None, "calibration_trace_sha256": None, + "calibration_window": None, "calibration_workload_id": None, + "enabled": False, "imbalance_after": None, "imbalance_before": None, + "mapping_hash": None, "max_replicas": None, "num_logical_experts": 1, + "num_physical_experts": 1, "num_redundant": 0, "planner": None, + "reference_tokens_per_rank": None, "replicated_experts": 0, + }, + "ep_size": 1, "mode": "normal", "phase": "decode", + "required_publication": "official", "resource_mode": "fixed-profile", "runner": "fixture", + "shape": { + "activation_profile": "canonical-counter-source-v4", + "precision_profile": identity.V1_CONTROL_PRECISION_PROFILE, + "dispatch_precision": communication_precision["dispatch"], + "combine_precision": communication_precision["combine"], + "eplb": False, "experts": 1, "experts_per_rank": 1, "hidden": 1, + "kernel_gen": kernel_generation, "num_logical_experts": 1, + "routing": "uniform", "topk": 1, + }, + "suite": "ep-core-v1", "workload_name": "deepseek-v3-v1", + }, + "format": contracts.RAW_FORMAT, "generated_at": "2026-07-04T00:00:00Z", + "identity": { + "allocation_factors": allocation_factors, "allocation_id": allocation_id, + "attempt_id": attempt_id, "attempt_ordinal": 1, "case_factors": case_factors, + "case_id": case_id, "series_factors": series_factors, "series_id": series_id, + }, + "implementation": implementation, + "measurement": { + "component_order_contract": "qualification-hash-rotated-components-v1", + "conditioning": { + "contract": "fixed-phase-ramp-8-roundtrips-v1", + "ladder": [1, 2, 4, 8, 16, 32, 64, 128], + "roundtrips_per_shape": 8, + }, + "contract": "layout-and-dispatch-v1", + "execution_order_sha256": "9" * 64, + "qualification_index": 1, + "rows": [row], + "sampling": { + "contract": "fixed-512-v1", "iterations_per_trial": 8, + "percentile_method": "nearest-rank", + "reduction": "cross-rank-max-per-iteration", 
"samples_per_component": 512, + "trials": 64, "warmup_iterations": 32, + "warmup_semantics": "full-roundtrip-before-each-component-trial-point-v1", + }, + "source_allocation": "even", + }, + "outcome": { + "publication_status": "diagnostic", "reasons": [], "status": "success", + "validity": { + "anomaly_free": True, "execution_status": "complete", + "measurement_conformance": "conformant", "provenance_complete": True, + "resource_conformance": implementation["resource_profile"]["conformance_class"], + "sampling_conformance": "conformant", + "semantic_correctness": "pass", + "workload_identity": "consistent-across-ranks", + "workload_source": "canonical-serialized", + }, + }, + "provenance": { + "allocation_stratum_sha256": "f" * 64, + "command": "run_ep", "distributed_launcher": "torchrun", "git_run": git_run, + "image": { + "arch": "amd64", "digest": "sha256:" + "d" * 64, + "digest_verified": True, "reference": "fixture:1", "squash_sha256": "e" * 64, + }, + "redaction": "sanitized-v1", + }, + "record_type": "case-attempt", + "runtime_fingerprint": runtime, + "sample_artifact": { + "bytes": len(sample_bytes), "format": contracts.SAMPLES_FORMAT, + "path": "samples.json", "sha256": hashlib.sha256(sample_bytes).hexdigest(), + }, + "schema_version": 1, + "topology": { + "device_count": 1, "device_product": "Fixture GPU", "gpus_per_node": 1, + "nodes": 1, "placement": "packed", + "realized_placement": { + "gpus_per_node": 1, "nodes": 1, "ranks_per_node": 1, + "unique_local_ranks": True, "valid": True, + }, + "scale_out_transport": None, "scale_up_domain": 1, + "scale_up_transport": "nvlink", "scope": "scale-up", + "topology_class": "fixture", "transport": "nvlink", + "world_size": 1, + }, + "workload": { + "activation_generator": "collectivex-activation-counter-v4", + "activation_identity": hashlib.sha256( + b"counter|seed=67|hidden=1|gen=collectivex-activation-counter-v4" + ).hexdigest(), + "activation_profile": "canonical-counter-source-v4", "cross_rank_consistent": 
True, + "manifest_checksums": {member_id: member_checksums}, "members": [member_id], + "routing_generator": "collectivex-routing-counter-v3", "source": "canonical-serialized", + "trace_hashes": [routing_hash], + "trace_signature": hashlib.sha256(routing_hash.encode()).hexdigest(), + "workload_id": workload_id, + }, + } + return raw, samples + + +def _series(seed: str, backend: str, *, decision_grade: bool = False) -> tuple[dict, dict]: + case, allocation, attempt, series_id, point_id, evidence = _ids(seed) + allocations = [identity.allocation_id({"seed": seed, "run": run}) for run in range(3)] + eligibility = publisher._eligibility_record( + allocations if decision_grade else [allocation], + complete=decision_grade, + correct=True, + measured=True, + stable_ordering=True, + p50_ratio=1.01 if decision_grade else None, + p99_ratio=1.02 if decision_grade else None, + ) + component = _component(1.0 if backend == "deepep" else 1.2) + qualification_indices = [1, 2, 3] if decision_grade else [1] + communication_precision = identity.precision_profile( + identity.V1_CONTROL_PRECISION_PROFILE + ) + item = { + "series_id": series_id, + "label": f"H100 / {backend}", + "status": "decision-grade" if decision_grade else "diagnostic", + "case_ids": [case], + "allocation_ids": allocations if decision_grade else [allocation], + "model": "deepseek-v3-v1", + "suite": "ep-core-v1", + "mode": "normal", + "phase": "decode", + "publication_tier": "official", + "backend": { + "id": backend, "label": publisher.BACKEND_LABELS[backend], + "role": "reference" if backend == "nccl-ep" else "library", + "generation": "nccl" if backend == "nccl-ep" else None, + "version": "1.0"}, + "build": { + "implementation_contract_sha256": hashlib.sha256(backend.encode()).hexdigest(), + "public_config_sha256": "0" * 64, + "routing_control_sha256": hashlib.sha256(backend.encode()).hexdigest(), + "runtime_fingerprint_sha256": "3" * 64, + "image_digest": "sha256:" + "1" * 64, + "source_sha": "a" * 40, + 
"squash_sha256": "2" * 64, + }, + "system": { + "sku": "h100-dgxc", "label": "NVIDIA H100", "vendor": "nvidia", + "topology_class": "h100-nvlink-island", "transport": "nvlink", + "scale_up_transport": "nvlink", "scale_out_transport": None, + "scope": "scale-up", "nodes": 1, "gpus_per_node": 8, + "scale_up_domain": 8, + "world_size": 8, "ep_size": 8, "placement": "packed", + }, + "workload": { + "workload_id": identity.workload_id({"shape": "shared"}), + "hidden": 7168, "top_k": 8, "experts": 256, + "routing": "uniform", "eplb": False, + "precision_profile": identity.V1_CONTROL_PRECISION_PROFILE, + "dispatch_precision": communication_precision["dispatch"], + "combine_precision": communication_precision["combine"], + "activation_profile": "canonical-counter-source-v4", + }, + "eplb": { + "enabled": False, + "calibration_workload_id": None, "calibration_trace_sha256": None, + "calibration_window": None, "calibration_token_offset": None, + "planner": None, "mapping_sha256": None, + "logical_experts": 256, "physical_experts": 256, + "redundant_experts": 0, "reference_tokens_per_rank": None, + "replicated_experts": 0, "max_replicas": None, + "imbalance_before": None, "imbalance_after": None, + }, + "resource": {"mode": "fixed-profile", "profile": "profile-1", "comm_units_kind": "sm", "configured_units": 24}, + "measurement": { + "contract": "layout-and-dispatch-v1", + "component_order_contract": "qualification-hash-rotated-components-v1", + "combine_semantics": "activation-only", "payload_unit": "token-rank", + "sampling_contract": "fixed-512-v1", + "iters": 8, "trials": 64, "warmups": 32, "samples_per_component": 512, + "qualification_indices": qualification_indices, + "headline_component": "roundtrip", "headline_percentile": "p99", + }, + "points": [{ + "point_id": point_id, "tokens_per_rank": 8, "global_tokens": 64, + "anomalies": [], + "correctness": {"semantic_pass": True, "precision": _precision_evidence()}, + "stability": { + "complete": decision_grade, + 
"qualification_indices": qualification_indices, + "p50_max_min_ratio": 1.02 if decision_grade else None, + "p99_max_min_ratio": 1.02 if decision_grade else None, + "stable_p50": decision_grade, "stable_p99": decision_grade, + }, + "trial_diagnostics": { + "flagged": False, + "reasons": [], + "components": { + "dispatch": None, + "stage": None, + "combine": None, + "roundtrip": { + "drift_flagged": False, + "first_last_median_ratio": 1.0, + "outlier_flagged": False, + "robust_outlier_fraction": 0.0, + "trial_count": 192, + }, + }, + }, + "routing": { + "fanout_mean": 4.0, "recv_tokens_max": 64, + "expert_load_cv": 0.5, "payload_rank_cv": 0.25, + "hotspot_ratio": 2.0, "empty_expert_count": 0, + "empty_rank_count": 0, "routed_copies": 256, + }, + "components": {"dispatch": None, "stage": None, "combine": None, + "roundtrip": component, "isolated_sum": None}, + "roundtrip_token_rate_at_latency_percentile": { + name: 64 / (latency * 1e-6) + for name, latency in component["latency_us"].items() + }, + "evidence_ids": [evidence], + }], + "eligibility": eligibility, + } + item["build"]["public_config_sha256"] = contracts.public_series_config_sha256( + publisher._public_series_config(item) + ) + case = identity.digest("case", publisher._public_case_factors(item)) + item["case_ids"] = [case] + build = item["build"] + series_id = identity.series_id({ + "backend": item["backend"]["id"], + "case_id": case, + "image_digest": build["image_digest"], + "implementation_contract_sha256": build["implementation_contract_sha256"], + "public_config_sha256": build["public_config_sha256"], + "routing_control_sha256": build["routing_control_sha256"], + "runtime_fingerprint_sha256": build["runtime_fingerprint_sha256"], + "source_sha": build["source_sha"], + "squash_sha256": build["squash_sha256"], + "workload_id": item["workload"]["workload_id"], + }) + item["series_id"] = series_id + point_id = identity.point_id(series=series_id, tokens_per_rank=8) + item["points"][0]["point_id"] = point_id 
def _precision_series(
    seed: str,
    precision_profile: str,
    *,
    tokens: tuple[int, ...] = (128,),
) -> tuple[dict, dict]:
    """Build a decision-grade DeepEP series re-labelled for a precision profile.

    Starts from ``_series(seed, "deepep", decision_grade=True)``, rewrites the
    workload's precision fields, and replaces the single template point with
    one deep-copied point per entry of ``tokens``. The per-run metrics and
    trial blocks are re-keyed the same way. Any profile other than the control
    profile also moves the series to the precision suite and the
    comparable-experimental tier.

    Returns:
        ``(item, internal)`` in the same shape as ``_series``.
    """
    item, internal = _series(seed, "deepep", decision_grade=True)
    axes = identity.precision_profile(precision_profile)

    if precision_profile != identity.V1_CONTROL_PRECISION_PROFILE:
        item["suite"] = "ep-precision-normal-v1"
        item["publication_tier"] = "comparable-experimental"
    workload = item["workload"]
    workload["precision_profile"] = precision_profile
    workload["dispatch_precision"] = axes["dispatch"]
    workload["combine_precision"] = axes["combine"]

    template = item["points"][0]
    source_token = template["tokens_per_rank"]

    # Capture the per-run baselines before anything is rewritten.
    baselines = {}
    for section in ("run_metrics", "trial_blocks"):
        baselines[section] = {
            run_id: by_token[source_token]
            for run_id, by_token in internal[section].items()
        }

    ep_size_source = item["system"]
    item["points"] = []
    for token in tokens:
        point = copy.deepcopy(template)
        global_tokens = token * ep_size_source["ep_size"]
        point["tokens_per_rank"] = token
        point["global_tokens"] = global_tokens
        point["point_id"] = identity.point_id(
            series=item["series_id"], tokens_per_rank=token
        )
        point["correctness"]["precision"] = _precision_evidence(precision_profile)
        latencies = point["components"]["roundtrip"]["latency_us"]
        rates = {}
        for name, latency in latencies.items():
            rates[name] = point["global_tokens"] / (latency * 1e-6)
        point["roundtrip_token_rate_at_latency_percentile"] = rates
        item["points"].append(point)

    # Every requested token count gets its own copy of the run's baseline.
    for section, per_run in baselines.items():
        internal[section] = {
            run_id: {token: copy.deepcopy(baseline) for token in tokens}
            for run_id, baseline in per_run.items()
        }
    return item, internal
"generated_at": "2026-07-04T00:00:00Z", "source_bundle_ids": ["c" * 64], + "promotion": { + "status": "diagnostic", "reason": None, "matrix_id": "d" * 64, + "allocation_ids": [allocation], "required_allocations": 3, + "qualification_indices": [1], + "requested_cases": 1, "terminal_cases": 1, + "measured_cases": 1, "unsupported_cases": 0, + "requested_points": 1, "terminal_points": 1, + "measured_points": 1, "unsupported_points": 0, + "policy": "collectivex-decision-grade-v1", + }, + "coverage": [{ + "case_id": case, "label": "case", "required": True, "sku": "h100-dgxc", + "suite": item["suite"], "workload": item["model"], + "publication_tier": item["publication_tier"], + "backend": "deepep", "backend_generation": item["backend"]["generation"], + "mode": "normal", "phase": "decode", + "routing": item["workload"]["routing"], "eplb": False, + "precision_profile": item["workload"]["precision_profile"], + "dispatch_precision": item["workload"]["dispatch_precision"], + "combine_precision": item["workload"]["combine_precision"], + "resource": item["resource"], + "topology": publisher._coverage_topology(item["system"]), + "points": [{ + "point_id": item["points"][0]["point_id"], + "series_id": item["series_id"], + "tokens_per_rank": 8, "global_tokens": 64, + "terminal_status": "measured", "reason": None, + }], + "disposition": "runnable", + "selected_attempt_id": attempt, + "outcome": "success", "failure_mode": None, "reason": None, + "attempt_ids": [attempt], + }], + "attempts": [{ + "attempt_id": attempt, + "evidence": [{"evidence_id": evidence, + "point_id": item["points"][0]["point_id"]}], + "case_id": case, + "allocation_id": allocation, "run_id": "1", "run_attempt": 1, + "qualification_index": 1, + "attempt_index": 1, + "selected": True, "outcome": "success", "failure_mode": None, "reason": None, + "series_id": item["series_id"], + "completed_at": "2026-07-04T00:00:00Z", + }], + "series": [item], "cohorts": [], "rankings": [], "recommendations": [], + 
"sensitivities": [], + } + + +def _promoted_dataset(*, precision_profiles: tuple[str, ...] = ()) -> dict: + specifications = [ + ("library-fast", "deepep", None, False, None), + ("library-slow", "uccl", None, False, None), + ("chip-peer", "deepep", "h200-dgxc", False, None), + ("system-one", "nccl-ep", None, True, None), + ("system-two", "nccl-ep", "h200-dgxc", True, None), + ("routing-zipf", "deepep", None, False, None), + ("routing-zipf-eplb", "deepep", None, False, None), + ] + specifications.extend( + (f"precision-{index}", "deepep", None, False, precision_profile) + for index, precision_profile in enumerate(precision_profiles) + ) + series = [] + internals = {} + attempts = [] + coverage = [] + for seed, backend, peer_sku, reference, precision_profile in specifications: + item, internal = _series(seed, backend, decision_grade=True) + if peer_sku: + platform = publisher.capability.PLATFORMS[peer_sku] + item["system"].update({ + "sku": peer_sku, + "label": f"NVIDIA {platform['product'].upper()}", + "topology_class": platform["topology_class"], + "transport": platform["transport"], + }) + if reference: + item["backend"]["role"] = "reference" + if seed.startswith("routing-zipf"): + item["suite"] = "ep-routing-v1" + item["publication_tier"] = "comparable-experimental" + item["workload"]["routing"] = "zipf" + if seed == "routing-zipf-eplb": + item["workload"]["eplb"] = True + plan, calibration = contracts._expected_eplb_calibration( + "zipf", 7168, 8, 256, 288, item["system"]["ep_size"], 67, 2048 + ) + item["eplb"] = { + "enabled": True, **calibration, "planner": "greedy-rank-major-v1", + "mapping_sha256": contracts.eplb_contract.mapping_hash(plan), + "logical_experts": 256, "physical_experts": 288, + "redundant_experts": 32, "reference_tokens_per_rank": 2048, + "replicated_experts": plan["replicated_experts"], + "max_replicas": plan["max_replicas"], + "imbalance_before": plan["imbalance_before"], + "imbalance_after": plan["imbalance_after"], + } + 
item["build"]["implementation_contract_sha256"] = "8" * 64 + if precision_profile is not None: + precision = identity.precision_profile(precision_profile) + item["suite"] = "ep-precision-normal-v1" + item["publication_tier"] = "comparable-experimental" + item["workload"].update({ + "precision_profile": precision_profile, + "dispatch_precision": precision["dispatch"], + "combine_precision": precision["combine"], + }) + item["points"][0]["correctness"]["precision"] = _precision_evidence( + precision_profile + ) + case_id = identity.digest("case", publisher._public_case_factors(item)) + item["case_ids"] = [case_id] + build = item["build"] + build["public_config_sha256"] = contracts.public_series_config_sha256( + publisher._public_series_config(item) + ) + item["series_id"] = identity.series_id({ + "backend": item["backend"]["id"], + "case_id": case_id, + "image_digest": build["image_digest"], + "implementation_contract_sha256": build["implementation_contract_sha256"], + "public_config_sha256": build["public_config_sha256"], + "routing_control_sha256": build["routing_control_sha256"], + "runtime_fingerprint_sha256": build["runtime_fingerprint_sha256"], + "source_sha": build["source_sha"], + "squash_sha256": build["squash_sha256"], + "workload_id": item["workload"]["workload_id"], + }) + point = item["points"][0] + point["point_id"] = identity.point_id( + series=item["series_id"], tokens_per_rank=point["tokens_per_rank"] + ) + case_attempts = [] + evidence_ids = [] + for run_id, allocation_id in enumerate(item["allocation_ids"], 1): + attempt_id = identity.attempt_id( + allocation=allocation_id, case=case_id, ordinal=1 + ) + evidence_id = identity.evidence_id( + point=point["point_id"], allocation=allocation_id, + attempt=attempt_id, + sample_sha256=hashlib.sha256(f"{seed}-{run_id}".encode()).hexdigest(), + ) + attempts.append({ + "attempt_id": attempt_id, + "evidence": [{"evidence_id": evidence_id, "point_id": point["point_id"]}], + "case_id": case_id, "allocation_id": 
allocation_id, + "run_id": str(run_id), "run_attempt": 1, + "qualification_index": run_id, + "attempt_index": 1, "selected": True, + "outcome": "success", "failure_mode": None, "reason": None, + "series_id": item["series_id"], + "completed_at": "2026-07-04T00:00:00Z", + }) + case_attempts.append(attempt_id) + evidence_ids.append(evidence_id) + point["evidence_ids"] = evidence_ids + coverage.append({ + "case_id": case_id, "label": seed, "required": True, + "sku": item["system"]["sku"], "suite": item["suite"], + "workload": item["model"], "publication_tier": item["publication_tier"], + "backend": backend, "backend_generation": item["backend"]["generation"], + "mode": item["mode"], "phase": item["phase"], + "routing": item["workload"]["routing"], "eplb": item["workload"]["eplb"], + "precision_profile": item["workload"]["precision_profile"], + "dispatch_precision": item["workload"]["dispatch_precision"], + "combine_precision": item["workload"]["combine_precision"], + "resource": item["resource"], "disposition": "runnable", + "topology": publisher._coverage_topology(item["system"]), + "points": [{ + "point_id": point["point_id"], "series_id": item["series_id"], + "tokens_per_rank": point["tokens_per_rank"], + "global_tokens": point["global_tokens"], + "terminal_status": "measured", "reason": None, + }], + "selected_attempt_id": case_attempts[-1], "outcome": "success", + "failure_mode": None, "reason": None, "attempt_ids": case_attempts, + }) + series.append(item) + internals[item["series_id"]] = internal + + unsupported_case, unsupported = next( + (case_id, case) + for case_id, case in publisher._canonical_coverage_cases().items() + if case["sku"] == "mi355x" and case["backend"] == "deepep" + and case["phase"] == "decode" and case["routing"] == "uniform" + and not case["eplb"] and case["ep"] == 8 + ) + unsupported_attempts = [] + for run_id in range(1, 4): + allocation_id = identity.allocation_id( + {"seed": "planned-unsupported", "run": run_id} + ) + attempt_id = 
identity.attempt_id( + allocation=allocation_id, case=unsupported_case, ordinal=1 + ) + attempts.append({ + "attempt_id": attempt_id, "evidence": [], "case_id": unsupported_case, + "allocation_id": allocation_id, "run_id": str(run_id), + "run_attempt": 1, "qualification_index": run_id, + "attempt_index": 1, "selected": True, "outcome": "unsupported", + "failure_mode": "capability", "reason": "backend-platform-unsupported", + "series_id": None, "completed_at": "2026-07-04T00:00:00Z", + }) + unsupported_attempts.append(attempt_id) + coverage.append({ + "case_id": unsupported_case, "label": "planned unsupported", "required": True, + "sku": unsupported["sku"], "suite": unsupported["suite"], + "workload": unsupported["workload"], + "publication_tier": unsupported["required_publication"], + "backend": unsupported["backend"], "backend_generation": None, + "mode": unsupported["mode"], "phase": unsupported["phase"], + "routing": unsupported["routing"], "eplb": unsupported["eplb"], + "precision_profile": identity.V1_CONTROL_PRECISION_PROFILE, + "dispatch_precision": identity.precision_profile( + identity.V1_CONTROL_PRECISION_PROFILE + )["dispatch"], + "combine_precision": identity.precision_profile( + identity.V1_CONTROL_PRECISION_PROFILE + )["combine"], + "resource": { + "mode": "fixed-profile", "profile": None, + "comm_units_kind": None, "configured_units": None, + }, + "topology": publisher._coverage_topology(unsupported), + "points": [{ + "point_id": None, "series_id": None, + "tokens_per_rank": token, "global_tokens": token * unsupported["ep"], + "terminal_status": "unsupported", + "reason": "backend-platform-unsupported", + } for token in map(int, unsupported["ladder"].split())], + "disposition": "unsupported", "selected_attempt_id": unsupported_attempts[-1], + "outcome": "unsupported", "failure_mode": "capability", + "reason": "backend-platform-unsupported", "attempt_ids": unsupported_attempts, + }) + cohorts, rankings, recommendations, sensitivities = 
publisher.build_decisions( + series, internals + ) + return { + "format": "collectivex.public.v1", "schema_version": 1, + "generated_at": "2026-07-04T00:00:00Z", + "source_bundle_ids": ["a" * 64, "b" * 64, "c" * 64], + "promotion": { + "status": "promoted", "reason": None, + "matrix_id": publisher.CANONICAL_FULL_V1_MATRIX_SHA256, + "allocation_ids": sorted({item["allocation_id"] for item in attempts}), + "required_allocations": 3, "qualification_indices": [1, 2, 3], + "requested_cases": len(coverage), "terminal_cases": len(coverage), + "measured_cases": len(coverage) - 1, "unsupported_cases": 1, + "requested_points": sum(len(item["points"]) for item in coverage), + "terminal_points": sum(len(item["points"]) for item in coverage), + "measured_points": sum( + point["terminal_status"] == "measured" + for item in coverage for point in item["points"] + ), + "unsupported_points": sum( + point["terminal_status"] == "unsupported" + for item in coverage for point in item["points"] + ), + "policy": "collectivex-decision-grade-v1", + }, + "coverage": sorted(coverage, key=lambda item: item["case_id"]), + "attempts": sorted(attempts, key=lambda item: item["attempt_id"]), + "series": sorted(series, key=lambda item: item["series_id"]), + "cohorts": cohorts, "rankings": rankings, + "recommendations": recommendations, "sensitivities": sensitivities, + } + + +def _cohort_counts(dataset: dict) -> dict[str, int]: + return { + kind: sum(item["kind"] == kind for item in dataset["cohorts"]) + for kind in ("library", "system", "routing") + } + + +class PublisherTest(unittest.TestCase): + def test_trial_diagnostics_flag_drift_and_robust_outliers(self) -> None: + def runs() -> dict[str, dict[int, dict[str, object]]]: + return { + str(index): { + 8: { + "dispatch": tuple(tuple([10.0] * 8) for _ in range(64)), + "stage": None, + "combine": tuple(tuple([10.0] * 8) for _ in range(64)), + "roundtrip": tuple(tuple([10.0] * 8) for _ in range(64)), + } + } + for index in range(1, 4) + } + + stable 
= publisher._trial_diagnostics(runs(), 8) + self.assertFalse(stable["flagged"]) + + drift = runs() + drift["1"][8]["roundtrip"] = tuple( + tuple([12.0 if trial >= 56 else 10.0] * 8) for trial in range(64) + ) + self.assertEqual(publisher._trial_diagnostics(drift, 8)["reasons"], ["trial-drift"]) + + outliers = runs() + outliers["1"][8]["roundtrip"] = tuple( + tuple([100.0 if 20 <= trial < 36 else 10.0] * 8) for trial in range(64) + ) + summary = publisher._trial_diagnostics(outliers, 8) + self.assertEqual(summary["reasons"], ["trial-outliers"]) + self.assertGreater( + summary["components"]["roundtrip"]["robust_outlier_fraction"], + publisher.TRIAL_OUTLIER_FRACTION_LIMIT, + ) + + def test_terminal_allocation_and_source_status_are_bound(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + matrix, artifact = _unsupported_delivery(root) + path = next(artifact.glob("*.json")) + terminal = contracts.strict_load(path) + self.assertIs(contracts.validate_terminal_document(terminal), terminal) + self.assertEqual( + contracts.validate_delivery( + [str(path)], str(matrix), disposition="unsupported" + ), + 1, + ) + + for control_sha256 in (None, "0" * 64): + broken = copy.deepcopy(terminal) + broken["provenance"]["control_sha256"] = control_sha256 + path.write_text(json.dumps(broken)) + with self.assertRaisesRegex(contracts.ContractError, "exact control document"): + contracts.validate_delivery( + [str(path)], str(matrix), disposition="unsupported" + ) + path.write_text(json.dumps(terminal)) + + for field in ( + "artifact", "job", "repo", "run_attempt", "run_id", "source_sha", "runner" + ): + broken = copy.deepcopy(terminal) + broken["identity"]["allocation_factors"][field] = f"forged-{field}" + allocation_id = identity.allocation_id( + broken["identity"]["allocation_factors"] + ) + broken["identity"]["allocation_id"] = allocation_id + broken["identity"]["attempt_id"] = identity.attempt_id( + allocation=allocation_id, + 
case=broken["identity"]["case_id"], + ordinal=broken["identity"]["attempt_ordinal"], + ) + with self.assertRaisesRegex( + contracts.ContractError, "allocation factors differ" + ): + contracts.validate_terminal_document(broken) + + broken = copy.deepcopy(terminal) + broken["outcome"]["status"] = "failed" + with self.assertRaisesRegex(contracts.ContractError, "source and outcome"): + contracts.validate_terminal_document(broken) + broken = copy.deepcopy(terminal) + broken["provenance"]["source"] = "runtime-emitter" + with self.assertRaisesRegex(contracts.ContractError, "source and outcome"): + contracts.validate_terminal_document(broken) + + for path_parts, replacement in ( + (("provenance", "source"), "unregistered-producer"), + (("outcome", "failure_mode"), "unsupported-capability"), + (("outcome", "reason"), "unregistered-capability"), + ): + with self.subTest(path=path_parts): + broken = copy.deepcopy(terminal) + broken[path_parts[0]][path_parts[1]] = replacement + with self.assertRaises(publisher.PublisherError): + publisher._schema("terminal-outcome-v1.schema.json", broken) + with self.assertRaises(contracts.ContractError): + contracts.validate_terminal_document(broken) + + runtime_allocation = copy.deepcopy( + terminal["identity"]["allocation_factors"] + ) + runtime_allocation["runner"] = terminal["identity"]["case_factors"]["sku"] + runtime = contracts.make_terminal_document( + allocation_factors=runtime_allocation, + attempt_ordinal=1, + case=terminal["case"], + case_factors=terminal["identity"]["case_factors"], + control_sha256=terminal["provenance"]["control_sha256"], + failure_mode="setup", + generated_at=terminal["generated_at"], + git_run=terminal["provenance"]["git_run"], + reason="launcher-setup-failed", + return_code=1, + source="runtime-emitter", + status="failed", + expected_case_id=terminal["identity"]["case_id"], + ) + publisher._schema("terminal-outcome-v1.schema.json", runtime) + broken = copy.deepcopy(runtime) + broken["outcome"]["reason"] = 
"backend-setup-failed" + with self.assertRaises(publisher.PublisherError): + publisher._schema("terminal-outcome-v1.schema.json", broken) + with self.assertRaises(contracts.ContractError): + contracts.validate_terminal_document(broken) + + def test_post_emit_demotion_uses_closed_failure_taxonomy(self) -> None: + raw, _ = _native_fixture() + expected = { + 5: "runtime-identity", + 6: "execution", + 124: "timeout", + 137: "timeout", + 134: "execution", + 9: "execution", + } + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + for return_code, failure_mode in expected.items(): + with self.subTest(return_code=return_code): + path = root / f"attempt-{return_code}.json" + path.write_text(json.dumps(raw)) + terminal = contracts.demote_raw_attempt(path, return_code) + self.assertEqual( + terminal["outcome"], + { + "failure_mode": failure_mode, + "reason": "post-emit-distributed-command-failed", + "return_code": return_code, + "status": "failed", + }, + ) + self.assertEqual(terminal["provenance"]["source"], "post-emit-command") + publisher._schema("terminal-outcome-v1.schema.json", terminal) + + broken = copy.deepcopy(terminal) + broken["outcome"]["reason"] = "distributed-command-failed" + with self.assertRaises(publisher.PublisherError): + publisher._schema("terminal-outcome-v1.schema.json", broken) + with self.assertRaises(contracts.ContractError): + contracts.validate_terminal_document(broken) + + def test_artifact_safety_accepts_current_v1_fixtures(self) -> None: + raw, samples = _native_fixture() + publisher.artifact_safety.assert_publication_safe([ + sweep_matrix.resolve_matrix(backends="all"), + raw, + samples, + _dataset(), + _promoted_dataset(), + ]) + + def test_native_raw_and_sample_schema_match_semantic_validator(self) -> None: + raw, samples = _native_fixture() + publisher._schema("samples-v1.schema.json", samples) + publisher._schema("raw-case-v1.schema.json", raw) + self.assertIs(contracts.validate_raw_document(raw, samples), raw) + 
provenance = raw["provenance"] + image = provenance["image"] + self.assertTrue(contracts.provenance_complete( + raw["implementation"]["provenance"], raw["case"]["backend"], + provenance["git_run"], + allocation_stratum_sha256=provenance["allocation_stratum_sha256"], + image_digest=image["digest"], image_verified=image["digest_verified"], + squash_sha256=image["squash_sha256"], + )) + self.assertFalse(contracts.provenance_complete( + raw["implementation"]["provenance"], raw["case"]["backend"], + provenance["git_run"], allocation_stratum_sha256=None, + image_digest=image["digest"], image_verified=image["digest_verified"], + squash_sha256=image["squash_sha256"], + )) + missing_stratum = copy.deepcopy(raw) + missing_stratum["provenance"]["allocation_stratum_sha256"] = None + with self.assertRaises(publisher.PublisherError): + publisher._schema("raw-case-v1.schema.json", missing_stratum) + with self.assertRaisesRegex(contracts.ContractError, "allocation stratum"): + contracts.validate_raw_document(missing_stratum, samples) + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + (root / "samples.json").write_bytes(contracts.canonical_json_bytes(samples)) + (root / "raw.json").write_bytes(contracts.canonical_json_bytes(raw)) + self.assertEqual(contracts.load_raw_attempt(root / "raw.json"), raw) + for target in ("raw", "samples"): + broken_raw, broken_samples = copy.deepcopy((raw, samples)) + broken = broken_raw if target == "raw" else broken_samples + broken["unexpected"] = True + with self.assertRaises(publisher.PublisherError): + publisher._schema( + "raw-case-v1.schema.json" if target == "raw" else "samples-v1.schema.json", + broken, + ) + with self.assertRaises(contracts.ContractError): + contracts.validate_raw_document(broken_raw, broken_samples) + tampered = copy.deepcopy(raw) + tampered["measurement"]["rows"][0]["token_rate_at_latency_percentile"]["p50"] *= 2 + with self.assertRaisesRegex(contracts.ContractError, 
"token_rate_at_latency_percentile"): + contracts.validate_raw_document(tampered, samples) + tampered = copy.deepcopy(raw) + tampered["case"]["shape"]["hidden"] = 2 + with self.assertRaises(contracts.ContractError): + contracts.validate_raw_document(tampered, samples) + tampered = copy.deepcopy(raw) + configured = tampered["implementation"]["resource_profile"]["configured_units"] + tampered["implementation"]["resource_profile"]["configured_units"] = ( + 1 if configured is None else configured + 1 + ) + with self.assertRaisesRegex(contracts.ContractError, "resource profile"): + contracts.validate_raw_document(tampered, samples) + tampered = copy.deepcopy(raw) + oracle = tampered["measurement"]["rows"][0]["correctness"]["rank_evidence"][0] + oracle["pre_timing"]["checks"]["combine_values"] = False + with self.assertRaisesRegex(contracts.ContractError, "passed differs"): + contracts.validate_raw_document(tampered, samples) + + def test_hybrid_raw_binds_realized_config_and_every_rank_artifact(self) -> None: + raw, samples = _native_fixture("deepep-hybrid") + publisher._schema("raw-case-v1.schema.json", raw) + self.assertIs(contracts.validate_raw_document(raw, samples), raw) + + mutations = { + "hidden_dim": lambda provenance: provenance["realized_config"].update( + hidden_dim=2 + ), + "experts_per_rank": lambda provenance: provenance["realized_config"].update( + num_of_experts_per_rank=2 + ), + "ranks_per_node": lambda provenance: provenance["realized_config"].update( + num_of_ranks_per_node=2 + ), + "num_nodes": lambda provenance: provenance["realized_config"].update( + num_of_nodes=2 + ), + "token_data_type": lambda provenance: provenance["realized_config"].update( + token_data_type="UINT8" + ), + "rank_coverage": lambda provenance: [ + artifact["rank_artifacts"].append({ + "bytes": 1, "rank": 1, "sha256": "9" * 64, + }) + for artifact in provenance["jit_shared_objects"] + ], + } + for name, mutate in mutations.items(): + with self.subTest(name=name): + changed = 
copy.deepcopy(raw) + mutate(changed["implementation"]["provenance"]) + with self.assertRaisesRegex( + contracts.ContractError, + "DeepEP Hybrid realized config/JIT evidence differs", + ): + contracts.validate_raw_document(changed, samples) + + def test_native_contract_recomputes_routing_receive_histograms_and_anomalies(self) -> None: + raw, samples = _native_fixture() + + tampered = copy.deepcopy(raw) + changed = tampered["measurement"]["rows"][0] + changed["routing"]["routed_copies"] *= 2 + for name in ("combine", "dispatch", "roundtrip"): + byte_provenance = changed["byte_provenance"][name] + byte_provenance["activation_data_bytes"] *= 2 + byte_provenance["total_logical_bytes"] *= 2 + with self.assertRaisesRegex(contracts.ContractError, "routing.routed_copies"): + contracts.validate_raw_document(tampered, samples) + + tampered = copy.deepcopy(raw) + changed = tampered["measurement"]["rows"][0] + changed["routing"]["payload_copies_per_rank"] = [2] + changed["receive"] = {"max": 2, "mean": 2.0, "min": 2, "total": 2} + with self.assertRaisesRegex(contracts.ContractError, "payload_copies_per_rank"): + contracts.validate_raw_document(tampered, samples) + + tampered = copy.deepcopy(raw) + tampered["measurement"]["rows"][0]["sample_histograms"]["roundtrip"][ + "counts" + ] = [511] + with self.assertRaisesRegex(contracts.ContractError, "sample_histograms"): + contracts.validate_raw_document(tampered, samples) + + tampered = copy.deepcopy(raw) + tampered["measurement"]["rows"][0]["anomalies"] = [{ + "type": "roundtrip_gt_isolated_sum", + "T": 1, + "roundtrip_p99": 40.0, + "isolated_sum_p99": 30.0, + "ratio": 1.33, + "threshold": 3.0, + }] + tampered["outcome"]["validity"]["anomaly_free"] = False + with self.assertRaisesRegex(contracts.ContractError, "anomalies"): + contracts.validate_raw_document(tampered, samples) + + anomalous_raw, anomalous_samples = copy.deepcopy((raw, samples)) + sample_point = anomalous_samples["points"][0] + 
sample_point["components"]["roundtrip"]["trials"] = [ + [100.0] * 8 for _ in range(64) + ] + sample_core = { + "components": sample_point["components"], + "tokens_per_rank": sample_point["tokens_per_rank"], + } + sample_sha = hashlib.sha256( + contracts.canonical_json_bytes(sample_core) + ).hexdigest() + point_id = sample_point["point_id"] + evidence_id = identity.evidence_id( + point=point_id, + allocation=anomalous_raw["identity"]["allocation_id"], + attempt=anomalous_raw["identity"]["attempt_id"], + sample_sha256=sample_sha, + ) + sample_point.update({"sample_sha256": sample_sha, "evidence_id": evidence_id}) + changed = anomalous_raw["measurement"]["rows"][0] + changed["sample_sha256"] = sample_sha + changed["evidence_id"] = evidence_id + changed["components"]["roundtrip"]["percentiles_us"] = { + name: 100.0 for name in ("p50", "p90", "p95", "p99") + } + changed["token_rate_at_latency_percentile"] = { + name: 10_000.0 for name in ("p50", "p90", "p95", "p99") + } + changed["sample_histograms"]["roundtrip"] = contracts._expected_histogram( + [100.0] * 512 + ) + changed["anomalies"] = contracts._expected_anomalies(1, changed["components"]) + anomalous_raw["outcome"]["validity"]["anomaly_free"] = False + sample_bytes = contracts.canonical_json_bytes(anomalous_samples) + anomalous_raw["sample_artifact"].update({ + "bytes": len(sample_bytes), + "sha256": hashlib.sha256(sample_bytes).hexdigest(), + }) + self.assertIs( + contracts.validate_raw_document(anomalous_raw, anomalous_samples), + anomalous_raw, + ) + changed["anomalies"] = [] + anomalous_raw["outcome"]["validity"]["anomaly_free"] = True + with self.assertRaisesRegex(contracts.ContractError, "anomalies"): + contracts.validate_raw_document(anomalous_raw, anomalous_samples) + + def test_native_contract_rejects_every_schema_only_nested_mutation(self) -> None: + raw, samples = _native_fixture() + self.assertIs(contracts.validate_raw_document(raw, samples), raw) + + def locate(document: object, path: tuple[object, 
...]) -> object: + value = document + for part in path: + value = value[part] # type: ignore[index] + return value + + def reject_raw(document: dict) -> None: + with self.assertRaises(publisher.PublisherError): + publisher._schema("raw-case-v1.schema.json", document) + with self.assertRaises(contracts.ContractError): + contracts.validate_raw_document(document, samples) + + required_fields = ( + (("measurement", "rows", 0, "receive"), "total"), + (("measurement", "rows", 0, "routing"), "fanout_mean"), + (("measurement", "rows", 0, "routing", "source_token_stats"), "ranks"), + (("measurement", "rows", 0, "sample_histograms"), "roundtrip"), + (("measurement", "rows", 0, "sample_histograms", "roundtrip"), "n"), + (("runtime_fingerprint", "accelerator_runtime"), "kind"), + (("runtime_fingerprint", "collective_library"), "kind"), + (("runtime_fingerprint", "framework"), "kind"), + ) + for path, required in required_fields: + with self.subTest(path=path, mutation="missing"): + broken = copy.deepcopy(raw) + del locate(broken, path)[required] # type: ignore[index] + reject_raw(broken) + with self.subTest(path=path, mutation="extra"): + broken = copy.deepcopy(raw) + locate(broken, path)["unexpected"] = None # type: ignore[index] + reject_raw(broken) + + invalid_values = ( + (("measurement", "rows", 0, "receive", "mean"), "one"), + (("measurement", "rows", 0, "routing", "fanout_mean"), "one"), + (("measurement", "rows", 0, "sample_histograms", "roundtrip", "bins"), 0), + (("provenance", "image", "arch"), "AMD64"), + (("runtime_fingerprint", "accelerator_runtime", "kind"), "rocm"), + ) + for path, invalid in invalid_values: + with self.subTest(path=path, mutation="value"): + broken = copy.deepcopy(raw) + parent = locate(broken, path[:-1]) + parent[path[-1]] = invalid # type: ignore[index] + reject_raw(broken) + + def reject_samples(document: dict) -> None: + with self.assertRaises(publisher.PublisherError): + publisher._schema("samples-v1.schema.json", document) + with 
self.assertRaises(contracts.ContractError): + contracts.validate_samples_document(document) + + for path, required in ( + (("points", 0), "evidence_id"), + (("points", 0, "components"), "roundtrip"), + (("points", 0, "components", "roundtrip"), "trials"), + (("sampling",), "reduction"), + ): + with self.subTest(path=path, artifact="samples"): + broken = copy.deepcopy(samples) + del locate(broken, path)[required] # type: ignore[index] + reject_samples(broken) + + def test_terminal_contract_and_schema_reject_the_same_shape_gaps(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + _, artifact = _unsupported_delivery(Path(temporary).resolve()) + terminal = contracts.strict_load(next(artifact.glob("*.json"))) + publisher._schema("terminal-outcome-v1.schema.json", terminal) + self.assertIs(contracts.validate_terminal_document(terminal), terminal) + + def reject(document: dict) -> None: + with self.assertRaises(publisher.PublisherError): + publisher._schema("terminal-outcome-v1.schema.json", document) + with self.assertRaises(contracts.ContractError): + contracts.validate_terminal_document(document) + + for path, invalid in ( + (("outcome", "failure_mode"), "Not Safe"), + (("outcome", "reason"), "x" * 241), + (("provenance", "source"), "Not Safe"), + (("provenance", "git_run", "ref"), ""), + ): + with self.subTest(path=path): + broken = copy.deepcopy(terminal) + parent = broken + for part in path[:-1]: + parent = parent[part] + parent[path[-1]] = invalid + reject(broken) + + def test_invalid_retry_is_quarantined_before_valid_retry_upload(self) -> None: + raw, samples = _native_fixture() + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + sample_bytes = contracts.canonical_json_bytes(samples) + bad = copy.deepcopy(raw) + bad["sample_artifact"].update({ + "path": "a01.samples.json", "bytes": len(sample_bytes), + "sha256": hashlib.sha256(sample_bytes).hexdigest(), + }) + 
bad["measurement"]["rows"][0]["token_rate_at_latency_percentile"]["p50"] *= 2 + (root / "a01.samples.json").write_bytes(sample_bytes) + (root / "a01.json").write_bytes(contracts.canonical_json_bytes(bad)) + self.assertTrue(contracts.quarantine_invalid_attempt(root / "a01.json")) + valid = copy.deepcopy(raw) + valid["sample_artifact"].update({ + "path": "a02.samples.json", "bytes": len(sample_bytes), + "sha256": hashlib.sha256(sample_bytes).hexdigest(), + }) + (root / "a02.samples.json").write_bytes(sample_bytes) + (root / "a02.json").write_bytes(contracts.canonical_json_bytes(valid)) + paths = sorted(str(path) for path in root.glob("*.json")) + self.assertEqual(contracts.validate_attempt_paths(paths), 1) + self.assertTrue((root / "a01.json.quarantine").is_file()) + self.assertTrue((root / "a01.samples.json.quarantine").is_file()) + + def test_ingest_archives_without_publishing_a_channel(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + matrix, artifact = _unsupported_delivery(root) + self.assertEqual(len(summarize.load_results(str(artifact), None, None)), 1) + result = publisher.ingest_command(_args(root / "store", matrix, artifact)) + store = publisher.Store(root / "store") + self.assertEqual(result["status"], "accepted") + self.assertTrue((store.incoming / result["incoming_id"] / "COMPLETE").is_file()) + self.assertTrue((store.bundles / result["bundle_id"] / "COMPLETE").is_file()) + self.assertEqual(list(store.channels.iterdir()), []) + self.assertEqual(list(store.datasets.iterdir()), []) + self.assertEqual(os.stat(store.private).st_mode & 0o777, 0o700) + self.assertEqual(os.stat(store.public).st_mode & 0o777, 0o755) + self.assertEqual(os.stat(store.bundles / result["bundle_id"]).st_mode & 0o777, 0o500) + + def test_repeated_ingest_is_content_idempotent(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + matrix, artifact = _unsupported_delivery(root) + args = 
_args(root / "store", matrix, artifact) + first = publisher.ingest_command(args) + store = publisher.Store(root / "store") + second = publisher.ingest_command(args) + self.assertEqual(second, first) + self.assertEqual(len(list(store.incoming.iterdir())), 1) + self.assertEqual(len(list(store.bundles.iterdir())), 1) + self.assertEqual(len(list(store.datasets.iterdir())), 0) + self.assertEqual(len(list(store.channels.iterdir())), 0) + bundle = publisher.strict_load( + store.bundles / first["bundle_id"] / "bundle.json" + ) + terminal = publisher.strict_load(next(artifact.glob("*.json"))) + self.assertEqual(bundle["created_at"], terminal["generated_at"]) + + def test_dataset_is_invariant_to_bundle_argument_order(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + store_root = root / "store" + bundle_ids = [] + for run_id in (9, 11, 10): + run = {**RUN, "run_id": str(run_id)} + delivery = root / f"run-{run_id}" + delivery.mkdir() + matrix, artifact = _unsupported_delivery(delivery, run=run) + result = publisher.ingest_command( + _args(store_root, matrix, artifact, run=run) + ) + bundle_ids.append(result["bundle_id"]) + datasets = [ + publisher.build_dataset( + publisher.Store(store_root), order, promote=False, + ) + for order in itertools.permutations(bundle_ids) + ] + self.assertTrue(all(dataset == datasets[0] for dataset in datasets[1:])) + self.assertEqual(datasets[0]["generated_at"], "2026-07-04T00:00:00Z") + selected = datasets[0]["coverage"][0]["selected_attempt_id"] + selected_attempt = next( + item for item in datasets[0]["attempts"] + if item["attempt_id"] == selected + ) + self.assertEqual(selected_attempt["run_id"], "11") + + def test_diagnostic_dataset_orders_reruns_by_run_attempt(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + store_root = root / "store" + bundle_ids = [] + for run_attempt in (1, 2): + run = {**RUN, "run_attempt": run_attempt} + delivery 
= root / f"attempt-{run_attempt}" + delivery.mkdir() + matrix, artifact = _unsupported_delivery(delivery, run=run) + result = publisher.ingest_command( + _args(store_root, matrix, artifact, run=run) + ) + bundle_ids.append(result["bundle_id"]) + dataset = publisher.build_dataset( + publisher.Store(store_root), bundle_ids, promote=False + ) + selected_id = dataset["coverage"][0]["selected_attempt_id"] + selected = next( + item for item in dataset["attempts"] + if item["attempt_id"] == selected_id + ) + self.assertEqual(selected["run_attempt"], 2) + + def test_promotion_requires_every_runnable_case_to_succeed_in_every_bundle(self) -> None: + cases = { + "runnable": {"_disposition": "runnable"}, + "planned-unsupported": {"_disposition": "unsupported"}, + } + bundles = [] + for _ in range(3): + runnable = { + "identity": {"case_id": "runnable"}, + "outcome": {"status": "success"}, + } + unsupported = { + "identity": {"case_id": "planned-unsupported"}, + "outcome": {"status": "unsupported"}, + } + bundles.append({ + "selected": {"runnable": runnable, "planned-unsupported": unsupported}, + "documents": {"runnable": runnable, "planned-unsupported": unsupported}, + }) + publisher._require_runnable_promotion_success(bundles, cases) + + for status in ("failed", "invalid", "unsupported", "diagnostic"): + with self.subTest(status=status): + broken = copy.deepcopy(bundles) + broken[1]["selected"]["runnable"]["outcome"]["status"] = status + with self.assertRaisesRegex( + publisher.PublisherError, "every runnable matrix case" + ): + publisher._require_runnable_promotion_success(broken, cases) + + broken = copy.deepcopy(bundles) + broken[1]["documents"]["retry"] = { + "identity": {"case_id": "runnable"}, + "outcome": {"status": "failed"}, + } + with self.assertRaisesRegex(publisher.PublisherError, "rejects runnable cases"): + publisher._require_runnable_promotion_success(broken, cases) + + def test_promoted_public_dataset_rejects_failed_retry_history(self) -> None: + dataset = 
_promoted_dataset() + successful = next( + item for item in dataset["attempts"] + if item["outcome"] == "success" + ) + failed = copy.deepcopy(successful) + old_attempt_id = successful["attempt_id"] + successful["attempt_index"] = 2 + successful["attempt_id"] = identity.attempt_id( + allocation=successful["allocation_id"], case=successful["case_id"], ordinal=2 + ) + failed.update({ + "attempt_id": old_attempt_id, + "attempt_index": 1, + "outcome": "failed", + "failure_mode": "execution", + "reason": "execution-failed", + "series_id": None, + "selected": False, + "evidence": [], + }) + dataset["attempts"].append(failed) + dataset["attempts"].sort(key=lambda item: item["attempt_id"]) + coverage = next( + item for item in dataset["coverage"] + if item["case_id"] == failed["case_id"] + ) + coverage["attempt_ids"] = [ + successful["attempt_id"] if value == old_attempt_id else value + for value in coverage["attempt_ids"] + ] + coverage["attempt_ids"].append(failed["attempt_id"]) + coverage["attempt_ids"].sort() + if coverage["selected_attempt_id"] == old_attempt_id: + coverage["selected_attempt_id"] = successful["attempt_id"] + + fixture_catalog = publisher._case_disposition_catalog_sha256(dataset["coverage"]) + with mock.patch.object( + publisher, "CANONICAL_FULL_V1_CASE_CATALOG_SHA256", fixture_catalog + ), self.assertRaisesRegex(publisher.PublisherError, "rejects runnable cases"): + publisher.validate_public_dataset(dataset) + + def test_unselected_success_does_not_reference_an_unpublished_series(self) -> None: + raw, _ = _native_fixture() + retained = publisher._public_attempt(raw, selected=False) + selected = publisher._public_attempt(raw, selected=True) + self.assertEqual(retained["outcome"], "success") + self.assertIsNone(retained["series_id"]) + self.assertEqual(selected["series_id"], raw["identity"]["series_id"]) + + def test_public_dataset_selects_latest_derived_retry(self) -> None: + dataset = _dataset() + first = dataset["attempts"][0] + second = 
copy.deepcopy(first) + second.update({ + "attempt_id": identity.attempt_id( + allocation=first["allocation_id"], case=first["case_id"], ordinal=2 + ), + "attempt_index": 2, + "selected": False, + "series_id": None, + "evidence": [], + }) + dataset["attempts"].append(second) + dataset["attempts"].sort(key=lambda item: item["attempt_id"]) + dataset["coverage"][0]["attempt_ids"].append(second["attempt_id"]) + dataset["coverage"][0]["attempt_ids"].sort() + with self.assertRaisesRegex(publisher.PublisherError, "select the latest retry"): + publisher.validate_public_dataset(dataset) + + second["attempt_id"] = identity.digest("attempt", {"not": "derived"}) + dataset["attempts"].sort(key=lambda item: item["attempt_id"]) + dataset["coverage"][0]["attempt_ids"] = [ + item["attempt_id"] for item in dataset["attempts"] + ] + with self.assertRaisesRegex(publisher.PublisherError, "retry identity differs"): + publisher.validate_public_dataset(dataset) + + def test_promotion_requires_an_eligible_cohort_for_every_comparison_kind(self) -> None: + stable_fast, stable_fast_internal = _series( + "stable-fast", "deepep", decision_grade=True + ) + stable_slow, stable_slow_internal = _series( + "stable-slow", "uccl", decision_grade=True + ) + unstable_fast, unstable_fast_internal = _series( + "unstable-fast", "deepep", decision_grade=True + ) + unstable_slow, unstable_slow_internal = _series( + "unstable-slow", "uccl", decision_grade=True + ) + unstable_fast["phase"] = unstable_slow["phase"] = "prefill" + unstable_fast["series_id"] = identity.series_id({"test": "unstable-fast"}) + unstable_slow["series_id"] = identity.series_id({"test": "unstable-slow"}) + for statistic in ("p50", "p99"): + unstable_slow_internal["run_metrics"]["1"][8]["latency_us"][statistic] = ( + unstable_fast_internal["run_metrics"]["1"][8]["latency_us"][statistic] + / 2 + ) + for field in ( + "activation_data_rate_gbps_at_latency_percentile", + "total_logical_data_rate_gbps_at_latency_percentile", + ): + 
unstable_slow_internal["run_metrics"]["1"][8][field][statistic] = ( + unstable_fast_internal["run_metrics"]["1"][8][field][statistic] * 2 + ) + series = [stable_fast, stable_slow, unstable_fast, unstable_slow] + internals = { + stable_fast["series_id"]: stable_fast_internal, + stable_slow["series_id"]: stable_slow_internal, + unstable_fast["series_id"]: unstable_fast_internal, + unstable_slow["series_id"]: unstable_slow_internal, + } + cohorts, _, _, _ = publisher.build_decisions(series, internals) + eligible = [item for item in cohorts if item["eligibility"]["decision_grade"]] + ineligible = [item for item in cohorts if not item["eligibility"]["decision_grade"]] + self.assertEqual({item["kind"] for item in eligible}, {"library"}) + self.assertTrue(ineligible) + anchor_series = [ + { + "series_id": name, + "workload": {"routing": routing, "eplb": eplb}, + "build": {"implementation_contract_sha256": "1" * 64}, + } + for name, routing, eplb in ( + ("uniform", "uniform", False), + ("zipf", "zipf", False), + ("zipf-eplb", "zipf", True), + ) + ] + required = eligible + [ + { + "kind": kind, + "eligibility": {"decision_grade": True}, + **({"series_ids": [item["series_id"] for item in anchor_series]} + if kind == "routing" else {}), + } + for kind in publisher.REQUIRED_COHORT_KINDS + if kind != "library" + ] + with mock.patch.object( + publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", {} + ), mock.patch.object( + publisher, "_expected_chip_cohort_count", return_value=1 + ): + publisher._require_promotion_cohorts( + required + ineligible, anchor_series + ) + for kind in publisher.REQUIRED_COHORT_KINDS: + with self.subTest(missing_kind=kind), self.assertRaisesRegex( + publisher.PublisherError, rf"cohort kinds:.*{kind}" + ): + publisher._require_promotion_cohorts([ + item for item in required + ineligible + if item["kind"] != kind or not item["eligibility"]["decision_grade"] + ], anchor_series) + + def test_promotion_requires_exact_counts_and_routing_anchors(self) -> None: + 
dataset = _promoted_dataset() + counts = _cohort_counts(dataset) + with mock.patch.object( + publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", counts + ): + publisher._require_promotion_cohorts( + dataset["cohorts"], dataset["series"] + ) + routing = next( + item for item in dataset["cohorts"] if item["kind"] == "routing" + ) + eplb = next( + item for item in dataset["series"] + if item["series_id"] in routing["series_ids"] + and item["workload"]["eplb"] + ) + eplb["workload"]["eplb"] = False + with self.assertRaisesRegex(publisher.PublisherError, "exact uniform"): + publisher._require_promotion_cohorts( + dataset["cohorts"], dataset["series"] + ) + + dataset = _promoted_dataset() + routing = next(item for item in dataset["cohorts"] if item["kind"] == "routing") + zipf = next( + item for item in dataset["series"] + if item["series_id"] in routing["series_ids"] + and item["workload"]["routing"] == "zipf" + and not item["workload"]["eplb"] + ) + zipf["build"]["implementation_contract_sha256"] = "f" * 64 + with mock.patch.object( + publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", counts + ), self.assertRaisesRegex(publisher.PublisherError, "identical off-EPLB"): + publisher._require_promotion_cohorts(dataset["cohorts"], dataset["series"]) + + wrong_counts = {**counts, "library": counts["library"] + 1} + with mock.patch.object( + publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", wrong_counts + ), self.assertRaisesRegex(publisher.PublisherError, "exactly"): + publisher._require_promotion_cohorts( + dataset["cohorts"], dataset["series"] + ) + + def test_promotion_requires_every_derived_chip_cohort_to_be_stable(self) -> None: + dataset = _promoted_dataset() + chip = next(item for item in dataset["cohorts"] if item["kind"] == "chip") + self.assertEqual( + publisher._expected_chip_cohort_count(dataset["series"]), + sum(item["kind"] == "chip" for item in dataset["cohorts"]), + ) + with mock.patch.object( + publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", _cohort_counts(dataset) + ): + 
missing = [item for item in dataset["cohorts"] if item is not chip] + with self.assertRaisesRegex(publisher.PublisherError, "derived chip cohorts"): + publisher._require_promotion_cohorts(missing, dataset["series"]) + + chip["eligibility"]["decision_grade"] = False + with self.assertRaisesRegex(publisher.PublisherError, "derived chip cohorts"): + publisher._require_promotion_cohorts( + dataset["cohorts"], dataset["series"] + ) + + def test_promotion_rejects_more_than_three_bundles(self) -> None: + bundles = { + str(run_id): { + "id": str(run_id), "cases": [], + "manifest": { + "matrix": {"sha256": publisher.CANONICAL_FULL_V1_MATRIX_SHA256}, + "run": { + "run_id": str(run_id), "run_attempt": 1, + "qualification_index": min(run_id, 3), + }, + }, + } + for run_id in range(1, 5) + } + with mock.patch.object( + publisher, "load_bundle", side_effect=lambda _, bundle_id: bundles[bundle_id] + ), self.assertRaisesRegex(publisher.PublisherError, "qualification indices"): + publisher.build_dataset(object(), list(bundles), promote=True) + + dataset = _promoted_dataset() + dataset["source_bundle_ids"].append("d" * 64) + counts = _cohort_counts(dataset) + with mock.patch.object( + publisher, + "CANONICAL_FULL_V1_CASE_CATALOG_SHA256", + publisher._case_disposition_catalog_sha256(dataset["coverage"]), + ), mock.patch.object( + publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", counts + ), self.assertRaisesRegex(publisher.PublisherError, "complete coverage"): + publisher.validate_public_dataset(dataset) + + def test_standalone_promotion_binds_matrix_and_requested_dispositions(self) -> None: + dataset = _promoted_dataset() + fixture_catalog = publisher._case_disposition_catalog_sha256(dataset["coverage"]) + with self.assertRaisesRegex( + publisher.PublisherError, "canonical case/disposition catalog" + ): + publisher.validate_public_dataset(dataset) + with mock.patch.object( + publisher, "CANONICAL_FULL_V1_CASE_CATALOG_SHA256", fixture_catalog + ), mock.patch.object( + publisher, + 
"REQUIRED_PROMOTION_COHORT_COUNTS", + _cohort_counts(dataset), + ): + publisher.validate_public_dataset(dataset) + + diagnostic = copy.deepcopy(dataset) + item = diagnostic["series"][0] + item["status"] = "diagnostic" + item["eligibility"].update({ + "decision_grade": False, + "stable_p50": False, + "p50_max_min_ratio": 1.20, + "reasons": ["unstable-p50"], + }) + with mock.patch.object( + publisher, "CANONICAL_FULL_V1_CASE_CATALOG_SHA256", fixture_catalog + ), mock.patch.object( + publisher, + "REQUIRED_PROMOTION_COHORT_COUNTS", + _cohort_counts(dataset), + ), self.assertRaisesRegex( + publisher.PublisherError, "unstable or incomplete required series" + ): + publisher.validate_public_dataset(diagnostic) + + broken = copy.deepcopy(dataset) + broken["promotion"]["matrix_id"] = "d" * 64 + with self.assertRaisesRegex(publisher.PublisherError, "canonical full-v1 matrix"): + publisher.validate_public_dataset(broken) + + for original, replacement in (("runnable", "unsupported"), + ("unsupported", "runnable")): + with self.subTest(original=original): + broken = copy.deepcopy(dataset) + item = next( + coverage for coverage in broken["coverage"] + if coverage["disposition"] == original + ) + item["disposition"] = replacement + with mock.patch.object( + publisher, + "CANONICAL_FULL_V1_CASE_CATALOG_SHA256", + publisher._case_disposition_catalog_sha256(broken["coverage"]), + ), self.assertRaisesRegex( + publisher.PublisherError, + "requested dispositions" if original == "runnable" + else "coverage dimensions", + ): + publisher.validate_public_dataset(broken) + + def test_workflow_matrix_and_catalog_digests_do_not_drift(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + matrix_path = Path(temporary) / "matrix_full.json" + result = subprocess.run( + [ + sys.executable, str(ROOT / "sweep_matrix.py"), + "--suites", "all", "--max-cases", "128", + "--backends", "all", "--out", str(matrix_path), + ], + text=True, + capture_output=True, + ) + 
self.assertEqual(result.returncode, 0, result.stderr) + if publisher.capability.provisional_precision_targets(): + workflow = ( + ROOT.parent.parent / ".github" / "workflows" / "collectivex-sweep.yml" + ).read_text() + self.assertIn( + "V1 sweeps require every precision capability cell to be resolved", + workflow, + ) + return + self.assertEqual( + hashlib.sha256(matrix_path.read_bytes()).hexdigest(), + publisher.CANONICAL_FULL_V1_MATRIX_SHA256, + ) + matrix = contracts.strict_load(matrix_path) + coverage = [ + { + "case_id": item["case"]["case_id"], + "disposition": item["disposition"], + } + for item in matrix["requested_cases"] + ] + self.assertEqual( + publisher._case_disposition_catalog_sha256(coverage), + publisher.CANONICAL_FULL_V1_CASE_CATALOG_SHA256, + ) + self.assertEqual( + ( + len(matrix["include"]), len(coverage), + sum(item["disposition"] == "runnable" for item in coverage), + sum(item["disposition"] == "unsupported" for item in coverage), + sum( + len(item["case"]["ladder"].split()) + for item in matrix["requested_cases"] + ), + sum( + len(item["case"]["ladder"].split()) + for item in matrix["requested_cases"] + if item["disposition"] == "runnable" + ), + sum( + len(item["case"]["ladder"].split()) + for item in matrix["requested_cases"] + if item["disposition"] == "unsupported" + ), + ), + (58, 608, 364, 244, 1600, 940, 660), + ) + library: dict[tuple, set[str]] = {} + system: dict[tuple, set[str]] = {} + routing: dict[tuple, list[tuple[str, bool]]] = {} + for requested in matrix["requested_cases"]: + if requested["disposition"] != "runnable": + continue + case = requested["case"] + shape = tuple( + case[field] + for field in ( + "workload", "mode", "hidden", "topk", "experts", "ep", "phase" + ) + ) + route = (case["routing"], case["eplb"]) + if case["backend"] != "nccl-ep": + library.setdefault((requested["sku"], shape, route), set()).add( + case["backend"] + ) + else: + system.setdefault((shape, route), set()).add(requested["sku"]) + 
routing.setdefault( + (requested["sku"], case["backend"], shape), [] + ).append(route) + anchors = {("uniform", False), ("zipf", False), ("zipf", True)} + self.assertEqual( + { + "library": sum(len(variants) >= 2 for variants in library.values()), + "system": sum(len(variants) >= 2 for variants in system.values()), + "routing": sum( + len(variants) == 3 and set(variants) == anchors + for variants in routing.values() + ), + }, + publisher.REQUIRED_PROMOTION_COHORT_COUNTS, + ) + + def test_build_promotion_requires_canonical_full_matrix(self) -> None: + bundles = { + str(run_id): { + "id": str(run_id), "cases": [], + "manifest": { + "matrix": {"sha256": "d" * 64}, + "run": { + "run_id": str(run_id), "run_attempt": 1, + "qualification_index": run_id, + }, + }, + } + for run_id in range(1, 4) + } + with mock.patch.object( + publisher, "load_bundle", side_effect=lambda _, bundle_id: bundles[bundle_id] + ), self.assertRaisesRegex(publisher.PublisherError, "canonical full-v1 matrix"): + publisher.build_dataset(object(), list(bundles), promote=True) + + def test_rejection_is_quarantined_without_updating_dev_latest(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + matrix, artifact = _unsupported_delivery(root) + store = publisher.Store(root / "store") + sentinel = b"existing-promoted-pointer\n" + (store.channels / "dev-latest.json").write_bytes(sentinel) + (artifact / "unknown.json").write_text('{"format":"unknown"}') + with self.assertRaises(publisher.PublisherError): + publisher.ingest_command(_args(store.root, matrix, artifact)) + self.assertEqual((store.channels / "dev-latest.json").read_bytes(), sentinel) + self.assertFalse((store.channels / "latest-attempt.json").exists()) + self.assertEqual(list(store.datasets.iterdir()), []) + self.assertTrue(any(store.quarantine.iterdir())) + + def test_repeated_rejection_is_content_idempotent(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = 
Path(temporary).resolve() + matrix, artifact = _unsupported_delivery(root) + store = publisher.Store(root / "store") + (artifact / "unknown.json").write_text('{"format":"unknown"}') + with self.assertRaises(publisher.PublisherError): + publisher.ingest_command(_args(store.root, matrix, artifact)) + counts = tuple( + len(list(path.iterdir())) + for path in (store.incoming, store.quarantine, store.datasets, store.channels) + ) + with self.assertRaises(publisher.PublisherError): + publisher.ingest_command(_args(store.root, matrix, artifact)) + self.assertEqual( + tuple( + len(list(path.iterdir())) + for path in ( + store.incoming, store.quarantine, store.datasets, store.channels + ) + ), + counts, + ) + + def test_distinct_rejections_create_distinct_quarantine_objects(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + matrix, artifact = _unsupported_delivery(root) + store = publisher.Store(root / "store") + unknown = artifact / "unknown.json" + unknown.write_text('{"format":"unknown-one"}') + with self.assertRaises(publisher.PublisherError): + publisher.ingest_command(_args(store.root, matrix, artifact)) + first = {path.name for path in store.quarantine.iterdir()} + unknown.write_text('{"format":"unknown-two"}') + with self.assertRaises(publisher.PublisherError): + publisher.ingest_command(_args(store.root, matrix, artifact)) + second = {path.name for path in store.quarantine.iterdir()} + self.assertNotEqual(second, first) + self.assertEqual(len(second), 2) + self.assertEqual(list(store.datasets.iterdir()), []) + self.assertEqual(list(store.channels.iterdir()), []) + + def test_zip_traversal_is_rejected(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + archive = root / "bad.zip" + with zipfile.ZipFile(archive, "w") as handle: + handle.writestr("../escape.json", "{}") + with self.assertRaisesRegex(publisher.PublisherError, "escapes"): + 
publisher.extract_archive(archive, root / "out") + + def test_store_and_directory_archive_reject_symlinks(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + real = root / "real" + real.mkdir() + alias = root / "alias" + alias.symlink_to(real, target_is_directory=True) + with self.assertRaisesRegex(publisher.PublisherError, "symlinked parent"): + publisher.Store(alias / "store") + self.assertFalse((real / "store").exists()) + artifact = root / f"cxunsupported-{RUN['run_id']}-{RUN['run_attempt']}" + artifact.mkdir() + target = root / "target.json" + target.write_text("{}") + (artifact / "linked.json").symlink_to(target) + with self.assertRaisesRegex(publisher.PublisherError, "symlink"): + publisher._archive_download_directory(artifact, root / "artifact.zip") + + def test_offline_caller_metadata_is_validated_before_store_creation(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + matrix, artifact = _unsupported_delivery(root) + store_root = root / "store" + args = _args(store_root, matrix, artifact) + args.run_id = "0" + with self.assertRaisesRegex(publisher.PublisherError, "run-id"): + publisher.ingest_command(args) + self.assertFalse(store_root.exists()) + + promote = types.SimpleNamespace( + store_root=str(store_root), bundle=["not-a-digest"] + ) + with self.assertRaisesRegex(publisher.PublisherError, "bundle IDs"): + publisher.promote_command(promote) + self.assertFalse(store_root.exists()) + with self.assertRaisesRegex(publisher.PublisherError, "absolute path"): + publisher._store_from_args(types.SimpleNamespace(store_root="relative-store")) + + def test_store_rejects_group_or_world_writable_root(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() / "unsafe-store" + root.mkdir() + root.chmod(0o772) + with self.assertRaisesRegex(publisher.PublisherError, "group/world writable"): + publisher.Store(root) + + def 
test_retry_ordinals_must_be_contiguous_from_one(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + matrix, artifact = _unsupported_delivery(root, (1, 3)) + with self.assertRaisesRegex(publisher.PublisherError, "contiguous ordinals"): + publisher.ingest_command(_args(root / "store", matrix, artifact)) + + def test_delivery_rejects_extra_archive_and_non_native_member(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + matrix, artifact = _unsupported_delivery(root) + extra = root / f"cxshard-extra-{RUN['run_id']}-{RUN['run_attempt']}" + extra.mkdir() + (extra / "extra.json").write_text("{}") + args = _args(root / "store-extra", matrix, artifact) + args.artifact.append(str(extra)) + with self.assertRaisesRegex(publisher.PublisherError, "archive set"): + publisher.ingest_command(args) + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + matrix, artifact = _unsupported_delivery(root) + (artifact / "notes.txt").write_text("not native evidence") + with self.assertRaisesRegex(publisher.PublisherError, "unconsumed"): + publisher.ingest_command(_args(root / "store-member", matrix, artifact)) + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + matrix, artifact = _unsupported_delivery(root) + path = next(artifact.glob("*.json")) + terminal = json.loads(path.read_text()) + terminal["outcome"]["reason"] = next( + reason for reason in contracts.CAPABILITY_FAILURE_REASONS + if reason != terminal["outcome"]["reason"] + ) + path.write_text(json.dumps(terminal)) + with self.assertRaisesRegex(publisher.PublisherError, "reason differs"): + publisher.ingest_command(_args(root / "store-reason", matrix, artifact)) + + def test_rates_invert_latency_and_global_tokens_use_ep_size(self) -> None: + dataset = _dataset() + publisher.validate_public_dataset(dataset) + rates = 
dataset["series"][0]["points"][0]["components"]["roundtrip"][ + "activation_data_rate_gbps_at_latency_percentile" + ] + self.assertGreater(rates["p50"], rates["p99"]) + broken = copy.deepcopy(dataset) + broken["series"][0]["points"][0]["global_tokens"] = 128 + with self.assertRaisesRegex(publisher.PublisherError, "EP size"): + publisher.validate_public_dataset(broken) + broken = copy.deepcopy(dataset) + broken["series"][0]["points"][0]["roundtrip_token_rate_at_latency_percentile"]["p99"] *= 2 + with self.assertRaisesRegex(publisher.PublisherError, "token throughput"): + publisher.validate_public_dataset(broken) + broken = copy.deepcopy(dataset) + broken["attempts"][0]["evidence"][0]["point_id"] = identity.point_id( + series=broken["series"][0]["series_id"], tokens_per_rank=16 + ) + with self.assertRaisesRegex(publisher.PublisherError, "point evidence"): + publisher.validate_public_dataset(broken) + broken = copy.deepcopy(dataset) + broken["attempts"][0]["series_id"] = None + with self.assertRaisesRegex(publisher.PublisherError, "present exactly for selected success"): + publisher.validate_public_dataset(broken) + broken = copy.deepcopy(dataset) + component = broken["series"][0]["points"][0]["components"]["roundtrip"] + component["activation_data_rate_gbps_at_latency_percentile"] = None + with self.assertRaisesRegex(publisher.PublisherError, "measured data rates are missing"): + publisher.validate_public_dataset(broken) + + for mutate in ( + lambda item: item.update({"model": "different-model"}), + lambda item: item["workload"].update({"hidden": 4096}), + lambda item: item["workload"].update({"top_k": 4}), + lambda item: item["workload"].update({"experts": 128}), + ): + broken = copy.deepcopy(dataset) + mutate(broken["series"][0]) + with self.assertRaisesRegex(publisher.PublisherError, "frozen v1"): + publisher.validate_public_dataset(broken) + + broken = copy.deepcopy(dataset) + broken["series"][0]["eplb"]["mapping_sha256"] = "f" * 64 + with 
self.assertRaisesRegex(publisher.PublisherError, "claims a plan"): + publisher.validate_public_dataset(broken) + + broken = copy.deepcopy(dataset) + broken["series"][0]["backend"].update({ + "id": "nccl-ep", "label": publisher.BACKEND_LABELS["nccl-ep"], + "role": "reference", "generation": "rccl", + }) + broken["coverage"][0]["backend"] = "nccl-ep" + with self.assertRaisesRegex(publisher.PublisherError, "configuration"): + publisher.validate_public_dataset(broken) + + def test_public_coverage_binds_exact_topology_and_case_identity(self) -> None: + dataset = _promoted_dataset() + dataset["promotion"]["status"] = "diagnostic" + self.assertEqual( + {item["disposition"] for item in dataset["coverage"]}, + {"runnable", "unsupported"}, + ) + for item in dataset["coverage"]: + self.assertEqual( + tuple(item["topology"]), publisher.COVERAGE_TOPOLOGY_FIELDS + ) + publisher.validate_public_dataset(dataset) + + broken = copy.deepcopy(dataset) + unsupported = next( + item for item in broken["coverage"] + if item["disposition"] == "unsupported" + ) + unsupported["topology"]["nodes"] = 2 + with self.assertRaisesRegex(publisher.PublisherError, "capability registry"): + publisher.validate_public_dataset(broken) + + broken = copy.deepcopy(dataset) + unsupported = next( + item for item in broken["coverage"] + if item["disposition"] == "unsupported" + ) + unsupported["sku"] = "mi325x" + topology = publisher.capability.topology_for("mi325x", 8) + self.assertIsNotNone(topology) + unsupported["topology"] = publisher._coverage_topology({ + "ep_size": 8, **topology, + }) + with self.assertRaisesRegex(publisher.PublisherError, "case identity"): + publisher.validate_public_dataset(broken) + + def test_cohort_contract_and_labels_name_mode_explicitly(self) -> None: + dataset = _promoted_dataset() + dataset["promotion"]["status"] = "diagnostic" + publisher.validate_public_dataset(dataset) + for cohort in dataset["cohorts"]: + self.assertIn("mode", cohort["controlled_factors"]) + 
self.assertIn("/ normal /", cohort["label"]) + + broken = copy.deepcopy(dataset) + cohort = broken["cohorts"][0] + cohort["controlled_factors"].remove("mode") + cohort["cohort_id"] = publisher._derived_id("cxcohort-v1-", { + "kind": cohort["kind"], "series_ids": cohort["series_ids"], + "controlled_factors": cohort["controlled_factors"], + "varying_factors": cohort["varying_factors"], + }) + broken["cohorts"].sort(key=lambda item: item["cohort_id"]) + with self.assertRaisesRegex(publisher.PublisherError, "cohort factors"): + publisher.validate_public_dataset(broken) + + def test_routing_and_eplb_facts_must_match_across_repeats(self) -> None: + raw, _ = _native_fixture() + descriptor = publisher._eplb_descriptor(raw) + facts = publisher._routing_facts(raw["measurement"]["rows"][0]) + self.assertEqual( + publisher._exact_repeat_value([descriptor, copy.deepcopy(descriptor)], "EPLB"), + descriptor, + ) + self.assertEqual( + publisher._exact_repeat_value([facts, copy.deepcopy(facts)], "routing"), + facts, + ) + changed = copy.deepcopy(facts) + changed["hotspot_ratio"] += 0.1 + with self.assertRaisesRegex(publisher.PublisherError, "routing differs"): + publisher._exact_repeat_value([facts, changed], "routing") + + dataset = _promoted_dataset() + dataset["promotion"]["status"] = "diagnostic" + eplb = next(item for item in dataset["series"] if item["eplb"]["enabled"]) + eplb["points"][0]["routing"]["empty_expert_count"] = 280 + publisher.validate_public_dataset(dataset) + eplb["points"][0]["routing"]["empty_expert_count"] = 288 + with self.assertRaisesRegex(publisher.PublisherError, "routing/load facts"): + publisher.validate_public_dataset(dataset) + + for field, value in ( + ("mapping_sha256", "0" * 64), + ("redundant_experts", 31), + ("replicated_experts", 1), + ("max_replicas", 2), + ("replicated_experts", 257), + ("max_replicas", 999), + ("imbalance_after", 0.4), + ("planner", "different-planner"), + ("reference_tokens_per_rank", 1024), + ): + broken = 
_promoted_dataset() + broken["promotion"]["status"] = "diagnostic" + descriptor = next( + item["eplb"] for item in broken["series"] if item["eplb"]["enabled"] + ) + descriptor[field] = value + with self.subTest(eplb_field=field), self.assertRaisesRegex( + publisher.PublisherError, "EPLB descriptor" + ): + publisher.validate_public_dataset(broken) + + def test_publisher_owns_stable_rankings_and_recommendations(self) -> None: + fast, fast_internal = _series("fast", "deepep", decision_grade=True) + slow, slow_internal = _series("slow", "uccl", decision_grade=True) + reference, reference_internal = _series("reference", "nccl-ep", decision_grade=True) + reference_peer, reference_peer_internal = _series( + "reference-peer", "nccl-ep", decision_grade=True + ) + reference["backend"]["role"] = "reference" + reference_peer["backend"]["role"] = "reference" + reference_peer["system"].update({"sku": "h200-dgxc", "label": "NVIDIA H200"}) + cohorts, rankings, recommendations, _ = publisher.build_decisions( + [fast, slow, reference, reference_peer], { + fast["series_id"]: fast_internal, + slow["series_id"]: slow_internal, + reference["series_id"]: reference_internal, + reference_peer["series_id"]: reference_peer_internal, + } + ) + library = next(item for item in cohorts if item["kind"] == "library") + ranking = next(item for item in rankings if item["cohort_id"] == library["cohort_id"] + and item["metric"]["measure"] == "latency_us" + and item["metric"]["statistic"] == "p99") + self.assertTrue(library["eligibility"]["decision_grade"]) + self.assertEqual(ranking["entries"][0]["series_id"], fast["series_id"]) + self.assertTrue(any(item["series_id"] == fast["series_id"] for item in recommendations)) + self.assertFalse(any( + entry["series_id"] == reference["series_id"] + for item in rankings if item["cohort_id"] == library["cohort_id"] + for entry in item["entries"] + )) + self.assertTrue(any( + item["kind"] == "system" and reference["series_id"] in item["series_ids"] + for item in 
cohorts + )) + + def test_routing_evidence_is_experimental_and_not_a_configuration_recommendation(self) -> None: + dataset = _promoted_dataset() + routing = next(item for item in dataset["cohorts"] if item["kind"] == "routing") + members = [ + item for item in dataset["series"] + if item["series_id"] in routing["series_ids"] + ] + self.assertEqual( + {(item["workload"]["routing"], item["workload"]["eplb"]) for item in members}, + {("uniform", False), ("zipf", False), ("zipf", True)}, + ) + self.assertIn("implementation-static-build", routing["controlled_factors"]) + self.assertIn("resource", routing["controlled_factors"]) + self.assertEqual( + routing["varying_factors"], + ["workload.routing", "workload.eplb", "implementation-config"], + ) + self.assertEqual( + len({item["build"]["routing_control_sha256"] for item in members}), + 1, + ) + self.assertGreater( + len({item["build"]["implementation_contract_sha256"] for item in members}), + 1, + ) + self.assertEqual(len({json.dumps(item["resource"], sort_keys=True) for item in members}), 1) + self.assertEqual(routing["publication_tier"], "comparable-experimental") + self.assertTrue(any( + item["cohort_id"] == routing["cohort_id"] for item in dataset["rankings"] + )) + self.assertFalse(any( + item["cohort_id"] == routing["cohort_id"] for item in dataset["recommendations"] + )) + self.assertTrue(all( + item["publication_tier"] == "official" + for item in dataset["recommendations"] + )) + self.assertFalse(any( + dataset_cohort["publication_tier"] == "comparable-experimental" + and item["cohort_id"] == dataset_cohort["cohort_id"] + for item in dataset["recommendations"] + for dataset_cohort in dataset["cohorts"] + )) + self.assertTrue(all( + item["publication_tier"] == "comparable-experimental" + for item in dataset["sensitivities"] + if item["cohort_id"] == routing["cohort_id"] + )) + + def test_routing_implementation_mismatch_blocks_all_decisions(self) -> None: + dataset = _promoted_dataset() + published = next(item for 
item in dataset["cohorts"] if item["kind"] == "routing") + members = [ + item for item in dataset["series"] + if item["series_id"] in published["series_ids"] + ] + zipf = next( + item for item in members + if item["workload"]["routing"] == "zipf" and not item["workload"]["eplb"] + ) + zipf["build"]["implementation_contract_sha256"] = "f" * 64 + internals = {} + for member in members: + point = member["points"][0] + roundtrip = point["components"]["roundtrip"] + metrics = { + "latency_us": { + name: roundtrip["latency_us"][name] for name in ("p50", "p99") + }, + **{ + field: { + name: roundtrip[field][name] for name in ("p50", "p99") + } + for field in ( + "activation_data_rate_gbps_at_latency_percentile", + "total_logical_data_rate_gbps_at_latency_percentile", + ) + }, + } + internals[member["series_id"]] = { + "run_metrics": { + str(run): {point["tokens_per_rank"]: metrics} + for run in range(3) + } + } + cohorts, rankings, recommendations, sensitivities = publisher.build_decisions( + members, internals + ) + routing = next(item for item in cohorts if item["kind"] == "routing") + self.assertFalse(routing["eligibility"]["decision_grade"]) + self.assertIn( + "implementation-config-mismatch", routing["eligibility"]["reasons"] + ) + self.assertEqual((rankings, recommendations, sensitivities), ([], [], [])) + + def test_promoted_series_fields_are_bound_to_case_and_series_identities(self) -> None: + dataset = _promoted_dataset() + changed = copy.deepcopy(dataset) + series = next( + item for item in changed["series"] + if item["system"]["sku"] == "h100-dgxc" + ) + series["system"].update({ + "sku": "h200-dgxc", "label": "NVIDIA H200", + "topology_class": "h200-nvlink-island", + }) + for case_id in series["case_ids"]: + coverage = next( + item for item in changed["coverage"] if item["case_id"] == case_id + ) + coverage["sku"] = "h200-dgxc" + coverage["topology"] = publisher._coverage_topology(series["system"]) + with self.assertRaisesRegex(publisher.PublisherError, 
"configuration|case identity"): + publisher.validate_public_dataset(changed) + + for field, value in ( + ("source_sha", "b" * 40), + ("image_digest", "sha256:" + "4" * 64), + ("squash_sha256", "5" * 64), + ("runtime_fingerprint_sha256", "6" * 64), + ("implementation_contract_sha256", "7" * 64), + ("public_config_sha256", "9" * 64), + ("routing_control_sha256", "8" * 64), + ): + changed = copy.deepcopy(dataset) + changed["series"][0]["build"][field] = value + with self.subTest(build_field=field), self.assertRaisesRegex( + publisher.PublisherError, "commit" + ): + publisher.validate_public_dataset(changed) + changed = copy.deepcopy(dataset) + changed["series"][0]["workload"]["workload_id"] = identity.workload_id( + {"changed": True} + ) + with self.assertRaisesRegex(publisher.PublisherError, "committed factors"): + publisher.validate_public_dataset(changed) + + for mutate, message in ( + (lambda item: item["backend"].update({ + "generation": "fabricated", "version": "fabricated-999", + }), "configuration"), + (lambda item: item["resource"].update({ + "profile": "profile-fabricated", "configured_units": 99, + }), "configuration"), + (lambda item: item["system"].update({"label": "Fabricated H100"}), "projection|commit"), + ): + changed = copy.deepcopy(dataset) + mutate(changed["series"][0]) + with self.assertRaisesRegex(publisher.PublisherError, message): + publisher.validate_public_dataset(changed) + + diagnostic = _dataset() + diagnostic["series"][0]["build"]["source_sha"] = "b" * 40 + with self.assertRaisesRegex(publisher.PublisherError, "committed factors"): + publisher.validate_public_dataset(diagnostic) + + def test_all_decision_metrics_require_stable_repeat_ordering(self) -> None: + fast, fast_internal = _series("ordering-fast", "deepep", decision_grade=True) + slow, slow_internal = _series("ordering-slow", "uccl", decision_grade=True) + internals = { + fast["series_id"]: fast_internal, + slow["series_id"]: slow_internal, + } + + cohorts, rankings, 
recommendations, _ = publisher.build_decisions( + [fast, slow], internals + ) + library = next(item for item in cohorts if item["kind"] == "library") + self.assertTrue(library["eligibility"]["decision_grade"]) + self.assertEqual( + len([item for item in rankings if item["cohort_id"] == library["cohort_id"]]), + 6, + ) + self.assertEqual( + len([ + item for item in recommendations + if item["cohort_id"] == library["cohort_id"] + ]), + 1, + ) + + for statistic in ("p50", "p99"): + for field in ( + "activation_data_rate_gbps_at_latency_percentile", + "total_logical_data_rate_gbps_at_latency_percentile", + ): + slow_internal["run_metrics"]["1"][8][field][statistic] = ( + fast_internal["run_metrics"]["1"][8][field][statistic] * 2 + ) + cohorts, rankings, recommendations, _ = publisher.build_decisions( + [fast, slow], internals + ) + library = next(item for item in cohorts if item["kind"] == "library") + self.assertFalse(library["eligibility"]["decision_grade"]) + self.assertIn("unstable-ordering", library["eligibility"]["reasons"]) + self.assertFalse(any( + item["cohort_id"] == library["cohort_id"] for item in rankings + )) + self.assertFalse(any( + item["cohort_id"] == library["cohort_id"] for item in recommendations + )) + + def test_p99_bootstrap_is_deterministic_and_dataset_bound(self) -> None: + fast, fast_internal = _series("bootstrap-fast", "deepep", decision_grade=True) + slow, slow_internal = _series("bootstrap-slow", "uccl", decision_grade=True) + internals = { + fast["series_id"]: fast_internal, + slow["series_id"]: slow_internal, + } + + first = publisher._hierarchical_p99_ratio( + fast["series_id"], slow["series_id"], 8, internals, "a" * 64 + ) + repeated = publisher._hierarchical_p99_ratio( + fast["series_id"], slow["series_id"], 8, internals, "a" * 64 + ) + rebound = publisher._hierarchical_p99_ratio( + fast["series_id"], slow["series_id"], 8, internals, "b" * 64 + ) + + self.assertEqual(first, repeated) + self.assertEqual(first["resamples"], 10_000) + 
self.assertEqual(first["confidence"], 0.95) + self.assertEqual(first["equivalence_band"], 0.05) + self.assertTrue(first["all_runs_agree"]) + self.assertTrue(first["baseline_wins"]) + self.assertGreater(first["ci95"][0], 1.05) + self.assertNotEqual(first["seed_sha256"], rebound["seed_sha256"]) + + def test_p99_equivalence_band_emits_competition_tie_without_recommendation(self) -> None: + fast, fast_internal = _series("tie-fast", "deepep", decision_grade=True) + near, near_internal = _series("tie-near", "uccl", decision_grade=True) + fast_point = fast["points"][0] + near_point = near["points"][0] + fast_component = fast_point["components"]["roundtrip"] + near_component = near_point["components"]["roundtrip"] + for statistic, latency in fast_component["latency_us"].items(): + near_latency = latency * 1.03 + near_component["latency_us"][statistic] = near_latency + for field, byte_field in ( + ("activation_data_rate_gbps_at_latency_percentile", "activation_data_bytes"), + ("total_logical_data_rate_gbps_at_latency_percentile", "total_logical_bytes"), + ): + near_component[field][statistic] = ( + near_component["byte_provenance"][byte_field] + / (near_latency * 1000.0) + ) + near_point["roundtrip_token_rate_at_latency_percentile"][statistic] = ( + near_point["global_tokens"] / (near_latency * 1e-6) + ) + for run_id, fast_metrics in fast_internal["run_metrics"].items(): + for statistic in ("p50", "p99"): + latency = fast_metrics[8]["latency_us"][statistic] * 1.03 + near_internal["run_metrics"][run_id][8]["latency_us"][statistic] = latency + for field, byte_field in ( + ("activation_data_rate_gbps_at_latency_percentile", "activation_data_bytes"), + ("total_logical_data_rate_gbps_at_latency_percentile", "total_logical_bytes"), + ): + near_internal["run_metrics"][run_id][8][field][statistic] = ( + near_component["byte_provenance"][byte_field] + / (latency * 1000.0) + ) + near_internal["trial_blocks"][run_id][8]["roundtrip"] = tuple( + tuple(sample * 1.03 for sample in block) 
+ for block in fast_internal["trial_blocks"][run_id][8]["roundtrip"] + ) + internals = { + fast["series_id"]: fast_internal, + near["series_id"]: near_internal, + } + + cohorts, rankings, recommendations, _ = publisher.build_decisions( + [fast, near], internals, dataset_binding="c" * 64 + ) + library = next(item for item in cohorts if item["kind"] == "library") + ranking = next( + item for item in rankings + if item["cohort_id"] == library["cohort_id"] + and item["metric"]["measure"] == "latency_us" + and item["metric"]["statistic"] == "p99" + ) + self.assertEqual([entry["rank"] for entry in ranking["entries"]], [1, 1]) + self.assertFalse(any( + item["cohort_id"] == library["cohort_id"] + for item in recommendations + )) + self.assertNotIn( + "trial_blocks", json.dumps({"series": [fast, near], "rankings": rankings}) + ) + + def test_p99_winner_requires_every_run_to_agree(self) -> None: + fast, fast_internal = _series("run-fast", "deepep", decision_grade=True) + slow, slow_internal = _series("run-slow", "uccl", decision_grade=True) + ratios = {"0": 0.98, "1": 1.20, "2": 1.20} + for run_id, ratio in ratios.items(): + slow_internal["trial_blocks"][run_id][8]["roundtrip"] = tuple( + tuple(sample * ratio for sample in block) + for block in fast_internal["trial_blocks"][run_id][8]["roundtrip"] + ) + result = publisher._hierarchical_p99_ratio( + fast["series_id"], slow["series_id"], 8, + { + fast["series_id"]: fast_internal, + slow["series_id"]: slow_internal, + }, + "d" * 64, + ) + self.assertFalse(result["all_runs_agree"]) + self.assertFalse(result["baseline_wins"]) + self.assertTrue(result["tie"]) + + def test_precision_cohorts_isolate_axes_and_never_recommend(self) -> None: + profiles = ( + identity.V1_CONTROL_PRECISION_PROFILE, + "d-fp8-e4m3fn-b128-f32-fused.c-bf16", + "d-bf16.c-logfmt10-dynamic64", + "d-fp8-e4m3fn-b128-f32-fused.c-logfmt10-dynamic64", + ) + series = [] + internals = {} + for index, profile_id in enumerate(profiles): + item, internal = _series( + 
f"precision-{index}", "deepep", decision_grade=True + ) + precision = identity.precision_profile(profile_id) + item["suite"] = ( + "ep-low-latency-v1" + if index == 0 + else "ep-precision-low-latency-v1" + ) + item["mode"] = "low-latency" + item["publication_tier"] = ( + "official" if index == 0 else "comparable-experimental" + ) + item["workload"].update({ + "precision_profile": profile_id, + "dispatch_precision": precision["dispatch"], + "combine_precision": precision["combine"], + }) + item["series_id"] = identity.series_id({"precision-fixture": profile_id}) + series.append(item) + internals[item["series_id"]] = internal + + cohorts, rankings, recommendations, sensitivities = publisher.build_decisions( + series, internals, dataset_binding="e" * 64 + ) + precision_cohorts = [ + cohort for cohort in cohorts + if cohort["kind"] in publisher.PRECISION_COHORT_KINDS + ] + self.assertEqual( + {kind: sum(cohort["kind"] == kind for cohort in precision_cohorts) + for kind in publisher.PRECISION_COHORT_KINDS}, + {"dispatch-precision": 2, "combine-precision": 2, "precision-pair": 1}, + ) + self.assertTrue(all( + cohort["publication_tier"] == "comparable-experimental" + and cohort["eligibility"]["decision_grade"] + for cohort in precision_cohorts + )) + self.assertEqual(len(rankings), 30) + self.assertEqual(len(sensitivities), 24) + self.assertEqual(recommendations, []) + pair = next( + cohort for cohort in precision_cohorts + if cohort["kind"] == "precision-pair" + ) + self.assertEqual( + pair["varying_factors"], + [ + "dispatch-precision", "combine-precision", "precision-profile", + "resource", + ], + ) + self.assertNotIn("resource", pair["controlled_factors"]) + self.assertFalse(any( + sensitivity["cohort_id"] == pair["cohort_id"] + for sensitivity in sensitivities + )) + + def test_private_trial_copy_is_component_extensible(self) -> None: + blocks = [[float(trial + iteration + 1) for iteration in range(8)] + for trial in range(64)] + copied = 
publisher._private_trial_components({ + "points": [{ + "tokens_per_rank": 8, + "components": { + "roundtrip": {"availability": "measured", "trials": blocks}, + "stage": {"availability": "measured", "trials": blocks}, + "combine": {"availability": "not-applicable", "trials": None}, + }, + }], + }) + self.assertEqual(set(copied[8]), {"roundtrip", "stage", "combine"}) + self.assertEqual(len(copied[8]["stage"]), 64) + self.assertIsNone(copied[8]["combine"]) + + def test_missing_private_trials_blocks_decision_grade(self) -> None: + fast, fast_internal = _series("trials-fast", "deepep", decision_grade=True) + slow, slow_internal = _series("trials-slow", "uccl", decision_grade=True) + del slow_internal["trial_blocks"] + cohorts, rankings, recommendations, sensitivities = publisher.build_decisions( + [fast, slow], { + fast["series_id"]: fast_internal, + slow["series_id"]: slow_internal, + } + ) + library = next(item for item in cohorts if item["kind"] == "library") + self.assertFalse(library["eligibility"]["decision_grade"]) + self.assertIn("missing-trial-blocks", library["eligibility"]["reasons"]) + self.assertEqual((rankings, recommendations, sensitivities), ([], [], [])) + + def test_extra_eligibility_reason_blocks_decision_grade(self) -> None: + allocations = [identity.allocation_id({"run": run}) for run in range(3)] + eligibility = publisher._eligibility_record( + allocations, complete=True, correct=True, measured=True, + stable_ordering=True, p50_ratio=1.01, p99_ratio=1.02, + extra_reasons=["incomplete-provenance"], + ) + self.assertFalse(eligibility["decision_grade"]) + self.assertEqual(eligibility["reasons"], ["incomplete-provenance"]) + self.assertIs(publisher._eligibility(eligibility, "fixture"), eligibility) + broken = {**eligibility, "decision_grade": True} + with self.assertRaisesRegex(publisher.PublisherError, "promotion gates"): + publisher._eligibility(broken, "fixture") + + def test_schema_is_strict_and_channel_target_must_be_complete(self) -> None: + 
dataset = _dataset() + dataset["unexpected"] = True + with self.assertRaises(publisher.PublisherError): + publisher.validate_public_dataset(dataset) + with mock.patch.object(publisher, "MAX_PUBLIC_DATASET_BYTES", 1), self.assertRaisesRegex( + publisher.PublisherError, "serving size limit" + ): + publisher.validate_public_dataset(_dataset()) + with tempfile.TemporaryDirectory() as temporary: + store = publisher.Store(Path(temporary).resolve()) + dataset = _promoted_dataset() + with mock.patch.object( + publisher, "CANONICAL_FULL_V1_CASE_CATALOG_SHA256", + publisher._case_disposition_catalog_sha256(dataset["coverage"]), + ), mock.patch.object( + publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", _cohort_counts(dataset), + ): + digest, size = store.install_dataset(dataset) + store.update_channel("dev-latest", digest, size, dataset["generated_at"]) + self.assertEqual( + store.verify_channel("dev-latest")["dataset"]["sha256"], digest + ) + channel_path = store.channels / "dev-latest.json" + pointer = publisher.strict_load(channel_path) + pointer["generated_at"] = "2099-01-01T00:00:00Z" + channel_path.write_bytes(contracts.canonical_json_bytes(pointer)) + with self.assertRaisesRegex(publisher.PublisherError, "metadata differs"): + store.verify_channel("dev-latest") + store.update_channel("dev-latest", digest, size, dataset["generated_at"]) + with self.assertRaisesRegex(publisher.PublisherError, "metadata differs"): + store.update_channel( + "dev-latest", digest, size + 1, dataset["generated_at"] + ) + with self.assertRaisesRegex(publisher.PublisherError, "metadata differs"): + store.update_channel( + "dev-latest", digest, size, "2026-07-05T00:00:00Z" + ) + os.chmod(channel_path, 0o666) + with self.assertRaisesRegex(publisher.PublisherError, "regular 644"): + store.verify_channel("dev-latest") + os.chmod(channel_path, 0o644) + dataset_dir = store.datasets / digest + os.chmod(dataset_dir, 0o755) + with self.assertRaisesRegex(publisher.PublisherError, "mode differs"): + 
store.verify_channel("dev-latest") + os.chmod(dataset_dir, 0o555) + os.chmod(dataset_dir / "dataset.json", 0o644) + with self.assertRaisesRegex(publisher.PublisherError, "mode differs"): + store.verify_channel("dev-latest") + os.chmod(dataset_dir / "dataset.json", 0o444) + os.chmod(dataset_dir, 0o755) + (dataset_dir / "COMPLETE").unlink() + os.chmod(dataset_dir, 0o555) + with self.assertRaisesRegex(publisher.PublisherError, "incomplete"): + store.verify_channel("dev-latest") + + def test_store_modes_do_not_depend_on_process_umask(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + previous = os.umask(0o077) + try: + store = publisher.Store(Path(temporary).resolve()) + dataset = _promoted_dataset() + with mock.patch.object( + publisher, "CANONICAL_FULL_V1_CASE_CATALOG_SHA256", + publisher._case_disposition_catalog_sha256(dataset["coverage"]), + ), mock.patch.object( + publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", + _cohort_counts(dataset), + ): + digest, size = store.install_dataset(dataset) + store.update_channel( + "dev-latest", digest, size, dataset["generated_at"] + ) + with store.locked(): + pass + finally: + os.umask(previous) + self.assertEqual( + store.root.stat().st_mode & 0o777, + 0o750, + ) + self.assertEqual( + (store.channels / "dev-latest.json").stat().st_mode & 0o777, + 0o644, + ) + self.assertEqual( + (store.datasets / digest / "dataset.json").stat().st_mode & 0o777, + 0o444, + ) + self.assertEqual( + (store.locks / "publisher.lock").stat().st_mode & 0o777, + 0o600, + ) + + def test_verify_requires_a_promoted_dev_latest_channel(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + args = types.SimpleNamespace( + store_root=str(root / "store"), channel=None, bundle=[] + ) + with self.assertRaises(publisher.PublisherError): + publisher.verify_command(args) + store = publisher.Store(args.store_root) + dataset = _promoted_dataset() + with mock.patch.object( + publisher, 
"CANONICAL_FULL_V1_CASE_CATALOG_SHA256", + publisher._case_disposition_catalog_sha256(dataset["coverage"]), + ), mock.patch.object( + publisher, "REQUIRED_PROMOTION_COHORT_COUNTS", _cohort_counts(dataset), + ): + digest, size = store.install_dataset(dataset) + store.update_channel( + "dev-latest", digest, size, dataset["generated_at"] + ) + result = publisher.verify_command(args) + self.assertEqual(set(result["channels"]), {"dev-latest"}) + explicit = types.SimpleNamespace( + store_root=args.store_root, channel=["dev-latest"], bundle=[] + ) + self.assertEqual( + publisher.verify_command(explicit)["channels"], result["channels"] + ) + unknown = types.SimpleNamespace( + store_root=args.store_root, channel=["latest-attempt"], bundle=[] + ) + with self.assertRaisesRegex(publisher.PublisherError, "unknown channel"): + publisher.verify_command(unknown) + + +if __name__ == "__main__": + unittest.main() diff --git a/experimental/CollectiveX/tests/test_qualification_planning.py b/experimental/CollectiveX/tests/test_qualification_planning.py new file mode 100644 index 000000000..4718818ed --- /dev/null +++ b/experimental/CollectiveX/tests/test_qualification_planning.py @@ -0,0 +1,247 @@ +#!/usr/bin/env python3 +"""CPU-only tests for qualification-specific shard execution planning.""" +from __future__ import annotations + +import copy +import hashlib +import json +import os +from pathlib import Path +import sys +import tempfile +import unittest +from unittest import mock + + +HERE = Path(__file__).resolve().parent +ROOT = HERE.parent +sys.path[:0] = [str(ROOT), str(HERE)] + +import identity # noqa: E402 +import sweep_matrix # noqa: E402 + + +def _canonical(value: object) -> bytes: + return json.dumps( + value, ensure_ascii=True, sort_keys=True, separators=(",", ":") + ).encode() + + +class QualificationPlanningTest(unittest.TestCase): + @classmethod + def setUpClass(cls) -> None: + cls.matrix = sweep_matrix.validate_matrix_document( + sweep_matrix.resolve_matrix( + 
backend="deepep", only_sku="h100-dgxc", max_cases=128 + ) + ) + cls.shard = next(item for item in cls.matrix["include"] if item["n"] >= 3) + + def test_matrix_semantics_do_not_depend_on_qualification_index(self) -> None: + expected = _canonical(self.matrix) + for qualification_index in (1, 2, 3): + with mock.patch.dict( + os.environ, + {"CX_QUALIFICATION_INDEX": str(qualification_index)}, + ): + observed = sweep_matrix.validate_matrix_document( + sweep_matrix.resolve_matrix( + backend="deepep", only_sku="h100-dgxc", max_cases=128 + ) + ) + self.assertEqual(_canonical(observed), expected) + + def test_extract_shard_has_deterministic_distinct_exact_plans(self) -> None: + matrix_bytes = _canonical(self.matrix) + b"\n" + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + matrix_path = root / "matrix.json" + matrix_path.write_bytes(matrix_bytes) + original_digest = hashlib.sha256(matrix_path.read_bytes()).hexdigest() + controls = [] + for qualification_index in (1, 2, 3): + first = sweep_matrix.extract_shard( + matrix_path, + self.shard["id"], + root / f"q{qualification_index}-first.json", + sku=self.shard["sku"], + backend=self.shard["backend"], + nodes=self.shard["nodes"], + qualification_index=qualification_index, + ) + repeated = sweep_matrix.extract_shard( + matrix_path, + self.shard["id"], + root / f"q{qualification_index}-repeated.json", + sku=self.shard["sku"], + backend=self.shard["backend"], + nodes=self.shard["nodes"], + qualification_index=qualification_index, + ) + self.assertEqual(first, repeated) + self.assertEqual(first["qualification_index"], qualification_index) + self.assertEqual( + {case["case_id"] for case in first["cases"]}, + set(self.shard["case_ids"]), + ) + plan = [ + [ + case["case_id"], + case.get( + "precision_profile", + identity.V1_CONTROL_PRECISION_PROFILE, + ), + ] + for case in first["cases"] + ] + self.assertEqual( + first["execution_plan_sha256"], + hashlib.sha256( + json.dumps(plan, separators=(",", 
":")).encode() + ).hexdigest(), + ) + sweep_matrix.validate_shard_control( + first, + sku=self.shard["sku"], + backend=self.shard["backend"], + nodes=self.shard["nodes"], + qualification_index=qualification_index, + ) + controls.append(first) + self.assertEqual( + len({control["execution_plan_sha256"] for control in controls}), 3 + ) + self.assertEqual( + len({tuple(case["case_id"] for case in control["cases"]) + for control in controls}), + 3, + ) + self.assertEqual( + hashlib.sha256(matrix_path.read_bytes()).hexdigest(), original_digest + ) + + def test_precision_profiles_and_cases_rotate_across_repeats(self) -> None: + profiles = list(identity.V1_PRECISION_PROFILES)[:3] + cases = [ + { + "case_id": identity.digest("case", {"fixture": index}), + "precision_profile": profiles[index % len(profiles)], + } + for index in range(9) + ] + plans = [ + sweep_matrix.qualification_execution_order( + "qualification-fixture", cases, qualification_index + ) + for qualification_index in (1, 2, 3) + ] + expected_ids = {case["case_id"] for case in cases} + self.assertTrue(all( + {case["case_id"] for case in plan} == expected_ids for plan in plans + )) + self.assertEqual( + len({tuple(case["case_id"] for case in plan) for plan in plans}), 3 + ) + self.assertEqual( + len({tuple(case["precision_profile"] for case in plan) for plan in plans}), + 3, + ) + + def test_matrix_execution_plan_digest_is_repeat_specific_and_stable(self) -> None: + digests = [ + sweep_matrix.qualification_execution_plan_sha256(self.matrix, index) + for index in (1, 2, 3) + ] + self.assertEqual(len(set(digests)), 3) + self.assertTrue(all(len(digest) == 64 for digest in digests)) + self.assertEqual( + digests, + [ + sweep_matrix.qualification_execution_plan_sha256( + copy.deepcopy(self.matrix), index + ) + for index in (1, 2, 3) + ], + ) + self.assertTrue(all(shard["execution_weight"] > 0 for shard in self.matrix["include"])) + tampered = copy.deepcopy(self.matrix) + tampered["include"][0]["execution_weight"] 
+= 1 + with self.assertRaisesRegex( + sweep_matrix.MatrixError, "execution_weight differs from its cases" + ): + sweep_matrix.qualification_execution_plan_sha256(tampered, 1) + + def test_frontend_catalog_covers_every_requested_case_and_point(self) -> None: + catalog = sweep_matrix.frontend_catalog(self.matrix) + self.assertEqual(catalog["format"], "collectivex.frontend-catalog.v1") + self.assertEqual(catalog["case_count"], len(self.matrix["requested_cases"])) + self.assertEqual( + catalog["point_count"], + sum( + len(item["case"]["ladder"].split()) + for item in self.matrix["requested_cases"] + ), + ) + self.assertEqual( + {item["case_id"] for item in catalog["cases"]}, + {item["case"]["case_id"] for item in self.matrix["requested_cases"]}, + ) + self.assertLess(len(_canonical(catalog)) + 1, 1024 * 1024) + + def test_invalid_qualification_controls_are_rejected(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + matrix_path = root / "matrix.json" + matrix_path.write_bytes(_canonical(self.matrix) + b"\n") + arguments = { + "sku": self.shard["sku"], + "backend": self.shard["backend"], + "nodes": self.shard["nodes"], + } + for invalid in (0, 4, True): + with self.subTest(qualification_index=invalid), self.assertRaisesRegex( + sweep_matrix.MatrixError, "integer in 1..3" + ): + sweep_matrix.extract_shard( + matrix_path, + self.shard["id"], + root / "invalid.json", + qualification_index=invalid, + **arguments, + ) + with mock.patch.dict(os.environ, {"CX_QUALIFICATION_INDEX": "invalid"}): + with self.assertRaisesRegex( + sweep_matrix.MatrixError, "integer in 1..3" + ): + sweep_matrix.extract_shard( + matrix_path, + self.shard["id"], + root / "invalid-env.json", + **arguments, + ) + + with mock.patch.dict(os.environ, {}, clear=True): + control = sweep_matrix.extract_shard( + matrix_path, + self.shard["id"], + root / "default.json", + **arguments, + ) + self.assertEqual(control["qualification_index"], 1) + + invalid_control = 
copy.deepcopy(control) + invalid_control["qualification_index"] = 4 + with self.assertRaisesRegex( + sweep_matrix.MatrixError, "integer in 1..3" + ): + sweep_matrix.validate_shard_control(invalid_control, **arguments) + tampered = copy.deepcopy(control) + tampered["execution_plan_sha256"] = "0" * 64 + with self.assertRaisesRegex( + sweep_matrix.MatrixError, "differs from its ordered cases" + ): + sweep_matrix.validate_shard_control(tampered, **arguments) + + +if __name__ == "__main__": + unittest.main() diff --git a/experimental/CollectiveX/tests/test_sampling_contract.py b/experimental/CollectiveX/tests/test_sampling_contract.py new file mode 100644 index 000000000..e0c10225f --- /dev/null +++ b/experimental/CollectiveX/tests/test_sampling_contract.py @@ -0,0 +1,3433 @@ +#!/usr/bin/env python3 +"""CPU-only behavioral tests for the CollectiveX v1 execution contract.""" +from __future__ import annotations + +import argparse +import ast +import copy +import hashlib +import io +import json +import os +from pathlib import Path +import re +import stat +import subprocess +import sys +import tarfile +import tempfile +import types +import unittest +from unittest import mock + +import numpy as np + +HERE = Path(__file__).resolve().parent +ROOT = HERE.parent +sys.path[:0] = [str(ROOT), str(HERE)] + +import artifact_safety # noqa: E402 +import capability # noqa: E402 +import contracts # noqa: E402 +import eplb # noqa: E402 +import ep_harness # noqa: E402 +import identity # noqa: E402 +import run_ep # noqa: E402 +import source_archive # noqa: E402 +import summarize # noqa: E402 +import sweep_matrix # noqa: E402 +import workload # noqa: E402 + + +class SamplingContractTest(unittest.TestCase): + def test_identity_and_fixed_sampling_profile(self) -> None: + identity.verify_test_vector() + self.assertTrue(identity.is_typed_id(identity.IDENTITY_TEST_VECTOR["series_id"], "series")) + self.assertEqual(ep_harness.SAMPLING_CONTRACT, "fixed-512-v1") + self.assertEqual( + ( + 
ep_harness.TIMED_ITERS_PER_TRIAL, + ep_harness.TRIALS_PER_POINT, + ep_harness.TIMED_SAMPLES_PER_POINT, + ep_harness.WARMUP_ITERS_PER_TRIAL, + ), + (8, 64, 512, 32), + ) + self.assertEqual(identity.V1_CASE_PROFILE["activation_profile"], "canonical-counter-source-v4") + self.assertEqual( + identity.V1_CASE_PROFILE["activation_generator"], + "collectivex-activation-counter-v4", + ) + self.assertEqual(identity.V1_CASE_PROFILE["sampling_contract"], "fixed-512-v1") + self.assertEqual(identity.V1_CASE_PROFILE["percentile_method"], "nearest-rank") + self.assertEqual( + identity.V1_CASE_PROFILE["rank_reduction"], + "cross-rank-max-per-iteration", + ) + self.assertEqual( + identity.V1_CASE_PROFILE["oracle_contract"], + "expert-specific-transform-v1", + ) + self.assertEqual( + set(identity.V1_CASE_PROFILES), {"normal", "low-latency"} + ) + self.assertEqual( + identity.V1_LOW_LATENCY_CASE_PROFILE["payload_unit"], "token-expert" + ) + self.assertNotEqual( + identity.digest("case", identity.V1_NORMAL_CASE_PROFILE), + identity.digest("case", identity.V1_LOW_LATENCY_CASE_PROFILE), + ) + parser = argparse.ArgumentParser() + ep_harness.add_common_args(parser) + args = parser.parse_args( + [ + "--runner", "test", "--topology-class", "test", + "--scope", "scale-up", "--scale-up-transport", "nvlink", + "--out", "result.json", + ] + ) + self.assertEqual((args.iters, args.trials, args.warmup), (8, 64, 32)) + self.assertEqual(args.qualification_index, 1) + for profile in ((8, 64, 32), (128, 4, 32), (8, 1, 4), (0, 64, 32)): + with self.subTest(profile=profile): + self.assertEqual( + ep_harness.sampling_contract_error(*profile) is None, + profile == (8, 64, 32), + ) + + def test_nearest_rank_percentiles_use_all_512_samples(self) -> None: + samples = list(range(1, 513)) + self.assertEqual(ep_harness.percentile(samples, 50), 256) + self.assertEqual(ep_harness.percentile(samples, 99), 507) + + def test_qualification_order_is_deterministic_and_position_balanced(self) -> None: + values = [1, 2, 
4, 8, 16, 32, 64, 128] + for qualification_index in range(1, ep_harness.QUALIFICATION_RUNS + 1): + orders = [ + ep_harness.qualification_order(values, qualification_index, trial) + for trial in range(64) + ] + self.assertEqual( + orders, + [ + ep_harness.qualification_order(values, qualification_index, trial) + for trial in range(64) + ], + ) + self.assertTrue(all(sorted(order) == values for order in orders)) + for position in range(len(values)): + self.assertEqual( + {value: sum(order[position] == value for order in orders) for value in values}, + {value: 8 for value in values}, + ) + with self.assertRaises(ValueError): + ep_harness.qualification_order(values, 0, 0) + with self.assertRaises(ValueError): + ep_harness.qualification_order([1, 1], 1, 0) + + def test_sample_evidence_preserves_exact_trial_blocks(self) -> None: + trials = [ + [float(trial * 8 + sample) for sample in range(8)] + for trial in range(64) + ] + evidence = ep_harness.sampled_component_evidence(trials) + self.assertEqual(evidence["availability"], "measured") + self.assertEqual(evidence["sample_count"], 512) + self.assertEqual(evidence["trials"], trials) + self.assertIsNot(evidence["trials"], trials) + self.assertEqual( + ep_harness.sampled_component_evidence([]), + {"availability": "unavailable", "sample_count": 0, "trials": None}, + ) + for malformed in (trials[:-1], [*trials[:-1], trials[-1][:-1]]): + with self.assertRaises(ValueError): + ep_harness.sampled_component_evidence(malformed) + invalid = copy.deepcopy(trials) + invalid[0][0] = float("nan") + with self.assertRaises(ValueError): + ep_harness.sampled_component_evidence(invalid) + + def test_terminal_summary_uses_bound_sku_and_route(self) -> None: + terminal = { + "format": contracts.TERMINAL_FORMAT, + "case": { + "backend": "deepep", "phase": "prefill", "ep": 8, + "suite": "ep-routing-v1", "routing": "zipf", "eplb": True, + "required_publication": "comparable-experimental", + }, + "identity": {"case_factors": {"sku": "h100-dgxc"}}, + 
} + self.assertEqual( + summarize._identity(terminal), + ( + "h100-dgxc", "ep-routing-v1", "zipf", "prefill", True, + "comparable-experimental", 8, + ), + ) + + def test_matrix_cases_and_shards_are_identity_bound(self) -> None: + matrix = sweep_matrix.validate_matrix_document( + sweep_matrix.resolve_matrix(backends="all") + ) + requested = {item["case"]["case_id"]: item for item in matrix["requested_cases"]} + assigned = [case_id for shard in matrix["include"] for case_id in shard["case_ids"]] + runnable = { + case_id for case_id, item in requested.items() + if item["disposition"] == "runnable" + } + runnable_cases = [ + item for item in matrix["requested_cases"] + if item["disposition"] == "runnable" + ] + unsupported_cases = [ + item for item in matrix["requested_cases"] + if item["disposition"] == "unsupported" + ] + self.assertEqual( + ( + len(matrix["include"]), + len(matrix["requested_cases"]), + len(runnable_cases), + len(unsupported_cases), + sum( + len(item["case"]["ladder"].split()) + for item in matrix["requested_cases"] + ), + sum(len(item["case"]["ladder"].split()) for item in runnable_cases), + sum(len(item["case"]["ladder"].split()) for item in unsupported_cases), + ), + (58, 608, 364, 244, 1600, 940, 660), + ) + expected_topologies = {} + for sku, product in ( + ("h100-dgxc", "h100"), ("h200-dgxc", "h200"), + ("b200-dgxc", "b200"), ("b300", "b300"), + ): + expected_topologies[sku, 8] = ( + 1, 8, 8, "scale-up", "nvlink", None, "nvlink", + f"{product}-nvlink-island", + ) + expected_topologies[sku, 16] = ( + 2, 8, 8, "scale-out", "nvlink", "rdma", "nvlink-rdma", + f"{product}-nvlink-rdma", + ) + for sku in ("gb200", "gb300"): + topology_class = f"{sku}-nvl72-mnnvl" + expected_topologies[sku, 8] = ( + 2, 4, 72, "scale-up", "mnnvl", None, "mnnvl", topology_class, + ) + expected_topologies[sku, 16] = ( + 4, 4, 72, "scale-up", "mnnvl", None, "mnnvl", topology_class, + ) + for sku in ("mi325x", "mi355x"): + expected_topologies[sku, 8] = ( + 1, 8, 8, 
"scale-up", "xgmi", None, "xgmi", f"{sku}-xgmi", + ) + expected_topologies[sku, 16] = ( + 2, 8, 8, "scale-out", "xgmi", "rdma", "xgmi-rdma", + f"{sku}-xgmi-rdma", + ) + topology_fields = sweep_matrix.TOPOLOGY_FIELDS + observed_topologies: dict[tuple[str, int], set[tuple[object, ...]]] = {} + for item in matrix["requested_cases"]: + case = item["case"] + observed_topologies.setdefault((item["sku"], case["ep"]), set()).add( + tuple(case[field] for field in topology_fields) + ) + self.assertEqual( + {key: next(iter(values)) for key, values in observed_topologies.items()}, + expected_topologies, + ) + self.assertTrue(all(len(values) == 1 for values in observed_topologies.values())) + self.assertEqual( + { + (sku, ep): tuple(topology[field] for field in topology_fields) + for sku, platform in capability.PLATFORMS.items() + for ep, topology in platform["topologies"].items() + }, + expected_topologies, + ) + self.assertEqual( + {shard["n"] for shard in matrix["include"]}, {6, 7} + ) + self.assertEqual( + sum(shard["n"] == 7 for shard in matrix["include"]), 16 + ) + ll_cases = [ + item for item in matrix["requested_cases"] + if item["case"]["mode"] == "low-latency" + ] + self.assertEqual(len(ll_cases), 32) + self.assertTrue(all( + item["case"]["suite"] == "ep-low-latency-v1" + and item["case"]["backend"] in {"deepep", "uccl"} + and item["case"]["phase"] == "decode" + and item["case"]["routing"] == "uniform" + and not item["case"]["eplb"] + and item["case"]["ladder"] == "1 2 4 8 16 32 64 128" + for item in ll_cases + )) + for shard in matrix["include"]: + ep = next( + requested[case_id]["case"]["ep"] for case_id in shard["case_ids"] + ) + self.assertEqual( + tuple(shard[field] for field in topology_fields), + expected_topologies[shard["sku"], ep], + ) + routing_points = { + phase: { + int(point) + for item in matrix["requested_cases"] + if item["case"]["suite"] == "ep-routing-v1" + and item["case"]["phase"] == phase + for point in item["case"]["ladder"].split() + } + for 
phase in ("decode", "prefill") + } + self.assertEqual(routing_points, {"decode": {128}, "prefill": {512}}) + skus = sorted({shard["sku"] for shard in matrix["include"]}) + self.assertEqual( + [shard["sku"] for shard in matrix["include"][:len(skus)]], + skus, + ) + self.assertEqual(set(assigned), runnable) + self.assertEqual(len(assigned), len(set(assigned))) + self.assertEqual({item["case"]["ep"] for item in matrix["requested_cases"]}, {8, 16}) + self.assertFalse(capability.resolve("gb200", "deepep", ep=8, nodes=1)[0]) + excluded = { + "uccl": {"b200-dgxc", "b300"}, + } + for backend, skus in excluded.items(): + for sku in skus: + with self.subTest(backend=backend, sku=sku): + self.assertFalse(capability.resolve(sku, backend)[0]) + for case_id, item in requested.items(): + case = {key: value for key, value in item["case"].items() if key != "case_id"} + self.assertEqual( + case_id, + identity.case_id( + sku=item["sku"], profile=identity.profile_for_case(case), case=case + ), + ) + self.assertEqual(case["timing"], "8:64:32") + self.assertEqual(case["samples_per_point"], 512) + + bad_matrix = copy.deepcopy(matrix) + bad_matrix["schema_version"] = True + with self.assertRaises(sweep_matrix.MatrixError): + sweep_matrix.validate_matrix_document(bad_matrix) + + bad_catalog = copy.deepcopy(matrix) + wrapper = next( + item for item in bad_catalog["requested_cases"] + if item["disposition"] == "runnable" + ) + old_id = wrapper["case"]["case_id"] + wrapper["case"]["hidden"] = 1 + factors = {key: value for key, value in wrapper["case"].items() if key != "case_id"} + new_id = identity.case_id( + sku=wrapper["sku"], profile=identity.V1_CASE_PROFILE, case=factors + ) + wrapper["case"]["case_id"] = new_id + for shard in bad_catalog["include"]: + shard["case_ids"] = [new_id if value == old_id else value for value in shard["case_ids"]] + with self.assertRaisesRegex(sweep_matrix.MatrixError, "frozen v1"): + sweep_matrix.validate_matrix_document(bad_catalog) + + bad_topology = 
copy.deepcopy(matrix) + wrapper = next( + item for item in bad_topology["requested_cases"] + if item["disposition"] == "runnable" + ) + old_id = wrapper["case"]["case_id"] + wrapper["case"]["transport"] = "incorrect-transport" + factors = {key: value for key, value in wrapper["case"].items() if key != "case_id"} + new_id = identity.case_id( + sku=wrapper["sku"], profile=identity.V1_CASE_PROFILE, case=factors + ) + wrapper["case"]["case_id"] = new_id + for shard in bad_topology["include"]: + shard["case_ids"] = [new_id if value == old_id else value for value in shard["case_ids"]] + with self.assertRaisesRegex(sweep_matrix.MatrixError, "platform registry"): + sweep_matrix.validate_matrix_document(bad_topology) + + shard_meta = matrix["include"][0] + requested_cases = {item["case"]["case_id"]: item["case"] for item in matrix["requested_cases"]} + shard = { + "schema_version": True, + "id": shard_meta["id"], + "sku": shard_meta["sku"], + "backend": shard_meta["backend"], + "nodes": shard_meta["nodes"], + "n": shard_meta["n"], + "cases": [requested_cases[value] for value in shard_meta["case_ids"]], + } + with self.assertRaises(sweep_matrix.MatrixError): + sweep_matrix.validate_shard_control( + shard, sku=shard_meta["sku"], backend=shard_meta["backend"], + nodes=shard_meta["nodes"], + ) + + def test_matrix_yaml_and_config_validation_are_strict(self) -> None: + suites = sweep_matrix._load("suites.yaml") + workloads = sweep_matrix._load("workloads.yaml") + self.assertEqual( + {tuple(suite["ep_degrees"]) for suite in suites["suites"].values()}, + {(8, 16)}, + ) + invalid = ( + ("unknown top", lambda s, _w: s.update({"typo": True})), + ( + "unknown suite field", + lambda s, _w: s["suites"]["ep-core-v1"].update({"modes": ["normal"]}), + ), + ( + "unknown workload field", + lambda _s, w: w["model_derived"]["deepseek-v3-v1"].update({"unused": 1}), + ), + ( + "string phases", + lambda s, _w: s["suites"]["ep-core-v1"].update({"phases": "decode"}), + ), + ( + "unknown routing", + 
lambda s, _w: s["suites"]["ep-core-v1"].update({"routings": ["random"]}), + ), + ( + "integer EPLB", + lambda s, _w: s["suites"]["ep-routing-v1"].update({"eplb": [0, 1]}), + ), + ( + "duplicate platform", + lambda s, _w: s["suites"]["ep-core-v1"]["platforms"].append("h100-dgxc"), + ), + ( + "missing EP degrees", + lambda s, _w: s["suites"]["ep-core-v1"].pop("ep_degrees"), + ), + ( + "non-v1 EP degrees", + lambda s, _w: s["suites"]["ep-core-v1"].update({"ep_degrees": [8]}), + ), + ("missing top field", lambda s, _w: s.pop("schema_version")), + ( + "string dimension", + lambda _s, w: w["model_derived"]["deepseek-v3-v1"].update({"hidden": "7168"}), + ), + ( + "unreachable phase ladder", + lambda s, _w: s["suites"]["ep-routing-v1"].update({"phases": ["prefill"]}), + ), + ) + for label, mutate in invalid: + with self.subTest(label=label), self.assertRaises(SystemExit): + bad_suites, bad_workloads = copy.deepcopy(suites), copy.deepcopy(workloads) + mutate(bad_suites, bad_workloads) + sweep_matrix.validate_config_documents(bad_suites, bad_workloads) + + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + (root / "configs").mkdir() + (root / "configs" / "duplicate.yaml").write_text( + "schema_version: 1\nsuites:\n same: 1\n same: 2\n" + ) + with mock.patch.object(sweep_matrix, "HERE", root), self.assertRaisesRegex( + SystemExit, "duplicate YAML key" + ): + sweep_matrix._load("duplicate.yaml") + + def test_semantically_duplicate_suite_points_are_rejected(self) -> None: + matrix = sweep_matrix.resolve_matrix() + with mock.patch.object( + sweep_matrix, "_semantic_points", return_value=["duplicate"] + ), self.assertRaisesRegex( + sweep_matrix.MatrixError, "duplicates a semantic token point" + ): + sweep_matrix.validate_matrix_document(matrix) + + def test_only_three_shared_launchers_are_registered(self) -> None: + expected = { + "launch_single-slurm.sh", + "launch_gb-nv.sh", + "launch_mi-amds.sh", + } + self.assertEqual({path.name for path in (ROOT / 
"launchers").glob("launch_*.sh")}, expected) + self.assertEqual( + {platform["launcher"] for platform in capability.PLATFORMS.values()}, + {"single-slurm", "gb-nv", "mi-amds"}, + ) + for platform in capability.PLATFORMS.values(): + launcher = ROOT / "launchers" / f"launch_{platform['launcher']}.sh" + self.assertTrue(launcher.is_file()) + source = launcher.read_text() + self.assertNotIn("RUNNER_NAME", source) + self.assertIn("cx_preflight_allocation", source) + lock_environment = 'cx_lock_canonical_gha_env "$RUNNER"' + self.assertIn(lock_environment, source) + self.assertLess( + source.index("cx_load_operator_config"), + source.index(lock_environment), + ) + validate = 'cx_validate_shard_control "$CX_DIR"' + stage = 'MOUNT_SRC="$(cx_stage_path ' + self.assertIn(validate, source) + self.assertLess(source.index(validate), source.index(stage)) + self.assertLess(source.index(stage), source.index('cx_stage_repo "$REPO_ROOT"')) + self.assertLess(source.index(validate), source.index("cx_require_vars")) + if platform["launcher"] in {"single-slurm", "mi-amds"}: + network = "cx_validate_network_profile_on_job" + self.assertIn(network, source) + self.assertLess(source.index("cx_salloc_jobid"), source.index(network)) + self.assertLess(source.index(network), source.index("cx_preflight_allocation")) + if platform["launcher"] == "single-slurm": + self.assertLess( + source.index(network), + source.index("CX_ENROOT_LOCAL_IMPORT=1 cx_ensure_squash"), + ) + + common = (ROOT / "runtime" / "common.sh").read_text() + workflow = (ROOT.parent.parent / ".github" / "workflows" / "collectivex-sweep.yml").read_text() + self.assertNotIn("RUNNER_NAME", common) + self.assertNotIn("RUNNER_NAME:", workflow) + self.assertNotIn("flashinfer", capability.BACKENDS) + self.assertFalse((HERE / "ep_flashinfer.py").exists()) + + def test_canonical_operator_config_requires_a_private_audit_salt(self) -> None: + salt = "a" * 64 + with tempfile.TemporaryDirectory() as temporary: + root = 
Path(temporary).resolve() + config = root / "operator.json" + document = { + "schema_version": 1, + "audit_salt": salt, + "runners": { + "h100-dgxc": { + "partition": "test", "account": "test", + "squash_dir": str(root), "stage_dir": str(root), + }, + }, + } + command = ( + 'source "$1"; export COLLECTIVEX_EXECUTION_ID="audit-config-$$"; ' + "trap 'cx_cleanup_private_logs 0' EXIT; cx_load_operator_config; " + 'test "$CX_AUDIT_SALT" = "$EXPECTED_AUDIT_SALT"' + ) + + def invoke( + value: dict, *, canonical: bool, expect_salt: bool = True + ) -> subprocess.CompletedProcess[str]: + config.write_text(json.dumps(value)) + config.chmod(0o600) + environment = { + **os.environ, + "CX_RUNNER": "h100-dgxc", + "COLLECTIVEX_OPERATOR_CONFIG": str(config), + "EXPECTED_AUDIT_SALT": salt, + } + if canonical: + environment["COLLECTIVEX_CANONICAL_GHA"] = "1" + invocation = command if expect_salt else ( + 'source "$1"; export COLLECTIVEX_EXECUTION_ID="audit-config-$$"; ' + "trap 'cx_cleanup_private_logs 0' EXIT; cx_load_operator_config; " + 'test -z "${CX_AUDIT_SALT+x}"' + ) + return subprocess.run( + ["bash", "-c", invocation, "_", str(ROOT / "runtime" / "common.sh")], + text=True, + capture_output=True, + env=environment, + ) + + accepted = invoke(document, canonical=True) + self.assertEqual(accepted.returncode, 0, accepted.stderr) + self.assertNotIn(salt, accepted.stdout + accepted.stderr) + + missing = copy.deepcopy(document) + del missing["audit_salt"] + rejected = invoke(missing, canonical=True) + self.assertNotEqual(rejected.returncode, 0) + self.assertNotIn(salt, rejected.stdout + rejected.stderr) + + manual = invoke(missing, canonical=False, expect_salt=False) + self.assertEqual(manual.returncode, 0, manual.stderr) + self.assertNotIn(salt, manual.stdout + manual.stderr) + + malformed = copy.deepcopy(document) + malformed["audit_salt"] = "A" * 64 + rejected = invoke(malformed, canonical=False) + self.assertNotEqual(rejected.returncode, 0) + self.assertNotIn("A" * 64, 
rejected.stdout + rejected.stderr) + + def test_scaleout_network_profile_is_explicit_and_allowlisted(self) -> None: + command = r''' + set -euo pipefail + source "$1" + ! (unset CX_SOCKET_IFNAME CX_RDMA_DEVICES; cx_apply_network_profile 2 nvlink-rdma) + ! (export CX_SOCKET_IFNAME=eth0; unset CX_RDMA_DEVICES; cx_apply_network_profile 2 nvlink-rdma) + export CX_SOCKET_IFNAME=ib0 CX_RDMA_DEVICES=mlx5_0:1,mlx5_1:1 + export NCCL_NET=Socket NCCL_IB_HCA=stale NVSHMEM_HCA_LIST=stale + cx_apply_network_profile 1 nvlink + test -z "${NCCL_NET+x}${NCCL_IB_HCA+x}${NVSHMEM_HCA_LIST+x}" + cx_apply_network_profile 4 mnnvl + test -z "${NCCL_NET+x}${NCCL_IB_HCA+x}${NVSHMEM_HCA_LIST+x}" + export CX_IB_GID_INDEX=3 CX_RDMA_SERVICE_LEVEL=2 + cx_apply_network_profile 2 nvlink-rdma + test "$NCCL_SOCKET_IFNAME:$GLOO_SOCKET_IFNAME:$UCCL_SOCKET_IFNAME" = ib0:ib0:ib0 + test "$NCCL_NET:$NCCL_IB_HCA" = 'IB:=mlx5_0:1,mlx5_1:1' + test "$NVSHMEM_HCA_LIST" = mlx5_0:1,mlx5_1:1 + test "$MORI_RDMA_DEVICES:$EP_NIC_NAME" = mlx5_0,mlx5_1:mlx5_0 + test "$NCCL_IB_GID_INDEX:$NCCL_IB_SL" = 3:2 + test "$NVSHMEM_IB_ENABLE_IBGDA:$NVSHMEM_IBGDA_NIC_HANDLER" = 1:gpu + ''' + subprocess.run( + ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh")], + check=True, + env={**os.environ, "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null"}, + ) + + def test_network_profile_validation_is_private_and_all_node(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + binary = root / "srun" + arguments = root / "arguments" + script = root / "script" + binary.write_text( + "#!/usr/bin/env bash\n" + "printf '%s\\n' \"$@\" > \"$CAPTURE_ARGS\"\n" + "cat > \"$CAPTURE_SCRIPT\"\n" + "exit \"${SRUN_RC:-0}\"\n" + ) + binary.chmod(0o700) + command = ( + 'source "$1"; export COLLECTIVEX_EXECUTION_ID="network-test-$$"; ' + "trap 'cx_cleanup_private_logs 0' EXIT; " + 'cx_validate_network_profile_on_job 42 2 nvlink-rdma' + ) + environment = { + **os.environ, + "PATH": 
f"{root}:{os.environ['PATH']}", + "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null", + "CAPTURE_ARGS": str(arguments), + "CAPTURE_SCRIPT": str(script), + "CX_SOCKET_IFNAME": "privateif0", + "CX_RDMA_DEVICES": "privatehca0:1", + "CX_IB_GID_INDEX": "3", + } + result = subprocess.run( + ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh")], + text=True, + capture_output=True, + env=environment, + ) + self.assertEqual(result.returncode, 0, result.stderr) + invoked = arguments.read_text() + self.assertIn("--nodes=2", invoked) + self.assertIn("--ntasks=2", invoked) + self.assertIn("--input=all", invoked) + self.assertIn("CX_SOCKET_IFNAME,CX_RDMA_DEVICES,CX_IB_GID_INDEX", invoked) + self.assertIn('/sys/class/infiniband/$device/ports', script.read_text()) + self.assertNotIn("privateif0", result.stdout + result.stderr) + self.assertNotIn("privatehca0", result.stdout + result.stderr) + + failed = subprocess.run( + ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh")], + text=True, + capture_output=True, + env={**environment, "SRUN_RC": "9"}, + ) + self.assertNotEqual(failed.returncode, 0) + self.assertNotIn("privateif0", failed.stdout + failed.stderr) + self.assertNotIn("privatehca0", failed.stdout + failed.stderr) + + arguments.unlink() + subprocess.run( + [ + "bash", "-c", + 'source "$1"; cx_validate_network_profile_on_job 42 1 nvlink', + "_", str(ROOT / "runtime" / "common.sh"), + ], + check=True, + env=environment, + ) + self.assertFalse(arguments.exists()) + + def test_allocation_preflight_proves_shared_write_visibility(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + mount = root / "mount" + runtime = mount / "experimental" / "CollectiveX" / "runtime" + runtime.mkdir(parents=True) + (runtime / "run_in_container.sh").write_text("#!/bin/sh\n") + squash = root / "image.sqsh" + squash.write_bytes(b"squash") + binary = root / "bin" + binary.mkdir() + (binary / "unsquashfs").write_text("#!/bin/sh\nexit 
0\n") + (binary / "unsquashfs").chmod(0o700) + (binary / "srun").write_text( + "#!/usr/bin/env bash\n" + "set -euo pipefail\n" + "case \" $* \" in *' --input=all '*) ;; *) exit 97 ;; esac\n" + "worker=\"$FAKE_ROOT/worker.sh\"\n" + "cat > \"$worker\"\n" + "args=(\"$@\")\n" + "start=0\n" + "for ((i=0; i<${#args[@]}; i++)); do\n" + " [ \"${args[$i]}\" != -- ] || start=$((i + 1))\n" + "done\n" + "[ \"$start\" -gt 0 ]\n" + "worker_args=(\"${args[@]:$start}\")\n" + "probe=\"${worker_args[4]}\"\n" + "case \"${FAKE_MODE:-success}\" in\n" + " missing-source) rm -f -- \"$probe/source\" ;;\n" + " readonly) chmod 500 \"$probe\" ;;\n" + "esac\n" + "for ((node=0; node None: + runtime = (ROOT / "runtime" / "run_in_container.sh").read_text() + probe = runtime[runtime.index("cx_probe_deepep()"): + runtime.index("cx_activate_deepep_v2()")] + self.assertIn('expected_version="1.2.1"', probe) + self.assertIn('expected_version="1.1.0+814e508"', probe) + self.assertNotIn("pip install", probe) + self.assertNotIn("cx_fetch_revision", probe) + self.assertIn("Path(deep_ep.__file__).resolve() in recorded_files", probe) + self.assertIn("Path(buffer_module.__file__).resolve() in recorded_files", probe) + + harness = (HERE / "ep_harness.py").read_text() + pass_one = harness[harness.index("# ---- Pass 1"): + harness.index("# ---- Pass 2")] + self.assertLess( + pass_one.index("input_snapshots[T] ="), + pass_one.index("oracle = _run_expert_oracle"), + ) + self.assertIn("pre_input_unchanged", pass_one) + self.assertIn( + "hh = prep_combine()\n torch.cuda.synchronize()", + harness, + ) + + def test_squash_imports_are_reproducible_and_use_a_fresh_cache_key(self) -> None: + common = (ROOT / "runtime" / "common.sh").read_text() + amd = (ROOT / "launchers" / "launch_mi-amds.sh").read_text() + self.assertIn('CX_SQUASH_FORMAT_VERSION="repro-v1"', common) + self.assertIn("SOURCE_DATE_EPOCH=\"$CX_SQUASH_SOURCE_DATE_EPOCH\"", common) + self.assertIn("${COLLECTIVEX_IMAGE_DIGEST#sha256:}", common) + 
self.assertIn("cx_ensure_squash_on_job", amd) + self.assertIn('"${CX_LOCK_DIR:-}"', amd) + self.assertNotIn('"${CX_LOCK_DIR:-/tmp}"', amd) + self.assertIn('[ -n "$lock_dir" ] || lock_dir="$squash_dir/.locks"', common) + self.assertGreaterEqual(common.count("--chdir=/tmp"), 2) + self.assertGreaterEqual(amd.count("--chdir=/tmp"), 2) + self.assertIn('ENROOT_CACHE_PATH="$compute_home/enroot-cache"', common) + self.assertIn('ENROOT_RUNTIME_PATH="$compute_home/enroot-run"', common) + self.assertEqual(common.count('cx_reverify_registry_image "$image"'), 2) + result = subprocess.run( + [ + "bash", + "-c", + f'source "{ROOT / "runtime" / "common.sh"}"; ' + 'COLLECTIVEX_IMAGE_DIGEST="sha256:$(printf b%.0s {1..64})"; ' + 'CX_IMAGE_PLATFORM=linux/amd64; cx_squash_path /cache repo/image:tag; ' + 'printf "\\n"; CX_IMAGE_PLATFORM=linux/arm64; ' + 'cx_squash_path /cache repo/image:tag', + ], + text=True, + capture_output=True, + ) + self.assertEqual(result.returncode, 0, result.stderr) + digest = "b" * 64 + self.assertEqual( + result.stdout.splitlines(), + [ + f"/cache/repro-v1_{digest}_repo_image_tag.sqsh", + f"/cache/repro-v1_linux_arm64_{digest}_repo_image_tag.sqsh", + ], + ) + + def test_launchers_preserve_platform_specific_runtime_requirements(self) -> None: + single = (ROOT / "launchers" / "launch_single-slurm.sh").read_text() + gb = (ROOT / "launchers" / "launch_gb-nv.sh").read_text() + amd = (ROOT / "launchers" / "launch_mi-amds.sh").read_text() + common = (ROOT / "runtime" / "common.sh").read_text() + self.assertIn("ALLOC_EXTRA=(--mem=0)", single) + self.assertIn("ALLOC_EXTRA=(-N 1 --mem=0)", single) + self.assertIn("SRUN_EXTRA=(--mpi=none --container-remap-root)", single) + self.assertIn("CX_ENROOT_LOCAL_IMPORT=1", single) + self.assertIn('PRODUCT="${CX_SHARD_SKU:-${CX_GB_PRODUCT:-', gb) + self.assertIn("cx_ensure_squash_on_job", gb) + self.assertIn("--mem=0 --cpus-per-task=35", gb) + self.assertIn("--container-writable", gb) + self.assertIn("--container-remap-root", gb) 
+ workload_stage = common[ + common.index("workload_args=("): + common.index("workload_log=", common.index("workload_args=(")) + ] + self.assertNotIn("--workload", workload_stage) + self.assertIn("mi325x) CPUS_PER_TASK=256", amd) + self.assertIn("/dev/kfd:/dev/kfd,/dev/dri:/dev/dri", amd) + self.assertIn("--container-writable --container-remap-root", amd) + self.assertIn( + "CX_DISTRIBUTED_CONTAINER_ARGS=(--container-writable --container-remap-root)", + amd, + ) + collect = common[common.index("cx_collect_results()"): + common.index("cx_cleanup_stage()")] + cleanup = common[common.index("cx_launcher_cleanup()"): + common.index("cx_install_launcher_fail_safe()")] + self.assertNotIn("cx_cleanup_stage", collect) + self.assertLess(cleanup.index("cx_cancel_job"), cleanup.index("cx_cleanup_stage")) + runtime = (ROOT / "runtime" / "run_in_container.sh").read_text() + self.assertIn('distribution.read_text("direct_url.json")', runtime) + self.assertIn("6548e9c504a12b2471af4b7f4d9546321210a57a456b5dc55bd4a8dad0f932ac", runtime) + self.assertIn("2671cff7baf8c2c214ff4bac721af875d513130670bec57601998bd1aae82882", runtime) + + def test_deferred_backend_provenance_resolves_before_measurement(self) -> None: + harness = (ROOT / "tests" / "ep_harness.py").read_text() + conditioning = harness.index("for wt in conditioning_ladder") + provenance = harness.index("# Setup may materialize deferred provenance") + measurement = harness.index("# ---- Pass 1: build each deterministic problem") + self.assertLess(conditioning, provenance) + self.assertLess(provenance, measurement) + + def test_backend_specific_routing_contracts_are_explicit(self) -> None: + hybrid = (ROOT / "tests" / "ep_deepep_hybrid.py").read_text() + self.assertIn("self.domain_rank = int(self.buffer.local_rank)", hybrid) + self.assertIn( + "probability_columns = self.domain_rank * self.local_experts + local_expert_ids", + hybrid, + ) + self.assertIn("h.recv_probs[:count][rows, probability_columns]", hybrid) + + mori = (ROOT 
/ "tests" / "ep_mori.py").read_text() + self.assertIn("topk_idx=indices", mori) + self.assertIn("indices=indices", mori) + self.assertIn( + "combine_indices = p.indices if self._async_ll else h.dispatch_indices", + mori, + ) + self.assertIn("h.combine_input,\n None,\n combine_indices", mori) + self.assertIn('"use_external_inp_buf": self._external_input', mori) + self.assertIn("self.block_num = self._block_target = 64", mori) + self.assertIn('config_kwargs["block_num"] = self.block_num', mori) + self.assertIn( + 'config_kwargs["warp_num_per_block"] = self.dispatch_warps', mori + ) + self.assertIn("count > tensor.size(0)", mori) + self.assertIn("return combined[:p.T]", mori) + self.assertNotIn("return combined\n", mori) + self.assertIn( + "raw_expert_ids < local_start + experts_per_rank", + mori, + ) + self.assertNotIn("MoRI returned a non-local expert", mori) + harness = (ROOT / "tests" / "ep_harness.py").read_text() + self.assertIn("problem.recv_tokens = backend.recv_tokens(handle)", harness) + + def test_mori_masks_global_topk_metadata_to_the_local_rank(self) -> None: + path = HERE / "ep_mori.py" + tree = ast.parse(path.read_text(), str(path)) + helper = next( + node + for node in tree.body + if isinstance(node, ast.FunctionDef) and node.name == "_project_local_metadata" + ) + namespace: dict[str, object] = {} + exec(compile(ast.Module(body=[helper], type_ignores=[]), str(path), "exec"), namespace) + raw_ids = np.array([[0, 32, 63, -1], [64, 95, 7, 96]], dtype=np.int64) + raw_weights = np.arange(8, dtype=np.float32).reshape(2, 4) + torch_module = types.SimpleNamespace( + where=np.where, + full_like=np.full_like, + zeros_like=np.zeros_like, + ) + ids, weights, local_ids = namespace["_project_local_metadata"]( + torch_module, raw_ids, raw_weights, 1, 32 + ) + np.testing.assert_array_equal( + ids, + np.array([[-1, 32, 63, -1], [-1, -1, -1, -1]], dtype=np.int64), + ) + np.testing.assert_array_equal( + weights, + np.array([[0, 1, 2, 0], [0, 0, 0, 0]], 
dtype=np.float32), + ) + counts = np.bincount(local_ids, minlength=32) + self.assertEqual((counts[0], counts[31], int(counts.sum())), (1, 1, 2)) + commit_helper = next( + node for node in tree.body + if isinstance(node, ast.FunctionDef) and node.name == "_mori_source_commit" + ) + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + module = root / "python" / "mori" / "__init__.py" + module.parent.mkdir(parents=True) + module.touch() + git = root / ".git" + git.mkdir() + (git / "HEAD").write_text("a" * 40 + "\n") + commit_namespace = { + "Path": Path, + "re": re, + "mori": types.SimpleNamespace(__file__=str(module)), + } + exec( + compile(ast.Module(body=[commit_helper], type_ignores=[]), str(path), "exec"), + commit_namespace, + ) + self.assertEqual(commit_namespace["_mori_source_commit"](), "a" * 40) + (git / "HEAD").write_text("ref: refs/heads/main\n") + with self.assertRaisesRegex(RuntimeError, "detached commit"): + commit_namespace["_mori_source_commit"]() + + profile = contracts.project_resource_profile( + { + "block_num": 64, + "device_cus": 304, + "kernel_type": "AsyncLL", + "tuned_source": "upstream-asyncll-64x8-external-input", + } + ) + self.assertIsNone(profile["comm_units_kind"]) + self.assertIsNone(profile["configured_units"]) + + def test_squash_identity_rehashes_instead_of_trusting_a_sidecar(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + image = Path(temporary) / "image.sqsh" + image.write_bytes(b"current squash bytes") + sidecar = Path(f"{image}.sha256") + sidecar.write_text("a" * 64) + os.utime(sidecar, (image.stat().st_mtime + 10, image.stat().st_mtime + 10)) + result = subprocess.run( + [ + "bash", "-c", + 'source "$1"; COLLECTIVEX_EXECUTION_ID="squash-hash-$$"; ' + 'cx_export_squash_identity "$2"; cx_cleanup_private_logs 0; ' + 'printf "%s" "$COLLECTIVEX_SQUASH_SHA256"', + "_", str(ROOT / "runtime" / "common.sh"), str(image), + ], + text=True, + capture_output=True, + ) + 
self.assertEqual(result.returncode, 0, result.stderr) + self.assertEqual(result.stdout, hashlib.sha256(image.read_bytes()).hexdigest()) + + def _run_salloc_scenario( + self, salloc_body: str, squeue_body: str, *, cleanup: bool + ) -> dict[str, object]: + prefix = f"inferencex-collectivex-{os.getpid()}-1-" + with tempfile.TemporaryDirectory(prefix=prefix, dir="/tmp") as temporary: + root = Path(temporary) + command_dir = root / "bin" + repo = root / "repo" + command_dir.mkdir() + repo.mkdir() + paths = { + name: root / name + for name in ("arguments", "squeue-calls", "sleep-calls", "scancel-calls") + } + scripts = { + "salloc": ( + "printf '%s\\n' \"$@\" > \"$CX_TEST_SALLOC_ARGUMENTS\"\n" + + salloc_body + ), + "squeue": ( + "printf '%s\\n' \"$*\" >> \"$CX_TEST_SQUEUE_CALLS\"\n" + + squeue_body + ), + "sleep": "printf '%s\\n' \"$1\" >> \"$CX_TEST_SLEEP_CALLS\"\n", + "scancel": ( + "printf '%s\\n' \"$*\" >> \"$CX_TEST_SCANCEL_CALLS\"\n" + ), + } + for name, body in scripts.items(): + path = command_dir / name + path.write_text(f"#!/usr/bin/env bash\n{body}\n") + path.chmod(0o700) + execution_id = f"scheduler-{root.name}" + expected_name = "cx-" + hashlib.sha256( + execution_id.encode() + ).hexdigest()[:24] + command = r''' + source "$1" + JOB_ID="" + set +e + cx_salloc_jobid --partition=compute + run_rc=$? 
+ set -e + printf '%s:%s:%s\n' \ + "$run_rc" "$JOB_ID" "$CX_ALLOCATION_UNCERTAIN" + cx_cleanup_private_logs 0 + if [ "$3" = cleanup ]; then + export CX_JOB_ROOT="$2" REPO_ROOT="$2/repo" MOUNT_SRC="$2/repo" + export COLLECTIVEX_CANONICAL_GHA=1 + cx_write_cleanup_guard() { + rm -f -- "$CX_JOB_ROOT/cleanup-safe" "$CX_JOB_ROOT/cleanup-unsafe" + : > "$CX_JOB_ROOT/cleanup-$1" + } + unset CX_BENCH + cx_launcher_cleanup "$run_rc" + fi + exit "$run_rc" + ''' + result = subprocess.run( + [ + "bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh"), + str(root), "cleanup" if cleanup else "no-cleanup", + ], + text=True, + capture_output=True, + env={ + **os.environ, + "PATH": f"{command_dir}:{os.environ['PATH']}", + "COLLECTIVEX_EXECUTION_ID": execution_id, + "CX_TEST_SALLOC_ARGUMENTS": str(paths["arguments"]), + "CX_TEST_SQUEUE_CALLS": str(paths["squeue-calls"]), + "CX_TEST_SLEEP_CALLS": str(paths["sleep-calls"]), + "CX_TEST_SCANCEL_CALLS": str(paths["scancel-calls"]), + }, + ) + return { + "result": result, + "job_name": expected_name, + "arguments": paths["arguments"].read_text().splitlines(), + "squeue_calls": ( + paths["squeue-calls"].read_text().splitlines() + if paths["squeue-calls"].exists() else [] + ), + "sleep_calls": ( + paths["sleep-calls"].read_text().splitlines() + if paths["sleep-calls"].exists() else [] + ), + "scancel_calls": ( + paths["scancel-calls"].read_text().splitlines() + if paths["scancel-calls"].exists() else [] + ), + "cleanup_safe": (root / "cleanup-safe").is_file(), + "cleanup_unsafe": (root / "cleanup-unsafe").is_file(), + } + + def test_salloc_job_id_parser_uses_the_portable_grant_message(self) -> None: + scenario = self._run_salloc_scenario( + "printf 'salloc: Granted job allocation 4242\\n' >&2", + "exit 2", + cleanup=False, + ) + result = scenario["result"] + self.assertIsInstance(result, subprocess.CompletedProcess) + self.assertEqual(result.returncode, 0, result.stderr) + self.assertEqual( + result.stdout, "0:4242:0\n" + ) + 
self.assertEqual( + scenario["arguments"], + [ + "--partition=compute", + f"--job-name={scenario['job_name']}", + "--no-shell", + ], + ) + self.assertEqual(scenario["squeue_calls"], []) + + def test_salloc_verified_rejection_is_cleanup_safe(self) -> None: + scenario = self._run_salloc_scenario("exit 1", "exit 0", cleanup=True) + result = scenario["result"] + self.assertEqual(result.returncode, 1) + self.assertEqual(result.stdout, "1::0\n") + self.assertEqual(len(scenario["squeue_calls"]), 3) + scheduler_user = subprocess.check_output(["id", "-un"], text=True).strip() + self.assertTrue(all( + f"--name={scenario['job_name']}" in call + and f"--user={scheduler_user}" in call + for call in scenario["squeue_calls"] + )) + self.assertEqual(scenario["sleep_calls"], ["1", "2"]) + self.assertTrue(scenario["cleanup_safe"]) + self.assertFalse(scenario["cleanup_unsafe"]) + + def test_salloc_recovers_and_cancels_one_matching_allocation(self) -> None: + scenario = self._run_salloc_scenario( + "exit 1", + r''' + case " $* " in + *" --name="*) printf '5151\n' ;; + *" -j 5151 "*) exit 0 ;; + *) exit 2 ;; + esac + ''', + cleanup=True, + ) + result = scenario["result"] + self.assertEqual(result.returncode, 1) + self.assertEqual(result.stdout, "1:5151:0\n") + self.assertEqual(scenario["scancel_calls"], ["5151"]) + self.assertTrue(scenario["cleanup_safe"]) + self.assertFalse(scenario["cleanup_unsafe"]) + + def test_salloc_ambiguous_lookup_remains_cleanup_unsafe(self) -> None: + scenario = self._run_salloc_scenario( + "exit 1", "printf '5151\\n5152\\n'", cleanup=True + ) + result = scenario["result"] + self.assertEqual(result.returncode, 1) + self.assertEqual(result.stdout, "1::1\n") + self.assertEqual(scenario["scancel_calls"], []) + self.assertFalse(scenario["cleanup_safe"]) + self.assertTrue(scenario["cleanup_unsafe"]) + + def test_salloc_query_failure_and_interruption_remain_cleanup_unsafe(self) -> None: + query_failure = self._run_salloc_scenario("exit 1", "exit 2", cleanup=True) + 
self.assertEqual(query_failure["result"].returncode, 1) + self.assertEqual(len(query_failure["squeue_calls"]), 1) + self.assertFalse(query_failure["cleanup_safe"]) + self.assertTrue(query_failure["cleanup_unsafe"]) + + interrupted = self._run_salloc_scenario("exit 130", "exit 0", cleanup=True) + self.assertEqual(interrupted["result"].returncode, 1) + self.assertEqual(interrupted["squeue_calls"], []) + self.assertFalse(interrupted["cleanup_safe"]) + self.assertTrue(interrupted["cleanup_unsafe"]) + + def test_allocation_cleanup_fails_closed_when_scheduler_queries_fail(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + directory = Path(temporary) + for name, body in { + "scancel": "exit 0", + "squeue": "exit 2", + "sleep": "exit 0", + }.items(): + command = directory / name + command.write_text(f"#!/usr/bin/env bash\n{body}\n") + command.chmod(0o700) + result = subprocess.run( + [ + "bash", "-c", + 'source "$1"; cx_cancel_job 4242', + "_", str(ROOT / "runtime" / "common.sh"), + ], + text=True, + capture_output=True, + env={**os.environ, "PATH": f"{directory}:{os.environ['PATH']}"}, + ) + self.assertNotEqual(result.returncode, 0) + self.assertIn("did not terminate", result.stderr) + + workflow = (ROOT.parent.parent / ".github" / "workflows" / "collectivex-sweep.yml").read_text() + self.assertIn("cleanup-unsafe", workflow) + self.assertIn("cleanup-safe", workflow) + self.assertIn("Confirm allocation cleanup", workflow) + self.assertIn("Prepare pinned backend source archive", workflow) + self.assertIn("Install pinned backend source seed", workflow) + self.assertIn("CX_BACKEND_SOURCE_SEED_ROOT", workflow) + self.assertIn("steps.gen.outputs.source_backends", workflow) + self.assertIn('python3 "$destination/source_archive.py"', workflow) + artifact_validation = workflow[workflow.index("- name: Validate shard artifact safety"):] + self.assertIn("steps.allocation_cleanup.outcome == 'success'", artifact_validation) + self.assertIn( + "inputs.operation != 
'probe-precision' || steps.sweep_shard.outcome == 'success'", + artifact_validation, + ) + cleanup_function = (ROOT / "runtime" / "common.sh").read_text() + self.assertIn('[ "${CX_PRECISION_PROBE:-0}" != 1 ]', cleanup_function) + sweep_workflow = workflow[workflow.index(" sweep:"):] + self.assertNotIn("GITHUB_WORKSPACE", sweep_workflow) + self.assertNotIn("RUNNER_WORKSPACE", sweep_workflow) + self.assertIn('CX_SOURCE_ROOT: /tmp/inferencex-collectivex-', sweep_workflow) + source_step = sweep_workflow[:sweep_workflow.index("- uses: actions/download-artifact")] + self.assertNotIn("unsafe_guards=", source_step) + self.assertIn("cutoff = time.time() - 86400", source_step) + self.assertIn("stat.S_IMODE(metadata.st_mode) != 0o700", source_step) + self.assertIn('for marker_name in ("cleanup-safe", "cleanup-unsafe")', source_step) + self.assertIn("stat.S_IMODE(marker.st_mode) == 0o600", source_step) + self.assertIn("shutil.rmtree(entry.path)", source_step) + self.assertLess( + source_step.index('rev-parse HEAD'), + source_step.index("echo 'prepared=true'"), + ) + upload = workflow[workflow.index("- name: Stage shard artifact"):] + self.assertIn("id: stage_artifact", upload) + self.assertIn("id: upload_artifact", upload) + self.assertIn("steps.stage_artifact.outcome == 'success'", upload) + cleanup = workflow[workflow.index("- name: Cleanup isolated workspace"):] + for step in ( + "sweep_shard", "allocation_cleanup", "artifact_safety", + "delivery_contracts", "stage_artifact", "upload_artifact", + ): + self.assertIn(f"steps.{step}.outcome", cleanup) + self.assertLess( + cleanup.index('cleanup-safe" ]'), + cleanup.index('rm -rf -- "$CX_JOB_ROOT"'), + ) + + def test_v1_publication_requires_explicit_release_markers(self) -> None: + workflows = ROOT.parent.parent / ".github" / "workflows" + sweep = (workflows / "collectivex-sweep.yml").read_text() + self.assertFalse((workflows / "collectivex-publish.yml").exists()) + + self.assertIn("options: [sweep, probe-precision, publish-v1, 
refresh-v1]", sweep) + self.assertIn("collectivex.precision-probe-plan.v1", (ROOT / "tests" / "probe_precision.py").read_text()) + self.assertIn("cxprecision-probes-${{ github.run_id }}-${{ github.run_attempt }}", sweep) + self.assertIn("--validate-bundle", sweep) + self.assertIn("release_tag:", sweep) + self.assertIn("default: unversioned", sweep) + self.assertIn("options: [unversioned, v1]", sweep) + self.assertIn("qualification_index:", sweep) + self.assertIn("inputs.release_tag == 'v1'", sweep) + self.assertIn("collectivex.release-tag.v1", sweep) + self.assertIn("V1 release tag requires the locked full matrix", sweep) + self.assertIn("EXPECTED_MATRIX_SHA256", sweep) + self.assertIn("cxrelease-v1-${{ github.run_id }}-${{ github.run_attempt }}", sweep) + + self.assertIn("publish_run_ids must contain exactly three IDs", sweep) + self.assertIn("source runs do not share one source SHA", sweep) + self.assertIn("cxrelease-v1-$run_id-$attempt/release.json", sweep) + self.assertIn("run $run_id is not tagged for V1 publication", sweep) + self.assertIn("ref: ${{ steps.runs.outputs.source_sha }}", sweep) + self.assertIn("[ \"$attempt\" = 1 ]", sweep) + self.assertIn("cxpublication-v1-${{ github.run_id }}-${{ github.run_attempt }}", sweep) + self.assertIn("refresh source bytes differ from their requested digest", sweep) + self.assertIn("retention-days: 90", sweep) + self.assertNotIn("workflow_run:", sweep) + + def test_source_archive_preserves_only_contained_leaf_symlinks(self) -> None: + selected = "deepep-hybrid-pinned" + other = "deepep-v2-pinned" + + def directory(name: str) -> tarfile.TarInfo: + member = tarfile.TarInfo(name) + member.type = tarfile.DIRTYPE + member.mode = 0o755 + return member + + def regular( + name: str, payload: bytes, mode: int = 0o644 + ) -> tuple[tarfile.TarInfo, io.BytesIO]: + member = tarfile.TarInfo(name) + member.size = len(payload) + member.mode = mode + return member, io.BytesIO(payload) + + def symbolic(name: str, target: str) -> 
tarfile.TarInfo: + member = tarfile.TarInfo(name) + member.type = tarfile.SYMTYPE + member.linkname = target + member.mode = 0o777 + return member + + def write_archive(path: Path, extras: list[tarfile.TarInfo] | None = None) -> None: + root = f".cx_sources/{selected}" + with tarfile.open(path, "w") as archive: + for name in ( + ".cx_sources", root, f"{root}/third-party", + f"{root}/third-party/nccl", f"{root}/third-party/nccl/pkg", + f"{root}/third-party/nccl/pkg/debian", + f".cx_sources/{other}", + ): + archive.addfile(directory(name)) + member, stream = regular( + f"{root}/third-party/nccl/LICENSE.txt", b"license\n" + ) + archive.addfile(member, stream) + member, stream = regular(f".cx_sources/{other}/sentinel", b"other\n") + archive.addfile(member, stream) + member, stream = regular(f"{root}/group-executable", b"exec\n", 0o010) + archive.addfile(member, stream) + archive.addfile(symbolic( + f"{root}/third-party/nccl/pkg/debian/copyright", + "../../LICENSE.txt", + )) + for member in extras or []: + archive.addfile(member) + path.chmod(0o600) + + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + archive = root / "source.tar" + destination = root / "destination" + destination.mkdir(mode=0o700) + write_archive(archive) + source_archive.extract_source_archive(archive, destination, selected) + link = ( + destination / ".cx_sources" / selected / "third-party" / "nccl" + / "pkg" / "debian" / "copyright" + ) + self.assertTrue(link.is_symlink()) + self.assertEqual(os.readlink(link), "../../LICENSE.txt") + self.assertEqual(link.read_text(), "license\n") + self.assertFalse((destination / ".cx_sources" / other).exists()) + extracted = destination / ".cx_sources" / selected + self.assertEqual( + stat.S_IMODE((extracted / "group-executable").stat().st_mode), 0o700 + ) + self.assertEqual( + stat.S_IMODE( + (extracted / "third-party" / "nccl" / "LICENSE.txt").stat().st_mode + ), + 0o600, + ) + + invalid: dict[str, list[tarfile.TarInfo]] = { + 
"absolute member": [directory("/outside")], + "traversal member": [directory(".cx_sources/../outside")], + "duplicate member": [directory(f".cx_sources/{selected}")], + "absolute link": [symbolic(f".cx_sources/{selected}/absolute", "/tmp/x")], + "escaping link": [symbolic(f".cx_sources/{selected}/escape", "../x")], + "cross-root link": [ + symbolic(f".cx_sources/{selected}/cross", f"../{other}/sentinel") + ], + "missing target": [symbolic(f".cx_sources/{selected}/missing", "none")], + } + hardlink = tarfile.TarInfo(f".cx_sources/{selected}/hard") + hardlink.type = tarfile.LNKTYPE + hardlink.linkname = f".cx_sources/{selected}/third-party/nccl/LICENSE.txt" + invalid["hardlink"] = [hardlink] + fifo = tarfile.TarInfo(f".cx_sources/{selected}/fifo") + fifo.type = tarfile.FIFOTYPE + invalid["fifo"] = [fifo] + character = tarfile.TarInfo(f".cx_sources/{selected}/character") + character.type = tarfile.CHRTYPE + invalid["character device"] = [character] + block = tarfile.TarInfo(f".cx_sources/{selected}/block") + block.type = tarfile.BLKTYPE + invalid["block device"] = [block] + unknown = tarfile.TarInfo(f".cx_sources/{selected}/unknown") + unknown.type = b"Z" + invalid["unknown type"] = [unknown] + invalid["unsafe unselected root"] = [ + symbolic(f".cx_sources/{other}/escape", f"../{selected}/group-executable") + ] + chain_target = symbolic( + f".cx_sources/{selected}/chain-target", "third-party/nccl/LICENSE.txt" + ) + invalid["symlink chain"] = [ + chain_target, symbolic(f".cx_sources/{selected}/chain", "chain-target") + ] + linked_child = tarfile.TarInfo(f".cx_sources/{selected}/linked-file/child") + invalid["symlink parent"] = [ + symbolic( + f".cx_sources/{selected}/linked-file", + "third-party/nccl/LICENSE.txt", + ), + linked_child, + ] + for label, extras in invalid.items(): + with self.subTest(label=label), tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + archive = root / "source.tar" + destination = root / "destination" + 
destination.mkdir(mode=0o700) + write_archive(archive, extras) + with self.assertRaises(source_archive.SourceArchiveError): + source_archive.extract_source_archive(archive, destination, selected) + self.assertFalse((destination / ".cx_sources").exists()) + + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + archive = root / "source.tar" + destination = root / "destination" + destination.mkdir(mode=0o700) + existing = destination / ".cx_sources" + existing.mkdir(mode=0o700) + marker = existing / "marker" + marker.write_text("existing\n") + write_archive(archive) + with self.assertRaises(source_archive.SourceArchiveError): + source_archive.extract_source_archive(archive, destination, selected) + self.assertEqual(marker.read_text(), "existing\n") + + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + archive = root / "source.tar" + write_archive(archive) + real_destination = root / "real-destination" + real_destination.mkdir(mode=0o700) + linked_destination = root / "linked-destination" + linked_destination.symlink_to(real_destination, target_is_directory=True) + with self.assertRaises((OSError, source_archive.SourceArchiveError)): + source_archive.extract_source_archive(archive, linked_destination, selected) + self.assertFalse((real_destination / ".cx_sources").exists()) + + unsafe_destination = root / "unsafe-destination" + unsafe_destination.mkdir(mode=0o700) + unsafe_destination.chmod(0o755) + with self.assertRaises(source_archive.SourceArchiveError): + source_archive.extract_source_archive(archive, unsafe_destination, selected) + self.assertFalse((unsafe_destination / ".cx_sources").exists()) + + for limit, value in ( + ("MAX_ARCHIVE_MEMBERS", 1), + ("MAX_MEMBER_BYTES", 1), + ("MAX_EXPANDED_BYTES", 1), + ("MAX_ARCHIVE_BYTES", 1), + ("MAX_ARCHIVE_HEADERS", 1), + ): + with self.subTest(limit=limit), tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + archive = root / 
"source.tar" + destination = root / "destination" + destination.mkdir(mode=0o700) + write_archive(archive) + with mock.patch.object(source_archive, limit, value): + with self.assertRaises(source_archive.SourceArchiveError): + source_archive.extract_source_archive(archive, destination, selected) + self.assertFalse((destination / ".cx_sources").exists()) + + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + archive = root / "source.tar" + destination = root / "destination" + destination.mkdir(mode=0o700) + write_archive(archive) + long_name = f".cx_sources/{selected}/long-name-result\0".encode() + with tarfile.open(archive, "a") as handle: + for _ in range(3): + extension = tarfile.TarInfo("././@LongLink") + extension.type = tarfile.GNUTYPE_LONGNAME + extension.size = len(long_name) + handle.addfile(extension, io.BytesIO(long_name)) + member, stream = regular("placeholder", b"payload\n") + handle.addfile(member, stream) + archive.chmod(0o600) + for limit, value in ( + ("MAX_EXTENSION_CHAIN", 1), + ("MAX_EXTENSION_MEMBER_BYTES", 1), + ("MAX_EXTENSION_BYTES", len(long_name) * 2), + ): + with self.subTest(limit=limit), mock.patch.object( + source_archive, limit, value + ): + with self.assertRaises(source_archive.SourceArchiveError): + source_archive.extract_source_archive( + archive, destination, selected + ) + self.assertFalse((destination / ".cx_sources").exists()) + + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + archive = root / "source.tar" + destination = root / "destination" + destination.mkdir(mode=0o700) + write_archive(archive) + with tarfile.open(archive, "a", format=tarfile.PAX_FORMAT) as handle: + member, stream = regular( + f".cx_sources/{selected}/sparse-v1", b"1\n0\n1\n" + ) + member.pax_headers = { + "GNU.sparse.major": "1", + "GNU.sparse.minor": "0", + "GNU.sparse.realsize": "1", + } + handle.addfile(member, stream) + archive.chmod(0o600) + with 
self.assertRaises(source_archive.SourceArchiveError): + source_archive.extract_source_archive(archive, destination, selected) + self.assertFalse((destination / ".cx_sources").exists()) + + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + archive = root / "source.tar" + destination = root / "destination" + destination.mkdir(mode=0o700) + write_archive(archive) + original_next = tarfile.TarFile.next + + def sparse_next(handle: tarfile.TarFile) -> tarfile.TarInfo | None: + member = original_next(handle) + if member is not None and member.isfile(): + member.sparse = [(0, 1)] + return member + + with mock.patch.object(tarfile.TarFile, "next", sparse_next): + with self.assertRaises(source_archive.SourceArchiveError): + source_archive.extract_source_archive(archive, destination, selected) + self.assertFalse((destination / ".cx_sources").exists()) + + def test_runtime_identity_and_realized_placement_are_behavioral(self) -> None: + self.assertFalse(capability.runtime_identity_issues( + "mi325x", vendor="amd", arch="gfx942", machine="amd64", + device_name="AMD Instinct MI325X", device_count=8, world_size=8, + )) + self.assertTrue(capability.runtime_identity_issues( + "mi355x", vendor="amd", arch="gfx942", machine="amd64", + device_name="AMD Instinct MI325X", device_count=8, world_size=8, + )) + records = [("private-a", rank) for rank in range(4)] + [ + ("private-b", rank) for rank in range(4) + ] + self.assertEqual( + run_ep._summarize_realized_placement( + records, expected_nodes=2, expected_gpus_per_node=4, expected_world_size=8 + ), + { + "gpus_per_node": 4, + "nodes": 2, + "ranks_per_node": 4, + "unique_local_ranks": True, + "valid": True, + }, + ) + with self.assertRaises(ValueError): + run_ep._summarize_realized_placement( + records[:-1] + [("private-b", 2)], + expected_nodes=2, + expected_gpus_per_node=4, + expected_world_size=8, + ) + + def test_private_allocation_stratum_is_salted_ordered_and_rank_consistent(self) -> None: + salt 
= "a" * 64 + hosts = ["private-node-b", "private-node-a", "private-node-a"] + selectors = { + "ib_gid_index": "3", + "rdma_devices": "private-hca0:1,private-hca1:1", + "rdma_service_level": "2", + "socket_ifname": "private-if0", + } + digest = run_ep._allocation_stratum_sha256( + hosts, audit_salt=salt, fabric_selectors=selectors, required=True + ) + self.assertRegex(digest or "", r"^[0-9a-f]{64}$") + self.assertEqual( + digest, + run_ep._allocation_stratum_sha256( + list(reversed(hosts)), + audit_salt=salt, + fabric_selectors=selectors, + required=True, + ), + ) + for changed_hosts, changed_salt, changed_selectors in ( + (hosts + ["private-node-c"], salt, selectors), + (hosts, "b" * 64, selectors), + (hosts, salt, {**selectors, "ib_gid_index": "4"}), + ): + self.assertNotEqual( + digest, + run_ep._allocation_stratum_sha256( + changed_hosts, + audit_salt=changed_salt, + fabric_selectors=changed_selectors, + required=True, + ), + ) + serialized = json.dumps({"allocation_stratum_sha256": digest}) + private_literals = [ + salt, + *hosts, + selectors["rdma_devices"], + selectors["socket_ifname"], + ] + self.assertFalse(any(value in serialized for value in private_literals)) + self.assertNotIn("physical_hosts", serialized) + self.assertNotIn("fabric_selectors", serialized) + self.assertEqual( + run_ep._common_allocation_stratum([digest, digest, digest], required=True), + digest, + ) + with self.assertRaisesRegex(ValueError, "differs across distributed ranks"): + run_ep._common_allocation_stratum([digest, "b" * 64], required=True) + with self.assertRaisesRegex(ValueError, "requires"): + run_ep._allocation_stratum_sha256( + hosts, audit_salt=None, fabric_selectors=selectors, required=True + ) + self.assertIsNone(run_ep._allocation_stratum_sha256( + hosts, audit_salt=None, fabric_selectors=selectors, required=False + )) + + def test_collective_version_and_rccl_fingerprint_are_normalized(self) -> None: + self.assertEqual(ep_harness.format_collective_version(23004), 
"2.30.4") + self.assertEqual(ep_harness.format_collective_version(21805), "2.18.5") + self.assertEqual(ep_harness.format_collective_version((2, 21, 5)), "2.21.5") + + properties = types.SimpleNamespace( + multi_processor_count=304, total_memory=1024, warp_size=64 + ) + fake = types.SimpleNamespace( + __version__="2.9.0", + version=types.SimpleNamespace(cuda=None, hip="7.2"), + cuda=types.SimpleNamespace( + get_device_properties=lambda _device: properties, + get_device_name=lambda _device: "AMD Instinct MI325X", + nccl=types.SimpleNamespace(version=lambda: 21805), + ), + ) + with mock.patch.object( + run_ep, "_loaded_collective_version", return_value="2.18.5" + ): + fingerprint = run_ep._runtime_fingerprint( + fake, "device", machine="amd64", vendor="amd", arch="gfx942" + ) + self.assertEqual(fingerprint["collective_library"], {"kind": "rccl", "version": "2.18.5"}) + self.assertEqual(fingerprint["accelerator_runtime"], {"kind": "hip", "version": "7.2"}) + + class FakeCollective: + @staticmethod + def ncclGetVersion(pointer) -> int: + pointer._obj.value = 23004 + return 0 + + maps = "0-1 r-xp 0 00:00 0 /runtime/libnccl.so.2\n" + with ( + mock.patch("builtins.open", return_value=io.StringIO(maps)), + mock.patch.object(run_ep.os.path, "isfile", return_value=True), + mock.patch.object( + run_ep.os.path, "realpath", return_value="/runtime/libnccl.so.2" + ), + mock.patch.object(run_ep.ctypes, "CDLL", return_value=FakeCollective()), + ): + self.assertEqual(run_ep._loaded_collective_version(), "2.30.4") + + path = HERE / "ep_nccl.py" + tree = ast.parse(path.read_text(), str(path)) + helper = next( + node for node in tree.body + if isinstance(node, ast.FunctionDef) and node.name == "_runtime_collective" + ) + namespace = {"re": re} + exec(compile(ast.Module(body=[helper], type_ignores=[]), str(path), "exec"), namespace) + args = types.SimpleNamespace( + runtime_fingerprint={ + "collective_library": {"kind": "nccl", "version": "2.30.4"} + } + ) + cuda = 
types.SimpleNamespace(version=types.SimpleNamespace(hip=None)) + self.assertEqual(namespace["_runtime_collective"](args, cuda), ("nccl", "2.30.4")) + args.runtime_fingerprint["collective_library"]["version"] = None + with self.assertRaisesRegex(RuntimeError, "runtime identity is unavailable"): + namespace["_runtime_collective"](args, cuda) + self.assertNotIn("torch.cuda.nccl.version", path.read_text()) + + def test_workloads_bind_generator_activation_and_trace(self) -> None: + args = ("uniform", 7168, 8, 256, 8, 64, 67) + first = workload.compute_workload_id(*args) + self.assertTrue(identity.is_typed_id(first, "workload")) + self.assertEqual(first, workload.compute_workload_id(*args)) + self.assertNotEqual(first, workload.compute_workload_id(*args[:-1], 68)) + self.assertNotEqual( + first, + workload.compute_workload_id(*args, trace_checksum="a" * 64), + ) + _, _, manifest = workload.build_workload(8, 2, 4, "uniform", 4, 67, 2) + member, checksums, _, _ = workload.canonical_member( + "uniform", 8, 2, 4, 2, 2, 67 + ) + self.assertEqual(member, manifest["workload_id"]) + self.assertEqual(checksums, manifest["checksums"]) + + def test_eplb_calibration_window_is_disjoint_and_identity_bound(self) -> None: + evaluation = workload.canonical_member("zipf", 8, 2, 8, 2, 4, 67) + calibration = workload.canonical_eplb_calibration_member( + "zipf", 8, 2, 8, 2, 4, 67 + ) + self.assertNotEqual(evaluation[0], calibration[0]) + self.assertNotEqual(evaluation[1]["trace"], calibration[1]["trace"]) + self.assertGreater( + workload.EPLB_CALIBRATION_TOKEN_OFFSET, + 2 * 4, + ) + repeated = workload.canonical_eplb_calibration_member( + "zipf", 8, 2, 8, 2, 4, 67 + ) + self.assertEqual(calibration, repeated) + with self.assertRaises(ValueError): + workload.canonical_routing_rows( + 8, 8, 2, "zipf", 67, token_offset=-1 + ) + + def test_canonical_members_are_bound_to_each_scheduled_row(self) -> None: + case = { + "routing": "uniform", "hidden": 8, "topk": 2, "experts": 4, "ep": 2, + "mode": 
"normal", + } + eplb_record = { + "enabled": False, "mapping_hash": None, "num_physical_experts": 4, + } + + def expected( + *, tokens: int = 1, hidden: int = 8 + ) -> tuple[str, dict[str, str], str]: + member, checksums, row_hash, _, _ = contracts._expected_canonical_trace( + "uniform", hidden, 2, 4, 4, 2, tokens, 67, False, 2048 + ) + return member, checksums, row_hash + + member, checksums, row_hash = expected() + rows = [{"tokens_per_rank": 1, "routing": {"hash": row_hash}}] + proof = { + "manifest_checksums": {member: checksums}, + "members": [member], + "workload_id": identity.workload_id({ + "members": [{"checksums": checksums, "workload_id": member}] + }), + } + contracts._validate_canonical_workload(proof, case, rows, eplb_record) + + def replace_member(document: dict, replacement: tuple[str, dict[str, str], str]) -> None: + replacement_id, replacement_checksums, _ = replacement + document["members"] = [replacement_id] + document["manifest_checksums"] = {replacement_id: replacement_checksums} + document["workload_id"] = identity.workload_id({ + "members": [{ + "checksums": replacement_checksums, + "workload_id": replacement_id, + }] + }) + + mutations = { + "wrong member token": lambda document, mutated_rows: replace_member( + document, expected(tokens=2) + ), + "wrong member dimensions": lambda document, mutated_rows: replace_member( + document, expected(hidden=16) + ), + "wrong member checksum": lambda document, mutated_rows: replace_member( + document, + ( + member, + {**checksums, "topk_idx": "0" * 64}, + row_hash, + ), + ), + "row hash unrelated to member": lambda document, mutated_rows: mutated_rows[0][ + "routing" + ].update({"hash": "f" * 64}), + } + for label, mutate in mutations.items(): + with self.subTest(label=label), self.assertRaises(contracts.ContractError): + bad_proof, bad_rows = copy.deepcopy(proof), copy.deepcopy(rows) + mutate(bad_proof, bad_rows) + contracts._validate_canonical_workload( + bad_proof, case, bad_rows, eplb_record + ) + 
+ def test_eplb_row_hash_is_bound_to_the_frozen_remap(self) -> None: + case = { + "routing": "zipf", "hidden": 8, "topk": 2, "experts": 4, "ep": 2, + "mode": "normal", + } + physical = eplb.physical_count(4, 32, 2) + plan = contracts._expected_eplb_plan("zipf", 2, 4, physical, 2, 67, 2048) + eplb_record = { + "enabled": True, + "mapping_hash": eplb.mapping_hash(plan), + "num_physical_experts": physical, + } + member, checksums, row_hash, _, _ = contracts._expected_canonical_trace( + "zipf", 8, 2, 4, physical, 2, 1, 67, True, 2048 + ) + self.assertNotEqual(row_hash, checksums["trace"]) + workload_proof = { + "manifest_checksums": {member: checksums}, + "members": [member], + "workload_id": identity.workload_id({ + "members": [{"checksums": checksums, "workload_id": member}] + }), + } + rows = [{"tokens_per_rank": 1, "routing": {"hash": row_hash}}] + contracts._validate_canonical_workload(workload_proof, case, rows, eplb_record) + with self.assertRaisesRegex(contracts.ContractError, "EPLB mapping"): + contracts._validate_canonical_workload( + workload_proof, case, rows, {**eplb_record, "mapping_hash": "0" * 64} + ) + + def test_oracle_pass_cannot_ignore_combined_value_failure(self) -> None: + oracle = { + "atol": ep_harness.ORACLE_ATOL, + "checks": { + "combine_values": True, + "counts": True, + "metadata": True, + "multiplicity": True, + "payload": True, + "source_set": True, + "weights": True, + }, + "combine_weight_semantics": "unweighted-rank-sum", + "contract": ep_harness.ORACLE_CONTRACT, + "dispatch_sha256": "a" * 64, + "max_absolute_error": 0.0, + "max_elementwise_relative_error": 0.0, + "max_relative_error": 0.0, + "max_weight_error": 0.0, + "order_sha256": "b" * 64, + "ordering_contract": "stable-v1", + "passed": True, + "receive_count": 1, + "rtol": ep_harness.ORACLE_RTOL, + } + contracts._validate_oracle(oracle, "oracle") + weighted = copy.deepcopy(oracle) + weighted["combine_weight_semantics"] = "native-gate-weighted" + with 
self.assertRaisesRegex(contracts.ContractError, "differs from v1"): + contracts._validate_oracle(weighted, "oracle") + tampered = copy.deepcopy(oracle) + tampered["checks"]["combine_values"] = False + with self.assertRaises(contracts.ContractError): + contracts._validate_oracle(tampered, "oracle") + + def test_oracle_stability_canonicalizes_native_receive_order(self) -> None: + source = (HERE / "ep_harness.py").read_text() + begin = source.index("canonical_order = torch.argsort") + canonical = source[begin:source.index("problem.recv_tokens = receive_count", begin)] + self.assertIn("canonical_sources", canonical) + self.assertIn("canonical_ids", canonical) + self.assertIn("canonical_weights", canonical) + self.assertNotIn("_tensor_sha256(source_ids", canonical) + mori = (HERE / "ep_mori.py").read_text() + self.assertIn('"inter-node-v1" if self._inter_node', mori) + self.assertIn('else "async-ll" if self._async_ll', mori) + backend = types.SimpleNamespace(name="mori", kernel_generation="async-ll") + self.assertEqual(ep_harness.kernel_generation(backend), "async-ll") + backend.kernel_generation = "inter-node-v1" + self.assertEqual(ep_harness.kernel_generation(backend), "inter-node-v1") + + def test_terminal_fail_safe_fills_only_missing_shard_cases(self) -> None: + matrix = sweep_matrix.resolve_matrix(backends="all", max_cases=128) + shard = next(item for item in matrix["include"] if item["n"] >= 2) + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + matrix_path = root / "matrix.json" + control_path = root / "control.json" + out_dir = root / "results" + matrix_path.write_text(json.dumps(matrix)) + control = sweep_matrix.extract_shard( + matrix_path, shard["id"], control_path, + sku=shard["sku"], backend=shard["backend"], nodes=shard["nodes"], + ) + control["cases"] = control["cases"][:2] + control["n"] = 2 + control_path.write_text(json.dumps(control)) + first = {key: value for key, value in control["cases"][0].items() if key != "case_id"} + 
git_run = { + "artifact": "artifact", "job": "job", "ref": "collectivex", + "repo": "SemiAnalysisAI/InferenceX", "run_attempt": "1", + "run_id": "123", "source_sha": "a" * 40, + "qualification_index": 1, + } + allocation = { + "artifact": "artifact", "execution_id": "execution", "job": "job", + "repo": "SemiAnalysisAI/InferenceX", "run_attempt": "1", "run_id": "123", + "runner": shard["sku"], "source_sha": "a" * 40, + "qualification_index": 1, + } + out_dir.mkdir() + existing = contracts.make_terminal_document( + allocation_factors=allocation, attempt_ordinal=1, case=first, + case_factors={"case": first, "profile": identity.V1_CASE_PROFILE, "sku": shard["sku"]}, + control_sha256=hashlib.sha256(control_path.read_bytes()).hexdigest(), + failure_mode="setup", generated_at="2026-07-04T00:00:00Z", git_run=git_run, + reason="launcher-setup-failed", return_code=7, source="runtime-emitter", + status="failed", + expected_case_id=control["cases"][0]["case_id"], + ) + (out_dir / "existing.json").write_text(json.dumps(existing)) + (out_dir / "partial.json").write_text(json.dumps({ + "format": contracts.RAW_FORMAT, + "identity": {"case_id": control["cases"][1]["case_id"]}, + "sample_artifact": {"path": "partial.samples.json"}, + })) + (out_dir / "partial.samples.json").write_text("{broken") + environment = { + **os.environ, + "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null", + "CX_SHARD_FILE": str(control_path), + "CX_SHARD_SKU": shard["sku"], + "CX_RUNNER": shard["sku"], + "CX_BENCH": shard["backend"], + "CX_NODES": str(shard["nodes"]), + "COLLECTIVEX_EXECUTION_ID": "execution", + "COLLECTIVEX_ARTIFACT_NAME": "artifact", + "GITHUB_JOB": "job", "GITHUB_REF_NAME": "collectivex", + "GITHUB_REPOSITORY": "SemiAnalysisAI/InferenceX", + "GITHUB_RUN_ATTEMPT": "1", "GITHUB_RUN_ID": "123", + "GITHUB_SHA": "a" * 40, + "CX_QUALIFICATION_INDEX": "1", + } + subprocess.run( + [ + "bash", "-c", + 'source "$1"; cx_emit_setup_failures "$2" "$3" "$4" 7', + "_", str(ROOT / "runtime" / "common.sh"), 
str(ROOT), + str(out_dir), shard["backend"], + ], + check=True, + env=environment, + ) + attempts = [contracts.strict_load(path) for path in out_dir.glob("*.json")] + self.assertEqual(len(attempts), 2) + self.assertEqual( + contracts.validate_attempt_paths([str(path) for path in out_dir.glob("*.json")]), + 2, + ) + delivery = [str(path) for path in out_dir.glob("*.json")] + self.assertEqual(contracts.validate_delivery(delivery, str(control_path)), 2) + with self.assertRaises(contracts.ContractError): + contracts.validate_delivery(delivery[:1], str(control_path)) + self.assertEqual( + {attempt["identity"]["case_id"] for attempt in attempts}, + {case["case_id"] for case in control["cases"]}, + ) + self.assertTrue((out_dir / "partial.json.quarantine").is_file()) + self.assertTrue((out_dir / "partial.samples.json.quarantine").is_file()) + + preallocation = root / "preallocation" + preallocation_results = preallocation / "experimental" / "CollectiveX" / "results" + preallocation_results.mkdir(parents=True) + failed = subprocess.run( + [ + "bash", "-c", + 'source "$1"; REPO_ROOT="$2"; export REPO_ROOT; ' + 'cx_install_launcher_fail_safe; cx_load_operator_config', + "_", str(ROOT / "runtime" / "common.sh"), str(preallocation), + ], + env={**environment, "COLLECTIVEX_OPERATOR_CONFIG_REQUIRED": "1"}, + ) + self.assertNotEqual(failed.returncode, 0) + preallocation_attempts = [ + contracts.validate_terminal_document(contracts.strict_load(path)) + for path in preallocation_results.glob("*.json") + ] + self.assertEqual( + {attempt["identity"]["case_id"] for attempt in preallocation_attempts}, + {case["case_id"] for case in control["cases"]}, + ) + + def test_runtime_identity_mismatch_is_failed_not_unsupported(self) -> None: + wrapper = next( + item for item in sweep_matrix.resolve_matrix()["requested_cases"] + if item["disposition"] == "runnable" + ) + case = wrapper["case"] + environment = { + "CX_RUNNER": wrapper["sku"], "CX_CASE_ID": case["case_id"], + "CX_SUITE": 
case["suite"], "CX_WORKLOAD_NAME": case["workload"], + "CX_REQUIRED_PUBLICATION": case["required_publication"], + "CX_ROUTING": case["routing"], "CX_EPLB": "1" if case["eplb"] else "", + "CX_EP": str(case["ep"]), "CX_NGPUS": str(case["ep"]), + "CX_HIDDEN": str(case["hidden"]), "CX_TOPK": str(case["topk"]), + "CX_EXPERTS": str(case["experts"]), "CX_NODES": str(case["nodes"]), + "CX_GPUS_PER_NODE": str(case["gpus_per_node"]), + "CX_SCALE_UP_DOMAIN": str(case["scale_up_domain"]), + "CX_MODE": case["mode"], "CX_SCOPE": case["scope"], + "CX_TOPO": case["topology_class"], "CX_TRANSPORT": case["transport"], + "CX_SCALE_UP_TRANSPORT": case["scale_up_transport"], + "CX_SCALE_OUT_TRANSPORT": case["scale_out_transport"] or "", + "CX_TOKENS_LADDER": case["ladder"], "CX_CANONICAL": "1", + "CX_ITERS": "8", "CX_TRIALS": "64", "CX_WARMUP": "32", + "CX_SAMPLES_PER_POINT": "512", "GITHUB_RUN_ID": "123", + "GITHUB_RUN_ATTEMPT": "1", "GITHUB_REF_NAME": "collectivex", + "GITHUB_SHA": "a" * 40, "GITHUB_REPOSITORY": "SemiAnalysisAI/InferenceX", + "GITHUB_JOB": "sweep", "COLLECTIVEX_ARTIFACT_NAME": "artifact", + "COLLECTIVEX_EXECUTION_ID": "execution", + } + with mock.patch.dict(os.environ, environment, clear=False): + terminal = contracts.make_terminal_from_environment( + backend=case["backend"], phase=case["phase"], return_code=5 + ) + self.assertEqual(terminal["identity"]["case_id"], case["case_id"]) + self.assertEqual( + terminal["outcome"], + { + "failure_mode": "runtime-identity", + "reason": "runtime-identity-mismatch", + "return_code": 5, + "status": "failed", + }, + ) + for mode, reason in contracts.RUNTIME_FAILURE_REASONS.items(): + with self.subTest(mode=mode), mock.patch.dict(os.environ, environment, clear=False): + staged = contracts.make_terminal_from_environment( + backend=case["backend"], phase=case["phase"], return_code=1, + failure_mode=mode, + ) + self.assertEqual(staged["outcome"]["reason"], reason) + mismatched = copy.deepcopy(staged) + mismatched["outcome"]["reason"] 
= "distributed-command-failed" + if reason == "distributed-command-failed": + mismatched["outcome"]["reason"] = "backend-setup-failed" + with self.assertRaisesRegex( + contracts.ContractError, "source and outcome are not registered" + ): + contracts.validate_terminal_document(mismatched) + with mock.patch.dict(os.environ, environment, clear=False): + with self.assertRaisesRegex( + contracts.ContractError, "runtime failure mode is not registered" + ) as raised: + contracts.make_terminal_from_environment( + backend=case["backend"], phase=case["phase"], return_code=1, + failure_mode="raw-private-error", + ) + self.assertNotIn("raw-private-error", str(raised.exception)) + with mock.patch.dict(os.environ, environment, clear=False): + generic = contracts.make_terminal_from_environment( + backend=case["backend"], phase=case["phase"], return_code=6, + ) + self.assertEqual( + generic["outcome"], + { + "failure_mode": "execution", + "reason": "distributed-command-failed", + "return_code": 6, + "status": "failed", + }, + ) + manual_environment = { + "CX_RUNNER": "manual-runner", + "COLLECTIVEX_EXECUTION_ID": "manual-execution", + } + with mock.patch.dict(os.environ, manual_environment, clear=True): + manual = contracts.make_terminal_from_environment( + backend="nccl-ep", phase="decode", return_code=6, + ) + self.assertIsNone(manual["provenance"]["git_run"]) + self.assertEqual( + { + field: manual["case"][field] + for field in ("suite", "workload", "canonical", "required_publication") + }, + { + "suite": "manual", "workload": "manual", "canonical": False, + "required_publication": "diagnostic", + }, + ) + self.assertEqual( + manual["identity"]["allocation_factors"], + { + "artifact": None, "execution_id": "manual-execution", "job": None, + "qualification_index": 1, "repo": None, + "run_attempt": None, "run_id": None, + "runner": "manual-runner", "source_sha": None, + }, + ) + broken = copy.deepcopy(manual) + broken["identity"]["allocation_factors"]["artifact"] = 
"forged-artifact" + allocation_id = identity.allocation_id( + broken["identity"]["allocation_factors"] + ) + broken["identity"]["allocation_id"] = allocation_id + broken["identity"]["attempt_id"] = identity.attempt_id( + allocation=allocation_id, + case=broken["identity"]["case_id"], + ordinal=broken["identity"]["attempt_ordinal"], + ) + with self.assertRaisesRegex( + contracts.ContractError, "allocation factors differ" + ): + contracts.validate_terminal_document(broken) + + def test_launchers_use_private_logs_and_allowlisted_failure_stages(self) -> None: + expected = { + "launch_single-slurm.sh": { + "setup", "registry-verification", "container-import", "container-hash", + "repository-stage", "scheduler-allocation", "container-launch", + "artifact-collection", + }, + "launch_gb-nv.sh": { + "setup", "registry-verification", "container-import", "container-hash", + "repository-stage", "scheduler-allocation", "container-launch", "backend-setup", + "execution", "artifact-collection", + }, + "launch_mi-amds.sh": { + "setup", "repository-stage", "registry-verification", "scheduler-allocation", + "container-import", "container-hash", "container-launch", "artifact-collection", + }, + } + common = (ROOT / "runtime" / "common.sh").read_text() + for name, stages in expected.items(): + launcher = (ROOT / "launchers" / name).read_text() + stage_source = launcher + common if name == "launch_gb-nv.sh" else launcher + self.assertNotIn("--export=ALL", launcher) + if name == "launch_gb-nv.sh": + self.assertIn("cx_run_distributed_shard", launcher) + else: + self.assertIn("cx_container_exports", launcher) + self.assertIn("collect_rc=0", launcher) + for stage in stages: + with self.subTest(launcher=name, stage=stage): + self.assertIn(f"cx_set_failure_stage {stage}", stage_source) + amd = (ROOT / "launchers" / "launch_mi-amds.sh").read_text() + self.assertIn("cx_ensure_squash_on_job", amd) + self.assertIn("cx_fail_stage container-hash", amd) + self.assertNotIn('cat "$import_log"', amd) 
+ self.assertIn('bash -s -- "$sq" "$lock" "$image"', common) + self.assertIn("> \"$log\" 2>&1 <<'BASH'", common) + self.assertIn("cx_fail_stage container-import", common) + runtime = (ROOT / "runtime" / "run_in_container.sh").read_text() + export_start = common.index("\ncx_container_exports() {") + exports = common[export_start:common.index("\n}", export_start)] + export_names = { + name + for payload in re.findall(r"printf '%s' '([^']*)'", exports) + for name in payload.split(",") if name + } + for private_name in ( + "COLLECTIVEX_OPERATOR_CONFIG", "GITHUB_TOKEN", "GITHUB_WORKSPACE", "HOME", + "CX_PARTITION", "CX_ACCOUNT", "CX_SQUASH_DIR", "CX_STAGE_DIR", + ): + self.assertNotIn(private_name, export_names) + self.assertIn("CX_BACKEND_CACHE_ROOT", export_names) + self.assertIn("CX_BACKEND_CACHE_SENTINEL_SHA256", export_names) + self.assertNotIn("CX_PREPARED_BACKEND_CACHE", export_names) + self.assertIn("MORI_COMMIT", export_names) + self.assertIn("cx_write_runtime_stage backend-setup", runtime) + self.assertIn("cx_write_runtime_stage execution", runtime) + distributed = common[common.index("cx_run_distributed_shard()") :] + self.assertIn("cx_private_log_path shard-summary", distributed) + self.assertIn("cx_fail_stage execution", distributed) + self.assertIn('cx_fail_stage execution "$runtime_log"', distributed) + + def test_case_failure_diagnostic_precedes_normal_srun_footer(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + log = Path(temporary) / "runtime.log" + log.write_text( + "WARN: deepep decode run failed rc=1 (CX_RUN_TIMEOUT=900s)\n" + "SHARD done: 6/6 case(s) failed\n" + "srun: error: task exited 1\n" + ) + result = subprocess.run( + [ + "bash", "-c", + 'source "$1"; cx_fail_stage execution "$2"', + "_", str(ROOT / "runtime" / "common.sh"), str(log), + ], + text=True, + capture_output=True, + ) + self.assertEqual(result.returncode, 1) + self.assertIn("diagnostic=benchmark-case-failure", result.stderr) + + def 
test_non_timeout_failure_warning_is_classified_as_case_failure(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + log = Path(temporary) / "runtime.log" + log.write_text("WARN: deepep decode run failed rc=1\nsrun: task exited 1\n") + result = subprocess.run( + [ + "bash", "-c", + 'source "$1"; cx_fail_stage execution "$2"', + "_", str(ROOT / "runtime" / "common.sh"), str(log), + ], + text=True, + capture_output=True, + ) + self.assertEqual(result.returncode, 1) + self.assertNotIn("diagnostic=network-or-timeout", result.stderr) + self.assertIn("diagnostic=benchmark-case-failure", result.stderr) + + def test_private_runtime_failure_signatures_override_case_footer(self) -> None: + signatures = { + "DeepEP V2 no-GIN run is outside one realized LSA domain": + "accelerator-topology", + "NCCL exception (/src/nccl.cu:111): 3": "accelerator-topology", + "NCCL exception (/src/nccl.cu:112): 3": "accelerator-topology", + "CUDA error: call requires newer driver": "accelerator-driver", + "NCCL failure in ncclCommWindowRegister": "nccl-device-api", + "Communicator does not support symmetric memory": "nccl-device-api", + "NCCL exception (/src/nccl.cu:106): 5": "nccl-device-api", + "NCCL exception (/src/nccl.cu:127): 5": "nccl-device-api", + "NCCL exception (/src/nccl.cu:128): 5": "nccl-device-api", + "NCCL exception (/src/nccl.cu:129): 5": "nccl-device-api", + "NCCL exception (/src/nccl.cu:135): 5": "nccl-device-api", + "NVCC compilation failed": "jit-toolchain", + "CUDA out of memory": "accelerator-memory", + "torch rendezvous timed out": "network-or-timeout", + } + with tempfile.TemporaryDirectory() as temporary: + log = Path(temporary) / "runtime.log" + for signature, diagnostic in signatures.items(): + log.write_text(f"{signature}\nSHARD done: 6/6 case(s) failed\n") + result = subprocess.run( + [ + "bash", "-c", + 'source "$1"; cx_fail_stage execution "$2"', + "_", str(ROOT / "runtime" / "common.sh"), str(log), + ], + text=True, + capture_output=True, + 
env={**os.environ, "CX_BENCH": "deepep-v2"}, + ) + self.assertEqual(result.returncode, 1) + self.assertIn(f"diagnostic={diagnostic}", result.stderr) + + log.write_text( + "NCCL exception (/src/nccl.cu:106): 5\n" + "SHARD done: 6/6 case(s) failed\n" + ) + result = subprocess.run( + [ + "bash", "-c", 'source "$1"; cx_fail_stage execution "$2"', + "_", str(ROOT / "runtime" / "common.sh"), str(log), + ], + text=True, capture_output=True, + env={**os.environ, "CX_BENCH": "deepep"}, + ) + self.assertIn("diagnostic=benchmark-case-failure", result.stderr) + + def test_runtime_stage_marker_distinguishes_launch_from_execution(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + mount = Path(temporary) + root = mount / "experimental" / "CollectiveX" + root.mkdir(parents=True) + command = r''' + set -euo pipefail + source "$1" + export COLLECTIVEX_EXECUTION_ID=test_1_shard CX_TS=test + cx_set_failure_stage container-launch + cx_prepare_runtime_marker "$2" + (cd "$2/experimental/CollectiveX"; cx_write_runtime_stage backend-setup) + cx_adopt_runtime_stage "$2" + test "$CX_FAILSAFE_MODE" = backend-setup + (cd "$2/experimental/CollectiveX"; cx_write_runtime_stage execution) + cx_adopt_runtime_stage "$2" + test "$CX_FAILSAFE_MODE" = execution + ''' + subprocess.run( + ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh"), + str(mount)], + check=True, + ) + + def test_canonical_gha_environment_is_locked_but_manual_overrides_survive(self) -> None: + common = ROOT / "runtime" / "common.sh" + command = r''' + set -euo pipefail + source "$1" + export COLLECTIVEX_CANONICAL_GHA=1 GITHUB_ACTIONS=true + export GITHUB_RUN_ID=123 GITHUB_RUN_ATTEMPT=1 + export COLLECTIVEX_SOURCE_SHA=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + export CX_SHARD_FILE=.shards/test.json CX_SHARD_SKU=mi325x + export CX_NODES=1 CX_GPUS_PER_NODE=8 + export CX_IMAGE=untrusted CX_IMAGE_DIGEST=untrusted CX_NGPUS=99 + export CX_NCCL_HOME=/untrusted CX_LOCK_DIR=/tmp CX_SQUASH_DIR=/shared/containers 
+ export CX_STAGE_DIR=/private/stale-stage + export CX_MORI_KERNEL_TYPE=intranode MORI_ENABLE_SDMA=0 + export NCCL_MNNVL_ENABLE=1 MC_FORCE_MNNVL=1 CX_DRYRUN=1 + export CX_BACKEND_CACHE_ROOT=/untrusted CX_BACKEND_CACHE_SENTINEL_SHA256=bad + export CX_PREPARED_BACKEND_CACHE=/untrusted CX_BACKEND_SOURCE_ROOT=/untrusted + ! (cx_lock_canonical_gha_env mi325x) + export COLLECTIVEX_OPERATOR_CONFIG_LOADED=$$ + export CX_STAGE_DIR="$GITHUB_WORKSPACE" CX_AUDIT_SALT="$(printf 'a%.0s' {1..64})" + unset CX_LOCK_DIR + cx_lock_canonical_gha_env mi325x + test "$CX_IMAGE" = "$CX_IMAGE_AMD_MORI_MI325" + test "$CX_IMAGE_DIGEST" = "$CX_IMAGE_AMD_MORI_MI325_DIGEST" + test "$CX_NGPUS:$CX_SEED:$CX_RUN_TIMEOUT" = 8:67:1800 + test "$CX_MORI_KERNEL_TYPE:$MORI_DISABLE_AUTO_XGMI:$MORI_ENABLE_SDMA" = asyncll:0:1 + test "$MORI_COMMIT" = "$CX_MORI_COMMIT_MI325" + test "$MORI_APP_LOG_LEVEL:$MORI_SHMEM_LOG_LEVEL:$MORI_IO_LOG_LEVEL" = info:info:info + test "$CX_STAGE_DIR" = "$GITHUB_WORKSPACE" + test -z "${CX_NCCL_HOME+x}${CX_LOCK_DIR+x}${NCCL_MNNVL_ENABLE+x}${MC_FORCE_MNNVL+x}" + test -z "${CX_BACKEND_CACHE_ROOT+x}${CX_BACKEND_CACHE_SENTINEL_SHA256+x}" + test -z "${CX_PREPARED_BACKEND_CACHE+x}${CX_BACKEND_SOURCE_ROOT+x}" + test -z "${CX_DRYRUN+x}" + + export CX_STAGE_DIR=/shared/gb-stage + export CX_SHARD_SKU=gb300 CX_NODES=2 CX_GPUS_PER_NODE=4 + export CX_IMAGE=untrusted CX_NGPUS=1 CX_MORI_KERNEL_TYPE=untrusted + export MORI_ENABLE_SDMA=0 CX_NCCL_HOME=/untrusted CX_MASTER_PORT=1 + cx_lock_canonical_gha_env gb300 + test "$CX_IMAGE" = "$CX_IMAGE_MULTIARCH" + test "$CX_IMAGE_DIGEST" = "$CX_IMAGE_MULTIARCH_DIGEST" + test "$CX_NGPUS:$CX_SEED:$CX_RUN_TIMEOUT" = 8:67:900 + test "$CX_NCCL_HOME:$CX_MASTER_PORT" = /usr:29551 + test "$CX_STAGE_DIR" = /shared/gb-stage + test -z "${CX_MORI_KERNEL_TYPE+x}${MORI_ENABLE_SDMA+x}" + + export COLLECTIVEX_OPERATOR_CONFIG_LOADED=$$ + export CX_SHARD_SKU=mi355x CX_NODES=1 CX_GPUS_PER_NODE=8 + export CX_LOCK_DIR=/validated/amd-locks CX_STAGE_DIR=/validated/amd-stage + 
export CX_AUDIT_SALT="$(printf 'a%.0s' {1..64})" + cx_lock_canonical_gha_env mi355x + test "$CX_LOCK_DIR" = /validated/amd-locks + test "$CX_STAGE_DIR" = /validated/amd-stage + test "$MORI_COMMIT" = "$CX_MORI_COMMIT_MI355" + + unset COLLECTIVEX_CANONICAL_GHA + unset COLLECTIVEX_OPERATOR_CONFIG_LOADED + CX_IMAGE=manual CX_IMAGE_DIGEST=manual CX_NGPUS=3 + CX_MORI_KERNEL_TYPE=manual + cx_lock_canonical_gha_env mi355x + test "$CX_IMAGE:$CX_IMAGE_DIGEST:$CX_NGPUS:$CX_MORI_KERNEL_TYPE" = manual:manual:3:manual + ''' + with tempfile.TemporaryDirectory(dir=Path.home()) as workspace: + Path(workspace).chmod(0o720) + subprocess.run( + ["bash", "-c", command, "_", str(common)], + check=True, + env={ + **os.environ, + "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null", + "GITHUB_WORKSPACE": workspace, + }, + ) + self.assertEqual(list(Path(workspace).iterdir()), []) + + def test_canonical_amd_stage_uses_config_not_world_writable_workspace(self) -> None: + common = ROOT / "runtime" / "common.sh" + command = r''' + source "$1" + export COLLECTIVEX_CANONICAL_GHA=1 GITHUB_ACTIONS=true + export GITHUB_RUN_ID=123 GITHUB_RUN_ATTEMPT=1 + export COLLECTIVEX_SOURCE_SHA=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + export CX_SHARD_FILE=.shards/test.json CX_SHARD_SKU=mi325x + export CX_NODES=1 CX_GPUS_PER_NODE=8 CX_SQUASH_DIR=/shared/containers + export COLLECTIVEX_OPERATOR_CONFIG_LOADED=$$ CX_STAGE_DIR=/shared/amd-stage + export CX_AUDIT_SALT="$(printf 'a%.0s' {1..64})" + cx_lock_canonical_gha_env mi325x + printf '%s' "$CX_STAGE_DIR" + ''' + with tempfile.TemporaryDirectory(dir=Path.home()) as workspace: + Path(workspace).chmod(0o702) + result = subprocess.run( + ["bash", "-c", command, "_", str(common)], + text=True, + capture_output=True, + env={ + **os.environ, + "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null", + "GITHUB_WORKSPACE": workspace, + }, + ) + self.assertEqual(result.returncode, 0, result.stderr) + self.assertEqual(result.stdout, "/shared/amd-stage") + self.assertNotIn(workspace, 
result.stderr) + + def test_canonical_amd_stage_uses_config_not_symlinked_workspace(self) -> None: + common = ROOT / "runtime" / "common.sh" + command = r''' + source "$1" + export COLLECTIVEX_CANONICAL_GHA=1 GITHUB_ACTIONS=true + export GITHUB_RUN_ID=123 GITHUB_RUN_ATTEMPT=1 + export COLLECTIVEX_SOURCE_SHA=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + export CX_SHARD_FILE=.shards/test.json CX_SHARD_SKU=mi325x + export CX_NODES=1 CX_GPUS_PER_NODE=8 CX_SQUASH_DIR=/shared/containers + export COLLECTIVEX_OPERATOR_CONFIG_LOADED=$$ CX_STAGE_DIR=/shared/amd-stage + export CX_AUDIT_SALT="$(printf 'a%.0s' {1..64})" + cx_lock_canonical_gha_env mi325x + printf '%s' "$CX_STAGE_DIR" + ''' + with tempfile.TemporaryDirectory(dir=Path.home()) as temporary: + root = Path(temporary) + real = root / "real" + real.mkdir() + link = root / "workspace" + link.symlink_to(real, target_is_directory=True) + result = subprocess.run( + ["bash", "-c", command, "_", str(common)], + text=True, + capture_output=True, + env={ + **os.environ, + "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null", + "GITHUB_WORKSPACE": str(link), + }, + ) + self.assertEqual(result.returncode, 0, result.stderr) + self.assertEqual(result.stdout, "/shared/amd-stage") + self.assertNotIn(str(root), result.stderr) + + def test_image_selection_and_registry_verification_are_fail_closed(self) -> None: + common = ROOT / "runtime" / "common.sh" + command = r''' + source "$1" + test "$(cx_default_image mi325x)" = "$CX_IMAGE_AMD_MORI_MI325" + test "$(cx_default_image mi355x)" = "$CX_IMAGE_AMD_MORI" + pinned="sha256:$(printf 'a%.0s' {1..64})" + curl() { + case "$*" in + *auth.docker.io*) printf '{"token":"test"}' ;; + *) printf 'Docker-Content-Digest: %s\r\n' "$pinned" ;; + esac + } + test "$(cx_resolve_registry_digest ubuntu:latest)" = "$pinned" + test "$(cx_resolve_registry_digest docker.io/library/ubuntu:latest)" = "$pinned" + ! (cx_resolve_registry_digest "ubuntu@$pinned") + ! (cx_resolve_registry_digest ghcr.io/example/image:tag) + ! 
(cx_resolve_registry_digest 'ubuntu@sha256:bad') + curl() { + case "$*" in *auth.docker.io*) printf '{"token":"test"}';; esac + } + ! (cx_resolve_registry_digest ubuntu:latest) + cx_resolve_registry_digest() { printf '%s' "$CX_IMAGE_MULTIARCH_DIGEST"; } + cx_verify_registry_image "$CX_IMAGE_MULTIARCH" + test "$COLLECTIVEX_IMAGE_DIGEST_VERIFIED" = 1 + test "$COLLECTIVEX_IMAGE_DIGEST" = "$CX_IMAGE_MULTIARCH_DIGEST" + cx_reverify_registry_image "$CX_IMAGE_MULTIARCH" + cx_resolve_registry_digest() { printf 'sha256:%064d' 0; } + ! (cx_reverify_registry_image "$CX_IMAGE_MULTIARCH") + ! (cx_verify_registry_image "$CX_IMAGE_MULTIARCH") + ''' + subprocess.run( + ["bash", "-c", command, "_", str(common)], + check=True, + env={**os.environ, "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null"}, + ) + + def test_canonical_gha_requires_compute_visible_staging(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + repo = root / "repo" + squash = root / "squash" + source = repo / "experimental" / "CollectiveX" + source.mkdir(parents=True) + squash.mkdir() + (source / "public.py").write_text("public\n") + (source / "private-infra.md").write_text("private\n") + command = r''' + set -euo pipefail + source "$1" + unset CX_SHARD_FILE CX_STAGE_DIR + ! (COLLECTIVEX_CANONICAL_GHA=1; cx_stage_path "$2" "") + staged="$(COLLECTIVEX_CANONICAL_GHA=0; cx_stage_path "$2" "")" + cx_stage_repo "$2" "$staged" + test "$staged" != "$2" + test -f "$staged/experimental/CollectiveX/public.py" + test ! -e "$staged/experimental/CollectiveX/private-infra.md" + cx_cleanup_stage "$staged" "$2" + test ! 
-e "$staged" + ''' + subprocess.run( + ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh"), + str(repo)], + check=True, + env={ + **os.environ, + "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null", + "CX_SQUASH_DIR": str(squash), + }, + ) + self.assertEqual(list(squash.iterdir()), []) + + def test_manual_stage_does_not_write_to_checkout_parent(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + parent = Path(temporary).resolve() / "readonly-parent" + repo = parent / "repo" + squash = parent / "squash" + source = repo / "experimental" / "CollectiveX" + source.mkdir(parents=True) + squash.mkdir(mode=0o700) + (source / "public.py").write_text("public\n") + original_mode = parent.stat().st_mode & 0o777 + parent.chmod(0o555) + try: + command = r''' + set -euo pipefail + source "$1" + unset CX_STAGE_DIR + staged="$(cx_stage_path "$2" "")" + cx_stage_repo "$2" "$staged" + case "$staged" in "$3"/.collectivex-stage-*) ;; *) exit 1 ;; esac + test -f "$staged/experimental/CollectiveX/public.py" + test ! -e "$4/.collectivex-stage" + cx_cleanup_stage "$staged" "$2" + test ! 
-e "$staged" + ''' + subprocess.run( + [ + "bash", "-c", command, "_", + str(ROOT / "runtime" / "common.sh"), str(repo), + str(squash), str(parent), + ], + check=True, + env={ + **os.environ, + "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null", + "CX_SQUASH_DIR": str(squash), + }, + ) + finally: + parent.chmod(original_mode) + self.assertEqual( + sorted(path.name for path in parent.iterdir()), + ["repo", "squash"], + ) + self.assertEqual(list(squash.iterdir()), []) + + def test_stage_refuses_to_reuse_an_execution_child(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + repo = root / "repo" + source = repo / "experimental" / "CollectiveX" + source.mkdir(parents=True) + (source / "public.py").write_text("public\n") + base = root / "stage" + child = base / "job_collision" + child.mkdir(parents=True, mode=0o700) + sentinel = child / "keep" + sentinel.write_text("keep") + command = r''' + source "$1" + ! (cx_stage_repo "$2" "$3/job_collision") + ''' + subprocess.run( + [ + "bash", "-c", command, "_", + str(ROOT / "runtime" / "common.sh"), str(repo), str(base), + ], + check=True, + env={ + **os.environ, + "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null", + "COLLECTIVEX_CANONICAL_GHA": "1", + "COLLECTIVEX_EXECUTION_ID": "collision", + "CX_STAGE_DIR": str(base), + }, + ) + self.assertEqual(sentinel.read_text(), "keep") + + def test_stage_removes_its_execution_child_when_rsync_fails(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + repo = root / "repo" + source = repo / "experimental" / "CollectiveX" + source.mkdir(parents=True) + (source / "public.py").write_text("public\n") + base = root / "stage" + base.mkdir(mode=0o700) + sentinel = root / "rsync-called" + command = r''' + source "$1" + rsync() { : > "$RSYNC_CALLED"; return 1; } + staged="$(cx_stage_path "$2" "$3")" + ! 
cx_stage_repo "$2" "$staged" + ''' + subprocess.run( + [ + "bash", "-c", command, "_", + str(ROOT / "runtime" / "common.sh"), str(repo), str(base), + ], + check=True, + env={ + **os.environ, + "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null", + "COLLECTIVEX_CANONICAL_GHA": "1", + "CX_STAGE_DIR": str(base), + "RSYNC_CALLED": str(sentinel), + }, + ) + self.assertTrue(sentinel.is_file()) + self.assertEqual(list(base.iterdir()), []) + + def test_interrupted_stage_is_cleanup_capable_before_copy(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + repo = root / "repo" + source = repo / "experimental" / "CollectiveX" + source.mkdir(parents=True) + (source / "public.py").write_text("public\n") + base = root / "stage" + base.mkdir(mode=0o700) + sibling = base / "keep" + sibling.write_text("keep\n") + command = r''' + set -euo pipefail + source "$1" + export REPO_ROOT="$2" CX_BENCH=nccl-ep + MOUNT_SRC="$(cx_stage_path "$REPO_ROOT" "$3")" + cx_install_launcher_fail_safe + rsync() { kill -TERM $$; return 143; } + cx_stage_repo "$REPO_ROOT" "$MOUNT_SRC" + ''' + result = subprocess.run( + [ + "bash", "-c", command, "_", + str(ROOT / "runtime" / "common.sh"), str(repo), str(base), + ], + text=True, + capture_output=True, + env={ + **os.environ, + "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null", + "COLLECTIVEX_CANONICAL_GHA": "1", + "COLLECTIVEX_EXECUTION_ID": "interrupted", + "CX_STAGE_DIR": str(base), + }, + ) + self.assertNotEqual(result.returncode, 0) + self.assertFalse((base / "job_interrupted").exists()) + self.assertEqual(sibling.read_text(), "keep\n") + + def test_stage_base_and_early_cleanup_are_isolated(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary).resolve() + repo = root / "repo" + source = repo / "experimental" / "CollectiveX" + source.mkdir(parents=True) + (source / "public.py").write_text("public\n") + nested = repo / "stage" + nested.mkdir(mode=0o700) + group_writable = root / 
"group-stage" + group_writable.mkdir(mode=0o770) + group_writable.chmod(0o770) + setgid = root / "setgid-stage" + setgid.mkdir(mode=0o750) + setgid.chmod(0o2750) + command = r''' + set -euo pipefail + source "$1" + ! (CX_STAGE_DIR="$3"; cx_stage_path "$2" "$3") + ! (CX_STAGE_DIR="$4"; cx_stage_path "$2" "$4") + export CX_STAGE_DIR="$5" COLLECTIVEX_EXECUTION_ID="setgid-$$" + trap 'cx_cleanup_private_logs 0' EXIT + staged="$(cx_stage_path "$2" "$CX_STAGE_DIR")" + cx_stage_repo "$2" "$staged" + chmod 2700 "$staged" + cx_cleanup_stage "$staged" "$2" + test ! -e "$staged" + ''' + subprocess.run( + [ + "bash", "-c", command, "_", + str(ROOT / "runtime" / "common.sh"), str(repo), str(nested), + str(group_writable), str(setgid), + ], + check=True, + env={**os.environ, "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null"}, + ) + + early = r''' + set -euo pipefail + source "$1" + export REPO_ROOT="$2" CX_STAGE_DIR="$3" CX_BENCH=nccl-ep + export COLLECTIVEX_EXECUTION_ID="pre-marker-$$" + MOUNT_SRC="$(cx_stage_path "$REPO_ROOT" "$CX_STAGE_DIR")" + cx_install_launcher_fail_safe + mkdir -m 700 "$MOUNT_SRC" + exit 17 + ''' + result = subprocess.run( + [ + "bash", "-c", early, "_", + str(ROOT / "runtime" / "common.sh"), str(repo), str(setgid), + ], + text=True, + capture_output=True, + env={**os.environ, "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null"}, + ) + self.assertEqual(result.returncode, 17, result.stderr) + self.assertEqual(list(setgid.iterdir()), []) + + def test_backend_cache_reuses_v3_and_falls_back_once_without_repair(self) -> None: + common = ROOT / "runtime" / "common.sh" + with tempfile.TemporaryDirectory() as temporary: + parent = Path(temporary) / "stage" + parent.mkdir(mode=0o700) + concurrent = Path(temporary) / "concurrent" + concurrent.mkdir(mode=0o700) + command = r''' + set -euo pipefail + source "$1" + for worker in 1 2 3; do + ( + cx_prepare_backend_cache "$2" + printf '%s %s\n' "$CX_BACKEND_CACHE_SENTINEL_SHA256" \ + "$CX_PREPARED_BACKEND_CACHE" > "$3/$worker" + ) & + 
done + wait + cmp "$3/1" "$3/2" + cmp "$3/1" "$3/3" + cx_prepare_backend_cache "$2" + first="$CX_PREPARED_BACKEND_CACHE" + first_digest="$CX_BACKEND_CACHE_SENTINEL_SHA256" + chmod 2700 "$first" + cx_prepare_backend_cache "$2" + second="$CX_PREPARED_BACKEND_CACHE" + test "$first" = "$second" + test "$first_digest" = "$CX_BACKEND_CACHE_SENTINEL_SHA256" + test "$first" = "$(cd "$2" && pwd -P)/.collectivex-backend-cache-v3-$(id -u)" + export CX_BACKEND_CACHE_ROOT="$first" + cx_verify_backend_cache_mount + export CX_BACKEND_CACHE_SENTINEL_SHA256="$(printf '0%.0s' {1..64})" + ! cx_verify_backend_cache_mount + ''' + subprocess.run( + [ + "bash", "-c", command, "_", str(common), str(parent), + str(concurrent), + ], + check=True, + ) + cache = parent / f".collectivex-backend-cache-v3-{os.getuid()}" + self.assertTrue(cache.is_dir()) + self.assertEqual(cache.stat().st_mode & 0o777, 0o700) + self.assertEqual( + list(cache.glob(".collectivex-mount-sentinel-v1.tmp.*")), [] + ) + alias = Path(temporary) / "stage-alias" + alias.symlink_to(parent, target_is_directory=True) + canonical = subprocess.run( + [ + "bash", "-c", + 'source "$1"; cx_prepare_backend_cache "$2"; ' + 'printf "%s\\n%s\\n" "$CX_PREPARED_BACKEND_CACHE" ' + '"$CX_BACKEND_CACHE_SENTINEL_SHA256"', + "_", str(common), str(alias), + ], + text=True, + capture_output=True, + check=True, + ) + cache_path, digest = canonical.stdout.splitlines() + self.assertEqual(cache_path, str(cache.resolve())) + self.assertRegex(digest, r"^[0-9a-f]{64}$") + saved = parent / "saved-cache" + cache.rename(saved) + cache.mkdir(mode=0o700) + replacement = cache / ".collectivex-mount-sentinel-v1" + replacement.write_bytes(b"replacement".ljust(32, b"!")) + replacement.chmod(0o600) + replaced = subprocess.run( + [ + "bash", "-c", + 'source "$1"; export CX_BACKEND_CACHE_ROOT="$2" ' + 'CX_BACKEND_CACHE_SENTINEL_SHA256="$3"; ' + 'cx_verify_backend_cache_mount', + "_", str(common), str(cache), digest, + ] + ) + 
self.assertNotEqual(replaced.returncode, 0) + replacement.unlink() + cache.rmdir() + saved.rename(cache) + (cache / ".collectivex-mount-sentinel-v1").unlink() + cache.rmdir() + target = Path(temporary) / "target" + target.mkdir(mode=0o700) + cache.symlink_to(target, target_is_directory=True) + fallback = subprocess.run( + [ + "bash", "-c", + 'source "$1"; cx_prepare_backend_cache "$2"; ' + 'printf "%s\\n" "$CX_PREPARED_BACKEND_CACHE"', + "_", str(common), str(parent), + ], + text=True, + capture_output=True, + check=True, + ) + v4 = parent / f".collectivex-backend-cache-v4-{os.getuid()}" + self.assertEqual(fallback.stdout.strip(), str(v4.resolve())) + self.assertTrue(cache.is_symlink()) + self.assertTrue(v4.is_dir()) + (v4 / ".collectivex-mount-sentinel-v1").unlink() + v4.rmdir() + v4.symlink_to(target, target_is_directory=True) + result = subprocess.run( + [ + "bash", "-c", 'source "$1"; cx_prepare_backend_cache "$2"', + "_", str(common), str(parent), + ], + text=True, + capture_output=True, + ) + self.assertNotEqual(result.returncode, 0) + self.assertNotIn(str(parent), result.stderr) + self.assertTrue(cache.is_symlink()) + self.assertTrue(v4.is_symlink()) + + source = common.read_text().split("cx_prepare_backend_cache() {", 1)[1] + program = source.split("<<'PY'\n", 1)[1].split("\nPY\n", 1)[0] + with tempfile.TemporaryDirectory() as temporary: + parent = Path(temporary) / "stage" + parent.mkdir(mode=0o700) + fake_os = types.ModuleType("os") + fake_os.__dict__.update(os.__dict__) + fake_os.fsync = mock.Mock(side_effect=OSError("forced fsync failure")) + with ( + mock.patch.dict(sys.modules, {"os": fake_os}), + mock.patch.object(sys, "argv", ["-", str(parent)]), + mock.patch.object(sys, "stdout", io.StringIO()), + self.assertRaises(SystemExit) as failure, + ): + exec(compile(program, "", "exec"), {}) + self.assertEqual(failure.exception.code, 1) + self.assertEqual( + list(parent.rglob(".collectivex-mount-sentinel-v1.tmp.*")), [] + ) + + def 
test_nvidia_namespace_package_roots_come_from_distribution_files(self) -> None: + runtime = ROOT / "runtime" / "run_in_container.sh" + with tempfile.TemporaryDirectory() as temporary: + site = Path(temporary) / "site" + package = site / "nvidia" / "nccl" + (package / "include").mkdir(parents=True) + (package / "lib").mkdir() + (package / "include" / "nccl.h").write_text("header\n") + (package / "lib" / "libnccl.so.2").write_text("library\n") + info = site / "nvidia_nccl_cu13-2.30.4.dist-info" + info.mkdir() + (info / "METADATA").write_text( + "Metadata-Version: 2.1\nName: nvidia-nccl-cu13\nVersion: 2.30.4\n" + ) + (info / "RECORD").write_text( + "nvidia/nccl/include/nccl.h,,\n" + "nvidia/nccl/lib/libnccl.so.2,,\n" + "nvidia_nccl_cu13-2.30.4.dist-info/METADATA,,\n" + "nvidia_nccl_cu13-2.30.4.dist-info/RECORD,,\n" + ) + command = r''' + set -euo pipefail + eval "$(sed -n '/^cx_nvidia_package_root()/,/^}/p' "$1")" + root="$(cx_nvidia_package_root nvidia-nccl-cu13 nccl)" + test "$root" = "$2/nvidia/nccl" + ! 
cx_nvidia_package_root nvidia-nccl-cu13 nvshmem + ''' + subprocess.run( + ["bash", "-c", command, "_", str(runtime), str(site.resolve())], + check=True, + env={**os.environ, "PYTHONPATH": str(site)}, + ) + + def test_cuda_cccl_exports_the_resolved_jit_toolchain_root(self) -> None: + runtime = ROOT / "runtime" / "run_in_container.sh" + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + toolkit = root / "cuda-13.0" + (toolkit / "bin").mkdir(parents=True) + (toolkit / "include").mkdir() + (toolkit / "lib64").mkdir() + cccl = toolkit / "targets" / "x86_64-linux" / "include" / "cccl" + cccl.mkdir(parents=True) + nvcc = toolkit / "bin" / "nvcc" + nvcc.write_text("#!/bin/sh\nexit 0\n") + nvcc.chmod(0o755) + alias = root / "cuda" + alias.symlink_to(toolkit, target_is_directory=True) + command = r''' + set -euo pipefail + eval "$(sed -n '/^cx_prepare_cuda_cccl()/,/^}/p' "$1")" + cx_prepare_cuda_cccl + test "$CUDA_HOME" = "$2" + test "$CX_CUDA_CCCL" = "$2/targets/x86_64-linux/include/cccl" + test "$CPATH" = "$2/targets/x86_64-linux/include/cccl:" + test "$NVCC_PREPEND_FLAGS" = "-I$2/targets/x86_64-linux/include/cccl " + ''' + subprocess.run( + ["bash", "-c", command, "_", str(runtime), str(toolkit.resolve())], + check=True, + env={ + **os.environ, + "PATH": f"{alias / 'bin'}:{os.environ['PATH']}", + "CPATH": "", + "NVCC_PREPEND_FLAGS": "", + }, + ) + + def test_deepep_v2_toolchain_rejects_overlay_lock_failure(self) -> None: + runtime = ROOT / "runtime" / "run_in_container.sh" + with tempfile.TemporaryDirectory() as temporary: + command = r''' + set -euo pipefail + eval "$(sed -n '/^cx_prepare_deepep_toolchain()/,/^}/p' "$1")" + cache_root="$2" + cx_nvidia_package_root() { printf '%s' /unused; } + cx_deepep_v2_root() { printf '%s' "$cache_root"; } + cx_log() { :; } + flock() { return 1; } + ! 
cx_prepare_deepep_toolchain + ''' + subprocess.run( + ["bash", "-c", command, "_", str(runtime), temporary], + check=True, + ) + + def test_pinned_source_fetch_retries_transient_failures(self) -> None: + common = ROOT / "runtime" / "common.sh" + with tempfile.TemporaryDirectory() as temporary: + command = r''' + set -euo pipefail + eval "$(sed -n '/^cx_git()/,/^}/p' "$1")" + eval "$(sed -n '/^cx_git_in_tree()/,/^}/p' "$1")" + eval "$(sed -n '/^cx_fetch_revision()/,/^}/p' "$1")" + attempts=0 + expected_directory="$(cd -P -- "$3" && pwd -P)" + sleep() { :; } + git() { + local argument has_directory=0 has_trust=0 + if [ "$1" = '-c' ] && [ "$3" = init ]; then + mkdir -p "${@: -1}" + return 0 + fi + for argument in "$@"; do + [ "$argument" != '-C' ] || has_directory=1 + [ "$argument" != "safe.directory=$expected_directory" ] || has_trust=1 + [ "$argument" != 'safe.directory=*' ] || return 1 + done + [ "$has_directory" = 0 ] || [ "$has_trust" = 1 ] || return 1 + case " $* " in + *' fetch '*) + attempts=$((attempts + 1)) + [ "$attempts" = 3 ] + ;; + *' rev-parse HEAD '*) printf '%s\n' "$revision" ;; + *) return 0 ;; + esac + } + cx_fetch_revision https://example.invalid/repo "$2" "$3" + test "$attempts" = 3 + ''' + revision = "a" * 40 + subprocess.run( + ["bash", "-c", command, "_", str(common), revision, temporary], + check=True, + ) + + def test_git_tree_trust_is_exact_and_command_scoped(self) -> None: + common = ROOT / "runtime" / "common.sh" + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + repository = root / "repo" + repository.mkdir() + alias = root / "alias" + alias.symlink_to(repository, target_is_directory=True) + wildcard = root / "*" + wildcard.mkdir() + arguments = root / "arguments" + command = r''' + set -euo pipefail + eval "$(sed -n '/^cx_git()/,/^}/p' "$1")" + eval "$(sed -n '/^cx_git_in_tree()/,/^}/p' "$1")" + arguments="$4" + git() { printf '%s\n' "$@" > "$arguments"; } + cx_git_in_tree "$2" status --porcelain + ! 
cx_git_in_tree relative status + ! cx_git_in_tree "$3" status + ! cx_git_in_tree "$5" status + ''' + subprocess.run( + [ + "bash", + "-c", + command, + "_", + str(common), + str(repository), + str(alias), + str(arguments), + str(wildcard), + ], + check=True, + ) + self.assertEqual( + arguments.read_text().splitlines(), + [ + "-c", + "credential.helper=", + "-c", + f"safe.directory={repository.resolve()}", + "-C", + str(repository.resolve()), + "status", + "--porcelain", + ], + ) + self.assertNotIn("safe.directory=*", arguments.read_text()) + + def test_runtime_materializes_the_verified_host_source_without_network(self) -> None: + common = ROOT / "runtime" / "common.sh" + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + seed = root / "seed" + seed.mkdir() + (seed / "pinned").write_text("source\n") + destination = root / "build" + fetched = root / "network-fetch" + command = r''' + set -euo pipefail + source "$1" + export CX_BACKEND_SOURCE_ROOT="$2/source" + SEED="$3" FETCHED="$5" + copy_mode= + cx_backend_source_path() { printf '%s' "$SEED"; } + cx_backend_source_is_valid() { test -f "$2/pinned"; } + cx_fetch_revision() { : > "$FETCHED"; return 1; } + cp() { + test "$1" = -R + copy_mode=recursive + command cp "$@" + } + cx_materialize_backend_source deepep-hybrid "$4" + test -f "$4/pinned" + test "$copy_mode" = recursive + python3 - "$4" <<'PY' +import os +import stat +import sys +assert stat.S_IMODE(os.stat(sys.argv[1]).st_mode) == 0o700 +PY + test ! 
-e "$FETCHED" + ''' + subprocess.run( + [ + "bash", "-c", command, "_", str(common), str(root), + str(seed), str(destination), str(fetched), + ], + check=True, + ) + + def test_backend_source_validation_rejects_status_errors_and_ignored_files(self) -> None: + common = ROOT / "runtime" / "common.sh" + with tempfile.TemporaryDirectory() as temporary: + command = r''' + set -euo pipefail + source "$1" + cx_backend_source_pin() { printf '%s|%s|' revision tree; } + git() { + case " $* " in + *' rev-parse HEAD '*) printf '%s\n' revision ;; + *' rev-parse HEAD^{tree} '*) printf '%s\n' tree ;; + *' status --porcelain '*) [ "$mode" != status-error ] ;; + *' ls-files --others --ignored '*) + [ "$mode" != ignored ] || printf '%s\n' ignored.bin + ;; + *) return 1 ;; + esac + } + mode=status-error + ! cx_backend_source_is_valid backend "$2" + mode=ignored + ! cx_backend_source_is_valid backend "$2" + mode=clean + cx_backend_source_is_valid backend "$2" + ''' + subprocess.run( + ["bash", "-c", command, "_", str(common), temporary], + check=True, + ) + + def test_backend_source_root_normalizes_inherited_special_mode(self) -> None: + common = ROOT / "runtime" / "common.sh" + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + source_root = root / "experimental" / "CollectiveX" / ".cx_sources" + source = source_root / "backend-revision" + source.mkdir(parents=True) + command = r''' + set -euo pipefail + source "$1" + export COLLECTIVEX_EXECUTION_ID="source-mode-$$" + trap 'cx_cleanup_private_logs 0' EXIT + expected_mount="$2" + expected_source="$3" + expected_root="${expected_source%/*}" + observed_mode=2700 + mock_stage_owner=4200 + mock_root_owner=4200 + chmod_calls=0 + chmod() { + test "$1" = 700 && test "$2" = "$expected_root" + chmod_calls=$((chmod_calls + 1)) + [ "$chmod_calls" = 2 ] || return 1 + observed_mode=700 + } + stat() { + case "$2" in + %u) + case "$3" in + "$expected_mount") printf '%s\n' "$mock_stage_owner" ;; + "$expected_root") printf 
'%s\n' "$mock_root_owner" ;; + *) return 1 ;; + esac + ;; + %a) + case "$3" in + "$expected_mount") printf '2700\n' ;; + "$expected_root") printf '%s\n' "$observed_mode" ;; + *) return 1 ;; + esac + ;; + *) return 1 ;; + esac + } + cx_backend_source_path() { printf '%s' "$expected_source"; } + cx_backend_source_is_valid() { + test "$1" = backend && test "$2" = "$expected_source" + } + cx_prepare_backend_source "$2" backend + test "$observed_mode" = 2700 + test "$chmod_calls" = 0 + observed_mode=2750 + ! _cx_prepare_backend_source "$2" backend + test "$chmod_calls" = 1 + _cx_prepare_backend_source "$2" backend + test "$observed_mode" = 700 + mock_root_owner=4300 + ! _cx_prepare_backend_source "$2" backend + ''' + subprocess.run( + ["bash", "-c", command, "_", str(common), str(root), str(source)], + check=True, + ) + + def test_canonical_backend_sources_use_verified_seed_without_network(self) -> None: + common = ROOT / "runtime" / "common.sh" + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + mount = root / "mount" + source_root = mount / "experimental" / "CollectiveX" / ".cx_sources" + seed_root = root / "seed" + seeds = [ + seed_root / f"{backend}-revision" + for backend in ("backend-one", "backend-two") + ] + mount.mkdir(mode=0o700) + source_root.parent.mkdir(parents=True, mode=0o700) + for seed in seeds: + seed.mkdir(parents=True, mode=0o700) + (seed / "pinned").write_text("source\n") + network = root / "network" + command = r''' + set -euo pipefail + source "$1" + export COLLECTIVEX_CANONICAL_GHA=1 + export CX_BACKEND_SOURCE_SEED_ROOT="$4" + export COLLECTIVEX_EXECUTION_ID="source-seed-$$" + trap 'cx_cleanup_private_logs 0' EXIT + NETWORK="$5" + stat() { + case "$2" in + %u) printf '4200\n' ;; + %a) printf '700\n' ;; + *) return 1 ;; + esac + } + cx_backend_source_path() { printf '%s/%s-revision' "$1" "$2"; } + cx_backend_source_is_valid() { test -f "$2/pinned"; } + cx_fetch_revision() { : > "$NETWORK"; return 1; } + 
cx_prepare_backend_source "$2" backend-one + cx_prepare_backend_source "$2" backend-two + test -f "$3/backend-one-revision/pinned" + test -f "$3/backend-two-revision/pinned" + test ! -e "$NETWORK" + rm -rf -- "$3/backend-one-revision" "$3/backend-two-revision" + unset CX_BACKEND_SOURCE_SEED_ROOT + ! _cx_prepare_backend_source "$2" backend-one + test ! -e "$NETWORK" + ''' + subprocess.run( + [ + "bash", "-c", command, "_", str(common), str(mount), + str(source_root), str(seed_root), str(network), + ], + check=True, + ) + + def test_deepep_hybrid_cache_reuse_revalidates_extensions(self) -> None: + common = ROOT / "runtime" / "common.sh" + runtime = ROOT / "runtime" / "run_in_container.sh" + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + (root / "deep_ep_cpp.so").write_bytes(b"deep") + (root / "hybrid_ep_cpp.so").write_bytes(b"hybrid") + command = r''' + set -euo pipefail + chmod 700 "$3" + source "$1" + eval "$(sed -n '/^cx_deepep_hybrid_marker_content_sha256()/,/^}/p' "$2")" + eval "$(sed -n '/^cx_deepep_hybrid_cache_is_valid()/,/^}/p' "$2")" + revision=revision tree=tree + cx_git() { + case " $* " in + *' rev-parse HEAD '*) printf '%s\n' "$revision" ;; + *' rev-parse HEAD^{tree} '*) printf '%s\n' "$tree" ;; + *' status --porcelain '*|*' ls-files --others '*) return 0 ;; + *) return 1 ;; + esac + } + cx_git_in_tree() { shift; cx_git "$@"; } + marker="$3/.collectivex-complete" + digest="$(cx_extension_pair_sha256 "$3" 'deep_ep_cpp*.so' 'hybrid_ep_cpp*.so')" + (umask 077; printf '%s\n%s\n%s\n' "$revision" "$tree" "$digest" > "$marker") + cx_deepep_hybrid_cache_is_valid "$3" "$marker" "$revision" "$tree" + printf changed > "$3/hybrid_ep_cpp.so" + ! cx_deepep_hybrid_cache_is_valid "$3" "$marker" "$revision" "$tree" + printf hybrid > "$3/hybrid_ep_cpp.so" + cp "$3/deep_ep_cpp.so" "$3/deep_ep_cpp-extra.so" + ! 
cx_deepep_hybrid_cache_is_valid "$3" "$marker" "$revision" "$tree" + ''' + subprocess.run( + ["bash", "-c", command, "_", str(common), str(runtime), temporary], + check=True, + ) + + def test_rack_backend_environment_is_shared_per_node_and_required(self) -> None: + runtime = ROOT / "runtime" / "run_in_container.sh" + launcher = (ROOT / "launchers" / "launch_gb-nv.sh").read_text() + assignment = next( + line for line in launcher.splitlines() + if line.startswith("SOURCE_BACKEND_ENV=") + ) + self.assertNotIn("/tmp/.cx_backend_env", launcher) + self.assertIn('[ -f "$env_file" ] && [ -r "$env_file" ]', launcher) + self.assertIn('[ ! -L "$env_file" ]', launcher) + self.assertIn('$(stat -c "%u" "$env_root"):600', launcher) + self.assertIn('case "$(stat -c "%a" "$env_root")" in 700|[1-7]700)', launcher) + self.assertIn("node-${SLURM_NODEID}.sh", launcher) + self.assertIn("HybridEPBuffer", launcher) + self.assertIn('. "$env_file" || exit 66', launcher) + with tempfile.TemporaryDirectory() as temporary: + consumer = r''' + eval "$1" + env_root="$2/env" + SOURCE_BACKEND_ENV="${SOURCE_BACKEND_ENV//\/ix\/experimental\/CollectiveX\/.cx_backend\/env/$env_root}" + mkdir -p "$env_root" + env_file="$env_root/node-1.sh" + printf 'printf sourced > "$CX_SENTINEL"\n' > "$env_file" + chmod 600 "$env_file" + export CX_SENTINEL="$2/sentinel" + stat() { + [ "${STAT_FAIL:-0}" = 0 ] || return 1 + case "$2" in + %a) printf '%s\n' "$ROOT_MODE" ;; + %u) printf '1000\n' ;; + %u:%a) printf '%s\n' "$FILE_OWNER_MODE" ;; + *) return 2 ;; + esac + } + run_case() { + rm -f "$CX_SENTINEL" + ROOT_MODE="$1" FILE_OWNER_MODE="$2" STAT_FAIL="$3" SLURM_NODEID="$4" + ( eval "$SOURCE_BACKEND_ENV" ) + rc=$? + [ "$rc" = "$5" ] || return 1 + if [ "$5" = 0 ]; then + [ -f "$CX_SENTINEL" ] + else + [ ! 
-e "$CX_SENTINEL" ] + fi + } + run_case 700 1000:600 0 1 0 + run_case 2700 1000:600 0 1 0 + run_case 755 1000:600 0 1 66 + run_case 700 1000:600 1 1 66 + run_case 700 2000:600 0 1 66 + mv "$env_file" "$env_file.real" + ln -s "$env_file.real" "$env_file" + run_case 700 1000:600 0 1 66 + rm "$env_file" + mv "$env_file.real" "$env_file" + run_case 700 1000:600 0 invalid 66 + ''' + subprocess.run( + ["bash", "-c", consumer, "_", assignment, temporary], + check=True, + ) + command = r''' + set -euo pipefail + cd "$2" + eval "$(sed -n '/^cx_persist_backend_env()/,/^}/p' "$1")" + export SLURM_NODEID=1 PYTHONPATH=/ix/pinned DEEPEP_COMMIT=abc + cx_persist_backend_env + env_file="$PWD/.cx_backend/env/node-1.sh" + test -f "$env_file" + test "$(stat -f %Lp "$env_file" 2>/dev/null || stat -c %a "$env_file")" = 600 + unset PYTHONPATH DEEPEP_COMMIT + . "$env_file" + test "$PYTHONPATH" = /ix/pinned + test "$DEEPEP_COMMIT" = abc + SLURM_NODEID=invalid && ! cx_persist_backend_env + ''' + subprocess.run( + ["bash", "-c", command, "_", str(runtime), temporary], + check=True, + ) + + def test_stage_cleanup_failure_fails_job_but_marks_allocation_safe(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + (root / "repo").mkdir() + (root / "stage").mkdir() + command = r''' + source "$1" + cx_write_cleanup_guard() { + rm -f -- "$CX_JOB_ROOT/cleanup-safe" "$CX_JOB_ROOT/cleanup-unsafe" + : > "$CX_JOB_ROOT/cleanup-$1" + } + cx_cleanup_stage() { return 1; } + cx_cleanup_private_logs() { : > "$CX_JOB_ROOT/logs-deleted"; } + export CX_JOB_ROOT="$2" REPO_ROOT="$2/repo" MOUNT_SRC="$2/stage" + export COLLECTIVEX_CANONICAL_GHA=1 CX_ALLOCATION_UNCERTAIN=0 + unset CX_BENCH JOB_ID + cx_launcher_cleanup 0 + ''' + result = subprocess.run( + ["bash", "-c", command, "_", str(ROOT / "runtime" / "common.sh"), + str(root)], + text=True, + capture_output=True, + env={**os.environ, "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null"}, + ) + self.assertEqual(result.returncode, 1, 
result.stderr) + self.assertTrue((root / "cleanup-safe").is_file()) + self.assertFalse((root / "cleanup-unsafe").exists()) + self.assertFalse((root / "logs-deleted").exists()) + + def test_generated_stage_cleanup_never_removes_configured_base(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + base = root / "stage" + repo = root / "repo" + generated = base / "job_execution" + generated.mkdir(parents=True, mode=0o700) + repo.mkdir() + marker = generated / ".collectivex-stage-v1" + marker.write_text("collectivex-stage-v1\nexecution\n") + marker.chmod(0o600) + (generated / "payload").write_text("temporary") + subprocess.run( + [ + "bash", "-c", + 'source "$1"; cx_cleanup_stage "$2" "$3"; ' + '! cx_cleanup_stage "$4" "$3"', + "_", str(ROOT / "runtime" / "common.sh"), str(generated), + str(repo), str(base), + ], + check=True, + env={ + **os.environ, + "COLLECTIVEX_OPERATOR_CONFIG": "/dev/null", + "COLLECTIVEX_EXECUTION_ID": "execution", + "CX_STAGE_DIR": str(base), + }, + ) + self.assertFalse(generated.exists()) + self.assertTrue(base.is_dir()) + self.assertTrue(repo.is_dir()) + + def test_adapters_do_not_retain_dead_expected_methods(self) -> None: + for path in HERE.glob("ep_*.py"): + tree = ast.parse(path.read_text(), str(path)) + methods = { + node.name for node in ast.walk(tree) + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)) + } + self.assertNotIn("expected", methods, path.name) + + def test_artifact_safety_rejects_sensitive_material(self) -> None: + private_address = ".".join(str(octet) for octet in (10, 0, 0, 1)) + secret = "github_pat_" + "A" * 24 + sensitive = { + "ipv4": ({"note": private_address}, private_address), + "ipv6": ({"note": "[2001:db8::1]:29500"}, "2001:db8::1"), + "user-at-host": ({"note": "ssh admin@private-host"}, "admin@private-host"), + "hostname": ({"note": "host=compute-17"}, "compute-17"), + "private-dns": ({"note": "worker-7.cluster.local"}, "worker-7.cluster.local"), + 
"suffixed-host": ({"worker_hostname": "relative"}, "worker_hostname"), + "suffixed-address": ({"control_address": "relative"}, "control_address"), + "suffixed-path": ({"scheduler_path": "relative"}, "scheduler_path"), + "exact-address": ({"address": "relative"}, "address"), + "exact-ip": ({"ip": "relative"}, "ip"), + "camel-host": ({"workerHost": "relative"}, "workerHost"), + "camel-path": ({"schedulerPath": "relative"}, "schedulerPath"), + "acronym-gpu-uuid": ({"gpuUUID": "relative"}, "gpuUUID"), + "acronym-device-uuid": ({"deviceUUID": "relative"}, "deviceUUID"), + "acronym-pci-bus": ({"pciBusID": "relative"}, "pciBusID"), + "mac-address": ({"note": "00:11:22:33:44:55"}, "00:11:22:33:44:55"), + "ib-guid": ({"note": "00:11:22:33:44:55:66:77"}, "00:11:22:33:44:55:66:77"), + "dgx-host": ({"note": "dgx-b300-001"}, "dgx-b300-001"), + "cloud-host": ({"note": "ip-10-20-30-40"}, "ip-10-20-30-40"), + "credential-field": ({"service_token": "short"}, "service_token"), + "prefixed-token": ({"note": secret}, secret), + "hf-token": ({"note": "hf_" + "A" * 24}, "hf_" + "A" * 24), + "payment-token": ({"note": "sk_live_" + "A" * 24}, "sk_live_" + "A" * 24), + "generic-secret": ({"note": "password=not-a-real-secret"}, "not-a-real-secret"), + } + for root in ("data", "it-share", "lustre", "raid", "nvme_home", "scratch", "gpfs", "fsx"): + value = f"/{root}/collectivex/run" + sensitive[f"private-root-{root}"] = ({"note": value}, value) + for name, (document, offending) in sensitive.items(): + with self.subTest(name=name), self.assertRaises( + artifact_safety.ArtifactSafetyError + ) as caught: + artifact_safety.assert_publication_safe([document]) + self.assertNotIn(offending, str(caught.exception)) + + artifact_safety.assert_publication_safe([{ + "runner": "b300", + "redaction": "sanitized-v1", + "path": "datasets/" + "a" * 64 + "/dataset.json", + "timing": "8:64:32", + "image_digest": "sha256:" + "b" * 64, + "source": "github.com", + }]) + for ref in ("release@candidate", 
"worker1-feature", "sk-refactor-long-component-name"): + artifact_safety.assert_publication_safe([{"ref": ref}]) + + def test_artifact_safety_cli_does_not_echo_sensitive_values(self) -> None: + private_value = ".".join(str(octet) for octet in (10, 24, 68, 12)) + with tempfile.TemporaryDirectory() as temporary: + path = Path(temporary) / "artifact.json" + path.write_text(json.dumps({"note": private_value})) + result = subprocess.run( + [sys.executable, str(ROOT / "artifact_safety.py"), str(path)], + text=True, + capture_output=True, + ) + self.assertNotEqual(result.returncode, 0) + self.assertIn("forbidden ipv4-address value", result.stderr) + self.assertNotIn(private_value, result.stderr) + + def test_artifact_safety_rejects_linked_and_special_inputs(self) -> None: + with tempfile.TemporaryDirectory() as temporary: + root = Path(temporary) + source = root / "source.json" + source.write_text("{}") + linked = root / "linked.json" + linked.symlink_to(source) + fifo = root / "fifo.json" + os.mkfifo(fifo) + for path in (linked, fifo): + with self.subTest(path=path.name), self.assertRaises( + artifact_safety.ArtifactSafetyError + ): + artifact_safety.load_documents([str(path)]) + + +if __name__ == "__main__": + unittest.main() diff --git a/experimental/CollectiveX/tests/test_schema_v1_contract.py b/experimental/CollectiveX/tests/test_schema_v1_contract.py new file mode 100644 index 000000000..a517cd5d5 --- /dev/null +++ b/experimental/CollectiveX/tests/test_schema_v1_contract.py @@ -0,0 +1,436 @@ +#!/usr/bin/env python3 +"""Focused structural tests for the fail-closed CollectiveX V1 schemas.""" +from __future__ import annotations + +import copy +import json +import sys +import unittest +from pathlib import Path + +import jsonschema + +ROOT = Path(__file__).resolve().parents[1] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +import identity # noqa: E402 + + +def _load(name: str) -> dict: + return json.loads((ROOT / "schemas" / name).read_text()) + + +def 
_definition_validator(schema: dict, name: str) -> jsonschema.Validator: + return jsonschema.Draft202012Validator( + { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$defs": schema["$defs"], + "$ref": f"#/$defs/{name}", + } + ) + + +class CollectiveXV1SchemaContractTest(unittest.TestCase): + @classmethod + def setUpClass(cls) -> None: + cls.raw = _load("raw-case-v1.schema.json") + cls.samples = _load("samples-v1.schema.json") + cls.public = _load("public-dataset-v1.schema.json") + cls.bundle = _load("private-bundle-v1.schema.json") + cls.terminal = _load("terminal-outcome-v1.schema.json") + + def test_all_checked_in_schemas_are_draft_2020_12_valid(self) -> None: + for path in sorted((ROOT / "schemas").glob("*.schema.json")): + with self.subTest(path=path.name): + schema = json.loads(path.read_text()) + jsonschema.Draft202012Validator.check_schema(schema) + self.assertFalse(schema["additionalProperties"]) + + def test_precision_catalog_and_axes_are_exact_and_strict(self) -> None: + expected = set(identity.V1_PRECISION_PROFILES) + self.assertEqual(set(self.raw["$defs"]["precision_profile"]["enum"]), expected) + self.assertEqual(set(self.public["$defs"]["precisionProfile"]["enum"]), expected) + self.assertEqual(set(self.terminal["$defs"]["precisionProfile"]["enum"]), expected) + + axis_keys = { + "alignment_contract", + "api_input_dtype", + "api_output_dtype", + "communication_format", + "conversion_boundary", + "padding_contract", + "quantization_origin", + "scale_dtype", + "scale_group_size", + "scale_layout", + } + for schema, name in ( + (self.raw, "communication_axis"), + (self.public, "communicationAxis"), + (self.terminal, "communicationAxis"), + ): + with self.subTest(schema=schema["$id"]): + axis = schema["$defs"][name] + self.assertFalse(axis["additionalProperties"]) + self.assertEqual(set(axis["required"]), axis_keys) + self.assertEqual(set(axis["properties"]), axis_keys) + + axis_validator = _definition_validator(self.raw, 
"communication_axis") + profile_validator = _definition_validator(self.raw, "communication_precision") + for name in sorted(expected): + profile = identity.precision_profile(name) + with self.subTest(profile=name): + axis_validator.validate(profile["dispatch"]) + axis_validator.validate(profile["combine"]) + profile_validator.validate(profile) + raw_case_profile = _definition_validator(self.raw, "case_profile") + terminal_case_profile = _definition_validator(self.terminal, "caseProfile") + for case in ( + {"mode": "normal"}, + { + "mode": "normal", + "precision_profile": "d-fp8-e4m3fn-b128-f32-prequantized.c-bf16", + }, + { + "mode": "low-latency", + "precision_profile": "d-fp8-e4m3fn-b128-f32-fused.c-bf16", + }, + ): + resolved = identity.profile_for_case(case) + raw_case_profile.validate(resolved) + terminal_case_profile.validate(resolved) + + shape = self.raw["properties"]["case"]["properties"]["shape"] + self.assertIn("precision_profile", shape["required"]) + self.assertIn("dispatch_precision", shape["required"]) + self.assertIn("combine_precision", shape["required"]) + self.assertNotIn("dispatch_dtype", shape["properties"]) + self.assertNotIn("quant", shape["properties"]) + + workload = self.public["$defs"]["series"]["properties"]["workload"] + self.assertIn("precision_profile", workload["required"]) + self.assertIn("dispatch_precision", workload["required"]) + self.assertIn("combine_precision", workload["required"]) + self.assertNotIn("dispatch_dtype", workload["properties"]) + self.assertNotIn("combine_dtype", workload["properties"]) + + profile = self.raw["$defs"]["case_profile"] + self.assertEqual( + profile["properties"]["activation_generator"]["const"], + identity.V1_NORMAL_CASE_PROFILE["activation_generator"], + ) + self.assertEqual( + profile["properties"]["activation_profile"]["const"], + identity.V1_NORMAL_CASE_PROFILE["activation_profile"], + ) + self.assertEqual( + profile["properties"]["source_identity_contract"]["const"], + 
identity.V1_NORMAL_CASE_PROFILE["source_identity_contract"], + ) + + def test_qualification_index_is_bound_across_private_and_public_records(self) -> None: + paths = ( + self.raw["properties"]["measurement"]["properties"]["qualification_index"], + self.raw["properties"]["identity"]["properties"]["allocation_factors"]["properties"]["qualification_index"], + self.raw["$defs"]["git_run"]["properties"]["qualification_index"], + self.samples["properties"]["qualification_index"], + self.bundle["properties"]["run"]["properties"]["qualification_index"], + self.terminal["$defs"]["allocationFactors"]["properties"]["qualification_index"], + self.terminal["$defs"]["gitRun"]["properties"]["qualification_index"], + self.public["$defs"]["attempt"]["properties"]["qualification_index"], + ) + for value in paths: + self.assertEqual((value["minimum"], value["maximum"]), (1, 3)) + promotion_indices = self.public["properties"]["promotion"]["properties"]["qualification_indices"] + series_indices = self.public["$defs"]["series"]["properties"]["measurement"]["properties"]["qualification_indices"] + for schema, valid_values in ( + (promotion_indices, ([], [1], [1, 2, 3])), + (series_indices, ([1], [2, 3], [1, 2, 3])), + ): + validator = jsonschema.Draft202012Validator(schema) + for value in valid_values: + validator.validate(value) + for value in ([0], [4], [1, 1], [1, 2, 3, 1]): + with self.assertRaises(jsonschema.ValidationError): + validator.validate(value) + measurement = self.raw["properties"]["measurement"] + self.assertIn("execution_order_sha256", measurement["required"]) + self.assertEqual( + measurement["properties"]["execution_order_sha256"]["pattern"], + "^[0-9a-f]{64}$", + ) + + def test_private_allocation_stratum_is_required_only_in_raw_canonical_evidence(self) -> None: + provenance = self.raw["properties"]["provenance"] + self.assertIn("allocation_stratum_sha256", provenance["required"]) + stratum = provenance["properties"]["allocation_stratum_sha256"] + 
jsonschema.Draft202012Validator(stratum).validate(None) + jsonschema.Draft202012Validator(stratum).validate("a" * 64) + with self.assertRaises(jsonschema.ValidationError): + jsonschema.Draft202012Validator(stratum).validate("A" * 64) + conditional = self.raw["allOf"][0] + self.assertEqual( + conditional["if"]["properties"]["workload"]["properties"]["source"], + {"const": "canonical-serialized"}, + ) + canonical_stratum = conditional["then"]["properties"]["provenance"][ + "properties" + ]["allocation_stratum_sha256"] + with self.assertRaises(jsonschema.ValidationError): + jsonschema.Draft202012Validator(canonical_stratum).validate(None) + self.assertNotIn("allocation_stratum_sha256", json.dumps(self.public)) + + def test_stage_samples_are_absent_or_exactly_512(self) -> None: + validator = _definition_validator(self.samples, "component") + measured = { + "availability": "measured", + "sample_count": 512, + "trials": [[1.0] * 8 for _ in range(64)], + } + validator.validate(measured) + unavailable = { + "availability": "unavailable", + "sample_count": 0, + "trials": None, + } + validator.validate(unavailable) + for mutate in ( + lambda value: value.update(sample_count=511), + lambda value: value["trials"].pop(), + lambda value: value["trials"][0].pop(), + ): + broken = copy.deepcopy(measured) + mutate(broken) + with self.assertRaises(jsonschema.ValidationError): + validator.validate(broken) + + raw_components = self.raw["properties"]["measurement"]["properties"]["rows"]["items"]["properties"]["components"] + public_components = self.public["$defs"]["point"]["properties"]["components"] + self.assertIn("stage", raw_components["required"]) + self.assertIn("stage", public_components["required"]) + + def test_byte_provenance_supports_precision_aware_rates(self) -> None: + expected = { + "accounting_contract", + "activation_data_bytes", + "scale_bytes", + "total_logical_bytes", + } + for schema, name in ( + (self.raw, "byte_accounting"), + (self.public, "byteAccounting"), + 
): + definition = schema["$defs"][name] + self.assertFalse(definition["additionalProperties"]) + self.assertEqual(set(definition["required"]), expected) + self.assertEqual(set(definition["properties"]), expected) + component = self.public["$defs"]["component"] + self.assertIn("byte_provenance", component["required"]) + self.assertIn("activation_data_rate_gbps_at_latency_percentile", component["required"]) + self.assertIn("total_logical_data_rate_gbps_at_latency_percentile", component["required"]) + + def test_raw_correctness_carries_directional_precision_evidence(self) -> None: + correctness = self.raw["properties"]["measurement"]["properties"]["rows"]["items"]["properties"]["correctness"] + self.assertIn("precision", correctness["required"]) + evidence = self.raw["$defs"]["precision_evidence"] + self.assertFalse(evidence["additionalProperties"]) + self.assertEqual( + set(evidence["required"]), + {"profile_id", "dispatch", "combine", "passed"}, + ) + axis = self.raw["$defs"]["precision_axis_evidence"] + self.assertFalse(axis["additionalProperties"]) + self.assertEqual( + set(axis["required"]), + { + "encoded_payload_valid", + "scales_finite", + "scales_positive", + "dequantized_semantics", + "saturation_count", + "saturation_rate", + "max_abs_error", + "max_rel_error", + "passed", + }, + ) + + def test_eplb_calibration_provenance_is_explicit(self) -> None: + fields = { + "calibration_workload_id", + "calibration_trace_sha256", + "calibration_window", + "calibration_token_offset", + } + raw = self.raw["properties"]["case"]["properties"]["eplb"] + public = self.public["$defs"]["series"]["properties"]["eplb"] + for descriptor in (raw, public): + self.assertTrue(fields <= set(descriptor["required"])) + self.assertTrue(fields <= set(descriptor["properties"])) + self.assertFalse(descriptor["additionalProperties"]) + + def test_public_coverage_is_a_full_case_and_point_inventory(self) -> None: + coverage = self.public["$defs"]["coverage"] + dimensions = { + "sku", + 
"suite", + "workload", + "publication_tier", + "backend", + "backend_generation", + "resource", + "topology", + "phase", + "mode", + "routing", + "eplb", + "precision_profile", + "dispatch_precision", + "combine_precision", + "points", + } + self.assertTrue(dimensions <= set(coverage["required"])) + self.assertFalse(coverage["additionalProperties"]) + point = self.public["$defs"]["coveragePoint"] + self.assertEqual( + set(point["properties"]["terminal_status"]["enum"]), + {"measured", "unsupported", "failed", "invalid", "diagnostic"}, + ) + self.assertIn("tokens_per_rank", point["required"]) + self.assertIn("global_tokens", point["required"]) + self.assertIn("reason", point["required"]) + + promotion = self.public["properties"]["promotion"] + counts = { + "requested_cases", + "terminal_cases", + "measured_cases", + "unsupported_cases", + "requested_points", + "terminal_points", + "measured_points", + "unsupported_points", + } + self.assertTrue(counts <= set(promotion["required"])) + self.assertFalse(promotion["additionalProperties"]) + + def test_public_coverage_point_reason_tracks_terminal_status(self) -> None: + validator = _definition_validator(self.public, "coveragePoint") + point_id = "cxpoint-v1-" + "1" * 64 + series_id = "cxseries-v1-" + "2" * 64 + measured = { + "point_id": point_id, + "series_id": series_id, + "tokens_per_rank": 8, + "global_tokens": 64, + "terminal_status": "measured", + "reason": None, + } + validator.validate(measured) + unsupported = { + **measured, + "point_id": None, + "series_id": None, + "terminal_status": "unsupported", + "reason": "backend-platform-unsupported", + } + validator.validate(unsupported) + failed = {**unsupported, "terminal_status": "failed", "reason": "execution-failed"} + validator.validate(failed) + + for broken in ( + {**measured, "reason": "unexpected-reason"}, + {**unsupported, "reason": None}, + {**failed, "reason": None}, + {**unsupported, "reason": "contains spaces"}, + ): + with 
self.assertRaises(jsonschema.ValidationError): + validator.validate(broken) + + def test_public_measured_point_has_bounded_detail_and_three_run_stability(self) -> None: + point = self.public["$defs"]["point"] + self.assertFalse(point["additionalProperties"]) + self.assertTrue({"anomalies", "correctness", "stability"} <= set(point["required"])) + self.assertNotIn("correct", point["properties"]) + + axis = { + "encoded_payload_valid": True, + "scales_finite": None, + "scales_positive": None, + "dequantized_semantics": True, + "saturation_count": 0, + "saturation_rate": 0.0, + "max_abs_error": 0.0, + "max_rel_error": 0.0, + "passed": True, + } + correctness = { + "semantic_pass": True, + "precision": { + "profile_id": identity.V1_CONTROL_PRECISION_PROFILE, + "dispatch": axis, + "combine": copy.deepcopy(axis), + "passed": True, + }, + } + correctness_validator = _definition_validator(self.public, "pointCorrectness") + correctness_validator.validate(correctness) + broken_correctness = copy.deepcopy(correctness) + broken_correctness["precision"]["dispatch"]["unexpected"] = True + with self.assertRaises(jsonschema.ValidationError): + correctness_validator.validate(broken_correctness) + + stability_validator = _definition_validator(self.public, "pointStability") + stability_validator.validate({ + "complete": True, + "qualification_indices": [1, 2, 3], + "p50_max_min_ratio": 1.02, + "p99_max_min_ratio": 1.04, + "stable_p50": True, + "stable_p99": True, + }) + stability_validator.validate({ + "complete": False, + "qualification_indices": [2], + "p50_max_min_ratio": None, + "p99_max_min_ratio": None, + "stable_p50": False, + "stable_p99": False, + }) + for broken in ( + { + "complete": True, + "qualification_indices": [1, 2], + "p50_max_min_ratio": 1.0, + "p99_max_min_ratio": 1.0, + "stable_p50": True, + "stable_p99": True, + }, + { + "complete": False, + "qualification_indices": [1], + "p50_max_min_ratio": 1.0, + "p99_max_min_ratio": None, + "stable_p50": False, + 
"stable_p99": False, + }, + ): + with self.assertRaises(jsonschema.ValidationError): + stability_validator.validate(broken) + + anomalies = point["properties"]["anomalies"] + self.assertEqual(anomalies["maxItems"], 16) + self.assertTrue(anomalies["uniqueItems"]) + anomaly_validator = jsonschema.Draft202012Validator({ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$defs": self.public["$defs"], + "$ref": "#/$defs/point/properties/anomalies", + }) + anomaly_validator.validate(["roundtrip-gt-isolated-sum"]) + with self.assertRaises(jsonschema.ValidationError): + anomaly_validator.validate([f"anomaly-{index}" for index in range(17)]) + + +if __name__ == "__main__": + unittest.main() diff --git a/experimental/CollectiveX/tests/workload.py b/experimental/CollectiveX/tests/workload.py new file mode 100644 index 000000000..74860be80 --- /dev/null +++ b/experimental/CollectiveX/tests/workload.py @@ -0,0 +1,416 @@ +#!/usr/bin/env python3 +"""Canonical, byte-stable CollectiveX routing workloads. + +A *canonical workload* is a routing trace generated ONCE, serialized to a platform-independent +file, and referenced by an immutable `workload_id`. Every promoted benchmark point consumes the +SAME serialized bytes, so "did NVIDIA and AMD run the identical workload?" is answered by a +checksum match, not by trusting that two machines re-ran the same seeded generator. + +Layout on disk (one workload = two files, basename = workload_id): + /.npz topk_idx [gt,topk] int32, topk_weights [gt,topk] float32 + /.manifest.json dims, routing profile, generator version, seed, SHA-256s + +Routing and gate weights come from a stdlib integer counter, not a framework RNG. The same +parameters therefore produce the same int32/float32 bytes across PyTorch and accelerator images. 
+""" +from __future__ import annotations + +from array import array +import bisect +import hashlib +import json +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import identity # noqa: E402 + +WORKLOAD_SCHEMA_VERSION = 1 +# Bump when the counter or byte encoding changes. The workload ID binds parameters and trace bytes. +GENERATOR_VERSION = "collectivex-routing-counter-v3" +GATE_WEIGHT_FORMAT = "counter-u16-normalized-f32" +ACTIVATION_GENERATOR = "collectivex-activation-counter-v4" +EPLB_CALIBRATION_WINDOW = "collectivex-eplb-calibration-window-v1" +EPLB_CALIBRATION_TOKEN_OFFSET = 1 << 32 +_MASK64 = (1 << 64) - 1 + + +def _sha256(b: bytes) -> str: + return hashlib.sha256(b).hexdigest() + + +def _mix64(value: int) -> int: + value = (value + 0x9E3779B97F4A7C15) & _MASK64 + value = ((value ^ (value >> 30)) * 0xBF58476D1CE4E5B9) & _MASK64 + value = ((value ^ (value >> 27)) * 0x94D049BB133111EB) & _MASK64 + return value ^ (value >> 31) + + +def _counter(seed: int, token: int, slot: int, attempt: int, stream: int) -> int: + value = ( + (seed & _MASK64) + ^ (((token + 1) * 0xD2B74407B1CE6E93) & _MASK64) + ^ (((slot + 1) * 0xCA5A826395121157) & _MASK64) + ^ (((attempt + 1) * 0x9E3779B185EBCA87) & _MASK64) + ^ (((stream + 1) * 0xA24BAED4963EE407) & _MASK64) + ) + return _mix64(value) + + +def canonical_routing_rows( + global_tokens: int, + experts: int, + topk: int, + routing: str, + seed: int, + *, + token_offset: int = 0, +) -> tuple[list[list[int]], list[list[float]]]: + """Generate a deterministic routing window from exact integer counters.""" + if routing not in {"uniform", "zipf"}: + raise ValueError(f"unknown routing {routing!r} (uniform|zipf)") + if global_tokens <= 0 or experts <= 0 or topk <= 0 or topk > experts: + raise ValueError("global_tokens/experts/topk must be positive and topk <= experts") + if type(token_offset) is not int or token_offset < 0: + raise ValueError("token_offset must be a non-negative 
integer") + + cumulative: list[int] | None = None + if routing == "zipf": + total = 0 + cumulative = [] + for expert in range(experts): + total += (1 << 32) // (expert + 1) + cumulative.append(total) + + indices: list[list[int]] = [] + weights: list[list[float]] = [] + for local_token in range(global_tokens): + token = token_offset + local_token + selected: list[int] = [] + used: set[int] = set() + for slot in range(topk): + attempt = 0 + while True: + value = _counter(seed, token, slot, attempt, 0) + expert = ( + value % experts + if cumulative is None + else bisect.bisect_right(cumulative, value % cumulative[-1]) + ) + if expert not in used: + used.add(expert) + selected.append(expert) + break + attempt += 1 + if attempt > experts * 16: + raise RuntimeError("counter routing could not select distinct experts") + raw = [1 + _counter(seed, token, slot, 0, 1) % 65535 for slot in range(topk)] + denominator = float(sum(raw)) + indices.append(selected) + weights.append([value / denominator for value in raw]) + return indices, weights + + +def _canonical_bytes( + indices: list[list[int]], weights: list[list[float]] +) -> tuple[bytes, bytes]: + idx = array("i", (value for row in indices for value in row)) + gate = array("f", (value for row in weights for value in row)) + if idx.itemsize != 4 or gate.itemsize != 4: + raise RuntimeError("canonical workload requires 32-bit int and float arrays") + if sys.byteorder != "little": + idx.byteswap() + gate.byteswap() + return idx.tobytes(), gate.tobytes() + + +def trace_checksums( + indices: list[list[int]], weights: list[list[float]] +) -> dict[str, str]: + """Return the manifest hashes for exact logical or remapped routing rows.""" + idx_bytes, weight_bytes = _canonical_bytes(indices, weights) + return { + "topk_idx": _sha256(idx_bytes), + "topk_weights": _sha256(weight_bytes), + "trace": _sha256(idx_bytes + weight_bytes), + } + + +def canonical_member( + routing: str, + hidden: int, + topk: int, + experts: int, + ep_size: int, 
def canonical_eplb_calibration_member(
    routing: str,
    hidden: int,
    topk: int,
    experts: int,
    ep_size: int,
    tokens_per_rank: int,
    seed: int,
) -> tuple[str, dict[str, str], list[list[int]], list[list[float]]]:
    """Build the EPLB calibration member for a workload's parameters.

    Delegates to ``canonical_member`` with the same parameters, but starts the
    token counter at ``EPLB_CALIBRATION_TOKEN_OFFSET`` instead of 0, so the
    calibration rows come from a different token window than the default one.

    Returns:
        The ``canonical_member`` result unchanged:
        ``(workload_id, checksums, indices, weights)``.
    """
    calibration_params = dict(
        routing=routing,
        hidden=hidden,
        topk=topk,
        experts=experts,
        ep_size=ep_size,
        tokens_per_rank=tokens_per_rank,
        seed=seed,
    )
    return canonical_member(
        **calibration_params,
        token_offset=EPLB_CALIBRATION_TOKEN_OFFSET,
    )
"hidden": hidden, "topk": topk, + "experts": experts, "ep_size": ep_size, "global_tokens": global_tokens, + "seed": seed, "trace_sha256": trace_checksum, + "activation_generator": ACTIVATION_GENERATOR, + "activation_identity": compute_activation_identity(seed, hidden), + } + if token_offset: + key.update({ + "routing_window": EPLB_CALIBRATION_WINDOW, + "token_offset": token_offset, + }) + return identity.workload_id(key) + + +def compute_activation_identity(seed, hidden, generator=ACTIVATION_GENERATOR) -> str: + """Identity of the exact counter-derived activation generator.""" + key = f"counter|seed={seed}|hidden={hidden}|gen={generator}" + return _sha256(key.encode()) + + +def build_manifest(routing, hidden, topk, experts, global_tokens, seed, experts_per_rank, + idx_np, weights_np): + """Assemble the manifest dict from the (numpy) trace arrays. Pure numpy/stdlib.""" + if experts % experts_per_rank: + raise ValueError("experts must be divisible by experts_per_rank") + idx_bytes = idx_np.astype(" str: + import numpy as np + os.makedirs(out_dir, exist_ok=True) + wid = manifest["workload_id"] + np.savez_compressed(os.path.join(out_dir, f"{wid}.npz"), + topk_idx=idx_np.astype(np.int32), topk_weights=weights_np.astype(np.float32)) + with open(os.path.join(out_dir, f"{wid}.manifest.json"), "w") as fh: + json.dump(manifest, fh, indent=2, sort_keys=True) + return wid + + +def load_workload(npz_path, verify=True): + """Load a canonical trace (numpy + stdlib only). Returns (idx_np, weights_np, manifest). 
+ Raises ValueError if verify=True and the on-disk bytes don't match the manifest checksums.""" + import numpy as np + base = npz_path[:-4] if npz_path.endswith(".npz") else npz_path + with open(base + ".manifest.json") as fh: + manifest = json.load(fh) + if manifest.get("workload_id") != os.path.basename(base): + raise ValueError(f"workload manifest ID does not match filename for {base}") + with np.load(base + ".npz", allow_pickle=False) as archive: + if set(archive.files) != {"topk_idx", "topk_weights"}: + raise ValueError(f"workload archive fields differ for {base}") + idx_np = np.ascontiguousarray(archive["topk_idx"]) + w_np = np.ascontiguousarray(archive["topk_weights"]) + if verify: + ok, reason = verify_workload(manifest, idx_np, w_np) + if not ok: + raise ValueError(f"workload checksum mismatch for {base}: {reason}") + return idx_np, w_np, manifest + + +def verify_workload(manifest, idx_np, weights_np): + """Recompute checksums and compare to the manifest. Returns (ok, reason).""" + import numpy as np + expected_fields = { + "schema_version", "workload_id", "generator_version", "gate_weight_format", "dims", + "routing_profile", "seed", "checksums", "activation_profile", "activation_generator", + "activation_identity", + } + if not isinstance(manifest, dict) or set(manifest) != expected_fields: + return False, "manifest fields differ from the v1 contract" + if (manifest["schema_version"] != WORKLOAD_SCHEMA_VERSION + or manifest["generator_version"] != GENERATOR_VERSION + or manifest["gate_weight_format"] != GATE_WEIGHT_FORMAT + or manifest["routing_profile"] not in {"uniform", "zipf"}): + return False, "manifest version or generator is unsupported" + if (isinstance(manifest["seed"], bool) or not isinstance(manifest["seed"], int) + or not identity.is_typed_id(manifest["workload_id"], "workload")): + return False, "manifest seed or workload ID is invalid" + dims = manifest["dims"] + dim_fields = {"hidden", "topk", "experts", "ep_size", "tokens_per_rank", + 
"global_tokens", "experts_per_rank"} + if not isinstance(dims, dict) or set(dims) != dim_fields: + return False, "manifest dimensions are invalid" + if any(isinstance(dims[key], bool) or not isinstance(dims[key], int) or dims[key] <= 0 + for key in dim_fields): + return False, "manifest dimensions must be positive integers" + if (dims["experts"] != dims["ep_size"] * dims["experts_per_rank"] + or dims["global_tokens"] != dims["ep_size"] * dims["tokens_per_rank"]): + return False, "manifest EP dimensions are inconsistent" + shape = (dims["global_tokens"], dims["topk"]) + if (idx_np.dtype != np.int32 or weights_np.dtype != np.float32 + or idx_np.shape != shape or weights_np.shape != shape + or not idx_np.flags.c_contiguous or not weights_np.flags.c_contiguous): + return False, "workload array dtype, shape, or layout is invalid" + if (np.any(idx_np < 0) or np.any(idx_np >= dims["experts"]) + or np.any(np.diff(np.sort(idx_np, axis=1), axis=1) == 0)): + return False, "expert indices are out of range or repeated" + if (not np.isfinite(weights_np).all() or np.any(weights_np < 0) + or not np.allclose(weights_np.sum(axis=1), 1.0, rtol=1e-5, atol=1e-6)): + return False, "gate weights are invalid" + if (manifest["activation_profile"] != "canonical-counter-source-v3" + or manifest["activation_generator"] != ACTIVATION_GENERATOR + or manifest["activation_identity"] + != compute_activation_identity( + manifest["seed"], dims["hidden"], manifest["activation_generator"] + )): + return False, "activation identity is invalid" + ib = idx_np.astype(" must fail + idx2[0, 0] = (int(idx2[0, 0]) + 1) % 256 + bad, _ = verify_workload(man2, idx2, w2) + assert not bad, "verify must catch tampering" + print(f"save/load/verify roundtrip OK (workload_id={wid})") + except ImportError: + print("(numpy unavailable — skipped serialization roundtrip; id logic passed)") + print("workload self-test: PASS") + sys.exit(0)